# Example #1
# 0
 def __init__(self,
              path,
              domain,
              year,
              incyear=None,
              skip_memory_error=False):
     """Load all cached page texts for *domain* into ``self.texts``.

     Args:
         path: Root folder of the downloaded-page cache on disk.
         domain: Website URL/domain; normalized here via
             ``data_reader.clean_domain_url`` before loading.
         year: Year sub-folder to read (forwarded to ``load_domain``).
         incyear: Fallback year forwarded to ``load_domain`` (used there
             when ``year`` is None and a forced download is needed).
         skip_memory_error: Forwarded to ``load_domain`` /
             ``load_page``.  # NOTE(review): presumably skips files that
             raise MemoryError -- confirm in load_page.
     """
     # Normalize the domain once so cache paths are built consistently.
     domain = data_reader.clean_domain_url(domain)
     self.texts = self.load_domain(path,
                                   domain,
                                   year,
                                   incyear,
                                   skip_memory_error=skip_memory_error)
    def split_wayback_url(self, wayback_url):
        """Split a Wayback Machine snapshot URL into ``(domain, address)``.

        Strips the ``web.archive.org/web/<timestamp>/`` prefix and the
        original URL's scheme, then separates the host from the path.

        Args:
            wayback_url: Snapshot URL such as
                ``http://web.archive.org/web/20190101000000/http://example.com/a/b``.

        Returns:
            Tuple ``(domain, address)`` where ``domain`` is normalized via
            ``data_reader.clean_domain_url`` and ``address`` is the path
            after the first ``/`` (empty string when the URL has no path).
        """
        # Accept both http and https archive prefixes, and escape the dots
        # so they only match literal "." (the old pattern matched any char
        # and missed https snapshot URLs entirely).
        original_url = re.sub(r'https?://web\.archive\.org/web/\d+/', "",
                              wayback_url)
        website_piece = re.sub(r"https?://", "", original_url)

        # Host is everything before the first "/"; the remainder is the
        # path.  partition() yields ("", "") for the last two fields when
        # there is no "/", matching the old split/ValueError fallback.
        domain, _, address = website_piece.partition("/")

        domain = data_reader.clean_domain_url(domain)

        return (domain, address)
def download_all(websites):
    """Crawl every website in *websites* through the Wayback Machine.

    Iterates the DataFrame row by row, skipping companies that were already
    downloaded or that come before the last recorded checkpoint, and crawls
    each remaining site starting January 1 of the year after its founding.

    Args:
        websites: DataFrame with at least ``'website'`` and
            ``'founding_year'`` columns.

    Side effects:
        Updates the module-level checkpoint ``company_index_track``,
        persists progress via ``store_last_company`` and prints status
        lines to stdout.
    """
    global company_index_track

    counter = 0
    count_downloaded = 0
    count_skipped = 0
    total_websites = websites.shape[0]

    print("\n\n\nStarting the scraping of all websites.  A total of {0} websites\n\n".format(total_websites))

    last_company = get_last_company()
    for index, company in websites.iterrows():

        # Refresh the already-downloaded set every 100 rows.  This runs on
        # the very first iteration (counter == 0), so `downloaded` is
        # always bound before it is read below.
        if (counter % 100) == 0:
            downloaded = read_already_downloaded()

        counter += 1
        # Resume support: skip rows up to the stored checkpoint.
        if counter < company_index_track or counter < last_company:
            continue

        print("\nStarting crawl number {0} of {1} : {2}".format(counter, total_websites, company['website']))

        if data_reader.clean_domain_url(company['website']) in downloaded:
            print(".Skipping {0}. Already downloaded".format(company['website']))
            count_skipped += 1
            continue

        crawler = waybackmachine_crawler(company['website'])
        # Crawl snapshots starting Jan 1 of the year after founding.
        year = company['founding_year'] + 1
        crawler.crawl_from_date(year, 1, 1)

        company_index_track = counter
        store_last_company(counter)
        count_downloaded += 1
        # tot >= 1 here because count_downloaded was just incremented, so
        # the percentage division cannot divide by zero.
        tot = count_downloaded + count_skipped
        # Fixed the stray "\ " escape that printed a literal backslash
        # (and raised a DeprecationWarning) in the STATS line.
        print("\t. -- Download done.\n\t. STATS: {0} Downloaded ({1}%). {2} Skipped ({3}%)".format(count_downloaded, round(count_downloaded*100/tot), count_skipped, round(count_skipped*100/tot)))
# Example #4
# 0
    def load_domain(self,
                    path,
                    domain,
                    year=None,
                    incyear=None,
                    force_download=False,
                    skip_memory_error=False):
        """Return the cached page texts for *domain*, or None when absent.

        Args:
            path: Root folder of the downloaded-page cache.
            domain: Website domain; normalized via
                ``data_reader.clean_domain_url``.
            year: Optional year sub-folder; when None the domain root
                folder itself is read.
            incyear: Fallback year for ``force_download`` when *year* is
                None.
            force_download: When True, download the pages on a cache miss
                instead of returning None.
            skip_memory_error: Forwarded to ``load_page``.

        Returns:
            List of whitespace-normalized page texts (one string per file),
            or None when the cache folder is missing and *force_download*
            is False.
        """
        clean_domain = data_reader.clean_domain_url(domain)
        root_folder = "{0}/{1}".format(path, clean_domain).replace("//", "/")

        if year is None:
            file_folder = root_folder
        else:
            file_folder = "{0}/{1}/{2}".format(path, clean_domain,
                                               year).replace("//", "/")

        # Cache miss: folder absent or not a directory.
        if (not os.path.exists(root_folder)
                or not os.path.exists(file_folder)
                or not os.path.isdir(file_folder)):
            if force_download:
                # Depends on whether it is a startup download or a public
                # download.  (Removed a leftover pdb.set_trace() debug
                # hook that would hang any non-interactive run.)
                download_year = year if year is not None else incyear
                download_year = int(download_year)
                self.force_download(root_folder, domain, download_year)
            else:
                return None

        files = []
        for file_name in os.listdir(file_folder):
            text = self.load_page(file_folder + "/" + file_name,
                                  skip_memory_error=skip_memory_error)
            # Collapse runs of whitespace so each page is one flat string.
            text = re.sub(r"\s+", " ", text)
            files.append(text)

        return files