def __init__(self, path, domain, year, incyear=None, skip_memory_error=False):
    """Load all stored page texts for *domain* in *year* under *path*.

    The raw domain/URL is normalized through data_reader.clean_domain_url
    before the lookup, so callers may pass full URLs. The loaded texts are
    kept on ``self.texts`` (a list of page strings, or None if nothing is
    stored locally — see load_domain).
    """
    cleaned = data_reader.clean_domain_url(domain)
    self.texts = self.load_domain(
        path, cleaned, year, incyear, skip_memory_error=skip_memory_error
    )
def split_wayback_url(self, wayback_url):
    """Split a Wayback Machine snapshot URL into (domain, address).

    Strips the ``web.archive.org/web/<timestamp>/`` prefix and the archived
    URL's own scheme, then separates the host from the path. The domain is
    normalized via data_reader.clean_domain_url before returning.

    Returns a (domain, address) tuple; address is "" when the archived URL
    has no path component.
    """
    # Escape the dots (previously they matched ANY character) and accept
    # both http and https archive prefixes.
    original_url = re.sub(r'https?://web\.archive\.org/web/\d+/', "", wayback_url)
    # Drop the archived URL's own scheme. The old pattern used the invalid
    # escape "\:" (a DeprecationWarning on modern Python).
    website_piece = re.sub(r"https?://", "", original_url)
    try:
        (domain, address) = website_piece.split("/", 1)
    except ValueError:
        # No "/" present: the whole piece is the bare domain.
        domain = website_piece
        address = ""
    domain = data_reader.clean_domain_url(domain)
    return (domain, address)
def download_all(websites):
    """Crawl every company website in *websites* via the Wayback Machine.

    *websites* is a DataFrame with at least 'website' and 'founding_year'
    columns. Supports resuming: rows before the persisted last-company
    index are skipped, and already-downloaded domains are skipped using a
    periodically refreshed set. Progress stats are printed as it goes.

    Side effects: updates the module-level ``company_index_track``,
    persists progress via store_last_company, and downloads pages to disk
    through waybackmachine_crawler.
    """
    global company_index_track
    counter = 0
    count_downloaded = 0
    count_skipped = 0
    total_websites = websites.shape[0]
    print("\n\n\nStarting the scraping of all websites. A total of {0} websites\n\n".format(total_websites))
    last_company = get_last_company()
    for index, company in websites.iterrows():
        # Refresh the already-downloaded set every 100 rows (counter is 0
        # on the first iteration, so `downloaded` is always bound before use).
        if (counter % 100) == 0:
            downloaded = read_already_downloaded()
        counter += 1
        # Resume support: skip rows handled in a previous run.
        if counter < company_index_track or counter < last_company:
            continue
        print("\nStarting crawl number {0} of {1} : {2}".format(counter, total_websites, company['website']))
        if data_reader.clean_domain_url(company['website']) in downloaded:
            print(".Skipping {0}. Already downloaded".format(company['website']))
            count_skipped += 1
            continue
        crawler = waybackmachine_crawler(company['website'])
        # Crawl from Jan 1 of the year AFTER founding, when the site
        # presumably exists in the archive.
        year = company['founding_year'] + 1
        crawler.crawl_from_date(year, 1, 1)
        company_index_track = counter
        store_last_company(counter)
        count_downloaded += 1
        # tot >= 1 here (count_downloaded was just incremented), so the
        # percentage division is safe.
        tot = count_downloaded + count_skipped
        # Fixed: the original string carried a line-continuation artifact
        # ("\n\ \t") that printed a stray backslash-space; use implicit
        # string concatenation instead.
        print("\t. -- Download done.\n"
              "\t. STATS: {0} Downloaded ({1}%). {2} Skipped ({3}%)".format(
                  count_downloaded, round(count_downloaded * 100 / tot),
                  count_skipped, round(count_skipped * 100 / tot)))
def load_domain(self, path, domain, year=None, incyear=None, force_download=False, skip_memory_error=False):
    """Load every stored page text for *domain* (optionally one *year*).

    Looks under ``path/<clean_domain>[/<year>]``. If the folder is missing:
    returns None, unless force_download is True, in which case the pages
    are fetched first via self.force_download.

    Returns a list of whitespace-normalized page texts, or None when no
    local data exists and force_download is False.
    """
    clean_domain = data_reader.clean_domain_url(domain)
    root_folder = "{0}/{1}".format(path, clean_domain).replace("//", "/")
    if year is None:
        file_folder = root_folder
    else:
        file_folder = "{0}/{1}/{2}".format(path, clean_domain, year).replace("//", "/")
    # isdir(file_folder) already implies exists(file_folder), so the
    # original triple check reduces to these two.
    if not (os.path.exists(root_folder) and os.path.isdir(file_folder)):
        if force_download:
            # Removed a leftover pdb.set_trace() breakpoint that halted
            # execution on every forced download.
            # *year* takes precedence; NOTE(review): if both year and
            # incyear are None this raises TypeError on int(None) —
            # presumably callers always supply one; verify.
            download_year = year if year is not None else incyear
            download_year = int(download_year)
            self.force_download(root_folder, domain, download_year)
        else:
            return None
    files = []
    for file_name in os.listdir(file_folder):
        text = self.load_page(file_folder + "/" + file_name,
                              skip_memory_error=skip_memory_error)
        # Collapse all whitespace runs to single spaces for uniform text.
        text = re.sub(r"\s+", " ", text)
        files.append(text)
    return files