import csv
import multiprocessing

# Project-level helpers (FileHandler, CsvLogger, LinkUtility, LinkChecker,
# LinkAttrs, ArchiveExplorer, ArchiveOrg, ArchiveDetail, FilePath, BuyProxyOrg,
# buy_proxy_org_account, GoogleConst) are imported from elsewhere in this
# project; their import paths are not shown in this excerpt.


def testMsgGen(self):
    email_template_path = "D:/Test/email_content_template.txt"
    email_content_save_path = "D:/Test/email_content_saved.txt"
    email_lines_before_table_path = "D:/Test/email_text_before_table.txt"
    email_lines_after_table_path = "D:/Test/email_text_after_table.txt"
    data_file_path = "D:/Test/data_sample.csv"
    email_template = FileHandler.read_all_from_file(email_template_path)
    # inline-styled cell template; {0} is the tag name (th for a head cell,
    # td for a data cell) and {1} is the cell text
    cell_item_template = '<{0:s} style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                         'border-box;padding: 8px;text-align: left;line-height: 1.42857143;vertical-align: ' \
                         'bottom;border-top: 1px solid #ddd;border-bottom: 2px solid #ddd;border: 1px solid ' \
                         '#ddd!important;border-bottom-width: 2px;background-color: #fff!important;">' \
                         '{1:s}</{0:s}>'
    row_item_template = '<tr style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing:' \
                        ' border-box;page-break-inside: avoid;">{0:s}</tr>'
    line_format = '<p style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                  'border-box;orphans: 3;widows: 3;margin: 0 0 10px;">{0:s}</p><br>'
    # wrap every plain-text line before/after the table in a styled <p>
    before_table_lines = FileHandler.read_lines_from_file(email_lines_before_table_path,
                                                          remove_blank_line=False)
    after_table_lines = FileHandler.read_lines_from_file(email_lines_after_table_path,
                                                         remove_blank_line=False)
    before_table_str = "".join([line_format.format(x) for x in before_table_lines])
    after_table_str = "".join([line_format.format(x) for x in after_table_lines])
    # the first CSV row becomes the th header row; the rest become td data rows
    table_cells_str = ""
    with open(data_file_path, mode='r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        header = next(reader)
        header_row_str = row_item_template.format(
            "".join([cell_item_template.format("th", x) for x in header]))
        for row in reader:
            table_cells_str += row_item_template.format(
                "".join([cell_item_template.format("td", x) for x in row]))
    email_content = email_template.format(before_table_str, 50, header_row_str,
                                          table_cells_str, after_table_str)
    FileHandler.remove_file_if_exist(email_content_save_path)
    FileHandler.append_line_to_file(email_content_save_path, email_content)
    return email_content
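# The template file "D:/Test/email_content_template.txt" is not part of this
# excerpt. The format() call above passes five positional arguments, so the
# template must expose placeholders {0}-{4}. A minimal sketch of such a
# template; treating slot {1} as a table-width percentage is an assumption
# inferred from the literal 50 passed in testMsgGen:
EXAMPLE_EMAIL_TEMPLATE = (
    '<html><body>'
    '{0}'                           # paragraphs rendered before the table
    '<table style="width: {1}%;">'  # width slot, receives 50 in testMsgGen
    '<thead>{2}</thead>'            # header row built from th cells
    '<tbody>{3}</tbody>'            # data rows built from td cells
    '</table>'
    '{4}'                           # paragraphs rendered after the table
    '</body></html>'
)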
def testGetBlogs(self):
    niche = "Society/Law"
    proxy_site = BuyProxyOrg(buy_proxy_org_account)
    proxies = proxy_site.get_proxies(timeout=5)
    # the keyword log is named after the niche, with '/' made path-safe
    keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                       + niche.replace('/', '-') + ".txt"
    # countries = GoogleUtility.CountryCodeEnglish
    countries = ["uk", ]
    min_delay = 2
    max_delay = 5
    max_page = 2
    days_ago = 4 * 365
    target_keywords_init = ["legal case", "Labour law", "human rights law", "crime law",
                            "Immigration law", "Family law", "Transactional law",
                            "Company law", "Commercial law", "Admiralty law",
                            "Intellectual property law", "international law", "tax law",
                            "banking law", "competition law", "consumer law",
                            "environmental law"]
    suggested_keywords = []
    for country in countries:
        # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
        #                                             country_code=country,
        #                                             min_delay=min_delay, max_delay=max_delay)
        # reuse previously logged suggestions instead of querying Google again
        temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
        # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
        # suggested_keywords += temp_keywords
        crawl_keywords = list(set(target_keywords_init + temp_keywords))
        self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page,
                                        index=0, length=100, country_code=country,
                                        source_type=GoogleConst.SourceTypeBlog,
                                        min_delay=min_delay, max_delay=max_delay,
                                        days_ago=days_ago, proxies=proxies,
                                        use_browser=False)
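# Note on determinism: list(set(...)) in testGetBlogs deduplicates the merged
# keyword list but yields an arbitrary ordering, so crawl order can differ
# between runs. A minimal order-preserving alternative, if stable runs matter
# (the helper name is illustrative, not existing project API):
def dedup_keep_order(*keyword_lists):
    # dict.fromkeys preserves insertion order, keeping the first occurrence
    # of each keyword and dropping later duplicates
    merged = [kw for kws in keyword_lists for kw in kws]
    return list(dict.fromkeys(merged))

# usage: crawl_keywords = dedup_keep_order(target_keywords_init, temp_keywords)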
def testScrapePageBatch(self):
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    # write the CSV header row first, then append one row per downloaded archive
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
        # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
        stop_event = multiprocessing.Event()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        path = "/index.html"
        link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/",
                           source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
        # crawl up to two levels deep with at most ten worker threads
        explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                   external_stop_event=stop_event,
                                   download_base_dir=FilePath.get_default_archive_dir(),
                                   max_thread=10, max_level=2)
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
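# The sample links in testScrapePageBatch follow web.archive.org's snapshot
# layout: /web/<14-digit YYYYMMDDhhmmss timestamp>/<original url>. Parsing is
# handled by the project's LinkUtility, but the two components can also be
# recovered with the stdlib; a sketch (regex and helper name are illustrative):
import re

def split_archive_link(archive_link):
    # e.g. "http://web.archive.org/web/20140711025724/http://susodigital.com/"
    # -> ("20140711025724", "http://susodigital.com/")
    m = re.match(r"https?://web\.archive\.org/web/(\d{14})/(.+)", archive_link)
    return m.groups() if m else None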
def testGetBestProfileBatch(self):
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
    domains = FileHandler.read_lines_from_file(file_path)
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    for domain in domains:
        print("begin domain:", domain)
        try:
            # pick the best snapshot for the domain; a failure on one domain
            # should not abort the rest of the batch
            archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                  profile_check=10, pass_threshold=0.9,
                                                  res_limit=2000)
            CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
        except Exception as ex:
            print(ex)
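# print(ex) in testGetBestProfileBatch leaves no trace of a failed domain in
# the output CSV. A hedged variant that also appends failures to a side file
# so the batch can later be re-run on failures only (the path and helper are
# illustrative, not existing project API):
def log_profile_failure(domain, ex,
                        fail_path="/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_failed.txt"):
    # one tab-separated line per failure: the domain and the error message
    with open(fail_path, mode='a', encoding='utf-8') as failure_file:
        failure_file.write("{0}\t{1}\n".format(domain, ex))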
def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=None,
                             proxies=None, country_code="us", min_delay=2, max_delay=5,
                             offset=120):
    # avoid the mutable-default-argument pitfall; the original used keyword_init=[]
    if keyword_init is None:
        keyword_init = []
    keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                       + niche.replace('/', '-') + ".txt"

    def save_callback(keywords: list):
        # persist each batch of suggestions as soon as it arrives
        FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

    if len(keyword_init) == 0:
        # resume from previously logged suggestions, skipping the first `offset`
        keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
    for item in keyword_init:
        print(item)
    print("total keywords:", len(keyword_init))
    if proxies is None:
        proxy_site = BuyProxyOrg(buy_proxy_org_account)
        proxies = proxy_site.get_proxies(timeout=5)
    current_level = 0
    keywords_pool = keyword_init
    while current_level < level:
        # each pass feeds the previous level's suggestions back in as new seeds
        keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies,
                                                   country_code=country_code,
                                                   min_delay=min_delay, max_delay=max_delay,
                                                   callback=save_callback)
        keywords_pool += keyword_init
        current_level += 1
    # rewrite the log with the full accumulated pool
    FileHandler.remove_file_if_exist(keyword_log_path)
    FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")
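# The while loop in testGetkeywordsRecursive grows the pool breadth-first:
# level 1 expands only the seeds, level 2 also expands the level-1
# suggestions, and so on. A self-contained illustration of that contract with
# a stub suggestion source (testGetSuggestionBatch is the real source above):
def expand_keywords(seeds, levels, suggest):
    # suggest: callable mapping a list of keywords to a list of suggestions
    pool = list(seeds)
    frontier = list(seeds)
    for _ in range(levels):
        frontier = suggest(frontier)
        pool += frontier
    return pool

# e.g. expand_keywords(["tax law"], 2, lambda kws: [kw + " firm" for kw in kws])
# -> ["tax law", "tax law firm", "tax law firm firm"]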