Example #1
import csv
import multiprocessing

# FileHandler, CsvLogger, LinkUtility, LinkChecker, LinkAttrs, ArchiveExplorer,
# ArchiveDetail, ArchiveOrg, FilePath, GoogleConst, BuyProxyOrg and
# buy_proxy_org_account come from the surrounding project and are assumed importable.
    def testMsgGen(self):
        email_template_path = "D:/Test/email_content_template.txt"
        email_content_save_path = "D:/Test/email_content_saved.txt"
        email_lines_before_table_path = "D:/Test/email_text_before_table.txt"
        email_lines_after_table_path = "D:/Test/email_text_after_table.txt"
        data_file_path = "D:/Test/data_sample.csv"
        # th for head cell, td for data cell
        email_template = FileHandler.read_all_from_file(email_template_path)
        cell_item_template = '<{0:s} style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                             'border-box;padding: 8px;text-align: left;line-height: 1.42857143;vertical-align: ' \
                             'bottom;border-top: 1px solid #ddd;border-bottom: 2px solid #ddd;border: 1px solid ' \
                             '#ddd!important;border-bottom-width: 2px;background-color: #fff!important;">' \
                             '{1:s}</{0:s}>'
        row_item_template = '<tr style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing:' \
                            ' border-box;page-break-inside: avoid;">{0:s}</tr>'
        line_format = '<p style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                      'border-box;orphans: 3;widows: 3;margin: 0 0 10px;">{0:s}</p><br>'
        before_table_lines = FileHandler.read_lines_from_file(
            email_lines_before_table_path, remove_blank_line=False)
        after_table_lines = FileHandler.read_lines_from_file(
            email_lines_after_table_path, remove_blank_line=False)

        before_table_str = "".join(
            [line_format.format(x, ) for x in before_table_lines])
        after_table_str = "".join(
            [line_format.format(x, ) for x in after_table_lines])
        table_cells_str = ""
        with open(data_file_path, mode='r', newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            header = next(reader)
            header_row_str = row_item_template.format("".join(
                cell_item_template.format("th", x) for x in header))
            for row in reader:
                table_cells_str += row_item_template.format("".join(
                    cell_item_template.format("td", x) for x in row))

        # The template expects five positional placeholders; the literal 50 fills
        # the second one (its meaning depends on the template file; see the
        # sketch after this method).
        email_content = email_template.format(before_table_str, 50,
                                              header_row_str, table_cells_str,
                                              after_table_str)
        FileHandler.remove_file_if_exist(email_content_save_path)
        FileHandler.append_line_to_file(email_content_save_path, email_content)
        return email_content
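
The template file itself is not shown. Below is a minimal sketch of an email_content_template.txt consistent with the five positional arguments passed above; the placeholder layout, and the literal 50 landing in a width attribute, are assumptions.

# Hypothetical template body; only the five positional slots are grounded
# in the format call above, the surrounding HTML is an assumption.
email_template = (
    '<html><body>'
    '{0:s}'                    # before_table_str
    '<table width="{1:d}%">'   # the literal 50 above would land here
    '<thead>{2:s}</thead>'     # header_row_str (th cells)
    '<tbody>{3:s}</tbody>'     # table_cells_str (td rows)
    '</table>'
    '{4:s}'                    # after_table_str
    '</body></html>'
)
print(email_template.format('<p>intro</p>', 50,
                            '<tr><th>h</th></tr>', '<tr><td>d</td></tr>',
                            '<p>outro</p>'))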
    def testGetBlogs(self):
        niche = "Society/Law"
        proxy_site = BuyProxyOrg(buy_proxy_org_account)
        proxies = proxy_site.get_proxies(timeout=5)
        keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                           + niche.replace('/', '-') + ".txt"
        # countries = GoogleUtility.CountryCodeEnglish
        countries = ["uk", ]
        min_delay = 2
        max_delay = 5
        max_page = 2
        days_ago = 4 * 365
        target_keywords_init = ["legal case", "Labour law", "human rights law", "crime law", "Immigration law",
                                "Family law", "Transactional law", "Company law", "Commercial law", "Admiralty law",
                                "Intellectual property law", "international law", "tax law", "banking law",
                                "competition law", "consumer law", "environmental law"]
        suggested_keywords = []
        for country in countries:
            # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
            #                                             country_code=country,
            #                                             min_delay=min_delay, max_delay=max_delay)
            temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
            # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
            # suggested_keywords += temp_keywords
            crawl_keywords = list(set(target_keywords_init + temp_keywords))
            self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page, index=0, length=100,
                                            country_code=country, source_type=GoogleConst.SourceTypeBlog,
                                            min_delay=min_delay, max_delay=max_delay, days_ago=days_ago,
                                            proxies=proxies, use_browser=False)
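
The crawl_keywords merge above is the heart of this step: previously logged suggestions are combined with the seed list and de-duplicated before crawling. A self-contained sketch of the same pattern (the file name is hypothetical):

from pathlib import Path

# Hypothetical keyword log, one keyword per line, mirroring what
# FileHandler.read_lines_from_file is expected to return.
log = Path("keywords.txt")
log.write_text("tax law\ncompany law\ntax law\n", encoding="utf-8")

seeds = ["legal case", "Labour law", "tax law"]
suggested = [line for line in log.read_text(encoding="utf-8").splitlines() if line]
crawl_keywords = sorted(set(seeds + suggested))  # merged and de-duplicated
print(crawl_keywords)  # ['Labour law', 'company law', 'legal case', 'tax law']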
    def testScrapePageBatch(self):
        save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
        file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
        CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
        domains_links = FileHandler.read_lines_from_file(file_path)
        for link in domains_links:
            # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
            # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
            stop_event = multiprocessing.Event()
            inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
            root_domain = LinkChecker.get_root_domain(domain)[1]
            path = "/index.html"  # override the parsed path: every crawl starts from the site root page
            link_s = LinkAttrs(link=link,
                               path=path,
                               ref_link="/",
                               shadow_ref_link="/",
                               source=path,
                               res_type=LinkUtility.EXT_WEBPAGE,
                               level=0)
            explorer = ArchiveExplorer(original_domain=root_domain,
                                       link=link,
                                       external_stop_event=stop_event,
                                       download_base_dir=FilePath.get_default_archive_dir(),
                                       max_thread=10,
                                       max_level=2)
            explorer.run()
            archive_detail = explorer.get_archive_detail()
            CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
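
Each input line is a web.archive.org snapshot URL (see the commented samples above). A sketch of how such a URL splits into a timestamp and the original URL; the regex is an assumption about what LinkUtility.get_link_detail handles internally:

import re

ARCHIVE_RE = re.compile(r"^https?://web\.archive\.org/web/(\d{14})/(.+)$")

link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
match = ARCHIVE_RE.match(link)
if match:
    timestamp, original_url = match.groups()
    print(timestamp)     # 20140711025724 (YYYYMMDDhhmmss)
    print(original_url)  # http://susodigital.com/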
    def testGetBestProfileBatch(self):
        file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
        domains = FileHandler.read_lines_from_file(file_path)
        save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
        CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
        for domain in domains:
            print("begin domain:", domain)
            try:
                archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                      profile_check=10, pass_threshold=0.9, res_limit=2000)
                CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
            except Exception as ex:
                print(ex)
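
get_best_archive's parameters are not documented here. A plausible reading, sketched below purely as an assumption, is that profile_check resources are sampled per snapshot and a snapshot passes when the fraction that still resolve meets pass_threshold:

# Hedged sketch of the assumed pass_threshold semantics; not the
# project's actual implementation.
def snapshot_passes(check_results, pass_threshold=0.9):
    """check_results: one boolean per sampled resource (True = still resolves)."""
    if not check_results:
        return False
    return sum(check_results) / len(check_results) >= pass_threshold

print(snapshot_passes([True] * 9 + [False]))      # 0.9 >= 0.9 -> True
print(snapshot_passes([True] * 8 + [False] * 2))  # 0.8 <  0.9 -> False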
    def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=None,
                                 proxies=None, country_code="us", min_delay=2, max_delay=5, offset=120):
        keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                           + niche.replace('/', '-') + ".txt"

        def save_callback(keywords: list):
            FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

        if not keyword_init:  # default is None rather than [] to avoid the mutable-default pitfall
            keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
            for item in keyword_init:
                print(item)
            print("total keywords:", len(keyword_init))
        if proxies is None:
            proxy_site = BuyProxyOrg(buy_proxy_org_account)
            proxies = proxy_site.get_proxies(timeout=5)
        current_level = 0
        keywords_pool = keyword_init
        while current_level < level:
            keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies, country_code=country_code,
                                                       min_delay=min_delay, max_delay=max_delay, callback=save_callback)
            keywords_pool += keyword_init
            current_level += 1
        FileHandler.remove_file_if_exist(keyword_log_path)
        FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")
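
Each pass through the while loop feeds only the newest suggestions back in as the next level's queries, so the pool grows level by level. A self-contained sketch with a stub standing in for testGetSuggestionBatch:

# Stub suggestion source standing in for testGetSuggestionBatch.
def suggest_batch(keywords):
    return [k + " firm" for k in keywords]

def expand_keywords(seeds, level=1):
    pool, frontier = list(seeds), list(seeds)
    for _ in range(level):
        frontier = suggest_batch(frontier)  # query only the newest level
        pool += frontier
    return pool

print(expand_keywords(["tax law"], level=2))
# ['tax law', 'tax law firm', 'tax law firm firm']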