def __init__(self,
              ref: str,
              creation_strategy: str,
              save_dir="",
              file_limit=1000000,
              table_name="TEMP",
              write_ahead_mode=True):
     if len(save_dir) == 0:
         default_dir = get_db_buffer_default_dir()
     else:
         default_dir = save_dir
     self._table_name = table_name
     self._write_ahead_mode = write_ahead_mode
     FileHandler.create_file_if_not_exist(default_dir)
     self.filename = default_dir + ref
     # file_exist = os.path.exists(self.filename)
     self.db = sqlite3.connect(self.filename, timeout=10)
     self.cur = self.db.cursor()
     #self.cur.execute("PRAGMA journal_mode = MEMORY")
     #if not file_exist:
     if self._write_ahead_mode:
         self.cur.execute("PRAGMA journal_mode = WAL;")
         self.cur.execute("PRAGMA synchronous = OFF;")
     self.exclusive_access_file_limit = file_limit
     # cannot ensure uniqueness of data in multithread access
     #self.cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
     self.cur.execute(creation_strategy)
     self.db.commit()
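
A minimal, self-contained sketch of the same WAL setup against the standard sqlite3 module (the path and table schema here are placeholders; the real creation_strategy string is supplied by the caller):

    import sqlite3

    db = sqlite3.connect("/tmp/buffer.db", timeout=10)
    cur = db.cursor()
    # WAL lets readers proceed while a writer appends to the log;
    # synchronous = OFF trades durability on power loss for write throughput.
    cur.execute("PRAGMA journal_mode = WAL;")
    cur.execute("PRAGMA synchronous = OFF;")
    cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, PRIMARY KEY(LINK));")
    db.commit()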
Example #2
 def __init__(self, interval: int, ref: ProgressLogInterface, stop_event: Event):
     """
     logging prograss for long running method
     :param interval: period of logging in second
     :param ref: the reference object invoked logging
     :param stop_event: event to stop logging
     :return:
     """
     threading.Thread.__init__(self)
     self._interval = interval
     self._ref = ref
     self._stop_event = stop_event
     self.begin_time = int(time.time())
     self._ref_time = self.begin_time
     self._path = get_log_dir() + "Progress/"
     temp = ref.get_file_name()
     if len(temp) > 200:
         filename = temp[0:200]  # cap the file name at 200 characters
     else:
         filename = temp
     if not filename.endswith(".csv"):
         filename += ".csv"
     self._file_path = self._path + filename
     FileHandler.create_file_if_not_exist(self._file_path)
     self._limit = ref.get_limit()
     self.limit_counter = 0
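
A hedged usage sketch of this progress-logger thread (ProgressLogger and my_task are assumed names; the enclosing class name is not shown in the snippet):

    import time
    from threading import Event

    stop_event = Event()
    logger = ProgressLogger(interval=30, ref=my_task, stop_event=stop_event)
    logger.start()       # run() polls once per second and reports every 30 s
    time.sleep(120)      # stand-in for the long-running work
    stop_event.set()     # signal the logging thread to exit
    logger.join()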
Example #3
 def __init__(self, interval: int, ref: ProgressLogInterface,
              stop_event: Event):
     """
     logging prograss for long running method
     :param interval: period of logging in second
     :param ref: the reference object invoked logging
     :param stop_event: event to stop logging
     :return:
     """
     threading.Thread.__init__(self)
     self._interval = interval
     self._ref = ref
     self._stop_event = stop_event
     self.begin_time = int(time.time())
     self._ref_time = self.begin_time
     self._path = get_log_dir() + "Progress/"
     temp = ref.get_file_name()
     if len(temp) > 200:
         filename = temp[0:200]  # cap the file name at 200 characters
     else:
         filename = temp
     if not filename.endswith(".csv"):
         filename += ".csv"
     self._file_path = self._path + filename
     FileHandler.create_file_if_not_exist(self._file_path)
     self._limit = ref.get_limit()
     self.limit_counter = 0
Example #4
    def testEmailLogin(self):
        # Send the message via local SMTP server using Oauth2.
        from Email.SMTP import SMTP
        import httplib2
        from Email.Utility.Oauth2 import CustomOAuth2Credentials
        me = "*****@*****.**"
        you = "*****@*****.**"

        msg = get_msg(me, you)
        http = httplib2.Http()
        credentials = CustomOAuth2Credentials.from_json(
            FileHandler.read_all_from_file(credentials_local_path))
        # scopes = credentials.retrieve_scopes(http)
        # for item in scopes:
        #     print(item)
        if credentials.access_token_expired:
            # http = credentials.authorize(http)
            credentials.refresh(http)
            jsoned = credentials.to_json()
            FileHandler.remove_file_if_exist(credentials_local_path)
            FileHandler.append_line_to_file(credentials_local_path,
                                            str(jsoned))
        auth_str = GenerateOAuth2String(me,
                                        access_token=credentials.access_token)
        s = SMTP(**gmail_provider)
        s.set_debuglevel(debuglevel=4)
        s.ehlo()
        s.starttls()
        s.authenticate_oauth2(auth_str)
        s.sendmail(me, you, msg.as_string())
        s.quit()
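
GenerateOAuth2String is not shown above; a sketch of the SASL XOAUTH2 initial response it would have to produce (format per Google's documented XOAUTH2 mechanism; the function name below is hypothetical):

    import base64

    def generate_oauth2_string(username: str, access_token: str) -> str:
        # XOAUTH2: "user=<address>\x01auth=Bearer <token>\x01\x01", base64-encoded
        raw = "user={0}\x01auth=Bearer {1}\x01\x01".format(username, access_token)
        return base64.b64encode(raw.encode("utf-8")).decode("ascii")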
Example #5
 def add_proxies(self, proxies: list):
     if proxies is not None:
         converted = []
         for proxy in proxies:
             if isinstance(proxy, ProxyStruct):
                 converted.append((proxy.addr, proxy.port, proxy.alt_port,
                                   proxy.user_name, proxy.psd))
         FileHandler.create_file_if_not_exist(self._file_path)
         CsvLogger.log_to_file_path(self._file_path, converted)
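
ProxyStruct itself is not defined in the snippet; judging by the attributes read above, a minimal stand-in could look like this (an assumption, not the repo's definition):

    from collections import namedtuple

    # Hypothetical stand-in mirroring the attributes used in add_proxies().
    ProxyStruct = namedtuple("ProxyStruct",
                             ["addr", "port", "alt_port", "user_name", "psd"])

    proxy = ProxyStruct("10.0.0.1", 8080, 8081, "user", "secret")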
Example #6
 def testGmailAuthStep2(self):
     code = "4/zZRbhzmhulAsl6pasBMqmuOv5PCsdRuITTxyAWLkJOI#"
     credentials = flow.step2_exchange(code)
     access_token = credentials.access_token
     refresh_token = credentials.refresh_token
     print("access_token:", access_token, " refresh_token:", refresh_token)
     jsoned = credentials.to_json()
     FileHandler.remove_file_if_exist(credentials_local_path)
     FileHandler.append_line_to_file(credentials_local_path, str(jsoned))
     print(jsoned)
Example #7
 def run(self):
     FileHandler.create_file_if_not_exist(self._file_path)
     cols = ["Index", "Time/Min"] + self._ref.get_column_names()
     self._append(cols)
     while not self._stop_event.is_set() and self.limit_counter < self._limit:
         current_time = int(time.time())
         gap = current_time - self._ref_time
         if gap >= self._interval:
             self._ref_time = current_time
             self.report_progress()
         time.sleep(1)
Example #8
 def __init__(self, file_dir: str = "", file_name="UserAccounts.db"):
     if len(file_dir) == 0:
         file_dir = get_temp_db_dir()
     FileHandler.create_file_if_not_exist(file_dir)
     self._file_name = file_name
     file_path = file_dir + self._file_name
     self.db = sqlite3.connect(file_path)
     self.cur = self.db.cursor()
     self.cur.execute(
         "CREATE TABLE IF NOT EXISTS ACCOUNTS(TYPE INTEGER, USER_ID TEXT, PSD TEXT,"
         " LINK TEXT,ACCESS_ID TEXT, API_KEY TEXT, PROXY TEXT);")
     self.db.commit()
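
A sketch of writing and reading a row against the ACCOUNTS schema created above (cur and db as in the constructor; the values are illustrative):

    cur.execute("INSERT INTO ACCOUNTS (TYPE, USER_ID, PSD, LINK, ACCESS_ID, API_KEY, PROXY) "
                "VALUES (?, ?, ?, ?, ?, ?, ?);",
                (0, "user@example.com", "secret", "", "", "", ""))
    db.commit()
    for row in cur.execute("SELECT TYPE, USER_ID FROM ACCOUNTS;"):
        print(row)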
Example #9
 def run(self):
     FileHandler.create_file_if_not_exist(self._file_path)
     cols = ["Index", "Time/Min"] + self._ref.get_column_names()
     self._append(cols)
     while not self._stop_event.is_set() and self.limit_counter < self._limit:
         current_time = int(time.time())
         gap = current_time - self._ref_time
         if gap >= self._interval:
             self._ref_time = current_time
             self.report_progress()
         time.sleep(1)
Example #10
    def __init__(self,
                 file_name,
                 worker: ExternalTempInterface,
                 stop_event: Event,
                 buf_size=200,
                 output_f=1000,
                 dir_path="",
                 table_name="temp",
                 convert_input=True,
                 convert_output=True,
                 terminate_callback=None):
        """

        :param file_name:
        :param worker:
        :param stop_event:
        :param buf_size:
        :param dir_path:
        :param table_name:
        :param convert_input:
        :param convert_output: convert output to OnSiteLink by default, else return raw tuple data.
        :return:
        """
        self._file_name = file_name
        if len(dir_path) > 0:
            self._file_dir = dir_path
        else:
            self._file_dir = get_temp_db_dir()
        self._file_path = self._file_dir + self._file_name
        PrintLogger.print("ExternalTempDataDiskBuffer create path in init: " +
                          self._file_path)
        FileHandler.create_file_if_not_exist(self._file_path)
        self.stop_event = stop_event
        self._tab = table_name
        self._worker = worker
        self._get_lock = threading.RLock()
        self._put_lock = threading.RLock()
        self._convert_input = convert_input
        self._convert_output = convert_output
        FileBuffInterface.__init__(self,
                                   self._file_name,
                                   buf_size,
                                   output_f=output_f,
                                   power_save_mode=True,
                                   terminate_callback=terminate_callback)
        self.set_db_update_interval(10)

        self._is_reading = Event()
        self._need_to_vaccum = Event()
        self._total_record = self.count_all()
Example #11
 def log_to_file_path(file_path: str, rows: list):
     if len(rows) > 0:
         try:
             path = file_path
             if not path.endswith(".csv"):
                 path += ".csv"
             FileHandler.create_file_if_not_exist(path)
             with open(path, mode="a", newline="") as csv_file:
                 wr = csv.writer(csv_file, delimiter=",")
                 for row in rows:
                     wr.writerow(row)
         except Exception as ex:
             ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")
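
Usage is a one-liner; rows are tuples and the ".csv" suffix is appended when missing:

    CsvLogger.log_to_file_path("/tmp/results",  # becomes /tmp/results.csv
                               [("domain.com", 200), ("example.org", 404)])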
Example #12
 def log_to_file_path(file_path: str, rows: list):
     if len(rows) > 0:
         try:
             path = file_path
             if not path.endswith(".csv"):
                 path += ".csv"
             FileHandler.create_file_if_not_exist(path)
             with open(path, mode='a', newline='') as csv_file:
                 wr = csv.writer(csv_file, delimiter=',')
                 for row in rows:
                     wr.writerow(row)
         except Exception as ex:
             ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")
Example #13
def get_msg(me, you):
    # me == my email address
    # you == recipient's email address
    html_file_path = "D:/Test/email_content_saved.txt"
    text_file_path = "D:/Test/email_text.txt"

    # Create message container - the correct MIME type is multipart/alternative.
    msg = MIMEMultipart('alternative')
    msg['Subject'] = "100+ HIGH TF/CF/DA EXPIRED DOMAINS TO BUY ONLY $10 EACH"
    msg['From'] = me
    msg['To'] = you

    # Create the body of the message (a plain-text and an HTML version).
    text = ""
    html = FileHandler.read_all_from_file(html_file_path, 't')

    # Record the MIME types of both parts - text/plain and text/html.
    part1 = MIMEText(text, 'plain')
    part2 = MIMEText(html, 'html')

    # Attach parts into message container.
    # According to RFC 2046, the last part of a multipart message, in this case
    # the HTML message, is best and preferred.
    msg.attach(part1)
    msg.attach(part2)
    return msg
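
For reference, a sketch of sending the returned message through the standard library instead of this repo's Email.SMTP wrapper (host, port, and credentials are placeholders):

    import smtplib

    msg = get_msg("me@example.com", "you@example.com")
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()
        server.login("me@example.com", "app-password")  # placeholder credentials
        server.send_message(msg)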
Example #14
 def testScrapePageBatch(self):
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         link_s = LinkAttrs(link=link,
                            path=path,
                            ref_link="/",
                            shadow_ref_link="/",
                            source=path,
                            res_type=LinkUtility.EXT_WEBPAGE,
                            level=0)
         explorer = ArchiveExplorer(
             original_domain=root_domain,
             link=link,
             external_stop_event=stop_event,
             download_base_dir=FilePath.get_default_archive_dir(),
             max_thread=10,
             max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
Example #15
 def testGetBlogs(self):
     niche = "Society/Law"
     proxy_site = BuyProxyOrg(buy_proxy_org_account)
     proxies = proxy_site.get_proxies(timeout=5)
     keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"+niche.replace('/', '-')+".txt"
     # countries = GoogleUtility.CountryCodeEnglish
     countries = ["uk", ]
     min_delay = 2
     max_delay = 5
     max_page = 2
     days_ago = 4*365
     target_keywords_init = ["legal case", "Labour law", "human rights law", "crime law", "Immigration law",
                             "Family law", "Transactional law", "Company law", "Commercial law", "Admiralty law",
                             "Intellectual property law", "international law", "tax law", "banking law", "competition law",
                             "consumer law", "environmental law"]
     suggested_keywords = []
     for country in countries:
         # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
         #                                                   country_code=country,
         #                                                   min_delay=min_delay, max_delay=max_delay)
         temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
         # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
         # suggested_keywords += temp_keywords
         crawl_keywords = list(set(target_keywords_init + temp_keywords))
         self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page, index=0, length=100,
                                         country_code=country, source_type=GoogleConst.SourceTypeBlog,
                                         min_delay=min_delay, max_delay=max_delay, days_ago=days_ago,
                                         proxies=proxies, use_browser=False)
Example #16
 def testRe(self):
     css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     test_s = "if('undefined' === typeof wwhomepage) var wwhomepage = {}; wwhomepage.customPromoHeaders = {\" /web/20130415001342/http://www.bbc.co.uk\/news\/magazine-22094279\":"
     match = re.search(link_pattern, css_text)
     if match is not None:  # guard: match is None when the pattern finds nothing
         for i in match.groups(0):
             print(i)
Example #17
 def testRe(self):
     css_text = FileHandler.read_all_from_file(
         "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     test_s = "if('undefined' === typeof wwhomepage) var wwhomepage = {}; wwhomepage.customPromoHeaders = {\" /web/20130415001342/http://www.bbc.co.uk\/news\/magazine-22094279\":"
     match = re.search(link_pattern, css_text)
     if match is not None:  # guard: match is None when the pattern finds nothing
         for i in match.groups(0):
             print(i)
Example #18
 def log_error(ref: str, error: Exception, additional: str = ""):
     path = get_log_dir() + ErrorLogger.FILE_NAME
     try:
         FileHandler.create_file_if_not_exist(path)
         lines = []
         lines.append(ref)
         lines.append("{0:d} {1:s}".format(ErrorLogger.Counter, str(datetime.datetime.now(tz=pytz.utc))))
         lines.append(str(error))
         if len(additional) > 0:
             lines.append(additional)
         with open(path, mode="a", newline="") as csv_file:
             wr = csv.writer(csv_file, delimiter=",")
             wr.writerow(lines)
         # lines.append("")
         # FileHandler.append_lines_to_file(path, lines)
         ErrorLogger.Counter += 1
     except Exception:
         pass  # error logging must never raise into the caller
Example #19
 def testCss2Parse(self):
     css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     groups = []
     parse_str_sp = functools.partial(parse_str, groups, 1)
     temp = re.sub(link_pattern, parse_str_sp, css_text)
     # for item in groups:
     #     print(item)
     print("captured total: ", len(groups))
     for item in groups:
         if isinstance(item, LinkAttrs):
             print("res:", item.path,  "link:", item.link)
Example #20
 def log_error(ref: str, error: Exception, additional: str = ""):
     path = get_log_dir() + ErrorLogger.FILE_NAME
     try:
         FileHandler.create_file_if_not_exist(path)
         lines = []
         lines.append(ref)
         lines.append("{0:d} {1:s}".format(
             ErrorLogger.Counter, str(datetime.datetime.now(tz=pytz.utc))))
         lines.append(str(error))
         if len(additional) > 0:
             lines.append(additional)
         with open(path, mode='a', newline='') as csv_file:
             wr = csv.writer(csv_file, delimiter=',')
             wr.writerow(lines)
         # lines.append("")
         # FileHandler.append_lines_to_file(path, lines)
         ErrorLogger.Counter += 1
     except Exception:
         pass  # error logging must never raise into the caller
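
Callers wrap risky sections and hand the exception to the logger, e.g. (risky_operation is a hypothetical callable):

    try:
        risky_operation()
    except Exception as ex:
        ErrorLogger.log_error("MyModule", ex, "risky_operation() failed")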
Example #21
 def testCss2Parse(self):
     css_text = FileHandler.read_all_from_file(
         "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     groups = []
     parse_str_sp = functools.partial(parse_str, groups, 1)
     temp = re.sub(link_pattern, parse_str_sp, css_text)
     # for item in groups:
     #     print(item)
     print("captured total: ", len(groups))
     for item in groups:
         if isinstance(item, LinkAttrs):
             print("res:", item.path, "link:", item.link)
Example #22
 def testGetBestProfileBatch(self):
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100, profile_check=10, pass_threshold=0.9, res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
Example #23
 def __init__(self, ref: str, creation_strategy: str, save_dir="", file_limit=1000000,
              table_name="TEMP", write_ahead_mode=True):
     if len(save_dir) == 0:
         default_dir = get_db_buffer_default_dir()
     else:
         default_dir = save_dir
     self._table_name = table_name
     self._write_ahead_mode = write_ahead_mode
     FileHandler.create_file_if_not_exist(default_dir)
     self.filename = default_dir + ref
     # file_exist = os.path.exists(self.filename)
     self.db = sqlite3.connect(self.filename, timeout=10)
     self.cur = self.db.cursor()
     #self.cur.execute("PRAGMA journal_mode = MEMORY")
     #if not file_exist:
     if self._write_ahead_mode:
         self.cur.execute("PRAGMA journal_mode = WAL;")
         self.cur.execute("PRAGMA synchronous = OFF;")
     self.exclusive_access_file_limit = file_limit
     # cannot ensure uniqueness of data in multithread access
     #self.cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
     self.cur.execute(creation_strategy)
     self.db.commit()
Example #24
    def testCssParse(self):
        css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")

        section = css_text.split("}")
        groups = []
        parse_str_sp = functools.partial(parse_str, groups, 1)
        result = ""
        for sec in section:
            sec += "}"
            temp = re.sub(css_link_pattern, parse_str_sp, sec)
            result += temp
        for item in groups:
            print(item)

        print(result)
Example #25
    def __init__(self, file_name,  worker: ExternalTempInterface, stop_event: Event, buf_size=200, output_f=1000,
                 dir_path="",  table_name="temp", convert_input=True, convert_output=True, terminate_callback=None):
        """

        :param file_name:
        :param worker:
        :param stop_event:
        :param buf_size:
        :param dir_path:
        :param table_name:
        :param convert_input:
        :param convert_output: convert output to OnSiteLink by default, else return raw tuple data.
        :return:
        """
        self._file_name = file_name
        if len(dir_path) > 0:
            self._file_dir = dir_path
        else:
            self._file_dir = get_temp_db_dir()
        self._file_path = self._file_dir + self._file_name
        PrintLogger.print("ExternalTempDataDiskBuffer create path in init: " + self._file_path)
        FileHandler.create_file_if_not_exist(self._file_path)
        self.stop_event = stop_event
        self._tab = table_name
        self._worker = worker
        self._get_lock = threading.RLock()
        self._put_lock = threading.RLock()
        self._convert_input = convert_input
        self._convert_output = convert_output
        FileBuffInterface.__init__(self, self._file_name, buf_size, output_f=output_f, power_save_mode=True,
                                   terminate_callback=terminate_callback)
        self.set_db_update_interval(10)

        self._is_reading = Event()
        self._need_to_vaccum = Event()
        self._total_record = self.count_all()
Example #26
 def _write_to_power_save_db(self) -> bool:
     data = self.get_state_for_power_save_mode()
     if isinstance(data, Serializable):
         FileHandler.create_file_if_not_exist(self._recovery_file_path)
         try:
             db = sqlite3.connect(self._recovery_file_path)
             cur = db.cursor()
             cur.execute(
                 "CREATE TABLE IF NOT EXISTS STATE_TAB(STATE TEXT UNIQUE, STATE_V TEXT);"
             )
             data_converted = data.get_serializable_json()
             cur.execute(
                 "INSERT OR REPLACE INTO STATE_TAB (STATE, STATE_V) VALUES ( ?, ?);",
                 ("state", data_converted))
             db.commit()
             db.close()
             return True
         except Exception as ex:
             ErrorLogger.log_error(
                 "FileBuffInterface", ex,
                 "_write_to_power_save_db() " + self._recovery_file_path)
             return False
     else:
         return False
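
The matching recovery read is not shown in this excerpt; a hypothetical counterpart would pull the single state row back out:

    import sqlite3

    db = sqlite3.connect(recovery_file_path)  # same path the writer used
    cur = db.cursor()
    row = cur.execute("SELECT STATE_V FROM STATE_TAB WHERE STATE = ?;",
                      ("state",)).fetchone()
    db.close()
    state_json = row[0] if row is not None else None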
Example #27
    def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=[],
                                 proxies=None, country_code="us", min_delay=2, max_delay=5, offset=120):
        keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"+niche.replace('/', '-')+".txt"

        def save_callback(keywords: list):
            FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

        if len(keyword_init) == 0:
            keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
            for item in keyword_init:
                print(item)
            print("total keywords:", len(keyword_init))
        if proxies is None:
            proxy_site = BuyProxyOrg(buy_proxy_org_account)
            proxies = proxy_site.get_proxies(timeout=5)
        current_level = 0
        keywords_pool = keyword_init
        while current_level < level:
            keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies, country_code=country_code,
                                                       min_delay=min_delay, max_delay=max_delay, callback=save_callback)
            keywords_pool += keyword_init
            current_level += 1
        FileHandler.remove_file_if_exist(keyword_log_path)
        FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")
Example #28
 def testGetBestProfileBatch(self):
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain,
                                                   thread_size=100,
                                                   profile_check=10,
                                                   pass_threshold=0.9,
                                                   res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
Example #29
    def testCssParse(self):
        css_text = FileHandler.read_all_from_file(
            "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")

        section = css_text.split("}")
        groups = []
        parse_str_sp = functools.partial(parse_str, groups, 1)
        result = ""
        for sec in section:
            sec += "}"
            temp = re.sub(css_link_pattern, parse_str_sp, sec)
            result += temp
        for item in groups:
            print(item)

        print(result)
Example #30
    def testMsgGen(self):
        email_template_path = "D:/Test/email_content_template.txt"
        email_content_save_path = "D:/Test/email_content_saved.txt"
        email_lines_before_table_path = "D:/Test/email_text_before_table.txt"
        email_lines_after_table_path = "D:/Test/email_text_after_table.txt"
        data_file_path = "D:/Test/data_sample.csv"
        # th for head cell, td for data cell
        email_template = FileHandler.read_all_from_file(email_template_path)
        cell_item_template = '<{0:s} style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                             'border-box;padding: 8px;text-align: left;line-height: 1.42857143;vertical-align: ' \
                             'bottom;border-top: 1px solid #ddd;border-bottom: 2px solid #ddd;border: 1px solid ' \
                             '#ddd!important;border-bottom-width: 2px;background-color: #fff!important;">' \
                             '{1:s}</{0:s}>'
        row_item_template = '<tr style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing:' \
                            ' border-box;page-break-inside: avoid;">{0:s}</tr>'
        line_format = '<p style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                      'border-box;orphans: 3;widows: 3;margin: 0 0 10px;">{0:s}</p><br>'
        before_table_lines = FileHandler.read_lines_from_file(
            email_lines_before_table_path, remove_blank_line=False)
        after_table_lines = FileHandler.read_lines_from_file(
            email_lines_after_table_path, remove_blank_line=False)

        before_table_str = "".join(
            [line_format.format(x, ) for x in before_table_lines])
        after_table_str = "".join(
            [line_format.format(x, ) for x in after_table_lines])
        table_cells_str = ""
        with open(data_file_path, mode='r', newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            header = next(reader)
            header_row_str = row_item_template.format("".join(
                [cell_item_template.format(
                    "th",
                    x,
                ) for x in header]))
            for row in reader:
                table_cells_str += row_item_template.format("".join(
                    [cell_item_template.format(
                        "td",
                        x,
                    ) for x in row]))

        email_content = email_template.format(before_table_str, 50,
                                              header_row_str, table_cells_str,
                                              after_table_str)
        FileHandler.remove_file_if_exist(email_content_save_path)
        FileHandler.append_line_to_file(email_content_save_path, email_content)
        return email_content
Example #31
 def testScrapePageBatch(self):
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
         explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                    external_stop_event=stop_event,
                                    download_base_dir=FilePath.get_default_archive_dir(), max_thread=10, max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
Example #32
 def delete_proxy_file(self):
     FileHandler.remove_file_if_exist(self._file_path)
Example #33
 def remove_power_save_db(self):
     FileHandler.remove_file_if_exist(self._recovery_file_path)
Example #34
 def save_callback(keywords: list):
     FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")