def __init__(self, ref: str, creation_strategy: str, save_dir="", file_limit=1000000,
             table_name="TEMP", write_ahead_mode=True):
    if len(save_dir) == 0:
        default_dir = get_db_buffer_default_dir()
    else:
        default_dir = save_dir
    self._table_name = table_name
    self._write_ahead_mode = write_ahead_mode
    FileHandler.create_file_if_not_exist(default_dir)
    self.filename = default_dir + ref
    # file_exist = os.path.exists(self.filename)
    self.db = sqlite3.connect(self.filename, timeout=10)
    self.cur = self.db.cursor()
    # self.cur.execute("PRAGMA journal_mode = MEMORY")
    # if not file_exist:
    if self._write_ahead_mode:
        self.cur.execute("PRAGMA journal_mode = WAL;")
        self.cur.execute("PRAGMA synchronous = OFF;")
    self.exclusive_access_file_limit = file_limit
    # cannot ensure uniqueness of data in multi-threaded access
    # self.cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
    self.cur.execute(creation_strategy)
    self.db.commit()
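# Usage sketch (not from the original source): the creation_strategy argument is the
# SQL statement that creates the working table, e.g. the one shown commented out
# above; "SqliteDiskBuffer" is a hypothetical name for the class that owns this __init__.
creation_sql = ("CREATE TABLE IF NOT EXISTS TEMP "
                "(LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
buffer = SqliteDiskBuffer("links_temp.db", creation_sql, table_name="TEMP", write_ahead_mode=True)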
def __init__(self, interval: int, ref: ProgressLogInterface, stop_event: Event):
    """
    Log progress for a long-running method.
    :param interval: logging period in seconds
    :param ref: the reference object that invoked logging
    :param stop_event: event used to stop logging
    :return:
    """
    threading.Thread.__init__(self)
    self._interval = interval
    self._ref = ref
    self._stop_event = stop_event
    self.begin_time = int(time.time())
    self._ref_time = self.begin_time
    self._path = get_log_dir() + "Progress/"
    temp = ref.get_file_name()
    if len(temp) > 200:
        filename = temp[0:199]
    else:
        filename = temp
    if not filename.endswith(".csv"):
        filename += ".csv"
    self._file_path = self._path + filename
    FileHandler.create_file_if_not_exist(self._file_path)
    self._limit = ref.get_limit()
    self.limit_counter = 0
def testEmailLogin(self):
    # Send the message via local SMTP server using OAuth2.
    from Email.SMTP import SMTP
    import httplib2
    from Email.Utility.Oauth2 import CustomOAuth2Credentials
    me = "*****@*****.**"
    you = "*****@*****.**"
    msg = get_msg(me, you)
    http = httplib2.Http()
    credentials = OAuth2Credentials.from_json(FileHandler.read_all_from_file(credentials_local_path))
    # scopes = credentials.retrieve_scopes(http)
    # for item in scopes:
    #     print(item)
    if credentials.access_token_expired:
        # http = credentials.authorize(http)
        credentials.refresh(http)
        jsoned = credentials.to_json()
        FileHandler.remove_file_if_exist(credentials_local_path)
        FileHandler.append_line_to_file(credentials_local_path, str(jsoned))
    auth_str = GenerateOAuth2String(me, access_token=credentials.access_token)
    s = SMTP(**gmail_provider)
    s.set_debuglevel(debuglevel=4)
    s.ehlo()
    s.starttls()
    s.authenticate_oauth2(auth_str)
    s.sendmail(me, you, msg.as_string())
    s.quit()
def add_proxies(self, proxies: []):
    if proxies is not None:
        converted = []
        for proxy in proxies:
            if isinstance(proxy, ProxyStruct):
                converted.append((proxy.addr, proxy.port, proxy.alt_port, proxy.user_name, proxy.psd))
        FileHandler.create_file_if_not_exist(self._file_path)
        CsvLogger.log_to_file_path(self._file_path, converted)
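# Usage sketch (assumption: ProxyStruct exposes the attributes read above --
# addr, port, alt_port, user_name, psd -- via its constructor; "proxy_store" is a
# hypothetical instance of the class that owns add_proxies).
proxy_store.add_proxies([
    ProxyStruct(addr="10.0.0.1", port=8080, alt_port=8081, user_name="user", psd="secret"),
])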
def testGmailAuthStep2(self):
    code = "4/zZRbhzmhulAsl6pasBMqmuOv5PCsdRuITTxyAWLkJOI#"
    credentials = flow.step2_exchange(code)
    access_token = credentials.access_token
    refresh_token = credentials.refresh_token
    print("access_token:", access_token, " refresh_token:", refresh_token)
    jsoned = credentials.to_json()
    FileHandler.remove_file_if_exist(credentials_local_path)
    FileHandler.append_line_to_file(credentials_local_path, str(jsoned))
    print(jsoned)
def run(self):
    FileHandler.create_file_if_not_exist(self._file_path)
    cols = ["Index", "Time/Min"] + self._ref.get_column_names()
    self._append(cols)
    while not self._stop_event.is_set() and self.limit_counter < self._limit:
        current_time = int(time.time())
        gap = current_time - self._ref_time
        if gap >= self._interval:
            self._ref_time = current_time
            self.report_progress()
        time.sleep(1)
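# Usage sketch (assumptions: the thread class defined by the two methods above is
# referred to here as "ProgressLogger", and "my_task" implements ProgressLogInterface
# with get_file_name(), get_limit() and get_column_names() as used above).
stop = Event()
progress_logger = ProgressLogger(interval=60, ref=my_task, stop_event=stop)
progress_logger.start()   # appends a CSV row roughly every `interval` seconds
# ... long-running work happens here ...
stop.set()                # ask the logger to stop
progress_logger.join()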
def __init__(self, file_dir: str = "", file_name="UserAccounts.db"):
    if len(file_dir) == 0:
        file_dir = get_temp_db_dir()
    FileHandler.create_file_if_not_exist(file_dir)
    self._file_name = file_name
    file_path = file_dir + self._file_name
    self.db = sqlite3.connect(file_path)
    self.cur = self.db.cursor()
    self.cur.execute("CREATE TABLE IF NOT EXISTS ACCOUNTS(TYPE INTEGER, USER_ID TEXT, PSD TEXT,"
                     " LINK TEXT, ACCESS_ID TEXT, API_KEY TEXT, PROXY TEXT);")
    self.db.commit()
def __init__(self, file_name, worker: ExternalTempInterface, stop_event: Event, buf_size=200,
             output_f=1000, dir_path="", table_name="temp", convert_input=True,
             convert_output=True, terminate_callback=None):
    """
    :param file_name: name of the backing database file
    :param worker: object implementing ExternalTempInterface
    :param stop_event: event used to stop the buffer
    :param buf_size: in-memory buffer size before flushing to disk
    :param dir_path: directory of the database file; defaults to the temp db dir
    :param table_name: name of the buffering table
    :param convert_input:
    :param convert_output: convert output to OnSiteLink by default, else return raw tuple data.
    :return:
    """
    self._file_name = file_name
    if len(dir_path) > 0:
        self._file_dir = dir_path
    else:
        self._file_dir = get_temp_db_dir()
    self._file_path = self._file_dir + self._file_name
    PrintLogger.print("ExternalTempDataDiskBuffer create path in init: " + self._file_path)
    FileHandler.create_file_if_not_exist(self._file_path)
    self.stop_event = stop_event
    self._tab = table_name
    self._worker = worker
    self._get_lock = threading.RLock()
    self._put_lock = threading.RLock()
    self._convert_input = convert_input
    self._convert_output = convert_output
    FileBuffInterface.__init__(self, self._file_name, buf_size, output_f=output_f,
                               power_save_mode=True, terminate_callback=terminate_callback)
    self.set_db_update_interval(10)
    self._is_reading = Event()
    self._need_to_vaccum = Event()
    self._total_record = self.count_all()
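# Usage sketch (assumptions: "my_worker" implements ExternalTempInterface and Event is
# imported from threading; only constructor arguments shown above are used).
stop_event = Event()
disk_buffer = ExternalTempDataDiskBuffer("crawl_temp.db", my_worker, stop_event,
                                         buf_size=200, table_name="temp")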
def log_to_file_path(file_path: str, rows: [()]):
    if len(rows) > 0:
        try:
            path = file_path
            if not path.endswith(".csv"):
                path += ".csv"
            FileHandler.create_file_if_not_exist(path)
            with open(path, mode="a", newline="") as csv_file:
                wr = csv.writer(csv_file, delimiter=",")
                for row in rows:
                    wr.writerow(row)
        except Exception as ex:
            ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")
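# Usage sketch: rows are tuples appended as CSV lines, and the ".csv" extension is
# added when missing (mirrors calls elsewhere in this codebase such as
# CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])). The path and row
# values below are hypothetical.
CsvLogger.log_to_file_path("/tmp/scan_results", [("http://example.com", 200, 1)])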
def get_msg(me, you):
    # me  == my email address
    # you == recipient's email address
    html_file_path = "D:/Test/email_content_saved.txt"
    text_file_path = "D:/Test/email_text.txt"
    # Create message container - the correct MIME type is multipart/alternative.
    msg = MIMEMultipart('alternative')
    msg['Subject'] = "100+ HIGH TF/CF/DA EXPIRED DOMAINS TO BUY ONLY $10 EACH"
    msg['From'] = me
    msg['To'] = you
    # Create the body of the message (a plain-text and an HTML version).
    text = ""
    html = FileHandler.read_all_from_file(html_file_path, 't')
    # Record the MIME types of both parts - text/plain and text/html.
    part1 = MIMEText(text, 'plain')
    part2 = MIMEText(html, 'html')
    # Attach parts into message container.
    # According to RFC 2046, the last part of a multipart message, in this case
    # the HTML message, is best and preferred.
    msg.attach(part1)
    msg.attach(part2)
    return msg
def testScrapePageBatch(self):
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
        # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
        stop_event = multiprocessing.Event()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        path = "/index.html"
        link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path,
                           res_type=LinkUtility.EXT_WEBPAGE, level=0)
        explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                   external_stop_event=stop_event,
                                   download_base_dir=FilePath.get_default_archive_dir(),
                                   max_thread=10, max_level=2)
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def testGetBlogs(self):
    niche = "Society/Law"
    proxy_site = BuyProxyOrg(buy_proxy_org_account)
    proxies = proxy_site.get_proxies(timeout=5)
    keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                       + niche.replace('/', '-') + ".txt"
    # countries = GoogleUtility.CountryCodeEnglish
    countries = ["uk", ]
    min_delay = 2
    max_delay = 5
    max_page = 2
    days_ago = 4 * 365
    target_keywords_init = ["legal case", "Labour law", "human rights law", "crime law", "Immigration law",
                            "Family law", "Transactional law", "Company law", "Commercial law",
                            "Admiralty law", "Intellectual property law", "international law", "tax law",
                            "banking law", "competition law", "consumer law", "environmental law"]
    suggested_keywords = []
    for country in countries:
        # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
        #                                             country_code=country,
        #                                             min_delay=min_delay, max_delay=max_delay)
        temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
        # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
        # suggested_keywords += temp_keywords
        crawl_keywords = [x for x in list(set(target_keywords_init + temp_keywords))]
        self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page, index=0,
                                        length=100, country_code=country,
                                        source_type=GoogleConst.SourceTypeBlog,
                                        min_delay=min_delay, max_delay=max_delay, days_ago=days_ago,
                                        proxies=proxies, use_browser=False)
def testRe(self):
    css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
    test_s = "if('undefined' === typeof wwhomepage) var wwhomepage = {}; wwhomepage.customPromoHeaders = {\" /web/20130415001342/http://www.bbc.co.uk\/news\/magazine-22094279\":"
    match = re.search(link_pattern, css_text)
    if match is not None:
        groups = match.group()  # only dereference the match after confirming one exists
        for i in match.groups(0):
            print(i)
def log_error(ref: str, error: Exception, additional: str = ""):
    path = get_log_dir() + ErrorLogger.FILE_NAME
    try:
        FileHandler.create_file_if_not_exist(path)
        lines = []
        lines.append(ref)
        lines.append("{0:d} {1:s}".format(ErrorLogger.Counter, str(datetime.datetime.now(tz=pytz.utc))))
        lines.append(str(error))
        if len(additional) > 0:
            lines.append(additional)
        with open(path, mode="a", newline="") as csv_file:
            wr = csv.writer(csv_file, delimiter=",")
            wr.writerow(lines)
        # lines.append("")
        # FileHandler.append_lines_to_file(path, lines)
        ErrorLogger.Counter += 1
    except:
        pass
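# Usage sketch, mirroring how CsvLogger calls ErrorLogger above: log a caught
# exception together with the caller name and a note identifying the failing method.
try:
    risky_io_operation()  # hypothetical call that may raise
except Exception as ex:
    ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")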
def testCss2Parse(self):
    css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
    groups = []
    parse_str_sp = functools.partial(parse_str, groups, 1)
    temp = re.sub(link_pattern, parse_str_sp, css_text)
    # for item in groups:
    #     print(item)
    print("captured total: ", len(groups))
    for item in groups:
        if isinstance(item, LinkAttrs):
            print("res:", item.path, "link:", item.link)
def testGetBestProfileBatch(self):
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
    domains = FileHandler.read_lines_from_file(file_path)
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    for domain in domains:
        print("begin domain:", domain)
        try:
            archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                  profile_check=10, pass_threshold=0.9, res_limit=2000)
            CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
        except Exception as ex:
            print(ex)
def testCssParse(self):
    css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
    section = css_text.split("}")
    groups = []
    parse_str_sp = functools.partial(parse_str, groups, 1)
    result = ""
    for sec in section:
        sec += "}"
        temp = re.sub(css_link_pattern, parse_str_sp, sec)
        result += temp
    for item in groups:
        print(item)
    print(result)
def _write_to_power_save_db(self) -> bool:
    data = self.get_state_for_power_save_mode()
    if isinstance(data, Serializable):
        FileHandler.create_file_if_not_exist(self._recovery_file_path)
        try:
            db = sqlite3.connect(self._recovery_file_path)
            cur = db.cursor()
            cur.execute("CREATE TABLE IF NOT EXISTS STATE_TAB(STATE TEXT UNIQUE, STATE_V TEXT);")
            data_converted = data.get_serializable_json()
            cur.execute("INSERT OR REPLACE INTO STATE_TAB (STATE, STATE_V) VALUES (?, ?);",
                        ("state", data_converted))
            db.commit()
            db.close()
            return True
        except Exception as ex:
            ErrorLogger.log_error("FileBuffInterface", ex,
                                  "_write_to_power_save_db() " + self._recovery_file_path)
            return False
    else:
        return False
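# Recovery sketch (assumption, not part of the original source): the persisted state
# can be read back by selecting the single "state" row from the STATE_TAB table
# written above; returns the JSON string, or None when nothing was saved.
def _read_from_power_save_db(recovery_file_path: str):
    db = sqlite3.connect(recovery_file_path)
    try:
        cur = db.cursor()
        cur.execute("SELECT STATE_V FROM STATE_TAB WHERE STATE = ?;", ("state",))
        row = cur.fetchone()
        return row[0] if row is not None else None
    finally:
        db.close()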
def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=[], proxies=None,
                             country_code="us", min_delay=2, max_delay=5, offset=120):
    keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/" \
                       + niche.replace('/', '-') + ".txt"

    def save_callback(keywords: list):
        FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

    if len(keyword_init) == 0:
        keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
    for item in keyword_init:
        print(item)
    print("total keywords:", len(keyword_init))
    if proxies is None:
        proxy_site = BuyProxyOrg(buy_proxy_org_account)
        proxies = proxy_site.get_proxies(timeout=5)
    current_level = 0
    keywords_pool = keyword_init
    while current_level < level:
        keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies, country_code=country_code,
                                                   min_delay=min_delay, max_delay=max_delay,
                                                   callback=save_callback)
        keywords_pool += keyword_init
        current_level += 1
    FileHandler.remove_file_if_exist(keyword_log_path)
    FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")
def testMsgGen(self):
    email_template_path = "D:/Test/email_content_template.txt"
    email_content_save_path = "D:/Test/email_content_saved.txt"
    email_lines_before_table_path = "D:/Test/email_text_before_table.txt"
    email_lines_after_table_path = "D:/Test/email_text_after_table.txt"
    data_file_path = "D:/Test/data_sample.csv"
    # th for head cell, td for data cell
    email_template = FileHandler.read_all_from_file(email_template_path)
    cell_item_template = '<{0:s} style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                         'border-box;padding: 8px;text-align: left;line-height: 1.42857143;vertical-align: ' \
                         'bottom;border-top: 1px solid #ddd;border-bottom: 2px solid #ddd;border: 1px solid ' \
                         '#ddd!important;border-bottom-width: 2px;background-color: #fff!important;">' \
                         '{1:s}</{0:s}>'
    row_item_template = '<tr style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing:' \
                        ' border-box;page-break-inside: avoid;">{0:s}</tr>'
    line_format = '<p style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                  'border-box;orphans: 3;widows: 3;margin: 0 0 10px;">{0:s}</p><br>'
    before_table_lines = FileHandler.read_lines_from_file(email_lines_before_table_path, remove_blank_line=False)
    after_table_lines = FileHandler.read_lines_from_file(email_lines_after_table_path, remove_blank_line=False)
    before_table_str = "".join([line_format.format(x) for x in before_table_lines])
    after_table_str = "".join([line_format.format(x) for x in after_table_lines])
    table_cells_str = ""
    with open(data_file_path, mode='r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        header = next(reader)
        header_row_str = row_item_template.format("".join([cell_item_template.format("th", x) for x in header]))
        for row in reader:
            table_cells_str += row_item_template.format("".join([cell_item_template.format("td", x) for x in row]))
    email_content = email_template.format(before_table_str, 50, header_row_str, table_cells_str, after_table_str)
    FileHandler.remove_file_if_exist(email_content_save_path)
    FileHandler.append_line_to_file(email_content_save_path, email_content)
    return email_content
def delete_proxy_file(self):
    FileHandler.remove_file_if_exist(self._file_path)
def remove_power_save_db(self):
    FileHandler.remove_file_if_exist(self._recovery_file_path)
def save_callback(keywords: list):
    FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")