def init_urls_to_parse(path_to_raw_file):
    """Ask for the project name and populate the global URL work list.

    The project name is read interactively and stored in ``PROJECT``.
    ``URLS_TO_PARSE`` is taken from ``URLS_FILE`` when that yields a
    non-empty result; otherwise it is built by ``create_urls_to_parse``
    from *path_to_raw_file*.

    Args:
        path_to_raw_file: Passed through to ``create_urls_to_parse`` when
            ``URLS_FILE`` provides nothing.
    """
    global URLS_TO_PARSE, PROJECT

    PROJECT = input("Введите название проекта: ")

    stored_urls = get_list(URLS_FILE, 'utf-8')
    # `or` only evaluates the fallback when the stored list is falsy.
    URLS_TO_PARSE = stored_urls or create_urls_to_parse(path_to_raw_file)
def _first_usable_email(emails):
    """Return the first entry of *emails* that is not excluded, or None."""
    return next((email for email in emails if not is_excluded(email)), None)


def get_current_email(infinite_loop=True):
    """Return the next e-mail address that is not excluded.

    The search resumes right after the address reported by
    ``get_last_used_email()``, or at the top of ``global_emails`` when that
    call returns nothing. Addresses for which ``is_excluded()`` is true are
    skipped.

    Args:
        infinite_loop: When true, running off the end of the list reloads
            ``global_emails`` from ``email_list_full_path`` and restarts
            the search at the first entry. When false, the exhaustion
            message is printed and the process exits.

    Returns:
        The selected e-mail address.

    Raises:
        ValueError: If the last used address is not in ``global_emails``.
        SystemExit: If no usable address is left.
    """
    global global_emails

    last_email = get_last_used_email()
    start = global_emails.index(last_email) + 1 if last_email else 0

    # Slicing past the end yields an empty list instead of raising, so
    # "no entries left" and "every remaining entry is excluded" are handled
    # the same way instead of the latter escaping as an IndexError.
    current_email = _first_usable_email(global_emails[start:])

    if current_email is None and infinite_loop:
        # Wrap around: restart from the first entry of a freshly loaded
        # list. Resuming from the last used address again would land past
        # the end once more.
        # NOTE(review): assumes get_last_used_email() keeps returning the
        # same address until a new one is recorded - confirm.
        global_emails = get_list(email_list_full_path)
        current_email = _first_usable_email(global_emails)

    if current_email is None:
        print("Емейлов не осталось.")
        # Raise SystemExit directly; the exit() helper is provided by the
        # site module for interactive sessions.
        raise SystemExit

    return current_email
def get_initial_info():
    """Load the URL-and-keys file and split it into its three parts.

    Returns:
        A ``(site, region, phrases)`` tuple as produced by
        ``separate_url_and_region_and_phrases``.
    """
    raw_entries = get_list(URL_AND_KEYS_FILE, encoding=READ_ENCODING)
    parsed = separate_url_and_region_and_phrases(raw_entries)
    # Unpack explicitly so exactly three parts are required.
    site, region, phrases = parsed
    return (site, region, phrases)
def initialize():
    """Read the keys file from the init directory.

    The loaded list is neither stored nor returned; the function always
    returns None.
    """
    keys_path = os.path.join(INIT_DIR, KEYS_FILE)
    get_list(keys_path, READ_ENCODING)
# Placeholders; both names are rebound below once the list files are read.
global_phrases = []
global_emails = []

# Relative path fragments, joined onto get_current_dir() further down.
PARSING_PATH_PARTICLE = "../CommerceParsing/"
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"

# NOTE(review): presumably the number of phrases handled per batch - not
# used anywhere in this excerpt, confirm against the rest of the file.
PHRASE_BUNCH_SIZE = 100

# Full paths of the word list and the e-mail list inside the init directory.
word_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "word_list.txt")
email_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "email_list.txt")

# File name handed to get_log_path() by get_phrase_counter_val().
INDEX_LOG_FILE = "index_log.txt"

# Encoding passed to get_list() for both list files.
ENC = 'utf-8'

# Module-level side effect: both lists are read when the module is imported.
global_phrases = get_list(word_list_full_path, ENC)
global_emails = get_list(email_list_full_path, ENC)


def get_log_path(log):
    """Return the path of file *log* inside the parsing "log" directory.

    The path is built from get_current_dir(), PARSING_PATH_PARTICLE, the
    literal directory name "log" and the given file name.
    """
    log_path = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE, "log", log)
    return log_path


# NOTE(review): this definition is cut off in the excerpt under review; the
# handler(s) of the `try` below and any return statement are not visible.
def get_phrase_counter_val():
    index_log_file = get_log_path(INDEX_LOG_FILE)
    try:
        with open(index_log_file, 'r') as f:
            # Only the first line of the index log is read.
            line = f.readline()
            if line:
                # int() raises ValueError if the line is not an integer.
                phrase_counter = int(line)
import urllib
from general.general import get_list, get_current_date, add_phrase_in_log
from time import sleep
from general.drv import get_driver
import random
import os

# Hard-coded absolute project root; the paths below are built from it.
BUKVARIX_DIR = "/home/michael/PycharmProjects/Bukvarix/"

# Evaluated once at import time; used to name the log file below.
current_date = get_current_date()

# Module-level side effect: the word list is read when the module is imported.
phrase_list = get_list(os.path.join(BUKVARIX_DIR, "Init/word_list.txt"))
log_file = os.path.join(BUKVARIX_DIR, "Log/{}.txt".format(current_date))

# NOTE(review): not referenced in this excerpt; pop_bunch() compares against
# a literal 100 instead - confirm whether it should use this constant.
BUNCH_SIZE = 100
import requests


def pop_bunch():
    """Remove up to 100 phrases from the front of ``phrase_list``.

    Each removed phrase is appended, newline-terminated, to one
    accumulated string. The global list is consumed in place.
    """
    # NOTE(review): no return statement is visible in this excerpt; the
    # definition may continue beyond it.
    global phrase_list
    counter = 0
    phrase_bunch = ""
    # Stops when the list is empty or 100 phrases have been taken.
    while phrase_list and counter < 100:
        current_phrase = "{}\n".format(phrase_list.pop(0))
        phrase_bunch += current_phrase
        counter += 1
def get_phrases():
    """Return the phrases listed in the YandexParsing init file.

    The file is ``../YandexParsing/init/phrases.txt`` relative to
    ``PROJECT_DIR`` and is read with ``READ_ENCODING``.
    """
    phrases_file = os.path.join(
        PROJECT_DIR, "../YandexParsing/init/phrases.txt")
    return get_list(phrases_file, READ_ENCODING)
from general.general import get_list, get_current_dir
from general.drv import get_driver
from general.drv import USE_PROXY
import os
from selenium.common.exceptions import NoSuchElementException
from general.drv import send_proxy_to_black_list

# Proxy list location, relative to the directory from get_current_dir().
full_path_to_proxy_list = os.path.join(
    get_current_dir(), "../CommerceParsing/Init/proxy_list.txt")
# Module-level side effect: the proxy list is read at import time.
init_proxy_list = get_list(full_path_to_proxy_list)


def test_proxies():
    """Open ip-api.com once per listed proxy and print the reported data.

    For every entry of ``init_proxy_list`` a fresh driver is created, the
    text of the element with id "qr" is split on commas, and the text of
    the "Country" table cell is read. When that cell is missing, the proxy
    is sent to the black list and the region is printed as ``None``.

    Raises:
        NoSuchElementException: If the element with id "qr" is missing.
    """
    # NOTE(review): get_driver() takes no argument here, so pairing the
    # n-th driver with the n-th list entry assumes proxies are handed out
    # in list order - confirm against general.drv.
    for a_proxy in init_proxy_list:
        driver = get_driver()
        try:
            driver.get("http://ip-api.com/")
            ip = driver.find_element_by_id("qr").text.split(",")
            try:
                region = driver.find_element_by_xpath(
                    '//th[text()="Country"]/following-sibling::td').text
            except NoSuchElementException:
                send_proxy_to_black_list(a_proxy)
                # Keep the name bound so the report below neither fails
                # nor repeats the previous proxy's region.
                region = None
        finally:
            # Close the browser even when the page lookup raises.
            driver.quit()
        print("{}, {}".format(ip, region))
def keyword_stuffing():
    """Run the keyword-stuffing pass driven by ``init.txt``.

    The whole file content is handed to ``init``; every entry from the
    third one onward is then handed to ``parse_all``.
    """
    init_file = os.path.join(INIT_DIR, "init.txt")
    init_lines = get_list(init_file, READ_ENCODING)

    init(init_lines)
    # Slice only after init() has run, matching the original order.
    parse_all(init_lines[2:])