def is_excluded(email):
    source_file = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "excluded_emails.txt")
    with open(source_file, "r") as f:
        excluded_emails = [line.rstrip('\n') for line in f]
    return email in excluded_emails
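# A minimal usage sketch (assumes an Init/excluded_emails.txt with one
# address per line, as the helper above expects; the address itself is
# hypothetical):
#
# if not is_excluded("user@example.com"):
#     ...  # safe to log in under this address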
def parse_phrase_bunch(phrases):
    global phrase_counter
    logs_dir = os.path.join(get_current_dir(), "log")
    # clear_files(logs_dir)
    driver = None  # Keep a reference so the except block can quit safely even if get_driver() fails.
    try:
        driver = get_driver()
        used_email = handle_login(driver)
        driver.get("https://tools.pixelplus.ru/tools/geo")
        email_log = get_log_path("email_log.txt")
        # Arguments: phrase, write_mode, enc, full_path_to_file.
        write_phrase_to_log(used_email, 'a', ENC, email_log)
        handle_phrases(phrases, driver)
        index_log = get_log_path("index_log.txt")
        write_phrase_to_log(phrase_counter, 'w', ENC, index_log)
        phrase_counter += PHRASE_BUNCH_SIZE
    except Exception as e:
        print("Problem ^^^")
        print(e)
        if driver is not None:
            driver.quit()
        parse_phrase_bunch(phrases)  # Retry the same bunch with a fresh driver.
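# The recursion above retries without a depth limit, so a persistently
# failing bunch will eventually exhaust Python's recursion limit. A bounded
# iterative sketch; parse_phrase_bunch_once is a hypothetical variant of the
# handler body that raises instead of retrying itself:
def parse_with_retries(phrases, max_retries=5):
    for attempt in range(max_retries):
        try:
            parse_phrase_bunch_once(phrases)  # hypothetical raising variant
            return
        except Exception as e:
            print("Problem ^^^ (attempt {} of {})".format(attempt + 1, max_retries))
            print(e)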
def write_table_open_tag(site, region):
    global RESULT_FILE
    RESULT_FILE = os.path.join(
        get_current_dir(), PARSING_PATH_PARTICLE,
        'Result/{domain}_{region}_result.html'.format(domain=site, region=region))
    write_phrase_to_log("<html>\n<table>\n", write_mode='w',
                        enc=WRITE_ENCODING, full_path_to_file=RESULT_FILE)
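# A complementary sketch (hypothetical helper, not in the original): close
# the tags opened above by appending to the same RESULT_FILE with the same
# write_phrase_to_log signature.
def write_table_close_tag():
    write_phrase_to_log("</table>\n</html>\n", write_mode='a',
                        enc=WRITE_ENCODING, full_path_to_file=RESULT_FILE)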
from general.drv import get_driver, get_proxy
from general.general import get_current_dir, write_list_to_file, write_phrase_to_log, get_list
import os
import requests
from requests.exceptions import ProxyError
from random import randint
from time import sleep
from bs4 import BeautifulSoup

# Regions: https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/
USE_SLEEP_TIME = True  # Always enable for real parsing; disable the delay only for debugging.
PARSING_PATH_PARTICLE = "../KeywordStuffing/"
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"
URL_AND_KEYS_FILE = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "init.csv")
READ_ENCODING = 'utf-8'
WRITE_ENCODING = 'utf-8'
SIZE_OF_CHUNK = 10
ARSENKIN = 'https://arsenkin.ru/tools/filter/'
RESULT_FILE = ""  # Initialized in write_table_open_tag.
PARSE_RUSSIA = False
PARSE_MOSCOW = False
# Must be a string, not a number!
MOSCOW_REGION = '1'  # Including the surrounding oblast.
RUSSIA_REGION = '225'


def separate_url_and_region_and_phrases(url_phrases_list):
    url = url_phrases_list[0].strip()
def get_log_path(log):
    log_path = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE, "log", log)
    return log_path
import os
from selenium.webdriver.common.by import By
from general.drv import get_driver, USE_PROXY
from general.general import get_current_dir, get_list, clear_files, write_phrase_to_log

global_phrases = []
global_emails = []
PARSING_PATH_PARTICLE = "../CommerceParsing/"
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"
PHRASE_BUNCH_SIZE = 100
word_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "word_list.txt")
email_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "email_list.txt")
INDEX_LOG_FILE = "index_log.txt"
ENC = 'utf-8'
global_phrases = get_list(word_list_full_path, ENC)
global_emails = get_list(email_list_full_path, ENC)


def get_log_path(log):
    log_path = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE, "log", log)
    return log_path


def get_phrase_counter_val():
    index_log_file = get_log_path(INDEX_LOG_FILE)
    try:
        # Assumed continuation of the truncated original: resume from the
        # counter that parse_phrase_bunch writes to the index log; a missing
        # or malformed log means parsing starts from the beginning.
        with open(index_log_file, encoding=ENC) as f:
            return int(f.read().strip())
    except (FileNotFoundError, ValueError):
        return 0
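# A minimal resume-loop sketch: pick up the saved counter and walk the phrase
# list in PHRASE_BUNCH_SIZE slices, mirroring how parse_phrase_bunch advances
# phrase_counter on success.
def run_all_bunches():
    global phrase_counter
    phrase_counter = get_phrase_counter_val()
    while phrase_counter < len(global_phrases):
        bunch = global_phrases[phrase_counter:phrase_counter + PHRASE_BUNCH_SIZE]
        parse_phrase_bunch(bunch)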
import os
from selenium.webdriver.common.keys import Keys
from general.general import get_current_dir, get_list, clear_files, write_list_to_file, write_phrase_to_log
from general.drv import get_driver, USE_PROXY
from time import sleep

SELECTED_REGION = 1  # https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/
PROJECT_DIR = get_current_dir()
PAGES_TO_PARSE = 3
LOGS_DIR = os.path.join(get_current_dir(), "../YandexParsing/log")
READ_ENCODING = 'utf-8'
WRITE_ENCODING = 'utf-8'
RESULT_FILE = os.path.join(LOGS_DIR, "{}_result.csv".format(SELECTED_REGION))
PARSE_RELATED_WORDS = False


def get_region():
    # https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/
    regions = {225: "Россия", 1: "Москва"}
    return regions.get(SELECTED_REGION)


def get_phrases():
    # The original bound the full file path to a *_dir variable and then
    # wrapped it in a redundant os.path.join; one clearly named path suffices.
    yandex_parsing_init_file = os.path.join(
        PROJECT_DIR, "../YandexParsing/init/phrases.txt")
    phrases = get_list(yandex_parsing_init_file, READ_ENCODING)
    return phrases


phrases = get_phrases()
from general.general import get_list, get_current_dir
from general.drv import get_driver, USE_PROXY, send_proxy_to_black_list
import os
from selenium.common.exceptions import NoSuchElementException

full_path_to_proxy_list = os.path.join(
    get_current_dir(), "../CommerceParsing/Init/proxy_list.txt")
init_proxy_list = get_list(full_path_to_proxy_list)


def test_proxies():
    # Iterate over the proxies themselves (the original looped over an unused
    # index and referenced an undefined a_proxy); get_driver() is expected to
    # rotate through the same list internally.
    for a_proxy in init_proxy_list:
        driver = get_driver()
        driver.get("http://ip-api.com/")
        element = driver.find_element_by_id("qr")
        element_text = element.text
        ip = element_text.split(",")
        try:
            region = driver.find_element_by_xpath(
                '//th[text()="Country"]/following-sibling::td').text
        except NoSuchElementException:
            # The country cell is missing, so the proxy is unusable.
            send_proxy_to_black_list(a_proxy)
            driver.quit()
            continue
        print("{}, {}".format(ip, region))
        driver.quit()
from itertools import product
import os
from general.general import get_current_dir, write_phrase_to_log, clear_files
import sys
import re

PATH_PARTICLE = "../SeoCombinator/"
INIT_PATH_DIR = PATH_PARTICLE + "Init/"
RESULT_PATH_DIR = os.path.join(PATH_PARTICLE, "Result")
RESULT_FILE_PATH = os.path.join(RESULT_PATH_DIR, "result.txt")
FILE_ENCODING = 'windows-1251'
INIT_DIR = os.path.join(get_current_dir(), INIT_PATH_DIR)
PLUS_WORDS = []
MINUS_WORDS = []
PHRASES = []
VARIANTS = []
WORDSTAT_LIMIT = 7  # Wordstat parses at most 7 words per phrase.


def handle_csv_with_one_list(csv_list_from_reader):
    # Files with the shared plus- and minus-words contain only a single row.
    tmp_list = csv_list_from_reader[0]
    return [*filter(None, tmp_list)]


def delete_empty_values(a_list):
    # The zeroth element of the phrases list holds the flag for "ladder"
    # (stepwise) parsing. On every sheet except the plus-/minus-word sheets
    # this column must be present but empty, so zeroth elements are not
    # stripped of empty values.
    # Assumed continuation of the truncated original, following the comment
    # above: keep element 0 untouched, drop empty values from the rest.
    return [a_list[0], *filter(None, a_list[1:])]
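# Quick illustration of the two helpers above (sample values are made up, and
# the delete_empty_values output follows the reconstructed body):
#
# >>> handle_csv_with_one_list([["купить", "", "цена"]])
# ['купить', 'цена']
# >>> delete_empty_values(["", "купить", "", "цена"])
# ['', 'купить', 'цена']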
from general.general import get_current_dir, clear_files, write_phrase_to_log
from general.drv import send_proxy_to_black_set, get_proxy
import csv
import os
from bs4 import BeautifulSoup
import requests
from requests.exceptions import ProxyError
from time import sleep

LINK_COL = 1  # Zero-based index of the CSV column that holds the link.
PARSING_PATH_PARTICLE = os.path.join(get_current_dir(), "../ParseCompetitors/")
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"
INIT_DIR = os.path.join(get_current_dir(), INIT_PATH_PARTICLE)
RESULT_DIR = os.path.join(PARSING_PATH_PARTICLE, "Log")
RESULT_FILE = os.path.join(RESULT_DIR, "result.csv")
READ_FILE_ENCODING = 'utf-8'
WRITE_FILE_ENCODING = 'windows-1251'


def prepare_link_list(file_list):
    # Accumulate rows from every CSV file; the original returned inside the
    # loop and therefore only ever read the first file.
    csv_data = []
    for f in file_list:
        with open(os.path.join(INIT_DIR, f), encoding=READ_FILE_ENCODING) as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=';')
            csv_data.extend(csv_reader)
    return csv_data
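# A small usage sketch (the file name is hypothetical): pull the link column
# out of the combined rows returned by prepare_link_list.
#
# rows = prepare_link_list(["competitors.csv"])
# links = [row[LINK_COL] for row in rows if len(row) > LINK_COL]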