Example #1
def is_excluded(email):
    # Check the email against the exclusion list shipped in Init/.
    source_file = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "excluded_emails.txt")

    with open(source_file, "r") as f:
        excluded_emails = [line.rstrip('\n') for line in f]

    return email in excluded_emails
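
is_excluded re-reads excluded_emails.txt on every call. If it runs inside a hot loop, caching the list as a set avoids the repeated I/O; a minimal sketch, assuming the same helpers and constants as above (the cache variable and function name are hypothetical):

_excluded_emails_cache = None  # hypothetical module-level cache


def is_excluded_cached(email):
    # Load the exclusion list once, then answer membership checks from a set.
    global _excluded_emails_cache
    if _excluded_emails_cache is None:
        source_file = os.path.join(get_current_dir(), INIT_PATH_PARTICLE,
                                   "excluded_emails.txt")
        with open(source_file, "r") as f:
            _excluded_emails_cache = {line.rstrip('\n') for line in f}
    return email in _excluded_emails_cache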
Example #2
def parse_phrase_bunch(phrases):
    global phrase_counter

    logs_dir = os.path.join(get_current_dir(), "log")
    # clear_files(logs_dir)

    driver = None
    try:
        driver = get_driver()

        used_email = handle_login(driver)

        driver.get("https://tools.pixelplus.ru/tools/geo")

        email_log = get_log_path("email_log.txt")
        # Arguments: phrase, write_mode, enc, full_path_to_file
        write_phrase_to_log(used_email, 'a', ENC, email_log)

        handle_phrases(phrases, driver)

        index_log = get_log_path("index_log.txt")
        write_phrase_to_log(phrase_counter, 'w', ENC, index_log)
        phrase_counter += PHRASE_BUNCH_SIZE

    except Exception as e:
        print("Problem ^^^")
        print(e)
        if driver is not None:
            driver.quit()
        # Retry the same bunch; note that this recursion has no depth limit.
        parse_phrase_bunch(phrases)
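
The recursive retry above is unbounded: a persistent failure (a dead proxy pool, a changed login form) recurses until Python's stack limit. A minimal bounded alternative, assuming a hypothetical parse_phrase_bunch_once that performs a single attempt without the recursive call:

def parse_phrase_bunch_bounded(phrases, max_retries=3):
    # Sketch only: parse_phrase_bunch_once and max_retries are assumptions,
    # not part of the original code.
    for attempt in range(1, max_retries + 1):
        try:
            parse_phrase_bunch_once(phrases)
            return
        except Exception as e:
            print("Attempt {} failed: {}".format(attempt, e))
    raise RuntimeError("Bunch failed after {} attempts".format(max_retries))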
Example #3
def write_table_open_tag(site, region):
    # Start a fresh result file for this site/region pair.
    global RESULT_FILE

    RESULT_FILE = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE,
                               'Result/{domain}_{region}_result.html'.format(domain=site, region=region))
    write_phrase_to_log("<html>\n<table>\n",
                        write_mode='w',
                        enc=WRITE_ENCODING,
                        full_path_to_file=RESULT_FILE)
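
write_table_open_tag opens the result file in 'w' mode, so later writers presumably append. A hypothetical closing counterpart, assuming the same write_phrase_to_log helper (this function does not appear in the original code):

def write_table_close_tag():
    # Hypothetical: append the closing tags to the file initialized
    # by write_table_open_tag.
    write_phrase_to_log("</table>\n</html>\n",
                        write_mode='a',
                        enc=WRITE_ENCODING,
                        full_path_to_file=RESULT_FILE)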
Example #4
from general.drv import get_driver, get_proxy
from general.general import get_current_dir, write_list_to_file, write_phrase_to_log, get_list
import os
import requests
from requests.exceptions import ProxyError
from random import randint
from time import sleep
from bs4 import BeautifulSoup

# Regions: https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/

USE_SLEEP_TIME = True  # Always keep enabled for real parsing; disable the delay only for debugging.
PARSING_PATH_PARTICLE = "../KeywordStuffing/"
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"
URL_AND_KEYS_FILE = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "init.csv")
READ_ENCODING = 'utf-8'
WRITE_ENCODING = 'utf-8'
SIZE_OF_CHUNK = 10
ARSENKIN = 'https://arsenkin.ru/tools/filter/'
RESULT_FILE = "" # Инициализируется в функции write_table_open_tag.
PARSE_RUSSIA = False
PARSE_MOSCOW = False

# Must be strings, not numbers!
MOSCOW_REGION = '1'  # Including the surrounding oblast.
RUSSIA_REGION = '225'


def separate_url_and_region_and_phrases(url_phrases_list):
    url = url_phrases_list[0].strip()
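
The excerpt ends after the first line of separate_url_and_region_and_phrases. A sketch of how the split might continue, assuming element 1 holds the region and the remainder holds the phrases (everything past element 0 is an assumption):

def separate_url_and_region_and_phrases(url_phrases_list):
    url = url_phrases_list[0].strip()
    # Assumed layout below: element 1 is the region, the rest are phrases.
    region = url_phrases_list[1].strip()
    phrases = [p.strip() for p in url_phrases_list[2:] if p.strip()]
    return url, region, phrases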
Example #5
def get_log_path(log):
    log_path = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE, "log", log)
    return log_path
Example #6
import os
from selenium.webdriver.common.by import By
from general.drv import get_driver, USE_PROXY
from general.general import get_current_dir, get_list, clear_files, write_phrase_to_log


PARSING_PATH_PARTICLE = "../CommerceParsing/"
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"

PHRASE_BUNCH_SIZE = 100

word_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "word_list.txt")
email_list_full_path = os.path.join(get_current_dir(), INIT_PATH_PARTICLE, "email_list.txt")

INDEX_LOG_FILE = "index_log.txt"
ENC = 'utf-8'

global_phrases = get_list(word_list_full_path, ENC)
global_emails = get_list(email_list_full_path, ENC)

def get_log_path(log):
    log_path = os.path.join(get_current_dir(), PARSING_PATH_PARTICLE, "log", log)
    return log_path

def get_phrase_counter_val():
    index_log_file = get_log_path(INDEX_LOG_FILE)
    try:
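
The excerpt cuts off inside the try block. Example #2 writes phrase_counter to index_log.txt via write_phrase_to_log(..., 'w', ...), so a plausible completion reads that single value back and falls back to 0 when no log exists yet (the except clause is an assumption):

def get_phrase_counter_val():
    index_log_file = get_log_path(INDEX_LOG_FILE)
    try:
        with open(index_log_file, encoding=ENC) as f:
            return int(f.read().strip())
    except (FileNotFoundError, ValueError):
        # No index log yet, or an unreadable value: start from the beginning.
        return 0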
Example #7
import os
from selenium.webdriver.common.keys import Keys
from general.general import get_current_dir, get_list, clear_files, write_list_to_file, write_phrase_to_log
from general.drv import get_driver, USE_PROXY
from time import sleep

SELECTED_REGION = 1  # https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/
PROJECT_DIR = get_current_dir()
PAGES_TO_PARSE = 3
LOGS_DIR = os.path.join(PROJECT_DIR, "../YandexParsing/log")
READ_ENCODING = 'utf-8'
WRITE_ENCODING = 'utf-8'
RESULT_FILE = os.path.join(LOGS_DIR, "{}_result.csv".format(SELECTED_REGION))
PARSE_RELATED_WORDS = False


def get_region():
    # https://tech.yandex.ru/xml/doc/dg/reference/regions-docpage/
    regions = {225: "Россия", 1: "Москва"}
    return regions.get(SELECTED_REGION)


def get_phrases():
    yandex_parsing_init_file = os.path.join(
        PROJECT_DIR, "../YandexParsing/init/phrases.txt")
    phrases = get_list(yandex_parsing_init_file, READ_ENCODING)
    return phrases


phrases = get_phrases()
Example #8
from general.general import get_list, get_current_dir
from general.drv import get_driver
from general.drv import USE_PROXY
import os
from selenium.common.exceptions import NoSuchElementException
from general.drv import send_proxy_to_black_list

full_path_to_proxy_list = os.path.join(
    get_current_dir(), "../CommerceParsing/Init/proxy_list.txt")

init_proxy_list = get_list(full_path_to_proxy_list)


def test_proxies():
    # Iterate over the proxies directly so the proxy under test is
    # well-defined when it needs to be blacklisted.
    for a_proxy in init_proxy_list:
        driver = get_driver()
        driver.get("http://ip-api.com/")

        element = driver.find_element_by_id("qr")
        element_text = element.text
        ip = element_text.split(",")

        region = None  # stays None when the country cell is missing
        try:
            region = driver.find_element_by_xpath(
                '//th[text()="Country"]/following-sibling::td').text
        except NoSuchElementException:
            send_proxy_to_black_list(a_proxy)

        driver.quit()
        print("{}, {}".format(ip, region))
Example #9
from itertools import product
import os
from general.general import get_current_dir, write_phrase_to_log, clear_files
import sys
import re

PATH_PARTICLE = "../SeoCombinator/"
INIT_PATH_DIR = PATH_PARTICLE + "Init/"
RESULT_PATH_DIR = os.path.join(PATH_PARTICLE, "Result")
RESULT_FILE_PATH = os.path.join(RESULT_PATH_DIR, "result.txt")
FILE_ENCODING = 'windows-1251'

INIT_DIR = os.path.join(get_current_dir(), INIT_PATH_DIR)

PLUS_WORDS = []
MINUS_WORDS = []
PHRASES = []
VARIANTS = []
WORDSTAT_LIMIT = 7  # Wordstat parses only 7 words.


def handle_csv_with_one_list(csv_list_from_reader):
    # Files with the shared plus- and minus-words contain only a single row.
    tmp_list = csv_list_from_reader[0]
    return [*filter(None, tmp_list)]


def delete_empty_values(a_list):
    # Element 0 of the phrases list holds the flag that marks whether
    # ladder-style parsing is needed. On every sheet except the plus- and
    # minus-word sheets this column must be present but empty, so element 0
    # is never cleaned of empty values.
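
The excerpt ends after the comments. Based on them, a plausible body keeps element 0 intact and strips empty strings from the remaining cells (the exact slicing is an assumption):

def delete_empty_values(a_list):
    # Assumed body: preserve the flag column at index 0 and drop
    # empty values from the rest.
    return [a_list[0], *filter(None, a_list[1:])]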
Example #10
from general.general import get_current_dir, clear_files
from general.drv import send_proxy_to_black_set
import os
from bs4 import BeautifulSoup
from general.drv import get_proxy
import requests
from requests.exceptions import ProxyError
from general.general import write_phrase_to_log
from time import sleep

LINK_COL = 1  # Which column of the csv file holds the link, counting from 0.

PARSING_PATH_PARTICLE = os.path.join(get_current_dir(), "../ParseCompetitors/")
INIT_PATH_PARTICLE = PARSING_PATH_PARTICLE + "Init/"
INIT_DIR = INIT_PATH_PARTICLE  # INIT_PATH_PARTICLE is already absolute
RESULT_DIR = os.path.join(PARSING_PATH_PARTICLE, "Log")
RESULT_FILE = os.path.join(RESULT_DIR, "result.csv")
READ_FILE_ENCODING = 'utf-8'
WRITE_FILE_ENCODING = 'windows-1251'


def prepare_link_list(file_list):
    import csv

    # Collect rows from every init file into a single list.
    csv_data = []
    for f in file_list:
        with open(os.path.join(INIT_DIR, f),
                  encoding=READ_FILE_ENCODING) as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=';')
            csv_data.extend(csv_reader)
    return csv_data
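
LINK_COL marks the 0-based column holding the link. A hypothetical follow-up helper that pulls the links out of the rows returned by prepare_link_list (extract_links is not in the original code):

def extract_links(csv_data):
    # Take the LINK_COL cell from each row, skipping rows that are too short.
    return [row[LINK_COL] for row in csv_data if len(row) > LINK_COL]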