import copy
from settings import REDIS_CONFIG_LOCAL, MYSQL_CONFIG_LOCAL
from store import AmazonRedis, AmazonStorePro
from config import Config

# Build the Redis client from a private deep copy of the local settings.
# NOTE(review): the leading 2 is presumably the Redis database number - confirm
# against AmazonRedis in store.py.
rds = AmazonRedis(2, **copy.deepcopy(REDIS_CONFIG_LOCAL))

# Collect the unique, non-blank, whitespace-stripped lines of the 'asin' file.
with open('asin', encoding='utf-8') as f:
    asin_set = {entry_text for entry_text in map(str.strip, f) if entry_text}

print(asin_set)

# 1. Choose the site (domain suffix used when building the product URL).
suffix = 'com'  # com\co.uk\co.jp\fr\it\es\de\in\ca
# 2. Choose the crawl type.
entry = 7  # 1 = detail, 7 = comment

task_lst = []
for asin in asin_set:
    mp = {'entry': entry}
    if entry == Config.DETAIL:
        mp['page_url'] = "https://www.amazon.{}/dp/{}".format(suffix, asin)
        task_lst.append(mp)

    elif entry == Config.COMMENT:
        # 3、1表示采全部星级,0表示采特定星级
        all_star = 1
        if all_star:
Пример #2
0
                        data['scgs_category_url'], data['scgs_tags'],
                        data['scgs_shop_name'], data['scgs_shop_url'],
                        data['scgs_generation_time'], data['scgs_platform'],
                        data['scgs_platform_url'], data['scgs_crawl_time'],
                        data['scgs_create_time'], data['scgs_status'],
                        data['scgs_questions'], data['scgs_is_delete'],
                        data['scgs_reserve_field_1'],
                        data['scgs_reserve_field_2'],
                        data['scgs_reserve_field_3'],
                        data['scgs_reserve_field_4'],
                        data['scgs_reserve_field_5'],
                        data['scgs_reserve_field_6'],
                        data['scgs_reserve_field_7'])

            except Exception as exp:
                traceback.print_exc()
                item_json['error'] = '{!r}'.format(exp)
                rds.rds.lpush(Config.REDIS_DATA_ERROR, json.dumps(item_json))

        print('finished insert')
        store.close()
    else:
        print('no item')
        time.sleep(30)


if __name__ == '__main__':
    # Script entry point: create one AmazonRedis client and reuse it for every
    # call to data_insert.
    rds = AmazonRedis()
    # Loop forever: there is no exit condition here, so the process only ends
    # when it is killed or an exception escapes data_insert.  The visible
    # 'no item' branch of data_insert sleeps 30 seconds, which keeps this loop
    # from spinning when there is nothing to insert.
    while True:
        data_insert(rds)
Пример #3
0
import asyncio
import queue

from config import Config
from crawl_func import clear_rds, start_crawl, start_thread
from store import AmazonRedis
from scan_task import scan_database


class ConfigSub(Config):
    """Config variant used by this entry script.

    Defines two class-level string constants on top of the base ``Config``.

    NOTE(review): presumably the base class combines these names into Redis
    key names - confirm in config.py.
    """

    REDIS_SUB_DIR_NAME = 'lc'
    REDIS_START_URLS_NAME = 'list'


if __name__ == '__main__':
    conf = ConfigSub()
    que = queue.Queue()
    rds = AmazonRedis()

    clear_rds(rds, conf)

    sign = scan_database(rds, conf)
    if sign:
        rds.delete_key('amazon:di:cy:dc01:markdate')

    new_loop = asyncio.new_event_loop()

    start_thread(new_loop)

    try:
        while True:
            start_crawl(rds, que, conf, new_loop)
            # 队列都为空,采集完成
Пример #4
0
import asyncio
import copy
import queue
from config import Config
from crawl_func import clear_rds, start_crawl, start_thread
from store import AmazonRedis
from settings import REDIS_CONFIG_LOCAL

if __name__ == '__main__':
    # Entry point: reset the Redis state, then keep crawling until the three
    # task keys checked below no longer exist.

    que = queue.Queue()
    # AmazonRedis receives its own deep copy of the local connection settings.
    rds = AmazonRedis(Config.REDIS_NUM, **copy.deepcopy(REDIS_CONFIG_LOCAL))

    clear_rds(rds, Config)

    # Dedicated event loop handed to start_thread and to every start_crawl call.
    # NOTE(review): start_thread presumably runs this loop in a background
    # thread - confirm in crawl_func.
    new_loop = asyncio.new_event_loop()

    start_thread(new_loop)

    # Keys that must all be absent before the crawl counts as finished.
    pending_keys = (
        Config.REDIS_START_URLS,
        Config.REDIS_REQUEST_URLS,
        Config.REDIS_CRAWL_URLS,
    )

    try:
        while True:
            start_crawl(rds, que, Config, new_loop)
            # Every queue is empty: the crawl is complete.
            if not any(rds.exists_key(key) for key in pending_keys):
                break
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
        # loop.stop() is not thread-safe and does not wake a loop that is
        # blocked waiting for I/O in another thread.  Scheduling the stop with
        # call_soon_threadsafe makes the request safe from this thread and
        # wakes the loop so it actually stops.
        new_loop.call_soon_threadsafe(new_loop.stop)
Пример #5
0
import asyncio
import copy
import queue
from config import Config
from crawl_func import clear_rds, start_crawl, start_thread
from store import AmazonRedis
from settings import REDIS_CONFIG_LOCAL


if __name__ == '__main__':
    # Entry point: reset the Redis state, then keep crawling until the three
    # task keys checked below no longer exist.

    que = queue.Queue()
    # AmazonRedis receives its own deep copy of the local connection settings.
    rds = AmazonRedis(Config.REDIS_NUM, **copy.deepcopy(REDIS_CONFIG_LOCAL))

    clear_rds(rds, Config)

    # Dedicated event loop handed to start_thread and to every start_crawl call.
    # NOTE(review): start_thread presumably runs this loop in a background
    # thread - confirm in crawl_func.
    new_loop = asyncio.new_event_loop()

    start_thread(new_loop)

    # Keys that must all be absent before the crawl counts as finished.
    pending_keys = (
        Config.REDIS_START_URLS,
        Config.REDIS_REQUEST_URLS,
        Config.REDIS_CRAWL_URLS,
    )

    try:
        while True:
            start_crawl(rds, que, Config, new_loop)
            # Every queue is empty: the crawl is complete.
            if not any(rds.exists_key(key) for key in pending_keys):
                break
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
        # loop.stop() is not thread-safe and does not wake a loop that is
        # blocked waiting for I/O in another thread.  Scheduling the stop with
        # call_soon_threadsafe makes the request safe from this thread and
        # wakes the loop so it actually stops.
        new_loop.call_soon_threadsafe(new_loop.stop)

Пример #6
0
                push_data_into_redis(rds, Config, data_mp)
            else:
                print('no exist asin')

        print('push repeat done')
        store.close()
    else:
        print('no repeat asin')


def push_data_into_redis(rds, conf, data_mp):
    """Serialize ``data_mp`` to JSON and left-push it onto a Redis list.

    Args:
        rds: Wrapper object whose ``rds`` attribute exposes ``lpush``.
        conf: Object providing the target list key as ``REDIS_DATA_LIST``.
        data_mp: JSON-serializable payload.

    Returns:
        None; the result of ``lpush`` is discarded.
    """
    payload = json.dumps(data_mp)
    client = rds.rds
    client.lpush(conf.REDIS_DATA_LIST, payload)


if __name__ == '__main__':
    # Entry point: process repeat ASINs only when the 'today' field of the
    # detail marker hash carries today's date.
    rds = AmazonRedis()
    detail_today = rds.get_hash_field('amazon:di:cy:dc01:markdate', 'today')
    # Keep only the first whitespace-separated token (presumably the date part
    # of a "date time" stamp); a falsy value is left untouched.
    detail_today = detail_today.split()[0] if detail_today else detail_today
    today = time.strftime("%Y-%m-%d")
    if detail_today != today:
        print('wait for detail finish')
    else:
        print('start handling repeat asin')
        select_asin(rds)



Пример #7
0
import time
import sys
from store import AmazonRedis
from send_email import SendEmail

if __name__ == '__main__':
    # Entry point: send the daily completion mail once, after the detail marker
    # shows today's date and the repeat-ASIN key no longer exists.
    rds = AmazonRedis()
    mail_today = rds.get_hash_field('amazon:di:cy:mail', 'today')
    detail_today = rds.get_hash_field('amazon:di:cy:dc01:markdate', 'today')
    if detail_today:
        # Keep only the first whitespace-separated token of the marker.
        detail_today = detail_today.split()[0]
    today = time.strftime("%Y-%m-%d")
    if mail_today == today:
        # The mail marker already carries today's date: nothing left to do.
        print('DI finish')
        sys.exit()
    if detail_today == today and not rds.exists_key('amazon:di:cy:repeatasin'):
        email = SendEmail()
        context = 'ok'
        email.send_message('DI', '*****@*****.**', '今日采集完成', context)
        # Record the same date that was checked above.  Re-reading the clock
        # here could store the next day's date if midnight passed while the
        # mail was being sent, which would suppress that day's mail.
        rds.set_hash('amazon:di:cy:mail', {'today': today})
    else:
        print('DI no finish')
Пример #8
0
            rds.rds.lpush(Config.REDIS_REPEAT_ASIN, repeat_mp)
            print('repeat asin')
            rds.add_set(Config.REDIS_CATE_ASIN, cate_asin)
            continue
        rds.add_set(Config.REDIS_CATE_ASIN, cate_asin)
        print(row['scgs_id'])

        page_url = 'https://www.amazon.{}/dp/{}'.format(suffix, asin)
        mp = {'entry': task_type, 'page_url': page_url, 'category_info': task_category, 'category_entry': category_entry,
              'category_url': category_url, 'rank': 101, 'create_time': create_time}
        rds.rds.rpush('amazon:di:cy:detail', mp)
    store.close()


if __name__ == '__main__':
    # Entry point: run select_asin at most once per calendar day, and only
    # after the 'amazon:di:cy:lc:markdate' marker shows today's date.
    # NOTE(review): that marker is presumably written by the list crawl - confirm.
    rds = AmazonRedis()
    today = time.strftime("%Y-%m-%d")
    asin_today = rds.get_hash_field('amazon:di:cy:asin:markdate', 'today')
    if asin_today:
        # The marker is written below as "date time"; keep only the date part.
        asin_today = asin_today.split()[0]
    if asin_today == today:
        # The ASIN marker already carries today's date: skip the scan.
        print('toady finish')
    else:
        list_today = rds.get_hash_field('amazon:di:cy:lc:markdate', 'today')
        if list_today:
            # Keep only the first whitespace-separated token of the marker.
            list_today = list_today.split()[0]
        if list_today == today:
            # Stamp the ASIN marker before scanning, so a run started while
            # this one is still working sees today's date and skips.
            rds.set_hash('amazon:di:cy:asin:markdate', {'today': time.strftime("%Y-%m-%d %H:%M:%S")})
            print('scan_database')
            select_asin(rds)
            # Drop the Config.REDIS_CATE_ASIN key once the scan has returned.
            rds.delete_key(Config.REDIS_CATE_ASIN)