Пример #1
0
def get_info():
    """Run ``handle`` over every category-listing line using a worker pool.

    Input lines are read from ``less_9800.csv`` and ``split_more9800.csv``
    under ``config.category_listings_path``. Two output files are opened under
    ``config.get_ids_path`` and published as module globals (``f_asins``,
    ``f_success``) together with ``lock`` and ``tool``; presumably the
    ``handle`` workers use them (``handle`` is not visible here).

    The output files are closed even if a worker raises.
    """
    global lock, f_success, f_asins, tool
    lock = Lock()
    tool = httptools.httptools()

    # Read all inputs before truncating any output file.
    lines = []
    for name in ('less_9800.csv', 'split_more9800.csv'):
        with open(config.category_listings_path + name) as src:
            lines.extend(src.readlines())

    # The ``as`` targets are declared global above, so the workers see them.
    with open(config.get_ids_path + 'ids.csv', 'w') as f_asins:
        with open(config.get_ids_path + 'success_url.csv', 'w') as f_success:
            f_success.write('listings_count\tid_count\turl\n')
            f_success.flush()

            pool = Pool(15)
            try:
                pool.map(handle, lines)
            finally:
                # Wait for the workers before the files are closed, even on
                # failure, so nothing writes to a closed file.
                pool.close()
                pool.join()
Пример #2
0
def handle():
    """Run ``get_product_html`` over every line of ``pro_ids.csv`` in batches.

    The id file under ``config.get_ids_path`` is consumed 10000 lines at a
    time to limit memory usage; each batch is mapped over a fresh pool of 10
    workers. ``ids_FromProHtml.txt`` under ``config.pro_html_path`` is opened
    in append mode and published as the module global ``success_file``
    together with ``lock`` and ``tool``; presumably the workers use them
    (``get_product_html`` is not visible here).

    Both files are closed even if a worker raises.
    """
    global lock, tool, success_file
    # Local import: the file's top-level import block is not visible here.
    from itertools import islice

    tool = httptools.httptools()
    lock = Lock()
    batch_size = 10000

    success_path = config.pro_html_path + 'ids_FromProHtml.txt'
    ids_path = config.get_ids_path + 'pro_ids.csv'

    # Mode was 'aw', which is not a valid mode string: Python 3 rejects it and
    # Python 2 only accepts it where the C library ignores the trailing 'w'.
    # Plain append is the portable equivalent.
    with open(success_path, 'a') as success_file:
        with open(ids_path) as id_file:
            while True:
                # To reduce memory usage, read and crawl only one batch of
                # lines at a time.
                lines = list(islice(id_file, batch_size))
                if not lines:
                    break
                pool = Pool(10)
                try:
                    pool.map(get_product_html, lines)
                finally:
                    # Wait for the workers before the files can be closed.
                    pool.close()
                    pool.join()
Пример #3
0
def get_root_listings():
    """Fetch the root category page and extract its listings count.

    The page URL is ``config.url`` with ``[category]`` replaced by
    ``config.root_category_id``. ``regex_listing`` locates the count fragment
    (group 1) and ``regex_num`` extracts the number from it.

    Returns:
        The matched number as a string with commas removed, or the integer
        ``0`` when either regex fails to match or the result is empty.
        NOTE(review): the mixed str/int return type is kept as-is because
        callers may depend on it.
    """
    root_cate = config.root_category_id
    url = config.url.replace('[category]', root_cate)
    tool = httptools.httptools()
    html = tool.gethtml(url)

    listing_match = regex_listing.search(html)
    if listing_match is None:
        return 0

    num_match = regex_num.search(listing_match.group(1))
    if num_match is None:
        return 0

    num = num_match.group().replace(',', '')
    if num != '':
        return num
    return 0
Пример #4
0
def handle():
    """Run ``get_leaf_cate_id`` over the root category's entries in a pool.

    ``leaf_cate_id.csv`` under ``config.category_listings_path`` is opened for
    writing and published as the module global ``f`` together with ``tool``
    and ``lock``; presumably the workers use them (``get_leaf_cate_id`` is
    not visible here). The work list comes from
    ``get_current_page_info(config.root_category_id)``.

    The output file is closed even if fetching the work list or a worker
    raises.
    """
    global tool, f, lock
    tool = httptools.httptools()
    lock = Lock()
    path = config.category_listings_path + 'leaf_cate_id.csv'

    # ``f`` is declared global above, so the workers see the open file.
    with open(path, 'w') as f:
        root_id = config.root_category_id
        cate_list = get_current_page_info(root_id)

        pool = Pool(15)
        try:
            pool.map(get_leaf_cate_id, cate_list)
        finally:
            # Wait for the workers before the file is closed, even on failure.
            pool.close()
            pool.join()
Пример #5
0
def get_suit_url():
    """Run ``handle_url`` over every line of ``more_9800.csv`` in a pool.

    The input is read from ``config.category_listings_path``.
    ``split_more9800.csv`` in the same directory is opened for writing and
    published as the module global ``f_suit_url`` together with ``lock`` and
    ``tool``; presumably the workers use them (``handle_url`` is not visible
    here).

    The output file is closed even if a worker raises.
    """
    global f_suit_url, lock, tool
    tool = httptools.httptools()
    lock = Lock()

    with open(config.category_listings_path + 'more_9800.csv') as src:
        lines = src.readlines()

    out_path = config.category_listings_path + 'split_more9800.csv'
    # ``f_suit_url`` is declared global above, so the workers see it.
    with open(out_path, 'w') as f_suit_url:
        pool = Pool(15)
        try:
            pool.map(handle_url, lines)
        finally:
            # Wait for the workers before the file is closed, even on failure.
            pool.close()
            pool.join()
Пример #6
0
def get_status():
    print '1 get spot status'
    print '2 pass'
    num_input = raw_input('input number:')
    if judge(num_input, 2) == 0:
        print 'Input is not legal, please re-enter.'
        get_status()
    if num_input == '2':
        return
    tool = httptools.httptools()
    root_url = config.url.replace('[category]', config.root_category_id)
    listings = get_listings(tool, root_url)
    except_space = int(listings) * 0.1745 / 1024
    left_space = get_left_space()
    print 'listings:', listings
    print 'excepted space:', str(int(except_space)) + 'G'
    print 'left space:', str(left_space) + 'G'
    get_running_process()
Пример #7
0
def get_category_listings():
    """Run ``handle`` over every line of ``leaf_cate_id.csv`` in a pool.

    The input is read from ``config.category_listings_path``.
    ``less_9800.csv`` and ``more_9800.csv`` in the same directory are opened
    for writing and published as the module globals ``f_less`` and ``f_more``
    together with ``lock`` and ``tool``; presumably the workers use them
    (``handle`` is not visible here).

    The output files are closed even if a worker raises.
    """
    global lock, f_less, f_more, tool
    tool = httptools.httptools()
    lock = Lock()

    path = config.category_listings_path + 'leaf_cate_id.csv'
    with open(path, 'r') as src:
        leafs = src.readlines()

    less_path = config.category_listings_path + 'less_9800.csv'
    more_path = config.category_listings_path + 'more_9800.csv'
    # The ``as`` targets are declared global above, so the workers see them.
    with open(less_path, 'w') as f_less:
        with open(more_path, 'w') as f_more:
            pool = Pool(15)
            try:
                pool.map(handle, leafs)
            finally:
                # Wait for the workers before the files are closed, even on
                # failure.
                pool.close()
                pool.join()