Example #1
def urls_huifu():
    # "huifu" = resume: pick up an interrupted crawl where it left off.
    # url_list, item_info, get_zhuan_info and get_item_info are assumed to be
    # imported from the project's page_parsing module.
    db_urls = [item['url'] for item in url_list.find()]      # every URL that was queued
    index_urls = [item['url'] for item in item_info.find()]  # URLs whose details are already stored
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y  # URLs still to crawl

    for url in rest_of_urls:
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]   # zhuanzhuan.58.com listing
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]       # old-style sz.58.com detail page ("xiangqingye" = detail page)
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            print(url)
            get_item_info(url)
        else:
            pass
Example #2
import time
from page_parsing import URL_list
from page_parsing import item_info

while True:
    print(item_info.find().count())
    time.sleep(2)
Example #3
import time
from page_parsing import url_list, item_info


while True:
    print('url_list:', url_list.find().count())
    time.sleep(5)
    print('item_info:', item_info.find().count())
    time.sleep(5)
Example #4
#encoding=utf-8
__author__ = 'Administrator'
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_lists, item_info
import pymongo


def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_from(channel, num, 1)


total_urls = [i['url'] for i in url_lists.find()]
used_urls = [i['url'] for i in item_info.find()]
x = set(total_urls)
y = set(used_urls)
left_urls = x - y

if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_item_info, list(left_urls))
Example #5
import time
from page_parsing import url_list,item_info

while True:
    # print url_list.find().count()
    print item_info.find().count()
    time.sleep(5)
Example #6
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    # count the number of url_list and item_info
    print('The number of url list', url_list.find().count())
    print('The number of items information', item_info.find().count())
    time.sleep(10)
Example #7
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""

import time
from page_parsing import url_list, item_info

# Query the collections' record counts every 5 seconds
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print str(url_counts) + '  ' + str(info_counts) + '  ' + str(now_time)
    time.sleep(5)
Example #8
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_list, item_info

def get_all_links_from(channel):
    for num in range(1,51):
        get_links_from(channel,num)




if __name__ == '__main__':   # standard entry-point guard; keeps the code below from running when this module is imported, nothing more
    pool = Pool()  # create a process pool
    pool.map(get_all_links_from, channel_list.split())
    # map applies the function passed as its first argument to each element of the second argument in turn;
    # by convention the first argument is the bare function name, without ().
    # channel_list was defined earlier as one long string; split() turns it into a list of channel URLs.

    # Resume after interruption
    db_urls = [item['url'] for item in url_list.find()]      # list comprehension: every URL queued for crawling
    index_urls = [item['url'] for item in item_info.find()]  # every 'url' field already stored in the detail-info collection; item here is a dict, not the dict method items()
    x = set(db_urls)     # convert to sets
    y = set(index_urls)
    rest_of_urls = x - y  # feed this set to pool.map above (in place of the channel list) to resume


# Design notes:
# 1. Use two collections: the first (url_list) only stores the URLs scraped from the listing pages;
#    the second (item_info) stores the item details for each URL.
# 2. While writing details into the second collection, also store the detail page's own link in a field (key) such as 'index_url'.
# 3. If the crawl is interrupted, the URLs in the detail collection are a subset of the URL set in the first collection.
# 4. Subtracting the two URL sets gives the URLs that still remain to be crawled.
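A minimal sketch of the resume step described in the notes above, assuming (as in the other examples on this page) that get_item_info accepts a single detail-page URL:

# Hedged sketch: feed the leftover URLs into the pool instead of the channel list.
from multiprocessing import Pool
from page_parsing import get_item_info, url_list, item_info

db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
rest_of_urls = set(db_urls) - set(index_urls)   # URLs queued but not yet parsed

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_item_info, list(rest_of_urls))  # crawl only what is left
    pool.close()
    pool.join()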
Example #9
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""

import time
from page_parsing import url_list, item_info

# Query the collections' record counts every 5 seconds
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print str(url_counts) + '  ' + str(info_counts) + '  ' + str(now_time)
    time.sleep(5)
Example #10
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# ================================================= << URL de-duplication >> =====================================================

# Design notes:
# 1. Use two collections: the first (url_list) only stores the URLs scraped from the listing pages;
#    the second (item_info) stores the item details for each URL.
# 2. While writing details into the second collection, also store the detail page's own link in a field (key) such as 'index_url'.
# 3. If the crawl is interrupted, the URLs in the detail collection are a subset of the URL set in the first collection.
# 4. Subtracting the two URL sets gives the URLs that still remain to be crawled.

db_urls = [item['url'] for item in url_list.find()]  # list comprehension: every URL queued for crawling
index_urls = [item['url']
              for item in item_info.find()]  # every 'url' field already stored in the detail-info collection
x = set(db_urls)  # convert to sets
y = set(index_urls)
rest_of_urls = x - y  # set difference

# ======================================================================================================================
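rest_of_urls is computed here but never consumed; the design only works if the parser records each detail page's own URL when it writes to item_info. A minimal sketch of that write, using a hypothetical save_item helper (not the project's actual get_item_info_from) and a plain pymongo insert:

# Hypothetical helper, for illustration only: whatever get_item_info_from does
# internally, it has to store the page's own URL in item_info so that the set
# difference above can tell finished pages from pending ones.
def save_item(url, data, collection=item_info):
    data['url'] = url            # record which detail page this document came from
    collection.insert_one(data)  # pymongo write into the item_info collection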
Example #11
import time
from page_parsing import url_list,item_info

while True:
    print('message:',item_info.find().count())
    print('URL:',url_list.find().count())
    time.sleep(5)
Example #12
from channel_extrack import url_list
from page_parsing import get_links_from
from multiprocessing import Pool
import time
from page_parsing import get_item_info
from page_parsing import URL_list
from page_parsing import item_info

item_all = [i['url'] for i in URL_list.find()]
item_any = [i['url'] for i in item_info.find()]
x = set(item_all)
y = set(item_any)
item_result = x - y


def get_links_from_urllist(channel):
    for num in range(1, 101):
        try:
            # stop paging this channel once a page yields no links
            if get_links_from(channel, num) is None:
                break
        except KeyboardInterrupt:
            break


if __name__ == "__main__":
    # get_item_infos(item_result)
    for i in item_result:
Example #13
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, url_list, get_item_info, item_info


def get_all_links_from(channel):
    for i in range(1, 101):
        info = get_links_from(channel, i)
        if info == 'none':
            break


if __name__ == '__main__':
    # get_all_links_from( 'http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from,all_channels)
    print('url_list.count is :%s' % url_list.count())  #88280

    all = set([item['url'] for item in url_list.find()])
    len1 = len(all)
    print('set url_list count is:%s' % len1)
    done = set([item['url']
                for item in item_info.find()])  # building the set directly avoids a separate list.append(data) step
    len2 = len(done)
    print('set item_info count is:%s' % len2)

    set_undone = all - done
    len3 = len(set_undone)
    print('still need to insert count is:%s' % len3)
    pool.map(get_item_info, set_undone)
Example #14
from page_parsing import url_list, item_info

print(url_list.find().count())
print([item['price'] for item in item_info.find()])
Example #15
# __author__ = 'xjlin'
# -*- coding: utf-8 -*-
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    print(url_list.find().count())
    print(item_info.find().count())
    time.sleep(5)
Example #16
File: main.py Project: qchs/58
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from ,url_list,get_item_info,item_info


def get_all_links_from(channel):
    for i in range(1,101):
        info = get_links_from(channel,i)
        if info == 'none':
            break

if __name__ =='__main__':
    # get_all_links_from( 'http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from,all_channels)
    print('url_list.count is :%s'%url_list.count())#88280

    all = set([item['url'] for item in url_list.find()])
    len1=len(all)
    print('set url_list count is:%s'%len1)
    done = set([item['url'] for item in item_info.find()])  # building the set directly avoids a separate list.append(data) step
    len2 = len(done)
    print('set item_info count is:%s'%len2)

    set_undone = all - done
    len3= len(set_undone)
    print('still need to insert count is:%s'%len3)
    pool.map(get_item_info,set_undone)
Example #17
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# Resume after interruption (implemented via de-duplication)
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y  # URLs whose details are still missing (not used below, but ready to pass to get_item_info_from)

def get_all_links_from(channel):
    # must stay uncommented: pool.map below calls it
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    # pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
Example #18
from page_parsing import get_item_info_from, url_list, item_info, get_links_from


# ================================================= << URL de-duplication >> =====================================================

# Design notes:
# 1. Use two collections: the first (url_list) only stores the URLs scraped from the listing pages;
#    the second (item_info) stores the item details for each URL.
# 2. While writing details into the second collection, also store the detail page's own link in a field (key) such as 'index_url'.
# 3. If the crawl is interrupted, the URLs in the detail collection are a subset of the URL set in the first collection.
# 4. Subtracting the two URL sets gives the URLs that still remain to be crawled.


db_urls = [item['url'] for item in url_list.find()]      # list comprehension: every URL queued for crawling
index_urls = [item['url'] for item in item_info.find()]  # every 'url' field already stored in the detail-info collection
x = set(db_urls)                                         # convert to sets
y = set(index_urls)
rest_of_urls = x - y                                     # set difference

# ======================================================================================================================
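A tiny worked example of that set difference with made-up URLs, to make the resume logic concrete:

db_urls = ['http://sz.58.com/a', 'http://sz.58.com/b', 'http://sz.58.com/c']  # made-up URLs: everything queued in url_list
index_urls = ['http://sz.58.com/a', 'http://sz.58.com/c']                     # already parsed into item_info
rest_of_urls = set(db_urls) - set(index_urls)
print(rest_of_urls)  # {'http://sz.58.com/b'} -> only this page still needs crawling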




Example #19
#!/usr/bin/env python
#-*- coding: utf-8 -*-

from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x-y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_all_links_from, channel_list.split())  # channel_list is one long string in this project, so split it into individual channel URLs
    pool.close()
    pool.join()
Example #20
'''
This file just prints a count for me to watch:
every 5 seconds it queries the item_info collection and shows how many documents it holds.
The item_info collection stores the item links.
'''

import time
from page_parsing import item_info

while True:
    print('Crawled 58.com item details:', end=' ')
    print(item_info.find().count(), end=' ')
    print('records  # read from the DB every 5 seconds')
    time.sleep(5)
Example #21
import time
from page_parsing import url_list
from page_parsing import item_info
# while True:
#     #print(url_list.find().count())
#     print(item_info.find().count())
#     time.sleep(5)

place_list = []
for i in item_info.find():
    #print(i['place'][0])
    place_list.append(i['place'][0])
place_index = list(set(place_list))  # de-duplicated list of the places seen in item_info
print(place_index)
Example #22
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item["url"] for item in url_list.find()]
index_urls = [item["url"] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y


def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == "__main__":
    pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list)  # collect all item links (only needs to run once; no resume support at this step)
    pool.map(get_item_info_from, rest_of_urls)  # crawl the item detail pages that are still missing
    pool.close()
    pool.join()