# Resume helper: scrape the detail pages whose URLs are in url_list but not yet in item_info.
# The import below is an assumption; url_list, item_info, get_item_info and get_zhuan_info
# are expected to live in this project's page_parsing module.
from page_parsing import url_list, item_info, get_item_info, get_zhuan_info

def urls_huifu():
    db_urls = [item['url'] for item in url_list.find()]
    index_urls = [item['url'] for item in item_info.find()]
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y
    for url in rest_of_urls:
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]   # Zhuanzhuan detail page
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]       # old-style 58.com detail page
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            print(url)
            get_item_info(url)
        else:
            pass
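A minimal entry point, not part of the original snippet, assuming the function is meant to run as a standalone resume script:

if __name__ == '__main__':
    urls_huifu()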
import time
from page_parsing import URL_list
from page_parsing import item_info

while True:
    print(item_info.find().count())
    time.sleep(2)
import time
from page_parsing import url_list, item_info

while True:
    print('url_list:', url_list.find().count())
    time.sleep(5)
    print('item_info:', item_info.find().count())
    time.sleep(5)
# encoding=utf-8
__author__ = 'Administrator'
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_lists, item_info
import pymongo

def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_from(channel, num, 1)

total_urls = [i['url'] for i in url_lists.find()]
used_urls = [i['url'] for i in item_info.find()]
x = set(total_urls)
y = set(used_urls)
left_urls = x - y

if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_item_info, list(left_urls))
import time
from page_parsing import url_list, item_info

while True:
    # print(url_list.find().count())
    print(item_info.find().count())
    time.sleep(5)
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    # Count the records in url_list and item_info.
    print('Number of URLs collected:', url_list.find().count())
    print('Number of item records:', item_info.find().count())
    time.sleep(10)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""
import time
from page_parsing import url_list, item_info

# Query the record counts of both collections every 5 seconds.
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(url_counts) + ' ' + str(info_counts) + ' ' + now_time)
    time.sleep(5)
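A small variation on the same monitoring idea, not in the original file: report how many new detail records arrive per interval, to gauge crawl speed. It reuses the item_info collection imported above.

# Hedged sketch: print the number of newly inserted item_info records every 5 seconds.
last = item_info.find().count()
while True:
    time.sleep(5)
    current = item_info.find().count()
    print('new records in the last 5 seconds:', current - last)
    last = current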
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_list, item_info

def get_all_links_from(channel):
    for num in range(1, 51):
        get_links_from(channel, num)

if __name__ == '__main__':
    # The __main__ guard is just boilerplate here: it keeps this block from running
    # when the module is imported elsewhere, nothing more.
    pool = Pool()  # create a process pool
    pool.map(get_all_links_from, channel_list.split())
    # pool.map feeds each element of the second argument into the first argument
    # (a function, passed without parentheses) one by one.
    # channel_list was defined earlier as one long string; split() turns it
    # into a ready-made list of channel URLs.

# Resume after interruption
db_urls = [item['url'] for item in url_list.find()]      # every URL collected for crawling, via a list comprehension
index_urls = [item['url'] for item in item_info.find()]  # every 'url' field already present in the detail collection;
                                                         # item here is a dict, not the dict method items()
x = set(db_urls)                                         # convert both lists to sets
y = set(index_urls)
rest_of_urls = x - y  # substitute these URLs into the pool.map call above to resume (sketched below)

# Design notes:
# 1. Use two collections: the first (url_list) only stores the scraped URLs;
#    the second (item_info) stores the item details for each URL.
# 2. When writing details into the second collection, also store a field (key)
#    'index_url', the link the detail record came from.
# 3. If the crawl is interrupted, the 'url' fields in the second collection
#    are a subset of the URL set in the first.
# 4. Subtracting the two sets gives the URLs that still remain to be crawled.
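As the notes above say, resuming means feeding rest_of_urls back into pool.map instead of re-walking the channels. A minimal sketch of that second pass, assuming get_item_info takes a single detail-page URL, as it does in the other scripts collected here:

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_item_info, rest_of_urls)   # scrape only the URLs not yet present in item_info
    pool.close()
    pool.join()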
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# ================================================= < < URL deduplication > > ==========================================
# Design notes:
# 1. Use two collections: the first (url_list) only stores the scraped URLs;
#    the second (item_info) stores the item details for each URL.
# 2. When writing details into the second collection, also store a field (key)
#    'index_url', the link the detail record came from.
# 3. If the crawl is interrupted, the 'url' fields in the second collection
#    are a subset of the URL set in the first.
# 4. Subtracting the two sets gives the URLs that still remain to be crawled.

db_urls = [item['url'] for item in url_list.find()]      # every URL collected for crawling, via a list comprehension
index_urls = [item['url'] for item in item_info.find()]  # every 'url' field already present in the detail collection
x = set(db_urls)                                         # convert both lists to sets
y = set(index_urls)
rest_of_urls = x - y                                     # the difference: URLs still to crawl
# ======================================================================================================================
import time
from page_parsing import url_list, item_info

while True:
    print('message:', item_info.find().count())
    print('URL:', url_list.find().count())
    time.sleep(5)
from channel_extrack import url_list
from page_parsing import get_links_from
from multiprocessing import Pool
import time
from page_parsing import get_item_info
from page_parsing import URL_list
from page_parsing import item_info

item_all = [i['url'] for i in URL_list.find()]
item_any = [i['url'] for i in item_info.find()]
x = set(item_all)
y = set(item_any)
item_result = x - y  # URLs collected but not yet scraped into item_info

def get_links_from_urllist(channel):
    for num in range(1, 101):
        try:
            while True:
                if get_links_from(channel, num) is None:
                    break
                else:
                    get_links_from(channel, num)
        except KeyboardInterrupt:
            break

if __name__ == "__main__":
    # get_item_infos(item_result)
    for i in item_result:
        get_item_info(i)  # scrape each remaining detail page
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, url_list, get_item_info, item_info

def get_all_links_from(channel):
    for i in range(1, 101):
        info = get_links_from(channel, i)
        if info == 'none':
            break

if __name__ == '__main__':
    # get_all_links_from('http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from, all_channels)
    print('url_list.count is :%s' % url_list.count())  # 88280
    all = set([item['url'] for item in url_list.find()])
    len1 = len(all)
    print('set url_list count is:%s' % len1)
    done = set([item['url'] for item in item_info.find()])  # wrapping the comprehension in set() saves the separate list.append(data) step
    len2 = len(done)
    print('set item_info count is:%s' % len2)
    set_undone = all - done
    len3 = len(set_undone)
    print('still need to insert count is:%s' % len3)
    pool.map(get_item_info, set_undone)
from page_parsing import url_list, item_info

print(url_list.find().count())
print([item['price'] for item in item_info.find()])
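Since the script above dumps every stored price, a quick aggregate is a natural follow-up. A minimal sketch, assuming 'price' is stored as a number or a numeric string; records with missing or unparsable prices (e.g. "面议") are simply skipped:

prices = []
for item in item_info.find():
    try:
        prices.append(float(item['price']))   # many prices are stored as strings
    except (TypeError, ValueError, KeyError):
        continue                              # skip missing or non-numeric prices
if prices:
    print('average price:', sum(prices) / len(prices))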
# __author__ = 'xjlin'
# -*- coding: utf-8 -*-
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    print(url_list.find().count())
    print(item_info.find().count())
    time.sleep(5)
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# Resume after interruption (implemented via deduplication)
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

# Link collection is a one-off first pass and stays commented out here:
# def get_all_links_from(channel):
#     for i in range(1, 100):
#         get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    # pool = Pool()
    # pool.map(get_all_links_from, channel_list.split())  # one-off link collection; uncomment the function above to rerun it
    pool.map(get_item_info_from, rest_of_urls)  # resume: scrape only the detail pages not yet in item_info
    pool.close()
    pool.join()
from page_parsing import get_item_info_from, url_list, item_info, get_links_from

# ================================================= < < URL deduplication > > ==========================================
# Design notes:
# 1. Use two collections: the first (url_list) only stores the scraped URLs;
#    the second (item_info) stores the item details for each URL.
# 2. When writing details into the second collection, also store a field (key)
#    'index_url', the link the detail record came from.
# 3. If the crawl is interrupted, the 'url' fields in the second collection
#    are a subset of the URL set in the first.
# 4. Subtracting the two sets gives the URLs that still remain to be crawled.

db_urls = [item['url'] for item in url_list.find()]      # every URL collected for crawling, via a list comprehension
index_urls = [item['url'] for item in item_info.find()]  # every 'url' field already present in the detail collection
x = set(db_urls)                                         # convert both lists to sets
y = set(index_urls)
rest_of_urls = x - y                                     # the difference: URLs still to crawl
# ======================================================================================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_all_links_from, channel_list)
    pool.close()
    pool.join()
'''
This file just prints a running count for me to watch:
every 5 seconds it checks the item_info collection and shows how many
item-detail records have been stored.
'''
import time
from page_parsing import item_info

while True:
    print('Scraped 【58同城】 item details:', end=' ')
    print((item_info.find()).count(), end=' ')
    print('records,' + ' # read from the DB every 5 seconds')
    time.sleep(5)
import time
from page_parsing import url_list
from page_parsing import item_info

# while True:
#     # print(url_list.find().count())
#     print(item_info.find().count())
#     time.sleep(5)

# Collect the first element of each record's 'place' field, then deduplicate.
place_list = []
for i in item_info.find():
    # print(i['place'][0])
    place_list.append(i['place'][0])

place_index = list(set(place_list))
print(place_index)
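One way the place data built above might be used (an assumption, not part of the original script) is to count how many listings fall in each area:

from collections import Counter

place_counts = Counter(place_list)             # reuse place_list built above
for place, count in place_counts.most_common(10):
    print(place, count)                        # ten most common areas and their listing counts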
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item["url"] for item in url_list.find()]
index_urls = [item["url"] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == "__main__":
    pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list)  # collect all listing links (only needs to run once; no resume support here)
    pool.map(get_item_info_from, rest_of_urls)    # scrape the item detail pages
    pool.close()
    pool.join()