def use_which_code():
    # Route each saved URL to the parser that matches its host.
    urls = urls_()
    for url in urls:
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            get_item_info(url)
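# A minimal, self-contained sketch of the same dispatch, using the standard
# library's urlparse to compare hostnames instead of substring checks on
# url.split('.')[0]. The parser calls are replaced by print() stand-ins for
# get_zhuan_info / get_item_info; the example URLs are hypothetical.
from urllib.parse import urlparse

def dispatch(url):
    host = urlparse(url).netloc  # e.g. 'zhuanzhuan.58.com' or 'sz.58.com'
    if host.startswith('zhuanzhuan.'):
        print('zhuanzhuan parser:', url)       # stand-in for get_zhuan_info(url)
    elif host.startswith('sz.'):
        print('old detail-page parser:', url)  # stand-in for get_item_info(url)

dispatch('http://zhuanzhuan.58.com/detail/123.html')
dispatch('http://sz.58.com/pingbandiannao/456.shtml')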
def get_all_links(channel):
    for num in range(1, 151):
        ip = random.choice(proxy_lists)  # rotate proxies per page request
        if channel != "http://bj.ganji.com/shoujihaoma/":
            get_item_link(header, ip, channel, num)
        else:
            get_phone_links(header, ip, num)

print("All item links saved!")

for url in url_list.find("item_link"):
    ip = random.choice(proxy_lists)
    get_item_info(header, ip, url)

for url in phNum_list.find("phone_link"):
    ip = random.choice(proxy_lists)
    get_phone_info(header, ip, url)
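# How a single fetch with the rotated proxy might look inside get_item_link
# or get_phone_links. This is a sketch under the assumption that `header` is
# a dict of request headers and each entry of proxy_lists is a 'host:port'
# string; neither is confirmed by the script above, and the values below are
# hypothetical placeholders.
import random
import requests

proxy_lists = ['127.0.0.1:8888']          # hypothetical proxy pool
header = {'User-Agent': 'Mozilla/5.0'}    # hypothetical request headers

def fetch(url):
    ip = random.choice(proxy_lists)       # pick a fresh proxy per request
    resp = requests.get(url, headers=header,
                        proxies={'http': 'http://' + ip}, timeout=10)
    return resp.text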
def urls_huifu():
    # Resume after an interruption: scrape only the URLs that were saved
    # to url_list but have no document in item_info yet.
    db_urls = [item['url'] for item in url_list.find()]
    index_urls = [item['url'] for item in item_info.find()]
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y
    for url in rest_of_urls:
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            print(url)
            get_item_info(url)
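# The resume trick above in isolation: every harvested link is saved to
# url_list first, every parsed page lands in item_info, so the set
# difference is exactly the work left after a crash. The database name
# follows the 'ceshi' database used elsewhere in this collection, but it
# and the collection names are assumptions here.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['ceshi']                      # assumed database name
saved = {item['url'] for item in db['url_list'].find()}
parsed = {item['url'] for item in db['item_info'].find()}
rest_of_urls = saved - parsed             # URLs still to be scraped
print(len(rest_of_urls), 'URLs left to parse')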
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import url_list
from page_parsing import get_url_link
from page_parsing import get_item_info
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # force UTF-8 on stdout

def get_all_links(channel):
    for num in range(1, 10):
        try:
            # Harvest both seller types for each listing page.
            get_url_link(channel, num, who_sells=0)
            get_url_link(channel, num, who_sells=1)
        except Exception:
            pass  # skip pages that fail to load or parse

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links, channel_list.split())
    try:
        for item_url in url_list.find():
            print(item_url['url'])
            get_item_info(item_url['url'])
    except Exception:
        pass
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']

def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links_from, channel_list.split())
    for item in url_list.find():
        if item['url'] != "http://jump.zhineng.58.com/jump":  # skip 58's jump/redirect links
            get_item_info(item['url'])
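# Why the pool.map call above sits under the __main__ guard: on platforms
# that spawn rather than fork, every worker process re-imports the module,
# so unguarded top-level work would run once per worker. A minimal,
# self-contained demonstration of the guarded Pool.map pattern:
from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    with Pool(4) as pool:
        print(pool.map(square, range(8)))  # [0, 1, 4, 9, 16, 25, 36, 49]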
def get_all_item(item_url):
    url = item_url['url']
    crb = item_url['crb']
    get_item_info(url, crb)
def get_all_item_info(url):
    page_parsing.get_item_info(url)
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from
from page_parsing import get_item_info

def get_all_links_from(channel):
    for run in range(1, 5):
        get_links_from(channel, run)

if __name__ == '__main__':
    pool = Pool()
    # get_item_info expects a single URL, so calling it on the result of
    # pool.map (a list of None) was a bug; map the link collector instead.
    pool.map(get_all_links_from, channel_list.split())
def get_all_info_from(url):
    # Phone-number pages (/shoujihaoma/) need their own parser;
    # everything else goes through the generic item parser.
    key = url.split('/')
    if key[3] == 'shoujihaoma':
        page_parsing.get_pnumber_info(url)
    else:
        page_parsing.get_item_info(url)
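# The same routing decision with urlparse, which avoids counting slashes by
# hand: for 'http://bj.ganji.com/shoujihaoma/...', url.split('/')[3] and the
# first path segment are the same thing. The example URLs are hypothetical.
from urllib.parse import urlparse

def first_path_segment(url):
    return urlparse(url).path.strip('/').split('/')[0]

print(first_path_segment('http://bj.ganji.com/shoujihaoma/123x.htm'))           # shoujihaoma
print(first_path_segment('http://bj.ganji.com/ershoubijibendiannao/456x.htm'))  # ershoubijibendiannao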
from page_parsing import url_list, get_item_info

for item in url_list.find():
    get_item_info(item['url'])
def getAllUrlLists(channel):
    for num in range(1, 101):
        for url in getUrlList(channel, num):
            get_item_info(url)
x = set(item_all)
y = set(item_any)
item_result = x - y  # entries in item_all that are missing from item_any

def get_links_from_urllist(channel):
    for num in range(1, 101):
        try:
            # Keep fetching until get_links_from signals the last page
            # by returning None.
            while get_links_from(channel, num) is not None:
                pass
        except KeyboardInterrupt:
            break

if __name__ == "__main__":
    # get_item_infos(item_result)
    for i in item_result:
        try:
            get_item_info(i)
        except Exception:
            print(i)  # report URLs that failed to parse
    print('Done!')
    # pool = Pool(processes=4)
    # pool.map(get_links_from_urllist, url_list.split())
    # for link in url_list.split():
    #     get_links_from_urllist(link)
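# The bare print(i) above only reports a failed URL once; a sketch of
# retrying before giving up and keeping the leftovers for a later pass.
# Everything here (failed_urls, the injected parse callable, retries) is
# hypothetical scaffolding, not part of the scripts above.
failed_urls = []

def parse_with_retry(urls, parse, retries=2):
    for url in urls:
        for attempt in range(retries + 1):
            try:
                parse(url)
                break                        # parsed successfully, move on
            except Exception:
                if attempt == retries:
                    failed_urls.append(url)  # give up, keep for inspection

# Usage might look like: parse_with_retry(item_result, get_item_info)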
def get_all_items_info():
    # all_links holds lists of links; parse every link in every sub-list.
    for a in all_links:
        for b in a:
            page_parsing.get_item_info(b)