Example #1
import pymongo
from multiprocessing import Pool
from channel_extact import channel_list
from pages_parsing import get_links_from

client = pymongo.MongoClient('localhost', 27017)
gan_ji = client['ganji']
url_list = gan_ji['url_list']
iterm_info = gan_ji['iterm_info']

# Every URL recorded in url_list; item['url'] holds the URL itself.
db_urls = [item['url'] for item in url_list.find()]
# Every URL whose detail page has already been saved to iterm_info.
index_urls = [item['url'] for item in iterm_info.find()]
# Set difference: URLs that are listed but whose details are not yet parsed.
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    # Walk the paginated listing of one channel, page by page.
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # map applies the function (first argument) to every element of the
    # iterable (second argument) across the worker processes.
    pool.map(get_all_links_from, channel_list.split())
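
All three examples import channel_list and get_links_from from helper modules that are not shown on this page. Below is a minimal sketch of what they presumably contain, assuming Ganji listing pages follow the usual <channel>o<page>/ pagination pattern; the URL values and the CSS selector are illustrative guesses, not the original code.

# channel_extact.py -- a whitespace-separated block of channel index URLs,
# which is why the examples feed channel_list.split() to pool.map.
channel_list = '''
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/rirongbaihuo/
'''

# pages_parsing.py -- scrape one page of one channel's listing into MongoDB.
import requests
import pymongo
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
url_list = client['ganji']['url_list']

def get_links_from(channel, page):
    # Listing pages are assumed to follow the pattern <channel>o<page>/.
    response = requests.get('{}o{}/'.format(channel, page))
    soup = BeautifulSoup(response.text, 'lxml')
    for link in soup.select('td.t a.t'):  # selector is an assumption
        url_list.insert_one({'url': link.get('href')})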
Example #2
from multiprocessing import Pool
from channel_extact import channel_list
from page_parasing import get_links_from, get_item_info, url_list


def get_all_links_from(channel):
    # Walk pages 1-199 of one channel's listing.
    for i in range(1, 200):
        get_links_from(channel, i)


def get_all_item_info():
    # Parse the detail page of every URL collected so far.
    for item in url_list.find():
        get_item_info(item['url'])


if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
    # Parse item details only after every listing page has been collected;
    # keeping this call inside the guard stops child processes from re-running it.
    get_all_item_info()
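
Example #2 additionally imports get_item_info, which parses one detail page and stores the result. A minimal self-contained sketch under the same assumptions as above; the CSS selectors are hypothetical:

# page_parasing.py (continued) -- parse one item's detail page into MongoDB.
import requests
import pymongo
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
iterm_info = client['ganji']['iterm_info']

def get_item_info(url):
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    title = soup.select_one('h1.title-name')  # selector is an assumption
    price = soup.select_one('i.f22')          # selector is an assumption
    iterm_info.insert_one({
        'url': url,
        'title': title.get_text(strip=True) if title else None,
        'price': price.get_text(strip=True) if price else None,
    })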
Example #3
from multiprocessing import Pool
from channel_extact import channel_list
from pages_parsing import get_links_from



def get_all_links_from(channel):
    # Walk pages 1-99 of one channel's listing.
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    pool.map(get_all_links_from, channel_list.split())
# print(channel_list.split())
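
Note that Example #1 computes rest_of_urls but never uses it; presumably it exists so an interrupted crawl can resume by parsing only the detail pages that are still missing. A sketch of how it would plug into the pool, reusing the get_item_info sketched after Example #2:

from multiprocessing import Pool
from page_parasing import get_item_info

if __name__ == '__main__':
    pool = Pool()
    # rest_of_urls is the set computed in Example #1: listed but unparsed URLs.
    pool.map(get_item_info, list(rest_of_urls))
    pool.close()
    pool.join()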