示例#1
0
def crawl_detail_with_url(url, **kwargs):
    """Fetch a paper-detail page and hand the HTML to the parse task.

    :param url: detail-page URL to request
    :param kwargs: metadata forwarded to the parser; must contain 'title'
    :return: None (dispatches 'tasks.parse_detail_data' on success)
    """
    print('论文详情url地址:' + url)

    response = send_request(url)
    # BUG FIX: the original printed response.url BEFORE checking response,
    # crashing with AttributeError when send_request returns None/falsy.
    if not response:
        return
    print('论文详情请求成功' + response.url)
    # Only dispatch when no redirect occurred (final URL matches the request).
    if url == response.url:
        title = kwargs['title']
        print('详情请求状态码', title, kwargs, response.status_code)
        app.send_task('tasks.parse_detail_data', args=(response.text, kwargs))
示例#2
0
def manage_crawl_task():
    """Poll the Redis list "shop" forever; dispatch each popped entry as a crawl task.

    Each entry is expected to be JSON-encoded bytes. The loop sleeps briefly
    between polls to avoid hammering Redis when the list is empty.
    """
    while True:
        raw = r.lpop("shop")
        if raw:
            payload = json.loads(raw.decode())
            result = app.send_task('tasks.crawl', args=(payload, ))
            print(result)
        time.sleep(0.001)
示例#3
0
def manage_crawl_task(urls):
    """Queue one 'tasks.crawl' job for every URL in *urls*."""
    for target in urls:
        app.send_task('tasks.crawl', args=(target, ))
示例#4
0
def parse_page_data(response):
    """Extract paper-detail URLs from one page of keyword search results
    (per category, e.g. politics/law) on the legacy site, and dispatch a
    detail-crawl task for each result item.

    :param response: HTTP response object for a search-result list page
    :return: None (side effect: sends 'tasks.crawl_detail_with_url' tasks)
    """
    print('分页请求成功:' + response.url)

    # Recover the search keyword, category tag and page number from the URL.
    # Raw string fixes the invalid escape sequences (\+, \d) of the original.
    pattern = re.compile(r'.*?q=(.*?)\+.*?WF_(.*?)&.*?p=(\d+)', re.S)
    # NOTE: raises IndexError when the URL does not match — same as before.
    result = re.findall(pattern, response.url)[0]
    print(result)
    keyword = parse.unquote(result[0])
    tag = result[1]
    currentPage = result[2]
    etree_html = etree.HTML(response.text)
    itemList = etree_html.xpath(
        '//div[@class="record-item-list"]/div[@class="record-item"]')
    print(tag + keyword + '第' + str(currentPage) + '页,' + '获取到了' +
          str(len(itemList)) + '条数据。')

    # One mapping replaces three duplicated if/elif branches; the URLs built
    # are byte-identical to the originals, e.g.
    # http://www.wanfangdata.com.cn/details/detail.do?_type=conference&id=...
    tag_to_type = {
        'HY': 'conference',  # conference papers
        'XW': 'degree',      # degree theses
        'QK': 'perio',       # periodical (journal) articles
    }
    for item in itemList:
        itemTitle = ''.join(
            item.xpath('.//a[@class="title"]//text()')).replace(' ', '')
        itemUrl = item.xpath('.//a[@class="title"]/@href')[0]
        # Last path segment of the item URL is its id.
        itemId = itemUrl.split('/')[-1]
        searchType = tag_to_type.get(tag)
        if searchType is None:
            # Unknown tag: the original dispatched nothing for these either.
            continue
        newItemUrl = ('http://www.wanfangdata.com.cn/details/detail.do?_type='
                      + searchType + '&id=' + itemId)
        info = {
            'searchKeyWord': keyword,
            'searchType': searchType,
            'title': itemTitle
        }
        app.send_task('tasks.crawl_detail_with_url',
                      args=(newItemUrl, ),
                      kwargs=info)

    # Next-page links; presumably consumed by follow-up logic outside this
    # view — TODO confirm, the value is unused within this function.
    nextUrls = etree_html.xpath('//p[@class="pager"]//a[@class="page"]/@href')
示例#5
0
from local_test.task1 import do_test
import random
from workers import app

# Enqueue 999 'do_test' jobs on the task1 queue/routing key.
# The original drew two random ints (x, y) per iteration but never used them
# (args=() shows the task is invoked with no arguments) — removed as dead code.
for _ in range(1, 1000):
    app.send_task('local_test.task1.do_test',
                  queue='task1_queue',
                  routing_key='task1_routing',
                  args=())
示例#6
0
def manage_crawl_task(urls):
    """Queue a 'tasks.crawl_pageurl_and_detailurl' job for each URL given."""
    for page_url in urls:
        app.send_task('tasks.crawl_pageurl_and_detailurl', args=(page_url, ))