Example #1
    def start_requests(self):
        start_url = "https://xueqiu.com/user/show.json?id="

        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)
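A note on the pattern above: the manual index bookkeeping (all_page_n, now_page_n) can be written more directly with enumerate and lazy logger arguments. A minimal equivalent sketch, reusing the same collection and helper names as the example:

    def start_requests(self):
        start_url = "https://xueqiu.com/user/show.json?id="
        db = util.set_mongo_server()

        # dedupe owner ids straight from the cursor
        owner_ids = list({d['owner_id'] for d in
                          db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0})})

        total = len(owner_ids)
        for i, owner_id in enumerate(owner_ids):
            if i % 1000 == 0:
                # let the logger format the message lazily
                self.logger.info('%s (%s / %s) %.1f%%',
                                 owner_id, i, total, i / total * 100)
            yield Request(url=start_url + str(owner_id),
                          meta={'user_id': owner_id},
                          callback=self.parse)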
Example #2
    def start_requests(self):
        start_url = "http://xueqiu.com/friendships/followers.json?size=50&uid="

        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1)))
                )  #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)
Example #3
    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/rebalancing/history.json?count=50&page=1&cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/rebalancing/history?count=20&page=1&cube_symbol='

        # get start url from MongoDB
        db = util.set_mongo_server()
        symbols = []

        for s in db.xq_cube_info.find({'cube_type':self.cube_type}, {'symbol': 1, '_id': 0}):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))

        # iterate each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            symbol = symbols[i].strip()
            now_page_n = i

            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol
            else:
                # otherwise url would be unbound below; skip unknown types
                continue

            # progress
            if i % 500 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (symbol, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url=url,
                          callback=self.parse,
                          meta={'cube_type': self.cube_type,
                                'symbol': symbol, 'page': 1})
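The meta dict carries page=1, which suggests the parse callback pages through the rebalancing history. A sketch of how that follow-up request is commonly issued, assuming json is imported as in Example #9; the 'list' payload key is an assumption, not confirmed by this snippet:

    def parse(self, response):
        data = json.loads(response.text)
        # ... store the rebalancing records here ...

        # keep requesting the next page while results come back
        # ('list' as the payload key is an assumption)
        page = response.meta['page']
        if data.get('list'):
            next_url = response.url.replace('page=%d' % page,
                                            'page=%d' % (page + 1))
            yield Request(url=next_url,
                          meta=dict(response.meta, page=page + 1),
                          callback=self.parse)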
Example #4
    def start_requests(self):
        db = util.set_mongo_server()
        reply_author_urls = []
        #replys = list(db.CrawlerGuba.aggregate([{'$project':{'_id': 0, 'reply': 1}} ,{'$unwind': '$reply'}]))
        for doc in db.guba_stock_posts.find(
                {}, {'reply.reply_author_url': 1, '_id': 0}):
            if 'reply' in doc:
                for e in doc['reply']:
                    if 'reply_author_url' in e:
                        reply_author_urls.append(e['reply_author_url'])
        reply_author_urls = list(set(reply_author_urls))
        all_page_n = len(reply_author_urls)
        for i in range(all_page_n):
            reply_author_url = reply_author_urls[i]
            url = reply_author_url

            if i % 1000 == 0:
                self.logger.info('%s / %s' % (str(i), str(all_page_n)))
                util.get_progress(all_page=all_page_n,
                                  logger=self.logger,
                                  spider_name=self.name,
                                  start_at=self.start_at)

            yield Request(url=url,
                          meta={'reply_author_url': reply_author_url},
                          callback=self.parse)
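The commented-out aggregate above hints at a server-side alternative: MongoDB can unwind the reply arrays and dedupe the author URLs itself, instead of looping over full documents in Python. A sketch against the same collection:

        pipeline = [
            {'$unwind': '$reply'},
            {'$match': {'reply.reply_author_url': {'$exists': True}}},
            # grouping on the URL dedupes it server-side
            {'$group': {'_id': '$reply.reply_author_url'}},
        ]
        reply_author_urls = [d['_id'] for d in
                             db.guba_stock_posts.aggregate(pipeline)]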
Example #5
    def start_requests(self):
        start_url = "https://xueqiu.com/v4/statuses/user_timeline.json?&count=20&user_id="

        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        #owner_ids = ["1001223822"]

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            owner_id = owner_ids[i]
            now_page_n = i
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)
Example #6
    def __init__(self):
        # set logger
        self.logger = util.set_logger('pipeline', LOG_FILE_PIPELINE)

        # set up the MongoDB server
        self.db = util.set_mongo_server()

        # set up the Redis server
        self.redis_server = util.set_redis_server()
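The util helpers used throughout these examples are not shown; a minimal sketch of what they plausibly wrap is below. The connection URLs, database name, and log format are assumptions, not taken from the source:

import logging
import pymongo
import redis

def set_logger(name, log_file):
    # file logger with a conventional format (the format itself is assumed)
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s %(name)s %(levelname)s: %(message)s'))
    logger.addHandler(handler)
    return logger

def set_mongo_server():
    # host and database name are assumptions
    return pymongo.MongoClient('mongodb://localhost:27017')['crawler']

def set_redis_server():
    # host/port/db are assumptions
    return redis.StrictRedis(host='localhost', port=6379, db=0)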
Example #7
    def start_requests(self):
        #start_url="https://xueqiu.com/stock/portfolio/stocks.json?size=5000&tuid="
        start_url = "https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=10000&category=3&pid=-120&uid="

        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)
Example #8
    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/nav_daily/all.json?cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/nav_daily/all?cube_symbol='

        # get start url from MongoDB
        db = util.set_mongo_server()
        
        symbols = []
        for s in db.xq_cube_info.find({'cube_type':self.cube_type}, {'symbol': 1, '_id': 0}):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))

        for s in db.fail.find({}, {'cube_symbol': 1, '_id': 0}):
            symbols.append(s['cube_symbol'])
        symbols = list(set(symbols))
        self.logger.info('symbols to crawl: %s', len(symbols))

        # iterate each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            now_page_n = i
            symbol = symbols[i].strip()
            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol
            else:
                # otherwise the url from the previous iteration would be reused; skip
                continue

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (symbol, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
                #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          meta={'symbol': symbol,
                                'cube_type': self.cube_type},
                          callback=self.parse)
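Both queries above pull one field and then dedupe in Python; pymongo's distinct collapses the two steps. A sketch with the same collections and fields:

        symbols = set(db.xq_cube_info.distinct(
            'symbol', {'cube_type': self.cube_type}))
        symbols |= set(db.fail.distinct('cube_symbol'))
        symbols = list(symbols)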
Example #9
import base64
import os
import random
import time
import json
import re
import pymongo
import redis
import logging
import urllib.request
from datetime import datetime, timedelta
from crawler.settings import *
from crawler.spiders import util
from scrapy import signals
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
from twisted.web._newclient import ResponseNeverReceived
from twisted.python.failure import Failure
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError

conn = util.set_redis_server()
db = util.set_mongo_server()
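Given the imports above (RetryMiddleware, response_status_message, the Twisted connection errors), this module most likely defines a retry middleware that rotates proxies on failure. A minimal sketch of that pattern; the class name and retry policy are assumptions:

class ProxyRetryMiddleware(RetryMiddleware):
    # exceptions treated as retryable (hypothetical policy)
    RETRY_EXCEPTIONS = (ResponseNeverReceived, TimeoutError,
                        ConnectionRefusedError, ConnectError)

    def process_response(self, request, response, spider):
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.RETRY_EXCEPTIONS):
            # a real implementation would also swap in a fresh proxy here
            return self._retry(request, str(exception), spider)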


Example #10
import ast
import copy
import datetime
import json
import re

from scrapy import Spider, Request

# util, LOG_FILE_MMB and PriceItem come from the surrounding project;
# their exact module paths are not shown in this snippet.

class MMBHistSpider(Spider):
    name = 'MMBHist'
    logger = util.set_logger(name, LOG_FILE_MMB)
    handle_httpstatus_list = [404, 460, 504]
    db = util.set_mongo_server()

    # crawl items listed by a single store ("一家在售")
    if_crawl_onestore = True
    # crawl items listed by multiple stores ("多家在售")
    if_crawl_multstore = False


    def start_requests(self):

        # items listed by a single store ("一家在售")
        if self.if_crawl_onestore:
            bjids = []
            for id in self.db["MMB"].find({'bjid': {'$exists': True}}, {'bjid': 1, '_id': 0}):
                bjids.append(id['bjid'])
            bjids = list(set(bjids))

            # iterate each bjid
            all_page_n = len(bjids)
            for i in range(all_page_n):

                bjid = bjids[i].strip()
                now_page_n = i

                url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)

                # progress
                if i % 500 == 0:
                    self.logger.info(
                        '一家在售: (%s / %s) %s%%' %
                        (str(now_page_n), str(all_page_n),
                         str(round(float(now_page_n) / all_page_n * 100, 1))))

                yield Request(url=url, callback=self.parse)

        # items listed by multiple stores ("多家在售")
        if self.if_crawl_multstore:
            p_infos = []
            # select records where (spid, name, url) is distinct
            pipeline = [
                {'$match':{'bjid':{'$exists':False}}},
                {'$group': {'_id': {'spid': '$spid', 'name': '$name', 'url': '$url'}}},
            ]
            cur = self.db.MMB.aggregate(pipeline)
            for i in cur:
                p_infos.append(i['_id'])

            all_page_n_mult = len(p_infos)
            for i in range(all_page_n_mult):
                p_info = p_infos[i]

                url = p_info['url']
                now_page_n = i

                # progress
                if i % 500 == 0:
                    self.logger.info(
                        '多家在售: (%s / %s) %s%%' %
                        (str(now_page_n), str(all_page_n_mult),
                         str(round(float(now_page_n) / all_page_n_mult * 100, 1))))

                yield Request(url=url, meta={"p_info": p_info}, callback=self.parse_mult)

            #yield Request(url = 'http://www.manmanbuy.com/pb_567731.aspx', meta = {"p_info":p_info}, callback = self.parse_mult)

    def parse_mult(self, response):
        try:
            if response.status == 200:
                # carry over the item from the previous step
                p_info = response.meta['p_info']

                # parse the links for the multiple platforms selling this product
                nodes = response.xpath('//div[contains(@class, "pro-mall-list")]//ul//li//div[contains(@class, "item ")]')

                for n in nodes:
                    # seller name, distinct from siteName: for the same siteName = 天猫, seller_name can be "vivo旗舰店" or "vivo天诚专卖店"
                    seller_name = n.xpath('div[contains(@class, "mall")]//text()').extract()
                    seller_name = ' '.join(' '.join(seller_name).split())
            
                    # get skuid
                    skuid = n.xpath('@skuid').extract()[0]

                    # get bjid
                    bjid = n.xpath('@v').extract()[0].strip()
                    bjid = ast.literal_eval(bjid)['bjid']

                    p_info.update({"seller_name": seller_name,
                                   "skuid": skuid, "bjid": bjid})

                    # build the price-history request
                    url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)

                    yield Request(url=url, meta={"p_info": p_info}, callback=self.parse)

            else:
                self.logger.error('HTTP status not 200: %s \n %s' % (response.url, response.body))  
                
        except Exception as ex:
            self.logger.error('Parse Exception - "parse_mult": %s %s' % (str(ex), response.url))

    def parse(self, response):
        try:
            # on HTTP 200, parse normally
            if response.status == 200:
                # carry over the item from the previous step (if any)
                p_info = {}
                if "p_info" in response.meta:
                    p_info = response.meta['p_info']

                # parse the price JSON
                body = re.sub(r'\s', '', response.body.decode('gbk'))
                body = json.loads(body)

                # add the basic product info to p_info
                p_info.update({k: body[k] for k in ('siteName', 'siteId', 'zouShi', 'bjid', 'spName', 'spUrl', 'spbh', 'zouShi_test')})

                # p_hist holds only the date/price pairs
                p_hist = body['datePrice']
                p_hist = re.findall(r"\[(.+?)\]", p_hist)

                # "unroll" the price list into one doc per data point
                docs = []
                lastcrawl = datetime.datetime.utcnow()
                for p in p_hist:
                    # date
                    m = re.search(r"Date\.UTC\((.+?)\),([\d.]+)", p)
                    if m:
                        date = m.group(1)
                        # convert the Beijing-time strptime result to UTC
                        date = (datetime.datetime.strptime(date, "%Y,%m,%d")
                                - datetime.timedelta(hours=8))
                    
                        # price
                        price = float(m.group(2).strip())
                        
                        # create doc and add to docs
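                        # doc aliases p_info; the deepcopy below snapshots its current state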
                        doc = p_info
                        doc.update({"date":date, "price":price, "lastcrawl":lastcrawl})
                        docs.append(copy.deepcopy(doc))

                item = PriceItem()
                item['content'] = docs
                yield item
                    
            else:
                self.logger.error('Got %s: %s' % (response.status, response.url))

        except Exception as ex:
            self.logger.error('Parse Exception - "parse": %s %s' % (str(ex), response.url))
            self.logger.info(str(response.body))
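For reference, the regexes in parse imply that datePrice holds Highcharts-style Date.UTC entries. A standalone walk-through with a made-up sample string (the values are hypothetical):

import datetime
import re

# hypothetical sample of what body['datePrice'] appears to contain
date_price = "[Date.UTC(2021,1,5),1999.00],[Date.UTC(2021,1,6),1899.00]"

for p in re.findall(r"\[(.+?)\]", date_price):
    m = re.search(r"Date\.UTC\((.+?)\),([\d.]+)", p)
    if m:
        # shift the Beijing-time date to UTC, as the spider does
        date = (datetime.datetime.strptime(m.group(1), "%Y,%m,%d")
                - datetime.timedelta(hours=8))
        price = float(m.group(2))
        print(date, price)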