Exemplo n.º 1
0
    def __init__(self, *args, **kwargs):
        """Prepare per-area crawl bookkeeping and the shared helper objects.

        Keyword Args:
            page: optional value converted with ``int`` and stored as
                ``self.page_all``. When the conversion fails (wrong type or
                a non-numeric string) 100 is stored instead. When the key is
                absent, ``self.page_all`` is not assigned by this method.
        """
        # Presumably the district slugs substituted for ``{area}`` in
        # ``self.list_url`` -- confirm against the request-building code.
        area_names = (
            "heping", "nankai", "hexi", "hebei", "hedong", "hongqiao",
            "xiqing", "beichen", "dongli", "jinnan", "tanggu", "kaifaqu",
            "diyidajie", "dierdajie", "disandajie", "disidajie", "diwudajie",
        )
        # Each area gets its own dict so the counters can be updated
        # independently of the other areas.
        self.areas = [
            {"name": area_name, "page": 0, "now": 1}
            for area_name in area_names
        ]

        self.area_now = 0
        self.util = util()
        self.list_url = 'https://tj.lianjia.com/ershoufang/{area}/pg{page}/'
        self.r = redis.Redis(host=settings.REDIS['host'], port=settings.REDIS['port'])

        super(CrawlLianjiaSpider, self).__init__(*args, **kwargs)
        if "page" in kwargs:
            try:
                self.page_all = int(kwargs['page'])
            except (TypeError, ValueError):
                # int() raises ValueError for a non-numeric string and
                # TypeError for e.g. None; both fall back to the default.
                self.page_all = 100
 def __init__(self):
     """Create the helper objects and the label-to-field lookup table.

     ``self.property_map`` has Chinese property labels as keys and short
     English field names as values.
     """
     self.util = util()

     redis_conf = settings.REDIS
     self.r = redis.Redis(host=redis_conf['host'], port=redis_conf['port'])

     # (label, field name) pairs.
     # NOTE(review): 'flood' looks like a misspelling of 'floor'; it is kept
     # as-is because other code may depend on the exact key.
     label_field_pairs = (
         ("房屋户型", 'layout'),
         ("所在楼层", 'flood'),
         ("建筑面积", 'area'),
         ("户型结构", 'apartment_structure'),
         ("建筑类型", 'building_type'),
         ("装修情况", 'renovation'),
         ("梯户比例", 'ladder'),
         ("供暖方式", 'heating'),
         ("产权年限", 'property_term'),
         ("交易权属", 'ownership'),
         ("挂牌时间", 'list_time'),
         ("上次交易", 'last_trade'),
         ("房屋朝向", 'direction'),
         ("房屋用途", 'purpose'),
         ("房屋年限", 'hold_years'),
         ("抵押信息", 'mortgage'),
         ("房本备件", 'house_register'),
         ("核心卖点", 'core_point'),
         ("周边配套", 'periphery'),
         ("交通出行", 'traffic'),
         ("小区介绍", 'residential_desc'),
         ("户型介绍", 'layout_desc'),
         ("配备电梯", 'elevator'),
     )
     self.property_map = dict(label_field_pairs)
 def __init__(self):
     """Set up helpers, endpoint URLs, paging state and the request form.

     No network traffic happens here apart from whatever the helper
     constructors (``util``, ``Verify``, ``redis.Redis``) do themselves.
     """
     # Collaborators, created in the same order as before.
     self.util = util()
     self.verify_tool = Verify()
     self.r = redis.Redis(host='127.0.0.1')

     # Endpoints and the file name used for the captcha image
     # (presumably downloaded from verify_url -- confirm).
     self.login_url = 'https://cas.baidu.com/?action=login'
     self.verify_url = 'http://cas.baidu.com/?action=image&key={rand}'
     self.verify_save_name = 'verify.jpg'

     # Paging state.
     self.page_now = 1
     self.max_page = 51

     # Site id -> per-site bookkeeping; every site starts on page 1 with no
     # recorded time.
     tracked_sites = (
         ('8918649', 'm.91pme.com'),
         ('7802984', '91pme.com'),
         ('8918810', 'mm.91pme.com'),
     )
     self.sites_map = dict(
         (site_id, {'name': host, 'page_now': 1, 'time': None})
         for site_id, host in tracked_sites
     )

     # Columns requested from the report, sent as one comma-joined string.
     indicator_names = [
         "start_time", "area", "source", "access_page", "searchword",
         "visitorId", "ip", "visit_time", "visit_pages",
     ]
     self.formdata = {
         "siteId": "7802984",
         "order": "start_time,desc",
         "offset": "0",
         "pageSize": "100",
         "tab": "visit",
         "timeSpan": "14",
         "indicators": ",".join(indicator_names),
         "reportId": "4",
         "method": "trend/latest/a",
         "queryId": "",
     }

     # Attribute name (including its spelling) is part of the interface.
     self.lastest_access_time = {}
    def __init__(self, *args, **kwargs):
        """Initialise paging state for the fx678 article list crawl.

        Keyword Args:
            max: optional value converted with ``int`` and stored as
                ``self.max_page`` (defaults to 1 when absent).
        """
        self.util = util()
        self.url_format = 'http://brokers.fx678.com/articlelist/{id}/{page}'
        self.type_index, self.page_index, self.max_page = 0, 1, 1

        super(CrawlFx678ArticleSpider, self).__init__(*args, **kwargs)

        # Nothing to override when the caller did not pass ``max``.
        if "max" not in kwargs:
            return
        self.max_page = int(kwargs['max'])
    def __init__(self, *args, **kwargs):
        """Initialise the calendar spider and apply the optional ``args`` string.

        ``kwargs['args']`` is a comma-separated list of ``key:value`` pairs.
        Recognised keys:

        * ``start`` -- a ``YYYY-MM-DD`` date, or an integer number of days
          added to the current time; the result is stored in
          ``self.date_now``.
        * ``max`` -- integer stored in ``self.max_days``.
        * ``after`` -- integer stored in ``self.after_days``.
        * ``jiedu`` -- stored unchanged in ``self.jiedu``.

        A pair without a ``:`` is treated as having an empty value rather
        than raising ``IndexError``. Values that fail to parse are reported
        with ``print`` and the previous attribute value is kept.
        """
        self.util = util()
        # self.date_now is read before this method assigns it, so it must
        # already exist (presumably as a class attribute -- confirm).
        self.date_end = self.date_now + datetime.timedelta(days=1)

        super(CrawlFx678CalendarSpider, self).__init__(*args, **kwargs)
        if 'args' not in kwargs:
            return

        # Parse "k1:v1,k2:v2"; only the text between the first and second
        # ':' is used as the value, and a missing value becomes ''.
        params = {}
        for pair in kwargs['args'].split(","):
            pieces = pair.split(":")
            params[pieces[0]] = pieces[1] if len(pieces) > 1 else ''

        if "start" in params:
            try:
                if re.search(r"\d{4}\-\d{2}\-\d{2}", params['start']):
                    date_start = datetime.datetime.strptime(
                        params['start'], "%Y-%m-%d")
                else:
                    # Not a date: interpret it as a day offset from now.
                    day_offset = datetime.timedelta(days=int(params['start']))
                    date_start = datetime.datetime.now() + day_offset

                self.date_now = date_start
            except ValueError:
                # Message: not a valid time format, crawling from today.
                print(params['start'] + ' 不是正确格式的时间,已默认从今天开始抓取')

        if "max" in params:
            try:
                self.max_days = int(params['max'])
            except ValueError:
                # Message: not a valid day count, crawling all data.
                print(params['max'] + ' 不是正确的抓取天数,已默认抓取全部数据')

        if "after" in params:
            try:
                self.after_days = int(params['after'])
            except ValueError:
                # Message: not a valid look-ahead day count, using 60 days.
                print(params['after'] + ' 不是正确的向后抓取天数,已默认抓取今天之后60天的数据')

        if "jiedu" in params:
            self.jiedu = params['jiedu']

        # Recompute the end date: a max-days window starts at date_now,
        # otherwise the window ends after_days from the current time.
        if self.max_days is not None:
            self.date_end = self.date_now + datetime.timedelta(
                days=int(self.max_days))
        else:
            self.date_end = datetime.datetime.now() + datetime.timedelta(
                days=int(self.after_days))
Exemplo n.º 6
0
    def __init__(self):
        """Set up helpers, endpoint URLs, per-site state and the report form."""
        self.all = []
        # Name carries the construction time (whole seconds since the epoch).
        created_at = int(time.time())
        self.data_name = 'baidu_rate_%d' % created_at

        # Collaborators, created in the same order as before.
        self.util = util()
        self.verify_tool = Verify()
        self.verify_save_name = 'verify.jpg'
        self.r = redis.Redis(host='127.0.0.1')

        self.login_url = 'https://cas.baidu.com/?action=login'
        self.verify_url = 'http://cas.baidu.com/?action=image&key={rand}'
        self.page_now = 0

        # Site id -> per-site bookkeeping; every site starts on page 1.
        tracked_sites = (
            ('8918649', 'm.91pme.com'),
            ('7802984', '91pme.com'),
        )
        self.sites_map = dict(
            (site_id, {'name': host, 'page_now': 1})
            for site_id, host in tracked_sites
        )

        # Form fields; "indicators" starts empty here.
        self.formdata = {
            'productId': 'fcWord,0',
            'fcPlanId': '-1',
            'fcUnitId': '-1',
            'siteId': '8918649',
            'st': '1512489600000',
            'et': '1512489600000',
            'indicators': '',
            'order': 'bounce_ratio,desc',
            'offset': '0',
            'target': '-1',
            'flag': 'fcWord',
            'userId': '0',
            'fcWordType': 'fcSearchWord',
            'clientDevice': 'all',
            'reportId': '6',
            'method': 'pro/product/a',
            'queryId': '',
        }

        # Metric names, one per line for easier diffing.
        self.indicators = [
            'show_count',
            'clk_count',
            'cost_count',
            'ctr',
            'cpm',
            'pv_count',
            'visit_count',
            'visitor_count',
            'new_visitor_count',
            'new_visitor_ratio',
            'in_visit_count',
            'bounce_ratio',
            'avg_visit_time',
            'avg_visit_pages',
            'arrival_ratio',
            'trans_count',
            'trans_ratio',
            'avg_trans_cost',
            'income',
            'profit',
            'roi',
        ]
Exemplo n.º 7
0
    def __init__(self, *args, **kwargs):
        """Initialise search state and apply the optional ``args`` string.

        ``kwargs['args']`` is a comma-separated list of ``key`` or
        ``key:value`` tokens. Only the presence of ``hot`` is checked: it
        sets ``self.only_hot`` to True. A token without ``:`` (for example
        a bare ``hot``) is accepted instead of raising ``IndexError``.
        """
        super(CrawlWeixinSearchSpider, self).__init__(*args, **kwargs)
        self.util = util()
        self.r = redis.Redis(host=REDIS['host'], port=REDIS['port'])
        self.page_url = "http://weixin.sogou.com/weixin?usip=&query={query}&ft=&tsn=1&et=&interation=&type=2&wxid=&page={page}&ie=utf8"
        self.type_index = 0
        # Search keywords with their paging counters; the first one is active.
        self.type = [{'name': '五常大米', 'page_now': 1, 'page_all': 1}]
        self.type_now = self.type[0]
        self.only_hot = False
        self.typename = self.type_now['name']
        self.referer = "http://weixin.sogou.com/weixin?type=2&s_from=input&query={query}&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=10939&sst0={time}&lkt=6%2C1513059170545%2C1513059180409"

        if 'args' in kwargs:
            # Parse "k1:v1,k2"; a missing value becomes '' so that flag-style
            # tokens do not crash the constructor.
            params = {}
            for pair in kwargs['args'].split(","):
                pieces = pair.split(":")
                params[pieces[0]] = pieces[1] if len(pieces) > 1 else ''

            if "hot" in params:
                self.only_hot = True
                print("Only crawl hot keywords")
 def __init__(self):
     """Store the stats URL template and create the Redis and util helpers."""
     # Template with {house_id} and {residential_id} placeholders.
     self.base_url = "https://tj.lianjia.com/ershoufang/housestat?hid={house_id}&rid={residential_id}"

     redis_host = REDIS['host']
     self.r = redis.Redis(host=redis_host)
     self.util = util()

     # Not known at construction time.
     self.residential_id = None
Exemplo n.º 9
0
 def __init__(self):
     """Create the util helper and a Redis client for the local host."""
     self.util = util()

     local_host = '127.0.0.1'
     self.r = redis.Redis(host=local_host)
 def __init__(self):
     """Create the util helper, a local Redis client and an item counter."""
     self.util = util()

     # Connection settings spelled out once and passed through as keywords.
     redis_options = {"host": "127.0.0.1", "port": 6379, "db": 0}
     self.r = redis.Redis(**redis_options)

     self.item_index = 0
Exemplo n.º 11
0
 def __init__(self):
     """Store the feed URL template, paging counters and login helpers."""
     # Template with {current_page}, {pre_page}, {page} and {time} placeholders.
     self.base_url = "https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=102803_ctg1_1760_-_ctg1_1760&pagebar=0&tab=home&current_page={current_page}&pre_page={pre_page}&page={page}&pl_name=Pl_Core_NewMixFeed__3&id=102803_ctg1_1760_-_ctg1_1760&script_uri=/&feed_type=1&domain_op=102803_ctg1_1760_-_ctg1_1760&__rnd={time}"

     # All three paging counters start at 1.
     first_page = 1
     self.page = first_page
     self.current_page = first_page
     self.pre_page = first_page

     self.util = util()
     self.login_time = 0
     self.login_cmd = CmdWeiboLogin()
Exemplo n.º 12
0
 def __init__(self):
     """Create the util helper used by this object."""
     helper = util()
     self.util = helper
Exemplo n.º 13
0
 def __init__(self, *args, **kwargs):
     """Initialise the spider and apply the optional ``all`` switch.

     Keyword Args:
         all: optional flag stored as ``self.crawl_all_page``. String
             values are interpreted textually, so ``''``, ``'0'``,
             ``'false'``, ``'no'`` and ``'off'`` (any case, surrounding
             whitespace ignored) mean False and every other string means
             True. Non-string values go through ``bool`` as before.
     """
     self.util = util()
     super(CrawlJin10ArticleSpider, self).__init__(*args, **kwargs)
     if "all" in kwargs:
         flag = kwargs['all']
         # Plain bool() treats every non-empty string -- including "false"
         # and "0" -- as True, so decode strings explicitly.
         if hasattr(flag, 'strip'):
             flag = flag.strip().lower() not in ('', '0', 'false', 'no', 'off')
         self.crawl_all_page = bool(flag)
Exemplo n.º 14
0
 def __init__(self):
     """Create the util and Redis helpers and store the record URL template."""
     self.util = util()

     # Not known at construction time.
     self.house_id = None
     # Template with an {id} placeholder.
     self.baseurl = 'https://tj.lianjia.com/ershoufang/houseseerecord?id={id}'

     redis_conf = settings.REDIS
     self.r = redis.Redis(host=redis_conf['host'], port=redis_conf['port'])