Exemplo n.º 1
0
 def start_requests(self):
     """Reset the district table and seed one request per known city.

     Truncates ``t_web_lj_district``, reads every ``t_web_lj_city`` row
     whose ``url`` is not null, and yields a request for that city's
     ``ershoufang/`` page. The city id is forwarded in ``meta['id']`` and
     responses are handled by ``self.get_district``.
     """
     Mysql().truncate_table('t_web_lj_district')
     city_sql = '''
                 select id,url
                 from t_web_lj_city
                 where url is not null
             '''
     cities = Mysql().query_by_sql(city_sql)
     for city in cities:
         listing_url = city['url'] + 'ershoufang/'
         yield Request(listing_url,
                       meta={'id': city['id']},
                       callback=self.get_district,
                       dont_filter=True)
Exemplo n.º 2
0
 def start_requests(self):
     """Reset the community table and seed one request per district.

     Truncates ``t_web_lj_community``, joins districts to their city, and
     yields a request for ``<city url>ershoufang/<district route>/``. The
     district id is forwarded in ``meta['id']`` and responses are handled
     by ``self.get_community``.
     """
     Mysql().truncate_table('t_web_lj_community')
     district_sql = '''
                         select di.id,di.route,ci.url
                         from t_web_lj_city ci,t_web_lj_district di
                         where di.city_id=ci.id
                     '''
     districts = Mysql().query_by_sql(district_sql)
     for district in districts:
         listing_url = district['url'] + 'ershoufang/' + district['route'] + '/'
         yield Request(listing_url,
                       meta={'id': district['id']},
                       callback=self.get_community,
                       dont_filter=True)
 def get_residence_url(self, response):
     """Yield requests for unseen residences on a listing page, then paginate.

     For every ``<li>`` under ``/html/body/div[4]/div[1]/ul`` the residence
     link (element with class ``img``) is looked up in
     ``t_web_lj_residence``; entries for which the lookup returns a list
     are skipped. Remaining entries produce a request handled by
     ``self.get_residence_info`` whose ``meta['d_c']`` is
     ``'<district>_<community>'``, both taken from the second-to-last path
     segment of the corresponding links.

     If the pager element reports more pages, a request for the next page
     is yielded with this method as callback.

     :param response: the listing-page response to parse.
     :raises ValueError, SyntaxError: if the ``page-data`` attribute is not
         a plain literal.
     """
     # Local import keeps this block self-contained; literal_eval only
     # accepts literals, unlike eval(), which would execute arbitrary
     # expressions taken from the remote page.
     import ast

     page = Selector(response)
     for entry_html in page.xpath('/html/body/div[4]/div[1]/ul/li').extract():
         entry = Selector(text=entry_html)
         url = entry.xpath('//*[@class="img"]/@href').extract_first()
         if url is None:
             # No residence link in this entry: nothing to request.
             continue
         # NOTE(review): the URL comes from the scraped page and is
         # interpolated into the SQL text; switch to a parameterized query
         # if the Mysql helper offers one.
         is_exist = Mysql().query_by_sql(
             "select * from t_web_lj_residence where url='%s'" % url)
         # presumably the helper returns a list only when rows were found
         # -- TODO confirm against the Mysql helper.
         if isinstance(is_exist, list):
             continue
         district_href = entry.xpath(
             '//*[@class="district"]/@href').extract_first()
         community_href = entry.xpath(
             '//*[@class="bizcircle"]/@href').extract_first()
         if district_href is None or community_href is None:
             # The d_c key cannot be built without both links.
             continue
         district = district_href.split('/')[-2]
         community = community_href.split('/')[-2]
         yield Request(url,
                       meta={'d_c': district + '_' + community},
                       callback=self.get_residence_info,
                       dont_filter=True)

     page_box = page.xpath(
         '//*[@class="page-box house-lst-page-box"]').extract_first()
     if page_box is None:
         return
     page_data = Selector(text=page_box).xpath('//@page-data').extract_first()
     if page_data is None:
         return
     # Parse the attribute once and read both counters from the result.
     paging = ast.literal_eval(page_data)
     total_page = paging['totalPage']
     cur_page = paging['curPage']
     if total_page > cur_page:
         # Keep the URL up to and including the first '/' found at or after
         # index 30, then append the next page segment.
         # NOTE(review): the offset 30 presumably skips the scheme, host and
         # section prefix -- confirm it holds for every city URL.
         base_url = response.url[0:response.url.find('/', 30) + 1]
         yield Request(base_url + 'pg' + str(cur_page + 1) + '/',
                       callback=self.get_residence_url,
                       dont_filter=True)
Exemplo n.º 4
0
 def start_requests(self):
     """Seed one second-hand-listing request per community.

     Joins communities to their district and city, then yields a request
     for ``<city url>ershoufang/<community route>/co32/`` handled by
     ``self.get_esf_url``.
     """
     community_sql = '''
                     select co.route,c.url
                     from t_web_lj_community co,t_web_lj_district d,t_web_lj_city c
                     where d.id=co.district_id and d.city_id=c.id
                 '''
     communities = Mysql().query_by_sql(community_sql)
     for community in communities:
         listing_url = (community['url'] + 'ershoufang/' +
                        community['route'] + '/co32/')
         yield Request(listing_url,
                       callback=self.get_esf_url,
                       dont_filter=True)
    def start_requests(self):
        """Build the district/community lookup, then seed residence requests.

        Every joined community row is first recorded in ``self.d_c`` under
        the key ``'<district route>_<community route>'`` with the community
        id as value. Afterwards one request per row is yielded for
        ``<city url>xiaoqu/<community route>/``, handled by
        ``self.get_residence_url``.
        """
        community_sql = '''
                            select co.id,di.route d_r,co.route c_r,ci.url
                            from t_web_lj_community co,t_web_lj_district di,t_web_lj_city ci
                            where co.district_id=di.id and di.city_id=ci.id;
                        '''
        communities = Mysql().query_by_sql(community_sql)

        # First pass: the lookup is filled completely before any request
        # is yielded.
        for community in communities:
            lookup_key = community['d_r'] + '_' + community['c_r']
            self.d_c[lookup_key] = community['id']

        # Second pass: one request per community page.
        for community in communities:
            residence_url = community['url'] + 'xiaoqu/' + community['c_r'] + '/'
            yield Request(residence_url,
                          callback=self.get_residence_url,
                          dont_filter=True)
Exemplo n.º 6
0
    def start_requests(self):
        """Seed one residence-listing request per community.

        Joins communities to their district and city, then yields a request
        for ``<city url>xiaoqu/<community route>/`` handled by
        ``self.get_residence_url``. The city's ``cn_name`` is forwarded in
        ``meta['rsd_ci']``.
        """
        community_sql = '''
                            select ci.cn_name,co.route c_r,ci.url
                            from t_web_lj_community co,t_web_lj_district di,t_web_lj_city ci
                            where co.district_id=di.id and di.city_id=ci.id;
                        '''
        communities = Mysql().query_by_sql(community_sql)

        for community in communities:
            residence_url = community['url'] + 'xiaoqu/' + community['c_r'] + '/'
            request_meta = {'rsd_ci': community['cn_name']}
            yield Request(residence_url,
                          meta=request_meta,
                          callback=self.get_residence_url)
Exemplo n.º 7
0
    # NOTE(review): this is a fragment -- the enclosing block header and the
    # definitions of `dct`, `data` and `start_time` are outside the visible
    # source.
    # Copy the listing attributes from `data` into the output row `dct`.
    dct['ring_num'] = data['ring_num']
    dct['lj_num'] = data['lj_num']

    dct['house_age'] = data['house_age']
    dct['property_type'] = data['property_type']
    dct['house_type'] = data['house_type']
    dct['house_owner'] = data['house_owner']
    dct['listing_date'] = data['listing_date']
    dct['total_price'] = data['total_price']
    dct['unit_price'] = data['unit_price']
    dct['last_deal'] = data['last_deal']
    dct['mortgage'] = data['mortgage']
    dct['house_backup'] = data['house_backup']

    # Bookkeeping columns: bsn_dt and residence_url are stored as None, tms
    # is stamped with the current local time, webbst_nm is a fixed label.
    dct['bsn_dt'] = None
    dct['tms'] = time.strftime("%Y-%m-%d %X", time.localtime())
    dct['url'] = data['url']
    dct['webbst_nm'] = u'链家'
    dct['crawl_time'] = data['crawl_time']
    dct['residence_url'] = None
    dct['residence_id'] = data['residence_id']
    # line = json.dumps(OrderedDict(dct), ensure_ascii=False, sort_keys=False) + '\n'
    # f.write(line)
    # break
    # Write the assembled row to the t_web_lj_esf table.
    Mysql().insert_by_dict('t_web_lj_esf', dct)

# Print the finish time.
end_time = datetime.now()
print 'end_time:', end_time

# Print the elapsed time.
# NOTE(review): assuming start_time is a datetime, `.seconds` is only the
# seconds component of the timedelta (whole days are not included).
print 'seconds:', (end_time - start_time).seconds
Exemplo n.º 8
0
 def process_item(self, item, spider):
     """Store ``item`` through ``Mysql.insert_by_item`` and hand it back.

     ``spider`` is accepted but not used. The same ``item`` object is
     returned unchanged.
     """
     database = Mysql()
     database.insert_by_item(item)
     return item