def _save_community(self):
    """Store the current community if it is new and return its COMMUNITY_ID.

    The community is identified by its ``url`` (column ORIGINAL_URL) together
    with ``self.source_id``. When a matching row already exists, a notice is
    printed and the stored id is returned without inserting anything.

    Returns:
        The COMMUNITY_ID column of the first matching row.
    """
    # One id lookup serves both as the existence check and as the return
    # value, so no separate "SELECT *" round trip is needed.
    # NOTE(review): the statement is assembled by string formatting; switch to
    # bound parameters if Dao supports them.
    communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
        self._community_detail["url"], self.source_id)

    existing = Dao.execute_query(communityid_sql)
    if existing is not None:
        print(" {} is already exists ,so next".format(self._community_detail["name"]))
        return existing[0][0]

    # Not stored yet: insert, then read back the id of the new row.
    Dao.execute_dmls(self._insert_community())
    return Dao.execute_query(communityid_sql)[0][0]
示例#2
0
    def _generate_seed_url(self):
        """
        Visit every community of this source whose status is below 2.

        For each (COMMUNITY_ID, ORIGINAL_URL) row the community id and a
        creation timestamp are stored in ``self._apartment_detail``, the URL
        is crawled with ``self._visit_pages`` and the row's status is set to
        '2'. If anything in that sequence raises, the error is printed and
        the status is set to '-1' instead, so one failing community does not
        stop the loop.
        """
        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE   source_id ='{}' and status<2 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        if result is None:
            # Other callers of Dao.execute_query treat None as "no rows";
            # iterating it would raise TypeError, so there is nothing to do.
            return

        for COMMUNITY_ID, ORIGINAL_URL in result:
            # Converted once; both status updates below use the same value.
            community_id = int(COMMUNITY_ID)
            try:
                self._apartment_detail["COMMUNITY_ID"] = community_id
                # strftime without a time tuple formats the current local time.
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S')
                self._visit_pages(ORIGINAL_URL)
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    community_id)
                Dao.execute_dmls(sql)
            except Exception as e:
                # Best effort: record the failure on the row and carry on
                # with the next community.
                print(e)
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    community_id)
                Dao.execute_dmls(sql)
 def _check_community(self, url):
     """Return True when a finished (status = 2) community row exists for ``url``.

     The lookup is restricted to ``self.source_id``. A ``None`` query result
     is taken to mean that no such row exists.
     """
     communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
         url, self.source_id)
     # Identity test: None is a singleton, so `is not None` is the reliable
     # check and does not depend on the result object's __eq__.
     return Dao.execute_query(communityid_sql) is not None
def get_communities():
    """Fetch a batch of distinct (COMMUNITY_ID, BUILDING_NUM, URL) rows with STATUS = 0.

    Returns whatever ``Dao.execute_query`` yields for the statement below.
    """
    # NOTE(review): if this runs on MySQL, `LIMIT 1,100` means "offset 1,
    # up to 100 rows", i.e. the first matching row is skipped - confirm
    # that this is intended.
    pending_buildings_sql = '''SELECT DISTINCT
 COMMUNITY_ID,
  BUILDING_NUM,
  URL
FROM ehdc.buildings  WHERE STATUS = 0 LIMIT 1,100; '''
    return Dao().execute_query(pending_buildings_sql)
示例#5
0
 def _save_merchant(self):
     """Insert the current merchant unless a row with the same url already exists."""
     detail = self._merchant_detail
     lookup_sql = "SELECT * FROM merchant WHERE url = '{}'".format(detail["url"])

     if Dao.execute_query(lookup_sql) is None:
         # No stored row for this url yet, so insert it.
         Dao.execute_dmls(self._insert_merchant())
         return

     # Already stored: report it and leave the table untouched.
     print(" {} is already exists ,so next".format(detail["name"]))
 def _save_community(self):
     """Insert the current community unless its url is already in ``communities``."""
     detail = self._community_detail
     lookup_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}'".format(detail["url"])

     if Dao.execute_query(lookup_sql) is None:
         # Unknown url: store the community.
         Dao.execute_dmls(self._insert_community())
         return

     # A row with this url exists, so only report it.
     print(" {} is already exists ,so next".format(detail["name"]))
 def _save_community(self):
     """Insert the current community unless the same name/area pair is already stored.

     The existence check runs against ``ehdc.communities_llzg`` and matches on
     the community's ``name`` and ``area_name``.
     """
     detail = self._community_detail
     community_name = detail["name"]
     lookup_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME='{}' ".format(
         community_name, detail["area_name"])

     if Dao.execute_query(lookup_sql) is None:
         # Nothing matched, so this community is new.
         Dao.execute_dmls(self._insert_community())
         return

     # Duplicate: report it and skip the insert.
     print(" {} is already exists ,so next".format(detail["name"]))
 def _save_apartment(self, apartment_detail):
     """Insert one apartment unless it is already stored.

     An apartment is identified by COMMUNITY_ID, FLOOR_NUM and APARTMENT_NUM.
     Insert errors are printed and swallowed, so a single bad record does not
     interrupt the caller.

     Args:
         apartment_detail: mapping that provides at least the keys
             "COMMUNITY_ID", "FLOOR_NUM" and "APARTMENT_NUM".
     """
     floor_num = apartment_detail["FLOOR_NUM"]
     apartment_num = apartment_detail["APARTMENT_NUM"]
     query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and FLOOR_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
         int(apartment_detail["COMMUNITY_ID"]), floor_num, apartment_num)

     if Dao.execute_query(query_sql) is not None:
         # Build the label with format() rather than "+": concatenation
         # raises TypeError when the floor or apartment number is not a
         # string, while the printed text is unchanged for string values.
         label = "{}{}{}".format(
             apartment_detail["COMMUNITY_ID"], floor_num, apartment_num)
         print(" {} is already exists ,so next".format(label))
         return

     # Best-effort insert: report the failure and let the caller continue.
     try:
         Dao.execute_dmls(self._insert_apartment(apartment_detail))
     except Exception as e:
         print(e)
    def _generate_seed_url(self):
        """
        Run ``self.execute`` for every building row whose STATUS is 0.

        Each row supplies (COMMUNITY_ID, BUILDING_NUM, URL), which are passed
        to ``self.execute`` in that order.
        """
        querysql = "SELECT  COMMUNITY_ID, BUILDING_NUM, URL  FROM ehdc.buildings WHERE STATUS = 0  ; "
        result = Dao.execute_query(querysql)
        if result is None:
            # Other callers of Dao.execute_query treat None as "no rows";
            # iterating it would raise TypeError, so there is nothing to do.
            return
        for COMMUNITY_ID, BUILDING_NUM, URL in result:
            self.execute(COMMUNITY_ID, BUILDING_NUM, URL)
                             display_name,logo,COMMUNITY_ID)
                VALUES ('',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '{}');'''.format(merchant['business_name'],
                                         merchant['phone_number'],
                                         merchant['address'], '',
                                         merchant['lat'], merchant['long'],
                                         merchant['sub_title'], param['cat'],
                                         merchant['logo'], COMMUNITY_ID)
                dao.execute_dmls(insertSql)
        except:
            pass


if __name__ == '__main__':
    # Pass 1: collect merchants around every community that has a position
    # and whose AREA_ID lies strictly between 0 and 11.
    querySql = 'SELECT a.name ,a.COMMUNITY_ID ,b.LATITUDE,b.LONGITUDE FROM  shengchan_20140815.communities a ,shengchan_20140815.community_poses b WHERE a.COMMUNITY_ID =b.COMMUNITY_ID AND a.AREA_ID <11 AND a.AREA_ID > 0 '
    dao = Dao()
    # `or ()` keeps the loop from raising TypeError if the query yields None.
    result = dao.execute_query(querySql)
    for name, COMMUNITY_ID, LATITUDE, LONGITUDE in result or ():
        getmerchants(LATITUDE, LONGITUDE, COMMUNITY_ID)

    # Pass 2: fill in job_beijing.addr from the stored Baidu coordinates.
    # A second Dao instance is created as before, but under its own name:
    # rebinding `Dao` itself would replace the class with an instance and
    # break every later `Dao()` call in this module.
    # (A disabled pass that geocoded merchant locations for SOURCE_ID = 16
    # used to sit here as commented-out code.)
    job_dao = Dao()
    query_sql = " SELECT id, BAIDU_LATI , BAIDU_LONG FROM job_beijing  WHERE baidu_lati IS NOT NULL    "
    result = job_dao.execute_query(query_sql)
    for job_id, LATITUDE, LONGITUDE in result or ():
        bm = xBaiduMap()
        if LATITUDE is None:
            continue
        try:
            # getAddress is called with longitude first, then latitude.
            location = bm.getAddress(LONGITUDE, LATITUDE)
        except Exception as e:
            # Best effort: report the failed lookup and go on to the next row.
            print(e)
            continue
        if location is None:
            continue
        update_sql = "update job_beijing set addr = '{}' where id = '{}'   ".format(
            location, job_id)
        job_dao.execute_dmls(update_sql)
class DianpingMerchantCrawler(threading.Thread):
    """Thread that pages through Dianping's map-search JSON endpoint and stores new shops.

    For every (shopType, categoryId) pair the crawler requests result pages
    from ``self.url`` and inserts each shop whose id has not been seen yet
    into the ``shop_bean`` table. The crawl either runs without coordinates
    (``LT1 == 0``) or sweeps a grid of bounding boxes between (LT1, LG1) and
    (LT2, LG2).
    """

    # Cooperative cancellation flag: stop() sets it and the crawl loops poll
    # it between requests. Kept at class level so it always has a default.
    thread_stop = False

    # Keys of a shop record that are copied as-is into the first twenty
    # quoted placeholders of the insert statement, in column order.
    _PLAIN_SHOP_FIELDS = (
        "address", "poi", "phoneNo", "shopId", "defaultPic", "expand",
        "shopName", "geoLat", "shopDealId", "geoLng", "addDate", "shopPower",
        "shopPowerTitle", "avgPrice", "memberCardId", "bookingSetting",
        "dishTag", "branchUrl", "promoId", "hasSceneryOrder",
    )

    def __init__(self,
                 LT1,
                 LG1,
                 LT2,
                 LG2,
                 cityname,
                 cityid,
                 cityenname,
                 name,
                 shopId=0,
                 categoryId=0):
        """Prepare request parameters and load categories and known shop ids.

        Args:
            LT1, LG1: latitude/longitude where the grid sweep starts; a
                ``LT1`` of 0 disables the sweep.
            LT2, LG2: latitude/longitude where the grid sweep ends (the sweep
                moves towards lower latitude and higher longitude).
            cityname: city name, stored with every shop and used to preload
                the ids of shops already saved for that city.
            cityid: sent as the ``cityId`` request value and used in the
                Referer header.
            cityenname: sent as the ``cityEnName`` request value.
            name: thread name.
            shopId: when non-zero, crawl only this value; it is sent as the
                ``shopType`` request value.
            categoryId: when non-zero (together with ``shopId``), sent as the
                ``categoryId`` request value.
        """
        super().__init__(name=name)
        self.cityid = cityid
        self.shopId = shopId
        self.categoryId = categoryId
        self.Lat1 = LT1
        self.Lat2 = LT2
        self.Long1 = LG1
        self.Long2 = LG2
        self.city_name = cityname
        # Form fields posted with every search request.
        self.values = {
            'promoId': '0',
            'shopType': '',
            'categoryId': '',
            'sortMode': '2',
            'shopSortItem': '1',
            'keyword': '',
            'searchType': '1',
            'branchGroupId': '0',
            'shippingTypeFilterValue': '0',
            'page': '1',
            'cityId': cityid,
            'cityEnName': cityenname,
        }
        self.url = "http://www.dianping.com/search/map/ajax/json"
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer':
            'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
        }
        self.dao = Dao()
        # All (shopType, categoryId, NAME) rows to crawl when no explicit
        # shopId is given.
        self.query_sql = "SELECT shopType ,categoryId,NAME FROM category  WHERE categoryId <> shopType AND categoryId <>'None'  "
        self.result = self.dao.execute_query(self.query_sql)
        self.query_sql2 = "SELECT shopId FROM shop_bean where  city_name ='{}'".format(
            cityname)
        self.result2 = self.dao.execute_query(self.query_sql2)
        # Ids are normalised to str because savePageJson looks them up as
        # str(shopId); a set keeps that lookup cheap as the crawl grows.
        self.shopIds = {str(row[0]) for row in self.result2 or ()}

    def save_shop(self, shopRecordBean, categoryId):
        """Insert one shop record into ``shop_bean``.

        Args:
            shopRecordBean: mapping for one shop as found in the endpoint's
                ``shopRecordBeanList``.
            categoryId: category value stored with the shop.
        """
        # The helper receives (geoLng, geoLat) and yields a mapping with
        # "LATITUDE" and "LONGITUDE" entries.
        # NOTE(review): presumably converts Google coordinates to Baidu ones,
        # judging by its name - confirm.
        zuobiao = GoogleLatALng2Baidu(shopRecordBean["geoLng"],
                                      shopRecordBean["geoLat"])
        # Only the nested record is quote-escaped, as before.
        # NOTE(review): the remaining values come straight from the remote
        # response and are interpolated unescaped; use bound parameters if
        # Dao.execute_dmls supports them.
        nested_record = str(shopRecordBean["shopRecordBean"]).replace(
            "'", "\\'").replace('"', '\\"')
        plain_values = [shopRecordBean[key] for key in self._PLAIN_SHOP_FIELDS]
        insert_sql = "insert into shop_bean (address ,poi ,phoneNo ,shopId ,defaultPic,expand ,shopName,geoLat ,shopDealId,geoLng ,addDate ,shopPower ,shopPowerTitle ,avgPrice,memberCardId ," \
                     "bookingSetting ,dishTag ,branchUrl ,promoId ,hasSceneryOrder ,shopRecordBean ,regionList ,categoryId ,LATITUDE , LONGITUDE,city_name ) " \
                     "values( '{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}',\"{}\",'{}','{}',{},{},'{}')".format(
            *plain_values,
            nested_record,
            shopRecordBean["regionList"],
            categoryId,
            zuobiao["LATITUDE"],
            zuobiao["LONGITUDE"], self.city_name)
        self.dao.execute_dmls(insert_sql)

    def _fetch_page_json(self):
        """POST ``self.values`` to ``self.url`` and return the decoded JSON body."""
        print(self.values, "begin")
        self.data = urllib.parse.urlencode(
            self.values).encode(encoding='UTF8')
        request = urllib.request.Request(url=self.url,
                                         headers=self.headers,
                                         data=self.data)
        # The with-block closes the response even when reading or decoding
        # fails.
        with urllib.request.urlopen(request, timeout=500) as m_fp:
            html_str = m_fp.read().decode('utf-8')
        return json.loads(html_str)

    def savePageJson(self, page=1, max_retries=5, retry_delay=0.5):
        """Fetch one result page and store every shop not seen before.

        Args:
            page: page number to request.
            max_retries: how many times the fetch is attempted before giving
                up (at least one attempt is always made).
            retry_delay: seconds to wait between failed attempts.

        Returns:
            The decoded JSON response for the page.

        Raises:
            Exception: whatever the last fetch attempt raised, once all
                attempts have failed.
        """
        self.values["page"] = page
        attempts = max(1, max_retries)
        for attempt in range(1, attempts + 1):
            try:
                s = self._fetch_page_json()
                shopRecordBeanList = s["shopRecordBeanList"]
                break
            except Exception as e:
                # Broad on purpose: network errors, bad JSON and responses
                # without a shop list are all handled by asking again, but
                # only a bounded number of times.
                print(e)
                if attempt == attempts:
                    raise
                time.sleep(retry_delay)

        # Skip shops that were already stored.
        for shopRecordBean in shopRecordBeanList:
            try:
                shop_id = str(shopRecordBean["shopId"])
                if shop_id in self.shopIds:
                    continue
                self.save_shop(shopRecordBean, self.values["categoryId"])
            except Exception as e:
                # A record that cannot be stored must not trigger a re-fetch
                # of the whole page; report it and move on. It is not marked
                # as seen, so a later occurrence is tried again.
                print(e)
                continue
            self.shopIds.add(shop_id)
        return s

    def crawler_each_category(self, result):
        """Crawl all result pages for each (shopType, categoryId, NAME) row.

        Args:
            result: iterable of three-item rows; ``None`` is treated as
                empty.
        """
        for shopType, categoryId, NAME in result or ():
            if self.thread_stop:
                return
            self.values["shopType"] = shopType
            self.values["categoryId"] = categoryId
            # The first page also reports how many pages there are.
            s = self.savePageJson(1)
            pageCount = s["pageCount"]
            for page in range(2, pageCount + 1):
                if self.thread_stop:
                    return
                try:
                    self.savePageJson(page)
                except Exception as e:
                    # One more round after a short pause; a second failure
                    # propagates to the caller.
                    print(e)
                    time.sleep(0.5)
                    self.savePageJson(page)

    def crawler_each_category_withzuobiao(self, result, Lat, Long):
        """Crawl ``result`` restricted to one bounding box.

        The box spans from (Lat, Long) to (Lat - 0.1, Long + 0.2) and is sent
        as the ``glat1``/``glong1``/``glat2``/``glong2`` request values.
        """
        self.values["glat1"] = Lat
        self.values["glong1"] = Long
        self.values["glat2"] = Lat - 0.1
        self.values["glong2"] = Long + 0.2
        self.crawler_each_category(result)

    def run(self):
        """Thread entry point: crawl without coordinates or sweep the grid.

        With ``Lat1 == 0`` no bounding box is used: all loaded categories are
        crawled when ``shopId`` is 0, otherwise only the single pair built
        from ``shopId`` and ``categoryId`` (``shopId`` doubles as the
        category when ``categoryId`` is 0). Otherwise the area is covered in
        steps of 0.09 latitude and 0.19 longitude, slightly less than the
        0.1 x 0.2 box requested per step, so neighbouring boxes overlap.
        """
        if self.Lat1 == 0:
            if self.shopId == 0:
                self.crawler_each_category(self.result)
            elif self.categoryId == 0:
                self.crawler_each_category([[self.shopId, self.shopId, ""]])
            else:
                self.crawler_each_category(
                    [[self.shopId, self.categoryId, ""]])
        else:
            Lat = self.Lat1
            while Lat >= self.Lat2 and not self.thread_stop:
                Long = self.Long1
                while Long <= self.Long2 and not self.thread_stop:
                    self.crawler_each_category_withzuobiao(
                        self.result, Lat, Long)
                    Long += 0.19
                Lat = Lat - 0.09
        # NOTE(review): the previous code built
        #   "update dianping_cities set status = 0 where cityId = <cityid> "
        # here but never executed it. It is not executed now either, because
        # the intended status value cannot be confirmed from this file -
        # decide whether the city row should be updated after a crawl.

    def stop(self):
        """Ask the crawl to end; the loops check the flag between requests."""
        self.thread_stop = True