def formatcity_two():
    """Standardize every entry of ``city_list_two`` and report the distinct hits.

    Each entry is passed through ``standard_city`` and the returned value is
    printed. Results whose first element is ``True`` are collected; any other
    result prints ``'wrong'``. Finally the set of collected results and its
    size are printed.
    """
    city_list = []
    for raw_city in city_list_two:
        # Standardize once per entry; the result is reused for the print,
        # the success check and the append instead of repeating the lookup.
        outcome = standard_city(raw_city)
        print(outcome)
        if outcome[0] is True:
            city_list.append(outcome)
        else:
            print('wrong')
    unique_results = set(city_list)
    print(unique_results)
    print(len(unique_results))
def update_lagou_fields():
    """Backfill ``fj_city`` / ``fj_region`` on documents whose source is '拉钩'.

    Documents are read from ``collection`` starting at cursor offset 220000.
    For each one an address string is assembled from ``city``, ``region`` and
    ``address`` (skipping the parts that are ``None``), passed to
    ``standard_city`` and, on success, to ``standard_region``. The two derived
    fields are written back to the document matched by ``company_id`` and
    ``company_source``; a failed lookup stores ``None``.
    """
    companys = collection.find({'company_source': '拉钩'}, no_cursor_timeout=True)
    try:
        for company in companys[220000:]:
            address = company['address']
            city = company['city']
            region = company['region']

            # Same combinations as the historical if/elif chain:
            #   city present            -> city + region + address
            #   only address (+ region) -> address + region
            #   neither city nor address -> '' (a lone region is ignored)
            if city is not None:
                pieces = (city, region, address)
            elif address is not None:
                pieces = (address, region)
            else:
                pieces = ()
            address_string = ''.join(p for p in pieces if p is not None)

            result, real_city = standard_city(address_string)
            if result:
                company['fj_city'] = real_city
                r, real_region = standard_region(real_city, address_string)
                company['fj_region'] = real_region if r else None
            else:
                company['fj_city'] = None
                company['fj_region'] = None

            # Only the derived fields change, so only they are sent back;
            # every other field would be rewritten with the value just read.
            collection.update_one(
                {
                    'company_id': company['company_id'],
                    'company_source': company['company_source'],
                },
                {'$set': {
                    'fj_city': company['fj_city'],
                    'fj_region': company['fj_region'],
                }})
            print('{}已经更新了'.format(company['company_id']))
    finally:
        # A no_cursor_timeout cursor is not reaped by the server when idle,
        # so it must be closed explicitly even if the loop raises.
        companys.close()
def insert_db(self):
    """Serialize this item, add standardized city/region fields and insert it.

    The payload comes from ``self.serialization_info()`` and is stamped with
    ``crawler_time``. ``fj_city`` is the result of ``standard_city`` on the raw
    city; ``fj_region`` is the result of ``standard_region`` on the
    concatenated city + region (or ``None`` input when either is empty).
    Both fields are always present in the stored document and are ``None``
    whenever the corresponding lookup fails or raises ``StandarCityError``.
    A ``DuplicateKeyError`` on insert is logged and swallowed.
    """
    data = self.serialization_info()
    data['crawler_time'] = datetime.datetime.now()
    if data['city'] and data['region']:
        standard_string = data['city'] + data['region']
    else:
        standard_string = None

    # Normalize city and region. Default both derived fields first so the
    # document always carries them, including when the region lookup raises.
    data['fj_city'] = None
    data['fj_region'] = None
    result, real_city = standard_city(data['city'])
    if result:
        data['fj_city'] = real_city
        try:
            r, real_region = standard_region(real_city, standard_string)
        except StandarCityError as e:
            log.error(e)
        else:
            if r:
                data['fj_region'] = real_region

    # Code used together with the compound index: a duplicate insert raises
    # DuplicateKeyError, which is reported instead of propagated.
    try:
        collection.insert_one(data)
    except DuplicateKeyError:
        log.error('该数据已经存在,company_source={},company_id={}'.format(
            data['company_source'], data['company_id']))
    else:
        log.info('插入数据={}'.format(data))
def insert_db(self):
    """Standardize the record's city/region and insert it unless already stored.

    The record is produced by ``serialization_info(self)`` and passed through
    ``compare``. ``city`` and ``region`` are replaced by the values returned
    from ``standard_city`` / ``standard_block``. If either lookup reports
    ``False`` the record is only logged as an error. Otherwise it is inserted
    when no document with the same city, region, district_name, source,
    trade_date and area exists in ``coll``.
    """
    record = serialization_info(self)
    compare(record)

    city_ok, record['city'] = standard_city(record['city'])
    region_ok, record['region'] = standard_block(
        record['city'], record['region'])

    # TODO: insertion check
    if city_ok is False or region_ok is False:
        log.error('城市区域数据格式化失败data={}'.format(record))
        return

    identity_keys = ('city', 'region', 'district_name', 'source',
                     'trade_date', 'area')
    identity = {key: record[key] for key in identity_keys}
    if coll.find_one(identity):
        log.info('已经存在数据={}'.format(record))
        return

    coll.insert_one(record)
    log.info('插入数据={}'.format(record))
def delete_region():
    """Fill in missing regions from the offline collection, removing misses.

    Walks every document of ``collection`` whose ``region`` is ``None``,
    standardizes its city with ``standard_city`` and looks the district up in
    ``collection_offline`` first by ``name`` and then by ``alias``. A hit
    copies the offline ``region`` onto the document; the miss branch removes
    the document and increments the counter printed at the end.
    """
    count = 0
    for i in collection.find({'region': None}, no_cursor_timeout=True):
        city = i['city']
        name = i['district_name']
        result, city_fj = standard_city(city)
        if result:
            # First attempt: exact district name within the standardized city.
            d = collection_offline.find_one({'city': city_fj, 'name': name})
            if d:
                print('库里小区名={}, 成交小区名={}'.format(d['name'], name))
                collection.update_one({'_id': i['_id']}, {'$set': {
                    'region': d['region']
                }})
            # Second attempt: the district name used as an alias.
            a = collection_offline.find_one({'city': city_fj, 'alias': name})
            if a:
                print('找到成交别名了,库里小区名={}, 成交小区名={}'.format(a['name'], name))
                collection.update_one({'_id': i['_id']}, {'$set': {
                    'region': a['region']
                }})
            # NOTE(review): the nesting of this else could not be confirmed.
            # As laid out it pairs with `if a`, so a document matched by name
            # but not by alias is updated above and then removed here, and
            # documents whose city fails to standardize are left untouched.
            # Confirm the intended pairing before relying on this function.
            else:
                print('匹配不到')
                collection.remove({'_id': i['_id']})
                count = count + 1
                continue
    print('delete count={}'.format(count))
def mongo_chanch():
    """Copy district prices from ``coll_name`` into ``save_coll``.

    For every source document the city and community address are passed
    through ``standard_city`` / ``standard_block`` and the price is converted
    to ``int``. Documents whose standardized region is falsy are skipped.
    The rest are upserted into ``save_coll`` keyed by region, city and name,
    with the price stored under ``zhugefang_esf_price`` and the month of the
    source ``time`` stored as an integer ``YYYYMM`` in ``s_date``.
    """
    for doc in coll_name.find({}, no_cursor_timeout=True):
        community = doc['comm_name']
        raw_city = doc['city']
        raw_district = doc['comm_addr']
        unit_price = int(doc['price'])
        month_stamp = int(doc['time'].strftime('%Y%m'))

        city = standard_city(raw_city)
        region = standard_block(raw_district)
        if not region:
            continue

        data = {
            'category': 'district',
            'city': city,
            'name': community,
            'region': region,
            's_date': month_stamp,
            'zhugefang_esf_price': unit_price,
        }
        print(data)

        selector = {'region': region, 'city': city, 'name': community}
        save_coll.update_one(selector, {'$set': data}, upsert=True)
async def standar_address(self, company):
    """Attach ``fj_city`` / ``fj_region`` to ``company`` and return it.

    The lookup text is built from the ``city``, ``region`` and ``address``
    entries, leaving out the ones that are ``None``:

    * city present: city + region + address
    * city missing, address present: address + region
    * city and address both missing: empty string (a lone region is ignored)

    ``standard_city`` is run on that text and, on success, ``standard_region``
    is run with the resulting city. Any lookup that fails stores ``None``.
    """
    city = company['city']
    region = company['region']
    address = company['address']

    if city is not None:
        pieces = (city, region, address)
    elif address is not None:
        pieces = (address, region)
    else:
        pieces = ()
    address_string = ''.join(piece for piece in pieces if piece is not None)

    city_found, real_city = standard_city(address_string)
    if not city_found:
        company['fj_city'] = None
        company['fj_region'] = None
        return company

    company['fj_city'] = real_city
    region_found, real_region = standard_region(real_city, address_string)
    company['fj_region'] = real_region if region_found else None
    return company
def mongo_chanch():
    """Copy district prices from ``coll_price`` into ``save_coll``.

    Each price document is joined with ``coll_name`` (on ResidentialAreaID,
    city_name and DistrictName) to obtain the residential area name. City and
    district are passed through ``standard_city`` / ``standard_block`` and
    the result is upserted into ``save_coll`` keyed by region, city and name,
    with the price stored under ``fanggugu_esf_price``. Any exception while
    handling a document is swallowed and the document is logged at info level.
    """
    for doc in coll_price.find({}, no_cursor_timeout=True):
        try:
            area_id = doc['ResidentialAreaID']
            raw_city = doc['city_name']
            raw_district = doc['DistrictName']
            unit_price = doc['UnitPrice']
            updated_at = doc['update_time']

            name_doc = coll_name.find_one({
                'ResidentialAreaID': area_id,
                'city_name': raw_city,
                'DistrictName': raw_district,
            })
            area_map = name_doc['baseinfo']['json'][0]['residentialareaMap']
            name = area_map['residentialareaName']

            month_stamp = int(updated_at.strftime('%Y%m'))
            city = standard_city(raw_city)
            region = standard_block(raw_district)

            data = {
                'category': 'district',
                'city': city,
                'name': name,
                'region': region,
                's_date': month_stamp,
                'fanggugu_esf_price': unit_price,
            }
            print(data)

            selector = {'region': region, 'city': city, 'name': name}
            save_coll.update_one(selector, {'$set': data}, upsert=True)
        except Exception:
            log.info(doc)
def analyzer():
    """Attach captured response bodies to matching city/name documents.

    Fetches the latest capture log, keeps the entries whose URL contains
    ``host``, decodes the ``query`` URL parameter as the district name and
    downloads the captured body for that entry. When the body's ``city``
    standardizes successfully, the parsed body is stored under ``didi`` on the
    document of ``collection`` matching that city and name. Entries that
    cannot be processed print ``'-'`` and are skipped.
    """
    res = requests.get('http://114.80.150.196:8002/latestLog?')
    info_json = res.json()
    for entry in info_json:
        url = entry['url']
        house_id = entry['id']
        if host not in url:
            continue

        # Raw string: the former '\&' was an invalid escape sequence.
        query_match = re.search(r'query=(.*?)&', url, re.S | re.M)
        if query_match is None:
            # A matching host without a "query=...&" parameter used to raise
            # AttributeError and abort the whole run; skip just this entry.
            print('-')
            continue
        name = urllib.parse.unquote(query_match.group(1))

        r = requests.get(
            'http://114.80.150.196:8002/fetchBody?id={}'.format(house_id))
        # Best effort per entry: a malformed body must not stop the loop.
        try:
            r_info = r.json()['resBody']
            j = json.loads(r_info)
            result, city = standard_city(j['city'])
            if result:
                # update_one replaces the deprecated Collection.update, which
                # also modified a single document by default.
                collection.update_one({
                    'city': city,
                    'name': name
                }, {'$set': {
                    'didi': j
                }})
                print(city, name)
        except Exception:
            print('-')
async def standar_address(self, company):
    """Attach ``fj_city`` / ``fj_region`` derived from ``company['address']``.

    ``standard_city`` is run on the address; on success ``standard_region`` is
    run with the resulting city and the same address. Any lookup that fails
    stores ``None``. The mutated ``company`` mapping is returned.
    """
    address = company['address']

    city_found, real_city = standard_city(address)
    if not city_found:
        company['fj_city'] = None
        company['fj_region'] = None
        return company

    company['fj_city'] = real_city
    region_found, real_region = standard_region(real_city, address)
    company['fj_region'] = real_region if region_found else None
    return company
def start():
    """Normalize every document of ``coll_zhugefang`` and copy it to ``coll_save``.

    A price equal to ``0`` becomes the string ``'0'`` and the price is
    stripped. ``city`` is replaced by ``standard_city``'s result and
    ``comm_addr`` by the stripped result of ``standard_block``. Documents
    whose resulting ``comm_addr`` is falsy are printed but not saved. Any
    exception while handling a document is swallowed and the document is
    logged at info level.
    """
    for doc in coll_zhugefang.find():
        try:
            price = doc['price']
            if price == 0:
                price = '0'
            doc['price'] = price.strip()

            doc['city'] = standard_city(doc['city'])
            doc['comm_addr'] = standard_block(doc['comm_addr']).strip()
            print(doc)

            if doc['comm_addr']:
                coll_save.insert_one(doc)
        except Exception:
            log.info(doc)
def update_51job_fields():
    """Backfill ``fj_city`` / ``fj_region`` on documents whose source is '51job'.

    Documents are read from ``collection`` starting at cursor offset 3376200.
    The ``address`` of each one is passed to ``standard_city`` and, on
    success, to ``standard_region``. The two derived fields are written back
    to the document matched by ``company_id`` and ``company_source``; a failed
    lookup stores ``None``.
    """
    companys = collection.find({'company_source': '51job'}, no_cursor_timeout=True)
    try:
        for company in companys[3376200:]:
            address = company['address']

            result, real_city = standard_city(address)
            if result:
                company['fj_city'] = real_city
                r, real_region = standard_region(real_city, address)
                company['fj_region'] = real_region if r else None
            else:
                company['fj_city'] = None
                company['fj_region'] = None

            # Only the derived fields change, so only they are sent back;
            # every other field would be rewritten with the value just read.
            collection.update_one(
                {
                    'company_id': company['company_id'],
                    'company_source': company['company_source'],
                },
                {'$set': {
                    'fj_city': company['fj_city'],
                    'fj_region': company['fj_region'],
                }})
            print('{}已经更新了'.format(company['company_id']))
    finally:
        # A no_cursor_timeout cursor is not reaped by the server when idle,
        # so it must be closed explicitly even if the loop raises.
        companys.close()
def analyse_city(proxies):
    """Split the city links returned by ``send_url`` by whether they standardize.

    For every element returned by ``send_url(proxies=proxies)`` the first
    ``@href`` and first text node are read. A ``{city_name: city_url}`` dict
    is added to one of two lists depending on the success flag returned by
    ``standard_city``. The size and content of both lists are printed:
    recognised cities first, unrecognised ones second.
    """
    recognised = []
    unrecognised = []
    for anchor in send_url(proxies=proxies):
        link = anchor.xpath('@href')[0]
        label = anchor.xpath('text()')[0]
        is_known, _ = standard_city(label)
        bucket = recognised if is_known else unrecognised
        bucket.append({label: link})

    for bucket in (recognised, unrecognised):
        print(len(bucket))
        print(bucket)
# Info-box fields scraped from a Baidu Baike city page, in the order they are
# stored. Each entry is (document key, candidate patterns tried in order,
# capture group holding the value).
# NOTE(review): several labels contain whitespace between the characters
# (e.g. '别 名'); it must match the page markup exactly - verify against a
# live page.
_BAIKE_FIELDS = (
    ('chinese_name', (r'中文名称</dt>(.*?)<dd(.*?)>(.*?)</dd>',), 3),
    ('foreign_names', (r'外文名称</dt>(.*?)<dd(.*?)>(.*?)</dd>',), 3),
    ('alias', (r'别 名</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('administrative_division', (r'行政区类别</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('affiliating_area', (r'所属地区</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('governs_area', (r'下辖地区</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('government_resident', (r'政府驻地</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('area_code', (r'电话区号</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('zip_code', (r'邮政区码</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('geographic_position', (r'地理位置</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('area', (r'面 积</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # Population stops at the first following tag rather than at </dd>.
    ('population', (r'人 口</dt>.*?<dd.*?>(.*?)<',), 1),
    ('famous_scenery', (r'著名景点</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('localism', (r'方 言</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('weather_conditions', (r'气候条件</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('airport', (r'机 场</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('railway_station', (r'火车站</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('license_code', (r'车牌代码</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('GRP', (r'地区生产总值</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('GNPP', (r'人均生产总值</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('per_capita_income', (r'人均支配收入</dt>.*?<dd.*?>(.*?)</dd',), 1),
    # These two values stop before the footnote marker (<sup ...>).
    ('retail_sales_of_consumer_goods',
     (r'消费品零售额</dt>.*?<dd.*?>(.*?)<sup',), 1),
    ('total_household_deposits',
     (r'住户存款总额</dt>.*?<dd.*?>(.*?)<sup',), 1),
    ('were_flower', (r'市树市花</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # The universities row appears under many different labels.
    ('famous_universities', (
        r'著名高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'学 校</dt>.*?<dd.*?>(.*?)</dd>',
        r'高等院校</dt>.*?<dd.*?>(.*?)</dd>',
        r'重点高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'高等学府</dt>.*?<dd.*?>(.*?)</dd>',
        r'著名学府</dt>.*?<dd.*?>(.*?)</dd>',
        r'高 校</dt>.*?<dd.*?>(.*?)</dd>',
        r'主要高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'大 学</dt>.*?<dd.*?>(.*?)</dd>',
        r'知名高校</dt>.*?<dd.*?>(.*?)</dd>',
    ), 1),
    ('mayor', (r'市 长</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('administrative_code', (r'行政代码</dt.*?<dd.*?>(.*?)</dd>',), 1),
    ('city_spirit', (r'城市精神</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('human_development_index', (r'人类发展指数</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    ('city_abbreviation', (r'城市简称</dt>.*?<dd.*?>(.*?)</dd>',), 1),
)


def _extract_baike_field(html, patterns, group=1):
    """Return the tag-free text of the first matching pattern, or ``None``."""
    for pattern in patterns:
        match = re.search(pattern, html, re.S | re.M)
        if match:
            return re.sub('<[^>]+>', '', match.group(group).strip()).strip()
    return None


def _clean_baike_value(value):
    """Drop newlines, spaces and ``[n]`` footnote markers from string values."""
    if not isinstance(value, str):
        # None (field absent) and the datetime stamp are stored unchanged.
        return value
    value = value.replace('\n', '').replace(' ', '')
    return re.sub(r'\[\d+\]', '', value)


def crawler_baike():
    """Scrape the Baidu Baike info box of every city and upsert it into ``coll``.

    For each name in ``city_list`` the page
    ``https://baike.baidu.com/item/<quoted name>`` is downloaded and every
    field in ``_BAIKE_FIELDS`` is extracted with its regular expression;
    fields that are not found are stored as ``None``. The city name is passed
    through ``standard_city`` (a failed lookup only prints the returned
    value), string values are cleaned of newlines, spaces and ``[n]``
    markers, and the document is upserted into ``coll`` keyed by ``city``
    together with an ``update_time`` stamp.
    """
    for city_name in city_list:
        print(city_name)
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(city_name)
        res = requests.get(url=url, headers=headers)
        html = res.content.decode('UTF-8', 'ignore')

        data = {
            key: _extract_baike_field(html, patterns, group)
            for key, patterns, group in _BAIKE_FIELDS
        }

        is_true, city = standard_city(city_name)
        if not is_true:
            print(city)
        data['city'] = city
        data['update_time'] = datetime.now()

        data = {key: _clean_baike_value(value) for key, value in data.items()}
        print(data)
        # The filter uses the uncleaned standardized city, as before.
        coll.update_one({'city': city}, {'$set': data}, upsert=True)