import json
import re
import pymysql
from urllib.parse import urljoin

# Project-level objects such as MysqlSource, MySession, ServiceStandardError, logger,
# insert_db and the various config/pool objects are assumed to be defined elsewhere
# in this project.


def to_data(table_name):
    global offset
    select_sql = '''SELECT source, source_id, others_info FROM detail_hotel_{0}'''.format(table_name)
    try:
        _data = []
        for result in MysqlSource(db_config=config, table_or_query=select_sql, size=10000,
                                  is_table=False, is_dict_cursor=True):
            offset += 1
            others_info = result['others_info']
            if not others_info:
                continue
            others_info = json.loads(others_info)
            if 'first_img' not in others_info:
                continue
            first_img_url = others_info['first_img']
            if not is_legal(first_img_url):
                continue
            md5_str = encode(first_img_url)
            source = result['source']
            source_id = result['source_id']
            _data.append((source, source_id, md5_str))
            # Flush to the database every 1000 collected rows.
            if len(_data) % 1000 == 0:
                insert_db(table_name, _data)
                _data = []
        # Flush whatever is left after the loop ends.
        insert_db(table_name, _data)
    except Exception as exc:
        logger.exception(msg="[exception while inserting into the database]", exc_info=exc)
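# ---------------------------------------------------------------------------------
# Hedged sketch, not part of the original module: `encode` and `is_legal` are called
# throughout this file but never defined in this excerpt. Judging from the call sites
# (MD5-style hash columns, image-URL filtering), they probably look roughly like the
# stand-ins below. Treat them as assumptions and swap in the project's real helpers.
import hashlib


def encode(text):
    # Assumed behaviour: MD5 hex digest of a unicode string, used as a hash key.
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def is_legal(url):
    # Assumed behaviour: a minimal sanity check that the value is an http(s) URL.
    return bool(url) and url.startswith(('http://', 'https://'))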
def get_task():
    sql = '''SELECT source, sid, suggest_type, suggest, city_id, country_id, s_city, s_region,
             s_country, s_extra, label_batch, others_info FROM ota_location_bak_1215;'''
    data = []
    _count = 0
    for line in MysqlSource(poi_ori_config, table_or_query=sql, size=2000, is_table=False,
                            is_dict_cursor=False):
        _count += 1
        new_line = list(line)
        # Insert the MD5 of `sid` right after `source`, matching the target column order.
        new_line.insert(1, encode(line[1]))
        data.append(new_line)
        if len(data) == 1000:
            logger.info("[count: {}]".format(_count))
            update_sql(data)
            data = []
    # Flush the remaining rows.
    update_sql(data)
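# Hedged sketch, not part of the original module: `update_sql` is not defined in this
# excerpt. Since get_task builds rows in the column order of the ota_location_*_1215
# tables (source, sid_md5, sid, ...), it presumably does a batched write roughly like
# the one below; the target table name here is only an illustrative assumption.
def update_sql(rows):
    if not rows:
        return
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    sql = '''INSERT IGNORE INTO ota_location_bak_1215_md5
             (source, sid_md5, sid, suggest_type, suggest, city_id, country_id,
              s_city, s_region, s_country, s_extra, label_batch, others_info)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    cursor.executemany(sql, rows)
    conn.commit()
    cursor.close()
    conn.close()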
def insert_unknown_keywords(_type, _keyword_or_keywords):
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    sql = '''INSERT IGNORE INTO unknown_keywords (`type`, `key_hash`, `keywords`) VALUES (%s, %s, %s);'''
    if isinstance(_keyword_or_keywords, str):
        _hash_key = encode(_keyword_or_keywords)
        cursor.execute(sql, (_type, _hash_key, _keyword_or_keywords))
    elif isinstance(_keyword_or_keywords, (list, set, tuple)):
        for each_keyword in _keyword_or_keywords:
            _hash_key = encode(each_keyword)
            cursor.execute(sql, (_type, _hash_key, each_keyword))
    else:
        logger.debug(
            "[unknown _keyword_or_keywords type: {}][_type: {}][_keyword_or_keywords: {}]".format(
                type(_keyword_or_keywords), _type, _keyword_or_keywords))
    conn.commit()
    cursor.close()
    conn.close()
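# Usage sketch, not part of the original module: insert_unknown_keywords accepts
# either a single keyword or any iterable of keywords; the `_type` value 'city' is
# only an example.
def _demo_insert_unknown_keywords():
    insert_unknown_keywords('city', 'some unrecognised keyword')
    insert_unknown_keywords('city', ['keyword-a', 'keyword-b'])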
def qyer_city():
    results = db.QyerCity.find({})
    conn = pymysql.connect(**mysql_config)
    cursor = conn.cursor()
    insert_sql = '''INSERT IGNORE INTO ota_location_qyer_1215
                    (source, sid_md5, sid, suggest_type, suggest, city_id, country_id, s_city,
                     s_region, s_country, s_extra, label_batch, others_info)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    data = []
    _count = 0
    for result in results:
        _count += 1
        if _count % 1000 == 0:
            print(_count)
        city_list = result.get('city')
        for city in city_list:
            if city.get('type_name') != 'city':
                continue
            hotel_url = city.get('url')
            city_name = city.get('cn_name')
            city_name = city_name.replace('<span class="cGreen">', '').replace('</span>', '')
            hotel_url = urljoin('http:', hotel_url)
            # Normalise the URL to end with a slash and take the last path segment as sid.
            if hotel_url.endswith('/'):
                sid = hotel_url.split('/')[-2]
            else:
                sid = hotel_url.split('/')[-1]
                hotel_url = hotel_url + '/'
            others_info = json.dumps({'from': 'qyer_suggest'})
            row = ('qyer', encode(sid), sid, 1, hotel_url, 'NULL', 'NULL', city_name,
                   'NULL', 'NULL', 'NULL', '2017-12-13a', others_info)
            data.append(row)
            if len(data) == 1000:
                try:
                    cursor.executemany(insert_sql, data)
                    conn.commit()
                    data = []
                except Exception as e:
                    conn.rollback()
    # Flush the remaining rows after the loop.
    try:
        cursor.executemany(insert_sql, data)
        conn.commit()
    except Exception as e:
        conn.rollback()
def qyer_baidu_city():
    results = db.BaiDuSuggest.find({})
    conn = pymysql.connect(**mysql_config)
    cursor = conn.cursor()
    insert_sql = '''INSERT IGNORE INTO ota_location_qyer_1215
                    (source, sid_md5, sid, suggest_type, suggest, city_id, country_id, s_city,
                     s_region, s_country, s_extra, label_batch, others_info)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    results_list = []
    _count = 0
    for result in results:
        _count += 1
        city_list = result.get('city_url')
        for city in city_list:
            if 'poi' in city:
                continue
            try:
                hotel_url = city.replace('///', '//')
                sid = re.search(r'place\.qyer\.com/(.*?)(?=/)', hotel_url).group(1)
                hotel_url = 'http://place.qyer.com/{0}/'.format(sid)
                city_name = sid
                others_info = json.dumps({'from': 'baidu_suggest'})
                results_list.append(
                    ('qyer', encode(sid), sid, 1, hotel_url, 'NULL', 'NULL', city_name,
                     'NULL', 'NULL', 'NULL', '2017-12-13a', others_info))
                if len(results_list) >= 2000:
                    print('*' * 10, _count, '*' * 10)
                    cursor.executemany(insert_sql, results_list)
                    conn.commit()
                    results_list = []
            except Exception as e:
                # Skip any URL that does not match the expected place.qyer.com pattern.
                pass
    # Flush the remaining rows after the loop.
    cursor.executemany(insert_sql, results_list)
    conn.commit()
    print('*' * 100, _count, '*' * 100)
def _execute(self, **kwargs):
    url = self.task.kwargs['url']
    flag = self.task.kwargs['flag']
    table_name = self.task.kwargs['table_name']
    md5_url = encode(url)
    with MySession(need_proxies=True, need_cache=True) as session:
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        if len(page.text) == 0:
            raise ServiceStandardError(error_code=ServiceStandardError.PROXY_FORBIDDEN)
        content = page.text
        j_data = json.loads(content)
        if j_data['status'] not in ['OK', 'ZERO_RESULTS']:
            raise ServiceStandardError(error_code=ServiceStandardError.PROXY_FORBIDDEN)
        data = (md5_url, url, content, flag)
        conn = pymysql.connect(host='10.10.231.105', user='******', passwd='hourong',
                               db='crawled_html', charset='utf8')
        try:
            cursor = conn.cursor()
            sql = '''INSERT IGNORE INTO crawled_html.{0} (`md5`, `url`, `content`, `flag`)
                     VALUES (%s, %s, %s, %s)'''.format(table_name)
            cursor.execute(sql, data)
            conn.commit()
            cursor.close()
        except Exception as e:
            raise ServiceStandardError(error_code=ServiceStandardError.PROXY_FORBIDDEN,
                                       wrapped_exception=e)
        finally:
            conn.close()
    self.task.error_code = 0
    return 'OK', url