Пример #1
0
def to_data(table_name):
    """Collect (source, source_id, md5-of-first-image) rows for one hotel table.

    Reads ``source``, ``source_id`` and ``others_info`` from
    ``detail_hotel_<table_name>`` through ``MysqlSource``, keeps the rows
    whose ``others_info`` JSON carries a ``first_img`` URL accepted by
    ``is_legal``, and hands them to ``insert_db`` in batches of 1000.

    The module-level ``offset`` counter is incremented once per row read,
    including rows that are skipped afterwards.

    Any exception raised while reading or inserting is logged and swallowed;
    rows still pending in the current batch at that point are not inserted.

    :param table_name: suffix appended to ``detail_hotel_`` to build the
        table name; also forwarded unchanged to ``insert_db``.
    """
    global offset
    query = '''SELECT
  source,
  source_id,
  others_info
FROM detail_hotel_{0}'''.format(table_name)
    try:
        pending = []
        rows = MysqlSource(db_config=config,
                           table_or_query=query,
                           size=10000,
                           is_table=False,
                           is_dict_cursor=True)
        for row in rows:
            offset += 1

            raw_info = row['others_info']
            if not raw_info:
                continue

            info = json.loads(raw_info)
            if 'first_img' not in info:
                continue

            img_url = info['first_img']
            if not is_legal(img_url):
                continue

            # Hash first, then read the identifying columns (same order as
            # the lookups were always performed in).
            digest = encode(img_url)
            pending.append((row['source'], row['source_id'], digest))

            # The batch is reset after every flush, so this fires exactly
            # when 1000 rows have accumulated.
            if not len(pending) % 1000:
                insert_db(table_name, pending)
                pending = []

        # Flush whatever is left over (may be an empty list).
        insert_db(table_name, pending)
    except Exception as exc:
        logger.exception(msg="[入库出现异常]", exc_info=exc)
Пример #2
0
def get_task():
    """Re-emit every ``ota_location_bak_1215`` row with a hash of ``sid`` added.

    Each row is read as a positional sequence in the column order of the
    SELECT below.  ``encode`` of the second column (``sid``) is inserted at
    index 1, so the outgoing record is ``[source, encode(sid), sid, ...]``.
    Records are passed to ``update_sql`` in batches of 1000, followed by one
    final call with the remainder (which may be empty).
    """
    query = '''SELECT
  source,
  sid,
  suggest_type,
  suggest,
  city_id,
  country_id,
  s_city,
  s_region,
  s_country,
  s_extra,
  label_batch,
  others_info
FROM ota_location_bak_1215;'''
    batch = []
    rows = MysqlSource(poi_ori_config, table_or_query=query,
                       size=2000, is_table=False,
                       is_dict_cursor=False)
    for seen, row in enumerate(rows, start=1):
        record = list(row)
        # Splice the hash in right before the original sid column.
        record[1:1] = [encode(row[1])]
        batch.append(record)

        if len(batch) != 1000:
            continue

        logger.info("[count: {}]".format(seen))
        update_sql(batch)
        batch = []

    update_sql(batch)
Пример #3
0
def insert_unknown_keywords(_type, _keyword_or_keywords):
    """Record one or more unknown keywords in the ``unknown_keywords`` table.

    Each keyword is stored together with ``_type`` and ``encode(keyword)``
    as its ``key_hash``.  Rows are written with ``INSERT IGNORE``.

    :param _type: value stored in the ``type`` column for every keyword.
    :param _keyword_or_keywords: a single ``str`` keyword, or a ``list``,
        ``set`` or ``tuple`` of keywords.  Any other type is logged at debug
        level and nothing is written.
    :raises Exception: whatever the database driver raises; the transaction
        is rolled back and the connection released before it propagates.
    """
    if isinstance(_keyword_or_keywords, str):
        keywords = (_keyword_or_keywords,)
    elif isinstance(_keyword_or_keywords, (list, set, tuple)):
        keywords = _keyword_or_keywords
    else:
        logger.debug(
            "[unknown _keyword_or_keywords type: {}][_type: {}][_keyword_or_keywords: {}]"
            .format(type(_keyword_or_keywords), _type, _keyword_or_keywords))
        # Nothing to insert, so do not take a connection from the pool.
        return

    sql = '''INSERT IGNORE INTO unknown_keywords (`type`, `key_hash`, `keywords`) VALUES (%s, %s, %s);'''
    conn = poi_ori_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            for each_keyword in keywords:
                cursor.execute(sql, (_type, encode(each_keyword), each_keyword))
            conn.commit()
        except Exception:
            # Do not hand a half-finished transaction back to the pool.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when execute/commit failed.
        conn.close()
Пример #4
0
def qyer_city():
    """Copy Qyer city suggestions from MongoDB into ``ota_location_qyer_1215``.

    Iterates every document of ``db.QyerCity``; for each entry of the
    document's ``city`` list whose ``type_name`` is ``'city'`` it builds one
    row (source ``'qyer'``, hashed sid, sid, normalised URL, city name, ...)
    and inserts the rows with ``INSERT IGNORE`` in batches of 1000.

    Entries that are not cities, or that carry no ``url``, are skipped.
    Inserting is best-effort: a batch that fails is rolled back, reported on
    stdout and dropped, and processing continues with the next batch.

    A progress counter is printed every 1000 documents.
    """
    insert_sql = '''INSERT IGNORE INTO ota_location_qyer_1215 (SOURCE, sid_md5, sid, suggest_type, suggest, city_id, country_id, s_city, s_region, s_country, s_extra, label_batch, others_info)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    batch_size = 1000
    # NOTE(review): the key is spelled 'form' here while qyer_baidu_city
    # writes 'from'.  Probably a typo, but it is kept as-is because rows
    # already stored (and any readers of them) may depend on it -- confirm.
    others_info = json.dumps({'form': 'qyer_suggest'})

    conn = pymysql.connect(**mysql_config)
    cursor = conn.cursor()

    def flush(rows):
        """Insert *rows* in one transaction; roll back and report on error."""
        if not rows:
            return
        try:
            cursor.executemany(insert_sql, rows)
            conn.commit()
        except Exception as exc:
            conn.rollback()
            print('[qyer_city] batch insert failed, {0} rows dropped: {1!r}'
                  .format(len(rows), exc))

    data = []
    try:
        for _count, result in enumerate(db.QyerCity.find({}), 1):
            if _count % 1000 == 0:
                print(_count)
            for city in result.get('city') or []:
                # Only real city entries produce a row; anything else used to
                # re-insert the previous city's values (or hit a NameError).
                if city.get('type_name') != 'city':
                    continue
                raw_url = city.get('url')
                if not raw_url:
                    # Without a URL there is no sid to derive.
                    continue

                city_name = (city.get('cn_name') or '').replace(
                    '<span class="cGreen">', '').replace('</span>', '')

                # Resolve scheme-relative URLs and make sure the stored URL
                # ends with '/'; the sid is its last path segment.
                hotel_url = urljoin('http:', raw_url)
                if hotel_url.endswith('/'):
                    sid = hotel_url.split('/')[-2]
                else:
                    sid = hotel_url.split('/')[-1]
                    hotel_url = hotel_url + '/'

                data.append(
                    ('qyer', encode(sid), sid, 1, hotel_url, 'NULL', 'NULL',
                     city_name, 'NULL', 'NULL', 'NULL', '2017-12-13a',
                     others_info))
                if len(data) >= batch_size:
                    flush(data)
                    # Start a fresh batch whether or not the insert worked,
                    # so one bad batch cannot stall all later flushes.
                    data = []

        # Insert the final, partially filled batch.
        flush(data)
    finally:
        cursor.close()
        conn.close()
Пример #5
0
def qyer_baidu_city():
    """Copy Qyer city URLs found via Baidu suggest into ``ota_location_qyer_1215``.

    Iterates every document of ``db.BaiDuSuggest``; for each entry of the
    document's ``city_url`` list that does not contain ``'poi'`` it extracts
    the path segment following ``place.qyer.com/`` as the sid, rebuilds the
    canonical URL ``http://place.qyer.com/<sid>/`` and queues one row (the
    sid doubles as the city name).  Rows are inserted with ``insert ignore``
    in batches of 2000, plus one final insert for the remainder.

    Entries from which no sid can be extracted are skipped.  Inserting is
    best-effort: a batch that fails is rolled back, reported on stdout and
    dropped, and processing continues.
    """
    insert_sql = "insert ignore into ota_location_qyer_1215(source,sid_md5,sid,suggest_type,suggest,city_id,country_id,s_city,s_region,s_country,s_extra,label_batch,others_info) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    batch_size = 2000
    others_info = json.dumps({'from': 'baidu_suggest'})
    sid_pattern = re.compile(r'place\.qyer\.com/(.*?)(?=/)')

    conn = pymysql.connect(**mysql_config)
    cursor = conn.cursor()

    def flush(rows):
        """Insert *rows* in one transaction; roll back and report on error."""
        if not rows:
            return
        try:
            cursor.executemany(insert_sql, rows)
            conn.commit()
        except Exception as exc:
            conn.rollback()
            print('[qyer_baidu_city] batch insert failed, {0} rows dropped: {1!r}'
                  .format(len(rows), exc))

    results_list = []
    _count = 0
    try:
        for result in db.BaiDuSuggest.find({}):
            _count += 1
            for city in result.get('city_url') or []:
                if 'poi' in city:
                    continue
                try:
                    match = sid_pattern.search(city.replace('///', '//'))
                except (AttributeError, TypeError):
                    # Entry is not a URL string; nothing to extract.
                    continue
                if match is None:
                    # No '<sid>/' segment after the host.
                    continue

                sid = match.group(1)
                hotel_url = 'http://place.qyer.com/{0}/'.format(sid)
                results_list.append(
                    ('qyer', encode(sid), sid, 1, hotel_url, 'NULL',
                     'NULL', sid, 'NULL', 'NULL', 'NULL',
                     '2017-12-13a', others_info))
                if len(results_list) >= batch_size:
                    print('*' * 10, _count, '*' * 10)
                    flush(results_list)
                    results_list = []

        # Insert the final, partially filled batch (previously the exhausted
        # Mongo cursor was passed here, so these rows were never written).
        flush(results_list)
    finally:
        cursor.close()
        conn.close()
    print('*' * 100, _count, '*' * 100)
Пример #6
0
    def _execute(self, **kwargs):
        """Fetch the task's URL and store the raw JSON response in MySQL.

        Reads ``url``, ``flag`` and ``table_name`` from ``self.task.kwargs``,
        downloads the URL through a proxied, cached ``MySession``, checks that
        the JSON body's ``status`` is ``'OK'`` or ``'ZERO_RESULTS'``, and
        inserts ``(md5, url, content, flag)`` into
        ``crawled_html.<table_name>`` with ``insert ignore``.

        On success ``self.task.error_code`` is set to 0.

        :param kwargs: accepted but not used by this method.
        :returns: the tuple ``('OK', url)``.
        :raises ServiceStandardError: with ``PROXY_FORBIDDEN`` when the body
            is empty, when ``status`` has any other value, or when the
            database insert fails (the driver error is attached as
            ``wrapped_exception``).
        """
        url = self.task.kwargs['url']
        flag = self.task.kwargs['flag']
        table_name = self.task.kwargs['table_name']

        md5_url = encode(url)
        with MySession(need_proxies=True, need_cache=True) as session:
            page = session.get(url, timeout=240)
            page.encoding = 'utf8'
            content = page.text
            if not content:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)

            j_data = json.loads(content)
            if j_data['status'] not in ['OK', 'ZERO_RESULTS']:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)

            data = (md5_url, url, content, flag)
            # NOTE(review): host and credentials are hard-coded here; they
            # should come from configuration rather than source code.
            conn = pymysql.connect(host='10.10.231.105',
                                   user='******',
                                   passwd='hourong',
                                   db='crawled_html',
                                   charset="utf8")
            try:
                # NOTE(review): table_name is interpolated into the statement
                # (identifiers cannot be bound as parameters); make sure it
                # only ever comes from trusted task definitions.
                sql = 'insert ignore into crawled_html.{0}(`md5`,`url`,`content`,`flag`) values (%s,%s,%s,%s)'.format(
                    table_name)
                # Use an explicit cursor and commit: ``with conn as cursor``
                # only yields a cursor on old PyMySQL releases and never
                # closed the connection.
                cursor = conn.cursor()
                try:
                    print(cursor.execute(sql, data))
                finally:
                    cursor.close()
                conn.commit()
            except Exception as e:
                # No explicit rollback: the connection is closed right below,
                # which discards the uncommitted transaction.
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN,
                    wrapped_exception=e)
            finally:
                conn.close()
            self.task.error_code = 0
            return 'OK', url