Exemplo n.º 1
0
 def gen_for_records(record):
     """Replace ``_site_record_id`` values in ``record`` with ``get_md5(value)``.

     The record is modified in place and nothing is returned.

     * If ``record['datas']`` is a list or tuple, every item in it that has a
       ``_site_record_id`` key gets that value replaced. A top-level
       ``_site_record_id`` is not touched in this case.
     * Otherwise a top-level ``_site_record_id`` (if present) is replaced.
     * Anything that is not a non-empty dict is left untouched.

     :param record: extracted record, expected to be a dict.
     :raises ParserErrorException: if any error occurs during the conversion.
     """
     try:
         # Only a non-empty dict carries extracted data.
         if not (isinstance(record, dict) and record):
             return

         datas = record.get('datas', None)
         if isinstance(datas, (list, tuple)):
             # List-style record: convert the id of each item that has one.
             for item in datas:
                 # ``in`` rather than ``dict.has_key``: has_key is gone from
                 # Python 3 dicts, and ``in`` behaves the same on Python 2.
                 if '_site_record_id' in item:
                     item['_site_record_id'] = get_md5(
                         item['_site_record_id'])
         elif '_site_record_id' in record:
             # Single record: convert its own id.
             record['_site_record_id'] = get_md5(
                 record['_site_record_id'])
     except Exception:
         raise ParserErrorException("count _site_record_id failed")
Exemplo n.º 2
0
    def do_clean(self, item):
        """Recompute ``item['_record_id']`` and queue the item if it changed.

        The id is ``tools.get_md5`` of the string
        ``'|' + litigants + '|' + court_time + '|' + province``, where each
        field is read from ``item`` and defaults to an empty string. When the
        stored ``_record_id`` differs from the computed one, the item is
        updated in place, a message is printed, and the item is put on
        ``q_data``. Otherwise nothing happens.
        """
        litigants = item.get("litigants", "")
        court_time = item.get("court_time", "")
        province = item.get("province", "")
        key_text = '|' + litigants + '|' + court_time + '|' + province

        fresh_id = tools.get_md5(key_text)
        stale_id = item.get("_record_id", "")

        # Nothing to do when the stored id already matches.
        if stale_id == fresh_id:
            return

        print("okold:{},new:{}".format(stale_id, fresh_id))
        item["_record_id"] = fresh_id
        q_data.put(item)
Exemplo n.º 3
0
def short_url(url):
    """Return ``url`` with ``HZPOST_PATTERN`` matches rewritten, cut to 512 chars.

    Every match of ``HZPOST_PATTERN`` in ``url`` is substituted with
    ``"HZPOST=" + get_md5(url)`` (the hash is taken over the whole original
    url), and the result is truncated to at most 512 characters.
    """
    # NOTE(review): HZPOST_PATTERN is presumably a compiled regex defined
    # elsewhere in the module - confirm.
    replacement = "HZPOST={}".format(get_md5(url))
    rewritten = HZPOST_PATTERN.sub(replacement, url)
    return rewritten[:512]
Exemplo n.º 4
0
            data_list.append(item)
            if len(data_list) >= 500:
                ret = pool.map(prox, data_list)
                del data_list[:]
                for i in range(q_data.qsize()):
                    insert_data_list.append(q_data.get())
                obj._insert_info_batch(obj.targetTable, insert_data_list)
                del insert_data_list[:]

            if num % 1000 == 0:
                print "sum_num:", num, len(
                    data_list), "time_cost:", time.time() - begin_time

            break

        except Exception as e:
            print traceback.format_exc()

    for i in range(q_data.qsize()):
        insert_data_list.append(q_data.get())
    obj._insert_info_batch(obj.targetTable, insert_data_list)
    del insert_data_list[:]

    print "time_cost:", time.time() - begin_time

    url = "http://www.landchina.com/default.aspx?tabid=386&comname=default&wmguid=75c72564-ffd9-426a-954b-8ac2df0903b7&recorderguid=43d2dda6-1a14-448c-b506-8e85cbb4a3bc"

    from i_util import tools

    print tools.get_md5('|' + url)