def gen_for_records(record):
    """Replace ``_site_record_id`` values in *record* with their MD5 digest.

    The record is modified in place. Two shapes are handled:

    * ``record['datas']`` is a list or tuple: every element that contains
      ``_site_record_id`` gets that value replaced by ``get_md5(value)``.
      A top-level ``_site_record_id`` is left untouched in this case.
    * otherwise: a top-level ``_site_record_id`` on *record* itself is
      replaced by ``get_md5(value)``.

    Anything that is not a non-empty dict is ignored.

    Args:
        record: The extracted record; elements of ``datas`` are presumably
            dicts -- TODO confirm against the callers.

    Returns:
        None.

    Raises:
        ParserErrorException: If any step of the conversion fails.
    """
    id_key = '_site_record_id'
    try:
        # Nothing to do unless there is extracted data.
        if not (isinstance(record, dict) and record):
            return

        datas = record.get('datas')
        if isinstance(datas, (list, tuple)):
            # List data: convert the id of every element that carries one.
            for item in datas:
                # ``in`` replaces dict.has_key(), which is gone in Python 3.
                if id_key in item:
                    item[id_key] = get_md5(item[id_key])
        elif id_key in record:
            # Not list data: convert the record's own id.
            record[id_key] = get_md5(record[id_key])
    except Exception:
        raise ParserErrorException("count _site_record_id failed")
def do_clean(self, item):
    """Recompute the ``_record_id`` of *item* and queue the item if it changed.

    The id is ``tools.get_md5`` of the string
    ``'|' + litigants + '|' + court_time + '|' + province``, where a missing
    field contributes an empty string.

    Args:
        item: Mapping holding the record fields; updated in place.

    Returns:
        None.
    """
    # NOTE(review): the original layout was flattened; the id update and the
    # enqueue are assumed to run only when the id differs -- confirm.
    key_fields = ("litigants", "court_time", "province")
    key_source = "".join('|' + item.get(field, "") for field in key_fields)
    fresh_id = tools.get_md5(key_source)
    stored_id = item.get("_record_id", "")

    if stored_id == fresh_id:
        return

    print("okold:{},new:{}".format(stored_id, fresh_id))
    item["_record_id"] = fresh_id
    q_data.put(item)
def short_url(url):
    """Return *url* with HZPOST_PATTERN matches replaced, cut to 512 chars.

    Every match of ``HZPOST_PATTERN`` in *url* is substituted with
    ``"HZPOST=" + get_md5(url)`` (the digest of the whole, unmodified URL),
    and the result is truncated to its first 512 characters.

    Args:
        url: The URL string to shorten.

    Returns:
        The substituted URL, at most 512 characters long.
    """
    # HZPOST_PATTERN is defined elsewhere; presumably a compiled regex.
    digest_param = "HZPOST={}".format(get_md5(url))
    substituted = HZPOST_PATTERN.sub(digest_param, url)
    return substituted[:512]
data_list.append(item) if len(data_list) >= 500: ret = pool.map(prox, data_list) del data_list[:] for i in range(q_data.qsize()): insert_data_list.append(q_data.get()) obj._insert_info_batch(obj.targetTable, insert_data_list) del insert_data_list[:] if num % 1000 == 0: print "sum_num:", num, len( data_list), "time_cost:", time.time() - begin_time break except Exception as e: print traceback.format_exc() for i in range(q_data.qsize()): insert_data_list.append(q_data.get()) obj._insert_info_batch(obj.targetTable, insert_data_list) del insert_data_list[:] print "time_cost:", time.time() - begin_time url = "http://www.landchina.com/default.aspx?tabid=386&comname=default&wmguid=75c72564-ffd9-426a-954b-8ac2df0903b7&recorderguid=43d2dda6-1a14-448c-b506-8e85cbb4a3bc" from i_util import tools print tools.get_md5('|' + url)