def parse_data(file_name, path1, path2):
    """Re-classify the rows of a ``fail_<city>.csv`` file into two CSV files.

    Rows are read positionally: column 0 is the supervise number, 1 the id,
    2 the province, 4 the corrected city, 5 the location, 6 the source URL,
    7/8 the ``bd_*`` coordinates and 9/10 the ``tdt_*`` coordinates.  A row is
    a success when its supervise number has at least four characters and one
    of the two coordinate pairs is non-zero; its district is then looked up
    with ``getAddressInfo``.  Every other row is written to the fail file.

    Args:
        file_name: Path of the input CSV; must contain ``/fail_<city>.csv``.
        path1: Prefix prepended to ``success_<city>.csv``.
        path2: Prefix prepended to ``fail_<city>.csv``.

    Raises:
        AttributeError: If ``file_name`` does not match ``/fail_<city>.csv``.
        ValueError: If a coordinate column cannot be converted to ``float``.
        IndexError: If a processed row has fewer than eleven columns.
    """
    success_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag'
    ]
    # The original list was missing the comma between 'tdt_lon' and 'flag',
    # which silently produced a single 'tdt_lonflag' column.
    fail_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'location', 'data_source_url', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag'
    ]
    success_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    fail_fmt = '"%s","%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    # Context managers guarantee all three files are closed even when a row
    # fails to parse or the district lookup raises.
    with open(file_name, 'rU') as source:
        file_city = re.search(u'/fail_(.*)\\.csv', file_name).group(1)
        success_path = path1 + u'success_' + file_city + u'.csv'
        fail_path = path2 + u'fail_' + file_city + u'.csv'

        with open(success_path, 'w+') as success_file, \
                open(fail_path, 'w+') as fail_file:
            csv.DictWriter(success_file,
                           fieldnames=success_headers).writeheader()
            csv.DictWriter(fail_file, fieldnames=fail_headers).writeheader()

            for row, data in enumerate(csv.reader(source)):
                print('%s %s' % (row, data))
                # NOTE(review): rows 0 and 1 are both skipped.  With a single
                # header line this also drops the first data row -- confirm
                # the input really carries two leading non-data lines.
                if row < 2:
                    continue
                print(len(data))

                ele_no = data[0]
                record_id = data[1]
                province = data[2]
                right_city = data[4]
                location = data[5]
                source_url = data[6]
                bd_lat, bd_lon = float(data[7]), float(data[8])
                tdt_lat, tdt_lon = float(data[9]), float(data[10])

                district = None
                flag = 0
                if ele_no and len(ele_no) >= 4:
                    # Prefer the bd_* pair; fall back to the tdt_* pair.
                    if bd_lat and bd_lon:
                        district = getAddressInfo(bd_lat, bd_lon)[1]
                        flag = 1
                    elif tdt_lat and tdt_lon:
                        district = getAddressInfo(tdt_lat, tdt_lon)[1]
                        flag = 1

                if flag == 1:
                    write_line = success_fmt % (
                        ele_no, record_id, province, right_city, district,
                        location, bd_lat, bd_lon, tdt_lat, tdt_lon, flag)
                    success_file.write(write_line + "\n")
                else:
                    write_line = fail_fmt % (
                        ele_no, record_id, province, right_city, right_city,
                        location, source_url, bd_lat, bd_lon, tdt_lat,
                        tdt_lon, flag)
                    fail_file.write(write_line + "\n")
def excelToCsv(filename):
    """Read an Excel sheet, geocode each location and write the rows to CSV.

    The first sheet of the workbook is used.  Sheet row 1 holds the column
    titles (``id``, ``electr_supervise_no`` and ``location`` are required) and
    data starts at sheet row 2.  Every location is prefixed with ``北京市`` and
    passed to ``getGeoPoints`` and ``tiandituPoint``.  Rows whose ``bd_*``
    coordinates are non-zero and whose reverse-geocoded city is ``北京市`` are
    written to ``new_data_1.csv``; all other rows go to ``fail_data_1.csv``.

    Args:
        filename: Path of the Excel workbook to read.

    Raises:
        ValueError: If one of the required column titles is missing.
    """
    book = xlrd.open_workbook(filename)
    sheet = book.sheets()[0]
    titles = sheet.row_values(1)
    id_col = titles.index('id')
    supervise_col = titles.index('electr_supervise_no')
    location_col = titles.index('location')

    headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'status'
    ]
    row_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    province = u'北京市'

    with open(u'new_data_1.csv', 'w+') as success_file, \
            open(u'fail_data_1.csv', 'w+') as fail_file:
        csv.DictWriter(success_file, fieldnames=headers).writeheader()
        csv.DictWriter(fail_file, fieldnames=headers).writeheader()

        for i in range(2, sheet.nrows):
            record_id = sheet.cell_value(i, id_col)
            electr_supervise_no = sheet.cell_value(i, supervise_col)
            location = sheet.cell_value(i, location_col)
            address = u'北京市' + location
            bd_lat, bd_lon = getGeoPoints(address)
            tdt_lat, tdt_lon = tiandituPoint(address)

            # Reset per row: previously these were only assigned when the
            # coordinates were non-zero, so a failed lookup either raised a
            # NameError (first row) or reused the previous row's values.
            city = u''
            district = u''
            status = 0
            if bd_lat != 0 and bd_lon != 0:
                info = getAddressInfo(bd_lat, bd_lon)
                city = info[0]
                district = info[1]
                # Only rows resolved inside the expected city count as good.
                if city == u'北京市':
                    status = 1

            write_lines = row_fmt % (
                electr_supervise_no, record_id, province, city, district,
                location, bd_lat, bd_lon, tdt_lat, tdt_lon, status)
            print(write_lines)
            if status == 1:
                success_file.write(write_lines + "\n")
            else:
                fail_file.write(write_lines + "\n")
def parse_es_data(success_citys, i):
    """Export ES records per city into success/fail CSV files.

    For every entry of ``get_city2()`` whose four-character key prefix starts
    with ``i`` and is not yet in ``success_citys``, up to 10000 documents whose
    ``electr_supervise_no`` starts with that prefix are fetched from the
    ``land_transaction_1_cn`` index.  A record whose stored city equals the
    expected city is geocoded; it is written to the success file when the
    ``bd_*`` coordinates are non-zero and the supervise number is longer than
    nine characters.  All other records are written to the fail file.

    NOTE(review): another ``parse_es_data(f, i)`` is defined later in this
    file and replaces this definition at import time -- confirm which one is
    meant to be live.

    Args:
        success_citys: List of prefixes already exported; the prefix of every
            city exported here is appended to it.
        i: First character a prefix must have to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict.keys()))

    success_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag'
    ]
    # Matches the twelve values written per fail row and the column order
    # parse_data reads back (location before data_source_url).  The original
    # header listed only eight columns.
    fail_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'location', 'data_source_url', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag'
    ]
    success_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    fail_fmt = '"%s","%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    for key in city_dict.keys():
        prefix = key[0:4]
        if prefix in success_citys or prefix[0] != i:
            continue

        city_name = city_dict[key]
        success_path = u'success_data_14/success_data_%s_%s.csv' % (
            city_name, prefix)
        fail_path = u'fail_data_14/fail_data_%s.csv' % city_name

        # ``with`` closes both files on every path, including the no-data
        # ``continue`` below, which used to skip the explicit close() calls.
        with open(success_path, 'w+') as success_file, \
                open(fail_path, 'w+') as fail_file:
            csv.DictWriter(success_file,
                           fieldnames=success_headers).writeheader()
            csv.DictWriter(fail_file, fieldnames=fail_headers).writeheader()
            print(u'%s- %s' % (city_name, prefix))

            sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
            results = es.search("land_transaction_1_cn", "transaction", sql)
            if not results['hits']['total'] > 0:
                print(u'%s没有数据' % city_name)
                continue

            data_list = results['hits']['hits']
            print('total-%d' % len(data_list))
            province = province_code[prefix[0:2]]

            for data in data_list:
                record_id = data['_id']
                source = data['_source']
                electr_supervise_no = source['electr_supervise_no']
                city = source['city']
                location = source['location']
                data_source_url = source['data_source_url']

                # Reset per record so a failed lookup cannot inherit the
                # coordinates or district of the previous record.
                bd_lat, bd_lon = float(0), float(0)
                tdt_lat, tdt_lon = float(0), float(0)
                district = u''
                right_city = city_name
                flag = 0

                if city == city_name:
                    flag = 1
                    try:
                        address = city + location
                        bd_lat, bd_lon = getGeoPoints(address)
                        tdt_lat, tdt_lon = tiandituPoint(address)
                        district = getAddressInfo(bd_lat, bd_lon)[1]
                    except Exception as e:
                        # Best effort: keep whatever was resolved so far.
                        logger.debug(e)
                    if (bd_lat == 0 and bd_lon == 0) or \
                            len(electr_supervise_no) <= 9:
                        flag = 0

                if flag == 1:
                    write_line = success_fmt % (
                        electr_supervise_no, record_id, province, city,
                        district, location, bd_lat, bd_lon, tdt_lat,
                        tdt_lon, flag)
                    success_file.write(write_line + "\n")
                else:
                    write_line = fail_fmt % (
                        electr_supervise_no, record_id, province, city,
                        right_city, location, data_source_url, bd_lat,
                        bd_lon, tdt_lat, tdt_lon, flag)
                    fail_file.write(write_line + "\n")

        success_citys.append(prefix)
        print('success-%s' % success_citys[-1])
def parse_es_data(f, i):
    """Re-index ES records per city with corrected location fields.

    For every entry of ``get_city2()`` whose four-character key prefix starts
    with ``i``, up to 10000 documents whose ``electr_supervise_no`` starts
    with that prefix are fetched from ``land_transaction_1_cn``.  Each one is
    turned into a bulk action carrying the supervise number, province and
    location.  Records whose stored city matches the expected city are
    geocoded and additionally carry city, district and both coordinate pairs.
    ``flag`` is 1 only for matching records with non-zero ``bd_*`` coordinates
    and a supervise number longer than nine characters; records with flag 0
    also keep their ``data_source_url``.  The actions of one prefix are sent
    with a single ``helpers.bulk`` call.

    NOTE(review): this definition has the same name as an earlier
    ``parse_es_data(success_citys, i)`` in this file and replaces it at
    import time -- confirm that is intended.
    NOTE(review): the actions use the default bulk operation, which
    presumably replaces each stored document with the fields built here --
    verify that no other fields need to survive.

    Args:
        f: Writable file object; one line is written per city without data.
            It is closed before the function returns.
        i: First character a prefix must have to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict.keys()))

    # Prefixes already re-indexed.  The original ``ids`` list was never
    # filled, so the same prefix was processed once per matching key.
    done_prefixes = set()

    for key in city_dict.keys():
        prefix = key[0:4]
        city_name = city_dict[key]
        print(u'%s- %s' % (city_name, prefix))
        if prefix in done_prefixes or prefix[0] != i:
            continue

        sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
        results = es.search("land_transaction_1_cn", "transaction", sql)
        if not results['hits']['total'] > 0:
            print(u'%s没有数据' % city_name)
            # Terminate the line so consecutive entries do not run together.
            f.write('\"%s没有数据\",\"city_id-%s\"' % (city_name, key) + "\n")
            continue

        data_list = results['hits']['hits']
        print(len(data_list))
        province = province_code[prefix[0:2]]

        actions = []
        for data in data_list:
            source = data['_source']
            electr_supervise_no = source['electr_supervise_no']
            city = source['city']
            location = source['location']
            data_source_url = source['data_source_url']

            # Every field lives inside ``_source``: helpers.bulk sends only
            # ``_source`` as the document body when that key is present, so
            # the top-level district/flag keys of the original were dropped.
            doc = {
                "electr_supervise_no": electr_supervise_no,
                "province": province,
                "location": location
            }
            flag = 0
            if city == city_name:
                address = city + location
                bd_lat, bd_lon = getGeoPoints(address)
                tdt_lat, tdt_lon = tiandituPoint(address)
                district = getAddressInfo(bd_lat, bd_lon)[1]
                doc["city"] = city
                doc["district"] = district
                # Build the nested objects explicitly; indexing into a
                # missing "geopoint" key used to raise KeyError.
                # NOTE(review): sub-field names are kept from the original
                # code -- confirm they match the index mapping.
                doc["geopoint"] = {"bd_lat": bd_lat, "bd_lon": bd_lon}
                doc["geopoint_tdt"] = {"tdt_lat": tdt_lat,
                                       "tdt_lon": tdt_lon}
                if not ((bd_lat == 0 and bd_lon == 0) or
                        len(electr_supervise_no) <= 9):
                    flag = 1

            if flag == 0:
                doc["data_source_url"] = data_source_url
            doc["flag"] = flag

            actions.append({
                "_index": "land_transaction_1_cn",
                "_type": "transaction",
                "_id": data['_id'],
                "_source": doc
            })

        result = helpers.bulk(es, actions=actions)
        print(result)
        done_prefixes.add(prefix)

    f.close()