def excelToCsv(filename):
    """Geocode the rows of an Excel sheet and split them into two CSV files.

    The first sheet of ``filename`` is read. Row index 1 is used as the header
    row and must contain the columns ``id``, ``electr_supervise_no`` and
    ``location``; data rows start at row index 2.

    Every location is prefixed with u'北京市' and passed to ``getGeoPoints``
    and ``tiandituPoint``. When ``getGeoPoints`` returns non-zero coordinates,
    ``getAddressInfo`` supplies the city and district. A row gets status 1
    only when it was geocoded and the resolved city is u'北京市'.

    Rows with status 1 are written to ``new_data_1.csv``; all other rows are
    written to ``fail_data_1.csv``. Both files are truncated on every call.

    :param filename: path of the Excel workbook to read.
    :raises ValueError: if one of the required header columns is missing.
    """
    book = xlrd.open_workbook(filename)
    sheet = book.sheets()[0]
    row_count = sheet.nrows
    titles = sheet.row_values(1)
    id_col = titles.index('id')
    supervise_col = titles.index('electr_supervise_no')
    location_col = titles.index('location')

    headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'status'
    ]
    # Rows are formatted by hand (not through the csv writer) to keep the
    # exact quoting and %f float formatting of the existing output files.
    row_format = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    province = u'北京市'

    with open(u'new_data_1.csv', 'w+') as ok_file:
        csv.DictWriter(ok_file, fieldnames=headers).writeheader()
        with open(u'fail_data_1.csv', 'w+') as fail_file:
            csv.DictWriter(fail_file, fieldnames=headers).writeheader()
            for row in range(2, row_count):
                record_id = sheet.cell_value(row, id_col)
                electr_supervise_no = sheet.cell_value(row, supervise_col)
                location = sheet.cell_value(row, location_col)
                address = u'北京市' + location
                bd_lat, bd_lon = getGeoPoints(address)
                tdt_lat, tdt_lon = tiandituPoint(address)

                # Reset per row: previously these were only assigned when
                # geocoding succeeded, so a failed row either raised
                # NameError or silently reused the previous row's values.
                city = u''
                district = u''
                status = 0
                if bd_lat != 0 and bd_lon != 0:
                    info = getAddressInfo(bd_lat, bd_lon)
                    city = info[0]
                    district = info[1]
                    if city == u'北京市':
                        status = 1

                write_lines = row_format % (
                    electr_supervise_no, record_id, province, city, district,
                    location, bd_lat, bd_lon, tdt_lat, tdt_lon, status)
                print(write_lines)
                target = ok_file if status == 1 else fail_file
                target.write(write_lines + "\n")
def read_excel(file_name):
    """Read the first sheet of an Excel workbook and index its rows into ES.

    Row 0 is the header row; its values become the field names of each
    document. Rows 1 .. nrows - 8 are indexed one by one into the
    ``land_transaction_cn_test`` index with doc type ``transaction``.

    Per column:
      * ``location`` is stored as-is and also geocoded with ``getGeoPoints``
        and ``tiandituPoint``; the results are stored under ``geopoint`` and
        ``geopoint_tianditu`` as ``{'lat': ..., 'lon': ...}``.
      * ``lon`` / ``lat`` columns are skipped.
      * date-typed cells are converted to a ``YYYYMMDD`` string.
      * everything else is copied unchanged.

    The document id is ``abs(hash(first_cell + land_name))``.

    :param file_name: path of the Excel workbook to read.
    :raises ValueError: if there are rows to index but the header row has no
        ``land_name`` column (needed to build the document id).
    """
    workbook = xlrd.open_workbook(file_name)
    worksheet = workbook.sheets()[0]
    nrows = worksheet.nrows
    ncols = worksheet.ncols
    headers = worksheet.row_values(0)

    # The last 7 rows of the sheet are never indexed.
    # NOTE(review): presumably a footer/summary area - confirm with the data.
    row_indexes = range(1, nrows - 7)
    if not row_indexes:
        return

    # Resolve the land_name column once from the header instead of relying on
    # a variable bound as a side effect of the per-row column loop.
    if 'land_name' not in headers:
        raise ValueError("header row has no 'land_name' column")
    land_index = headers.index('land_name')

    # One client for the whole run instead of one per row.
    es = get_es_client()

    for i in row_indexes:
        doc = {}
        geopoint_baidu = {}
        geopoint_tianditu = {}
        for j in range(ncols):
            header = headers[j]
            if header == 'location':
                location = worksheet.cell_value(i, j)
                value = location
                geopoint_baidu['lat'], geopoint_baidu['lon'] = \
                    getGeoPoints(location)
                geopoint_tianditu['lat'], geopoint_tianditu['lon'] = \
                    tiandituPoint(location)
            elif header == 'lon' or header == 'lat':
                # Coordinates from the sheet are dropped; the geocoded ones
                # are stored instead.
                continue
            elif worksheet.cell(i, j).ctype == 3:
                # ctype 3 is xlrd's date cell type; datemode 0 is passed to
                # the converter.
                cell = worksheet.cell_value(i, j)
                value = datetime.strftime(xldate_as_datetime(cell, 0),
                                          '%Y%m%d')
            else:
                value = worksheet.cell_value(i, j)
            doc[header] = value

        doc['geopoint'] = geopoint_baidu
        doc['geopoint_tianditu'] = geopoint_tianditu

        state_no = worksheet.cell_value(i, 0)
        code = state_no + worksheet.cell_value(i, land_index)
        doc_id = abs(hash(code))
        print('存入%d' % i)
        es.index('land_transaction_cn_test', 'transaction', doc, doc_id)
def parse_es_data(success_citys, i):
    """Re-geocode ES records city by city and dump the results to CSV files.

    For every entry of ``get_city2()`` whose 4-character key prefix starts
    with ``i`` and is not yet in ``success_citys``, the records of index
    ``land_transaction_1_cn`` whose ``electr_supervise_no`` starts with that
    prefix are fetched (at most 10000 per prefix).

    A record is a success (flag 1) when its ``city`` equals the city name from
    ``get_city2()``, ``getGeoPoints`` returned non-zero coordinates and its
    ``electr_supervise_no`` is longer than 9 characters. Successes go to
    ``success_data_14/success_data_<city>_<prefix>.csv``, everything else to
    ``fail_data_14/fail_data_<city>.csv``. Both files are created (with a
    header) even when the prefix has no records.

    NOTE(review): this file defines a second function with the same name
    (signature ``(f, i)``) further down, which rebinds the name.

    :param success_citys: list of prefixes already processed; prefixes that
        had data and were processed are appended to it in place.
    :param i: first character a prefix must have to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict))

    success_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag'
    ]
    # Matches the 12 fields actually written per fail row (the header used to
    # list 8 columns in a different order than the rows).
    fail_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'data_source_url', 'location', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag'
    ]
    success_format = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    fail_format = '"%s","%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    for key in city_dict:
        prefix = key[0:4]
        if prefix in success_citys or prefix[0] != i:
            continue
        right_city = city_dict[key]

        success_path = u'success_data_14/success_data_%s_%s.csv' % (
            right_city, prefix)
        fail_path = u'fail_data_14/fail_data_%s.csv' % right_city

        # `with` closes both files on every path, including the no-data
        # `continue` below and any exception raised while processing.
        with open(success_path, 'w+') as f1:
            csv.DictWriter(f1, fieldnames=success_headers).writeheader()
            with open(fail_path, 'w+') as f2:
                csv.DictWriter(f2, fieldnames=fail_headers).writeheader()
                print(u'%s- %s' % (right_city, prefix))

                sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
                results = es.search("land_transaction_1_cn", "transaction",
                                    sql)
                if not results['hits']['total'] > 0:
                    print(u'%s没有数据' % right_city)
                    continue

                data_list = results['hits']['hits']
                print('total-%d' % len(data_list))
                for data in data_list:
                    source = data['_source']
                    record_id = data['_id']
                    electr_supervise_no = source['electr_supervise_no']
                    province = province_code[prefix[0:2]]
                    city = source['city']
                    location = source['location']
                    data_source_url = source['data_source_url']

                    # Reset per record so a failed or skipped geocoding can
                    # never inherit the previous record's coordinates.
                    bd_lat, bd_lon = float(0), float(0)
                    tdt_lat, tdt_lon = float(0), float(0)
                    district = None
                    flag = 0

                    if city == right_city:
                        try:
                            address = city + location
                            bd_lat, bd_lon = getGeoPoints(address)
                            tdt_lat, tdt_lon = tiandituPoint(address)
                            district = getAddressInfo(bd_lat, bd_lon)[1]
                        except Exception as e:
                            # Best effort: a geocoding failure only marks the
                            # record as failed (or leaves fields empty).
                            logger.debug(e)
                        geocoded = not (bd_lat == 0 and bd_lon == 0)
                        if geocoded and len(electr_supervise_no) > 9:
                            flag = 1

                    if flag == 1:
                        write_line = success_format % (
                            electr_supervise_no, record_id, province, city,
                            district, location, bd_lat, bd_lon, tdt_lat,
                            tdt_lon, flag)
                        f1.write(write_line + "\n")
                    else:
                        write_line = fail_format % (
                            electr_supervise_no, record_id, province, city,
                            right_city, data_source_url, location, bd_lat,
                            bd_lon, tdt_lat, tdt_lon, flag)
                        f2.write(write_line + "\n")

        success_citys.append(prefix)
        print('success-%s' % success_citys[-1])
def parse_es_data(f, i):
    """Re-geocode ES records city by city and bulk-write them back to ES.

    For every entry of ``get_city2()`` whose 4-character key prefix starts
    with ``i`` and has not been processed yet in this call, the records of
    index ``land_transaction_1_cn`` whose ``electr_supervise_no`` starts with
    that prefix are fetched (at most 10000 per prefix), rebuilt and sent back
    with ``helpers.bulk``.

    Each rebuilt document always carries ``electr_supervise_no``,
    ``province``, ``location`` and ``flag``. When the record's ``city``
    equals the city name from ``get_city2()`` it is geocoded and also gets
    ``city``, ``district``, ``geopoint`` and ``geopoint_tdt``. ``flag`` is 1
    only when the city matched, ``getGeoPoints`` returned non-zero
    coordinates and ``electr_supervise_no`` is longer than 9 characters;
    documents with flag 0 additionally keep ``data_source_url``.

    NOTE(review): the actions carry no ``_op_type``, so they appear to be
    plain index operations that replace the stored document with only the
    fields listed above - confirm that this is intended.

    :param f: writable file object; one line is written per city without
        data. It is closed before the function returns.
    :param i: first character a prefix must have to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict))

    # Prefixes already written back, so keys sharing a prefix are not
    # processed twice (the original list was checked but never filled).
    done_prefixes = set()

    for key in city_dict:
        prefix = key[0:4]
        city_name = city_dict[key]
        print(u'%s- %s' % (city_name, prefix))
        if prefix in done_prefixes or prefix[0] != i:
            continue

        sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
        results = es.search("land_transaction_1_cn", "transaction", sql)
        if not results['hits']['total'] > 0:
            print(u'%s没有数据' % city_name)
            # Newline added so the entries no longer run together on one line.
            f.write('\"%s没有数据\",\"city_id-%s\"' % (city_name, key) + "\n")
            continue

        data_list = results['hits']['hits']
        print(len(data_list))
        actions = []
        for data in data_list:
            original = data['_source']
            electr_supervise_no = original['electr_supervise_no']
            city = original['city']
            location = original['location']
            data_source_url = original['data_source_url']

            # Every document field lives in `_source`: keys placed next to
            # `_source` at the top level of an action are not indexed.
            source = {
                "electr_supervise_no": electr_supervise_no,
                "province": province_code[prefix[0:2]],
                "location": location
            }
            flag = 0
            if city == city_name:
                address = city + location
                bd_lat, bd_lon = getGeoPoints(address)
                tdt_lat, tdt_lon = tiandituPoint(address)
                district = getAddressInfo(bd_lat, bd_lon)[1]
                source["city"] = city
                source["district"] = district
                # The nested dicts are created here; assigning into
                # dic["geopoint"][...] directly raised KeyError.
                source["geopoint"] = {"bd_lat": bd_lat, "bd_lon": bd_lon}
                source["geopoint_tdt"] = {
                    "tdt_lat": tdt_lat,
                    "tdt_lon": tdt_lon
                }
                geocoded = not (bd_lat == 0 and bd_lon == 0)
                if geocoded and len(electr_supervise_no) > 9:
                    flag = 1
            if flag == 0:
                source["data_source_url"] = data_source_url
            source["flag"] = flag

            actions.append({
                "_index": "land_transaction_1_cn",
                "_type": "transaction",
                "_id": data['_id'],
                "_source": source
            })

        result = helpers.bulk(es, actions=actions)
        print(result)
        done_prefixes.add(prefix)

    f.close()