예제 #1
0
def excelToCsv(filename):
    """Read an Excel sheet, geocode every row and write the rows to CSV.

    The first sheet of the workbook is used. Row index 1 supplies the column
    titles and data rows start at row index 2. For each row the 'location'
    cell is prefixed with the province name and passed to ``getGeoPoints``
    (bd_* columns) and ``tiandituPoint`` (tdt_* columns). When the bd_*
    coordinates are both non-zero, ``getAddressInfo`` supplies city and
    district.

    A row gets status 1 and is written to ``new_data_1.csv`` when it was
    geocoded and the returned city equals the province name. Every other row
    gets status 0 and is written to ``fail_data_1.csv``. Each row is also
    printed.

    Args:
        filename: Path handed to ``xlrd.open_workbook``.

    Raises:
        ValueError: If the title row lacks 'id', 'electr_supervise_no' or
            'location' (raised by ``list.index``).
    """
    book = xlrd.open_workbook(filename)
    sheet = book.sheets()[0]
    # Row index 1 holds the column titles; data rows follow from index 2.
    titles = sheet.row_values(1)
    id_col = titles.index('id')
    supervise_col = titles.index('electr_supervise_no')
    location_col = titles.index('location')

    headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'status',
    ]
    province = u'北京市'

    def quote(value):
        # Double embedded quotes so a quoted CSV field stays well-formed.
        return ('%s' % (value,)).replace('"', '""')

    with open(u'new_data_1.csv', 'w+') as f, \
            open(u'fail_data_1.csv', 'w+') as f1:
        for out in (f, f1):
            csv.DictWriter(out, fieldnames=headers).writeheader()

        for i in range(2, sheet.nrows):
            record_id = sheet.cell_value(i, id_col)
            electr_supervise_no = sheet.cell_value(i, supervise_col)
            location = sheet.cell_value(i, location_col)
            address = province + location
            bd_lat, bd_lon = getGeoPoints(address)
            tdt_lat, tdt_lon = tiandituPoint(address)

            # Reset per row: previously these were unbound (NameError) when
            # the first row failed to geocode, and stale from the previous
            # row on any later failure.
            city = district = u''
            status = 0
            if bd_lat != 0 and bd_lon != 0:
                info = getAddressInfo(bd_lat, bd_lon)
                city, district = info[0], info[1]
                if city == province:
                    status = 1

            write_lines = '\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",%f,%f,%f,%f,%d' % (
                quote(electr_supervise_no), quote(record_id), quote(province),
                quote(city), quote(district), quote(location),
                bd_lat, bd_lon, tdt_lat, tdt_lon, status)
            print(write_lines)
            target = f if status == 1 else f1
            target.write(write_lines + "\n")
예제 #2
0
def read_excel(file_name, trailing_rows_to_skip=7):
    """Read the first sheet of an Excel workbook and index each row into ES.

    Row 0 supplies the field names. Every following row, except the last
    ``trailing_rows_to_skip`` rows of the sheet, becomes one document:

    * the 'location' cell is stored and also geocoded with ``getGeoPoints``
      (stored under 'geopoint') and ``tiandituPoint`` (stored under
      'geopoint_tianditu');
    * columns named 'lon' or 'lat' are left out;
    * date-typed cells are stored as 'YYYYMMDD' strings;
    * every other cell is stored as returned by xlrd.

    The document id is derived from the value in column 0 concatenated with
    the 'land_name' value.

    Args:
        file_name: Path handed to ``xlrd.open_workbook``.
        trailing_rows_to_skip: Number of rows at the end of the sheet that
            are not indexed. Defaults to 7, the value that used to be
            hard-coded.

    Raises:
        ValueError: If there are rows to index but the header row has no
            'land_name' column.
    """
    workbook = xlrd.open_workbook(file_name)
    worksheet = workbook.sheets()[0]
    column_names = worksheet.row_values(0)

    data_rows = range(1, worksheet.nrows - trailing_rows_to_skip)
    if not data_rows:
        return

    # Resolve the id column up front. It used to be bound as a side effect of
    # the column loop, so a missing column surfaced as a NameError.
    if 'land_name' not in column_names:
        raise ValueError(
            "sheet has no 'land_name' column; cannot build document ids")
    land_index = column_names.index('land_name')

    # One client for the whole run instead of one per row.
    es = get_es_client()

    for row in data_rows:
        document = {}
        geopoint_baidu = {}
        geopoint_tianditu = {}
        for col, column_name in enumerate(column_names):
            if column_name == 'location':
                value = worksheet.cell_value(row, col)
                geopoint_baidu['lat'], geopoint_baidu['lon'] = \
                    getGeoPoints(value)
                geopoint_tianditu['lat'], geopoint_tianditu['lon'] = \
                    tiandituPoint(value)
            elif column_name in ('lon', 'lat'):
                continue
            elif worksheet.cell(row, col).ctype == xlrd.XL_CELL_DATE:
                # Honour the workbook's own date system (1900 vs 1904)
                # rather than assuming the 1900 one.
                moment = xldate_as_datetime(
                    worksheet.cell_value(row, col), workbook.datemode)
                value = moment.strftime('%Y%m%d')
            else:
                value = worksheet.cell_value(row, col)
            document[column_name] = value

        document['geopoint'] = geopoint_baidu
        document['geopoint_tianditu'] = geopoint_tianditu

        state_no = worksheet.cell_value(row, 0)
        code = state_no + worksheet.cell_value(row, land_index)
        # NOTE(review): hash() is not guaranteed stable across interpreter
        # versions/platforms (and is randomized for str on Python 3), and it
        # can collide. Kept so ids match documents already indexed; consider
        # a hashlib digest if ids may be regenerated.
        id_ = abs(hash(code))
        print('存入%d' % row)

        es.index('land_transaction_cn_test', 'transaction', document, id_)
예제 #3
0
def parse_es_data(success_citys, i):
    """Export ES records per city prefix to success/failure CSV files.

    For every key of ``get_city2()`` the first four characters form a
    prefix. Prefixes already listed in ``success_citys`` or not starting with
    ``i`` are skipped. For the others, records whose ``electr_supervise_no``
    starts with the prefix are fetched from ``land_transaction_1_cn`` (the
    query requests at most 10000 hits).

    A record is written to the success file (flag 1) when its city equals the
    expected city for the key, ``getGeoPoints`` returned a non-zero
    coordinate pair, and ``electr_supervise_no`` is longer than 9 characters.
    Every other record is written to the failure file (flag 0) together with
    the expected city and the source URL.

    Args:
        success_citys: List of prefixes that are already done. It is mutated:
            each prefix that had data and was fully processed is appended.
        i: First character a prefix must have to be processed.
    """
    success_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag',
    ]
    # Matches the twelve values written per failed record. The previous
    # eight-column header did not line up with the rows.
    fail_headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'data_source_url', 'location', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag',
    ]
    success_format = '\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",%f,%f,%f,%f,%d'
    fail_format = '\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",%f,%f,%f,%f,%d'

    def quote(value):
        # Double embedded quotes so a quoted CSV field stays well-formed.
        return ('%s' % (value,)).replace('"', '""')

    city_dict = get_city2()
    print('cities- %d' % len(city_dict))

    for key in city_dict:
        expected_city = city_dict[key]
        prefix = key[0:4]
        if prefix in success_citys or prefix[:1] != i:
            continue

        success_path = u'success_data_14/success_data_%s_%s.csv' % (
            expected_city, prefix)
        fail_path = u'fail_data_14/fail_data_%s.csv' % expected_city

        # `with` closes both files on every exit path; the old `continue`
        # in the no-data branch skipped the close() calls.
        with open(success_path, 'w+') as f1, open(fail_path, 'w+') as f2:
            csv.DictWriter(f1, fieldnames=success_headers).writeheader()
            csv.DictWriter(f2, fieldnames=fail_headers).writeheader()
            print(u'%s- %s' % (expected_city, prefix))

            sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
            results = es.search("land_transaction_1_cn", "transaction", sql)
            if not results['hits']['total'] > 0:
                print(u'%s没有数据' % expected_city)
                # Not appended to success_citys, so the prefix is retried on
                # a later run.
                continue

            data_list = results['hits']['hits']
            print('total-%d' % len(data_list))
            for data in data_list:
                source = data['_source']
                doc_id = data['_id']
                electr_supervise_no = source['electr_supervise_no']
                province = province_code[prefix[0:2]]
                city = source['city']
                location = source['location']
                data_source_url = source['data_source_url']

                # Reset per record so a failed or skipped geocode never
                # reuses the previous record's coordinates or district.
                bd_lat = bd_lon = tdt_lat = tdt_lon = float(0)
                district = None
                flag = 0

                if city == expected_city:
                    try:
                        address = city + location
                        bd_lat, bd_lon = getGeoPoints(address)
                        tdt_lat, tdt_lon = tiandituPoint(address)
                        district = getAddressInfo(bd_lat, bd_lon)[1]
                    except Exception as e:
                        # Best effort: a geocoding failure sends the record
                        # to the failure file instead of aborting the export.
                        logger.debug(e)

                    geocoded = not (bd_lat == 0 and bd_lon == 0)
                    if geocoded and len(electr_supervise_no) > 9:
                        flag = 1

                if flag == 1:
                    write_line = success_format % (
                        quote(electr_supervise_no), quote(doc_id),
                        quote(province), quote(city), quote(district),
                        quote(location), bd_lat, bd_lon, tdt_lat, tdt_lon,
                        flag)
                    f1.write(write_line + "\n")
                else:
                    write_line = fail_format % (
                        quote(electr_supervise_no), quote(doc_id),
                        quote(province), quote(city), quote(expected_city),
                        quote(data_source_url), quote(location), bd_lat,
                        bd_lon, tdt_lat, tdt_lon, flag)
                    f2.write(write_line + "\n")

        success_citys.append(prefix)
        print('success-%s' % prefix)
예제 #4
0
def parse_es_data(f, i):
    """Re-index ES records per city prefix with geocoding results.

    For every key of ``get_city2()`` the first four characters form a
    prefix. Prefixes that do not start with ``i``, or that were already
    bulk-indexed during this call, are skipped. For the others, records whose
    ``electr_supervise_no`` starts with the prefix are fetched from
    ``land_transaction_1_cn`` (the query requests at most 10000 hits) and one
    bulk action per record is sent through ``helpers.bulk``.

    A record whose city equals the expected city for the key is geocoded and
    gets city, district, 'geopoint' and 'geopoint_tdt' fields. It gets flag 1
    only if ``getGeoPoints`` returned a non-zero coordinate pair and
    ``electr_supervise_no`` is longer than 9 characters. Records with flag 0
    additionally carry ``data_source_url``.

    Args:
        f: Writable file object. One line is written per key that has no
            data. The file is closed before returning, also on error.
        i: First character a prefix must have to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict))
    # Prefixes already bulk-indexed in this call. The old `ids` list was
    # never filled, so keys sharing a prefix were processed repeatedly.
    processed = set()
    try:
        for key in city_dict:
            expected_city = city_dict[key]
            prefix = key[0:4]
            print(u'%s- %s' % (expected_city, prefix))
            if prefix in processed or prefix[:1] != i:
                continue

            sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
            results = es.search("land_transaction_1_cn", "transaction", sql)
            if not results['hits']['total'] > 0:
                print(u'%s没有数据' % expected_city)
                # Newline added so consecutive entries do not run together.
                f.write('\"%s没有数据\",\"city_id-%s\"' % (expected_city, key)
                        + "\n")
                continue

            data_list = results['hits']['hits']
            print(len(data_list))

            actions = []
            for data in data_list:
                source = data['_source']
                doc_id = data['_id']
                electr_supervise_no = source['electr_supervise_no']
                province = province_code[prefix[0:2]]
                city = source['city']
                location = source['location']
                data_source_url = source['data_source_url']

                # Every field to be stored goes under "_source": keys placed
                # beside it are not part of the indexed document.
                document = {
                    "electr_supervise_no": electr_supervise_no,
                    "province": province,
                    "location": location,
                }
                flag = 0
                if city == expected_city:
                    address = city + location
                    bd_lat, bd_lon = getGeoPoints(address)
                    tdt_lat, tdt_lon = tiandituPoint(address)
                    district = getAddressInfo(bd_lat, bd_lon)[1]
                    document["city"] = city
                    document["district"] = district
                    # The sub-dicts are created here; assigning into
                    # dic["geopoint"][...] used to raise KeyError.
                    document["geopoint"] = {
                        "bd_lat": bd_lat,
                        "bd_lon": bd_lon,
                    }
                    document["geopoint_tdt"] = {
                        "tdt_lat": tdt_lat,
                        "tdt_lon": tdt_lon,
                    }
                    geocoded = not (bd_lat == 0 and bd_lon == 0)
                    if geocoded and len(electr_supervise_no) > 9:
                        flag = 1

                if flag == 0:
                    document["data_source_url"] = data_source_url
                document["flag"] = flag

                # NOTE(review): without "_op_type" this is an "index" action,
                # which replaces the stored document with only the fields
                # above - confirm a partial "update" is not what is wanted.
                actions.append({
                    "_index": "land_transaction_1_cn",
                    "_type": "transaction",
                    "_id": doc_id,
                    "_source": document,
                })

            # Only reached for prefixes that were actually queried; skipped
            # keys no longer trigger an empty bulk request.
            result = helpers.bulk(es, actions=actions)
            print(result)
            processed.add(prefix)
    finally:
        f.close()