예제 #1
0
def parse_data(file_name, path1, path2):
    """Re-check the rows of a ``fail_<city>.csv`` file and split them again.

    The city tag is taken from ``file_name`` (the text between ``/fail_`` and
    ``.csv``).  Rows whose supervise number has at least four characters and
    that carry a non-zero coordinate pair are resolved to a district through
    ``getAddressInfo`` and appended to ``path1 + 'success_<city>.csv'``; every
    other row is appended to ``path2 + 'fail_<city>.csv'``.

    Args:
        file_name: Path of the input CSV; must contain ``/fail_<city>.csv``.
        path1: Prefix (directory plus separator) for the success file.
        path2: Prefix (directory plus separator) for the fail file.

    Raises:
        ValueError: If ``file_name`` does not match ``/fail_<city>.csv``, or a
            coordinate column cannot be parsed as a float.
    """
    matched = re.search(u'/fail_(.*)\\.csv', file_name)
    if matched is None:
        raise ValueError(
            'expected a path like ".../fail_<city>.csv", got %r' % (file_name,))
    file_city = matched.group(1)
    success = path1 + u'success_' + file_city + u'.csv'
    fail = path2 + u'fail_' + file_city + u'.csv'

    headers1 = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag',
    ]
    headers2 = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'location', 'data_source_url', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag',
    ]
    success_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    fail_fmt = '"%s","%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    # Context managers guarantee all three files are closed even when a row
    # fails to parse or the geocoder raises.
    with open(file_name, 'rU') as fr, \
            open(success, 'w+') as f1, \
            open(fail, 'w+') as f2:
        csv.DictWriter(f1, fieldnames=headers1).writeheader()
        csv.DictWriter(f2, fieldnames=headers2).writeheader()

        for row, data in enumerate(csv.reader(fr)):
            print('%s %s' % (row, data))
            # The first two input rows are skipped (treated as headers).
            if row < 2:
                continue
            print(len(data))

            ele_no = data[0]
            record_id = data[1]
            province = data[2]
            right_city = data[4]
            location = data[5]
            source_url = data[6]
            bd_lat, bd_lon = float(data[7]), float(data[8])
            tdt_lat, tdt_lon = float(data[9]), float(data[10])

            flag = 0
            district = None
            if len(ele_no) >= 4:
                # Prefer the first coordinate pair; fall back to the second.
                if bd_lat and bd_lon:
                    district = getAddressInfo(bd_lat, bd_lon)[1]
                    flag = 1
                elif tdt_lat and tdt_lon:
                    district = getAddressInfo(tdt_lat, tdt_lon)[1]
                    flag = 1

            if flag == 1:
                write_line = success_fmt % (
                    ele_no, record_id, province, right_city, district,
                    location, bd_lat, bd_lon, tdt_lat, tdt_lon, flag)
                f1.write(write_line + "\n")
            else:
                # NOTE(review): both the 'city' and 'right_city' columns
                # receive data[4]; data[3] is never read -- confirm intended.
                write_line = fail_fmt % (
                    ele_no, record_id, province, right_city, right_city,
                    location, source_url, bd_lat, bd_lon, tdt_lat, tdt_lon,
                    flag)
                f2.write(write_line + "\n")
예제 #2
0
def excelToCsv(filename):
    """Geocode the rows of an Excel sheet and write them to two CSV files.

    Reads the first sheet of ``filename``: the row at index 1 holds the column
    titles and data starts at row index 2.  Each location is prefixed with
    the municipality name, geocoded with ``getGeoPoints`` and
    ``tiandituPoint``, and reverse-geocoded with ``getAddressInfo``.  Rows
    whose reverse-geocoded city equals the municipality go to
    ``new_data_1.csv`` (status 1); all others go to ``fail_data_1.csv``
    (status 0).

    Args:
        filename: Path of the Excel workbook to read.

    Raises:
        ValueError: If the title row lacks ``id``, ``electr_supervise_no`` or
            ``location``.
    """
    municipality = u'北京市'

    book = xlrd.open_workbook(filename)
    sheet = book.sheets()[0]
    titles = sheet.row_values(1)
    id_no = titles.index('id')
    electr_supervise = titles.index('electr_supervise_no')
    location_no = titles.index('location')

    headers = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'status',
    ]
    row_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    with open(u'new_data_1.csv', 'w+') as f, \
            open(u'fail_data_1.csv', 'w+') as f1:
        csv.DictWriter(f, fieldnames=headers).writeheader()
        csv.DictWriter(f1, fieldnames=headers).writeheader()

        for i in range(2, sheet.nrows):
            record_id = sheet.cell_value(i, id_no)
            electr_supervise_no = sheet.cell_value(i, electr_supervise)
            province = municipality
            location = sheet.cell_value(i, location_no)
            address = municipality + location
            bd_lat, bd_lon = getGeoPoints(address)
            tdt_lat, tdt_lon = tiandituPoint(address)

            # Reset per row so a failed geocode never reuses the previous
            # row's city/district (or hits an unbound name on the first row).
            city = u''
            district = u''
            status = 0
            if bd_lat != 0 and bd_lon != 0:
                info = getAddressInfo(bd_lat, bd_lon)
                city = info[0]
                district = info[1]
                if city == municipality:
                    status = 1

            write_lines = row_fmt % (
                electr_supervise_no, record_id, province, city, district,
                location, bd_lat, bd_lon, tdt_lat, tdt_lon, status)
            print(write_lines)
            if status == 1:
                f.write(write_lines + "\n")
            else:
                f1.write(write_lines + "\n")
예제 #3
0
def parse_es_data(success_citys, i):
    """Export Elasticsearch records per city into success/fail CSV files.

    For every entry of ``get_city2()`` whose 4-character key prefix starts
    with ``i`` and is not yet in ``success_citys``, queries the
    ``land_transaction_1_cn`` index for records whose ``electr_supervise_no``
    starts with that prefix.  Records whose stored city matches the expected
    city are geocoded; those that geocode successfully and have a supervise
    number longer than 9 characters are written to
    ``success_data_14/success_data_<city>_<prefix>.csv``.  All other records
    are written to ``fail_data_14/fail_data_<city>.csv``.

    Args:
        success_citys: List of prefixes already processed.  It is mutated:
            each prefix that returned data is appended.
        i: Leading character that a prefix must start with to be processed.
    """
    city_dict = get_city2()
    print('cities- %d' % len(city_dict))

    headers1 = [
        'electr_supervise_no', 'id', 'province', 'city', 'district',
        'location', 'bd_lat', 'bd_lon', 'tdt_lat', 'tdt_lon', 'flag',
    ]
    # One header per value actually written to a fail row (12 columns).
    headers2 = [
        'electr_supervise_no', 'id', 'province', 'city', 'right_city',
        'location', 'data_source_url', 'bd_lat', 'bd_lon', 'tdt_lat',
        'tdt_lon', 'flag',
    ]
    success_fmt = '"%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'
    fail_fmt = '"%s","%s","%s","%s","%s","%s","%s",%f,%f,%f,%f,%d'

    for key, city_name in city_dict.items():
        prefix = key[0:4]
        if prefix in success_citys or prefix[:1] != i:
            continue

        success_path = u'success_data_14/success_data_%s_%s.csv' % (
            city_name, prefix)
        fail_path = u'fail_data_14/fail_data_%s.csv' % city_name
        # `with` closes both files on every exit path, including the
        # no-data `continue` below and any exception.
        with open(success_path, 'w+') as f1, open(fail_path, 'w+') as f2:
            csv.DictWriter(f1, fieldnames=headers1).writeheader()
            csv.DictWriter(f2, fieldnames=headers2).writeheader()
            print(u'%s- %s' % (city_name, prefix))

            sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
            results = es.search("land_transaction_1_cn", "transaction", sql)
            if not results['hits']['total'] > 0:
                print(u'%s没有数据' % city_name)
                continue

            data_list = results['hits']['hits']
            print('total-%d' % len(data_list))
            province = province_code[prefix[0:2]]

            for data in data_list:
                source = data['_source']
                record_id = data['_id']
                electr_supervise_no = source['electr_supervise_no']
                city = source['city']
                location = source['location']
                data_source_url = source['data_source_url']

                # Reset per record so a failed lookup can never inherit the
                # previous record's coordinates or district.
                bd_lat, bd_lon = float(0), float(0)
                tdt_lat, tdt_lon = float(0), float(0)
                district = None
                right_city = city_name
                flag = 0

                if city == city_name:
                    flag = 1
                    try:
                        address = city + location
                        bd_lat, bd_lon = getGeoPoints(address)
                        tdt_lat, tdt_lon = tiandituPoint(address)
                        district = getAddressInfo(bd_lat, bd_lon)[1]
                    except Exception as e:
                        # Best effort: log and route the record to the fail
                        # file instead of emitting a half-filled success row.
                        logger.debug(e)
                        flag = 0
                    if ((bd_lat == 0 and bd_lon == 0)
                            or len(electr_supervise_no) <= 9):
                        flag = 0

                if flag == 1:
                    write_line = success_fmt % (
                        electr_supervise_no, record_id, province, city,
                        district, location, bd_lat, bd_lon, tdt_lat, tdt_lon,
                        flag)
                    f1.write(write_line + "\n")
                else:
                    # Column order follows headers2: location comes before
                    # data_source_url.
                    write_line = fail_fmt % (
                        electr_supervise_no, record_id, province, city,
                        right_city, location, data_source_url, bd_lat,
                        bd_lon, tdt_lat, tdt_lon, flag)
                    f2.write(write_line + "\n")

        success_citys.append(prefix)
        print('success-%s' % prefix)
예제 #4
0
def _build_es_action(hit, province, expected_city):
    """Build the bulk action that rewrites one Elasticsearch hit."""
    source = hit['_source']
    electr_supervise_no = source['electr_supervise_no']
    city = source['city']
    location = source['location']
    data_source_url = source['data_source_url']

    doc = {
        "electr_supervise_no": electr_supervise_no,
        "province": province,
        "location": location,
    }
    flag = 0
    if city == expected_city:
        address = city + location
        bd_lat, bd_lon = getGeoPoints(address)
        tdt_lat, tdt_lon = tiandituPoint(address)
        doc["city"] = city
        doc["district"] = getAddressInfo(bd_lat, bd_lon)[1]
        # The nested dicts must be created here; assigning into a missing
        # "geopoint" key would raise KeyError.
        doc["geopoint"] = {"bd_lat": bd_lat, "bd_lon": bd_lon}
        doc["geopoint_tdt"] = {"tdt_lat": tdt_lat, "tdt_lon": tdt_lon}
        geocoded = not (bd_lat == 0 and bd_lon == 0)
        if geocoded and len(electr_supervise_no) > 9:
            flag = 1
    if flag == 0:
        # Failed records keep their source URL.
        doc["data_source_url"] = data_source_url
    doc["flag"] = flag

    # Every document field lives under "_source": when "_source" is present
    # the bulk helper sends only that dict as the document body.
    # NOTE(review): the default bulk operation is "index", which replaces the
    # stored document -- confirm that dropping the other fields is intended.
    return {
        "_index": "land_transaction_1_cn",
        "_type": "transaction",
        "_id": hit['_id'],
        "_source": doc,
    }


def parse_es_data(f, i):
    """Re-geocode Elasticsearch records per city and bulk-write them back.

    For every entry of ``get_city2()`` whose 4-character key prefix starts
    with ``i``, queries ``land_transaction_1_cn`` for records whose
    ``electr_supervise_no`` starts with that prefix, builds one bulk action
    per hit and submits the batch with ``helpers.bulk``.  A prefix that
    returned data is processed only once.  Cities without any record are
    reported on stdout and appended to ``f``, one line per city.

    Args:
        f: Writable file object for the "no data" report.  It is closed
            before this function returns, even on error.
        i: Leading character that a prefix must start with to be processed.
    """
    try:
        city_dict = get_city2()
        print('cities- %d' % len(city_dict))
        ids = set()
        for key, city_name in city_dict.items():
            prefix = key[0:4]
            print(u'%s- %s' % (city_name, prefix))
            if prefix in ids or prefix[:1] != i:
                continue

            sql = '''{"query":{"bool":{"must":[{"prefix":{"electr_supervise_no":"%s"}}],"must_not":[],"should":[]}},"from":0,"size":10000,"sort":[],"aggs":{}}''' % prefix
            results = es.search("land_transaction_1_cn", "transaction", sql)
            if not results['hits']['total'] > 0:
                print(u'%s没有数据' % city_name)
                f.write('\"%s没有数据\",\"city_id-%s\"' % (city_name, key)
                        + "\n")
                continue

            data_list = results['hits']['hits']
            print(len(data_list))
            province = province_code[prefix[0:2]]
            dics = [
                _build_es_action(data, province, city_name)
                for data in data_list
            ]

            result = helpers.bulk(es, actions=dics)
            print(result)
            # Remember the prefix so other keys sharing it are not reprocessed.
            ids.add(prefix)
    finally:
        f.close()