def match():
    """Fill in the normalized ``fj_*`` fields on ``crawler_collection`` records.

    Strategy (from the original note): check the match table first and only
    fall back to the matching interface when the table has no entry.

    Three passes are made over ``crawler_collection``:

    1. Every record is looked up in ``match_collection`` by name
       (``CJ_LPMC``). If that misses and the address (``CJ_ZL``) contains a
       house/lane number, it is looked up by the full address instead.
    2. Records whose ``fj_flag`` is still unset and whose ``CJ_FWYT`` is one
       of the listed housing types have each entry of ``name_list`` passed
       to ``match(...)``; the first exact match wins.
    3. Remaining records of those housing types that carry an ``address``
       field have that address passed to ``match(...)``.

    A successful match sets ``fj_city``, ``fj_region``, ``fj_name``,
    ``fj_flag`` (to 1) and ``update_time`` on the record and logs its ``_id``.
    """
    # NOTE(review): passes 2 and 3 call ``match(city=..., region=..., keyword=...)``
    # but this function is itself named ``match`` and takes no arguments.
    # Unless that name resolves to a different callable where this code
    # really runs (e.g. this was a method and ``match`` an imported helper),
    # those calls raise TypeError -- confirm against the full file.
    source = 'youda'
    city = '上海'
    housing_types = ('住宅', '综合社区', '别墅', '里弄房', '老公房')
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    house_number = re.compile(r'\d+(号|弄|支|支弄|单号|双号|甲号|乙号|丙号|丁号)', re.S | re.M)

    def mark_matched(record_id, fj_city, fj_region, fj_name):
        # Single place that writes the match result, so every pass stores the
        # same fields and the same (timezone-aware, UTC) timestamp.
        crawler_collection.find_one_and_update(
            {'_id': record_id},
            {'$set': {'fj_city': fj_city,
                      'fj_region': fj_region,
                      'fj_name': fj_name,
                      'fj_flag': 1,
                      'update_time': datetime.now(timezone.utc)}})
        log.info('更新数据 _id={}'.format(record_id))

    # Cursors opened with no_cursor_timeout=True are used as context managers
    # so they are closed even if a loop body raises.

    # Pass 1: match table, by name first and by address second.
    with crawler_collection.find(no_cursor_timeout=True) as records:
        for record in records:
            region = record['CJ_XQ']
            hit = match_collection.find_one({'source': source,
                                             'city': city,
                                             'region': region,
                                             'friendsName': record['CJ_LPMC']})
            if not hit:
                address = record['CJ_ZL']
                # Only addresses containing a house/lane number are looked up.
                if not house_number.search(address):
                    continue
                hit = match_collection.find_one({'source': source,
                                                 'city': city,
                                                 'region': region,
                                                 'friendsAddress': address})
                if not hit:
                    continue
            mark_matched(record['_id'], hit['city'], hit['region'], hit['fjName'])

    # Pass 2: matching interface, trying each candidate name in turn.
    with crawler_collection.find({'fj_flag': None}, no_cursor_timeout=True) as records:
        for record in records:
            if record['CJ_FWYT'] not in housing_types:
                continue
            region = record['CJ_XQ']
            for name in record['name_list']:
                result = match(city=city, region=region, keyword=name)
                if result and result['flag'] == '精确匹配':
                    mark_matched(record['_id'], result['mcity'],
                                 result['mregion'], result['mname'])
                    break  # first exact match wins

    # Pass 3: matching interface, by address.
    with crawler_collection.find({'fj_flag': None}, no_cursor_timeout=True) as records:
        for record in records:
            if record['CJ_FWYT'] not in housing_types:
                continue
            region = record['CJ_XQ']
            if 'address' not in record:
                continue
            result = match(city=city, region=region, keyword=record['address'])
            if result and result['flag'] == '精确匹配':
                mark_matched(record['_id'], result['mcity'],
                             result['mregion'], result['mname'])
Пример #2
0
def match_address_estate_type2(data):
    """Copy the matched address and ``estate_type2`` onto a lianjia record.

    ``data['district_name']`` is passed to ``match(...)`` together with the
    record's city and region. Nothing happens unless the result is flagged as
    an exact match. On an exact match:

    * the returned ``maddress`` is stored as ``address`` on the
      ``collection_lianjia`` document with the same ``_id``;
    * the document with the matched ``_id`` is read from ``fangjia.seaweed``
      on 192.168.0.136, and if it has an ``estate_type2`` field that value is
      copied onto the lianjia document as well.

    Args:
        data: mapping providing ``_id``, ``city``, ``region`` and
            ``district_name``.
    """
    _id = data['_id']

    result = match(city=data['city'],
                   region=data['region'],
                   keyword=data['district_name'])
    if not result or result['flag'] != '精确匹配':
        return

    address = result['maddress']
    collection_lianjia.find_one_and_update({'_id': _id},
                                           {'$set': {'address': address}})
    print('更新地址 _id={} address={}'.format(_id, address))

    match_id = result['_id']

    # A client is opened per call; try/finally guarantees it is closed even
    # when the lookup or the update below raises.
    m = MongoClient(host='192.168.0.136', port=27017)
    try:
        collection_seaweed = m['fangjia']['seaweed']
        seaweed_data = collection_seaweed.find_one({'_id': ObjectId(match_id)})
        if seaweed_data and 'estate_type2' in seaweed_data:
            estate_type2 = seaweed_data['estate_type2']
            collection_lianjia.find_one_and_update(
                {'_id': _id},
                {'$set': {'estate_type2': estate_type2}})
            print('更新地址 _id={} estate_type2={}'.format(_id, estate_type2))
    finally:
        m.close()
Пример #3
0
def match_data(i, m_address):
    """Try to match record ``i`` by address and store the result.

    ``m_address`` is passed to ``match(...)`` with the record's city and
    region. When the returned ``flag`` contains the exact-match marker, the
    matched city, region, name and id are written to the
    ``collection_delete_repeat`` document with the same ``_id`` and
    ``fj_flag`` is set to 1.

    Args:
        i: mapping providing ``_id``, ``city`` and ``region``.
        m_address: keyword handed to ``match(...)``.

    Returns:
        1 if the record was updated, otherwise 0.
    """
    result = match(city=i['city'], region=i['region'], keyword=m_address)

    # Nothing came back, or the match was not exact: leave the record alone.
    if not result or '精确匹配' not in result['flag']:
        return 0

    matched_fields = {
        'fj_city': result['mcity'],
        'fj_region': result['mregion'],
        'fj_name': result['mname'],
        'fj_id': result['_id'],
        'fj_flag': 1,
    }
    collection_delete_repeat.find_one_and_update({'_id': i['_id']},
                                                 {'$set': matched_fields})
    print('匹配一条数据')
    return 1
Пример #4
0
def start(i):
    """Match record ``i`` by its district name and store the result.

    The record's ``district_name`` is passed to ``match(...)`` with its city
    and region. When the returned ``flag`` contains the exact-match marker,
    the matched city, region, name and id are written to the
    ``collection_delete_repeat`` document with the same ``_id`` and
    ``fj_flag`` is set to 1.
    """
    result = match(city=i['city'],
                   region=i['region'],
                   keyword=i['district_name'])
    if result and '精确匹配' in result['flag']:
        matched_fields = {
            'fj_city': result['mcity'],
            'fj_region': result['mregion'],
            'fj_name': result['mname'],
            'fj_id': result['_id'],
            'fj_flag': 1,
        }
        collection_delete_repeat.find_one_and_update(
            {'_id': i['_id']}, {'$set': matched_fields})
        print('匹配一条数据')
    def add_fj_name():
        """Fill in the normalized ``fj_*`` fields on ``crawler_collection`` records.

        Strategy (from the original note): check the match table first and
        only fall back to the matching interface when the table has no entry.

        Pass 1 looks every record up in ``collection_match`` by name
        (``fullhousingname``); if that misses and ``housingaddressall``
        contains a house/lane number, it looks the record up by that address.

        Pass 2 handles records whose ``fj_flag`` is still unset and whose
        ``propertytype`` is one of the listed housing types. The characters
        shared by ``fullhousingname`` and ``newdiskname`` are used as the
        name keyword for ``match(...)``; if that returns nothing,
        ``houseaddress`` is tried instead. Only exact matches are stored.

        A successful match sets ``fj_city``, ``fj_region``, ``fj_name``,
        ``fj_flag`` (to 1) and ``update_time`` on the record.
        """
        source = 'res'
        city = '上海'
        housing_types = ('住宅', '综合社区', '别墅')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        house_number = re.compile(r'\d+(号|弄|支|支弄|单号|双号|甲号|乙号|丙号|丁号)',
                                  re.S | re.M)

        def mark_matched(record_id, fj_city, fj_region, fj_name):
            # Single place that writes the match result, so every branch
            # stores the same fields and a timezone-aware UTC timestamp.
            crawler_collection.find_one_and_update(
                {'_id': record_id},
                {'$set': {
                    'fj_city': fj_city,
                    'fj_region': fj_region,
                    'fj_name': fj_name,
                    'fj_flag': 1,
                    'update_time': datetime.now(timezone.utc),
                }})

        # Cursors opened with no_cursor_timeout=True are used as context
        # managers so they are closed even if a loop body raises.

        # Pass 1: match table, by name first and by address second.
        with crawler_collection.find(no_cursor_timeout=True) as records:
            for record in records:
                region = record['area']
                hit = collection_match.find_one({
                    'source': source,
                    'city': city,
                    'region': region,
                    'friendsName': record['fullhousingname'],
                })
                if not hit:
                    address = record['housingaddressall']
                    # Only addresses with a house/lane number are looked up.
                    if not house_number.search(address):
                        continue
                    hit = collection_match.find_one({
                        'source': source,
                        'city': city,
                        'region': region,
                        'friendsAddress': address,
                    })
                    if not hit:
                        continue
                mark_matched(record['_id'], hit['city'], hit['region'],
                             hit['fjName'])
                # NOTE(review): this prints the match-table document's _id,
                # not the updated record's _id -- confirm that is intended.
                print('更新数据 添加格式化城市区域小区名 _id={}'.format(hit['_id']))

        # Pass 2: matching interface for records that are still unmatched.
        with crawler_collection.find({'fj_flag': None},
                                     no_cursor_timeout=True) as records:
            for record in records:
                if record['propertytype'] not in housing_types:
                    continue
                region = record['area']
                # Use the characters the two name fields have in common
                # (each shared character once) as the estate name.
                shared = (Counter(record['fullhousingname'])
                          & Counter(record['newdiskname']))
                result = match(city=city, region=region,
                               keyword="".join(shared))
                # NOTE(review): the address fallback runs only when the name
                # lookup returned nothing; a non-exact name match skips it.
                # Preserved as-is -- confirm that is intended.
                if not result:
                    result = match(city=city,
                                   region=region,
                                   keyword=record['houseaddress'])
                if result and result['flag'] == '精确匹配':
                    mark_matched(record['_id'], result['mcity'],
                                 result['mregion'], result['mname'])
                    print('更新数据 _id={}'.format(record['_id']))
Пример #6
0
                username='******',
                password='******')
collection_res_2018 = m['deal_price']['res_second_2018_11']


# Backfill fj_* fields on records that have no fj_name yet: the characters
# shared by the two name fields are used as the keyword for match(...), and
# only exact matches are written back. Intermediate values are printed for
# inspection.
for i in collection_res_2018.find({"fj_name": None}):
    fullhousingname = i['fullhousingname']
    print(fullhousingname)
    newdiskname = i['newdiskname']
    print(newdiskname)

    # Multiset intersection keeps only characters present in both names.
    c = Counter(fullhousingname) & Counter(newdiskname)
    new_name = "".join(c)
    print(new_name)

    data = match(city='上海', region=i['area'], keyword=new_name)
    print(data)
    if data and data['flag'] == '精确匹配':
        collection_res_2018.find_one_and_update(
            {'_id': i['_id']},
            {'$set': {
                'fj_city': data['mcity'],
                'fj_region': data['mregion'],
                'fj_name': data['mname'],
                'fj_flag': 1,
                'update_time': datetime.utcnow(),
            }})
        print('更新一条数据 fj_name={}'.format(data['mname']))

# count = 0
# for i in collection_res_2018.find({'fj_flag': 1}):
#     if '地下' in i['houseaddress']:
#         collection_res_2018.find_one_and_update({'_id': i['_id']}, {'$set': {'floor': None}})