Exemplo n.º 1
0
class BrandSeriesAnalysis(object):
    """Validate a listing's brand, series and model, and extract brand/series
    candidates from the segmented title.

    ``item`` is the listing mapping. The methods below read its ``car_title``,
    ``car_brand``, ``car_series`` and ``car_type`` keys.
    """
    def __init__(self, item):
        self.item = item
        self.cars = CarSpecification()
        # Cache for the segmentation of item['car_title']; filled by ``seg``.
        self.seg_rs = None

    @property
    def seg(self):
        """Return the segmentation of the title, computing it on first access.

        The result is cached in ``self.seg_rs`` so that in-place edits made by
        ``verify_segment`` are seen by later accesses.
        """
        if self.seg_rs is None:
            # Imported lazily, and only when a segmentation is really needed.
            from lib.seg import split
            self.seg_rs = split(self.item['car_title'])
        return self.seg_rs

    def verify_segment(self):
        """Pick a brand and/or series out of the segmented title.

        Entries of ``seg['cb']`` are returned as ``brand`` and entries of
        ``seg['cf']`` as ``series``.

        Returns, in order of preference:
          * ``{'brand': ..., 'series': ...}`` for the first pair for which
            ``verify_brandseries`` holds,
          * ``{'series': ...}`` with the first series candidate,
          * ``{'brand': ...}`` with the first brand candidate,
          * ``{}`` when there is no candidate at all.
        """
        seg = self.seg
        # Special case: when u"别克" shows up among the series candidates it is
        # taken as the brand and removed from the series candidates.
        if u"别克" in seg["cf"]:
            seg["cb"] = {u"别克"}
            seg["cf"].remove(u"别克")

        for cb in seg['cb']:
            for cf in seg['cf']:
                if self.verify_brandseries(cb, cf):
                    return {'brand': cb, 'series': cf}

        # No consistent pair: fall back to the first series candidate ...
        for cf in seg['cf']:
            return {'series': cf}

        # ... and then to the first brand candidate.
        for cb in seg['cb']:
            return {'brand': cb}

        return {}

    def verify_brand(self):
        """Check the item's brand, after synonym translation.

        Returns whatever ``cars.has_brand`` returns for the translated brand.
        """
        brand = self.item["car_brand"]
        brand = self.cars.trans_synonyms_brand(brand)
        return self.cars.has_brand(brand)

    def verify_series(self):
        """Check the item's series, after synonym translation.

        Returns whatever ``cars.has_series`` returns for the translated series.
        """
        serie = self.item["car_series"]
        serie = self.cars.trans_synonyms_series(serie)
        return self.cars.has_series(serie)

    def verify_brandseries(self, b, s):
        """Return True when the catalogue maps series ``s`` to brand ``b``."""
        brand = self.cars.get_brand_by_series(s)
        return brand == b

    def verify_type(self, t):
        """Look up model ``t`` in the catalogue.

        ``t`` used to be ignored in favour of ``item['car_type']``. It is now
        honoured; the item's value is still used when ``t`` is None.
        Returns whatever ``cars.has_type`` returns.
        """
        if t is None:
            t = self.item["car_type"]
        return self.cars.has_type(t)

    def verify_emission(self, e):
        """Check engine displacement ``e``; returns ``cars.has_emission(e)``."""
        return self.cars.has_emission(e)
Exemplo n.º 2
0
class BrandSeriesAnalysis(object):
    """Validate a listing's brand, series and model, and extract brand/series
    candidates from the segmented title.

    ``item`` is the listing mapping. The methods below read its ``car_title``,
    ``car_brand``, ``car_series`` and ``car_type`` keys.
    """
    def __init__(self, item):
        self.item = item
        self.cars = CarSpecification()
        # Cache for the segmentation of item['car_title']; filled by ``seg``.
        self.seg_rs = None

    @property
    def seg(self):
        """Return the segmentation of the title, computing it on first access.

        The result is cached in ``self.seg_rs`` so that in-place edits made by
        ``verify_segment`` are seen by later accesses.
        """
        if self.seg_rs is None:
            # Imported lazily, and only when a segmentation is really needed.
            from lib.seg import split
            self.seg_rs = split(self.item['car_title'])
        return self.seg_rs

    def verify_segment(self):
        """Pick a brand and/or series out of the segmented title.

        Entries of ``seg['cb']`` are returned as ``brand`` and entries of
        ``seg['cf']`` as ``series``.

        Returns, in order of preference:
          * ``{'brand': ..., 'series': ...}`` for the first pair for which
            ``verify_brandseries`` holds,
          * ``{'series': ...}`` with the first series candidate,
          * ``{'brand': ...}`` with the first brand candidate,
          * ``{}`` when there is no candidate at all.
        """
        seg = self.seg
        # Special case: when u"别克" shows up among the series candidates it is
        # taken as the brand and removed from the series candidates.
        if u"别克" in seg["cf"]:
            seg["cb"] = {u"别克"}
            seg["cf"].remove(u"别克")

        for cb in seg['cb']:
            for cf in seg['cf']:
                if self.verify_brandseries(cb, cf):
                    return {'brand': cb, 'series': cf}

        # No consistent pair: fall back to the first series candidate ...
        for cf in seg['cf']:
            return {'series': cf}

        # ... and then to the first brand candidate.
        for cb in seg['cb']:
            return {'brand': cb}

        return {}

    def verify_brand(self):
        """Check the item's brand, after synonym translation.

        Returns whatever ``cars.has_brand`` returns for the translated brand.
        """
        brand = self.item["car_brand"]
        brand = self.cars.trans_synonyms_brand(brand)
        return self.cars.has_brand(brand)

    def verify_series(self):
        """Check the item's series, after synonym translation.

        Returns whatever ``cars.has_series`` returns for the translated series.
        """
        serie = self.item["car_series"]
        serie = self.cars.trans_synonyms_series(serie)
        return self.cars.has_series(serie)

    def verify_brandseries(self, b, s):
        """Return True when the catalogue maps series ``s`` to brand ``b``."""
        brand = self.cars.get_brand_by_series(s)
        return brand == b

    def verify_type(self, t):
        """Look up model ``t`` in the catalogue.

        ``t`` used to be ignored in favour of ``item['car_type']``. It is now
        honoured; the item's value is still used when ``t`` is None.
        Returns whatever ``cars.has_type`` returns.
        """
        if t is None:
            t = self.item["car_type"]
        return self.cars.has_type(t)

    def verify_emission(self, e):
        """Check engine displacement ``e``; returns ``cars.has_emission(e)``."""
        return self.cars.has_emission(e)
Exemplo n.º 3
0
    def __init__(self):
        """Create the catalogue handle and the extraction / scoring rule tables."""
        ProcesserBase.__init__(self)

        self.cars = CarSpecification()

        # Title extraction rules: (name, regex, number).
        # The third element is presumably the regex group to keep - TODO confirm
        # against the consumer of these rules.
        #
        # Backslashes are doubled because these cannot be raw literals (the
        # \uXXXX escapes must be expanded by the compiler); a lone "\d" or "\."
        # in a non-raw literal is an invalid escape sequence and triggers a
        # warning on recent Python versions. The resulting patterns are unchanged.
        self.seg_rule = [
            ("series_num_zh", u"(\\d{2,4}).+[年款]?", 1),
            ("logo_zh", u"[\u2E80-\u9FFF]+[版型级]", 0),
            # Transmission extraction rules, currently disabled:
            #("transmission_zh", u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|(双离合)", 0),
            #("transmission", u"([AM]T)|(A[^T]+T)|CVT|GSG", 0),
            # NOTE(review): inside [...] the "|" is a literal character, so the
            # look-ahead also rejects a following "|" - confirm this is intended.
            ("engine", u"(\\d\\.\\d?)(?![\\d|万])(L|l|T|t|升|TSI|FSI|TFSI)?", 1),
            ("imports_zh", u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0)]

        # Comparison rules handed to the similarity scorer. Each entry looks
        # like (listing field, catalogue field, weight, comparison kind) -
        # TODO confirm against the scorer.
        self.rule1 = [("car_type", "pattern_zh", 0.1, 'str'),
                      ("car_brand", "brand", 0.2, 'str'),
                      ("car_series", "series", 0.2, 'str'),
                      ("car_emission", "engine", 0.08, 'abs'),
                      ("car_transmission", "transmission_zh", 0.04, 'str'),
                      ("car_title", "logo_zh", 0.1, 'str'),
                      ("car_title", "pattern_zh", 0.13, 'str'),
                      ("car_title", "series_num_zh", 0.1, 'str'),
                      ("car_description", "pattern_zh", 0.023, 'str'),
                      ("car_birth", "producted_year", 0.03, 'str'),
                      ("purchase_price_refer", "indicative_price", 0.024, 'num'),]
Exemplo n.º 4
0
 def __init__(self, item):
     """Store the listing and set up the catalogue handle and segmentation cache."""
     # No segmentation result yet.
     self.seg_rs = None
     self.cars = CarSpecification()
     self.item = item
Exemplo n.º 5
0
 def __init__(self, item):
     """Store the listing and set up the catalogue handle and segmentation cache."""
     # No segmentation result yet.
     self.seg_rs = None
     self.cars = CarSpecification()
     self.item = item
Exemplo n.º 6
0
class Processer(ProcesserBase):
    """主要字段的规整化、涉及到分词提取"""
    seq = "p22"
    
    def __init__(self):
        """Create the catalogue handle and the extraction / scoring rule tables."""
        ProcesserBase.__init__(self)

        self.cars = CarSpecification()

        # Title extraction rules: (name, regex, number).
        # The third element is presumably the regex group to keep - TODO confirm
        # against segment(), which consumes these rules in simple_segment().
        #
        # Backslashes are doubled because these cannot be raw literals (the
        # \uXXXX escapes must be expanded by the compiler); a lone "\d" or "\."
        # in a non-raw literal is an invalid escape sequence and triggers a
        # warning on recent Python versions. The resulting patterns are unchanged.
        self.seg_rule = [
            ("series_num_zh", u"(\\d{2,4}).+[年款]?", 1),
            ("logo_zh", u"[\u2E80-\u9FFF]+[版型级]", 0),
            # Transmission extraction rules, currently disabled:
            #("transmission_zh", u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|(双离合)", 0),
            #("transmission", u"([AM]T)|(A[^T]+T)|CVT|GSG", 0),
            # NOTE(review): inside [...] the "|" is a literal character, so the
            # look-ahead also rejects a following "|" - confirm this is intended.
            ("engine", u"(\\d\\.\\d?)(?![\\d|万])(L|l|T|t|升|TSI|FSI|TFSI)?", 1),
            ("imports_zh", u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0)]

        # Comparison rules handed to DocumentCompare. Each entry looks like
        # (listing field, catalogue field, weight, comparison kind) -
        # TODO confirm against DocumentCompare.
        self.rule1 = [("car_type", "pattern_zh", 0.1, 'str'),
                      ("car_brand", "brand", 0.2, 'str'),
                      ("car_series", "series", 0.2, 'str'),
                      ("car_emission", "engine", 0.08, 'abs'),
                      ("car_transmission", "transmission_zh", 0.04, 'str'),
                      ("car_title", "logo_zh", 0.1, 'str'),
                      ("car_title", "pattern_zh", 0.13, 'str'),
                      ("car_title", "series_num_zh", 0.1, 'str'),
                      ("car_description", "pattern_zh", 0.023, 'str'),
                      ("car_birth", "producted_year", 0.03, 'str'),
                      ("purchase_price_refer", "indicative_price", 0.024, 'num'),]

    def _process(self, item, items, rule):
        """Compare ``item`` with ``items`` and return a copy of ``item`` updated
        from the most similar candidate.

        Returns None when the comparison yields no ``doc``.
        """
        comparer = DocumentCompare(rule)
        best = comparer.mostsimilar(item, items)
        matched_doc = best.get("doc")
        if not matched_doc:
            return None

        # Listing field <- catalogue field of the best candidate.
        field_map = (
            ("car_type", "pattern_zh"),
            ("car_brand", "brand"),
            ("car_series", "series"),
            ("vehicle_code", "vehicle_code"),
        )
        updated = item.copy()
        for target, source in field_map:
            updated[target] = matched_doc[source]
        updated["car_type_score"] = best["similarity"]
        self.logger.debug(repr(comparer.opcodes))
        return updated

    def simple_match(self, item, sseg, **kv):
        """Query candidate models and return the best-scoring match for ``item``.

        1. The catalogue is queried with the title keywords in ``sseg`` plus
           the extra conditions in ``kv`` (brand and/or series, when known).
        2. If that returns nothing, the query is retried with ``kv`` alone.
        3. The candidates are scored against ``item`` with ``self.rule1``.

        ``sseg`` is not modified.

        Returns the updated copy of ``item`` (with ``title_fetched`` set to the
        number of candidates), or None when nothing matched or scoring failed.
        """
        # Work on a copy: callers reuse ``sseg`` for later attempts, and the
        # brand/series of one attempt must not leak into the next query.
        results = dict(sseg)
        results.update(kv)
        datas = self.cars.get_car_data(**results)
        if not datas:
            datas = self.cars.get_car_data(**kv)
        if not datas:
            return None

        try:
            f_item = self._process(item, datas, self.rule1)
        except Exception as exc:
            # Matching stays best-effort, but the failure is no longer silent.
            self.logger.warning("simple_match failed for %r: %r" % (results, exc))
            return None

        if not f_item:
            return None
        f_item['title_fetched'] = len(datas)
        self.logger.debug(repr(results))
        return f_item

    def simple_segment(self, title):
        """Apply the ``seg_rule`` regexes to ``title``.

        Returns a dict mapping rule name to extracted content, keeping only the
        rules whose content is non-empty.
        """
        extracted = {}
        for name, hit in segment(self.seg_rule, title).items():
            if hit.get("content"):
                extracted[name] = hit.get("content")
        return extracted

    def process_final(self, item, **condition):
        """Fetch the vehicles matching ``condition`` (e.g. brand, displacement)
        and return the one most similar to ``item``, as produced by ``_process``.
        """
        candidates = self.cars.get_vehicles_by_condition(**condition)
        return self._process(item, candidates, self.rule1)

    def process(self, data):
        """Identify the catalogue vehicle for one listing.

        The incoming ``car_brand``, ``car_series`` and ``car_type`` values are
        kept under ``*_old`` keys, then the recognition cascade is run.

        Returns the recognised item, or None (after logging a warning) when no
        step produced a match.
        """
        data["car_brand_old"] = data['car_brand']
        data['car_series_old'] = data['car_series']
        data['car_type_old'] = data['car_type']

        ritem = self._recognize(data)
        if not ritem:
            # Nothing could be recognised: give up on this listing.
            self.logger.warning('(%s) Item ignore, brand not found![id]: %s [brand]: %s, [series]: %s, [title]: %s' % \
                                (data.get('domain', 'None'), data['id'], data['car_brand'], data['car_series'], data['car_title']))
            return None

        self.logger.info("Processed an item: %s, VID: %s, Score: %f, title fetched %d items." % \
                         (ritem["id"],
                          ritem["vehicle_code"],
                          ritem["car_type_score"],
                          ritem.get("title_fetched", 0)))

        # Sanity warnings: the recognised names normally appear in the title.
        if ritem['car_brand'] not in ritem['car_title']:
            self.logger.warning(u"@car_brand (%s) not in car_title (%s)!!!" % (ritem['car_brand'], ritem['car_title']))
        if ritem['car_series'] not in ritem['car_title']:
            self.logger.warning(u"@car_series (%s) not in car_title (%s)!!!" % (ritem['car_series'], ritem['car_title']))
        return ritem

    def _recognize(self, item):
        """Run the recognition cascade; return the updated item or None.

        Steps, first success wins:
          1. exact model lookup on ``car_type``;
          2. declared brand + series;
          3. brand + series taken from the segmented title;
          4. declared brand only;
          5. segmented brand only;
          6. title keywords only (at most 100 candidates).
        """
        bsa = BrandSeriesAnalysis(item)

        # 1. Compare car_type directly with the catalogue.
        if item['car_type']:
            car = bsa.verify_type(item['car_type'])
            if car:
                item["car_type"] = car[3]
                item["car_brand"] = car[1]
                item["car_series"] = car[2]
                item["vehicle_code"] = car[0]
                item["car_type_score"] = 1
                item['complete_step'] = u"车型完全匹配"
                return item

        # Validate the declared brand and series (synonyms included).
        new_brand = bsa.verify_brand()
        new_series = bsa.verify_series()
        if not new_brand and new_series:
            new_brand = self.cars.get_brand_by_series(new_series)

        # Keywords extracted from the title; explicit listing fields win.
        sseg = self.simple_segment(item['car_title'])
        if item.get("car_emission"):
            sseg["engine"] = item["car_emission"]
        if item.get("car_publish_logo"):
            sseg["series_num_zh"] = item["car_publish_logo"]
        item['seg_title'] = json.dumps(sseg)

        # Every simple_match call gets its own copy of sseg, so the
        # brand/series of one attempt cannot leak into a later query.

        # 2. Declared brand and series.
        if new_brand and new_series:
            smatched = self.simple_match(item, dict(sseg), brand=new_brand, series=new_series)
            if smatched:
                return self._apply_scores(smatched, 0.1, 0.4, u"缺省品牌和车系校验通过")

        # 3. Fall back to the word segmentation of the title.
        segs = bsa.verify_segment()
        seg_brand = segs.get("brand")
        seg_series = segs.get("series")
        if not seg_brand and seg_series:
            seg_brand = self.cars.get_brand_by_series(seg_series)
        if seg_brand and seg_series:
            smatched = self.simple_match(item, dict(sseg), **segs)
            if smatched:
                smatched['seg_brand'] = seg_brand
                smatched['seg_series'] = seg_series
                # Scores are only granted once the match succeeded, so a failed
                # attempt no longer inflates the scores of later steps.
                return self._apply_scores(smatched, 0.1, 0.4, u"分词品牌和车系校验通过")

        # 4. Declared brand only.
        if new_brand:
            smatched = self.simple_match(item, dict(sseg), brand=new_brand)
            if smatched:
                return self._apply_scores(smatched, 0.05, 0.4, u"缺省品牌校验通过")

        # 5. Segmented brand only.
        if seg_brand:
            smatched = self.simple_match(item, dict(sseg), brand=seg_brand)
            if smatched:
                return self._apply_scores(smatched, 0.05, 0.4, u"分词品牌校验通过")

        # 6. Last resort, brand and series still unknown: query using only the
        # keywords extracted from the title. For efficiency only result sets of
        # at most 100 records are scored.
        if len(sseg) > 2:
            datas = self.cars.get_car_data(**sseg)
            if datas and len(datas) <= 100:
                t_item = self._process(item, datas, self.rule1)
                # _process returns None when no candidate is similar enough.
                if t_item:
                    t_item['title_fetched'] = len(datas)
                    return self._apply_scores(t_item, 0, 0.1, u'标题关键字抽取后匹配')

        return None

    def _apply_scores(self, matched, score_1, score_2, step):
        """Store the step scores on ``matched`` and fold them into its total.

        The similarity found in ``car_type_score`` contributes 40% (stored as
        ``score_3``); ``score_1`` and ``score_2`` are added on top.
        Returns ``matched``.
        """
        weighted = matched['car_type_score']*40/100
        matched['score_1'] = score_1
        matched['score_2'] = score_2
        matched['score_3'] = weighted
        matched['car_type_score'] = (score_1 + score_2) + weighted
        matched['complete_step'] = step
        return matched