def process(self, item):
    """Fill in the normalised year, trim level and short title of *item*.

    ``item['car_title']`` is passed through ``segment`` with the rules built
    below.  For the model year, trim level and engine displacement the
    segmented ``content`` is used when it is non-empty; otherwise the value
    stored in ``car_datas`` for ``item['vehicle_code']`` is used.  The
    transmission in the short title always comes from the database row.

    Args:
        item: mapping that provides ``vehicle_code``, ``car_brand``,
            ``car_series`` and ``car_title``.  It is modified in place.

    Returns:
        The same *item*, with ``car_publish_logo``, ``car_publish_version``
        and ``standard_title`` set.
    """
    # Each rule is (field name, pattern, number).
    # NOTE(review): the trailing number is presumably the capture-group index
    # consumed by segment() -- confirm against its implementation.
    seg_rule = [
        ('producted_year', u"(\d{2,4}).+[年款]?", 1),
        ('logo_zh', u"^.+[版型级]", 0),
        ('transmission_zh', u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|CVT|([AM]T)|(A[^T]+T)", 0),
        ('engine', u"(\d\.\d?)(?![\d|万])(L|l|T|t|升|CVT|TSI|TFSI)?", 1),
        ('imports_zh', u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0),
    ]
    vid = item['vehicle_code']
    # NOTE(review): brand and series are handed over in the pattern slot
    # without escaping -- confirm segment() copes with regex metacharacters.
    seg_rule.extend([
        ('car_brand', item['car_brand'], 0),
        ('car_series', item['car_series'], 0),
    ])
    sseg = segment(seg_rule, item['car_title'])

    cursor = conn.cursor()
    try:
        cursor.execute("select producted_year, logo_zh, engine, transmission_zh from car_datas where vehicle_code = ?", (vid, ))
        row = cursor.fetchone()
        # Read the column names before the cursor is closed.
        columns = [col[0] for col in cursor.description]
    finally:
        cursor.close()

    if row is None:
        # No reference row for this vehicle code: use an all-None row with the
        # same keys so every database fallback below is simply empty.
        rb = dict.fromkeys(columns)
    else:
        rb = dict(zip(columns, row))

    def normalize_year(match):
        """Expand a matched number to a 4-digit year, or drop it."""
        number = int(match.group())
        if number >= 1000:
            year = number
        elif number < 80:
            year = number + 2000
        else:
            year = number + 1900
        if 1985 < year < 2014:
            return str(year)
        # Numbers outside the accepted range are removed from the text.
        return ""

    # Model year
    car_publish_logo = sseg['producted_year']['content']
    if not car_publish_logo:
        car_publish_logo = rb['producted_year']
    if car_publish_logo:
        car_publish_logo = re.sub(r'\d+', normalize_year, car_publish_logo)

    # Trim level
    car_publish_version = sseg['logo_zh']['content']
    if not car_publish_version:
        car_publish_version = rb['logo_zh'] or ''

    # Engine displacement
    car_emission = sseg['engine']['content']
    if not car_emission:
        car_emission = rb['engine']

    # Short model description: the non-empty parts joined by single spaces.
    keywords = [
        item['car_brand'],
        item['car_series'],
        car_publish_logo,
        car_publish_version,
        rb['transmission_zh'],
        car_emission,
    ]
    standard_title = ' '.join([k for k in keywords if k])

    item['car_publish_logo'] = car_publish_logo
    item['car_publish_version'] = car_publish_version
    item['standard_title'] = standard_title
    return item
def process(self, item):
    """Fill in the normalised year, trim level and short title of *item*.

    ``item["car_title"]`` is passed through ``segment`` with the rules built
    below.  For the model year, trim level and engine displacement the
    segmented ``content`` is used when it is non-empty; otherwise the value
    stored in ``car_datas`` for ``item["vehicle_code"]`` is used.  The
    transmission in the short title always comes from the database row.

    Args:
        item: mapping that provides ``vehicle_code``, ``car_brand``,
            ``car_series`` and ``car_title``.  It is modified in place.

    Returns:
        The same *item*, with ``car_publish_logo``, ``car_publish_version``
        and ``standard_title`` set.
    """
    # Each rule is (field name, pattern, number).
    # NOTE(review): the trailing number is presumably the capture-group index
    # consumed by segment() -- confirm against its implementation.
    seg_rule = [
        ("producted_year", u"(\d{2,4}).+[年款]?", 1),
        ("logo_zh", u"^.+[版型级]", 0),
        ("transmission_zh", u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|CVT|([AM]T)|(A[^T]+T)", 0),
        ("engine", u"(\d\.\d?)(?![\d|万])(L|l|T|t|升|CVT|TSI|TFSI)?", 1),
        ("imports_zh", u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0),
    ]
    vid = item["vehicle_code"]
    # NOTE(review): brand and series are handed over in the pattern slot
    # without escaping -- confirm segment() copes with regex metacharacters.
    seg_rule.extend([("car_brand", item["car_brand"], 0), ("car_series", item["car_series"], 0)])
    sseg = segment(seg_rule, item["car_title"])

    cursor = conn.cursor()
    try:
        cursor.execute(
            "select producted_year, logo_zh, engine, transmission_zh from car_datas where vehicle_code = ?", (vid,)
        )
        rb_rs = cursor.fetchone()
        # Read the column names before the cursor is closed.
        column_names = [i[0] for i in cursor.description]
    finally:
        cursor.close()

    if rb_rs is None:
        # No reference row for this vehicle code: use an all-None row with the
        # same keys so every database fallback below is simply empty.
        rb = dict.fromkeys(column_names)
    else:
        rb = dict(zip(column_names, rb_rs))

    def repl(obj):
        """Expand a matched number to a 4-digit year, or drop it."""
        inum = int(obj.group())
        if inum < 1000:
            # Two/three-digit values: below 80 means 20xx, otherwise 19xx.
            y = inum + 2000 if inum < 80 else inum + 1900
        else:
            y = inum
        if 1985 < y < 2014:
            return str(y)
        # Numbers outside the accepted range are removed from the text.
        return ""

    # Model year
    car_publish_logo = sseg["producted_year"]["content"]
    if not car_publish_logo:
        car_publish_logo = rb["producted_year"]
    if car_publish_logo:
        car_publish_logo = re.sub(r"\d+", repl, car_publish_logo)

    # Trim level
    car_publish_version = sseg["logo_zh"]["content"]
    if not car_publish_version:
        car_publish_version = rb["logo_zh"] or ""

    # Engine displacement
    car_emission = sseg["engine"]["content"]
    if not car_emission:
        car_emission = rb["engine"]

    # Short model description: the non-empty parts joined by single spaces.
    keywords = [
        item["car_brand"],
        item["car_series"],
        car_publish_logo,
        car_publish_version,
        rb["transmission_zh"],
        car_emission,
    ]
    standard_title = " ".join([k for k in keywords if k])

    item["car_publish_logo"] = car_publish_logo
    item["car_publish_version"] = car_publish_version
    item["standard_title"] = standard_title
    return item
def simple_segment(self, title):
    """Extract fields from *title* by regular-expression matching.

    Runs ``segment`` with ``self.seg_rule`` and returns a dict that maps each
    result key to its ``"content"`` value.  Entries whose content is missing
    or falsy are left out.
    """
    extracted = {}
    for field, match in segment(self.seg_rule, title).items():
        content = match.get("content")
        if content:
            extracted[field] = content
    return extracted