def do_guess_normals(self, normals, top=1):
    """Guess the most relevant normal (non-place) tags.

    Seeds a weight dict from the top candidates plus all of their
    ancestors, then keeps only the entries that are (or directly relate
    to) the single highest-weighted normal, and returns the final ranking.

    :param normals: list of (normal_item, weight) tuples.
    :param top: number of candidates to seed and results to return.
    :return: list of (normal, weight) tuples, at most ``top`` long.
    """
    if not normals:
        return []
    smart_print(normals, "in guess normals")
    available = normals[0:top]
    normal_tags_dict = {}
    dict_from_items(normal_tags_dict, available)
    # Pull in every ancestor of the candidates so a shared parent can
    # accumulate weight from multiple children.
    parents_list = []
    for normal_item, value in available:
        parents_list.extend(
            self.memory_normal_items.normal_all_parents(normal_item.slug, value))
    smart_print(parents_list, "parent_list")
    dict_from_items(normal_tags_dict, parents_list)
    top_normals = rank_dict(normal_tags_dict, top=1)
    # Drop every entry unrelated to the top normal.  Snapshot the keys
    # first: deleting from a dict while iterating its items() view
    # raises RuntimeError on Python 3.
    for key in list(normal_tags_dict.keys()):
        related = any(
            top_normal == key or NormalItemEntity.direct_relation(key, top_normal)
            for top_normal, _ in top_normals)
        if not related:
            del normal_tags_dict[key]
    return rank_dict(normal_tags_dict, top=top)
def normal_update(self, **kwargs):
    """Update a Normal record from keyword arguments.

    A non-empty ``slug`` key is required.  Any failure — missing slug or
    an exception raised by the update — is reported as
    ``{'success': False}`` instead of being propagated.
    """
    smart_print(kwargs, 'normal_update')
    failure = {'success': False}
    # Guard clause: nothing to update without a slug.
    if not kwargs.get('slug', ''):
        return failure
    try:
        Normal.cls_update(**kwargs)
    except Exception:
        return failure
    return {'success': True}
def clusters(self, tags, origin, top_n=1):
    """Cluster ranked tags into place guesses and other (normal) guesses.

    :param tags: list of (tag, weight) tuples to cluster.
    :param origin: full filtered tag dict (currently unused here).
    :param top_n: how many top guesses to keep per category.
    :return: (places, others) — two weight dicts keyed by guessed item.
    """
    # TODO: not yet using all segmented words, nor limiting the count.
    smart_print(tags, 'cluster tags')
    guessed_places, guessed_cities, guessed_normal = self.guess(tags, top_n)
    smart_print(guessed_places, "guessed places")
    # BUG FIX: the original printed guessed_places under the
    # "guessed cities" label; print the cities themselves.
    smart_print(guessed_cities, "guessed cities")
    smart_print(guessed_normal, "guessed normal")
    # Accumulate weights per key (a key may appear more than once).
    places = {}
    for key, value in guessed_places:
        places[key] = places.get(key, 0) + value
    others = {}
    for key, value in guessed_normal:
        others[key] = others.get(key, 0) + value
    return places, others
def place_all_parents(self, slug, value, increment=0.1):
    """Expand a place slug into its (places, cities, countries) ancestry.

    Looks up ``slug`` and, depending on its category, seeds one of the
    three buckets with ``(item, value)``; each level walked up the
    parent chain carries a weight raised by ``increment``.

    :param slug: slug of the place item to expand.
    :param value: base weight attached to the item itself.
    :param increment: weight boost added per ancestry level.
    :return: three lists of (item, weight) tuples —
        (places, cities, countries); all empty when the slug is unknown
        or not a place-like item.
    """
    smart_print(slug, "in place_all_parents")
    exists = self.exists(slug)
    if not exists:
        return [], [], []
    origin = self.get(slug)
    if origin.category == 'NORMAL':
        # Non-place items are out of scope for this expansion.
        return [], [], []
    countries = []
    cities = []
    places = []
    # Seed the bucket matching the item's own level.
    if origin.category == 'PLACE':
        places.append((origin, value))
    elif origin.category == 'AREA':
        # 'AREA' is treated as the city level.
        cities.append((origin, value))
    else:
        countries.append((origin, value))
    value += increment
    for place, _ in places:
        parent = self.place_parent(place)
        if not parent:
            continue
        cities.append((parent, value))
    value += increment
    # Note: ``cities`` may now also contain the parents appended above,
    # so their country-level parents are collected here as well.
    for city, _ in cities:
        parent = self.place_parent(city)
        if not parent:
            continue
        countries.append((parent, value))
    return places, cities, countries
def parse(self, words, weight=1, TF_IDF=True):
    """Segment ``words`` and accumulate a weight per keyword.

    :param words: text to segment; non-string input yields an empty dict.
    :param weight: base weight contributed by each keyword occurrence.
    :param TF_IDF: when True, each occurrence is scaled by the keyword's
        factor from ``self.keywords`` (default factor 1).
    :return: dict mapping keyword -> accumulated weight.
    """
    if not isinstance(words, basestring):
        # BUG FIX: was ``return []`` — every other path returns a dict,
        # so return an (equally falsy) empty dict for consistency.
        return {}
    results = []
    smart_print(words)
    # Raw strings for regex patterns: '\s' is an invalid string escape
    # (DeprecationWarning, and a SyntaxError on modern Python).
    words = re.sub(r'\s', ENGLISH_SEGMENT_SEPARATOR, words)
    smart_print(words)
    words = to_str(words)
    for token in self.seg.seg_txt(words):
        token = token.decode('utf-8')
        # NOTE(review): this collapses runs of the literal character 'Z'
        # into a space — presumably a segmenter join marker; confirm it
        # was not meant to be r'\s+'.
        token = re.sub(r'Z+', ' ', token).strip()
        if self.is_keyword(token):
            results.append(token)
    d = {}
    for r in results:
        step = weight * self.keywords.get(r, 1) if TF_IDF else weight
        d[r] = d.get(r, 0) + step
    return d
def do_guess_places(self, countries, cities, places, top=1):
    """Pick the most plausible places, constrained geographically.

    Ranks countries first, then keeps only the first top-ranked city
    related to the winning country, and only the places related to that
    city.

    :param countries: list of (country, weight) tuples.
    :param cities: list of (city, weight) tuples.
    :param places: list of (place, weight) tuples.
    :param top: how many city candidates to rank (also bounds places kept).
    :return: a pair (results, top_city) — see the branch comments below.
    """
    countries_dict = {}
    cities_dict = {}
    places_dict = {}
    dict_from_items(countries_dict, countries)
    dict_from_items(cities_dict, cities)
    dict_from_items(places_dict, places)
    smart_print(countries_dict, "countries_dict")
    smart_print(cities_dict, "cities_dict")
    smart_print(places_dict, "places_dict")
    top_country = rank_dict(countries_dict, top=1)
    available_countries = [country for country, _ in top_country]
    top_cities = rank_dict(cities_dict, top=top)
    top_city = []
    # Keep only the FIRST ranked city related to the top country
    # (the ``break`` stops after one match).
    for city, value in top_cities:
        relations = []
        for available_country in available_countries:
            relation = PlaceItemEntity.direct_relation(city, available_country)
            relations.append(relation)
        if any(relations):
            top_city.append((city, value))
            break
    available_cities = [city for city, _ in top_city]
    top_places = []
    # NOTE(review): iterates the raw ``places`` list rather than
    # places_dict, so duplicate places keep separate weights — confirm
    # that is intended.
    for place, value in places:
        relations = []
        for city in available_cities:
            relation = PlaceItemEntity.direct_relation(place, city)
            relations.append(relation)
        if any(relations):
            top_places.append((place, value))
    if not available_countries:
        # NOTE(review): returns the cities dict twice; possibly meant
        # (places_dict.items(), cities_dict.items()) — confirm.
        return cities_dict.items(), cities_dict.items()
    if not top_places and not top_city:
        # Nothing below country level matched: fall back to the country.
        return top_country, []
    else:
        result = []
        # NOTE(review): ``top - 1`` keeps NO places when top == 1 —
        # confirm this off-by-one is intentional.
        result.extend(top_places[: top-1])
        result.extend(top_city)
        return result, top_city
def update(self, **kwargs):
    """Re-insert a record: drop any existing copy, then add it back."""
    smart_print(kwargs, "before update")
    # Update is implemented as remove-then-add.
    for operation in (self.remove, self.add):
        operation(**kwargs)
    return {'success': True}
def guess(self, tags, top_n=1):
    """Split tags into geographic and normal candidates, then rank each.

    Every item attached to a tag is either a normal item (collected
    directly) or a place-like item (expanded into its full
    place/city/country ancestry).  Known tag parents also count as
    normal candidates.

    :param tags: list of (tag, weight) tuples.
    :param top_n: how many top results to keep per category.
    :return: (guessed_places, guessed_cities, guessed_normals).
    """
    countries, cities, places, normals = [], [], [], []
    smart_print(tags, "in guess tags")
    for tag, weight in tags:
        tag_items = tag.items
        smart_print(tag_items, "tag_items")
        for item in tag_items:
            if item.category == 'NORMAL':
                normals.append((item, weight))
                continue
            # Place-like item: expand into its ancestry chain.
            itm_places, itm_cities, itm_countries = \
                self.memory_place_items.place_all_parents(item.slug, weight)
            countries.extend(itm_countries)
            cities.extend(itm_cities)
            places.extend(itm_places)
        # Resolvable parents of the tag are normal candidates too.
        for parent_name in tag.parents:
            parent = self.memory_normal_items.get(parent_name)
            if parent:
                normals.append((parent, weight))
    smart_print(countries, "countries")
    smart_print(cities, 'cities')
    smart_print(places, 'places')
    smart_print(normals, 'normals')
    guessed_places, guessed_cities = \
        self.do_guess_places(countries, cities, places, top_n)
    smart_print(guessed_places, "guessed places")
    smart_print(guessed_cities, "guessed cities")
    guessed_normals = self.do_guess_normals(normals, top_n)
    smart_print(guessed_normals, "guessed normals")
    return guessed_places, guessed_cities, guessed_normals
def rank(self):
    """Run the full tagging pipeline and return the formatted result.

    Pipeline: segment each content item -> aggregate -> filter ->
    rank top N -> cluster into places/others -> format.
    """
    # Segment each content item with its weight,
    # e.g. {'content': '我爱北京', 'weight': 5}.
    tags_list = [
        self.parse(item['content'], item.get('weight', 1))
        for item in self.contents
    ]
    smart_print(tags_list, "分词结果")
    # Merge the per-content tag dicts, e.g. {'北京': 5}.
    tags_dict = self.aggregation(*tags_list)
    smart_print(tags_dict, "标签聚合")
    filtered_tags = self.filter(tags_dict)
    smart_print(filtered_tags, "标签filter")
    # Rank the aggregated tags and keep the top 10.
    top_n_tags_list = self.ranking(filtered_tags, top=10)
    smart_print(top_n_tags_list, "排名前N")
    # Derive the most authoritative places and other info from weights.
    places, others = self.clusters(top_n_tags_list, filtered_tags, top_n=TOP_N)
    smart_print(places, "地点")
    smart_print(others, "其他")
    # Return the formatted data.
    result = self.format(places, others)
    smart_print(result, "结果")
    return result