def load__tag_to_words_count__dict(self, model_cache, documents_with_features):
    print "load item_id_to_tags__dict ..."
    item_id_to_tags__dict = {
        item_id1: self.filter_valid_tags(model_cache.tags_model__extract_tags(item1))
        for item_id1, item1 in process_notifier(model_cache)
    }

    print "load tag_to_words_count__dict ..."
    tag_to_words_count__dict = defaultdict(lambda: defaultdict(lambda: 0))
    test_item_ids = model_cache.test_item_ids()
    for item_id1, words_count1 in process_notifier(documents_with_features):
        if item_id1 in test_item_ids:
            continue
        for tag1 in item_id_to_tags__dict[item_id1]:
            for word1, count1 in words_count1.iteritems():
                tag_to_words_count__dict[tag1][word1] += count1
    return tag_to_words_count__dict
def divided_into_two_parts(self):
    """ Duplicates are only a small minority, so the candidate set stores only the IDs that may be duplicated. """
    candidate_set = set([])
    uniq_set = set([])

    feature1_dict = dict(self.default_features.items() + self.custom_features.items())
    # GROUP BY does not support ranges, e.g. integer range queries
    group_by_columns = [f1 for f1 in feature1_dict if feature1_dict[f1] == str]

    if group_by_columns:
        table = self.features_tree
        group_by_query = [getattr(table, f1) for f1 in group_by_columns]
        group_concat = [peewee_fn.group_concat(table.item_id).alias('item_ids')]
        group_by_sql = table.select(*(group_concat)).group_by(*group_by_query)

        for i1 in process_notifier(group_by_sql):
            items_len = len(i1.item_ids)
            if items_len > 24:
                # several comma-joined ids in this group => potential duplicates
                candidate_set = candidate_set | set(i1.item_ids.split(","))
            elif items_len == 24:
                # exactly one ObjectId (24 hex characters) => unique
                uniq_set.add(i1.item_ids)
            else:
                raise Exception("item_ids is invalid")
    else:
        print feature1_dict, "has no string-typed features"

    return (list(candidate_set), list(uniq_set))
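# A toy illustration (not part of the original code) of the length test used in
# divided_into_two_parts above: a single MongoDB ObjectId hex string is 24
# characters long, so a group_concat result longer than 24 characters must hold
# several comma-joined ids, i.e. potential duplicates.
def _demo_object_id_length_check():
    single = "0123456789abcdef01234567"
    joined = ",".join([single, "89abcdef0123456789abcdef"])
    print len(single) == 24                    # True  -> unique group
    print len(joined) > 24, joined.split(",")  # True  -> candidate duplicate ids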
def mr_run(self):
    """ Overwrite BaseHadoopJobTask#run function. """
    # TODO maybe model cache
    map_kv_dict = defaultdict(list)

    inputs = self.input()
    if not isinstance(inputs, list):
        inputs = [inputs]
    for input_hdfs_1 in inputs:
        for line2 in TargetUtils.line_read(input_hdfs_1):
            for map_key_3, map_val_3 in self.mapper(line2):
                map_kv_dict[map_key_3].append(map_val_3)

    with self.output().open("w") as output1:
        fixed_chunk = list()
        for reduce_key_2 in process_notifier(map_kv_dict.keys()):
            reduce_vals_2 = map_kv_dict[reduce_key_2]
            for _, reduce_val_2 in self.reducer(reduce_key_2, reduce_vals_2):
                fixed_chunk.append(reduce_val_2)
                if len(fixed_chunk) % self.chunk_size == 0:
                    output1.write("\n".join(fixed_chunk) + "\n")
                    fixed_chunk = list()
            del map_kv_dict[reduce_key_2]
        if fixed_chunk:  # flush the remaining lines
            output1.write("\n".join(fixed_chunk) + "\n")
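# A minimal sketch (assumed names, not from the original code) of the
# mapper/reducer contract that mr_run above relies on: mapper(line) yields
# (key, value) pairs, and reducer(key, values) yields (key, serialized_line)
# pairs whose second element is written out. A word count is used as example.
from collections import defaultdict

def _demo_mapper(line):
    for word in line.split():
        yield word, 1

def _demo_reducer(key, values):
    yield key, "%s\t%d" % (key, sum(values))

def _demo_local_mapreduce(lines):
    # same group-by-key flow as mr_run, but purely in memory
    grouped = defaultdict(list)
    for line in lines:
        for k, v in _demo_mapper(line):
            grouped[k].append(v)
    return [out for k in grouped for _, out in _demo_reducer(k, grouped[k])]

# _demo_local_mapreduce(["a b a", "b c"]) => ["a\t2", "b\t2", "c\t1"] (order may vary)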
def write(tmp_items):
    if self.output_lambda:
        self.output_lambda([i1[1] for i1 in tmp_items])
    else:
        for item_id, item1 in process_notifier(tmp_items):
            self.result[item_id] = item1
        self.result.sync()
    return []
def filtered_item_ids_by_has_tags():
    print "[load from original data]\n"
    cls.pull_data()
    # filter out items that have no tags
    return [item_id1 for item_id1, item1 in process_notifier(cls)
            if cls.tags_model__extract_tags(item1)]
def load_features_with_weight(self, tag_to_features_count__dict):
    for tag1, features_counter1 in process_notifier(tag_to_features_count__dict):
        for node1 in self.name_to_nodes[tag1]:
            node1.features_weight = self.classify.calculate_features_weight(features_counter1)
            for f1 in node1.features_weight.keys():
                # temp Fix
                if f1 in self.classify.features_weight_machine.stop_words_set:
                    del node1.features_weight[f1]
            for feature1 in features_counter1:
                self.feature_to_nodes[feature1].add(node1)
    return self
def func():
    feature_in_doc_to_count = defaultdict(int)
    for item_id1, features in process_notifier(self.documents):
        for feature1 in features.iterkeys():
            feature_in_doc_to_count[feature1] += 1

    idf_result = IdfResult()
    all_num = float(len(self.documents))
    for feature1, count1 in feature_in_doc_to_count.iteritems():
        idf_result[feature1] = math.log(all_num / count1)
    return idf_result
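# A tiny hedged illustration (toy data, not from the original code) of the IDF
# formula used in func() above: idf(feature) = log(N / df), where N is the
# number of documents and df is the number of documents containing the feature.
def _demo_idf():
    import math
    documents = {"d1": {"cat": 2, "dog": 1}, "d2": {"cat": 1}, "d3": {"fish": 4}}
    df = {}
    for features in documents.values():
        for f1 in features:
            df[f1] = df.get(f1, 0) + 1
    n = float(len(documents))
    return {f1: math.log(n / count1) for f1, count1 in df.items()}

# _demo_idf() => {"cat": log(3/2) ~ 0.405, "dog": log(3) ~ 1.099, "fish": log(3) ~ 1.099}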
def evaluate_effect(self, default_guess_count=10):
    if not self.is_run_test:
        print "is_run_test=True is not set!"
        exit(0)

    test_items = self.model.test_items()
    print "evaluating", len(test_items), "items"
    self.tags_tree.inspect('features_weight')
    result = [[self.model.tags_model__extract_tags(item1), self.recommend_tags(item1)]
              for item1 in process_notifier(test_items)]

    def inspect_result(result, filter_item_ids=set([])):
        for idx1, two_parts in enumerate(result):
            print "item", idx1 + 1
            original_tags, recommend_data = two_parts
            # only skip items when an explicit filter is given
            if filter_item_ids and (recommend_data['item_id'] not in filter_item_ids):
                continue
            print "question id", recommend_data['item_id']
            print "question content", recommend_data['item_content']
            uprint(u"keywords => entropy", recommend_data['features_weight'])
            uprint(u"original tags:", original_tags)
            uprint(u"recommended tags:", recommend_data['recommend_tags'])
            uprint(u"recommendation details:", recommend_data['recommend_tags_detail'])
            print "\n" * 3
    inspect_result(result)

    evaluate_items = [{"object_id": recommend_data['item_id'],
                       "original_tags": original_tags,
                       "recommend_tags": [i1['name'] for i1 in recommend_data['recommend_tags']]}
                      for original_tags, recommend_data in result]

    # manual evaluation
    def ee(num=1):
        self.evaluate([{"object_id": recommend_data['item_id'],
                        "original_tags": original_tags,
                        "recommend_tags": [i1['name'] for i1 in recommend_data['recommend_tags'][0:num]]}
                       for original_tags, recommend_data in result])

    # --------------------------------------------
    # junior-high physics: distribution of how often each path of the knowledge-point
    # tree is referenced, e.g. [(0, 357), (1, 217), (2, 24), (3, 1)]
    self.tags_tree.distribution([i1['recommend_tags'] for i1 in evaluate_items])

    self.evaluate(evaluate_items)
    if ('eval_result' in evaluate_items[0]) and False:
        eval_items2 = filter(lambda i1: i1['eval_result'], evaluate_items)
        inspect_result(result, [i1['object_id'] for i1 in eval_items2])
        self.tags_tree.distribution([i1['original_tags'] for i1 in eval_items2])
        import pdb
        pdb.set_trace()
def load__nested_region_dict():
    # data format
    # [ {"name": "浙江", "code": 31, "parent_code": 1}, ... ]
    data = list(self.nested_region_data__func())
    assert len(data) > 0
    assert isinstance(data[0], dict)
    assert "name" in data[0]
    assert "code" in data[0]
    assert "parent_code" in data[0]

    print "load name_to_codes__dict ..."
    name_to_codes__dict = defaultdict(list)
    for d1 in process_notifier(data):
        name_to_codes__dict[ru_regexp.strip_regexp.sub("", d1['name'])].append(d1['code'])
    name_to_codes__dict = dict(name_to_codes__dict)

    print "load code_to_name__dict ..."
    code_to_name__dict = {d1['code']: d1['name'] for d1 in process_notifier(data)}

    print "load codes_relations ..."
    codes_relations = {d1['code']: d1['parent_code'] for d1 in process_notifier(data)}

    return [name_to_codes__dict, code_to_name__dict, codes_relations]
def func():
    # 1. fetch all features
    uniq_keys = set([])
    for item_id1, item1 in process_notifier(d1):
        uniq_keys.update(item1.iterkeys())
    uniq_keys = list(uniq_keys)

    # 2. feature1 => {doc1: count1, doc2: count2, ...}
    value_cache = defaultdict(dict)
    for item_id1, item1 in process_notifier(d1):
        for k1, c1 in item1.iteritems():
            value_cache[k1][item_id1] = c1

    # 3. calculate each feature's entropy
    entropy_cache = dict()
    total_len = len(d1)
    for k1 in process_notifier(uniq_keys):
        exist_values = value_cache[k1].values()
        # pad with zeros for the documents that do not contain this feature
        total_values = exist_values + [0] * (total_len - len(exist_values))
        entropy_cache[k1] = scipy_entropy(total_values)
    return entropy_cache
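# A hedged sketch (toy numbers, not from the original code): the entropy in
# step 3 above is taken over a padded count vector -- the feature's counts in
# the documents that contain it, plus zeros for the documents that do not.
# `scipy_entropy` is assumed to be scipy.stats.entropy, consistent with its use
# in func() above.
def _demo_feature_entropy():
    from scipy.stats import entropy as scipy_entropy
    total_len = 5
    exist_values = [3, 1]  # the feature appears in 2 of 5 documents
    total_values = exist_values + [0] * (total_len - len(exist_values))
    return scipy_entropy(total_values)  # Shannon entropy of the normalized counts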
def run(self):
    self.run_before_hook()

    # 1. check is already done.
    if self.is_collection_exists():
        print "[info] %s already exists!" % (self.data_file_collection_model, )
        return False

    # 2. check report status collection is valid
    if self.report_status_collection_model.count() == 0:
        self.report_status_collection_model.insert({self.report_status_namespace: {}})
    assert self.report_status_collection_model.count() == 1, "there must be exactly one status record!"

    # 3. output json with err
    data_file1 = self.source_task_instance.data_file
    source1 = luigi.HDFS(data_file1)
    tmp_file1 = open(self.tmp_filepath, 'w')
    for line1 in process_notifier(TargetUtils.line_read(source1), u"[read lines] %s" % source1):
        tmp_file1.write(line1 + "\n")
    tmp_file1.close()

    # 4. upload to mongodb
    CommandUtils.execute(self.mongo_ensure_index)
    CommandUtils.execute(self.mongoimport_command)

    # 5. clean tmp
    CommandUtils.execute("rm -f %s" % self.tmp_filepath)

    # 6. update report status
    item1 = self.report_status_collection_model.find()[0]
    del item1['_id']
    item1[self.report_status_namespace][self.report_name] = {
        'collection_name': self.collection_name,
        'updated_at': arrow.now().datetime,
    }
    self.report_status_collection_model.find_and_modify(
        query={},
        update={"$set": item1},
        full_response=True
    )

    self.run_after_hook()
    return True
def load__region_unit_data():
    data = list(self.region_unit_data__func())
    assert len(data) > 0
    assert isinstance(data[0], dict)
    assert "id" in data[0]
    assert "name" in data[0]

    feature_to_unit_ids__dict = defaultdict(list)
    id_to_name__dict = dict()
    for line1 in process_notifier(data):
        id_to_name__dict[line1['id']] = line1['name']

        features = jieba_parse(line1['name'])  # TODO remove special characters, e.g. "-"
        source1_region = ru_regexp.separate_regiones(line1['name'])[0]
        for kv in itertools.chain(*self.get_region_lines(source1_region)):
            features.extend(kv.values())

        for feature1 in set(features):
            feature_to_unit_ids__dict[feature1].append(line1['id'])

    return [id_to_name__dict, dict(feature_to_unit_ids__dict)]
def cache__total_tag_to_features__dict(self):
    total_tag_to_features__dict = defaultdict(lambda: defaultdict(int))

    # calculate model_cache's freq distribution
    test_item_ids = self.model.test_item_ids()
    for item_id1, item1 in process_notifier(self.model):
        if item_id1 in test_item_ids:
            continue
        item1_features_dict = self.extract_features_weight(item1)
        for tag2 in self.model.tags_model__extract_tags(item1):
            dict3 = total_tag_to_features__dict[tag2]
            for feature4, count4 in item1_features_dict.iteritems():
                dict3[feature4] += count4

    # remove defaultdict's func
    for k1 in total_tag_to_features__dict.keys():
        total_tag_to_features__dict[k1] = dict(total_tag_to_features__dict[k1])
    return dict(total_tag_to_features__dict)
def connect(cls, original_model, **kwargs):
    # assert original_model's behavior
    process_notifier(original_model)

    # setup args
    # NOTE below code is copied to README.
    default_kwargs = {
        'cache_dir': os.getenv("ModelCacheDir"),  # the default.
        # available storage_types are ['memory', 'sqlite', 'shelve', 'redis']. default
        # is 'shelve', which is faster than sqlite
        'storage_type': 'shelve',
        # Sync unless current len(ModelCache) is less than original_model in `percentage`.
        # NOTE not used
        'percentage': 0.9999,
        'filter_lambda': lambda item1: False,
        'read_id_lambda': lambda item1: str(item1['_id']),
        'included_class': object,
    }
    for k1, v1 in kwargs.iteritems():
        if k1 in default_kwargs:
            default_kwargs[k1] = v1

    # validate storage
    assert default_kwargs['storage_type'] in valid_storages
    if (default_kwargs['cache_dir'] is None) and (default_kwargs['storage_type'] != "memory"):
        raise Exception(u"`cache_dir` should not be None when storage_type is not memory.")
    cache_dir = default_kwargs['cache_dir']
    del default_kwargs['cache_dir']

    # decorate class
    def _model_cache_decorator(decorated_class):
        # 1. included_class should not overwrite ModelCacheClass's important methods,
        #    including `__init__`, `init__before`, `init__after`.
        # 2. ensure decorated_class's methods will overwrite ModelCache's.
        for k1 in ['init__before', 'init__after']:
            if k1 in dir(default_kwargs['included_class']):
                setattr(ModelCacheClass, k1, getattr(default_kwargs['included_class'], k1))

        class _model_cache(decorated_class, ModelCacheClass, default_kwargs['included_class']):

            class OriginalClass():
                pass  # so we can setattr here.
            original = OriginalClass()
            for k1, v1 in default_kwargs.iteritems():
                setattr(original, k1, v1)
            del k1
            del v1
            original.model = original_model

            # Thx http://stackoverflow.com/questions/4932438/how-to-create-a-custom-string-representation-for-a-class-object/4932473#4932473
            class MetaClass(type):
                __metaclass__ = Forwardable
                _ = def_delegators('datadict', dict_attrs)

                def __repr__(self):
                    is_total_len_enough = len(self) > 5
                    while is_total_len_enough and (len(self.first_five_items) < 5):
                        for item_id1, item1 in self.iteritems():
                            self.first_five_items.append(item1)
                            if len(self.first_five_items) == 5:
                                break
                    dots = ", ......" if is_total_len_enough else ""
                    return (u"<%s has %i items:[%s%s]>" % (
                        self.__name__,
                        len(self),
                        ", ".join([str(item1.item_id) for item1 in self.first_five_items]),
                        dots,
                    )).encode("UTF-8")
            __metaclass__ = MetaClass

            @classmethod
            def pickle_path(cls, name):
                return cls.cache_dir + "/" + name + ".cPickle"

        _model_cache.__name__ = decorated_class.__name__
        _model_cache.__module__ = decorated_class.__module__  # so can pickle :)
        _model_cache.first_five_items = []

        _model_cache.cache_dir = os.path.join(cache_dir or u"", _model_cache.__name__)
        if default_kwargs['storage_type'] != 'memory':
            if not os.path.isdir(_model_cache.cache_dir):
                os.makedirs(_model_cache.cache_dir)

        _model_cache.dbpath = None
        if _model_cache.cache_dir:
            _model_cache.dbpath = os.path.join(_model_cache.cache_dir, _model_cache.__name__ + ".db")

        _model_cache.connect()
        return _model_cache
    return _model_cache_decorator
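# A hedged usage sketch (all names below -- ModelCache, some_mongo_collection,
# QuestionCache -- are assumed for illustration and not taken from the original
# code): connect() returns a class decorator that wires the original model into
# a cached class using the default_kwargs documented above.
#
# @ModelCache.connect(some_mongo_collection,
#                     cache_dir="/tmp/model_cache",
#                     storage_type="shelve",
#                     read_id_lambda=lambda item1: str(item1["_id"]))
# class QuestionCache(object):
#     pass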