def region_unit_data(self):
    """ Features can be of any type, including unicode and int.
    OPTIMIZE: add more features. """
    def load__region_unit_data():
        data = list(self.region_unit_data__func())
        assert len(data) > 0
        assert isinstance(data[0], dict)
        assert "id" in data[0]
        assert "name" in data[0]

        feature_to_unit_ids__dict = defaultdict(list)
        id_to_name__dict = dict()

        for line1 in process_notifier(data):
            id_to_name__dict[line1['id']] = line1['name']

            features = jieba_parse(line1['name'])  # TODO: strip special characters such as "-"

            source1_region = ru_regexp.separate_regiones(line1['name'])[0]
            for kv in itertools.chain(*self.get_region_lines(source1_region)):
                features.extend(kv.values())

            for feature1 in set(features):
                feature_to_unit_ids__dict[feature1].append(line1['id'])

        return [id_to_name__dict, dict(feature_to_unit_ids__dict)]

    return cpickle_cache(self.cache_dir + '/region_unit_data.cPickle', load__region_unit_data)
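# Every loader in this section is wrapped by cpickle_cache(path, loader), whose
# implementation is not shown here. A minimal sketch, assuming it simply returns the
# pickled object when the file exists and otherwise computes, stores, and returns it:
import os
import cPickle

def cpickle_cache(path, loader):
    """Return the object cached at `path`, computing it with `loader()` on a miss."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return cPickle.load(f)
    obj = loader()
    with open(path, 'wb') as f:
        cPickle.dump(obj, f, cPickle.HIGHEST_PROTOCOL)
    return obj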
def process(cls, d1, cache_dir):
    """ d1 is {item_id1: {"feature1": count1, "feature2": count2, ...}, ...} """
    def func():
        # 1. fetch all features
        uniq_keys = set()
        for item_id1, item1 in process_notifier(d1):
            for k1 in item1.iterkeys():
                uniq_keys.add(k1)
        uniq_keys = list(uniq_keys)

        # 2. feature1 => {doc1: count1, doc2: count2, ...}
        value_cache = defaultdict(dict)
        for item_id1, item1 in process_notifier(d1):
            for k1, c1 in item1.iteritems():
                value_cache[k1][item_id1] = c1

        # 3. calculate each feature's entropy
        entropy_cache = dict()
        total_len = len(d1)
        for k1 in process_notifier(uniq_keys):
            exist_values = value_cache[k1].values()
            # pad with zeros for the documents that do not contain this feature
            total_values = exist_values + [0] * (total_len - len(exist_values))
            entropy_cache[k1] = scipy_entropy(total_values)
        return entropy_cache

    return cpickle_cache(cache_dir + '/entropy.cPickle', func)
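# Illustration of step 3 above (not part of the original module): a feature spread
# evenly across documents gets high entropy, while a feature concentrated in a single
# document gets zero entropy. `scipy_entropy` is assumed to be scipy.stats.entropy.
from scipy.stats import entropy as scipy_entropy

print scipy_entropy([1, 1, 1, 1])   # ~1.386, i.e. log(4): uniform across 4 documents
print scipy_entropy([4, 0, 0, 0])   # 0.0: all occurrences in one document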
def nested_region_data(self):
    def load__nested_region_dict():
        # data format
        # [ {"name":"浙江", "code":31, "parent_code":1}, ... ]
        data = list(self.nested_region_data__func())
        assert len(data) > 0
        assert isinstance(data[0], dict)
        assert "name" in data[0]
        assert "code" in data[0]
        assert "parent_code" in data[0]

        print "load name_to_codes__dict ..."
        name_to_codes__dict = defaultdict(list)
        for d1 in process_notifier(data):
            name_to_codes__dict[ru_regexp.strip_regexp.sub("", d1['name'])].append(d1['code'])
        name_to_codes__dict = dict(name_to_codes__dict)

        print "load code_to_name__dict ..."
        code_to_name__dict = {d1['code']: d1['name'] for d1 in process_notifier(data)}

        print "load codes_relations ..."
        codes_relations = {d1['code']: d1['parent_code'] for d1 in process_notifier(data)}

        return [name_to_codes__dict, code_to_name__dict, codes_relations]

    name_to_codes__dict, code_to_name__dict, codes_relations = \
        cpickle_cache(self.cache_dir + '/nested_region_dict.cPickle', load__nested_region_dict)
    return [name_to_codes__dict, code_to_name__dict, codes_relations]
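# Illustrative helper (not in the original module) showing how the three dicts
# returned above can be combined: resolve a region name to its codes, then walk
# codes_relations upwards to collect the ancestor chain.
def region_ancestors(name, name_to_codes__dict, code_to_name__dict, codes_relations):
    """Yield one name chain per code registered for `name`, child first."""
    for code in name_to_codes__dict.get(name, []):
        chain, seen = [], set()
        while code in code_to_name__dict and code not in seen:
            seen.add(code)
            chain.append(code_to_name__dict[code])
            code = codes_relations.get(code)
        yield chain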
def tags_tree(self):
    """ load_features_with_weight """
    # modify tags_tree
    def func():
        if 'manual_kps' in self.opts:
            # Manually labeled data, e.g. 吕文星's short rule combinations for
            # middle / high school knowledge points.
            data = {kp1: Counter(features)
                    for kp1, features in ReadManualKps.process(self.opts['manual_kps']).iteritems()}
        else:
            # Machine-trained data.
            data = self.model.tags_tree.load__tag_to_words_count__dict(
                self.model, self.documents_with_features)

        print "load_words_with_weight ..."
        self.model.tags_tree.load_features_with_weight(data)

        self.model.tags_tree.classify = None
        return self.model.tags_tree

    o1 = cpickle_cache(self.model.pickle_path('tags_tree'), func)
    o1.classify = self  # Fix cPickle.UnpickleableError
    return o1
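# Whichever branch is taken above, `data` ends up shaped as {tag: Counter({feature: count})}.
# A hypothetical manual_kps entry and the structure it produces (the sample values are made up):
from collections import Counter

manual_kps_sample = {u"二次函数": [u"抛物线", u"顶点", u"抛物线"]}
data_sample = {kp1: Counter(features) for kp1, features in manual_kps_sample.iteritems()}
# => {u"二次函数": Counter({u"抛物线": 2, u"顶点": 1})}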
def all_item_ids(cls):
    def filtered_item_ids_by_has_tags():
        print "[load from original data]\n"
        cls.pull_data()
        # Filter out items that have no tags.
        return [item_id1 for item_id1, item1 in process_notifier(cls)
                if cls.tags_model__extract_tags(item1)]

    print "[load tags_items_ids]\n"
    return cpickle_cache(cls.pickle_path('tags_items_ids'), filtered_item_ids_by_has_tags)
def test_item_ids(cls):
    """ After the ModelCache data is ready, and before training and evaluation,
    test_item_ids should be selected first. """
    def func():
        print "[select test item ids]\n"
        all_item_ids = cls.all_item_ids()
        random.shuffle(all_item_ids)
        if cls.classify.max_train_data_count == 0:
            return []  # compact
        return all_item_ids[-cls.classify.max_train_data_count:]

    ids = cpickle_cache(cls.pickle_path('test_item_ids'), func)
    return set(ids)
def setup(self):
    """ The main data structure is total_tag_to_features__dict, used to rank the
    similarity between an item and the tags. """
    self.debug = False

    self.total_tag_to_features__dict = cpickle_cache(
        self.classify.cpath('text_engine'),
        self.cache__total_tag_to_features__dict)

    # remove features in stop_unicode_set
    for tag1, features_dict1 in self.total_tag_to_features__dict.iteritems():
        self.filter_by_stop_list(features_dict1)

    self.debug = True
def idf_cache(self):
    def func():
        feature_in_doc_to_count = defaultdict(int)
        for item_id1, features in process_notifier(self.documents):
            for feature1 in features.iterkeys():
                feature_in_doc_to_count[feature1] += 1

        idf_result = IdfResult()
        all_num = float(len(self.documents))
        for feature1, count1 in feature_in_doc_to_count.iteritems():
            idf_result[feature1] = math.log(all_num / count1)
        return idf_result

    return cpickle_cache(self.cache_dir + '/idf.cPickle', func)
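# Worked example of the IDF formula used above, idf(f) = log(N / df(f)); the numbers
# below are illustrative only. With N = 1000 documents, a feature occurring in 10 of
# them gets log(100) ~ 4.605, and a feature occurring in every document gets 0.
import math

print math.log(1000 / 10.0)    # ~4.605
print math.log(1000 / 1000.0)  # 0.0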
def recache(self):
    # Compatible with the shelve module, which generates three files with the
    # "dat", "dir" and "bak" suffixes.
    io_prefix = self.cache_basename + '.io.'
    io_regexp = io_prefix + '[0-9]*'
    cpu_prefix = self.cache_basename + '.cpu.'
    cpu_regexp = cpu_prefix + '[0-9]*'
    os.chdir(os.path.dirname(self.cache_basename))

    # A.1. Cache IO.
    def cache__io():
        self.datasource.reconnect_after_fork()
        pn("[%s cache__io] begin total ..." % self.cache_basename)

        def persistent(filename, current_items):
            cpickle_cache(filename, lambda: current_items)
            return []

        # A.1.1 If everything is already cached, do nothing.
        if (len(self.datasource) / self.chunk_size) + 1 == len(glob.glob(io_regexp)):
            pn("[%s cache__io] end total ..." % self.cache_basename)
            return False

        # A.1.2 Otherwise process everything again from scratch.
        current_items = []
        idx = 0
        for k1, v1 in self.datasource:
            current_items.append([k1, v1])
            if len(current_items) >= self.chunk_size:
                cache_path = io_prefix + unicode(idx)
                os.system("rm -f %s" % cache_path)
                current_items = persistent(cache_path, current_items)
                idx += self.chunk_size
        if current_items:
            persistent(io_prefix + unicode(idx), current_items)
        pn("[%s cache__io] end total ..." % self.cache_basename)
    multiprocessing.Process(target=cache__io).start()

    # A.2. Cache CPU results on top of the IO cache.
    def cache__cpu(cpu_offset):
        fq = FileQueue(self.scope_count, self.chunk_size, self.process_count, cpu_offset,
                       lambda chunk1: PickleFile(chunk1, io_prefix, cpu_prefix))
        while_step = 0
        while fq.has_todo():
            while_step += 1
            pn("[%s cache__cpu:%s] todo_list := %s, while_step := %s" %
               (self.cache_basename, cpu_offset, fq.todo_list, while_step))
            for f1 in fq.todo_list:
                if not f1.is_exists('io'):
                    continue
                if f1.is_exists('cpu'):
                    f1.done = True
                    continue
                try:
                    io_items = cpickle_cache(f1.io_name(), lambda: not_exist)
                    cpu_items = [[i1[0], self.item_func(i1[1])] for i1 in io_items]
                    cpickle_cache(f1.cpu_name(), lambda: cpu_items)
                    f1.done = True
                except:
                    # The IO process has not finished writing this file yet.
                    print "Maybe IO error happened ..."
                    continue
            time.sleep(1)
    for cpu_offset in xrange(self.process_count):
        multiprocessing.Process(target=cache__cpu, args=(cpu_offset,)).start()

    # B. Merge everything produced by the steps above.
    # Check if extraction from the original data is finished.
    acm = ActiveChildrenManagement()
    while acm.still():
        time.sleep(acm.seconds)

    def write(tmp_items):
        if self.output_lambda:
            self.output_lambda([i1[1] for i1 in tmp_items])
        else:
            for item_id, item1 in process_notifier(tmp_items):
                self.result[item_id] = item1
            self.result.sync()
        return []

    print "\n" * 5, "begin merge ..."
    tmp_items = []
    fs = sorted(glob.glob(cpu_regexp),
                key=lambda f1: int(f1.split("/")[-1].split(".")[-1]))
    for f1 in fs:
        chunk = cpickle_cache(f1, lambda: not_exist)
        tmp_items.extend(chunk)
        if len(tmp_items) >= self.merge_size:
            tmp_items = write(tmp_items)
    tmp_items = write(tmp_items)

    # Update the cached result length.
    self.result_len = self.output_len_lambda()
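# Chunk naming implied by recache() above (the paths and chunk_size are illustrative):
# the IO process keys each chunk file by the offset of its first item, e.g. with
# chunk_size = 10000 it writes /tmp/cache.io.0, /tmp/cache.io.10000, ..., and each CPU
# worker writes the matching /tmp/cache.cpu.<offset> after item_func has been applied.
# The merge step sorts by that numeric suffix, which is what the
# int(f1.split("/")[-1].split(".")[-1]) key reproduces:
fs = ["/tmp/cache.cpu.10000", "/tmp/cache.cpu.0", "/tmp/cache.cpu.20000"]
print sorted(fs, key=lambda f1: int(f1.split("/")[-1].split(".")[-1]))
# => ['/tmp/cache.cpu.0', '/tmp/cache.cpu.10000', '/tmp/cache.cpu.20000']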