def __init__(self, corpus, name):
    """Set up the on-disk layout and storage handles for one sample set.

    corpus -- object supplying ``samples_dir`` (parent directory of every
              sample set) and ``vocabulary`` (handed to TermSampleModel).
    name   -- name of this sample set; it is also the name of the
              sub-directory used under ``corpus.samples_dir``.
    """
    self.corpus, self.name = corpus, name
    self.N = 4
    self.MAX_FETS = 3000

    # <samples_dir>/<name> holds everything that belongs to this set.
    root_dir = "/".join((corpus.samples_dir, name))
    self.root_dir = root_dir
    if not path.isdir(root_dir):
        os.mkdir(root_dir)

    # Only the content database is opened here; the tfidf and meta
    # locations are merely recorded.
    self.tfidf_dir = root_dir + "/tfidf"
    self.meta_dir = root_dir + "/meta"
    self.content_dir = root_dir + "/content"
    self.db_content = leveldb.LevelDB(self.content_dir)

    self.tsm = TermSampleModel(root_dir, corpus.vocabulary)
class Samples():
    """A named sample set stored under ``corpus.samples_dir``.

    Sample content lives in a LevelDB database (``db_content``); the
    term/sample statistics are delegated to a ``TermSampleModel``
    (``tsm``).  Keys of ``db_content`` that start with ``"__"`` are not
    samples and are skipped by every scan in this class.

    All ``print`` calls pass exactly one pre-formatted argument so that
    the output is the same under Python 2 (print statement) and the
    source still parses under Python 3.
    """

    # ---------------- __init__() ----------------
    def __init__(self, corpus, name):
        """Create the sample-set directory if needed and open its storage.

        corpus -- object supplying ``samples_dir``, ``vocabulary`` and
                  ``categories``.
        name   -- name of this sample set (also its directory name).
        """
        self.corpus = corpus
        self.name = name
        self.N = 4
        self.MAX_FETS = 3000

        self.root_dir = corpus.samples_dir + "/" + self.name
        if not path.isdir(self.root_dir):
            os.mkdir(self.root_dir)

        # Only the content database is opened eagerly; the tfidf and meta
        # directories are just recorded.
        self.tfidf_dir = self.root_dir + "/tfidf"
        self.meta_dir = self.root_dir + "/meta"
        self.content_dir = self.root_dir + "/content"
        self.db_content = leveldb.LevelDB(self.content_dir)

        self.tsm = TermSampleModel(self.root_dir, self.corpus.vocabulary)

    def merge(self, other_samples):
        """Merge the term/sample model of ``other_samples`` into this one."""
        self.tsm.merge(other_samples.tsm)

    # ---------------- clear() ----------------
    def clear(self):
        """Placeholder; currently does nothing."""
        pass

    def close_db(self, db):
        """Drop this method's reference to ``db``.

        Rebinding the local name does not affect the caller's own
        reference, so this does not close anything by itself.
        """
        db = None

    # ---------------- _iter_sample_rows() ----------------
    def _iter_sample_rows(self, db):
        """Yield ``(row_id, value)`` for every row of ``db`` whose key does
        not start with ``"__"``."""
        for row in db.RangeIter():
            row_id = row[0]
            if row_id.startswith("__"):
                continue
            yield row_id, row[1]

    # ---------------- get_int_value_in_db() ----------------
    def get_int_value_in_db(self, db, key):
        """Return the integer stored under ``key`` in ``db``, or 0 when the
        key is missing."""
        try:
            str_value = db.Get(key)
        except KeyError:
            return 0
        return int(str_value)

    # ---------------- set_int_value_in_db() ----------------
    def set_int_value_in_db(self, db, key, value):
        """Store ``value`` under ``key`` in ``db`` as its ``str()`` form."""
        db.Put(key, str(value))

    # ---------------- get_total_samples() ----------------
    def get_total_samples(self):
        """Return the number of sample rows in ``db_content``."""
        return sum(1 for _ in self._iter_sample_rows(self.db_content))

    # ---------------- get_bad_samples() ----------------
    def get_bad_samples(self):
        """Split all samples by the state of their content.

        Returns ``(none_samples, empty_samples, normal_samples)``; each is
        a list of ``(sample_id, url)`` for samples whose content is None,
        has length zero, or is non-empty respectively.
        """
        none_samples = []
        empty_samples = []
        normal_samples = []

        for _, raw_meta in self._iter_sample_rows(self.db_content):
            (sample_id, _category, _date, _title, _key, url, msgext) = \
                decode_sample_meta(raw_meta)
            (_version, content, (_cat1, _cat2, _cat3)) = msgext

            if content is None:
                none_samples.append((sample_id, url))
            elif len(content) == 0:
                empty_samples.append((sample_id, url))
            else:
                normal_samples.append((sample_id, url))

        logging.debug(Logger.debug(
            "Get %d bad samples. None: %d Empty: %d Normal: %d"
            % (len(none_samples) + len(empty_samples) + len(normal_samples),
               len(none_samples), len(empty_samples), len(normal_samples))))

        return none_samples, empty_samples, normal_samples

    # ---------------- get_sample_meta() ----------------
    def get_sample_meta(self, sample_id):
        """Return the decoded record of ``sample_id``, or None if absent.

        The record is whatever ``decode_sample_meta`` returns; the other
        methods of this class unpack it as
        ``(sample_id, category, date, title, key, url, msgext)`` with
        ``msgext = (version, content, (cat1, cat2, cat3))``.
        """
        try:
            str_sample_meta = self.db_content.Get(str(sample_id))
        except KeyError:
            return None
        return decode_sample_meta(str_sample_meta)

    # ---------------- clone() ----------------
    def clone(self, samples_name, samples_list = None, terms_list = None):
        """Create a new sample set named ``samples_name`` whose model is a
        clone of this one, restricted by ``samples_list``/``terms_list``
        as interpreted by ``TermSampleModel.clone``."""
        samples = Samples(self.corpus, samples_name)
        samples.tsm = self.tsm.clone(samples_list, terms_list)
        return samples

    # ---------------- get_samples_list() ----------------
    def get_samples_list(self):
        """Return the entries of the corpus samples directory.

        The directory is owned by the corpus (``corpus.samples_dir``, the
        same attribute the constructor uses); this class never sets a
        ``samples_dir`` attribute of its own.
        """
        return os.listdir(self.corpus.samples_dir)

    # ---------------- get_categories() ----------------
    def get_categories(self):
        """Return the categories object owned by the corpus."""
        return self.corpus.categories

    # ---------------- import_samples() ----------------
    def import_samples(self, xls_file):
        """Clear the categories, import samples from ``xls_file``, then
        persist the categories and write the imported batch to
        ``db_content``."""
        categories = self.get_categories()
        categories.clear_categories()

        batch_content = import_samples_from_xls(self, categories, xls_file)

        categories.save_categories()
        self.db_content.Write(batch_content, sync=True)

    # ---------------- export_samples() ----------------
    def export_samples(self, xls_file):
        """Export this sample set to ``xls_file``."""
        export_samples_to_xls(self, xls_file)

    # ---------------- export_urls() ----------------
    def export_urls(self, xls_file):
        """Export the url lists produced by ``get_bad_samples`` to
        ``xls_file``."""
        none_samples, empty_samples, normal_samples = self.get_bad_samples()
        export_urls_to_xls(xls_file, none_samples, empty_samples, normal_samples)

    # ---------------- show() ----------------
    def show(self):
        """Placeholder; only logs that nothing is done."""
        logging.debug(Logger.debug("Do nothing in show()."))

    # ---------------- get_categories_useinfo() ----------------
    def get_categories_useinfo(self):
        """Count how many samples use each category id.

        Returns ``(categories_useinfo, unknown_categories)``:
        ``categories_useinfo`` has one entry (possibly 0) per id found in
        the three category levels; ``unknown_categories`` counts samples
        whose category id is in none of them.
        """
        categories = self.get_categories()

        # ``~`` yields the inverse view of each level; its keys are used
        # here as category ids.  NOTE(review): presumably a bidict-style
        # name<->id mapping -- confirm against the Categories class.
        categories_useinfo = {}
        for level in (categories.categories_1,
                      categories.categories_2,
                      categories.categories_3):
            for category_id in (~level):
                categories_useinfo[category_id] = 0

        unknown_categories = {}
        for _, raw_meta in self._iter_sample_rows(self.db_content):
            (_sample_id, category_id, _date, _title, _key, _url, msgext) = \
                decode_sample_meta(raw_meta)
            (_version, _content, (_cat1, _cat2, _cat3)) = msgext

            if category_id in categories_useinfo:
                categories_useinfo[category_id] += 1
            else:
                unknown_categories[category_id] = \
                    unknown_categories.get(category_id, 0) + 1

        return categories_useinfo, unknown_categories

    # ---------------- print_categories_useinfo() ----------------
    def print_categories_useinfo(self, categories_useinfo):
        """Print one line per category id with its name and sample count,
        in the order produced by ``sorted_dict``."""
        categories = self.get_categories()
        for (category_id, category_used) in sorted_dict(categories_useinfo):
            category_name = categories.get_category_name(category_id)
            print("%d - %s %d samples" % (category_id, category_name, category_used))

    # ---------------- query_categories() ----------------
    def query_categories(self, xls_file):
        """Export the per-category usage counts to ``xls_file`` and print
        them."""
        categories = self.get_categories()
        categories_useinfo, _unknown_categories = self.get_categories_useinfo()
        categories.export_categories_to_xls(categories_useinfo, xls_file)
        self.print_categories_useinfo(categories_useinfo)

    # ---------------- get_categories_1_weight_matrix() ----------------
    def get_categories_1_weight_matrix(self):
        """Build feature matrices from positive-degree term scores.

        For every level-1 category the positive-degree of each term is
        computed from its positive and unlabeled samples.  Returns
        ``(cfm, sfm)``: a CategoryFeatureMatrix holding the per-category
        term scores and a SampleFeatureMatrix holding, for every positive
        sample, its level-1 category and the scores of the terms it uses.
        """
        tsm = self.tsm
        cfm = CategoryFeatureMatrix()
        sfm = SampleFeatureMatrix()

        categories = self.get_categories()
        for category_name in categories.categories_1:
            category_id = categories.categories_1[category_name]
            positive_samples_list, unlabeled_samples_list = \
                tsm.get_samples_list_by_category_1(category_id)

            print("\n%s(%d) Positive Samples: %d Unlabeled Samples: %d"
                  % (category_name, category_id,
                     len(positive_samples_list), len(unlabeled_samples_list)))

            terms_positive_degree = get_terms_positive_degree_by_category(
                tsm, positive_samples_list, unlabeled_samples_list)

            # Each entry is (pd_word, speciality, popularity); only the
            # first component is used as the feature weight.
            features = {}
            for term_id in terms_positive_degree:
                (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
                features[term_id] = pd_word
            cfm.set_features(category_id, features)

            for sample_id in positive_samples_list:
                (sample_category, _sample_terms, term_map) = tsm.get_sample_row(sample_id)
                category_1_id = Categories.get_category_1_id(sample_category)
                sfm.set_sample_category(sample_id, category_1_id)
                for term_id in term_map:
                    if term_id in terms_positive_degree:
                        (pd_word, _speciality, _popularity) = terms_positive_degree[term_id]
                        sfm.add_sample_feature(sample_id, term_id, pd_word)

        return cfm, sfm

    # ---------------- show_category_keywords() ----------------
    # Using the binary-classification positive-degree algorithm, produce the
    # ranked keyword list of every category.
    def show_category_keywords(self, result_dir):
        """Write one keyword file per level-2 category into ``result_dir``.

        ``result_dir`` is created when missing; if that fails the error is
        logged and nothing is written.
        """
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return

        tsm = self.tsm
        categories = self.get_categories()
        for category_name in categories.categories_2:
            category_id = categories.categories_2[category_name]
            positive_samples_list, unlabeled_samples_list = \
                tsm.get_samples_list_by_category_2(category_id)

            print("%s(%d) Positive Samples: %d Unlabeled Samples: %d"
                  % (category_name, category_id,
                     len(positive_samples_list), len(unlabeled_samples_list)))

            terms_positive_degree = get_terms_positive_degree_by_category(
                tsm, positive_samples_list, unlabeled_samples_list)
            pd.save_terms_positive_degree(
                terms_positive_degree, self.corpus.vocabulary,
                "%s/keywords_%d_%s.txt" % (result_dir, category_id, category_name))

    # ---------------- show_keywords_matrix() ----------------
    def show_keywords_matrix(self):
        """Print, for every frequently used term, how its TF-IDF weight is
        distributed over categories, ordered by the standard deviation of
        that distribution (largest first)."""
        categories = self.get_categories()
        term_category_matrix = self._collect_term_category_matrix()
        self._score_term_category_matrix(
            term_category_matrix, len(categories.categories_2))
        self._print_term_category_matrix(term_category_matrix, categories)

    def _collect_term_category_matrix(self, min_term_used = 50):
        """Accumulate per-category TF-IDF weight and usage of every term.

        Terms used fewer than ``min_term_used`` times are ignored.
        Returns ``{term_id: (term_used, 0.0, category_info)}`` with
        ``category_info = {category_id: (term_weight,
        term_used_in_category, term_ratio)}``.
        """
        tsm = self.tsm
        sfm_tfidf = FeatureWeight.transform(tsm, FeatureWeight.TFIDF)

        term_category_matrix = {}
        for (term_id, term_info) in tsm.term_matrix_iterator():
            (_, (term_used, _term_samples, sample_map)) = term_info
            if term_used < min_term_used:
                continue

            category_info = {}
            if term_id in term_category_matrix:
                (_, _, category_info) = term_category_matrix[term_id]

            for sample_id in sample_map:
                term_used_in_sample = sample_map[sample_id]
                (category_id, _sample_terms, _term_map) = tsm.get_sample_row(sample_id)

                term_weight = 0.0
                term_used_in_category = 0
                term_ratio = 0.0
                if category_id in category_info:
                    (term_weight, term_used_in_category, term_ratio) = \
                        category_info[category_id]

                v = sfm_tfidf.get_sample_feature(sample_id, term_id)
                if v is None:
                    continue
                category_info[category_id] = (
                    term_weight + v,
                    term_used_in_category + term_used_in_sample,
                    term_ratio)

            # NOTE(review): stored once per term, after the sample loop;
            # the original layout was not recoverable, confirm placement.
            term_category_matrix[term_id] = (term_used, 0.0, category_info)

        return term_category_matrix

    def _score_term_category_matrix(self, term_category_matrix, categories_count):
        """Fill in, in place, each term's per-category weight ratio and the
        standard deviation of those ratios.

        Mean and variance are divided by ``categories_count`` (the number
        of level-2 categories), not by the number of categories the term
        actually occurs in.
        """
        for term_id in term_category_matrix:
            (term_used, _, category_info) = term_category_matrix[term_id]

            # Share of the term's total weight that falls in each category.
            term_weight_sum = 0.0
            for category_id in category_info:
                term_weight_sum += category_info[category_id][0]

            ratio_sum = 0.0
            for category_id in category_info:
                (term_weight, term_used_in_category, _) = category_info[category_id]
                # A term whose weights are all zero has no meaningful
                # distribution; report 0 instead of dividing by zero.
                if term_weight_sum:
                    term_ratio = term_weight / term_weight_sum
                else:
                    term_ratio = 0.0
                category_info[category_id] = (
                    term_weight, term_used_in_category, term_ratio)
                ratio_sum += term_ratio

            # Standard deviation of the ratios; 0.0 when no level-2
            # category exists (avoids a division by zero).
            standard_deviation = 0.0
            if categories_count:
                ratio_mean = ratio_sum / categories_count
                sum_0 = 0.0
                for category_id in category_info:
                    x = category_info[category_id][2] - ratio_mean
                    sum_0 += x * x
                standard_deviation = math.sqrt(sum_0 / categories_count)

            term_category_matrix[term_id] = (
                term_used, standard_deviation, category_info)

    def _print_term_category_matrix(self, term_category_matrix, categories):
        """Print the scored matrix, highest standard deviation first."""
        terms_by_sd = {}
        for term_id in term_category_matrix:
            (_, standard_deviation, _) = term_category_matrix[term_id]
            terms_by_sd[term_id] = standard_deviation

        terms_by_sd_list = sorted_dict_by_values(terms_by_sd, reverse = True)
        for rowidx, (term_id, standard_deviation) in enumerate(terms_by_sd_list):
            (term_used, _, category_info) = term_category_matrix[term_id]
            term_text = self.corpus.vocabulary.get_term_text(term_id)

            category_info_list = sorted_dict_by_values(category_info, reverse = True)
            parts = []
            for (category_id, (_term_weight, term_used_in_category, term_ratio)) in category_info_list:
                category_name = categories.get_category_name(category_id)
                parts.append(" <%s[%d]: %.2f%% (%d)> "
                             % (category_name, category_id,
                                term_ratio * 100, term_used_in_category))
            str_term_categories = u"".join(parts)

            print("--------------------------------")
            print("<%d/%d> %s(%d) sd:%.6f %d used. %s"
                  % (rowidx, len(terms_by_sd_list), term_text, term_id,
                     standard_deviation, term_used, str_term_categories))

    # ---------------- query_by_id() ----------------
    def query_by_id(self, sample_id):
        """Print everything known about ``sample_id``.

        First the stored record and the terms obtained by re-segmenting
        its content, then the term counts stored in the sample matrix
        database.  Terms used at most once are not listed.  A KeyError in
        either part is reported as "not found" for that part.
        """
        vocabulary = self.corpus.vocabulary

        try:
            sample_content = self.db_content.Get(str(sample_id))
            (_, category, date, title, key, url, msgext) = \
                decode_sample_meta(sample_content)
            (_version, content, (_cat1, _cat2, _cat3)) = msgext

            print("sample id: %d" % (sample_id))
            print("category: %d" % (category))
            print("key: %s" % (key))
            print("url: %s" % (url))
            print("date: %s" % (date))
            print("title: %s" % (title))
            print("---------------- content ----------------")
            print("%s" % (content))

            sample_terms, term_map = vocabulary.seg_content(content)
            print("sample_terms: %d terms_count: %d" % (sample_terms, len(term_map)))
            terms_list = sorted_dict_by_values(term_map, reverse=True)
            for (term_id, term_used_in_sample) in terms_list:
                if term_used_in_sample <= 1:
                    continue
                term_text = vocabulary.get_term_text(term_id)
                print("%s(%d): %d" % (term_text, term_id, term_used_in_sample))
        except KeyError:
            print("Sample %d not found in db_content." % (sample_id))

        db_sm = self.tsm.open_db_sm()
        try:
            str_sample_info = db_sm.Get(str(sample_id))
            (_category, _sample_terms, term_map) = msgpack.loads(str_sample_info)

            print("")
            print("---------------- keywords ----------------")
            print("")
            terms_list = sorted_dict_by_values(term_map, reverse = True)
            for (term_id, term_used_in_sample) in terms_list:
                if term_used_in_sample <= 1:
                    continue
                term_text = vocabulary.get_term_text(term_id)
                print("%s\t%d\t(id:%d)" % (term_text, term_used_in_sample, term_id))
        except KeyError:
            print("Sample %d not found in db_sm." % (sample_id))
        finally:
            self.tsm.close_db(db_sm)

    # ---------------- rebuild() ----------------
    def rebuild(self):
        """Rebuild the term/sample model from ``db_content``, then the
        category assignments."""
        self.tsm.rebuild(self.db_content)
        self.rebuild_categories()

    # ---------------- rebuild_categories() ----------------
    def rebuild_categories(self):
        """Recompute every sample's category id from its three category
        names and persist the result.

        Each record is rewritten with version ``"1"`` and the id returned
        by ``create_or_get_category_id``; the same id is pushed into the
        term/sample model.  The batch is written after the scan, then the
        sample matrix and the categories are saved and printed.
        """
        categories = self.get_categories()
        db_content = self.db_content

        batch_content = leveldb.WriteBatch()
        for _, raw_meta in self._iter_sample_rows(db_content):
            (sample_id, _category, date, title, key, url, msgext) = \
                decode_sample_meta(raw_meta)
            (_version, content, (cat1, cat2, cat3)) = msgext

            msgext = ("1", content, (cat1, cat2, cat3))
            category_id = categories.create_or_get_category_id(cat1, cat2, cat3)
            sample_data = (sample_id, category_id, date, title, key, url, msgext)
            batch_content.Put(str(sample_id), msgpack.dumps(sample_data))

            self.tsm.set_sample_category(sample_id, category_id)

        db_content.Write(batch_content, sync=True)

        self.tsm.save_sample_matrix(self.tsm.sm_matrix)

        categories.save_categories()
        categories.print_categories()

    # ---------------- load() ----------------
    def load(self):
        """Load the term/sample model."""
        self.tsm.load()

    # ---------------- save_tfidf_matrix() -----------------
    def save_tfidf_matrix(self, tm_tfidf):
        """Store every ``sample_id -> sample_info`` entry of ``tm_tfidf``
        in the tfidf database, msgpack-encoded.

        NOTE(review): ``open_db_tfidf`` is not defined in the visible part
        of this class -- confirm it exists elsewhere.
        """
        db_tfidf = self.open_db_tfidf()
        for sample_id in tm_tfidf:
            sample_info = tm_tfidf[sample_id]
            db_tfidf.Put(str(sample_id), msgpack.dumps(sample_info))
        self.close_db(db_tfidf)

    # ---------------- load_tfidf_matrix() ----------------
    def load_tfidf_matrix(self):
        """Read the tfidf database into a new TermMatrix and return it.

        Rows that decode to None are logged and skipped; progress is
        logged every 1000 loaded rows.

        NOTE(review): ``open_db_tfidf`` is not defined in the visible part
        of this class -- confirm it exists elsewhere.
        """
        db_tfidf = self.open_db_tfidf()
        tm_tfidf = TermMatrix()

        rowidx = 0
        for row_id, raw_info in self._iter_sample_rows(db_tfidf):
            sample_id = int(row_id)
            sample_info = msgpack.loads(raw_info)
            # The decoded row is the value to validate; the earlier code
            # tested an undefined name here and raised NameError.
            if sample_info is None:
                logging.warning(Logger.warn("term_map %d is None." % (rowidx)))
                continue

            tm_tfidf.matrix.append((sample_id, sample_info))

            if rowidx % 1000 == 0:
                logging.debug(Logger.debug("load_tfidf_matrix() %d" % (rowidx)))
            rowidx += 1

        self.close_db(db_tfidf)
        return tm_tfidf