def get_sample_meta(self, sample_id):
    """Look up one sample's metadata by id.

    Returns the decoded metadata tuple, or None when the id is not
    present in the db_content store.
    """
    key = str(sample_id)
    try:
        raw_meta = self.db_content.Get(key)
    except KeyError:
        # Unknown sample id: signal absence rather than raising.
        return None
    return decode_sample_meta(raw_meta)
def get_categories_useinfo(self):
    # Tally how many stored samples fall under each known category id.
    # Returns (categories_useinfo, unknown_categories): the first maps every
    # known category id -> sample count; the second maps category ids found in
    # the data but absent from the known set -> sample count.
    categories = self.get_categories()
    db_content = self.db_content
    categories_useinfo = {}
    # NOTE(review): `~` here presumably invokes a custom __invert__ on the
    # category collections (e.g. yielding their ids) — confirm against the
    # categories class; on a plain dict/list this would raise TypeError.
    for category_1 in (~categories.categories_1):
        categories_useinfo[category_1] = 0
    for category_2 in (~categories.categories_2):
        categories_useinfo[category_2] = 0
    for category_3 in (~categories.categories_3):
        categories_useinfo[category_3] = 0
    unknown_categories = {}
    rowidx = 0
    for i in db_content.RangeIter():
        row_id = i[0]
        # Keys prefixed with "__" are internal bookkeeping rows, not samples.
        if row_id.startswith("__"):
            continue
        (sample_id, category_id, date, title, key, url, msgext) = decode_sample_meta(i[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        if not category_id in categories_useinfo:
            if category_id in unknown_categories:
                unknown_categories[category_id] += 1
            else:
                unknown_categories[category_id] = 1
        else:
            categories_useinfo[category_id] += 1
        rowidx += 1
    return categories_useinfo, unknown_categories
def get_bad_samples(self):
    """Scan db_content and bucket every sample by content health.

    Returns three lists of (sample_id, url) tuples: samples whose content
    is None, samples whose content is empty, and samples with non-empty
    content.
    """
    none_samples = []
    empty_samples = []
    normal_samples = []
    for record in self.db_content.RangeIter():
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if record[0].startswith("__"):
            continue
        (sample_id, category, date, title, meta_key, url, msgext) = decode_sample_meta(record[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        entry = (sample_id, url)
        if content is None:
            none_samples.append(entry)
        elif not content:
            empty_samples.append(entry)
        else:
            normal_samples.append(entry)
    total = len(none_samples) + len(empty_samples) + len(normal_samples)
    logging.debug(Logger.debug("Get %d bad samples. None: %d Empty: %d Normal: %d" % (total, len(none_samples), len(empty_samples), len(normal_samples))))
    return none_samples, empty_samples, normal_samples
def refresh_content(self):
    """Re-fetch page content for samples whose stored content is None or empty.

    First pass: scan db_content and collect the rows needing a refresh.
    Second pass: download each URL and write the result back. A non-OK HTTP
    status stores "" as content; a request failure stores None, so the sample
    stays eligible for a later refresh attempt.
    """
    db_content = self.samples.db_content
    urls = []
    rowidx = 0
    for i in db_content.RangeIter():
        row_id = i[0]
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if row_id.startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(i[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        if content is None:
            logging.debug(Logger.debug("content is None: sample_id %d" % (sample_id)))
            urls.append((sample_id, category, date, title, key, url, cat1, cat2, cat3))
        elif len(content) == 0:
            logging.debug(Logger.debug("len(conntent) == 0: sample_id %d" % (sample_id)))
            urls.append((sample_id, category, date, title, key, url, cat1, cat2, cat3))
        if rowidx % 100 == 0:
            logging.debug(Logger.debug("refresh content - %d" % (rowidx)))
        rowidx += 1

    for (sample_id, category, date, title, key, url, cat1, cat2, cat3) in urls:
        logging.debug(Logger.debug("--------------------------------"))
        logging.debug(Logger.debug("sample_id: %d url:%s" % (sample_id, url)))
        try:
            rsp = requests.get(url)
            if rsp.ok:
                content = rsp.text
            else:
                content = ""
                logging.warn(Logger.warn("Get page failed. status: %d sample_id: %d url: %s" % (rsp.status_code, sample_id, url)))
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate instead of being recorded as a fetch failure.
            content = None
            logging.warn(Logger.warn("Connection failed. sample_id: %d url: %s" % (sample_id, url)))
        self._store_refreshed_content(db_content, sample_id, category, date, title, key, url, content, cat1, cat2, cat3)

def _store_refreshed_content(self, db_content, sample_id, category, date, title, key, url, content, cat1, cat2, cat3):
    # Helper: re-pack one sample record (schema version "1") and write it back.
    # Replaces three copy-pasted Put blocks from the original implementation.
    msgext = ("1", content, (cat1, cat2, cat3))
    sample_data = (sample_id, category, date, title, key, url, msgext)
    db_content.Put(str(sample_id), msgpack.dumps(sample_data))
def purge(self):
    """Remove broken and mis-classified samples from db_content.

    First deletes samples whose content is None or empty (as reported by
    get_bad_samples), then deletes samples whose first-level category is
    not in self.main_categories, logging a per-(cat1, cat2) breakdown.
    """
    samples = self.samples
    db_content = samples.db_content
    none_samples, empty_samples, _ = samples.get_bad_samples()

    purged_samples = [sample_id for (sample_id, url) in none_samples]
    logging.debug(Logger.debug("Purgging %d samples...." % (len(purged_samples))))
    total_samples = samples.get_total_samples()
    for sample_id in purged_samples:
        db_content.Delete(str(sample_id))
        logging.debug(Logger.debug("Purge None content sample %d" % (sample_id)))
    total_samples -= len(purged_samples)

    for (sample_id, url) in empty_samples:
        db_content.Delete(str(sample_id))
        logging.debug(Logger.debug("Purge empty content sample %d" % (sample_id)))
    total_samples -= len(empty_samples)
    logging.debug(Logger.debug("Purge Done. Remaining %d samples." % (total_samples)))

    # Second sweep: collect samples whose cat1 is outside the main set.
    invalid_class_samples = []
    invalid_categories = {}
    for record in db_content.RangeIter():
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if record[0].startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(record[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        if cat1 in self.main_categories:
            continue
        invalid_class_samples.append(sample_id)
        pair = (cat1, cat2)
        invalid_categories[pair] = invalid_categories.get(pair, 0) + 1

    for (cat1, cat2) in invalid_categories:
        logging.debug(Logger.debug("<I> <%s:%s::> %d" % (cat1, cat2, invalid_categories[(cat1, cat2)])))
    logging.debug(Logger.debug("Total invalid class samples %d in %d categories" % (len(invalid_class_samples), len(invalid_categories))))
    for sample_id in invalid_class_samples:
        db_content.Delete(str(sample_id))
    logging.debug(Logger.debug("Deleted %d invalid class samples." % (len(invalid_class_samples))))
def query_by_id(self, sample_id):
    # Print a diagnostic dump for one sample: its metadata and raw content
    # from db_content, the segmented term counts, then the per-sample term
    # map stored in the tsm's sample db (db_sm).
    try:
        sample_content = self.db_content.Get(str(sample_id))
        (_, category, date, title, key, url, msgext) = decode_sample_meta(sample_content)
        (version, content, (cat1, cat2, cat3)) = msgext
        print "sample id: %d" % (sample_id)
        print "category: %d" % (category)
        print "key: %s" % (key)
        print "url: %s" % (url)
        print "date: %s" % (date)
        print "title: %s" % (title)
        print "---------------- content ----------------"
        print "%s" % (content)
        # Segment content into terms; term_map maps term_id -> occurrence count.
        sample_terms, term_map = self.corpus.vocabulary.seg_content(content)
        print "sample_terms: %d terms_count: %d" % (sample_terms, len(term_map))
        #for term_id in term_map:
        terms_list = sorted_dict_by_values(term_map, reverse=True)
        for (term_id, term_used_in_sample) in terms_list:
            # Skip terms that occur only once in this sample.
            if term_used_in_sample <= 1:
                continue
            term_text = self.corpus.vocabulary.get_term_text(term_id)
            #sample_terms = term_map[term_id]
            print "%s(%d): %d" % (term_text, term_id, term_used_in_sample)
    except KeyError:
        print "Sample %d not found in db_content." % (sample_id)
    # Second lookup: the term-sample matrix's own record for this sample.
    db_sm = self.tsm.open_db_sm()
    try:
        str_sample_info = db_sm.Get(str(sample_id))
        (category, sample_terms, term_map) = msgpack.loads(str_sample_info)
        print ""
        print "---------------- keywords ----------------"
        print ""
        terms_list = sorted_dict_by_values(term_map, reverse = True)
        for (term_id, term_used_in_sample) in terms_list:
            if term_used_in_sample <= 1:
                continue
            term_text = self.corpus.vocabulary.get_term_text(term_id)
            print "%s\t%d\t(id:%d)" % (term_text, term_used_in_sample, term_id)
    except KeyError:
        print "Sample %d not found in db_sm." % (sample_id)
    finally:
        # Always release the db handle, even on lookup failure.
        self.tsm.close_db(db_sm)
def rebuild_categories(self):
    """Recompute every sample's category id from its (cat1, cat2, cat3) tags.

    Rewrites each db_content record with a freshly assigned category id
    (batched, synced write), mirrors the assignment into the tsm sample
    matrix, then persists and prints the rebuilt category table.
    """
    categories = self.get_categories()
    db_content = self.db_content
    batch = leveldb.WriteBatch()
    for record in db_content.RangeIter():
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if record[0].startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(record[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        # Normalize to schema version "1" and re-derive the category id.
        new_msgext = ("1", content, (cat1, cat2, cat3))
        category_id = categories.create_or_get_category_id(cat1, cat2, cat3)
        row = (sample_id, category_id, date, title, key, url, new_msgext)
        batch.Put(str(sample_id), msgpack.dumps(row))
        self.tsm.set_sample_category(sample_id, category_id)
    db_content.Write(batch, sync=True)
    self.tsm.save_sample_matrix(self.tsm.sm_matrix)
    categories.save_categories()
    categories.print_categories()
def export_samples_to_xls(samples, xls_file):
    """Dump every sample in samples.db_content to an .xls spreadsheet.

    One sample per row under a fixed header row; content is truncated to
    fit within xlwt's per-cell character limit.
    """
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet("negative opinions")
    headers = ('CATEGORY', 'DATE', 'CAT1', 'CAT2', 'TITLE', 'KEY', 'URL', 'CONTENT')
    for col, header in enumerate(headers):
        ws.write(0, col, header)
    style_date = xlwt.XFStyle()
    style_date.num_format_str = 'YYYY.MM.DD'
    rowidx = 0
    for record in samples.db_content.RangeIter():
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if record[0].startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(record[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        if content is None:
            content = ""
        # Keep the cell under xlwt's ~32K character cap.
        if len(content) >= 1024 * 32:
            content = content[:1024 * 32 - 1]
        row = rowidx + 1
        ws.write(row, 0, category)
        (y, m, d, h, mi, s) = date
        ws.write(row, 1, datetime(y, m, d, h, mi, s), style_date)
        ws.write(row, 2, cat1)
        ws.write(row, 3, cat2)
        ws.write(row, 4, title)
        ws.write(row, 5, key)
        ws.write(row, 6, url)
        ws.write(row, 7, content)
        if rowidx % 100 == 0:
            logging.debug(Logger.debug("[%d] %d %s" % (rowidx, sample_id, title)))
        rowidx += 1
    wb.save(xls_file)
def query_by_id(self, samples_positive, samples_unlabeled, sample_id):
    # Diagnostic dump of an unlabeled sample: prints its metadata and
    # segmented terms, then re-scores each stored term by positive degree
    # against the positive vs. unlabeled term-sample matrices.
    tsm_positive = samples_positive.tsm
    tsm_unlabeled = samples_unlabeled.tsm
    # Optional per-word boost weights (all entries currently disabled).
    sensitive_words = {
        ##u"立案":3.0,
        ##u"获刑":3.0,
        ##u"受贿":3.0,
        ##u"有期徒刑":3.0,
        ##u"宣判":3.0,
        ##u"审计":2.0,
        ##u"调查":2.0
    }
    sensitive_terms = self.transform_sensitive_terms(sensitive_words, self.vocabulary)
    try:
        sample_content = samples_unlabeled.db_content.Get(str(sample_id))
        #(_, category, date, title, key, url, content) = msgpack.loads(sample_content)
        (_, category, date, title, key, url, msgext) = decode_sample_meta(sample_content)
        (version, content, (cat1, cat2, cat3)) = msgext
        print "sample id: %d" % (sample_id)
        print "category: %d" % (category)
        print "key: %s" % (key)
        print "url: %s" % (url)
        print "date: %s" % (date)
        print "title: %s" % (title)
        print "---------------- content ----------------"
        #print "%s" % (content)
        # Segment content into terms; term_map maps term_id -> occurrence count.
        sample_terms, term_map = self.vocabulary.seg_content(content)
        print "sample_terms: %d terms_count: %d" % (sample_terms, len(term_map))
        #for term_id in term_map:
        terms_list = sorted_dict_by_values(term_map, reverse=True)
        for (term_id, term_used_in_sample) in terms_list:
            term_text = self.vocabulary.get_term_text(term_id)
            #term_used_in_sample = term_map[term_id]
            print "%s(%d): %d" % (term_text, term_id, term_used_in_sample)
    except KeyError:
        print "Sample %d not found in db_content." % (sample_id)
    # Second lookup: score each stored term for this sample.
    db_sm = samples_unlabeled.tsm.open_db_sm()
    try:
        str_sample_info = db_sm.Get(str(sample_id))
        (category, sample_terms, term_map) = msgpack.loads(str_sample_info)
        print ""
        print "---------------- keywords ----------------"
        print ""
        terms = {}
        for term_id in term_map:
            term_text = self.vocabulary.get_term_text(term_id)
            term_used = term_map[term_id]
            # Positive degree combines speciality and popularity of the term
            # across the positive and unlabeled matrices.
            (pd_word, speciality, popularity) = calculate_term_positive_degree(term_id, tsm_positive, tsm_unlabeled, sensitive_terms)
            terms[term_id] = (pd_word, speciality, popularity, term_used, term_text)
        terms_list = sorted_dict_by_values(terms, reverse = True)
        for (term_id, (pd_word, speciality, popularity, term_used, term_text)) in terms_list:
            print "%s\t%d\t[%.6f,%.6f,%.6f]\t(id:%d)" % (term_text, term_used, pd_word, speciality, popularity, term_id)
    except KeyError:
        print "Sample %d not found in db_sm." % (sample_id)
    # NOTE(review): no finally here — the db handle is not released if an
    # exception other than KeyError escapes; confirm whether that is intended.
    samples_unlabeled.tsm.close_db(db_sm)
def fix_categories(self):
    # Migrate legacy (cat1, cat2) category tags to the current taxonomy.
    # Every sample whose (cat1, cat2) pair appears in cat_map is rewritten
    # in db_content with the mapped (new_cat1, new_cat2); cat3 is preserved.
    samples = self.samples
    # Mapping: old (cat1, cat2) -> new (cat1, cat2). Keys/values are runtime
    # data and must stay exactly as-is (including the irregular entries).
    cat_map = {
        (u"触电伤害", u"") : (u"安全生产", u"触电伤害"),
        (u"其他", u"触电伤害") : (u"安全生产", u"触电伤害"),
        (u"其他", u"意外伤害") : (u"安全生产", u"触电伤害"),
        (u"其他(环境污染)", u"") : (u"安全生产", u"环境保护"),
        (u"其他(破坏电力设施)", u"") : (u"安全生产", u"外力破坏"),
        (u"其他", u"道路破坏") : (u"安全生产", u"外力破坏"),
        (u"其他", u"噪音污染") : (u"安全生产", u"环境保护"),
        (u"其他", u"环境污染") : (u"安全生产", u"环境保护"),
        (u"其他", u"外力破坏") : (u"安全生产", u"外力破坏"),
        (u"其他", u"破坏道路") : (u"安全生产", u"外力破坏"),
        (u"其他(交通安全)", u"") : (u"安全生产", u"交通安全"),
        (u"其他", u"肇事逃逸") : (u"安全生产", u"交通安全"),
        (u"其他", u"酒驾致伤") : (u"安全生产", u"交通安全"),
        (u"其他(树障清除)", u"") : (u"安全生产", u"隐患治理"),
        (u"其他", u"树障清理") : (u"安全生产", u"隐患治理"),
        (u"其他", u"树障清除") : (u"安全生产", u"隐患治理"),
        (u"雾霾", u"") : (u"安全生产", u"隐患治理"),
        (u"安全供电", u"") : (u"安全生产", u"违章作业"),
        (u"智能电网", u"") : (u"电网建设", u"智能电网"),
        (u"新能源并网", u"") : (u"电网建设", u"新能源并网"),
        (u"特高压", u"") : (u"经营管理", u"特高压"),
        (u"阶梯电价", u"") : (u"电力改革", u"电价调整"),
        (u"农网改造", u"") : (u"电力改革", u"农电改制"),
        (u"三集五大", u"") : (u"电力改革", u"三集五大"),
        (u"其他(电农体制改革", u"") : (u"电力改革", u"农电改制"),
        (u"农电改革", u"") : (u"电力改革", u"农电改制"),
        (u"电价调整", u"") : (u"电力改革", u"电价调整"),
        (u"工资福利", u"") : (u"人资管理", u"工资福利"),
        (u"人资劳务", u"") : (u"人资管理", u"人事劳务"),
        (u"人力资源", u"") : (u"人资管理", u""),
        (u"(其他)人资管理", u"") : (u"人资管理", u""),
        (u"人力资源", u"人事劳务") : (u"人资管理", u"人事劳务"),
        (u"会劳务", u"") : (u"人资管理", u"人事劳务"),
        (u"其他", u"劳动纪律") : (u"人资管理", u"劳动纪律"),
        (u"其他", u"打人致伤") : (u"人资管理", u"劳动纪律"),
        (u"同工同酬", u"") : (u"人资管理", u"同工同酬"),
        (u"作风建设", u"") : (u"党建作风", u""),
        (u"其他", u"作风建设") : (u"党建作风", u""),
        (u"其他", u"舆情宣传") : (u"党建作风", u"新闻宣传"),
        (u"其他", u"新闻宣传") : (u"党建作风", u"新闻宣传"),
        (u"作风建设", u"法律纠纷") : (u"党建作风", u""),
        (u"信访纠纷", u"") : (u"党建作风", u"腐败"),
        (u"腐败", u"") : (u"党建作风", u"腐败"),
        (u"其他", u"公车私用") : (u"党建作风", u"八项规定"),
        (u"腐 败", u"") : (u"党建作风", u"腐败"),
        (u"其他", u"借机敛财") : (u"党建作风", u"腐败"),
        (u"腐败", u"公车购置") : (u"党建作风", u"腐败"),
        (u"农网改造", u"违规收费") : (u"依法治企", u"违规收费"),
        (u"其他", u"强卖") : (u"依法治企", u"违规收费"),
        (u"电费电表", u"违规收费") : (u"依法治企", u"违规收费"),
        (u"其他", u"违规收费") : (u"依法治企", u"违规收费"),
        (u"其他(违规收费)", u"") : (u"依法治企", u"违规收费"),
        (u"其他(乱收费)", u"") : (u"依法治企", u"违规收费"),
        (u"其他", u"违规建房") : (u"依法治企", u"违规建房"),
        (u"其他", u"违规电器") : (u"依法治企", u"违规供电"),
        (u"其他", u"法律纠纷") : (u"依法治企", u"法律纠纷"),
        (u"其他(法律纠纷)", u"") : (u"依法治企", u"法律纠纷"),
        (u"相关利益方", u"") : (u"依法治企", u"审计业务"),
        (u"其他", u"财务审计") : (u"依法治企", u"审计业务"),
        (u"法律纠纷", u"") : (u"依法治企", u"法律纠纷"),
        (u"电动汽车", u"") : (u"业务拓展", u"电动汽车"),
        (u"国际业务", u"") : (u"业务拓展", u"国际业务"),
        (u"风电消纳", u"") : (u"业务拓展", u"产业"),
        (u"供电服务(三指定)", u"") :(u"供电服务", u""),
        (u"智能电表", u"") : (u"供电服务", u"电表"),
        (u"上海停电", u"") : (u"供电服务", u"停电"),
        (u"其他(意外停电)", u"") : (u"供电服务", u"停电"),
        (u"电价", u"") : (u"供电服务", u"电价"),
        (u"其他", u"窃电") : (u"供电服务", u"偷电行为"),
        (u"电费电表", u"") : (u"供电服务", u"电费"),
        (u"电表电费", u"") : (u"供电服务", u"电表"),
        (u"营销服务", u"业务投诉") : (u"供电服务", u"业务投诉"),
        (u"营销服务", u"停电") : (u"供电服务", u"停电"),
        (u"营销服务", u"电费") : (u"供电服务", u"电费"),
        (u"营销服务", u"电价") : (u"供电服务", u"电价"),
        (u"营销服务", u"电表") : (u"供电服务", u"电表"),
    }
    #bad_samples = []
    rowidx = 0
    for i in samples.db_content.RangeIter():
        row_id = i[0]
        # "__"-prefixed keys are internal bookkeeping rows, not samples.
        if row_id.startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(i[1])
        (version, content, (cat1, cat2, cat3)) = msgext
        #try:
        #(version, content, (cat1, cat2, cat3)) = msgext
        #except ValueError:
        #bad_samples.append(sample_id)
        #cat1 = cat1.decode('utf-8')
        #cat2 = cat2.decode('utf-8')
        #cat3 = cat3.decode('utf-8')
        #if cat1 == u"农电改革":
        #logging.debug(Logger.debug("<%s:%s:%s:>" % (cat1, cat2, cat3)))
        #if (cat1, cat2) in cat_map:
        #logging.debug(Logger.debug("Found <%s:%s::> in cat_map" % (cat1, cat2)))
        #else:
        #logging.debug(Logger.debug("Not found <%s:%s::> in cat_map" % (cat1, cat2)))
        #print cat2.__class__, (cat1, cat2) == (cat1, u""), (cat1, cat2) == (cat1, u"")
        # cat3 is never remapped, only carried through.
        new_cat3 = cat3
        #if cat2 == u"":
        #print "cat2 == NULL <%s:%s:%s:>" % (cat1, cat2, cat3)
        if (cat1, cat2) in cat_map:
            new_cat1, new_cat2 = cat_map[(cat1, cat2)]
            str_sample_meta = (sample_id, category, date, title, key, url, (version, content, (new_cat1, new_cat2, new_cat3)))
            samples.db_content.Put(str(sample_id), msgpack.dumps(str_sample_meta))
            logging.debug(Logger.debug("<%s:%s:%s:> -> <%s:%s:%s:>" % (cat1, cat2, cat3, new_cat1, new_cat2, new_cat3)))
        rowidx += 1