def for_pivot(label, pred, output):
    """Write an actual-vs-predicted tag count table to ``output``.

    For every sample, ``label[i][0]`` is paired with ``pred[i][0][0]`` and the
    occurrences of each (actual, predicted) pair are counted. The counts are
    written as tab-separated text, with both ids rendered through
    ``dbutil.get_tag_name``.

    :param label: per-sample label sequences; only element 0 of each is used
    :param pred: per-sample prediction sequences; ``pred[i][0][0]`` is taken
        as the predicted label
    :param output: path of the UTF-8 report file, overwritten if present
    """
    confusion = {}
    for idx, sample_labels in enumerate(label):
        actual, predicted = sample_labels[0], pred[idx][0][0]
        row = confusion.setdefault(actual, {})
        row[predicted] = row.get(predicted, 0) + 1

    with codecs.open(output, 'w', 'utf-8') as report:
        report.write('%-8s\t%-8s\t%s\n' % ('Actual', 'Predict', 'Count'))
        for actual, row in confusion.items():
            for predicted, hits in row.items():
                actual_name = dbutil.get_tag_name(db, actual)
                predicted_name = dbutil.get_tag_name(db, predicted)
                report.write('%-8s\t%-8s\t%d\n' %
                             (actual_name, predicted_name, hits))
def __init__(self):
    """Open the data connections, load the models and cache tag lookups.

    Reads the module-level ``word2vec_model`` and ``viptag_model_20171221``
    paths. Construction performs database and MongoDB reads and loads models
    from disk, and prints ``model inited`` once everything is ready.
    """
    # connections and text-processing helpers
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.feeder = Feeder()
    self.tagger = Tagger(itags=True)
    self.seg = Segmenter(tags=True)
    self.wfilter = word_filter.get_default_filter()

    # models
    self.w2v = Word2Vec.load(word2vec_model)
    self.trained_tag_clfs = self.__load_trained_clfs()
    self.vip_classifier = fasttext.load_model(viptag_model_20171221)

    # tag lookups cached from the database
    self.yellows = dbutil.get_yellow_tags(self.db)
    self.vip_tags = {}
    for sector_tag in dbutil.get_sectored_tags(self.db, 1):
        self.vip_tags[sector_tag.name] = sector_tag.id

    # vip tag name -> set of the names of its hyponym tags
    self.hyponym = {}
    for vip_name, vip_id in self.vip_tags.items():
        hyponym_ids = dbutil.get_hyponym_tags(self.db, vip_id)
        self.hyponym[vip_name] = {
            dbutil.get_tag_name(self.db, tid) for tid in hyponym_ids
        }

    # lower-cased names of the tags of types 11011 and 11013
    important_tags = dbutil.get_tags_by_type(self.db, [11011, 11013])
    self.importants = {t.name.lower() for t in important_tags}

    self.thesaurus = self.__load_tag_novelties()
    self.thesaurus_ids = self.__load_tag_novelties(tid=True)
    self.tag_types = self.__load_tag_types()
    self.trusted_sources = dicts.get_known_company_source()

    # source tag name -> list of replacement tag names (from MongoDB)
    self.replacements = {}
    for rule in self.mongo.keywords.replacement.find():
        substitutes = [
            dbutil.get_tag_name(self.db, rtid) for rtid in rule['replacement']
        ]
        self.replacements[dbutil.get_tag_name(self.db,
                                              rule['source'])] = substitutes

    # names of the tags of type 11001; ranked tags with these names are dropped
    junk_tags = dbutil.get_tags_by_type(self.db, typeset=[11001])
    self.junk_terms = {tag.name for tag in junk_tags}

    # tuning constants
    self.similarity_threshold = 0.4
    self.textrank_window_size = 2
    self.textrank_threshold = 0
    self.source_tag_default_weight = 2
    self.vip_lower = 0.3
    self.important_threshold = 0.2
    self.important_max_count = 5

    print('model inited')
def stat(output, *sources):
    """Write a per-tag count report for each source into ``output``.

    Each source is passed to ``_count``; the resulting mapping of tag id to
    count is rendered as a table (id, tag name, count) followed by the sum,
    maximum and minimum of the counts. Sections are headed by the last
    ``/``-separated component of the source string.

    :param output: path of the UTF-8 report file, overwritten if present
    :param sources: source identifiers handed to ``_count``
    """
    with codecs.open(output, 'w', 'utf-8') as report:
        for src in sources:
            count = _count(src)

            rows = []
            for tag_id, freq in count.items():
                rows.append('%-8s\t\t%-8s\t\t%-8d' %
                            (tag_id, dbutil.get_tag_name(db, tag_id), freq))

            freqs = count.values()
            totals = '\nSum\t:%-8d Max\t:%-8d Min\t:%-8d' % (
                sum(freqs), max(freqs), min(freqs))

            title = '%s:\n------\n' % (src.split('/')[-1])
            table = '\nTagId\t\t\tTag Name\t\tCount\n' + '\n'.join(rows)
            report.write(title + table + totals + '\n\n\n')
def dump_sectors():
    """Dump the three-level sector tag hierarchy to ``dumps/xiniu.tag``.

    Walks the tags returned by ``dbutil.get_sectored_tags(db, 1)``, follows
    relation ``54041`` twice, and writes one tab-separated line
    (level-1 name, level-2 name, level-3 name) per level-3 tag.

    The database connection is closed even if a lookup or a write fails.
    """
    db = dbcon.connect_torndb()
    try:
        with codecs.open('dumps/xiniu.tag', 'w', 'utf-8') as fo:
            for t1 in dbutil.get_sectored_tags(db, 1):
                for t2id in dbutil.get_tags_by_relation(db, t1.id, 54041):
                    # The level-2 name does not depend on the inner loop.
                    t2_name = dbutil.get_tag_name(db, t2id)
                    for t3id in dbutil.get_tags_by_relation(db, t2id, 54041):
                        fo.write('%s\t%s\t%s\n' %
                                 (t1.name, t2_name,
                                  dbutil.get_tag_name(db, t3id)))
    finally:
        db.close()
def dump():
    """Compare source tags with extracted vip tags and dump them to a file.

    Iterates the processed ``raw.qmp`` documents for the ``d/c3`` url. For
    each record with at least one tag present in the module-level ``mapping``
    and a matching ``source_company`` row, writes one tab-separated line to
    ``dumps/20180726``: the original source tags, the tag names produced by
    ``KeywordExtractor.extract_vip``, ``1``/``0`` for whether the mapped
    source tags overlap them, the company url and its brief.

    Records whose ``data``/``basic`` section or tag fields are missing or
    ``None`` are treated as having no tags and are skipped.
    """
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    ke = KeywordExtractor()
    raw = mongo.raw.qmp.find(
        {
            "url": "http://vip.api.qimingpian.com/d/c3",
            "processed": True
        }, {
            'postdata': 1,
            'data.basic': 1
        })
    # The context manager guarantees the dump is flushed and closed.
    with codecs.open('dumps/20180726', 'w', 'utf-8') as fo:
        for qmp in raw:
            basic = (qmp.get('data') or {}).get('basic') or {}
            tags = [basic.get('hangye1') or '', basic.get('hangye2') or '']
            tags.extend((basic.get('tags_match') or '').split('|'))
            source_tags = set(tag for tag in tags if tag.strip()) \
                & set(mapping.keys())
            if not source_tags:
                continue
            # Only hit the database once we know the record has usable tags.
            sc = db.get(
                'select companyId from source_company where source=13121 and sourceId=%s;',
                qmp['postdata']['id'])
            if not (sc and sc.companyId):
                continue
            tag_qmp = [mapping.get(tag) for tag in source_tags]
            tag_xiniu = [
                dbutil.get_tag_name(db, tid)
                for tid in ke.extract_vip(sc.companyId).keys()
            ]
            url = 'http://www.xiniudata.com/company/%s/overview' % dbutil.get_company_code(
                db, sc.companyId)
            company = db.get('select brief from company where id=%s;',
                             sc.companyId)
            desc = company.brief if company else None
            desc = desc.replace('\n', '') if desc else ''
            matched = 1 if set(tag_qmp) & set(tag_xiniu) else 0
            fo.write('%s\t%s\t%d\t%s\t%s\n' %
                     (','.join(source_tags), ','.join(tag_xiniu), matched,
                      url, desc))
def __load_trained_clfs(self):
    """Load the per-tag classifiers stored next to this module.

    Scans the ``models`` directory beside this file for ``<tag id>.model``
    files and loads each one with ``joblib``.

    Files whose stem is not a plain integer are ignored. The previous
    ``isinstance(tid, int)`` check was always false for the string returned by
    ``split``, so no classifier was ever loaded.

    :return: dict mapping tag name (via ``dbutil.get_tag_name``) to the
        loaded classifier
    """
    model_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'models')
    clfs = {}
    for model_file in os.listdir(model_dir):
        if not model_file.endswith('.model'):
            continue
        tid = model_file.split('.')[0]
        # The stem must be a numeric tag id to be resolvable to a tag name.
        if not tid.isdigit():
            continue
        clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(
            os.path.join(model_dir, model_file))
    return clfs
def predict(model, k=3, cid=None, raw_info=None):
    """Predict the top ``k`` tags for a company or for a raw text.

    The text is the company's description when ``cid`` is given, otherwise
    ``raw_info``. Newlines are stripped, then the text is segmented and
    filtered through ``nf`` before being classified.

    :param model: path of the fastText model to load
    :param k: number of predictions requested from the classifier
    :param cid: company id; takes precedence over ``raw_info``
    :param raw_info: text to classify when no company id is given
    :return: one ``<tag name>\\t<probability>`` line per prediction joined by
        newlines, or a fixed message when neither input is provided
    """
    clf = fasttext.load_model(model, encoding='utf-8')
    if not (cid or raw_info):
        return 'No company id or text found.'

    if cid:
        text = dbutil.get_company_info(db, cid).description
    else:
        text = raw_info
    tokens = nf.wfilter(nf.seg.cut4search(text.replace('\n', '')))
    predictions = clf.predict_proba([' '.join(tokens)], k=k)[0]

    lines = []
    for raw_label, prob in predictions:
        tag_name = dbutil.get_tag_name(db,
                                       raw_label.replace(u'__label__', u''))
        lines.append('%-8s\t%f' % (tag_name, prob))
    return '\n'.join(lines)
def __deduct_2nd(self, tags):
    """Infer intermediate tags that are missing from ``tags``.

    For each given tag of type ``11013``, looks at its hypernyms (third
    argument ``1``) that are themselves among the given tags. For each of
    those it collects the tags that are both a hyponym of that hypernym and a
    hypernym of the tag (third argument ``2`` in both lookups) and that are
    not already present.
    NOTE(review): the third argument looks like a hierarchy level - confirm
    against ``dbutil``.

    :param tags: dict keyed by tag name
    :return: dict mapping each inferred tag name to the fixed weight ``2.49``
    """
    resolved = [(dbutil.get_tag_id(self.db, name)[0], name)
                for name in tags.keys()]
    known_ids = set([pair[0] for pair in resolved])

    inferred = []
    for tid, name in resolved:
        if self.tag_types.get(name, 0) != 11013:
            continue
        first_level = dbutil.get_hypernym_tags(self.db, tid, 1)
        for parent in set(first_level) & known_ids:
            below_parent = set(dbutil.get_hyponym_tags(self.db, parent, 2))
            above_tag = set(dbutil.get_hypernym_tags(self.db, tid, 2))
            for middle in below_parent & above_tag:
                if middle not in known_ids:
                    inferred.append(middle)
    return {dbutil.get_tag_name(self.db, middle): 2.49 for middle in inferred}
def summary(label, pred, output):
    """Write per-tag precision and recall to ``output``.

    Counts, per tag, how often it appears in the labels (``Actual``), how
    often it is predicted (``Predict``), and how often a prediction is among
    the labels of the same sample. One row is written for every tag that
    occurs in the labels.

    A labelled tag that was never predicted, or never predicted correctly, is
    reported with zero counts and a precision of ``0.0`` instead of raising
    ``KeyError``.

    :param label: per-sample sequences of true labels
    :param pred: per-sample sequences of predicted labels
    :param output: path of the UTF-8 report file, overwritten if present
    """
    pt, pos, true = dict(), dict(), dict()
    for idx, sample_labels in enumerate(label):
        for l_ in sample_labels:
            true[l_] = true.get(l_, 0) + 1
        for p_ in pred[idx]:
            pos[p_] = pos.get(p_, 0) + 1
            if p_ in sample_labels:
                pt[p_] = pt.get(p_, 0) + 1
    with codecs.open(output, 'w', 'utf-8') as fo:
        fo.write('Tag\t\tPrecision\tRecall\t\tPredict\t\tActual\n')
        for k in true:
            hits, predicted = pt.get(k, 0), pos.get(k, 0)
            # No prediction for this tag: precision is reported as 0.
            precision = float(hits) / float(predicted) if predicted else 0.0
            recall = float(hits) / float(true[k])
            fo.write('%-8s\t%-8f\t%-8f\t%-8d\t%-8d\n' %
                     (dbutil.get_tag_name(db, k), precision, recall,
                      predicted, true[k]))
def get_investor_portfolio_companies(db, mongo, iid):
    """Summarise the portfolio of investor ``iid``.

    The portfolio is the set of companies returned by the funding query below
    (funding dates from 2013-01-01 to 2018-06-01).

    :param db: connection exposing ``query(sql, *params)``
    :param mongo: MongoDB handle exposing ``article.news.find``
    :param iid: investor id
    :return: dict with
        ``tags`` - up to 20 ``(tag name, share)`` pairs over tag types
        11012/11013, highest share first, shares rounded to 4 digits;
        ``news`` - number of news documents for the investor with
        ``processStatus`` 1 dated 2017-01-01 or later;
        ``location`` - dict of location to its share among the companies
    """
    portfolio_sql = (
        'select distinct company.id cid '
        'from company, funding, funding_investor_rel rel, corporate cp '
        'where rel.investorId=%s and funding.corporateId = company.corporateId '
        'and (company.active is null or company.active="Y") '
        'and company.corporateId=cp.id and (cp.active is null or cp.active="Y") '
        'and rel.fundingId=funding.id and (funding.active is null or funding.active="Y") '
        'and (rel.active is null or rel.active="Y") '
        'and funding.fundingDate>="2013-01-01" and funding.fundingDate<="2018-06-01" '
        'order by fundingDate asc;')
    cids = [row.cid for row in db.query(portfolio_sql, iid)]

    # tags: relative frequency of each tag across the portfolio
    tag_counts = {}
    for cid in cids:
        for info in dbutil.get_company_tags_info(db, cid, [11012, 11013]):
            tag_counts[info.tid] = tag_counts.get(info.tid, 0) + 1
    total = sum(tag_counts.values())
    shares = {}
    for tid, count in tag_counts.items():
        shares[dbutil.get_tag_name(db, tid)] = round(count * 1.0 / total, 4)
    top_tags = sorted(
        shares.items(), key=lambda item: item[1], reverse=True)[:20]

    # count of news
    news_filter = {
        'investorIds': iid,
        'processStatus': 1,
        'date': {
            '$gte': datetime(2017, 1, 1)
        }
    }
    news_count = len(list(mongo.article.news.find(news_filter)))

    # locations: share of companies per location
    places = [dbutil.get_company_location(db, cid)[1] for cid in cids]
    location_share = {}
    for place in set(places):
        location_share[place] = round(
            places.count(place) * 1.0 / len(places), 4)

    return {'tags': top_tags, 'news': news_count, 'location': location_share}
def predict(model, data_path=None, out_path=None, text=None):
    """Tag a list of texts, or the texts of a CSV file, with a fastText model.

    For each text the 10 best predictions are requested; a prediction is kept
    when its probability exceeds 5% of the summed probability of those
    predictions. Kept labels are turned into tag names and joined by spaces.

    :param model: path of the fastText model to load
    :param data_path: CSV file indexed by ``ID`` whose texts are read from the
        ``原文本`` column; used only when ``text`` is empty
    :param out_path: where the CSV with the added ``tag`` column is written
    :param text: iterable of texts; takes precedence over ``data_path``
    :return: list of tag strings when ``text`` is given, otherwise ``None``
    :raises ValueError: when neither ``text`` nor ``data_path`` is given
    """
    if not (text or data_path):
        print('Input at least one of valid text or data path')
        raise ValueError

    clf = fasttext.load_model(model, encoding='utf-8')
    df = None
    if text:
        content = [' '.join(seg.cut4search(piece)) for piece in text]
    else:
        df = pd.read_csv(data_path, index_col='ID', encoding='utf_8_sig')
        content = [' '.join(seg.cut4search(piece)) for piece in df[u'原文本']]

    tags = []
    for pred in clf.predict_proba(content, k=10):
        cutoff = 0.05 * sum(p[1] for p in pred)
        kept = [p[0] for p in pred if p[1] > cutoff]
        tags.append(' '.join(
            dbutil.get_tag_name(db, int(raw.replace(u'__label__', '')))
            for raw in kept))

    if text:
        return tags
    df['tag'] = tags
    df.to_csv(out_path, encoding='utf_8_sig')
def extract(self, cid, topn=15):
    """Extract weighted tags for company ``cid``.

    Ranks the candidate terms with the vector-rank graph, merges in the vip
    tags predicted for the company (keyed by tag name), then normalises the
    weights and applies the replacement rules. The important-tag and textrank
    merges are not applied here.

    :param cid: company id
    :param topn: upper bound on the number of ranked candidates considered
    :return: the merged and normalised result as returned by
        ``__normalize_replacement``
    """
    # prepare contents (the source tags are not used by this method)
    _, candidates, candidates_important = self.__prepare_tag_contents(cid)
    candidates_vips = self.extract_vip(cid)

    # generate results
    ranked = dict(
        self.__extract_vecrank(candidates, candidates_important,
                               candidates_vips, topn))
    vip_weights = {}
    for tid, weight in candidates_vips.items():
        vip_weights[dbutil.get_tag_name(self.db, tid)] = weight

    merged = self.merge(ranked, vip_weights)
    return self.__normalize_replacement(self.__normalize(merged))
def extract_from_text(self, text):
    """Extract weighted tags from free text.

    Combines three sources: important tags found by the tagger (weighted by
    the weight of the text they occur in), a textrank over all tagged terms,
    and the vip classifier predictions whose probability exceeds
    ``self.vip_lower``. The merged result is normalised, run through the
    replacement rules and, when fewer than three tags can be deduced by
    ``__deduct_2nd``, extended with those.

    :param text: dict mapping a piece of content to its weight
    :return: the merged and normalised result
    """
    # every tagged term, filtered, feeds the textrank
    candidates = []
    for content in text:
        candidates.extend([term[0] for term in self.tagger.tag(content)])
    candidates = self.wfilter(candidates)

    # important terms accumulate the weight of each content they appear in
    candidates_important = {}
    for content, weight in text.items():
        for term in self.tagger.tag(content):
            if term[1] == 'itag' or term[0] in self.importants:
                candidates_important[term[0]] = candidates_important.get(
                    term[0], 0) + weight

    # vip classifier predictions over the whole text
    desc = ' '.join(self.wfilter(self.seg.cut4search(' '.join(text))))
    candidates_vips = {}
    for raw_label, prob in self.vip_classifier.predict_proba([desc], 3)[0]:
        if prob > self.vip_lower:
            candidates_vips[int(raw_label.replace(u'__label__', ''))] = prob

    results = self.merge(
        {}, self.__extract_important(candidates, candidates_important), 1)
    results = self.merge(results,
                         dict(self.__extract_textrank(candidates, 10)))
    vip_weights = {}
    for tid, prob in candidates_vips.items():
        vip_weights[dbutil.get_tag_name(self.db, tid)] = prob
    results = self.merge(results, vip_weights)

    results = self.__normalize_replacement(self.__normalize(results))
    deducts = self.__deduct_2nd(results)
    if len(deducts) < 3:
        results = self.merge(results, deducts)
    return results
def __extract_vecrank(self, candidates, candidates_important,
                      candidates_vips, topn):
    """Rank candidate terms on a co-occurrence / similarity graph.

    Edges link each candidate to the candidates that follow it inside the
    textrank window (weight +1 per co-occurrence) and to every important term
    whose word2vec similarity exceeds ``self.similarity_threshold`` (weight
    + similarity * importance). The graph is ranked with the thesaurus and
    with the hyponyms of the predicted vip tags, each given the value 2.

    A vip tag whose name has no entry in ``self.hyponym`` contributes no
    hyponyms instead of raising ``TypeError``.

    :param candidates: ordered list of candidate terms
    :param candidates_important: dict of important term -> weight
    :param candidates_vips: dict keyed by vip tag id
    :param topn: maximum number of ranked nodes considered
    :return: generator of ``(tag, weight rounded to 2 digits)``; junk terms
        are skipped, the first two remaining tags are always yielded, later
        ones only if their weight reaches ``self.textrank_threshold``
    """
    graph = UndirectWeightedGraph()
    weights = collections.defaultdict(int)
    # hyponym.get() yields None for unknown names; fall back to an empty
    # tuple so chain() never receives a non-iterable.
    proper_hyponym = dict.fromkeys(
        set(
            chain(*[
                self.hyponym.get(dbutil.get_tag_name(self.db, cv)) or ()
                for cv in candidates_vips
            ])), 2)

    total = len(candidates)
    for i, current in enumerate(candidates):
        # co-occurrence inside the sliding window
        for j in range(i + 1, min(i + self.textrank_window_size, total)):
            weights[(current, candidates[j])] += 1
        if current not in self.w2v:
            continue
        # semantic closeness to the important terms
        for word, weight in candidates_important.items():
            if word == current or word not in self.w2v:
                continue
            similarity = self.w2v.similarity(current, word)
            if similarity > self.similarity_threshold:
                weights[(current, word)] += similarity * weight

    for terms, weight in weights.items():
        graph.add_edge(terms[0], terms[1], weight)
    nodes_rank = graph.rank(self.thesaurus, proper_hyponym)

    topn = min(topn, total)
    start = 0  # number of non-junk tags seen so far
    for tag, weight in sorted(
            nodes_rank.items(), key=lambda x: -x[1])[:topn]:
        if tag in self.junk_terms:
            continue
        if start < 2 or weight >= self.textrank_threshold:
            yield tag, round(weight, 2)
        start += 1
def testing(test, model, path, one_label=True, pivot=True, auc=True):
    """Evaluate a fastText model on a labelled test file and write reports.

    Each line of ``test`` is ``<id> __label__X [__label__Y ...] <content>``.
    The top 3 predictions per line are written to ``path + 'predict'``
    together with whether the first label equals the first prediction and
    whether it is among the three predictions.

    :param test: path of the test file
    :param model: path of the fastText model to load
    :param path: prefix for every report file written
    :param one_label: when true, write precision/recall of first label against
        first prediction to ``path + 'one_label'``; otherwise of all labels
        against all predictions to ``path + 'mul_labels'``
    :param pivot: when true, write the actual-vs-predicted count table to
        ``path + 'pivot'``
    :param auc: when true, plot the ROC curve and print the AUC
    """
    clf = fasttext.load_model(model, encoding='utf-8')
    tids, labels, contents = list(), list(), list()
    # The context manager closes the test file once it has been read.
    with codecs.open(test) as test_file:
        for line in test_file:
            tid, rest = line.split(' ', 1)
            tids.append(tid)
            labels.append([])
            while rest.startswith(u'__label__'):
                label, rest = rest.split(' ', 1)
                labels[-1].append(label.replace(u'__label__', u''))
            contents.append(rest)
    preds = [[(l.replace(u'__label__', u''), p) for l, p in lp]
             for lp in clf.predict_proba(contents, k=3)]
    with codecs.open(path + 'predict', 'w', 'utf-8') as fo:
        for i in xrange(len(tids)):
            ab_res = 'T' if labels[i][0] == preds[i][0][0] else 'F'
            res = 'T' if labels[i][0] in [l for l, _ in preds[i]] else 'F'
            fo.write(
                '%-8s%s\t%s\t%-30s%-30s%-s\n' %
                (tids[i], ab_res, res, dbutil.get_company_name(db, tids[i]),
                 '&'.join([dbutil.get_tag_name(db, l) for l in labels[i]]),
                 '\t'.join([
                     dbutil.get_tag_name(db, l) + ' ' + str(p)
                     for l, p in preds[i]
                 ])))

    def summary(label, pred, output):
        """Write per-tag precision/recall for every tag found in ``label``."""
        pt, pos, true = dict(), dict(), dict()
        for i_ in xrange(len(label)):
            for l_ in label[i_]:
                true[l_] = true.get(l_, 0) + 1
            for p_ in pred[i_]:
                pos[p_] = pos.get(p_, 0) + 1
                if p_ in label[i_]:
                    pt[p_] = pt.get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('Tag\t\tPrecision\tRecall\t\tPredict\t\tActual\n')
            for k in true:
                # A labelled tag may never be predicted (or never correctly);
                # report zeros instead of failing on the missing key.
                hits, predicted = pt.get(k, 0), pos.get(k, 0)
                precision = float(hits) / float(predicted) if predicted else 0.0
                recall = float(hits) / float(true[k])
                fo.write('%-8s\t%-8f\t%-8f\t%-8d\t%-8d\n' %
                         (dbutil.get_tag_name(db, k), precision, recall,
                          predicted, true[k]))

    def for_pivot(label, pred, output):
        """Write the count of every (first label, first prediction) pair."""
        pivot_map = dict()
        for i_ in xrange(len(label)):
            l_, p_ = label[i_][0], pred[i_][0][0]
            pivot_map[l_] = pivot_map.get(l_, dict())
            pivot_map[l_][p_] = pivot_map[l_].get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('%-8s\t%-8s\t%s\n' % ('Actual', 'Predict', 'Count'))
            for k, v in pivot_map.items():
                for k_, v_ in v.items():
                    fo.write('%-8s\t%-8s\t%d\n' %
                             (dbutil.get_tag_name(db, k),
                              dbutil.get_tag_name(db, k_), v_))

    def roc_auc(label, pred):
        """Plot the ROC of the top prediction being correct; return the AUC."""
        y_true, y_prob = list(), list()
        for i_ in xrange(len(label)):
            y_true.append(1 if label[i_][0] == pred[i_][0][0] else 0)
            y_prob.append(pred[i_][0][1])
        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob)
        auc_score = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %.2f)' % auc_score)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.show()
        return auc_score

    if one_label:
        summary([[l[0]] for l in labels],
                [[ps[0][0].replace(u'__label__', u'')] for ps in preds],
                path + 'one_label')
    else:
        summary(labels,
                [[p[0].replace(u'__label__', u'') for p in ps] for ps in preds],
                path + 'mul_labels')
    if pivot:
        for_pivot(labels, preds, path + 'pivot')
    if auc:
        print('AUC: %f' % roc_auc(labels, preds))
def __get_sector_filter(self, source, ftype):
    """Return the tag names configured as sector filter for a source.

    Looks up the ``keywords.sector_filters`` document matching ``source`` and
    ``filter_type`` and converts the ids in its ``sectors`` list to tag names.

    :param source: value matched against the document's ``source`` field
    :param ftype: value matched against the document's ``filter_type`` field
    :return: list of tag names; empty when no document matches or it has no
        ``sectors``
    """
    record = self.mongo.keywords.sector_filters.find_one({
        'source': source,
        'filter_type': ftype
    })
    if not record:
        return []
    return [
        dbutil.get_tag_name(self.db, tid)
        for tid in record.get('sectors', [])
    ]