def load_data():
    db = dbcon.connect_torndb()
    seg = Segmenter()
    X, Y = [], []
    for item in db.query('select * from source_context;'):
        X.append(' '.join(list(seg.cut(item.content))).strip())
        Y.append(item.type == 30010)
    db.close()
    return X, Y
def feed_doc(tag=u'金融'):
    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        yield chain(*[wfilter(segmenter.cut(piece['content'].strip()))
                      for piece in record['contents'] if piece['content'].strip()])
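# Hedged usage sketch (not from the source): feed_doc is a generator of
# per-article token streams, so it can be consumed lazily. The tag value
# and slice size below are illustrative.
def peek_feed_doc(tag=u'金融', n=3):
    for i, doc in enumerate(feed_doc(tag)):
        if i >= n:
            break
        print ' '.join(list(doc))[:80]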
def __init__(self, sector_setting='default'):
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
    self.vips = {}
    if sector_setting == 'new':
        sector_setting_file = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../common/dict/vip.cluster.frozen')
    else:
        # 'default' and any unknown setting fall back to the sector clusters
        sector_setting_file = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../common/dict/sector.cluster.frozen')
    db = dbcon.connect_torndb()
    for line in codecs.open(sector_setting_file, encoding='utf-8'):
        vip = line.split('#')[0].lower()
        tags = line.split('#')[1].strip().split(',')
        for tag in tags:
            try:
                # a cluster's novelty is shared evenly across its tags
                self.vips[tag.lower()] = (
                    vip, dbutil.get_tag_novelty(db, tag, name=True) / len(tags))
            except Exception, e:
                print tag, e
def load_ruled_news():
    global labels
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []
    mongo = dbcon.connect_mongo()
    query = {'$and': [{'category': {'$ne': None}},
                      {'category': {'$ne': 60199}},
                      {'category': {'$ne': 60106}}],
             'type': 60001,
             'category_confidence': None}
    for record in mongo.article.news.find(query).limit(10000):
        contents = wfilter(seg.cut(record['title']))
        contents.extend(wfilter(seg.cut(
            ' '.join([piece['content'] for piece in record['contents']]))))
        if len(contents) > 10:
            trainx.append(' '.join(contents))
            trainy.append(int(labels.get(record['category'])))
    mongo.close()
    return np.array(trainx), np.array(trainy)
def load_data_l1():
    db = dbcon.connect_torndb()
    seg = Segmenter()
    trainx, trainy = [], []
    # optionally append: 'and sector.id not in (6, 9, 10, 12, 13, 15, 16, 17, 18, 19, 999)'
    results = db.query(
        'select company_sector.companyId, company_sector.sectorId from company_sector, sector '
        'where company_sector.verify="Y" and sector.id=company_sector.sectorId and sector.level=1;')
    for result in results:
        desc = db.get('select description from company where id=%s', result.companyId)
        sid = result.sectorId
        if desc and desc.description.strip():
            trainx.append(' '.join(seg.cut(desc.description.strip())))
            trainy.append(int(sid))
    db.close()
    return trainx, np.array(trainy)
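# A minimal training sketch over load_data_l1's output, assuming
# scikit-learn is available; TfidfVectorizer + LogisticRegression are
# illustrative stand-ins, not necessarily the project's actual pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def demo_train_l1():
    trainx, trainy = load_data_l1()
    vec = TfidfVectorizer(max_features=5000)
    x = vec.fit_transform(trainx)  # docs are already segmented and space-joined
    xtr, xte, ytr, yte = train_test_split(x, trainy, test_size=0.2)
    clf = LogisticRegression()
    clf.fit(xtr, ytr)
    print 'level-1 sector accuracy: %.3f' % clf.score(xte, yte)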
class Companies(object):

    def __init__(self):
        self.data_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../data/tsb/company/ltp_cut')
        self.segmenter = Segmenter()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):
        global description_len_threshold, complete_threshold
        db = dbcon.connect_torndb()
        index = 0
        for result in iter(dbutil.get_all_company(db)):
            cid, desc = result.get('id'), result.get('context', '')
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            # prefer the pre-segmented ltp cache on disk, fall back to cutting online
            if not os.path.exists(os.path.join(self.data_dir, str(cid))):
                words = list(self.segmenter.cut(desc))
            else:
                words = [line.split('\t')[0].strip()
                         for line in codecs.open(os.path.join(self.data_dir, str(cid)),
                                                 encoding='utf-8')
                         if line.strip()]
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue
            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
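# Sketch of the intended consumption (assuming gensim): Companies is an
# iterable of token lists, so it can feed a Dictionary directly; the id
# mappings are populated as a side effect of iteration.
from gensim import corpora

companies = Companies()
dictionary = corpora.Dictionary(doc for doc in companies)
print 'vocabulary size:', len(dictionary.token2id)
print 'companies indexed:', len(companies.get_mapping_id2in())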
class SourceCompany(object):

    def __init__(self, size_limit=None):
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.size_limit = size_limit

    def __iter__(self):
        if not self.size_limit:
            sql2use = 'select * from source_company where active is null or active="Y";'
        else:
            sql2use = 'select * from source_company where active is null or active="Y" ' \
                      'order by rand() limit %s;' % self.size_limit
        for result in self.db.iter(sql2use):
            content = []
            if result.brief and result.brief.strip():
                content.extend(self.wfilter(self.seg.cut(result.brief)))
            if result.description and result.description.strip():
                content.extend(self.wfilter(self.seg.cut(result.description.strip())))
            if len(content) > 10:
                yield content
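# Hedged sketch (assumption, not from the source): SourceCompany yields one
# token list per company, which is the shape gensim's Word2Vec expects; the
# parameters mirror the s400w3min20 model filename used elsewhere in this
# codebase, and the size_limit is illustrative.
from gensim.models import Word2Vec

model = Word2Vec(SourceCompany(size_limit=100000),
                 size=400, window=3, min_count=20, workers=4)
model.save('s400w3min20.w2vmodel')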
class UniversalIndexCreator(object):

    stopwords = stopword.get_standard_stopwords()
    seg = Segmenter()
    nameseg = NameSegmenter()

    def __init__(self, es=None):
        global logger_universal_index
        if not es:
            host, port = tsbconfig.get_es_config()
            self.es = Elasticsearch([{'host': host, 'port': port}])
        else:
            self.es = es
        self.topic_tags = {}
        logger_universal_index.info('Universal Index Creator inited')

    def __check(self):
        global logger_universal_index
        if not self.es.indices.exists(["xiniudata2"]):
            logger_universal_index.info('Creating index xiniudata2')
            self.es.indices.create("xiniudata2")
            logger_universal_index.info('Created')
            self.es.indices.put_mapping("universal",
                                        mappings.get_universal_company_mapping(),
                                        "xiniudata2")
            logger_universal_index.info('Universal Company mapping created')

    def create_indice(self):
        global logger_universal_index
        self.__check()
        db = dbcon.connect_torndb()
        self.topic_tags = dbutil.get_topic_corresponding_tags(db)
        logger_universal_index.info('Start to create indice')
        logger_universal_index.info(str(self.es.info()))
        logger_universal_index.info('ES Config %s' % str(tsbconfig.get_es_config()))
        for cid in dbutil.get_all_company_id(db):
            try:
                self.create_single(db, cid)
                logger_universal_index.info('%s index created, %s'
                                            % (cid, dbutil.get_company_name(db, cid)))
            except Exception, e:
                logger_universal_index.exception('%s failed # %s' % (cid, e))
        db.close()
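# Minimal driver sketch: __check creates the xiniudata2 index and mapping on
# first use, then create_indice walks every company id. Assumes the class's
# create_single (called above) is defined elsewhere in this module.
creator = UniversalIndexCreator()
creator.create_indice()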
def __init__(self):
    global viptag_model, logger_news_pip
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
    self.feeder = NewsFeeder()
    self.viptag_clf = fasttext.load_model(viptag_model)
    self.life_circle_linker = 100
    self.life_circle_linker_max = 100
    self.linker = CompanyLinker()
    logger_news_pip.info('Model inited')
def __init__(self):
    global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
    logger_tag.info('Extractor model initing')
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.feeder = Feeder()
    self.tagger = Tagger(itags=True)
    self.seg = Segmenter(itags=True)
    self.wfilter = word_filter.get_default_filter()
    self.gang = GangTag()
    self.w2v = Word2Vec.load(word2vec_model)
    self.similarity_threshold = 0.4
    self.chain_simi_threshold = 0.25
    self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
    self.vip_classifier = fasttext.load_model(viptag_model_20171221)
    self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
    self.trained_tag_clfs = self.__load_trained_clfs()
    self.important_lower = 0.1
    self.important_threshold = 0.2
    self.relevant_threshold = 0.4
    self.vip_lower = 0.3
    self.vip_threshold = 0.25
    self.important_max_num = 5
    self.max_contents_length = 20
    self.yellows = dbutil.get_yellow_tags(self.db)
    self.importants = set(t.name.lower()
                          for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
    self.thesaurus = self.__load_weighted_tags()
    self.thesaurus_ids = self.__load_weighted_tags(tid=True)
    self.junk_terms = self.__load_junk_tags()
    self.replacements = {r['source']: r['replacement']
                         for r in self.mongo.keywords.replacement.find()}
    self.trusted_sources = dicts.get_known_company_source()
    self.general_tagger = GeneralTagger()
    logger_tag.info('Extractor model inited')
def __init__(self):
    self.db = dbcon.connect_torndb()
    # self.tags = {t.name: (t.id, t.type)
    #              for t in dbutil.get_tags_by_type(self.db, [11000, 11010, 11011, 11012, 11013])}
    self.tags = {t.name: (t.id, t.type)
                 for t in dbutil.get_tags_by_type(self.db, [11011, 11012])}
    self.seg = Segmenter(itags=True)
    word2vec_model = os.path.join(
        os.path.split(os.path.realpath(__file__))[0],
        '../embedding/models/s400w3min20_20180118.binary.w2vmodel')
    self.w2v = Word2Vec.load(word2vec_model)
    self.similarity_threshold = 0.3
    self.max_candidates = {11000: 5, 11010: 5, 11011: 5, 11013: 2}
class Companies(object):

    def __init__(self):
        self.segmenter = Segmenter()
        self.feeder = Feeder()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):
        global description_len_threshold, complete_threshold
        db = dbcon.connect_torndb()
        index = 0
        for cid in iter(dbutil.get_all_company_id(db)):
            contents = self.feeder.feed_string(cid)
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            words = list(self.segmenter.cut(contents))
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue
            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
class TfIdfExtractor(FeatureExtractor):

    def __init__(self, opt=None):
        if not isinstance(opt, dict):
            opt = {}
        self.seg = opt.get('segmenter') or Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))

    def train(self, docs, labels, seged=False):
        trainset = self.vectorizer.fit_transform(self.iter_docs(docs, seged))
        # trainset = self.selector.fit_transform(trainset, labels)
        return trainset, labels

    def transform(self, docs, seged=False):
        return self.vectorizer.transform(self.iter_docs(docs, seged))
        # return self.selector.transform(self.vectorizer.transform(self.iter_docs(docs, seged)))

    def iter_docs(self, docs, seged):
        # segment on the fly unless docs are already space-joined tokens
        for doc in docs:
            yield doc if seged else ' '.join(self.seg.cut(doc))
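# Toy usage sketch for TfIdfExtractor; the docs and labels are made up,
# min_df is lowered because the default (50) assumes a large corpus, and
# seged=True bypasses the Segmenter for pre-tokenized input. Assumes the
# module's stopword helper is importable as above.
docs = [u'deep learning platform', u'online retail marketplace',
        u'deep neural network chips']
labels = [1, 2, 1]
extractor = TfIdfExtractor({'min_df': 1})
trainset, labels = extractor.train(docs, labels, seged=True)
print extractor.transform([u'deep learning chips'], seged=True).shape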
import os
import codecs
import fasttext
import itertools
from random import randint
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt

db = dbcon.connect_torndb()
mongodb = dbcon.connect_mongo()
feeder = Feeder()
seg = Segmenter()


# tag size is 1800 (sector levels 1, 2 & 3) or 2700 (types 11011, 11012, 11013, 11014)
def dump_data(train, test, tag_size=2700):
    if os.path.exists(train):
        print 'data exists.'
        return
    sql_1800 = '''
        select c.id cid, t.id tid
        from company_tag_rel ct
        join company c on ct.companyId = c.id
        join tag t on ct.tagId = t.id
        where ct.verify = "Y" and (ct.active = "Y" or ct.active is null)
            and ct.modifyTime > "2017-06-01"
            and c.verify = "Y" and (c.active = "Y" or c.active is null)
            and t.sectorType is not null;
    '''
    sql_2700 = '''
        select c.id cid, t.id tid
        from company_tag_rel ct
        join company c on ct.companyId = c.id
        join tag t on ct.tagId = t.id
        where ct.verify = "Y" and (ct.active = "Y" or ct.active is null)
            and ct.modifyTime > "2017-06-01"
def __init__(self):
    self.mongo = dbcon.connect_mongo()
    self.db = dbcon.connect_torndb()
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
                      learning_rate=0.1, continue_training=True)
clf.fit(trainx, trainy)

# prepare for simple sector
cvipc = ClusterVIPClassifier()
# prepare for mentioned company
life_circle_linker = 100
life_circle_linker_max = 100
linker = CompanyLinker()
# prepare for connection
mongo = dbcon.connect_mongo()
db = dbcon.connect_torndb()
seg = Segmenter(tag=True)
wfilter = word_filter.get_default_filter()
logger_news_pip.info('start to process pending news')
while True:
    for record in list(mongo.article.news.find(
            {'type': {'$in': [60001, 60002, 60003]}, 'processStatus': 0}
    ).sort('date', pymongo.DESCENDING)):
        if record.get('source', 0) == 13022:
             ','.join(map(lambda x: str(x), labels.values())))
    fo.write('@DATA \n')
    for i in xrange(len(y)):
        fo.write('%s,%s\n' % (','.join([str(item) for item in x[i]]),
                              labels.get(y[i])))


def weighted_choice(choices):
    # roulette-wheel selection: return c with probability w / sum(weights)
    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w


if __name__ == '__main__':
    print __file__
    # upsample('template/fields.data')
    # scatter_sample('weka/field.train.arff')
    fc = FieldClassifier()
    s = Segmenter()
    # sample company description (Chinese): a comic-drawing and sharing social app
    c = u'通过贴图让用户简单地画漫画,并用漫画沟通、社交。网站上线1年,ipad端7月3日上线。IPAD版上线一周积累20万用户,第一周有11.000多幅漫画上传。'
    print fc.naive_classify(s.cut(c))
    # fc.build_labeled_corpus()
    # fc.train('template/fields.1.data')
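# Quick self-contained check of weighted_choice (illustrative weights):
# draws should come out roughly proportional to 5:3:2 over many trials.
from collections import defaultdict

counts = defaultdict(int)
for _ in xrange(1000):
    counts[weighted_choice([('tech', 5), ('retail', 3), ('media', 2)])] += 1
print dict(counts)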
class DocumentsSimilarity(object):
    """ tfidf model based document similarity """

    def __init__(self):
        self.life_period = 1000
        self.num_candidates = 800
        self.min_similarity_threshold = 0.05
        self.establish_discount = 0.75
        self.dictionary = self.get_dict()
        self.id2in, self.in2id, self.corpus, self.max_id = self.get_corpus(self.dictionary)
        self.model, self.simi = self.train_model()
        self.segmenter = Segmenter()
        self.filter = Filter()
        self.feeder = Feeder()
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

    def train_model(self):
        global cach_dir
        if not os.path.exists(cach_dir):
            os.mkdir(cach_dir)
        tfidf = models.TfidfModel(self.corpus)
        index = similarities.MatrixSimilarity(tfidf[self.corpus],
                                              num_best=self.num_candidates)
        return tfidf, index

    @classmethod
    def get_corpus(cls, dictionary):
        global logger_nlp, cach_dir
        companies = CompaniesVector(dictionary)
        fname = os.path.join(cach_dir, '%s.%s.corpus' % (
            datetime.datetime.now().strftime('%Y%m%d'), randint(0, 3600)))
        corpora.MmCorpus.serialize(fname, companies)
        logger_nlp.info('Corpus serialized')
        return (companies.get_mapping_id2in(), companies.get_mapping_in2id(),
                corpora.MmCorpus(fname), companies.max_id)

    @classmethod
    def get_dict(cls):
        global stopwords, df_threshold_lower, df_threshold_upper, logger_nlp, cach_dir
        dates = datetime.datetime.now().strftime('%Y%m%d')
        # resolve the cache path once so the existence check, load and save
        # all refer to the same file
        fname = os.path.join(cach_dir, '%s.%s.dict' % (dates, randint(0, 3600)))
        if os.path.exists(fname):
            try:
                dictionary = corpora.Dictionary.load(fname)
                logger_nlp.info('Found dictionary file, loaded')
                return dictionary
            except Exception:
                logger_nlp.error('Found dictionary file, fail to load, try to rebuild')
        companies = Companies()
        dictionary = corpora.Dictionary(company for company in companies)
        stop_ids = [dictionary.token2id[word] for word in stopwords
                    if word in dictionary.token2id]
        low_df = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                  if docfreq <= df_threshold_lower]
        high_df = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                   if docfreq > df_threshold_upper]
        dictionary.filter_tokens(stop_ids + low_df + high_df)
        dictionary.compactify()
        dictionary.save(fname)
        logger_nlp.info('Dictionary constructed, size %s' % len(dictionary.token2id))
        return dictionary

    def get_similar(self, cid):
        global simi_threshold, complete_threshold
        # pooling
        if cid in self.id2in:
            vec = self.model[self.corpus[self.id2in[cid]]]
            # drop the first hit, which is the company itself
            simis = sorted(self.simi[vec], key=lambda x: -x[1])[1:self.num_candidates]
            simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        else:
            simis = self.get_similar4new(cid)
        # discount by establishment-year distance
        establish = dbutil.get_company_establish_date(self.db, cid).year
        simis = [(cid2, weight * self.__discount_year(establish, cid2))
                 for (cid2, weight) in simis]
        # sort and filter
        simis = sorted(simis, key=lambda x: -x[1])
        simis = filter(lambda x: dbutil.get_company_score(self.db, x[0]) > complete_threshold
                       and x[1] > self.min_similarity_threshold, simis)
        # dump and exit
        self.mongo.comps.candidates.update(
            {'company': cid},
            {'$set': {'candidates': simis, 'modifyTime': datetime.datetime.now()}},
            True)
        return simis

    def get_similar4new(self, cid):
        global logger_nlp
        # reload the model when the life period counts down to 0, i.e. after
        # processing self.life_period new companies
        if int(cid) > self.max_id:
            self.life_period -= 1
            if self.life_period == 0:
                logger_nlp.info('Reload recommend program')
                self.__init__()
        content = self.feeder.feed_string(cid)
        words = self.filter.filtermany(self.segmenter.cut(content))
        vec = self.model[self.dictionary.doc2bow(words, allow_update=True)]
        simis = sorted(self.simi[vec], key=lambda x: -x[1])[1:self.num_candidates]
        return map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)

    def __discount_year(self, establish, cid2):
        diff = abs(dbutil.get_company_establish_date(self.db, cid2).year - establish)
        return self.establish_discount if diff > 5 else 1

    def dump_full(self):
        global logger_nlp
        db = dbcon.connect_torndb()
        for cid in iter(dbutil.get_all_company_id(db)):
            try:
                self.get_similar(cid)
                logger_nlp.info('%s processed' % cid)
            except Exception, e:
                logger_nlp.exception('%s failed, %s' % (cid, e))
        db.close()
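# Usage sketch: constructing DocumentsSimilarity builds the dictionary,
# corpus and tfidf index up front; get_similar returns the ranked
# (cid, weight) list and also upserts it into mongo.comps.candidates.
# The company id is a placeholder.
simi = DocumentsSimilarity()
for comp_id, weight in simi.get_similar(12345)[:10]:
    print comp_id, weight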
class Feeder(object):

    def __init__(self):
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()
        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quanlity='low'):
        # unknown modes fall back to the default feeder
        feeds = {'default': self.__feed_default,
                 'with_tag': self.__feed_with_tag}.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quanlity == 'medium':
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quanlity == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):
        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):
        pass

    def __feed_with_tag(self, cid):
        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(self.db, cid,
                                                         self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):
        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score
        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount \
                if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount
        # iOS artifact
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):
        # clean and narrow down candidates
        # convert traditional Chinese to simplified
        content = hants.translate(unicode(content))
        # lowercase
        content = content.lower()
        return content.strip()
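# Sketch of Feeder's three access levels (placeholder company id): weighted
# raw pieces, one concatenated string, and a filtered token list.
feeder = Feeder()
pieces = feeder.feed(12345, mode='with_tag', quanlity='medium')
text = feeder.feed_string(12345)
tokens = feeder.feed_seged(12345)
print len(pieces), len(text), len(tokens)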
def __init__(self):
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
class PositionClassifier(object):

    def __init__(self):
        self.segmenter = Segmenter(cut_all=True)
        # first-level categories: 1 tech, 2 product, 3 design, 4 operations,
        # 5 marketing/sales, 6 staff functions
        self.first_mapping = {1: u'技术', 2: u'产品', 3: u'设计',
                              4: u'运营', 5: u'市场', 6: u'职能'}
        self.first_positions = dict.fromkeys(self.first_mapping.keys())
        self.train_first_positions()

    def train_first_positions(self):
        # keyword vocabularies per category
        self.first_positions[1] = set([
            u'工程师', u'技术', u'java', u'python', u'php', u'c++', u'c',
            u'android', u'ios', u'测试', u'web', u'前端', u'数据库', u'ruby',
            u'perl', u'node.js', u'c#', u'go', u'html5', u'flash',
            u'javascript', u'u3d', u'运维', u'网络', u'安全', u'数据仓库',
            u'dba', u'mysql', u'oracle', u'sqlserver', u'sql', u'硬件',
            u'嵌入式', u'驱动', u'材料', u'开发'])
        self.first_positions[2] = set([u'产品', u'产品经理', u'策划'])
        self.first_positions[3] = set([u'设计', u'设计师', u'游戏', u'ui', u'ue'])
        self.first_positions[4] = set([
            u'运营', u'coo', u'编辑', u'主编', u'文案', u'售前', u'售后', u'客服'])
        self.first_positions[5] = set([
            u'市场', u'销售', u'seo', u'sem', u'商务', u'客户', u'bd', u'公关',
            u'采购', u'物流', u'仓储', u'广告', u'媒介', u'招商', u'推广'])
        self.first_positions[6] = set([
            u'人事', u'hr', u'行政', u'培训', u'绩效', u'前台', u'总助', u'秘书',
            u'文秘', u'财务', u'会计', u'出纳', u'税务', u'审计', u'hrm', u'hrd',
            u'法务', u'律师', u'专利', u'招聘'])

    def get_first_positions(self):
        return self.first_positions.keys()

    def classify_first(self, position):
        # vote by token overlap with each category's keyword set
        position = set(map(lambda x: x.lower(), self.segmenter.cut(position)))
        return sorted([(k, len(position & v)) for k, v in self.first_positions.items()],
                      key=lambda x: -x[1])[0][0]

    def get_first_name(self, key):
        return self.first_mapping.get(key)
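# Sketch: classify_first votes by token overlap against the keyword sets
# above; a title like u'java工程师' should hit the tech set twice and print
# u'技术' (tech), assuming cut_all segmentation splits it that way.
pc = PositionClassifier()
print pc.get_first_name(pc.classify_first(u'java工程师'))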
import sys
sys.path.append('..')
sys.path.append('../..')
reload(sys)

import codecs
import torndb

from common.zhtools.segment import Segmenter
from common.classifier.field import FieldClassifier

if __name__ == '__main__':
    sql = 'select dealId,dealname,dealdesc from deal where joinDemoDay=2;'
    db = torndb.Connection('localhost:3306', 'demoday', 'root', '')
    clf = FieldClassifier(model='lr')
    seg = Segmenter()
    # clf.train()
    fo = codecs.open('tmp', 'w', 'utf-8')
    for rid, result in enumerate(db.query(sql)):
        did, doc = result.dealId, result.dealdesc
        try:
            label = clf.naive_classify(seg.cut(doc))
            if label:
                print did, label
                fo.write('%s#%s\n' % (did, label[0]))
        except Exception, e:
            print did, 'fail'
            print e
        # if rid > 40:
        #     break
class KeywordExtractor(object):

    def __init__(self):
        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.hyponym = {vip_name: set(dbutil.get_tag_name(self.db, tid)
                                      for tid in dbutil.get_hyponym_tags(self.db, vip_id))
                        for vip_name, vip_id in self.vip_tags.iteritems()}
        self.importants = set(t.name.lower()
                              for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
                [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()}
        self.junk_terms = set(tag.name
                              for tag in dbutil.get_tags_by_type(self.db, typeset=[11001]))
        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5
        print 'model inited'

    def __load_trained_clfs(self):
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                tid = model_file.split('.')[0]
                # model files are named <tag id>.model; skip anything else
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(
                    os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):
        if not tid:
            return {tag.name: (tag.novelty or 1)
                    for tag in dbutil.get_tags_by_type(self.db)}
        return {tag.id: (tag.novelty or 1)
                for tag in dbutil.get_tags_by_type(self.db)}

    def __load_tag_types(self):
        return {tag.name: (tag.type or 0) for tag in dbutil.get_tags_by_type(self.db)}

    def __extract_source_tag(self, cid):
        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important, candidates_vips, topn):
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        # hyponyms of the predicted vip sectors get a promoted prior of 2
        proper_hyponym = dict.fromkeys(
            set(chain(*[self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                        for cv in candidates_vips.iterkeys()])), 2)
        for i in xrange(len(candidates)):
            # co-occurrence edges within the textrank window
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            # similarity edges towards the weighted important candidates
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            # always emit the top two, then apply the threshold
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):
        desc = ' '.join(self.wfilter(
            self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [(int(tag.replace(u'__label__', '')), weight)
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 2)[0]
                           if weight > self.vip_lower]
        classifier_vips.sort(key=lambda x: -x[1])
        # with 2 candidate vip labels, keep only the first unless their
        # probabilities are comparable
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):
        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate, 0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down results size
        if len(results) >= 4:
            results = dict(filter(lambda x: x[1] > self.important_threshold,
                                  results.iteritems()))
        if len(results) > self.important_max_count:
            size = min(10, max(int(ceil(len(results) / 2.0)), self.important_max_count))
            results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """ weighted textrank; node weights use tags' novelties """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):
        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = (candidates_important.get(tag, 0)
                                         + self.source_tag_default_weight)
        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):
        if isinstance(tags, dict):
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):
        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            # promote by tag type: 11012 > 11013 > 11011
            type_promotion = {11011: 1, 11013: 1.5, 11012: 2.5}.get(
                self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):
        # weight is a bonus added to every merged item
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):
        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(cid)
        candidates_vips = self.extract_vip(cid)
        # generate results
        results = dict(self.__extract_vecrank(candidates, candidates_important,
                                              candidates_vips, topn))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):
        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        desc = ' '.join(self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {int(tag.replace(u'__label__', '')): weight
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 3)[0]
                           if weight > self.vip_lower}
        results = {}
        results = self.merge(results,
                             self.__extract_important(candidates, candidates_important), 1)
        results = self.merge(results, dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):
        # deduce missing level-2 tags sitting between a present level-1
        # hypernym and a present level-3 tag
        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = (set(dbutil.get_hyponym_tags(self.db, t1, 2))
                           & set(dbutil.get_hypernym_tags(self.db, tid, 2)))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
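# End-to-end sketch (placeholder company id): extract() merges the vecrank
# keywords with the fasttext vip labels, normalizes, and applies the
# replacement table; weights above ~1 come from the type promotion.
extractor = KeywordExtractor()
for name, weight in sorted(extractor.extract(12345, topn=15).iteritems(),
                           key=lambda x: -x[1]):
    print name, weight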