Example #1
def load_data():

    db = dbcon.connect_torndb()
    seg = Segmenter()
    X, Y = [], []
    for item in db.query('select * from source_context;'):
        X.append(' '.join(list(seg.cut(item.content))).strip())
        Y.append(item.type == 30010)
    db.close()
    return X, Y
Example #2
def feed_doc(tag=u'金融'):

    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        yield chain(*[
            wfilter(segmenter.cut(piece['content'].strip()))
            for piece in record['contents'] if piece['content'].strip()
        ])
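A minimal usage sketch (hypothetical; it assumes the same imports and mongo connection as feed_doc above), materializing each yielded chain into a token list:

# hypothetical usage: collect the filtered, segmented tokens of every article tagged u'金融'
docs = [list(words) for words in feed_doc(u'金融')]
print len(docs)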
Example #3
    def __init__(self):

        self.data_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../data/tsb/company/ltp_cut')
        self.segmenter = Segmenter()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()
Example #4
File: feed.py Project: yujiye/Codes
    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()

        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)
Example #5
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'
Example #6
    def __init__(self):

        self.segmenter = Segmenter(cut_all=True)
        self.first_mapping = {
            1: u'技术',
            2: u'产品',
            3: u'设计',
            4: u'运营',
            5: u'市场',
            6: u'职能'
        }
        self.first_positions = dict.fromkeys(self.first_mapping.keys())
        self.train_first_positions()
Example #7
    def __init__(self, sector_setting='default'):

        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.vips = {}

        if sector_setting == 'new':
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/vip.cluster.frozen')
        elif sector_setting == 'default':
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/sector.cluster.frozen')
        else:
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/sector.cluster.frozen')

        db = dbcon.connect_torndb()
        for line in codecs.open(sector_setting_file, encoding='utf-8'):
            vip = line.split('#')[0].lower()
            tags = line.split('#')[1].strip().split(',')
            for tag in tags:
                try:
                    self.vips[tag.lower()] = (
                        vip,
                        dbutil.get_tag_novelty(db, tag, name=True) / len(tags))
                except Exception, e:
                    print tag, e
Example #8
    def __init__(self, opt=None):

        if not isinstance(opt, dict):
            opt = {}

        if opt.get('segmenter'):
            self.seg = opt.get('segmenter')
        else:
            self.seg = Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))
Example #9
    def __init__(self):

        self.life_period = 1000
        self.num_candidates = 800
        self.min_similarity_threshold = 0.05
        self.establish_discount = 0.75

        self.dictionary = self.get_dict()
        self.id2in, self.in2id, self.corpus, self.max_id = self.get_corpus(
            self.dictionary)
        self.model, self.simi = self.train_model()

        self.segmenter = Segmenter()
        self.filter = Filter()
        self.feeder = Feeder()
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
Example #10
def load_ruled_news():

    global labels

    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []

    mongo = dbcon.connect_mongo()
    for record in mongo.article.news.find({
            '$and': [{'category': {'$ne': None}},
                     {'category': {'$ne': 60199}},
                     {'category': {'$ne': 60106}}],
            'type': 60001,
            'category_confidence': None
    }).limit(10000):
        contents = wfilter(seg.cut(record['title']))
        contents.extend(
            wfilter(
                seg.cut(' '.join(
                    [piece['content'] for piece in record['contents']]))))
        if len(contents) > 10:
            trainx.append(' '.join(contents))
            trainy.append(int(labels.get(record['category'])))
    mongo.close()

    return np.array(trainx), np.array(trainy)
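The arrays returned above are space-joined segmented texts plus integer category labels, so they drop straight into a bag-of-words classifier. A minimal sketch, assuming scikit-learn is available; the vectorizer and classifier choice here are illustrative, not the project's own pipeline:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

trainx, trainy = load_ruled_news()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_features=5000)   # texts are already segmented and space-joined
clf = LogisticRegression()
clf.fit(vectorizer.fit_transform(trainx), trainy)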
Example #11
def load_data_l1():

    db = dbcon.connect_torndb()
    seg = Segmenter()
    # tfidf = TfIdfExtractor()
    trainx, trainy = [], []
    results = db.query(
        'select company_sector.companyId, company_sector.sectorId from company_sector, sector '
        'where company_sector.verify="Y" and sector.id=company_sector.sectorId and sector.level=1 '
    )
    # 'and sector.id not in (6, 9, 10, 12, 13, 15, 16, 17, 18, 19, 999);')
    for result in results:
        desc = db.get('select description from company where id=%s',
                      result.companyId)
        sid = result.sectorId
        if desc and desc.description.strip():
            # trainx.append(desc.strip())
            trainx.append(' '.join(seg.cut(desc.description.strip())))
            trainy.append(int(sid))
    # trainx, trainy = tfidf.train(trainx, trainy)
    db.close()
    return trainx, np.array(trainy)
Example #12
class Companies(object):
    def __init__(self):

        self.data_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../data/tsb/company/ltp_cut')
        self.segmenter = Segmenter()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):

        global description_len_threshold, complete_threshold
        # db = torndb.Connection(**nlpconfig.get_mysql_config_tshbao())
        db = dbcon.connect_torndb()
        index = 0
        for result in iter(dbutil.get_all_company(db)):
            cid, desc = result.get('id'), result.get('context', '')
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            if not os.path.exists(os.path.join(self.data_dir, str(cid))):
                words = list(self.segmenter.cut(desc))
            else:
                words = [
                    line.split('\t')[0].strip()
                    for line in codecs.open(os.path.join(
                        self.data_dir, str(cid)),
                                            encoding='utf-8') if line.strip()
                ]
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue

            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
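Example #25 below consumes this iterator when building its gensim dictionary; a minimal sketch of that pattern, assuming the gensim import and the global thresholds referenced in __iter__ are defined:

from gensim import corpora

companies = Companies()
# each iteration yields a list of lowercased, filtered words for one company
dictionary = corpora.Dictionary(doc for doc in companies)
id2in = companies.get_mapping_id2in()   # company id -> corpus index, filled while iterating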
Example #13
File: w2v.py Project: yujiye/Codes
class SourceCompany(object):

    def __init__(self, size_limit=None):

        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.size_limit = size_limit

    def __iter__(self):

        if not self.size_limit:
            sql2use = 'select * from source_company where active is null or active="Y";'
        else:
            sql2use = 'select * from source_company where active is null or active="Y" ' \
                      'order by rand() limit %s;' % self.size_limit
        for result in self.db.iter(sql2use):
            content = []
            if result.brief and result.brief.strip():
                content.extend(self.wfilter(self.seg.cut(result.brief)))
            if result.description and result.description.strip():
                content.extend(self.wfilter(self.seg.cut(result.description.strip())))
            if len(content) > 10:
                yield content
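The file name (w2v.py) suggests this sentence iterator feeds a gensim word2vec model; a minimal sketch under that assumption, with illustrative hyperparameters:

from gensim.models import Word2Vec

# SourceCompany defines __iter__, so Word2Vec can make its vocabulary and training passes over it;
# size/window/min_count below are guesses, not the project's settings
sentences = SourceCompany(size_limit=100000)
model = Word2Vec(sentences, size=400, window=3, min_count=20, workers=4)
model.save('source_company.w2vmodel')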
Example #14
class UniversalIndexCreator(object):

    stopwords = stopword.get_standard_stopwords()
    seg = Segmenter()
    nameseg = NameSegmenter()

    def __init__(self, es=None):

        global logger_universal_index
        if not es:
            host, port = tsbconfig.get_es_config()
            self.es = Elasticsearch([{'host': host, 'port': port}])
        else:
            self.es = es
        self.topic_tags = {}
        logger_universal_index.info('Universal Index Creator inited')

    def __check(self):

        global logger_universal_index
        if not self.es.indices.exists(["xiniudata2"]):
            logger_universal_index.info('Creating index xiniudata2')
            self.es.indices.create("xiniudata2")
            logger_universal_index.info('Created')
        self.es.indices.put_mapping("universal",
                                    mappings.get_universal_company_mapping(),
                                    "xiniudata2")
        logger_universal_index.info('Universal Company mapping created')

    def create_indice(self):

        global logger_universal_index
        self.__check()
        db = dbcon.connect_torndb()
        self.topic_tags = dbutil.get_topic_corresponding_tags(db)
        logger_universal_index.info('Start to create indice')
        logger_universal_index.info(str(self.es.info()))
        logger_universal_index.info('ES Config %s' %
                                    str(tsbconfig.get_es_config()))
        for cid in dbutil.get_all_company_id(db):
            try:
                self.create_single(db, cid)
                logger_universal_index.info(
                    '%s index created, %s' %
                    (cid, dbutil.get_company_name(db, cid)))
            except Exception, e:
                logger_universal_index.exception('%s failed # %s' % (cid, e))
        db.close()
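A minimal usage sketch (host and port are placeholders; with es=None the constructor builds a client from tsbconfig instead):

from elasticsearch import Elasticsearch

creator = UniversalIndexCreator(es=Elasticsearch([{'host': 'localhost', 'port': 9200}]))
creator.create_indice()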
Example #15
    def __init__(self):

        global viptag_model, logger_news_pip
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.feeder = NewsFeeder()

        self.viptag_clf = fasttext.load_model(viptag_model)

        self.life_circle_linker = 100
        self.life_circle_linker_max = 100
        self.linker = CompanyLinker()

        logger_news_pip.info('Model inited')
Example #16
File: key.py Project: yujiye/Codes
    def __init__(self):

        global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
        logger_tag.info('Extractor model initing')

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()

        self.gang = GangTag()

        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25

        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()

        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()
        self.replacements = {r['source']: r['replacement'] for r in self.mongo.keywords.replacement.find()}

        self.trusted_sources = dicts.get_known_company_source()

        self.general_tagger = GeneralTagger()

        logger_tag.info('Extractor model inited')
Example #17
    def __init__(self):

        self.db = dbcon.connect_torndb()
        # self.tags = {t.name: (t.id, t.type)
        #              for t in dbutil.get_tags_by_type(self.db, [11000, 11010, 11011, 11012, 11013])}
        self.tags = {t.name: (t.id, t.type)
                     for t in dbutil.get_tags_by_type(self.db, [11011, 11012])}
        self.seg = Segmenter(itags=True)
        word2vec_model = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                      '../embedding/models/s400w3min20_20180118.binary.w2vmodel')
        self.w2v = Word2Vec.load(word2vec_model)

        self.similarity_threshold = 0.3
        self.max_candidates = {
            11000: 5,
            11010: 5,
            11011: 5,
            11013: 2
        }
Example #18
class Companies(object):
    def __init__(self):

        self.segmenter = Segmenter()
        self.feeder = Feeder()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):

        global description_len_threshold, complete_threshold
        db = dbcon.connect_torndb()
        index = 0
        for cid in iter(dbutil.get_all_company_id(db)):
            contents = self.feeder.feed_string(cid)
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            words = list(self.segmenter.cut(contents))
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue

            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
Example #19
class TfIdfExtractor(FeatureExtractor):
    def __init__(self, opt=None):

        if not isinstance(opt, dict):
            opt = {}

        if opt.get('segmenter'):
            self.seg = opt.get('segmenter')
        else:
            self.seg = Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))

    def train(self, docs, labels, seged=False):

        trainset = self.vectorizer.fit_transform(self.iter_docs(docs, seged))
        # print len(self.vectorizer.get_feature_names())
        # trainset = self.selector.fit_transform(trainset, labels)
        return trainset, labels

    def transform(self, docs, seged=False):

        return self.vectorizer.transform(self.iter_docs(docs, seged))
        # return self.selector.transform(self.vectorizer.transform(self.iter_docs(docs, seged)))

    def iter_docs(self, docs, seged):

        for doc in docs:
            if not seged:
                yield ' '.join(self.seg.cut(doc))
            else:
                yield doc
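A minimal usage sketch (train_docs, train_labels and test_docs are hypothetical placeholders; raw strings are segmented internally unless seged=True):

extractor = TfIdfExtractor({'max_df': 0.8, 'min_df': 2, 'topk': 1000})
trainset, labels = extractor.train(train_docs, train_labels)
testset = extractor.transform(test_docs)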
Example #20
import codecs
import fasttext
import itertools
from random import randint
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from datetime import datetime

db = dbcon.connect_torndb()
mongodb = dbcon.connect_mongo()
feeder = Feeder()
seg = Segmenter()


# tag size as 1800(sector 1, 2 & 3) or 2700(type 11011, 11012, 11013, 11014)
def dump_data(train, test, tag_size=2700):
    if os.path.exists(train):
        print 'data exists.'
        return
    sql_1800 = ''' 
    select c.id cid, t.id tid from company_tag_rel ct join company c on ct.companyId = c.id join tag t on ct.tagId = t.id
    where ct.verify = "Y" and (ct.active = "Y" or ct.active is null) and ct.modifyTime > "2017-06-01" and c.verify = "Y"
    and (c.active = "Y" or c.active is null) and t.sectorType is not null;
    '''
    sql_2700 = '''
    select c.id cid, t.id tid from company_tag_rel ct join company c on ct.companyId = c.id join tag t on ct.tagId = t.id
    where ct.verify = "Y" and (ct.active = "Y" or ct.active is null) and ct.modifyTime > "2017-06-01"'
Example #21
File: w2v.py Project: yujiye/Codes
    def __init__(self, size_limit=None):

        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.size_limit = size_limit
Example #22
File: w2v.py Project: yujiye/Codes
    def __init__(self):

        self.mongo = dbcon.connect_mongo()
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
Example #23
                                    learning_rate=0.1,
                                    continue_training=True)
    clf.fit(trainx, trainy)

    # prepare for simple sector
    cvipc = ClusterVIPClassifier()

    # prepare for mentioned company
    life_circle_linker = 100
    life_circle_linker_max = 100
    linker = CompanyLinker()

    # prepare for connection
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()

    logger_news_pip.info('start to process pending news')

    while True:

        for record in list(
                mongo.article.news.find({
                    'type': {
                        '$in': [60001, 60002, 60003]
                    },
                    'processStatus': 0
                }).sort('date', pymongo.DESCENDING)):

            if record.get('source', 0) == 13022:
Example #24
                     ','.join(map(lambda x: str(x), labels.values())))
            fo.write('@DATA \n')
            for i in xrange(len(y)):
                fo.write('%s,%s\n' %
                         (','.join([str(item)
                                    for item in x[i]]), labels.get(y[i])))


def weighted_choice(choices):

    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w
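
weighted_choice draws one item with probability proportional to its weight (it relies on random being imported elsewhere in this file). A minimal usage sketch:

# hypothetical usage: 'b' comes back roughly three times as often as 'a'
print weighted_choice([('a', 1.0), ('b', 3.0)])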


if __name__ == '__main__':

    print __file__

    # upsample('template/fields.data')
    # scatter_sample('weka/field.train.arff')
    fc = FieldClassifier()
    s = Segmenter()
    c = u'通过贴图让用户简单地画漫画,并用漫画沟通、社交。网站上线1年,ipad端7月3日上线。IPAD版上线一周积累20万用户,第一周有11.000多幅漫画上传。'
    print fc.naive_classify(s.cut(c))
    # fc.build_labeled_corpus()
    # fc.train('template/fields.1.data')
Example #25
class DocumentsSimilarity(object):
    """
    tfidf model based document similarity
    """
    def __init__(self):

        self.life_period = 1000
        self.num_candidates = 800
        self.min_similarity_threshold = 0.05
        self.establish_discount = 0.75

        self.dictionary = self.get_dict()
        self.id2in, self.in2id, self.corpus, self.max_id = self.get_corpus(
            self.dictionary)
        self.model, self.simi = self.train_model()

        self.segmenter = Segmenter()
        self.filter = Filter()
        self.feeder = Feeder()
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

    def train_model(self):

        global cach_dir
        if not os.path.exists(cach_dir):
            os.mkdir(cach_dir)
        tfidf = models.TfidfModel(self.corpus)
        index = similarities.MatrixSimilarity(tfidf[self.corpus],
                                              num_best=self.num_candidates)
        return tfidf, index

    @classmethod
    def get_corpus(cls, dictionary):

        global logger_nlp, cach_dir
        companies = CompaniesVector(dictionary)
        fname = os.path.join(
            cach_dir, '%s.%s.corpus' %
            (datetime.datetime.now().strftime('%Y%m%d'), randint(0, 3600)))
        corpora.MmCorpus.serialize(fname, companies)
        logger_nlp.info('Corpus serialized')
        return (companies.get_mapping_id2in(), companies.get_mapping_in2id(),
                corpora.MmCorpus(fname), companies.max_id)

    @classmethod
    def get_dict(cls):

        global stopwords, df_threshold_lower, df_threshold_upper, logger_nlp, cach_dir
        dates = datetime.datetime.now().strftime('%Y%m%d')
        # build the cache path once so the existence check, the load and the save below all refer to the same file
        fname = os.path.join(cach_dir, '%s.%s.dict' % (dates, randint(0, 3600)))
        if os.path.exists(fname):
            try:
                dictionary = corpora.Dictionary.load(fname)
                logger_nlp.info('Found dictionary file, loaded')
                return dictionary
            except:
                logger_nlp.error(
                    'Found dictionary file, fail to load, try to rebuild')
        companies = Companies()
        dictionary = corpora.Dictionary(company for company in companies)
        stop_ids = [
            dictionary.token2id[word] for word in stopwords
            if word in dictionary.token2id
        ]
        low_df = [
            tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
            if docfreq <= df_threshold_lower
        ]
        high_df = [
            tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
            if docfreq > df_threshold_upper
        ]
        dictionary.filter_tokens(stop_ids + low_df + high_df)
        dictionary.compactify()
        dictionary.save(fname)
        logger_nlp.info('Dictionary constructed, size %s' %
                        len(dictionary.token2id))
        return dictionary

    def get_similar(self, cid):

        global simi_threshold, complete_threshold

        # pooling
        if cid in self.id2in:
            vec = self.model[self.corpus[self.id2in[cid]]]
            simis = sorted(self.simi[vec],
                           key=lambda x: -x[1])[1:self.num_candidates]
            simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        else:
            simis = self.get_similar4new(cid)

        # discount
        establish = dbutil.get_company_establish_date(self.db, cid).year
        simis = [(cid2, weight * self.__discount_year(establish, cid2))
                 for (cid2, weight) in simis]

        # sort and filter
        simis = sorted(simis, key=lambda x: -x[1])
        simis = filter(
            lambda x: dbutil.get_company_score(self.db, x[0]) >
            complete_threshold and x[1] > self.min_similarity_threshold, simis)

        # dump and exit
        self.mongo.comps.candidates.update({'company': cid}, {
            '$set': {
                'candidates': simis,
                'modifyTime': datetime.datetime.now()
            }
        }, True)
        return simis

    def get_similar4new(self, cid):

        global logger_nlp
        # reload the model when life period goes down to 0, which means, reload after processing 200 new companies
        if int(cid) > self.max_id:
            self.life_period -= 1
        if self.life_period == 0:
            logger_nlp.info('Reload recommend program')
            self.__init__()

        content = self.feeder.feed_string(cid)
        words = self.filter.filtermany(self.segmenter.cut(content))
        vec = self.model[self.dictionary.doc2bow(words, allow_update=True)]
        simis = sorted(self.simi[vec],
                       key=lambda x: -x[1])[1:self.num_candidates]
        simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        return simis

    def __discount_year(self, establish, cid2):

        diff = abs(
            dbutil.get_company_establish_date(self.db, cid2).year - establish)
        return self.establish_discount if diff > 5 else 1

    def dump_full(self):

        global logger_nlp
        db = dbcon.connect_torndb()
        for cid in iter(dbutil.get_all_company_id(db)):
            try:
                self.get_similar(cid)
                logger_nlp.info('%s processed' % cid)
            except Exception, e:
                logger_nlp.exception('%s failed, %s' % (cid, e))
        db.close()
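A minimal usage sketch (the company id is a placeholder):

ds = DocumentsSimilarity()
# returns [(other_cid, similarity), ...] and caches them in mongo.comps.candidates
print ds.get_similar(618)[:10]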
Example #26
File: feed.py Project: yujiye/Codes
class Feeder(object):
    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()

        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quanlity='low'):

        feeds = {
            'default': self.__feed_default,
            'with_tag': self.__feed_with_tag
        }.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quanlity == 'medium':
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quanlity == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):

        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):

        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):

        return self.wfilter(
            self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):

        pass

    def __feed_with_tag(self, cid):

        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(
                self.db, cid, self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):

        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score

        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(
                    info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount

        # iOS
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):

        # clean and narrow down candidates
        # convert traditional Chinese characters to simplified
        content = hants.translate(unicode(content))
        # convert to lowercase
        content = content.lower()

        return content.strip()
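A minimal usage sketch (the company id is a placeholder; the quanlity spelling follows the signature above):

feeder = Feeder()
cid = 618   # hypothetical company id
weighted_texts = feeder.feed(cid, mode='with_tag', quanlity='medium')   # [(text, weight), ...]
tokens = feeder.feed_seged(cid)                                         # filtered, segmented words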
Example #27
File: feed.py Project: yujiye/Codes
    def __init__(self):

        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
Example #28
class PositionClassifier(object):
    def __init__(self):

        self.segmenter = Segmenter(cut_all=True)
        self.first_mapping = {
            1: u'技术',
            2: u'产品',
            3: u'设计',
            4: u'运营',
            5: u'市场',
            6: u'职能'
        }
        self.first_positions = dict.fromkeys(self.first_mapping.keys())
        self.train_first_positions()

    def train_first_positions(self):

        self.first_positions[1] = set([
            u'工程师', u'技术', u'java', u'python', u'php', u'c++', u'c',
            u'android', u'ios', u'测试', u'web', u'前端', u'数据库', u'ruby', u'perl',
            u'node.js', u'c#', u'go', u'html5', u'flash', u'javascript',
            u'u3d', u'运维', u'网络', u'安全', u'数据仓库', u'dba', u'mysql', u'oracle',
            u'sqlserver', u'sql', u'硬件', u'嵌入式', u'驱动', u'材料', u'开发'
        ])
        self.first_positions[2] = set([
            u'产品',
            u'产品经理',
            u'策划',
        ])
        self.first_positions[3] = set([
            u'设计',
            u'设计师',
            u'游戏',
            u'ui',
            u'ue',
        ])
        self.first_positions[4] = set([
            u'运营',
            u'coo',
            u'编辑',
            u'主编',
            u'文案',
            u'售前',
            u'售后',
            u'客服',
        ])
        self.first_positions[5] = set([
            u'市场', u'销售', u'seo', u'sem', u'商务', u'客户', u'bd', u'公关', u'采购',
            u'物流', u'仓储', u'广告', u'媒介', u'招商', u'推广'
        ])
        self.first_positions[6] = set([
            u'人事', u'hr', u'行政', u'培训', u'绩效', u'前台', u'总助', u'秘书', u'文秘',
            u'财务', u'会计', u'出纳', u'税务', u'审计', u'hrm', u'hrd', u'财务', u'法务',
            u'律师', u'专利', u'招聘'
        ])

    def get_first_positions(self):
        return self.first_positions.keys()

    def classify_first(self, position):

        position = set(map(lambda x: x.lower(), self.segmenter.cut(position)))
        return sorted([(k, len(position & v))
                       for k, v in self.first_positions.items()],
                      key=lambda x: -x[1])[0][0]

    def get_first_name(self, key):

        return self.first_mapping.get(key)
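A minimal usage sketch:

pc = PositionClassifier()
key = pc.classify_first(u'php工程师')   # expected to fall into bucket 1
print pc.get_first_name(key)            # u'技术'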
Example #29
import sys
sys.path.append('..')
sys.path.append('../..')
reload(sys)

import codecs
import torndb
from common.zhtools.segment import Segmenter
from common.classifier.field import FieldClassifier

if __name__ == '__main__':

    sql = 'select dealId,dealname,dealdesc from deal where joinDemoDay=2;'
    db = torndb.Connection('localhost:3306', 'demoday', 'root', '')
    clf = FieldClassifier(model='lr')
    seg = Segmenter()
    # clf.train()

    fo = codecs.open('tmp', 'w', 'utf-8')
    for rid, result in enumerate(db.query(sql)):
        did, doc = result.dealId, result.dealdesc
        try:
            label = clf.naive_classify(seg.cut(doc))
            if label:
                print did, label
                fo.write('%s#%s\n' % (did, label[0]))
        except Exception, e:
            print did, 'fail'
            print e
        # if rid > 40:
        #     break
Example #30
class KeywordExtractor(object):
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'

    def __load_trained_clfs(self):

        model_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                tid = model_file.split('.')[0]
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(
                    os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):

        if not tid:
            return {
                tag.name: (tag.novelty or 1)
                for tag in dbutil.get_tags_by_type(self.db)
            }
        else:
            return {
                tag.id: (tag.novelty or 1)
                for tag in dbutil.get_tags_by_type(self.db)
            }

    def __load_tag_types(self):

        return {
            tag.name: (tag.type or 0)
            for tag in dbutil.get_tags_by_type(self.db)
        }

    def __extract_source_tag(self, cid):

        tags = dbutil.get_source_company_tags(self.db, cid,
                                              self.trusted_sources)
        if tags:
            return set(
                chain(*[
                    dbutil.analyze_source_tag(
                        self.db, tname, self.replacements) for tname in tags
                    if tname and tname.strip()
                ]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important,
                          candidates_vips, topn):

        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        proper_hyponym = dict.fromkeys(
            set(
                chain(*[
                    self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                    for cv in candidates_vips.iterkeys()
                ])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(),
                                  key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):

        desc = ' '.join(
            self.wfilter(
                self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [
            (int(tag.replace(u'__label__', '')), weight)
            for (tag,
                 weight) in self.vip_classifier.predict_proba([desc], 2)[0]
            if weight > self.vip_lower
        ]
        classifier_vips.sort(key=lambda x: -x[1])
        # if 2 candidate vip label, check whether their probability is comparable
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):

        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate,
                                                       0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(
                filter(lambda x: x[1] > self.important_threshold,
                       results.iteritems()))
            if len(results) > self.important_max_count:
                size = min(
                    10,
                    max(int(ceil(len(results) / 2.0)),
                        self.important_max_count))
                results = dict(
                    sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """
        weighted textrank, weights use tags' novelties
        """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(),
                                  key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):

        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [
                    x[0] for x in self.tagger.tag(content)
                    if x[1] == 'itag' or x[0] in self.importants
            ]:
                candidates_important[tag] = candidates_important.get(
                    tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = candidates_important.get(
                tag, 0) + self.source_tag_default_weight

        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):

        if type(tags) is dict:
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):

        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            type_promotion = {
                11011: 1,
                11013: 1.5,
                11012: 2.5
            }.get(self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):

        # weight is a bonus weight
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):

        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(
            cid)
        candidates_vips = self.extract_vip(cid)

        # generate results
        results = dict(
            self.__extract_vecrank(candidates, candidates_important,
                                   candidates_vips, topn))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):

        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [
                    x[0] for x in self.tagger.tag(content)
                    if x[1] == 'itag' or x[0] in self.importants
            ]:
                candidates_important[tag] = candidates_important.get(
                    tag, 0) + weight
        desc = ' '.join(
            self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {
            int(tag.replace(u'__label__', '')): weight
            for (tag,
                 weight) in self.vip_classifier.predict_proba([desc], 3)[0]
            if weight > self.vip_lower
        }
        results = {}
        results = self.merge(
            results, self.__extract_important(candidates,
                                              candidates_important), 1)
        results = self.merge(results,
                             dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):

        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(
                        dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
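
A minimal usage sketch (the company id is a placeholder; the constructor needs the database, mongo and model files referenced above to be reachable):

ke = KeywordExtractor()
# extract returns {tag_name: weight}
print ke.extract(618, topn=15)
print ke.extract_vip(618)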