Example #1
File: feed.py  Project: yujiye/Codes
class NewsFeeder(object):
    def __init__(self):

        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def feed(self, record, granularity='default'):

        global logger_feeder

        try:
            contents = self.wfilter(
                self.seg.cut(record['title'].replace('\n', ' ')))
            if record.get('original_tags', []) and isinstance(
                    record.get('original_tags', []), list):
                contents.extend(record.get('original_tags', []))
            if granularity == 'fine':
                contents.extend(
                    self.wfilter(
                        self.seg.cut4search(' '.join([
                            piece['content'].replace('\n', ' ')
                            for piece in record['contents']
                        ]))))
            else:
                contents.extend(
                    self.wfilter(
                        self.seg.cut(' '.join([
                            piece['content'].replace('\n', ' ')
                            for piece in record['contents']
                        ]))))
            return contents
        except Exception, e:
            logger_feeder.error('Fail to feed, %s, %s' % (record['_id'], e))
            return []
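A minimal usage sketch (not from the repository; the record dict merely illustrates the fields feed() reads: _id, title, original_tags, contents):

feeder = NewsFeeder()
record = {
    '_id': 'example-id',
    'title': u'some news title',
    'original_tags': [u'tag-a', u'tag-b'],
    'contents': [{'content': u'first paragraph'},
                 {'content': u'second paragraph'}],
}
tokens = feeder.feed(record, granularity='fine')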
Example #2
File: w2v.py  Project: yujiye/Codes
class News(object):

    def __init__(self):

        self.mongo = dbcon.connect_mongo()
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def __iter__(self):

        for news in self.mongo.article.news.find({'processStatus': 1}).sort('_id', DESCENDING).limit(200000):
            try:
                content = []
                content.extend(self.wfilter(self.seg.cut4search(news.get('title', ''))))
                for piece in news.get('contents', []):
                    content.extend(self.wfilter(self.seg.cut(piece.get('content', ''))))
                if len(content) > 10:
                    yield content
            except Exception, e:
                continue
        for c in self.db.query('select description from company where verify="Y" and modifyTime>"2016-06-01";'):
            try:
                if len(c.description) > 10:
                    yield self.wfilter(self.seg.cut4search(c.description))
            except:
                continue
Example #3
File: loader.py  Project: yujiye/Codes
def load_data_l1_sources():

    with codecs.open(os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            'config/sector_name'),
                     encoding='utf-8') as f:
        config = {
            int(line.split('#')[0].strip()): line.split('#')[1].split(',')
            for line in f if line.strip()
        }
    db = dbcon.connect_torndb()
    seg = Segmenter()
    trainx, trainy = [], []
    for sid, names in config.iteritems():
        for name in names:
            ids = db.query(
                'select distinct id from source_company where field=%s;', name)
            for scid in ids:
                content = db.query(
                    'select content from source_context '
                    'where sourceCompanyId=%s and char_length(content)>20 and type=30020 '
                    'and confidence>0.7 '
                    'order by confidence desc;', scid.id)
                if len(content) > 0:
                    trainx.append(' '.join(seg.cut(
                        content[0].content.strip())))
                    trainy.append(sid)
    db.close()
    print set(trainy)
    return trainx, np.array(trainy)
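A hedged sketch of consuming this training data with scikit-learn (the vectorizer and model choice are illustrative, not taken from the project):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

trainx, trainy = load_data_l1_sources()
features = TfidfVectorizer().fit_transform(trainx)   # trainx is already space-joined segmented text
clf = LogisticRegression().fit(features, trainy)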
Example #4
def feed_doc(tag=u'金融'):

    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        yield chain(*[
            wfilter(segmenter.cut(piece['content'].strip()))
            for piece in record['contents'] if piece['content'].strip()
        ])
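A short consumption sketch (illustrative): each yield is an itertools.chain over per-paragraph token lists, so it is materialised with list() before use.

for doc in feed_doc(u'金融'):
    tokens = list(doc)
    # ... pass tokens to a downstream model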
Example #5
File: feature.py  Project: yujiye/Codes
def load_data():

    db = dbcon.connect_torndb()
    seg = Segmenter()
    X, Y = [], []
    for item in db.query('select * from source_context;'):
        X.append(' '.join(list(seg.cut(item.content))).strip())
        Y.append(item.type == 30010)
    db.close()
    return X, Y
Example #6
def load_ruled_news():

    global labels

    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []

    mongo = dbcon.connect_mongo()
    for record in mongo.article.news.find({
            '$and': [{
                'category': {
                    '$ne': None
                }
            }, {
                'category': {
                    '$ne': 60199
                }
            }, {
                'category': {
                    '$ne': 60106
                }
            }],
            'type':
            60001,
            'category_confidence':
            None
    }).limit(10000):
        contents = wfilter(seg.cut(record['title']))
        contents.extend(
            wfilter(
                seg.cut(' '.join(
                    [piece['content'] for piece in record['contents']]))))
        if len(contents) > 10:
            trainx.append(' '.join(contents))
            trainy.append(int(labels.get(record['category'])))
    mongo.close()

    return np.array(trainx), np.array(trainy)
Example #7
class Companies(object):
    def __init__(self):

        self.data_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            '../data/tsb/company/ltp_cut')
        self.segmenter = Segmenter()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):

        global description_len_threshold, complete_threshold
        # db = torndb.Connection(**nlpconfig.get_mysql_config_tshbao())
        db = dbcon.connect_torndb()
        index = 0
        for result in iter(dbutil.get_all_company(db)):
            cid, desc = result.get('id'), result.get('context', '')
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            if not os.path.exists(os.path.join(self.data_dir, str(cid))):
                words = list(self.segmenter.cut(desc))
            else:
                words = [
                    line.split('\t')[0].strip()
                    for line in codecs.open(os.path.join(
                        self.data_dir, str(cid)),
                                            encoding='utf-8') if line.strip()
                ]
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue

            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
Example #8
File: w2v.py  Project: yujiye/Codes
class SourceCompany(object):

    def __init__(self, size_limit=None):

        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.size_limit = size_limit

    def __iter__(self):

        if not self.size_limit:
            sql2use = 'select * from source_company where active is null or active="Y";'
        else:
            sql2use = 'select * from source_company where active is null or active="Y" ' \
                      'order by rand() limit %s;' % self.size_limit
        for result in self.db.iter(sql2use):
            content = []
            if result.brief and result.brief.strip():
                content.extend(self.wfilter(self.seg.cut(result.brief)))
            if result.description and result.description.strip():
                content.extend(self.wfilter(self.seg.cut(result.description.strip())))
            if len(content) > 10:
                yield content
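A hedged sketch of how this iterable corpus could drive gensim word2vec training (hyper-parameters and output path are illustrative, not taken from the project):

from gensim.models import Word2Vec

sentences = SourceCompany(size_limit=100000)
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save('w2v.source_company.model')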
Example #9
class Companies(object):
    def __init__(self):

        self.segmenter = Segmenter()
        self.feeder = Feeder()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):

        global description_len_threshold, complete_threshold
        db = dbcon.connect_torndb()
        index = 0
        for cid in iter(dbutil.get_all_company_id(db)):
            contents = self.feeder.feed_string(cid)
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            words = list(self.segmenter.cut(contents))
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue

            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
Example #10
File: loader.py  Project: yujiye/Codes
def load_data_l1():

    db = dbcon.connect_torndb()
    seg = Segmenter()
    # tfidf = TfIdfExtractor()
    trainx, trainy = [], []
    results = db.query(
        'select company_sector.companyId, company_sector.sectorId from company_sector, sector '
        'where company_sector.verify="Y" and sector.id=company_sector.sectorId and sector.level=1 '
    )
    # 'and sector.id not in (6, 9, 10, 12, 13, 15, 16, 17, 18, 19, 999);')
    for result in results:
        desc = db.get('select description from company where id=%s',
                      result.companyId)
        sid = result.sectorId
        if desc and desc.description.strip():
            # trainx.append(desc.strip())
            trainx.append(' '.join(seg.cut(desc.description.strip())))
            trainy.append(int(sid))
    # trainx, trainy = tfidf.train(trainx, trainy)
    db.close()
    return trainx, np.array(trainy)
Example #11
File: feature.py  Project: yujiye/Codes
class TfIdfExtractor(FeatureExtractor):
    def __init__(self, opt=None):

        if not isinstance(opt, dict):
            opt = {}

        if opt.get('segmenter'):
            self.seg = opt.get('segmenter')
        else:
            self.seg = Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))

    def train(self, docs, labels, seged=False):

        trainset = self.vectorizer.fit_transform(self.iter_docs(docs, seged))
        # print len(self.vectorizer.get_feature_names())
        # trainset = self.selector.fit_transform(trainset, labels)
        return trainset, labels

    def transform(self, docs, seged=False):

        return self.vectorizer.transform(self.iter_docs(docs, seged))
        # return self.selector.transform(self.vectorizer.transform(self.iter_docs(docs, seged)))

    def iter_docs(self, docs, seged):

        for doc in docs:
            if not seged:
                yield ' '.join(self.seg.cut(doc))
            else:
                yield doc
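A minimal usage sketch, assuming docs is a list of raw (unsegmented) documents, labels the matching class labels, and new_docs further raw documents; the option values are illustrative:

extractor = TfIdfExtractor({'max_df': 0.6, 'min_df': 20, 'topk': 2000})
trainset, labels = extractor.train(docs, labels)   # fits the tf-idf vocabulary
features = extractor.transform(new_docs)           # reuses the fitted vocabulary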
Example #12
def dump_thesaurus(theme='source', topn=1000):

    db = dbcon.connect_torndb()
    seg = Segmenter()
    stopwords = stopword.get_standard_stopwords()
    tags = set(x.name for x in db.query('select name from tag where type>11001;'))
    vocab = {}

    if theme == 'source':
        query = 'select * from source_company where (active is null or active="Y");'
    else:
        query = 'select * from source_company where (active is null or active="Y");'

    for index, item in enumerate(db.iter(query)):
        for word in set(filter(lambda x: x not in stopwords and
                        len(x) > 1 and not x.isnumeric() and x not in tags and x.strip(), seg.cut(item.description))):
            vocab[word] = vocab.get(word, 0) + 1
        if index % 10000 == 0:
            low = [x[0] for x in vocab.iteritems() if x[1] < 20]
            for lowword in low:
                vocab.pop(lowword)
            print index, 'processed, size of vocab', len(vocab)
    db.close()

    vocab = sorted(vocab.iteritems(), key=lambda x: x[1], reverse=True)[:topn]
    with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                  'thesaurus/%s.%s.lowidf' % (theme, topn)), 'w', 'utf-8') as fo:
        fo.write('\n'.join([x[0] for x in vocab]))
Example #13
File: demoday.py  Project: yujiye/Codes
import sys
reload(sys)

import codecs
import torndb
from common.zhtools.segment import Segmenter
from common.classifier.field import FieldClassifier

if __name__ == '__main__':

    sql = 'select dealId,dealname,dealdesc from deal where joinDemoDay=2;'
    db = torndb.Connection('localhost:3306', 'demoday', 'root', '')
    clf = FieldClassifier(model='lr')
    seg = Segmenter()
    # clf.train()

    fo = codecs.open('tmp', 'w', 'utf-8')
    for rid, result in enumerate(db.query(sql)):
        did, doc = result.dealId, result.dealdesc
        try:
            label = clf.naive_classify(seg.cut(doc))
            if label:
                print did, label
                fo.write('%s#%s\n' % (did, label[0]))
        except Exception, e:
            print did, 'fail'
            print e
        # if rid > 40:
        #     break

    fo.close()
    db.close()
Example #14
                                         (record['_id'], str(cids)))
                    life_circle_linker -= 1
                except Exception, e:
                    logger_news_pip.exception('mentioned failed, %s, %s' %
                                              (record['_id'], e))
                    mongo.article.news.update({'_id': record['_id']},
                                              {'$set': {
                                                  'processStatus': -2
                                              }})
                    continue

            # process category
            try:
                if (record.get('category', None) is None) and (record.get(
                        'type', 0) == 60001):
                    contents = wfilter(seg.cut(record['title']))
                    contents.extend(
                        wfilter(
                            seg.cut(' '.join([
                                piece['content']
                                for piece in record['contents']
                            ]))))
                    contents = np.array(
                        list(
                            vocab_processor.fit_transform(
                                np.array([' '.join(contents)]))))
                    label = clf.predict(contents)[0]
                    prob = clf.predict_proba(contents)[0][label]
                    category = labels_reverse.get(label)
                    if prob < 0.7:
                        category = 60199
Example #15
class DocumentsSimilarity(object):
    """
    tfidf model based document similarity
    """
    def __init__(self):

        self.life_period = 1000
        self.num_candidates = 800
        self.min_similarity_threshold = 0.05
        self.establish_discount = 0.75

        self.dictionary = self.get_dict()
        self.id2in, self.in2id, self.corpus, self.max_id = self.get_corpus(
            self.dictionary)
        self.model, self.simi = self.train_model()

        self.segmenter = Segmenter()
        self.filter = Filter()
        self.feeder = Feeder()
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

    def train_model(self):

        global cach_dir
        if not os.path.exists(cach_dir):
            os.mkdir(cach_dir)
        tfidf = models.TfidfModel(self.corpus)
        index = similarities.MatrixSimilarity(tfidf[self.corpus],
                                              num_best=self.num_candidates)
        return tfidf, index

    @classmethod
    def get_corpus(cls, dictionary):

        global logger_nlp, cach_dir
        companies = CompaniesVector(dictionary)
        fname = os.path.join(
            cach_dir, '%s.%s.corpus' %
            (datetime.datetime.now().strftime('%Y%m%d'), randint(0, 3600)))
        corpora.MmCorpus.serialize(fname, companies)
        logger_nlp.info('Corpus serialized')
        return companies.get_mapping_id2in(), companies.get_mapping_in2id(
        ), corpora.MmCorpus(fname), companies.max_id

    @classmethod
    def get_dict(cls):

        global stopwords, df_threshold_lower, df_threshold_upper, logger_nlp, cach_dir
        dates = datetime.datetime.now().strftime('%Y%m%d')
        # build the cache path once so the existence check, load and save refer to the same file
        dict_path = os.path.join(cach_dir,
                                 '%s.%s.dict' % (dates, randint(0, 3600)))
        if os.path.exists(dict_path):
            try:
                dictionary = corpora.Dictionary.load(dict_path)
                logger_nlp.info('Found dictionary file, loaded')
                return dictionary
            except:
                logger_nlp.error(
                    'Found dictionary file, fail to load, try to rebuild')
                pass
        companies = Companies()
        dictionary = corpora.Dictionary(company for company in companies)
        stop_ids = [
            dictionary.token2id[word] for word in stopwords
            if word in dictionary.token2id
        ]
        low_df = [
            tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
            if docfreq <= df_threshold_lower
        ]
        high_df = [
            tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
            if docfreq > df_threshold_upper
        ]
        dictionary.filter_tokens(stop_ids + low_df + high_df)
        dictionary.compactify()
        dictionary.save(dict_path)
        logger_nlp.info('Dictionary constructed, size %s' %
                        len(dictionary.token2id))
        return dictionary

    def get_similar(self, cid):

        global simi_threshold, complete_threshold

        # pooling
        if cid in self.id2in:
            vec = self.model[self.corpus[self.id2in[cid]]]
            simis = sorted(self.simi[vec],
                           key=lambda x: -x[1])[1:self.num_candidates]
            simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        else:
            simis = self.get_similar4new(cid)

        # discount
        establish = dbutil.get_company_establish_date(self.db, cid).year
        simis = [(cid2, weight * self.__discount_year(establish, cid2))
                 for (cid2, weight) in simis]

        # sort and filter
        simis = sorted(simis, key=lambda x: -x[1])
        simis = filter(
            lambda x: dbutil.get_company_score(self.db, x[0]) >
            complete_threshold and x[1] > self.min_similarity_threshold, simis)

        # dump and exit
        self.mongo.comps.candidates.update({'company': cid}, {
            '$set': {
                'candidates': simis,
                'modifyTime': datetime.datetime.now()
            }
        }, True)
        return simis

    def get_similar4new(self, cid):

        global logger_nlp
        # reload the model when life_period reaches 0, i.e. after processing 1000 new companies (see __init__)
        if int(cid) > self.max_id:
            self.life_period -= 1
        if self.life_period == 0:
            logger_nlp.info('Reload recommend program')
            self.__init__()

        content = self.feeder.feed_string(cid)
        words = self.filter.filtermany(self.segmenter.cut(content))
        vec = self.model[self.dictionary.doc2bow(words, allow_update=True)]
        simis = sorted(self.simi[vec],
                       key=lambda x: -x[1])[1:self.num_candidates]
        simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        return simis

    def __discount_year(self, establish, cid2):

        diff = abs(
            dbutil.get_company_establish_date(self.db, cid2).year - establish)
        return self.establish_discount if diff > 5 else 1

    def dump_full(self):

        global logger_nlp
        db = dbcon.connect_torndb()
        for cid in iter(dbutil.get_all_company_id(db)):
            try:
                self.get_similar(cid)
                logger_nlp.info('%s processed' % cid)
            except Exception, e:
                logger_nlp.exception('%s failed, %s' % (cid, e))
        db.close()
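A minimal end-to-end sketch (the company id is illustrative): build the model once, then query, or batch-dump, similar-company candidates.

ds = DocumentsSimilarity()
print ds.get_similar(12345)[:10]   # top candidates for one company
# ds.dump_full()                   # recompute candidates for every company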
Example #16
File: field.py  Project: yujiye/Codes
                     ','.join(map(lambda x: str(x), labels.values())))
            fo.write('@DATA \n')
            for i in xrange(len(y)):
                fo.write('%s,%s\n' %
                         (','.join([str(item)
                                    for item in x[i]]), labels.get(y[i])))


def weighted_choice(choices):

    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w
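A quick illustration of weighted_choice: items are drawn with probability proportional to their weights.

choices = [('tech', 0.7), ('product', 0.2), ('design', 0.1)]
print weighted_choice(choices)   # returns 'tech' roughly 70% of the time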


if __name__ == '__main__':

    print __file__

    # upsample('template/fields.data')
    # scatter_sample('weka/field.train.arff')
    fc = FieldClassifier()
    s = Segmenter()
    c = u'通过贴图让用户简单地画漫画,并用漫画沟通、社交。网站上线1年,ipad端7月3日上线。IPAD版上线一周积累20万用户,第一周有11.000多幅漫画上传。'
    print fc.naive_classify(s.cut(c))
    # fc.build_labeled_corpus()
    # fc.train('template/fields.1.data')
Example #17
File: feed.py  Project: yujiye/Codes
class Feeder(object):
    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()

        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quanlity='low'):

        feeds = {
            'default': self.__feed_default,
            'with_tag': self.__feed_with_tag
        }.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quanlity == 'medium':
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quanlity == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):

        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):

        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):

        return self.wfilter(
            self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):

        pass

    def __feed_with_tag(self, cid):

        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(
                self.db, cid, self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):

        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score

        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(
                    info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount

        # iOS
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):

        # clean and narrow down candidates
        # convert traditional Chinese to simplified Chinese
        content = hants.translate(unicode(content))
        # convert to lowercase
        content = content.lower()

        return content.strip()
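A minimal usage sketch (the company id is illustrative): feed() yields (text, weight) pairs, feed_seged() a filtered token list.

feeder = Feeder()
weighted_texts = feeder.feed(1024, mode='with_tag', quanlity='medium')
tokens = feeder.feed_seged(1024)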
Example #18
class PositionClassifier(object):
    def __init__(self):

        self.segmenter = Segmenter(cut_all=True)
        self.first_mapping = {
            1: u'技术',
            2: u'产品',
            3: u'设计',
            4: u'运营',
            5: u'市场',
            6: u'职能'
        }
        self.first_positions = dict.fromkeys(self.first_mapping.keys())
        self.train_first_positions()

    def train_first_positions(self):

        self.first_positions[1] = set([
            u'工程师', u'技术', u'java', u'python', u'php', u'c++', u'c',
            u'android', u'ios', u'测试', u'web', u'前端', u'数据库', u'ruby', u'perl',
            u'node.js', u'c#', u'go', u'html5', u'flash', u'javascript',
            u'u3d', u'运维', u'网络', u'安全', u'数据仓库', u'dba', u'mysql', u'oracle',
            u'sqlserver', u'sql', u'硬件', u'嵌入式', u'驱动', u'材料', u'开发'
        ])
        self.first_positions[2] = set([
            u'产品',
            u'产品经理',
            u'策划',
        ])
        self.first_positions[3] = set([
            u'设计',
            u'设计师',
            u'游戏',
            u'ui',
            u'ue',
        ])
        self.first_positions[4] = set([
            u'运营',
            u'coo',
            u'编辑',
            u'主编',
            u'文案',
            u'售前',
            u'售后',
            u'客服',
        ])
        self.first_positions[5] = set([
            u'市场', u'销售', u'seo', u'sem', u'商务', u'客户', u'bd', u'公关', u'采购',
            u'物流', u'仓储', u'广告', u'媒介', u'招商', u'推广'
        ])
        self.first_positions[6] = set([
            u'人事', u'hr', u'行政', u'培训', u'绩效', u'前台', u'总助', u'秘书', u'文秘',
            u'财务', u'会计', u'出纳', u'税务', u'审计', u'hrm', u'hrd', u'财务', u'法务',
            u'律师', u'专利', u'招聘'
        ])

    def get_first_positions(self):
        return self.first_positions.keys()

    def classify_first(self, position):

        position = set(map(lambda x: x.lower(), self.segmenter.cut(position)))
        return sorted([(k, len(position & v))
                       for k, v in self.first_positions.items()],
                      key=lambda x: -x[1])[0][0]

    def get_first_name(self, key):

        return self.first_mapping.get(key)
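A minimal classification sketch (the job title is illustrative), assuming the full-mode segmenter emits tokens such as u'java' and u'工程师':

clf = PositionClassifier()
key = clf.classify_first(u'资深java工程师')
print clf.get_first_name(key)   # u'技术'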