Exemplo n.º 1
0
def dump_data():
    """Dump labelled news articles into train/test text files.

    Reads ``files/sector.20171221.mapping`` (one ``tag#relevant1,relevant2``
    entry per line) to build a map from each relevant tag id to the id of the
    tag before the ``#``. Every processed, user-modified news document with
    sectors is then labelled through that map and written as
    ``__label__<id> ... <contents>`` to ``tmp/20171221.fine.train`` or, for
    roughly one document in five (``randint(1, 5) == 1``), to
    ``tmp/20171221.fine.test``.

    Documents are skipped when they map to no label, to more than three
    labels, or when their fed contents are shorter than 50 characters.
    """
    global logger_nf
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    nf = NewsFeeder()

    # relevant tag id -> id of the tag it is mapped onto
    mapping = {}
    with codecs.open('files/sector.20171221.mapping', encoding='utf-8') as fmap:
        for line in fmap:
            fields = line.strip().split('#')
            tag, relevants = fields[0], fields[1].split(',')
            tag = dbutil.get_tag_id(db, tag)[0]
            for relevant in relevants:
                mapping[dbutil.get_tag_id(db, relevant)[0]] = tag

    # sector id -> tag id, for sectors that carry a tag
    sectors = {
        s.id: s.tagId
        for s in db.query('select * from sector where tagId is not null;')
    }

    count = 0
    # Context managers guarantee both dumps are flushed and closed even if
    # feeding or writing raises midway.
    with codecs.open('tmp/20171221.fine.train', 'w', 'utf-8') as ftrain, \
            codecs.open('tmp/20171221.fine.test', 'w', 'utf-8') as ftest:
        for news in mongo.article.news.find({
                'processStatus': 1,
                'modifyUser': {
                    '$ne': None
                },
                'sectors': {
                    '$ne': None
                }
        }):
            # Sector 999 is excluded; unknown sector ids become -1.
            sector_tags = [sectors.get(s, -1)
                           for s in news.get('sectors', []) if s != 999]
            labels = [t for t in sector_tags if t in mapping]
            labels.extend(
                t for t in news.get('features', []) if t in mapping)
            labels = set(mapping.get(t) for t in labels)
            if not labels or len(labels) > 3:
                continue
            labels = ' '.join('__label__%s' % t for t in labels)
            contents = ' '.join(nf.feed(news,
                                        granularity='fine')).replace('\n', ' ')
            # Also covers the empty-contents case.
            if len(contents) < 50:
                continue
            count += 1
            if count % 10000 == 0:
                logger_nf.info('Dumping file, %s done', count)
            target = ftest if randint(1, 5) == 1 else ftrain
            target.write('%s %s\n' % (labels, contents))
    logger_nf.info('All news dumped')
Exemplo n.º 2
0
def comps():
    """Return one page of comparable companies for the requested company.

    The request body is JSON with a ``payload`` object holding ``company``,
    and optionally ``tag`` (default 0), ``start`` (default 0) and ``size``
    (default 5). Without a tag, all comps of the company are used and the
    response also carries ``tags`` from ``dbutil.prompt_tag_filter``; with a
    tag, only comps filtered by that tag are used.

    Returns a Flask response whose JSON is
    ``{'company': {'count': ..., 'data': [{'id': code}, ...][, 'tags': ...]}}``.
    """
    global logger_comps
    query = json.loads(request.data).get('payload')
    logger_comps.info('Comps Query, %s' % query)
    cid = query.get('company')
    tag = query.get('tag', 0)
    start = query.get('start', 0)
    size = query.get('size', 5)

    unfiltered = tag == 0
    if unfiltered:
        comps_candidates = dbutil.get_company_comps(g.db, cid)
        logger_comps.info(comps_candidates)
    else:
        tag = dbutil.get_tag_id(g.db, tag)[0]
        comps_candidates = dbutil.get_filtered_company_comps(g.db, cid, tag)

    company = {'count': len(comps_candidates)}
    # Slice first, then resolve codes: only the requested page triggers a
    # company-code lookup, instead of every candidate.
    page = list(comps_candidates)[start:start + size]
    company['data'] = [{'id': dbutil.get_company_code(g.db, c)} for c in page]
    if unfiltered:
        company['tags'] = dbutil.prompt_tag_filter(g.db, comps_candidates)
    return make_response(jsonify({'company': company}))
Exemplo n.º 3
0
Arquivo: key.py Projeto: yujiye/Codes
    def extract_without_update(self, cid, topn=15):
        """
        Compute the tag ids for company `cid` from its fed contents.

        Important, textrank and verified tags are merged, normalized and
        passed through the replacement normalization. The result seeds
        `self.update_vip_tags`, whose returned ids start the output list;
        the `topn` heaviest remaining tags that are active and not VIP tags
        are appended to it.

        :param cid: company id
        :param topn: number of textrank tags requested and of top-weighted
            tags considered for the output
        :return: list of tag ids
        """
        contents = list(self.feeder.feed(cid))
        if len(contents) > self.max_contents_length:
            # keep the entries with the largest second field
            by_weight = sorted(contents, key=lambda item: -item[1])
            contents = by_weight[:self.max_contents_length]

        # important tags first, then regular (textrank) tags, then verified
        merged = self.merge({}, self.__extract_important(contents), 1)
        merged = self.merge(merged, dict(self.__extract_textrank(contents, topn)))
        verified = dbutil.get_company_tags_verified(self.db, cid)
        merged = self.merge(merged, dict.fromkeys(verified, 1))

        # normalize, then apply the contents-based replacements
        merged = self.__normalize(merged)
        merged = self.__normalize_replacement(merged)

        new_tags = self.update_vip_tags(cid, merged, [])
        ranked = sorted(merged.items(), key=lambda pair: -pair[1])
        for tag, _weight in ranked[:topn]:
            tid, active = dbutil.get_tag_id(self.db, tag)
            if active and tid not in self.vip_tags.values():
                new_tags.append(tid)
        return new_tags
Exemplo n.º 4
0
def load_blockchain():
    """Rebuild the tag hierarchy under tag 175747 from ``files/blockchain``.

    Steps, in order:
      1. delete ``tags_rel`` row 447283 and every relation whose parent is
         tag 175747, resetting the former children (relation type 54041) to
         ``type=11010, sectorType=null``;
      2. read ``files/blockchain``: a line with one token marks that tag as
         ``type=11013, sectorType=2`` and links it under 175747; a line with
         two tokens marks the second tag as ``type=11013, sectorType=3`` and
         links it under the first;
      3. mark tag 175747 as ``sectorType=1, type=11012`` and insert the
         matching ``sector`` row.

    Lines with any other number of tokens are ignored.
    """
    db = dbcon.connect_torndb()
    db.execute('delete from tags_rel where id=447283;')
    t3s = [t.tag2Id for t in db.query('select tag2Id from tags_rel where tagId=175747 and type=54041;')]
    db.execute('delete from tags_rel where tagId=175747;')
    # Skip the reset when there were no children: an empty list would render
    # as an empty ``in ()`` clause.
    if t3s:
        db.execute('update tag set type=11010, sectorType=null where id in %s;', t3s)
    # Context manager so the input file is closed even if a DB call fails.
    with codecs.open('files/blockchain', encoding='utf-8') as fin:
        for line in fin:
            tags = line.strip().split()
            if len(tags) == 1:
                t2id = dbutil.get_tag_id(db, tags[0])[0]
                db.execute('update tag set type=11013, sectorType=2 where name=%s;', tags[0])
                dbutil.update_tags_rel(db, 175747, t2id, 1, 54041)
            elif len(tags) == 2:
                t2id = dbutil.get_tag_id(db, tags[0])[0]
                t3id = dbutil.get_tag_id(db, tags[1])[0]
                db.execute('update tag set type=11013, sectorType=3 where name=%s;', tags[1])
                dbutil.update_tags_rel(db, t2id, t3id, 1, 54041)
    db.execute('update tag set sectorType=1, type=11012 where id=175747;')
    db.execute('insert into sector (sectorName, active, level, tagid, createtime) '
               'values ("区块链", "Y", 1, 175747, now());')
Exemplo n.º 5
0
Arquivo: key.py Projeto: yujiye/Codes
    def update_vip_tags(self, cid, support_tags, source_tags):
        """
        Choose the VIP tags of company `cid`, store them and return their ids.

        Three signals are combined:
          * support: VIP tags found among `support_tags`, boosted by how many
            of their hyponyms are also support tags;
          * source: VIP tags found among `source_tags`;
          * classifier: labels predicted by `self.vip_classifier` on the
            segmented company description.

        :param cid: company id
        :param support_tags: mapping of tag name -> weight
        :param source_tags: iterable of tag names
        :return: `vips.keys()`, the selected VIP tag ids (before replacement)
        """

        vips = {}
        # ids of every support tag, used to count hyponym overlap below
        support_tag_ids = set(dbutil.get_tag_id(self.db, tag)[0] for tag in support_tags)
        # VIP id -> weight, for support tags that are keys of self.vip_tags
        support_vips = {self.vip_tags.get(tag): weight
                        for tag, weight in support_tags.items() if tag in self.vip_tags.keys()}
        # add one point per hyponym of the VIP tag that is itself a support tag
        # (only values of existing keys are reassigned during the iteration)
        for support_vip, support_weight in support_vips.iteritems():
            hyponyms = dbutil.get_hyponym_tags(self.db, support_vip)
            support_vips[support_vip] = support_weight + len(set(hyponyms) & support_tag_ids)
        source_vips = [self.vip_tags.get(tag) for tag in source_tags if tag in self.vip_tags.keys()]
        # desc = ' '.join(self.wfilter([x[0] for x in self.tagger.tag(self.feeder.feed_string(cid, 'with_tag'))]))
        # space-joined, filtered segmentation of the company text
        desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            # fallback text when nothing survives the filter
            desc = u'其他'
        # print desc
        # label id -> weight for the 3 requested predictions above self.vip_lower
        classifier_vips = {int(tag.replace(u'__label__', '')): weight for (tag, weight) in
                           self.vip_classifier.predict_proba([desc], 3)[0] if weight > self.vip_lower}
        # single (label, weight) prediction of the second classifier
        traditional = self.traditional_classifier.predict_proba([desc], 1)[0][0]
        if source_vips:
            # source VIPs confirmed by the classifier, best combined weight
            # first; weights step down from 2.999 by 0.1 per rank
            for rank, vip in enumerate(sorted([t for t in source_vips if t in classifier_vips],
                                              key=lambda x: support_vips.get(x, 0)+classifier_vips.get(x, 0),
                                              reverse=True)):
                vips[vip] = 2.999 - round(rank/10.0, 1)
        if traditional[0].replace('__label__', '') == '1' and traditional[1] > 0.6:
            # label '1' with weight above 0.6 assigns tag 604330
            vips[604330] = round(2 + traditional[1], 2)
        elif len(vips) > 1:
            # more than one source-confirmed VIP already: nothing to add
            pass
        else:
            # union of support and classifier VIPs, largest weight product first
            # NOTE(review): the sort key defaults (0.1 / 0.01) differ from the
            # 0.01 defaults used when scoring below - confirm this is intended
            vip_candidates = sorted((set(support_vips.keys()) | set(classifier_vips.keys())),
                                    key=lambda x: -support_vips.get(x, 0.1)*0.1*classifier_vips.get(x, 0.01))
            if len(vip_candidates) == 0:
                pass
            elif len(vip_candidates) == 1:
                # a lone candidate is always accepted, with weight at least 2.9
                vip = vip_candidates[0]
                vips[vip] = max(2.9, round(2 + support_vips.get(vip, 0.01) * classifier_vips.get(vip, 0.01), 2))
            else:
                for rank, vip in enumerate(vip_candidates):
                    # print rank, vip, support_vips.get(vip, 0.01), classifier_vips.get(vip, 0.01)
                    # support weight counts fully for rank 0, 0.3 for rank 1,
                    # 0.2 for every later rank
                    rank_discount = {0: 1,
                                     1: 0.3}.get(rank, 0.2)
                    if support_vips.get(vip, 0.01)*rank_discount + classifier_vips.get(vip, 0.01) > self.vip_threshold:
                        vips[vip] = round(max(2.9-rank*0.01,
                                              2+support_vips.get(vip, 0.01)*classifier_vips.get(vip, 0.01)), 2)
        # persist: a tag with replacements is stored as its replacement ids
        for tid, weight in vips.items():
            if self.replacements.get(tid):
                for rtid in self.replacements.get(tid, []):
                    dbutil.update_company_tag(self.db, cid, rtid, weight)
            else:
                dbutil.update_company_tag(self.db, cid, tid, weight)
            # print dbutil.get_tag_info(self.db, tid, 'name'), tid, weight
        return vips.keys()
Exemplo n.º 6
0
    def __deduct_2nd(self, tags):
        """
        Deduce intermediate tags that connect the given tags.

        For every given tag whose `self.tag_types` entry is 11013, each of
        its hypernyms (queried with argument 1) that is also among the given
        tags is inspected: tags that are both a hyponym of that hypernym and
        a hypernym of the original tag (both queried with argument 2), and
        that are not already given, are collected.

        :param tags: mapping keyed by tag name
        :return: dict of deduced tag name -> 2.49
        """
        id_name_pairs = [(dbutil.get_tag_id(self.db, name)[0], name) for name in tags.keys()]
        given_ids = set(tid for tid, _name in id_name_pairs)
        deduced = []
        for tid, name in id_name_pairs:
            if self.tag_types.get(name, 0) != 11013:
                continue
            parents = set(dbutil.get_hypernym_tags(self.db, tid, 1))
            for t1 in parents & given_ids:
                below_parent = set(dbutil.get_hyponym_tags(self.db, t1, 2))
                above_tag = set(dbutil.get_hypernym_tags(self.db, tid, 2))
                deduced.extend(t2 for t2 in below_parent & above_tag
                               if t2 not in given_ids)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduced}
Exemplo n.º 7
0
Arquivo: rank.py Projeto: yujiye/Codes
def score():
    """Write a tab-separated ranking sheet for a fixed list of topic tags.

    For each topic, up to 100 companies are collected from each of three
    queries (by type-37010 score, by number of type-11100 tags, by number of
    messages published after 2018-02-01). One row per distinct company is
    written to ``dumps/rank`` with the columns: tag, name, brief, url, s1
    (type-37010 score, set to 1 when >= 0.5), s2 (yellow-tag based), s3
    (log10 of the message count, divided by 4) and s4 (confidence of the
    company/tag relation).
    """
    db = dbcon.connect_torndb()
    topics = [u'大数据', u'小程序', u'短视频', u'民宿', u'足球', u'咖啡']
    with codecs.open('dumps/rank', 'w', 'utf-8') as fo:
        for tag in topics:
            tid = dbutil.get_tag_id(db, tag)[0]
            by_score = db.query(
                'select rel.companyId cid from company_tag_rel rel, company_scores s '
                'where (rel.active="Y" or rel.active is null) and rel.companyId=s.companyId '
                'and s.type=37010 and tagId=%s order by score desc limit 100;',
                tid)
            by_yellow = db.query(
                'select companyId cid, count(*) c from company_tag_rel rel, tag '
                'where tag.id=tagId and tag.type=11100 and (tag.active is null or tag.active="Y") '
                'and (rel.active="Y" or rel.active is null) and companyId in '
                '(select distinct companyId from company_tag_rel where tagId=%s '
                'and (active is null or active="Y")) group by companyId order by c desc limit 100;',
                tid)
            by_message = db.query(
                'select msg.companyId cid, count(*) c from company_message msg, company_tag_rel rel '
                'where msg.active="Y" and msg.companyId=rel.companyId and msg.publishTime>"2018-02-01" '
                'and rel.tagId=%s and (rel.active="Y" or rel.active is null) group by msg.companyId '
                'order by c desc limit 100;', tid)

            # distinct companies across the three result sets
            collected = []
            for rows in (by_score, by_yellow, by_message):
                collected.extend([row.cid for row in rows])

            for cid in set(collected):
                name = dbutil.get_company_name(db, cid)
                brief = dbutil.get_company_brief(db, cid)
                code = dbutil.get_company_code(db, cid)
                url = 'http://www.xiniudata.com/#/company/%s/overview' % code

                s1 = dbutil.get_company_score(db, cid, 37010)
                if s1 >= 0.5:
                    s1 = 1

                yellow_count = len(dbutil.get_company_tags_yellow(db, cid, False))
                deduction = dbutil.get_company_yellow_time_deduction(db, cid)
                # NOTE(review): `/ 9` follows the module's division semantics
                # (floor division on ints under plain Python 2) - confirm.
                s2 = (yellow_count + 1 - deduction) / 9

                message_count = len(
                    dbutil.get_company_messages(db, cid, 'Y', '2018-02-01'))
                s3 = (log10(message_count + 1)) / 4

                relation = db.get(
                    'select confidence from company_tag_rel where companyId=%s and tagId=%s;',
                    cid, tid)
                s4 = relation.confidence

                columns = (tag, name, brief, url, s1, round(s2, 2),
                           round(s3, 2), round(s4, 2))
                fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % columns)
Exemplo n.º 8
0
Arquivo: key.py Projeto: yujiye/Codes
    def update_contents_tags(self, cid, tags, source_tags, vips, topn):
        """
        Normalize contents-based tags and update MySQL.

        The `topn` heaviest entries of `tags` are written for company `cid`.
        Tags also present in `source_tags` get a 0.009 weight bonus, VIP tags
        are skipped, and a tag that has replacements is stored as its
        (non-VIP) replacement ids instead of itself. Afterwards, when the
        segmented contents contain u'区块链', every classifier in
        `self.trained_tag_clfs` that predicts 1 on the mean word vector adds
        its tag with weight 2.806.

        :param cid: company id
        :param tags: mapping of tag name -> weight
        :param source_tags: container of tag names that receive the bonus
        :param vips: not used by the visible code
        :param topn: number of top-weighted tags to process
        """

        # NOTE(review): neither old_tags nor new_tags is read again in the
        # visible code; the lookup is kept in case the query matters.
        old_tags = dbutil.get_company_tags_old(self.db, cid)
        new_tags = []
        for tag, weight in sorted(tags.items(), key=lambda x: -x[1])[:topn]:
            tid, active = dbutil.get_tag_id(self.db, tag)
            if tag in source_tags:
                weight += 0.009
            if tid in self.vip_tags.values():
                continue
            replacement_ids = self.replacements.get(tid)
            if replacement_ids:
                # A replaced tag is never recorded itself. The former
                # append-then-remove raised ValueError whenever the tag had
                # not been appended (falsy `active`).
                for rtid in replacement_ids:
                    if rtid in self.vip_tags.values():
                        continue
                    dbutil.update_company_tag(self.db, cid, rtid, weight, active=active)
                    new_tags.append(rtid)
            else:
                if active:
                    new_tags.append(tid)
                dbutil.update_company_tag(self.db, cid, tid, weight, active=active)
        # add classified tags
        try:
            content = list(self.feeder.feed_seged(cid))
            if u'区块链' in content:
                # one sample: mean of the vectors of the words known to w2v
                content = [np.mean([self.w2v[w] for w in content if w in self.w2v], axis=0)]
                for tid, clf in self.trained_tag_clfs.items():
                    if clf.predict(content)[0] == 1:
                        dbutil.update_company_tag(self.db, cid, tid, 2.806, verify='N', active='Y')
                        new_tags.append(tid)
        except Exception as e:
            # best effort: a classification failure must not abort the update
            logger_tag.exception('Fail to classify, due to %s', e)
Exemplo n.º 9
0
    def __search_ranklist(self, **kwargs):
        """
        Search the rank list of the first tag given in the query filter.

        Recognized keyword arguments: `start` (default 0), `size` (default
        10, capped at `self.max_result_size`), `sort` (default 76001),
        `order` (default 'default'), `input` and `filter`; the tag is read
        from `filter['tag']`.

        :return: {"company": {"count": ..., "data": ..., "sectors": ...}};
            count 0 with empty lists when the filter carries no tag
        """

        def _response(count, data, sectors):
            return {"company": {"count": count, "data": data, 'sectors': sectors}}

        query = dict(kwargs)
        start = query.get('start', 0)
        size = min(query.get('size', 10), self.max_result_size)
        sort = query.get('sort', 76001)
        order = query.get('order', 'default')

        tags = query.get('filter', {}).get('tag')
        if not tags:
            return _response(0, [], [])
        tag = tags[0]
        tid = dbutil.get_tag_id(self.db, tag)[0]

        es_query = UniversalQuery(query.get('input'), query.get('filter')).generate_query()
        logger_universal.info('ES %s, topic %s' % (es_query, tag))
        body = {
            "query": es_query,
            "sort": self.__generate_sort_search(sort, order, tid),
            "from": start,
            "size": size,
        }
        raw_hits = self.es.search(index='xiniudata2', doc_type='universal', body=body)
        # read the total before the hits are reorganized
        total = raw_hits['hits'].get('total', 0)
        organized = self.__organize(raw_hits)
        self.logger.info('Result ready')
        return _response(total, organized, self.__get_sector_filter(tag, 'tag'))