Exemplo n.º 1
0
    def __init__(self, method='default'):
        """Open the database connections and build the candidate push pool.

        :param method: how the candidate companies are selected.
            ``'controlled'`` loads them with ``dbutil.get_all_push_pool``;
            ``'default'`` asks ``PushScorer.promote_general`` for them.
        :raises ValueError: if ``method`` is not one of the two supported
            values.
        """
        # Fail fast: an unknown method used to leave self.candidates unset and
        # only surfaced later as an AttributeError while building self.rounds.
        if method not in ('controlled', 'default'):
            raise ValueError('unsupported method: %r' % (method,))

        self.db = dbconn.connect_torndb()
        self.mongo = dbconn.connect_mongo()
        self.scorer = CompanyUserRelevance()

        self.daily_recommendation_size = 2
        self.pool_size = 100

        # Both modes share the scorer and the pool refresh; only the source of
        # the candidates differs.
        self.general_pusher = PushScorer()
        if method == 'controlled':
            self.candidates = dbutil.get_all_push_pool(self.db)
        else:
            self.candidates = self.general_pusher.promote_general(self.db)
        self.__update_push_pool()

        # rounds and locations, keyed by candidate company id
        self.rounds = {
            cid: dbutil.get_company_round(self.db, cid)
            for cid in self.candidates
        }
        self.locations = {
            cid: dbutil.get_company_location(self.db, cid)[0]
            for cid in self.candidates
        }
Exemplo n.º 2
0
    def new_dominator(self, today=None, type='free', genre=None):
        """Find apps that newly dominate an App Store chart, with their companies.

        A track "dominates" when ``trend.appstore_rank`` holds at least 7
        records with rank <= 10 for it (matching ``type`` and ``genre``) dated
        within the 8 days up to ``today``. It is "new" when, in the window from
        90 days before ``today`` to 8 days before ``today``, it either has no
        records at all or the last of its (up to) 10 best-ranked records is
        still worse than rank 30.

        :param today: reference datetime; defaults to ``datetime.today()``.
        :param type: value matched against the ``type`` field (the name shadows
            the builtin but is kept for callers using it as a keyword).
        :param genre: value matched against the ``genre`` field.
        :return: set of ``(cid, company_name, app_name)`` tuples, restricted to
            companies whose round is below 1060.
        """
        today = today or datetime.today()
        one_week_before = today - timedelta(days=8)
        three_month_before = today - timedelta(days=90)
        rank_collection = self.mongo.trend.appstore_rank

        dominate_domain = [item['_id'] for item in rank_collection.aggregate([
            {'$match': {'date': {'$gt': one_week_before, '$lte': today},
                        'rank': {'$lte': 10}, 'type': type, 'genre': genre}},
            {'$group': {'_id': '$trackId', 'times': {'$sum': 1}}},
            {'$match': {'times': {'$gte': 7}}}])]

        def never_dominate_before(track_id):
            # Best historical ranks first; the last of the top 10 decides.
            top_rank = list(rank_collection.find(
                {'trackId': track_id, 'type': type, 'genre': genre,
                 'date': {'$gt': three_month_before, '$lte': one_week_before}}
            ).sort([('rank', 1)]).limit(10))
            return top_rank[-1]['rank'] > 30 if top_rank else True

        new_dominate_domain = [track_id for track_id in dominate_domain
                               if never_dominate_before(track_id)]
        dominators = set()
        for track_id in new_dominate_domain:
            for aid in dbutil.get_artifacts_from_iOS(self.db, track_id):
                cid = dbutil.get_artifact_company(self.db, aid)
                corp_round = dbutil.get_company_round(self.db, cid)
                if corp_round < 1060:
                    # Parameterized query: let the driver quote the id instead
                    # of interpolating it into the SQL text.
                    app_name = self.db.query(
                        'select name from artifact where id = %s', aid)[0]['name']
                    company_name = dbutil.get_company_name(self.db, cid)
                    logger_track.info('\nDate: %s Genre: %s Type: %s\nDominator: %s Company: %s\n\n'
                                      % (today, genre, type, app_name, company_name))
                    dominators.add((cid, company_name, app_name))
        return dominators
Exemplo n.º 3
0
    def update_3109(self, today=None):
        """Record message 3109 for apps that recently stand out on a chart.

        For each chart type ('free', 'charge', 'grossing') and each genre
        returned by ``__get_genres``:

        * candidate tracks have at least 7 ``trend.appstore_rank`` records with
          rank <= 10 dated within the 8 days up to ``today``;
        * a candidate is kept when, between 90 and 8 days before ``today``, it
          has no records or the last of its (up to) 10 best-ranked records is
          worse than rank 30;
        * for every artifact of a kept track whose company round is below
          1060, ``dbutil.update_continuous_company_message`` is called with
          code 3109 and the event is logged.

        :param today: reference datetime; defaults to ``datetime.today()``.
        """
        today = today or datetime.today()
        recent_start = today - timedelta(days=8)
        history_start = today - timedelta(days=90)

        def performed_poorly_before(track_id, chart_type, genre):
            # Best historical ranks first; the last of the top 10 decides.
            history = list(self.mongo.trend.appstore_rank.find({
                'trackId': track_id,
                'type': chart_type,
                'genre': genre,
                'date': {'$gt': history_start, '$lte': recent_start},
            }).sort([('rank', 1)]).limit(10))
            return not history or history[-1]['rank'] > 30

        chart_types = ['free', 'charge', 'grossing']
        genres = self.__get_genres()
        for chart_type in chart_types:
            for genre in genres:
                pipeline = [
                    {'$match': {'date': {'$gt': recent_start, '$lte': today},
                                'rank': {'$lte': 10},
                                'type': chart_type,
                                'genre': genre}},
                    {'$group': {'_id': '$trackId', 'times': {'$sum': 1}}},
                    {'$match': {'times': {'$gte': 7}}},
                ]
                candidates = [
                    doc['_id']
                    for doc in list(self.mongo.trend.appstore_rank.aggregate(pipeline))
                ]
                # Filter all candidates first, then write the messages.
                outstanding = [
                    track_id for track_id in candidates
                    if performed_poorly_before(track_id, chart_type, genre)
                ]
                for track_id in outstanding:
                    for aid in dbutil.get_artifacts_from_iOS(self.db, track_id):
                        cid = dbutil.get_artifact_company(self.db, aid)
                        if not dbutil.get_company_round(self.db, cid) < 1060:
                            continue
                        company_name = dbutil.get_company_name(self.db, cid)
                        app_name = self.__normalize_iOS_name(
                            dbutil.get_artifact_name(self.db, aid))
                        rank_name = self.__get_rank_name(genre, chart_type)
                        msg = u'%s旗下 %s 近期在AppStore%s排名表现突出' % (
                            company_name, app_name, rank_name)
                        detail = '%s,%s' % (genre, chart_type)
                        dbutil.update_continuous_company_message(
                            self.db, cid, msg, 3109, 30, aid, 7, detail)
                        logger_track.info(
                            '3109, %s, %s, %s, %s' % (cid, aid, chart_type, genre))
Exemplo n.º 4
0
def __source_database(db, mongo, yesterday, day_seven):
    """Push recently created companies through ``update_extract_source_company``.

    Selects active companies (joined with their active source rows) created
    after ``day_seven`` and modified after ``yesterday``. A company is skipped
    when any of the following holds, checked in this order:

    * its round is above 1040;
    * its establish year is before 2016;
    * its corporate id appears in ``newCorporateIds`` of a
      ``task.corporate_decompose`` document modified after ``day_seven``;
    * its source set is exactly ``{13050}``.

    The remaining rows are written with code 67001 and ``only_insert=False``.
    """
    decomposed_ids = set()
    recent_decompose = mongo.task.corporate_decompose.find(
        {'modifyTime': {'$gt': day_seven}})
    for task in recent_decompose:
        decomposed_ids.update(task.get('newCorporateIds', []))

    sql = ('select company.id id, source_company.source source from company, source_company '
           'where company.createTime>%s and company.modifyTime>%s and company.id=source_company.companyId '
           'and (company.active is null or company.active="Y") and '
           '(source_company.active is null or source_company.active="Y");')
    for row in db.query(sql, day_seven, yesterday):
        company_id = row.id
        # Short-circuit keeps the lookups in their original order.
        excluded = (
            dbutil.get_company_round(db, company_id) > 1040
            or dbutil.get_company_establish_date(db, company_id).year < 2016
            or dbutil.get_company_corporate_id(db, company_id) in decomposed_ids
            or dbutil.get_company_source(db, company_id) == {13050}
        )
        if not excluded:
            dbutil.update_extract_source_company(
                db, 67001, row.source, company_id, only_insert=False)
Exemplo n.º 5
0
    def __extract_11120(self, cid):
        """Yield ids derived from the company's round, establish date and status.

        :param cid: company id passed to the ``dbutil`` lookups.
        :return: generator yielding 579089 for early-stage companies and/or one
            status-specific id.
        """
        # Early-stage company: round no later than series B (<= 1040) and
        # established in 2010 or later.
        company_round = dbutil.get_company_round(self.db, cid)
        if 0 <= company_round <= 1040:
            if dbutil.get_company_establish_date(self.db, cid).year >= 2010:
                yield 579089

        # Company status, e.g. currently fundraising.
        # NOTE(review): 2015 has a mapping entry but is not in the guard
        # below, so 589015 is never yielded -- confirm whether that is intended.
        status_ids = {
            2010: 589014,
            2015: 589015,
            2020: 589016,
            2025: 589017,
        }
        status = dbutil.get_company_status(self.db, cid)
        if status in (2010, 2020, 2025):
            yield status_ids[status]
Exemplo n.º 6
0
def __source_news(db, mongo, today, yesterday):
    """Push companies mentioned in yesterday's news through code 67002.

    Reads ``article.news`` documents created between ``yesterday`` and
    ``today`` (exclusive) with ``processStatus`` 1 and ``type`` 60001, excluding
    feature 578362 and ``modifyUser`` 139. Company ids 449316 and 416649 are
    always written; other ids are written only when they are not in
    ``bad_news``, their round is at most 1040 and their establish year is 2010
    or later.
    """
    news_filter = {
        'createTime': {'$gt': yesterday, '$lt': today},
        'processStatus': 1,
        'type': 60001,
        'features': {'$ne': 578362},
        'modifyUser': {'$ne': 139},
    }

    # NOTE(review): bad_news is collected with the very same filter as the
    # main loop below, so unless the collection changes between the two
    # queries every cid hits the `continue` and only the two hard-coded ids
    # are ever written -- confirm this is intended.
    bad_news = set()
    for article in mongo.article.news.find(news_filter):
        bad_news.update(article.get('companyIds', []))

    for article in mongo.article.news.find(news_filter):
        for cid in article.get('companyIds', []):
            if cid in (449316, 416649):
                dbutil.update_extract_source_company(
                    db, 67002, article['source'], cid, article['_id'], False)
            if cid in bad_news:
                continue
            too_late_round = dbutil.get_company_round(db, cid) > 1040
            if too_late_round or \
                    dbutil.get_company_establish_date(db, cid).year < 2010:
                continue
            dbutil.update_extract_source_company(
                db, 67002, article['source'], cid, article['_id'], False)
Exemplo n.º 7
0
    def init_filter(self):
        """Compute per-fund statistics from the 2016-01-01..2017-10-31 portfolio.

        :return: a 3-tuple of dicts keyed by investor id:
            ``fund_rounds`` (``np.mean`` of the portfolio companies' rounds),
            ``fund_activeness`` (number of portfolio entries) and
            ``fund_locations`` (True when more than half of the portfolio
            companies have a location id below 371).
        """
        period = ('2016-01-01', '2017-10-31')
        portfilios = {}
        for iid in self.funds.keys():
            portfilios[iid] = dbutil.get_investor_portfilio(self.db, iid, period)

        def average_round(entries):
            return np.mean(
                [dbutil.get_company_round(self.db, entry.cid) for entry in entries])

        def mostly_below_371(entries):
            hits = sum(
                1 for entry in entries
                if dbutil.get_company_location(self.db, entry.cid)[0] < 371)
            return hits * 2 > len(entries)

        # Three separate passes, so the lookups run in the original order.
        fund_rounds = {}
        for iid, entries in portfilios.items():
            fund_rounds[iid] = average_round(entries)
        fund_activeness = {}
        for iid, entries in portfilios.items():
            fund_activeness[iid] = len(entries)
        fund_locations = {}
        for iid, entries in portfilios.items():
            fund_locations[iid] = mostly_below_371(entries)
        return fund_rounds, fund_activeness, fund_locations
Exemplo n.º 8
0
def __source_module_71001(db, mongo, yesterday, day_seven):
    """Source companies from finished ``company_job`` tasks into module 71001.

    Iterates ``task.company`` documents with ``processStatus`` 1, type
    ``'company_job'`` and ``finishTime`` >= ``yesterday``. A company is written
    only when it is active (``'Y'``), its round is at most 1040 and its source
    set is exactly ``{13050}``; it is then passed to
    ``dbutil.update_extract_source_company`` (code 67001, source 13050) and
    ``dbutil.update_custom_sourcing_company`` (71001, ``day_seven``).

    The establish-date and corporate-decompose filters are not applied here,
    so the ``task.corporate_decompose`` lookup that only fed the latter is no
    longer performed.
    """
    finished_tasks = mongo.task.company.find({
        'finishTime': {'$gte': yesterday},
        'processStatus': 1,
        'types': 'company_job',
    })
    for task in finished_tasks:
        cid = task.get('companyId')
        if dbutil.get_company_active(db, cid) != 'Y':
            continue
        if dbutil.get_company_round(db, cid) > 1040:
            continue
        if dbutil.get_company_source(db, cid) == {13050}:
            dbutil.update_extract_source_company(
                db, 67001, 13050, cid, only_insert=False)
            dbutil.update_custom_sourcing_company(db, cid, 71001, day_seven)
Exemplo n.º 9
0
    def create_single(self, db, cid):
        """
        Create a single index for a particular company.

        Completion id consists of its type and original id, including
            cxxxx, fxxx, axxxx, pxxxx, nxxxx, standing for company, full, artifact, product, nick
            kxxxx, keyword

        Builds two local dicts: ``company`` (the searchable document) and
        ``completion`` (the completion entry with every known alias).

        :param db: database handle passed to the ``dbutil`` lookups.
        :param cid: company id.
        :return: ``None`` immediately when ``dbutil.get_company_index_type``
            says the company should not be indexed.
        """

        def valid_names(names):
            # Normalise every name once and keep only the truthy results.
            return [valid for valid in (self.valid_name(n) for n in names)
                    if valid]

        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            self.logger.info('should not index %s' % cid)
            return

        company = {}
        alias = set()
        company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)

        # name is lower-cased and stripped of spaces once, then reused as-is
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['cid'] = code
        completion = {
            'id': cid,
            '_name': name,
            '_code': code,
            '_prompt': 'name',
        }

        # First, Names
        # short name and its pinyin transliteration
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # full corporate name
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            # TODO temp solution: also index the name with city names removed
            alias.add(full_lower.replace(u'北京', '')
                      .replace(u'上海', '')
                      .replace(u'深圳', ''))
        # artifact name
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(valid_names(aname for _, aname in aresults))
        # alias (ignored when there are 20 or more of them)
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(valid_names(aname for _, aname in aliass))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # The corporate full name is the same lookup as `full` above and is
        # already in the alias set, so it is not queried a second time.
        # corporate alias (ignored when there are 20 or more of them)
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(valid_names(corporate_alias))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create indice names
        completion['completionName'] = list(alias)
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # Second, team identify, investor identify
        team = self.identifier.identify(cid)
        if team:
            company['team'] = team
        if dbutil.exist_company_tag(db, cid, 309129):
            company['investor'] = 44010

        # Third, keywords
        # regular tag
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for _, tname, _ in tags_info:
                company.setdefault('tags', []).append(tname.lower())
        # yellows
        yellows = dbutil.get_company_tags_yellow(db, cid)
        if yellows:
            company['yellows'] = [yellow.lower() for yellow in yellows]

        # Fourth, description: segmented words minus stopwords and single chars
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            words = [
                word for word in self.seg.cut4search(desc)
                if word not in self.stopwords and len(word) > 1
            ]
            company['description'] = ' '.join(words).lower()

        # Fifth, round and investors and members
        company['round'] = dbutil.get_company_round(db, cid)
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        # Distinct loop variable: on Python 2 a list comprehension rebinds its
        # loop variable in the enclosing scope and would overwrite `name`.
        company['members'] = [
            member_name
            for _, member_name in dbutil.get_member_idname(db, cid)
        ]

        # Sixth, location
        lid, _ = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # Seventh, establish date, create date, count of company message
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except (AttributeError, ValueError):
            # Best effort: leave 'established' unset when the date is missing
            # or cannot be formatted.
            pass
Exemplo n.º 10
0
    def __process_step1(self, record):
        """Run step 1 for a news task record and queue follow-up work.

        Steps, in order:

        1. Records whose ``processStatus`` is not 1 mark the article as -1 and
           stop; records with source ``'gongshang'`` stop without changes.
        2. The article's category, company/investor ids, modify user and
           categories are written back to ``article.news``.
        3. A feature set is built from categories, sentiment and (for selected
           categories) news tags.
        4. When companies with a round below 1041 are mentioned and the
           verified tags of the mentioned companies map to at most 3 sectors,
           the sectors are stored on the article; otherwise a ``step2`` task is
           inserted unless one already exists for this record.
        5. The article is marked processed, features are updated, company tags
           are re-extracted, trackers are fed and a ``task_company`` Kafka
           message is sent (retried once after ``init_kafka()``).

        :param record: task document; ``news_id``, ``_id`` and ``modifyUser``
            are read with ``[]``, everything else with ``.get``.
        """

        def update_article(fields):
            # Every write targets the same article document.
            self.mongo.article.news.update(
                {'_id': ObjectId(record['news_id'])}, {'$set': fields})

        # not news
        if record.get('processStatus', 0) != 1:
            update_article({'processStatus': -1})
            return
        if record.get('source', 0) == 'gongshang':
            return

        # update article news
        category = self.__map_category(record.get('categories', []))
        cids = record.get('companyIds', [])
        iids = record.get('investorIds', [])
        if category:
            update_article({'category': category})
        update_article({
            'companyIds': cids,
            'investorIds': iids,
            'modifyUser': record['modifyUser'],
            'categories': record.get('categories', [])
        })

        # prepare features
        features = set()
        features.update(record.get('categories', []))
        features.add(record.get('sentiment'))
        # News tags only count as features for these categories.
        # NOTE(review): the previous literal listed 578351 twice -- confirm no
        # other category id was intended.
        if {128, 578353, 578349, 578351, 578356} & set(
                record.get('categories', [])):
            features.update(record.get('newsTags', []))

        # generate step 2 task
        sector_update_flag = False
        if cids:
            # A real list, so the emptiness test below is reliable.
            startups = [
                company_id for company_id in cids
                if dbutil.get_company_round(self.db, company_id) < 1041
            ]
            if startups:
                news_tags = [
                    t.tid
                    for company_id in cids
                    for t in dbutil.get_company_tags_info(self.db, company_id)
                    if t.verify == "Y"
                ]
                news_sectors = self.__map_sectors(news_tags)
                if len(news_sectors) <= 3:
                    sector_update_flag = True
                    features.update(news_tags)
                    update_article({'sectors': news_sectors})
        if not sector_update_flag:
            task2 = {
                'news_id': str(record['news_id']),
                'taskNewsId': str(record['_id']),
                'createTime': datetime.utcnow(),
                'newsTags': record.get('newsTags', []),
                'companyIds': cids,
                'processStatus': 0,
                'section': 'step2'
            }
            if self.mongo.task.news.find({
                    'taskNewsId': str(record['_id'])
            }).count() == 0:
                self.mongo.task.news.insert_one(task2)

        # update article news
        update_article({
            'processStatus': 1,
            'modifyTime': datetime.utcnow()
        })
        self.__update_news_features(record['news_id'], features, 'skip')
        # re produce tags for mentioned companies
        for cid in cids:
            self.company_tagger.extract(cid, fast=True, update_only=True)
        # tag big companies with the big-company tag once they have at least
        # three processed articles carrying feature 578354
        if 578354 in features:
            for cid in cids:
                if self.mongo.article.news.find({
                        'processStatus': 1,
                        'companyIds': cid,
                        'features': 578354
                }).count() >= 3:
                    dbutil.update_company_tag(self.db,
                                              cid,
                                              599843,
                                              0,
                                              active='H')

        # track for company message and investor message
        for (feed_back,
             feed_type) in self.news_tracker.feed_1001_4tasks([record]):
            if feed_back:
                if feed_type == 'cm':
                    self.news_tracker.send_company_message_msg(feed_back)
                elif feed_type == 'im':
                    self.news_tracker.send_investor_message_msg(feed_back)

        # track for topic 30, first media coverage
        self.track_topic_30(record)

        # send message to task company
        source = 'news_funding' if category == 60101 else 'news_regular'

        def send_task_message():
            # producer_news_task is looked up at call time, so a producer
            # re-created by init_kafka() is used for the retry (presumably
            # init_kafka rebinds the module-level name -- verify).
            producer_news_task.send_messages(
                "task_company",
                json.dumps({
                    'source': source,
                    'id': record['news_id'],
                    'posting_time':
                    datetime.now().strftime('%Y-%m-%d:%H:%M:%S')
                }))

        try:
            send_task_message()
        except FailedPayloadsError:
            # Re-initialise Kafka and retry once; a second failure propagates.
            init_kafka()
            send_task_message()
Exemplo n.º 11
0
    def create_single(self, db, cid):
        """Build the universal-index document for one company.

        Populates a local ``company`` dict with names and aliases, tags and
        nested industry/topic tags, sector, description keywords, round and
        status, investors, members, location and establish month.

        :param db: database handle passed to the ``dbutil`` lookups.
        :param cid: company id.
        :return: ``None`` immediately when ``dbutil.get_company_index_type``
            says the company should not be indexed.
        """

        def valid_names(names):
            # Normalise every name once and keep only the truthy results.
            return [valid for valid in (self.valid_name(n) for n in names)
                    if valid]

        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            logger_universal_index.info('should not index %s' % cid)
            return

        company = {}
        alias, artifacts = set(), set()
        company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)

        # name is lower-cased and stripped of spaces once, then reused as-is
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['id'] = code

        # short name and its pinyin transliteration
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # full corporate name, plus a variant with city names removed
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            alias.add(full_lower.replace(u'北京', '')
                      .replace(u'上海', '')
                      .replace(u'深圳', '')
                      .replace(u'成都', ''))
        # artifact name
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(valid_names(aname for _, aname in aresults))
        # alias (ignored when there are 20 or more of them)
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(valid_names(aname for _, aname in aliass))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # The corporate full name is the same lookup as `full` above and is
        # already in the alias set, so it is not queried a second time.
        # corporate alias (ignored when there are 20 or more of them)
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(valid_names(corporate_alias))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create indice names
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # tag
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for tid, tname, _ in tags_info:
                company.setdefault('tags', []).append(tname.lower())
                company.setdefault('features', []).append(tid)
        nested_tags = []
        company['nested_tag'] = nested_tags
        for industry in dbutil.get_company_industries(db, cid):
            nested_tags.append({
                'id': industry.industryId,
                'published': industry.publishTime,
                "category": "industry"
            })
        for topic in dbutil.get_company_topics(db, cid):
            msg_publish = dbutil.get_topic_message_company_publish(db, topic)
            nested_tags.append({
                'id': topic.topicId,
                'published': msg_publish,
                "category": "topic"
            })
            # topics with a configured tag name also count as plain tags
            topic_tag = self.topic_tags.get(topic.topicId)
            if topic_tag:
                company.setdefault('tags', []).append(topic_tag.lower())
        company['sector'] = dbutil.get_company_sector_tag(db, cid)

        # description: segmented words minus stopwords and single characters
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            words = [
                word for word in self.seg.cut4search(desc)
                if word not in self.stopwords and len(word) > 1
            ]
            company['description'] = ' '.join(words).lower()

        # round and investors and members
        company_round = dbutil.get_company_round(db, cid)
        # a round of 0 is indexed as 1000
        company['round'] = 1000 if company_round == 0 else company_round
        company['sort_round'] = dbutil.get_round_sort(db, company['round'])
        status = dbutil.get_company_status(db, cid)
        if status in {2020, 2025}:
            company['status'] = status
        elif dbutil.get_company_ipo_status(db, cid):
            company['status'] = -1
        else:
            company['status'] = -2
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        company['investorId'] = dbutil.get_company_investors(db, cid)
        # Distinct loop variable: on Python 2 a list comprehension rebinds its
        # loop variable in the enclosing scope and would overwrite `name`.
        company['members'] = [
            member_name
            for _, member_name in dbutil.get_member_idname(db, cid)
        ]

        # location
        lid, _ = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # establish date, create date, count of company message
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except (AttributeError, ValueError):
            # Best effort: leave 'established' unset when the date is missing
            # or cannot be formatted.
            pass