def dump():
    """Write the 54020-type relations of selected tags to ``dumps/tags.rel``.

    For every tag returned by ``dbutil.get_tags_by_type`` for the types
    11010, 11011, 11012 and 11013, four tab-separated lines are written:

    * ``<type>\\t<name>\\t<names of related tags whose type is 11011>``
    * ``<type>\\t\\t<names of related tags whose type is 11013>``
    * ``<type>\\t\\t<names of related tags whose type is 11010>``
    * ``<type>\\t\\t<names of related tags whose type is 11000>``

    Related tag names are comma-joined.  The database connection is closed
    even when reading tags or writing the file fails.
    """

    def joined_names(rels, wanted_type):
        # Comma-join the names of the related tags that have the wanted type.
        return ','.join([name for name, kind in rels if kind == wanted_type])

    db = dbcon.connect_torndb()
    try:
        with codecs.open('dumps/tags.rel', 'w', 'utf-8') as fo:
            for t in dbutil.get_tags_by_type(db, [11010, 11011, 11012, 11013]):
                # (name, type) of every tag related to ``t``.
                rels = [(dbutil.get_tag_info(db, r.tag2Id, 'name'),
                         dbutil.get_tag_info(db, r.tag2Id, 'type'))
                        for r in dbutil.get_tags_rel(db, t.id, type=54020)]
                # Only the first line of each group carries the tag name.
                fo.write('%s\t%s\t%s\n' % (t.type, t.name, joined_names(rels, 11011)))
                for related_type in (11013, 11010, 11000):
                    fo.write('%s\t\t%s\n' % (t.type, joined_names(rels, related_type)))
    finally:
        # Previously skipped whenever anything above raised.
        db.close()
def replace(self):
    """Apply the active keyword replacement rules to company tags.

    For each document in ``keywords.replacement`` with ``active == 'Y'``:
    when the rule lists more than one replacement and the source tag's type
    is at least 11010, every company returned by
    ``dbutil.get_company_from_tag`` for *all* of the replacements gets the
    source tag via ``dbutil.update_company_tag`` (weight 1.503, ``'P'``).
    """
    active_rules = self.mongo.keywords.replacement.find({'active': 'Y'})
    for rule in active_rules:
        source = rule.get('source')
        targets = rule.get('replacement')
        if len(targets) <= 1:
            continue
        if not (dbutil.get_tag_info(self.db, source, 'type') >= 11010):
            continue

        # Companies per replacement, then fold them down to the common ones.
        company_sets = [set(dbutil.get_company_from_tag(self.db, target))
                        for target in targets]
        shared = company_sets[0]
        for companies in company_sets[1:]:
            shared = shared & companies

        for company in shared:
            dbutil.update_company_tag(self.db, company, source, 1.503, 'P')
def update_relevant_tags(self, tid, t_type=None, t_name=None):
    """Store similarity-based relations (relation type 54020) for tag ``tid``.

    :param tid: id of the tag whose relations are updated.
    :param t_type: type of the tag; looked up via ``dbutil.get_tag_info``
        when falsy.
    :param t_name: name of the tag; looked up via ``dbutil.get_tag_info``
        when falsy.

    For each of the tag types 11000, 11010, 11011 and 11013 the known tags of
    that type (from ``self.tags``, which maps name -> (id, type)) are ranked
    by ``__get_similarity`` against ``t_name``.  The ranking is cut to
    ``self.max_candidates.get(type)`` entries (no limit when missing), and
    every entry scoring above ``self.similarity_threshold`` is written with
    ``dbutil.update_tags_rel``.

    Unless the tag itself has type 11012, the single most similar tag of
    type 11012 is then written as well, regardless of the threshold.
    """
    if not t_type:
        t_type = dbutil.get_tag_info(self.db, tid, 'type')
    if not t_name:
        t_name = dbutil.get_tag_info(self.db, tid, 'name')

    for target_type in [11000, 11010, 11011, 11013]:
        candidates = [tag for tag, (_, t) in self.tags.items() if t == target_type]
        ranked = sorted(
            [(tag, self.__get_similarity(t_name, tag)) for tag in candidates],
            key=lambda x: -x[1])[:self.max_candidates.get(target_type)]
        # Truncate first, then apply the threshold (same order as before).
        for tag, weight in ranked:
            if weight > self.similarity_threshold:
                dbutil.update_tags_rel(self.db, tid, self.tags.get(tag)[0], weight, 54020)

    if t_type != 11012:
        candidates = [tag for tag, (_, t) in self.tags.items() if t == 11012]
        # Nothing to link when no tag of type 11012 is known; max() of an
        # empty sequence would raise ValueError.
        if candidates:
            # Pick by similarity score.  Comparing the bare (tag, score)
            # tuples would order by tag name first and return the
            # lexicographically greatest name instead of the best match.
            vip, weight = max(
                [(tag, self.__get_similarity(t_name, tag)) for tag in candidates],
                key=lambda x: x[1])
            dbutil.update_tags_rel(self.db, tid, self.tags.get(vip)[0], weight, 54020)
def __process_step2(self, record):
    """Second processing step for a news record.

    * Records whose ``processStatus`` is not 1 are marked with
      ``processStatus = -1`` in ``article.news`` and skipped.
    * Records whose ``source`` is ``'gongshang'`` are skipped untouched.
    * Otherwise the stored ``features`` of the news document are pruned:
      a feature is dropped when its tag type is below 11050 and it is not
      among the record's ``newsTags``.  The document is updated with the
      mapped sectors, the pruned features, ``processStatus = 1`` and a new
      ``modifyTime``; finally the record's ``newsTags`` are passed to
      ``__update_news_features`` as ``tn2_features``.

    :param record: dict carrying at least ``news_id``; ``processStatus``,
        ``source`` and ``newsTags`` are read when present.
    """
    global producer_news_task

    # not news
    if record.get('processStatus', 0) != 1:
        self.mongo.article.news.update(
            {'_id': ObjectId(record['news_id'])},
            {'$set': {'processStatus': -1}})
        return
    if record.get('source', 0) == 'gongshang':
        return

    # update article news
    news_tags = [int(tid) for tid in record.get('newsTags', [])]
    news_sectors = self.__map_sectors(news_tags)

    news_oid = ObjectId(record['news_id'])
    # NOTE(review): find_one returns None for an unknown id, which makes the
    # .get below raise AttributeError - confirm the id always exists here.
    stored_news = self.mongo.article.news.find_one({'_id': news_oid})
    stored_features = [int(tid) for tid in stored_news.get('features', [])]

    # Keep a stored feature unless it is a sub-11050 tag that the new tag
    # list no longer contains.
    kept_features = []
    for tid in stored_features:
        below_11050 = dbutil.get_tag_info(self.db, tid).type < 11050
        if below_11050 and tid not in news_tags:
            continue
        kept_features.append(tid)

    self.mongo.article.news.update(
        {'_id': news_oid},
        {'$set': {
            'sectors': news_sectors,
            'processStatus': 1,
            'modifyTime': datetime.utcnow(),
            'features': kept_features,
        }})

    # update tags as features
    features = record.get('newsTags', [])
    self.mongo.article.news.update(
        {'_id': news_oid},
        {'$set': {'processStatus': 1, 'modifyTime': datetime.utcnow()}})
    self.__update_news_features(record['news_id'], tn2_features=features)
def identify(self, today=None):
    """Create or refresh 'trend' tasks for the fastest-growing tags.

    :param today: reference datetime; defaults to ``datetime.today()``.

    Reads all ``keywords.trend_statistc`` documents dated from midnight two
    days before ``today``.  For every (tag, subtype) pair the growth is
    ``(max(weight) - min(weight) + 1) / (min(weight) + 1)``.  Within each
    subtype the pairs are ranked by growth (descending, ties by order of
    appearance); pairs with rank below 5 and growth above 1 are kept.

    For each kept pair, a pending task (``type == 'trend'``, same tag id,
    ``processStatus == 0``) in ``task.tag`` gets its ``modifyTime``
    refreshed; if none exists a new task is inserted with the subtype as
    ``reason``.

    Returns ``None``.  Does nothing when no statistics are found.
    """
    if not today:
        today = datetime.today()

    # growth yesterday
    start = datetime.fromordinal((today - timedelta(days=2)).date().toordinal())
    df = pd.DataFrame(
        list(self.mongo.keywords.trend_statistc.find({'date': {'$gte': start}})))
    if df.empty:
        # No documents means no 'tag'/'subtype' columns; groupby would raise.
        return

    # NOTE(review): with integer weights under Python 2 and no
    # `from __future__ import division`, this ratio is floored - confirm
    # that is intended.
    growth = df.groupby(['tag', 'subtype'])['weight'].agg(
        lambda weight: (max(weight) - min(weight) + 1) / (min(weight) + 1))
    # Name the column explicitly rather than via agg({'growth': ...}); the
    # dict-renaming form is rejected by newer pandas releases.
    df = growth.to_frame('growth').reset_index()
    df['rank'] = df.groupby('subtype')['growth'].rank(ascending=False, method='first')
    df['name'] = df.apply(
        lambda x: dbutil.get_tag_info(self.db, x['tag'], 'name'), 1)
    df = df.loc[(df['rank'] < 5) & (df['growth'] > 1)]

    for _, row in df.iterrows():
        row = dict(row)
        pending = {'type': 'trend', 'id': row.get('tag'), 'processStatus': 0}
        if self.mongo.task.tag.find_one(pending) is not None:
            self.mongo.task.tag.update(
                pending, {'$set': {'modifyTime': datetime.utcnow()}})
        else:
            self.mongo.task.tag.insert({
                'type': 'trend',
                'id': row.get('tag'),
                'processStatus': 0,
                'name': row.get('name'),
                'createTime': datetime.utcnow(),
                'modifyTime': datetime.utcnow(),
                'reason': row.get('subtype'),
            })
def fit_tag(self):
    """Propagate this object's tag to the companies of its industry.

    When ``self.tag`` is set and its current type is truthy and below 11011,
    the tag's type is raised to 11011.  Then every company returned by
    ``dbutil.get_industry_companies`` for ``self.idid`` receives the tag
    (weight 1.502, ``verify='P'``); nothing is written when no tag is set.
    """
    tag = self.tag

    # update tag type
    if tag:
        current_type = dbutil.get_tag_info(self.db, tag, 'type')
        if current_type and current_type < 11011:
            dbutil.update_tag_type(self.db, tag, 11011, with_tag_id=True)

    # update company tag
    for member in dbutil.get_industry_companies(self.db, self.idid):
        if not tag:
            continue
        dbutil.update_company_tag(self.db, member.companyId, tag, 1.502, verify='P')
def memorize(self, tid, today=None):
    """Record today's trend statistics for tag ``tid``.

    :param tid: id of the tag.
    :param today: reference datetime; defaults to ``datetime.today()``.

    Five counters are upserted into ``keywords.trend_statistc``, keyed by
    (tag, midnight of ``today``, subtype):

    * ``company_visit``     - user-log hits on the company basic API for the
      codes of the tag's companies;
    * ``company_subscribe`` - rows from
      ``dbutil.get_company_subscription_details`` between yesterday and today;
    * ``news_relevant``     - news items the search client returns for the tag
      name with today's date filter (at most 500 requested);
    * ``news_read``         - user-log hits on ``self.news_read_url`` whose
      ``newsId`` is among those items;
    * ``search_precise``    - search-log entries whose input equals the tag
      name and whose ``userId`` is not null.

    All log queries use the window (today - 32h, today - 8h].
    """
    global logger_tt
    if not today:
        today = datetime.today()
    yesterday = today - timedelta(days=1)
    today_int = int(today.strftime('%Y%m%d'))
    tag = dbutil.get_tag_info(self.db, tid, 'name')
    logger_tt.info('Start to process %s' % tid)

    day_start = datetime.fromordinal(today.date().toordinal())

    def log_window():
        # Fresh dict per query so no filter object is shared between cursors.
        return {
            '$gt': today - timedelta(hours=32),
            '$lte': today - timedelta(hours=8),
        }

    def store(subtype, kind, weight):
        # Upsert one counter for this tag and day.
        self.mongo.keywords.trend_statistc.update(
            {'tag': tid, 'date': day_start, 'subtype': subtype},
            {'$set': {'type': kind, 'weight': weight}},
            True)

    # relevant companies
    cids = dbutil.get_company_from_tags(self.db, [tid])
    codes = [dbutil.get_company_code(self.db, cid) for cid in cids]
    visits = self.mongo.log.user_log.find({
        'time': log_window(),
        'requestURL': "/xiniudata-api/api2/service/company/basic",
        'jsonRequest.payload.code': {'$in': codes},
    })
    store('company_visit', 'company', len(list(visits)))

    subscriptions = dbutil.get_company_subscription_details(
        self.db,
        yesterday.strftime('%Y-%m-%d'),
        today.strftime('%Y-%m-%d'),
        *cids)
    store('company_subscribe', 'company', len(subscriptions))

    # relevant news
    found = self.search_client.search(
        'general', input=tag, filters={'date': today_int}, size=500)
    news = found.get('news', {})
    news = list(news.get('data', []))
    store('news_relevant', 'news', len(news))

    news_read = self.mongo.log.user_log.find({
        'time': log_window(),
        'requestURL': self.news_read_url,
        'jsonRequest.payload.newsId': {'$in': news},
    })
    store('news_read', 'news', len(list(news_read)))

    # search
    search = self.mongo.log.search.find({
        'time': log_window(),
        'query.input': tag,
        'userId': {'$ne': None},
    })
    store('search_precise', 'search', len(list(search)))