def __init__(self, subeventid):
    """Initialise the feature-word helper.

    Args:
        subeventid: ID of the sub-event this instance works on.
    """
    # Handle returned by _default_mongo for the configured database.
    self.mongo = _default_mongo(usedb=MONGO_DB_NAME)
    self.subeventid = subeventid
def __init__(self, id, topicid):
    """Bind the instance to an id and to the comments collection of a topic.

    Args:
        id: identifier stored on ``self.id``.
        topicid: topic ID; its string form is appended to
            EVENTS_COMMENTS_COLLECTION_PREFIX to build the collection name.
    """
    topic_suffix = str(topicid)
    connection = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT,
                                usedb=MONGO_DB_NAME)

    self.id = id
    self.comments_collection = EVENTS_COMMENTS_COLLECTION_PREFIX + topic_suffix
    self.mongo = connection
def __init__(self, id):
    """Initialise a topic instance.

    Args:
        id: topic ID (the original note describes it as an ObjectID).
    """
    self.id = id
    # NOTE(review): this call runs before self.mongo and the collection
    # names are assigned -- confirm getOtherSubEventID() needs only self.id.
    self.other_subeventid = self.getOtherSubEventID()

    # Collection names: two fixed ones plus the per-topic news collection.
    self.events_collection = EVENTS_COLLECTION
    self.sub_events_collection = SUB_EVENTS_COLLECTION
    self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + str(id)

    self.mongo = _default_mongo(usedb=MONGO_DB_NAME)
def __init__(self, db, host, port, collection, user_collection, weibo_collection):
    """Store the connection settings, connect, and log the target.

    Args:
        db: database name, passed to ``_default_mongo`` as ``usedb``.
        host: host passed to ``_default_mongo``.
        port: port passed to ``_default_mongo``.
        collection: kept on ``self.collection`` and shown in the log line.
        user_collection: kept on ``self.user_collection``.
        weibo_collection: kept on ``self.weibo_collection``.
    """
    self.db_name, self.host, self.port = db, host, port
    self.db = _default_mongo(host, port, usedb=db)

    self.collection = collection
    self.user_collection = user_collection
    self.weibo_collection = weibo_collection

    # Emit one INFO line describing where we connected.
    target = {'host': host, 'port': port, 'db': db, 'collection': collection}
    message = 'Mongod connect to {host}:{port}:{db}:{collection}'.format(**target)
    log.msg(message, level=log.INFO)
def __init__(self, topicid):
    """Set up the collection names and database handle for one topic.

    Args:
        topicid: topic ID; stored on ``self.id`` and used (as a string) as
            the suffix of the per-topic comments and news collections.
    """
    self.id = topicid
    id_suffix = str(self.id)

    # Per-topic collections share the same suffix.
    self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + id_suffix
    self.comments_collection = EVENTS_COMMENTS_COLLECTION_PREFIX + id_suffix
    self.comments_cluster_collection = COMMENTS_CLUSTER_COLLECTION

    self.mongo = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT,
                                usedb=MONGO_DB_NAME)
def load_data_from_mongo(topic, input_file): mongo = _default_mongo() results = getDataByName(mongo, topic) print "length:", len(results) inputs = [] count = 0 testFile = open(input_file,'w') for r in results: r['title'] = '' r['content168'] = r['content168'].encode('utf-8') r['content'] = r['content168'] inputs.append(r) testFile.write(str(r['_id'])) testFile.write('\t') testFile.write(r['content']) testFile.write('\n') count += 1 print 'written', count testFile.close() return
"""将mongodb中的weibo数据导出到jl文件,加上news_content字段 usage: python etl_weibo2jl.py comment_54c5b301d8b487851c2434f9 apec_weibo.jl python etl_weibo2jl.py comment_54cb0a32f712cc19a1b02300 edu_weibo.jl """ import os import sys import json from utils import _default_mongo from load_settings import load_settings settings = load_settings() MONGOD_HOST = settings.get("MONGOD_HOST") MONGOD_PORT = settings.get("MONGOD_PORT") MONGO_DB_NAME = settings.get("MONGO_DB_NAME") mongo = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT, \ usedb=MONGO_DB_NAME) collection_name = sys.argv[1] results = mongo[collection_name].find() jl_file = sys.argv[2] fw = open(jl_file, 'w') for r in results: r['news_content'] = None fw.write('%s\n' % json.dumps(r)) fw.close()
def __init__(self, topicid):
    """Prepare collection names and the database handle for a topic.

    Args:
        topicid: topic ID, kept on ``self.id``; its string form is appended
            to the comments and news collection prefixes.
    """
    self.id = topicid
    self.comments_cluster_collection = COMMENTS_CLUSTER_COLLECTION

    topic_key = str(self.id)
    self.comments_collection = EVENTS_COMMENTS_COLLECTION_PREFIX + topic_key
    self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + topic_key

    mongo_handle = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT,
                                  usedb=MONGO_DB_NAME)
    self.mongo = mongo_handle
def __init__(self):
    """Create the database handle from the configured host, port and name."""
    connection = _default_mongo(
        host=MONGOD_HOST,
        port=MONGOD_PORT,
        usedb=MONGO_DB_NAME,
    )
    self.mongo = connection
def __init__(self, id, topicid):
    """Bind the instance to an id and to the news collection of a topic.

    Args:
        id: identifier stored on ``self.id``.
        topicid: topic ID, stored on ``self.topicid``; its string form is
            appended to EVENTS_NEWS_COLLECTION_PREFIX.
    """
    self.id, self.topicid = id, topicid

    collection_name = EVENTS_NEWS_COLLECTION_PREFIX + str(topicid)
    self.news_collection = collection_name

    self.mongo = _default_mongo(usedb=MONGO_DB_NAME)
for item in items: try: items_dict[item['label']].append(item) except: items_dict[item['label']] = [item] keywords_count_list = [] for label, one_items in items_dict.iteritems(): keywords_count = extract_keyword(one_items) keywords_count_list.append(keywords_count) results = tfidf_cal(keywords_count_list) return dict(zip(items_dict.keys(), results)) if __name__ == '__main__': topic = "APEC2014" topicid = "54916b0d955230e752f2a94e" mongo = _default_mongo(usedb=MONGO_DB_NAME) results = mongo[EVENTS_NEWS_COLLECTION_PREFIX + topicid].find() inputs = [{"title": r["title"].encode("utf-8"), "content": r["content168"].encode("utf-8"), \ "label": random.randint(0, 10)} for r in results] results = extract_feature(inputs, title_term_weight=5, content_term_weight=1) for k in results: print "-----------------------" for v0, v1 in k: print v0, v1
try: items_dict[item['label']].append(item) except: items_dict[item['label']] = [item] keywords_count_list = [] for label, one_items in items_dict.iteritems(): keywords_count = extract_keyword(one_items) keywords_count_list.append(keywords_count) results = tfidf_cal(keywords_count_list) return dict(zip(items_dict.keys(), results)) if __name__ == '__main__': topic = "APEC2014" topicid = "54916b0d955230e752f2a94e" mongo = _default_mongo(usedb=MONGO_DB_NAME) results = mongo[EVENTS_NEWS_COLLECTION_PREFIX + topicid].find() inputs = [{"title": r["title"].encode("utf-8"), "content": r["content168"].encode("utf-8"), \ "label": random.randint(0, 10)} for r in results] results = extract_feature(inputs, title_term_weight=5, content_term_weight=1) for k in results: print "-----------------------" for v0, v1 in k: print v0, v1
def __init__(self):
    """Record the events collection name and open the database handle."""
    self.events_collection = EVENTS_COLLECTION
    self.mongo = _default_mongo(
        host=MONGOD_HOST,
        port=MONGOD_PORT,
        usedb=MONGO_DB_NAME,
    )
"""将mongodb中的comment数据导出到jl文件,加上news_content字段 usage: python etl.py comment_54916b0d955230e752f2a94e apec.jl """ import os import sys import json from utils import _default_mongo from load_settings import load_settings settings = load_settings() MONGOD_HOST = settings.get("MONGOD_HOST") MONGOD_PORT = settings.get("MONGOD_PORT") MONGO_DB_NAME = settings.get("MONGO_DB_NAME") mongo = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT, \ usedb=MONGO_DB_NAME) collection_name = sys.argv[1] results = mongo[collection_name].find() def get_news_content(news_id): result = mongo['post_' + collection_name.split('_')[1]].find_one({"_id": news_id}) if result: return None else: return result['content168'] jl_file = sys.argv[2] fw = open(jl_file, 'w') for r in results: r['news_content'] = get_news_content(r['news_id'])
def __init__(self):
    """Open the handle for the configured database via _default_mongo."""
    db_handle = _default_mongo(usedb=MONGO_DB_NAME)
    self.mongo = db_handle
def __init__(self, id):
    """Initialise the instance for the given id.

    Args:
        id: identifier stored on ``self.id``.
    """
    self.id = id

    # NOTE(review): getOtherClusterId() runs before self.mongo is assigned
    # -- confirm it relies on self.id only.
    other_cluster = self.getOtherClusterId()
    self.otherClusterId = other_cluster

    self.mongo = _default_mongo(
        host=MONGOD_HOST,
        port=MONGOD_PORT,
        usedb=MONGO_DB_NAME,
    )
def __init__(self, id, topicid):
    """Attach the instance to an id and a topic's comments collection.

    Args:
        id: identifier stored on ``self.id``.
        topicid: topic ID whose string form is appended to
            EVENTS_COMMENTS_COLLECTION_PREFIX.
    """
    self.id = id

    comments_name = EVENTS_COMMENTS_COLLECTION_PREFIX + str(topicid)
    self.comments_collection = comments_name

    self.mongo = _default_mongo(
        host=MONGOD_HOST,
        port=MONGOD_PORT,
        usedb=MONGO_DB_NAME,
    )
return us def diamond_classifier(item): # 其他类 sentiment = 0 if '【' in item['text'].encode('utf-8') and '】' in item['text'].encode('utf-8'): # 简单规则判断新闻类 sentiment = 4 else: # 积极、愤怒、悲伤3类情感分类器 sentiment = triple_classifier(item) return sentiment mongo = _default_mongo() module_keywords = get_module_keywords() for bankuai, lanmu, source, source_en, keywords_file in module_keywords: query_dict = { "timestamp": { "$gte": START_TS, "$lt": END_TS }, "keywords_hit": True, "rubbish": False } if source_en == "weibo_api_search_spider": query_dict["source_category"] = keywords_file