Пример #1
0
 def __init__(self, subeventid):
     """Initialise the feature-word class.

     Args:
         subeventid: ID of the sub-event; stored on ``self.subeventid``.
     """
     connection = _default_mongo(usedb=MONGO_DB_NAME)
     self.subeventid = subeventid
     self.mongo = connection
Пример #2
0
 def __init__(self, id, topicid):
     """Store the id and derive the comments collection name for a topic.

     Args:
         id: Stored unchanged on ``self.id``.
         topicid: Topic ID; its ``str()`` form is appended to
             ``EVENTS_COMMENTS_COLLECTION_PREFIX``.
     """
     self.id = id
     collection_suffix = str(topicid)
     self.comments_collection = (
         EVENTS_COMMENTS_COLLECTION_PREFIX + collection_suffix)
     # Same keyword arguments as before, just gathered in one place.
     mongo_options = dict(host=MONGOD_HOST,
                          port=MONGOD_PORT,
                          usedb=MONGO_DB_NAME)
     self.mongo = _default_mongo(**mongo_options)
Пример #3
0
 def __init__(self, id):
     """Initialise a topic instance.

     Args:
         id: Topic ID (an ObjectID according to the original note).
     """
     self.id = id
     # Kept immediately after ``self.id``: at this point no other instance
     # attribute has been assigned by this method.
     # NOTE(review): if getOtherSubEventID() ever needs ``self.mongo`` this
     # ordering will fail -- confirm against its implementation.
     self.other_subeventid = self.getOtherSubEventID()
     news_suffix = str(id)
     self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + news_suffix
     self.sub_events_collection = SUB_EVENTS_COLLECTION
     self.events_collection = EVENTS_COLLECTION
     connection = _default_mongo(usedb=MONGO_DB_NAME)
     self.mongo = connection
Пример #4
0
 def __init__(self, db, host, port, collection, user_collection, weibo_collection):
     """Record the connection settings, obtain the Mongo handle and log it.

     Args:
         db: Database name; kept on ``self.db_name`` and passed as ``usedb``.
         host: Host passed to ``_default_mongo``.
         port: Port passed to ``_default_mongo``.
         collection: Stored on ``self.collection`` and included in the log line.
         user_collection: Stored on ``self.user_collection``.
         weibo_collection: Stored on ``self.weibo_collection``.
     """
     self.db_name, self.host, self.port = db, host, port
     self.db = _default_mongo(host, port, usedb=db)
     self.collection = collection
     self.user_collection = user_collection
     self.weibo_collection = weibo_collection

     # Build the message first so the logging call stays short.
     message = 'Mongod connect to {host}:{port}:{db}:{collection}'.format(
         host=host, port=port, db=db, collection=collection)
     log.msg(message, level=log.INFO)
Пример #5
0
 def __init__(self, topicid):
     """Store the topic id and derive the collection names that depend on it.

     Args:
         topicid: Topic ID; kept on ``self.id`` and appended (via ``str()``)
             to the comments and news collection prefixes.
     """
     self.id = topicid
     self.comments_cluster_collection = COMMENTS_CLUSTER_COLLECTION
     comments_name = EVENTS_COMMENTS_COLLECTION_PREFIX + str(self.id)
     news_name = EVENTS_NEWS_COLLECTION_PREFIX + str(self.id)
     self.comments_collection = comments_name
     self.news_collection = news_name
     connection = _default_mongo(host=MONGOD_HOST,
                                 port=MONGOD_PORT,
                                 usedb=MONGO_DB_NAME)
     self.mongo = connection
Пример #6
0
def load_data_from_mongo(topic, input_file):
    """Dump the records of one topic into a tab-separated text file.

    Records are fetched with ``getDataByName(mongo, topic)``.  For every
    record one line is written: the ``str()`` of its ``_id``, a tab, the
    UTF-8 encoded ``content168`` field and a newline.  Each record dict is
    also updated in place (``title`` emptied, ``content168`` replaced by its
    encoded form, ``content`` set to the same value), as before.

    Args:
        topic: Topic name passed to ``getDataByName``.
        input_file: Path of the text file to create or overwrite.

    Returns:
        None.
    """
    mongo = _default_mongo()
    results = getDataByName(mongo, topic)
    # Single-argument form prints the same text as the old print statement.
    print("length: %s" % len(results))

    count = 0
    # ``with`` closes the file even when a record is malformed (for example a
    # missing ``content168`` key); previously the handle leaked in that case.
    with open(input_file, 'w') as out_file:
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            out_file.write(str(r['_id']))
            out_file.write('\t')
            out_file.write(r['content'])
            out_file.write('\n')
            count += 1
    print("written %s" % count)
    return
Пример #7
0
"""将mongodb中的weibo数据导出到jl文件,加上news_content字段
   usage: 
   python etl_weibo2jl.py comment_54c5b301d8b487851c2434f9 apec_weibo.jl
   python etl_weibo2jl.py comment_54cb0a32f712cc19a1b02300 edu_weibo.jl
"""

import os
import sys
import json
from utils import _default_mongo
from load_settings import load_settings

# MongoDB connection parameters are read from the project settings.
settings = load_settings()
MONGOD_HOST = settings.get("MONGOD_HOST")
MONGOD_PORT = settings.get("MONGOD_PORT")
MONGO_DB_NAME = settings.get("MONGO_DB_NAME")

mongo = _default_mongo(host=MONGOD_HOST, port=MONGOD_PORT,
                       usedb=MONGO_DB_NAME)

# First command-line argument: name of the collection to export.
collection_name = sys.argv[1]
results = mongo[collection_name].find()

# Second command-line argument: path of the output file, one JSON document
# per line.
jl_file = sys.argv[2]
# ``with`` guarantees the output file is flushed and closed even if a
# document cannot be serialised.
# NOTE(review): json.dumps() only accepts JSON-serialisable values -- confirm
# that the documents (including ``_id``) contain nothing else.
with open(jl_file, 'w') as fw:
    for r in results:
        r['news_content'] = None
        fw.write('%s\n' % json.dumps(r))

Пример #8
0
 def __init__(self, topicid):
     """Remember the topic id and set up collection names and the Mongo handle.

     Args:
         topicid: Topic ID; stored on ``self.id`` and used, via ``str()``, as
             the suffix of the comments and news collection names.
     """
     self.id = topicid
     self.comments_cluster_collection = COMMENTS_CLUSTER_COLLECTION
     self.comments_collection = EVENTS_COMMENTS_COLLECTION_PREFIX + str(
         self.id)
     self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + str(
         self.id)
     mongo_options = {
         "host": MONGOD_HOST,
         "port": MONGOD_PORT,
         "usedb": MONGO_DB_NAME,
     }
     self.mongo = _default_mongo(**mongo_options)
Пример #9
0
 def __init__(self):
     """Store the object returned by ``_default_mongo`` on ``self.mongo``."""
     mongo_options = {
         "host": MONGOD_HOST,
         "port": MONGOD_PORT,
         "usedb": MONGO_DB_NAME,
     }
     self.mongo = _default_mongo(**mongo_options)
Пример #10
0
 def __init__(self, id, topicid):
     """Store both ids and derive the news collection name for the topic.

     Args:
         id: Stored unchanged on ``self.id``.
         topicid: Stored on ``self.topicid``; its ``str()`` form is appended
             to ``EVENTS_NEWS_COLLECTION_PREFIX``.
     """
     self.id, self.topicid = id, topicid
     topic_suffix = str(topicid)
     self.news_collection = EVENTS_NEWS_COLLECTION_PREFIX + topic_suffix
     connection = _default_mongo(usedb=MONGO_DB_NAME)
     self.mongo = connection
Пример #11
0
    for item in items:
        try:
            items_dict[item['label']].append(item)
        except:
            items_dict[item['label']] = [item]

    keywords_count_list = []
    for label, one_items in items_dict.iteritems():
        keywords_count = extract_keyword(one_items)
        keywords_count_list.append(keywords_count)

    results = tfidf_cal(keywords_count_list)

    return dict(zip(items_dict.keys(), results))


if __name__ == '__main__':
    topic = "APEC2014"
    topicid = "54916b0d955230e752f2a94e"
    mongo = _default_mongo(usedb=MONGO_DB_NAME)
    results = mongo[EVENTS_NEWS_COLLECTION_PREFIX + topicid].find()
    inputs = [{"title": r["title"].encode("utf-8"), "content": r["content168"].encode("utf-8"), \
            "label": random.randint(0, 10)} for r in results]

    results = extract_feature(inputs, title_term_weight=5, content_term_weight=1)
    for k in results:
        print "-----------------------"
        for v0, v1 in k:
            print v0, v1

Пример #12
0
        try:
            items_dict[item['label']].append(item)
        except:
            items_dict[item['label']] = [item]

    keywords_count_list = []
    for label, one_items in items_dict.iteritems():
        keywords_count = extract_keyword(one_items)
        keywords_count_list.append(keywords_count)

    results = tfidf_cal(keywords_count_list)

    return dict(zip(items_dict.keys(), results))


if __name__ == '__main__':
    topic = "APEC2014"
    topicid = "54916b0d955230e752f2a94e"
    mongo = _default_mongo(usedb=MONGO_DB_NAME)
    results = mongo[EVENTS_NEWS_COLLECTION_PREFIX + topicid].find()
    inputs = [{"title": r["title"].encode("utf-8"), "content": r["content168"].encode("utf-8"), \
            "label": random.randint(0, 10)} for r in results]

    results = extract_feature(inputs,
                              title_term_weight=5,
                              content_term_weight=1)
    for k in results:
        print "-----------------------"
        for v0, v1 in k:
            print v0, v1
Пример #13
0
 def __init__(self):
     """Keep the result of ``_default_mongo`` on ``self.mongo``."""
     connection = _default_mongo(
         host=MONGOD_HOST, port=MONGOD_PORT, usedb=MONGO_DB_NAME)
     self.mongo = connection
Пример #14
0
 def __init__(self):
     """Set ``self.mongo`` and the events collection name."""
     mongo_options = dict(host=MONGOD_HOST,
                          port=MONGOD_PORT,
                          usedb=MONGO_DB_NAME)
     self.mongo = _default_mongo(**mongo_options)
     self.events_collection = EVENTS_COLLECTION
Пример #15
0
"""将mongodb中的comment数据导出到jl文件,加上news_content字段
   usage: python etl.py comment_54916b0d955230e752f2a94e apec.jl
"""

import os
import sys
import json
from utils import _default_mongo
from load_settings import load_settings

# Read the MongoDB connection parameters from the project settings.
settings = load_settings()
MONGOD_HOST, MONGOD_PORT, MONGO_DB_NAME = (
    settings.get("MONGOD_HOST"),
    settings.get("MONGOD_PORT"),
    settings.get("MONGO_DB_NAME"),
)

mongo = _default_mongo(
    host=MONGOD_HOST,
    port=MONGOD_PORT,
    usedb=MONGO_DB_NAME,
)

# First command-line argument: name of the collection to export.
collection_name = sys.argv[1]
results = mongo[collection_name].find()

def get_news_content(news_id):
    """Return the ``content168`` field of one news document.

    The document is looked up in the collection named ``post_`` plus the
    second ``_``-separated part of the module-level ``collection_name``.

    Args:
        news_id: Value matched against the document's ``_id``.

    Returns:
        The ``content168`` value of the matching document, or None when no
        document matches.
    """
    post_collection = 'post_' + collection_name.split('_')[1]
    result = mongo[post_collection].find_one({"_id": news_id})
    # The test used to be inverted: a found document yielded None and a
    # missing one raised TypeError by indexing None.
    if result is None:
        return None
    return result['content168']

# Second command-line argument: path of the output .jl file.
jl_file = sys.argv[2]
fw = open(jl_file, 'w')
for r in results:
    # Attach the content looked up from the record's ``news_id``.
    r['news_content'] = get_news_content(r['news_id'])
Пример #16
0
 def __init__(self):
     """Store the result of ``_default_mongo(usedb=MONGO_DB_NAME)``."""
     connection = _default_mongo(usedb=MONGO_DB_NAME)
     self.mongo = connection
Пример #17
0
 def __init__(self, id):
     """Store the id, compute the "other" cluster id and set ``self.mongo``.

     Args:
         id: Stored unchanged on ``self.id``.
     """
     self.id = id
     # Called before ``self.mongo`` exists, exactly as in the original order.
     # NOTE(review): confirm getOtherClusterId() does not need ``self.mongo``.
     self.otherClusterId = self.getOtherClusterId()
     mongo_options = {
         "host": MONGOD_HOST,
         "port": MONGOD_PORT,
         "usedb": MONGO_DB_NAME,
     }
     self.mongo = _default_mongo(**mongo_options)
Пример #18
0
 def __init__(self, id, topicid):
     """Store the id and build the comments collection name for the topic.

     Args:
         id: Stored unchanged on ``self.id``.
         topicid: Topic ID; ``str(topicid)`` is appended to
             ``EVENTS_COMMENTS_COLLECTION_PREFIX``.
     """
     self.id = id
     topic_suffix = str(topicid)
     self.comments_collection = EVENTS_COMMENTS_COLLECTION_PREFIX + topic_suffix
     connection = _default_mongo(
         host=MONGOD_HOST,
         port=MONGOD_PORT,
         usedb=MONGO_DB_NAME,
     )
     self.mongo = connection
Пример #19
0
 def __init__(self):
     """Set ``self.mongo`` and remember the events collection name."""
     connection = _default_mongo(
         host=MONGOD_HOST,
         port=MONGOD_PORT,
         usedb=MONGO_DB_NAME,
     )
     self.mongo = connection
     self.events_collection = EVENTS_COLLECTION
Пример #20
0
    return us

def diamond_classifier(item):
    """Classify one item as news or by sentiment.

    Args:
        item: Mapping with a ``text`` entry holding the text as a unicode
            string.

    Returns:
        4 when the text contains both a ``【`` and a ``】`` bracket (simple
        rule for news); otherwise the value returned by
        ``triple_classifier(item)`` (described in the original comment as the
        positive / angry / sad sentiment classifier).
    """
    text = item['text']

    # Compare in unicode instead of encoding the text to UTF-8 twice: the
    # result is the same, it avoids the redundant encodes, and it does not
    # depend on str/bytes containment semantics.
    if u'【' in text and u'】' in text:
        # Simple rule: bracketed headline means news.
        return 4

    # Positive / angry / sad sentiment classifier.
    return triple_classifier(item)

# Handle returned by _default_mongo() with its default arguments.
mongo = _default_mongo()

# Each entry unpacks into (bankuai, lanmu, source, source_en, keywords_file).
# NOTE(review): bankuai / lanmu are presumably pinyin for "section" and
# "column" -- confirm against get_module_keywords().
module_keywords = get_module_keywords()

for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    # Base filter: timestamp in [START_TS, END_TS), keywords_hit true and
    # rubbish false.
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False
    }

    # For the weibo API search spider the keywords file name is also used as
    # the source_category filter.
    if source_en == "weibo_api_search_spider":
        query_dict["source_category"] = keywords_file
Пример #21
0
 def __init__(self, id):
     """Store the id, derive the "other" cluster id and set ``self.mongo``.

     Args:
         id: Stored unchanged on ``self.id``.
     """
     self.id = id
     # Evaluated before ``self.mongo`` is assigned, matching the original
     # statement order.
     self.otherClusterId = self.getOtherClusterId()
     connection = _default_mongo(
         host=MONGOD_HOST, port=MONGOD_PORT, usedb=MONGO_DB_NAME)
     self.mongo = connection