예제 #1
0
def transform_net_data(dbname, colname, newdbname, newcolname):
    """Copy once-scraped network records into another collection.

    Opens ``colname`` in ``dbname`` as the source and ``newcolname`` in
    ``newdbname`` as the target, creates a unique compound index on
    (user, follower, type) in the target, then inserts every source
    document whose ``scraped_times`` equals 1.
    """
    source = dbt.db_connect_no_auth(dbname)[colname]
    target = dbt.db_connect_no_auth(newdbname)[newcolname]
    index_keys = [(field, pymongo.ASCENDING)
                  for field in ("user", "follower", "type")]
    target.create_index(index_keys, unique=True)
    for record in source.find({'scraped_times': 1}):
        target.insert(record)
예제 #2
0
def transform_data(dbname, colname, newdbname, newcolname, timeend):
    """Copy statuses created at or before ``timeend`` into a new collection.

    The target collection gets a (user.id asc, id desc) index and a unique
    index on ``id``.  Each source document's ``created_at`` string is parsed
    with the format '%a %b %d %H:%M:%S +0000 %Y' and compared to ``timeend``.
    """
    created_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    source = dbt.db_connect_no_auth(dbname)[colname]
    target = dbt.db_connect_no_auth(newdbname)[newcolname]
    target.create_index([('user.id', pymongo.ASCENDING),
                         ('id', pymongo.DESCENDING)])
    target.create_index([('id', pymongo.ASCENDING)], unique=True)
    for tweet in source.find():
        created = datetime.strptime(tweet['created_at'], created_fmt)
        if created <= timeend:
            target.insert(tweet)
예제 #3
0
def transform_net_data(dbname, colname, newdbname, newcolname):
    """Transform network data: copy once-scraped records to a new collection.

    Source is ``dbname[colname]``; destination is ``newdbname[newcolname]``,
    which receives a unique (user, follower, type) index before documents
    with ``scraped_times == 1`` are inserted one by one.
    """
    old_net = dbt.db_connect_no_auth(dbname)[colname]
    new_net = dbt.db_connect_no_auth(newdbname)[newcolname]
    unique_keys = [(name, pymongo.ASCENDING)
                   for name in ("user", "follower", "type")]
    new_net.create_index(unique_keys, unique=True)
    for doc in old_net.find({'scraped_times': 1}):
        new_net.insert(doc)
예제 #4
0
def transform_data(dbname, colname, newdbname, newcolname, timeend):
    """Transform tweets: keep only those created no later than ``timeend``.

    Reads every document of ``dbname[colname]``, parses its ``created_at``
    field and inserts it into ``newdbname[newcolname]`` when the parsed time
    is ``<= timeend``.  The destination is indexed on (user.id, id) and,
    uniquely, on ``id``.
    """
    twitter_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    old_timeline = dbt.db_connect_no_auth(dbname)[colname]
    new_timeline = dbt.db_connect_no_auth(newdbname)[newcolname]
    new_timeline.create_index([('user.id', pymongo.ASCENDING),
                               ('id', pymongo.DESCENDING)])
    new_timeline.create_index([('id', pymongo.ASCENDING)], unique=True)
    for doc in old_timeline.find():
        posted = datetime.strptime(doc['created_at'], twitter_fmt)
        if posted <= timeend:
            new_timeline.insert(doc)
예제 #5
0
파일: data_trans.py 프로젝트: wtgme/ohsn
def transform():
    """Copy selected profile fields of level-3 users from 'rd' to 'drd'.

    Reads the 'com' collection of database 'rd', and for every user with
    ``level == 3`` inserts a document holding only the listed fields into
    the 'com' collection of database 'drd' (uniquely indexed on ``id``).
    """
    fields = ['id', 'screen_name',
              "description", "friends_count",
              "followers_count", "statuses_count"]
    source = dbt.db_connect_no_auth('rd')['com']
    target = dbt.db_connect_no_auth('drd')['com']
    target.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in source.find({'level': 3}, fields):
        target.insert({name: user[name] for name in fields})
예제 #6
0
파일: monitor.py 프로젝트: wtgme/ohsn
def check_change(time_index):
    """Count how many users have changed their profiles, e.g., increased or decreased their follower numbers.

    For each processed dataset, every user in 'com' whose
    ``timeline_scraped_times`` equals ``time_index`` and whose
    ``timeline_count`` is positive is compared with the ``user`` object
    embedded in their newest tweet (highest ``id``) in 'timeline'.  For each
    differing key in description / friends_count / followers_count /
    statuses_count a counter is incremented, the stored profile is updated,
    and for the ``*_count`` keys an ``_inc`` or ``_dec`` counter is bumped.
    The summary, plus the difference in 'net' record counts between
    ``time_index`` and ``time_index - 1``, is inserted into
    ``monitor.changes``; a duplicate (dataset, statis_index) is ignored.

    time_index: integer index of the scraping round being checked.
    """
    db = dbt.db_connect_no_auth('monitor')
    changedb = db['changes']
    changedb.create_index([('dataset', pymongo.ASCENDING),
                           ('statis_index', pymongo.DESCENDING)], unique=True)
    datasets = ['ded', 'drd', 'dyg']
    check_keys = ['description', 'friends_count', 'followers_count', 'statuses_count']
    # NOTE: the slice restricts processing to the first dataset only.
    for dataset in datasets[:1]:
        dbs = dbt.db_connect_no_auth(dataset)
        sample_user = dbs['com']
        sample_time = dbs['timeline']
        sample_net = dbs['net']
        # Record to store for this dataset.
        changes = {'dataset': dataset, 'statis_index': time_index}
        # check prof changes, 'description', 'friends_count', 'followers_count', 'statuses_count'
        for user in sample_user.find({'timeline_scraped_times': time_index, 'timeline_count': {'$gt': 0}}):
            # find_one returns None when the user has no stored tweet; the
            # previous `.limit(1)[0]` raised IndexError before the guard
            # below could run.  sort: 1 = ascending, -1 = descending
            last_tweet = sample_time.find_one({'user.id': user['id']},
                                              {'id': 1, 'user': 1, 'created_at': 1},
                                              sort=[('id', -1)])
            if not last_tweet:
                continue
            userc = last_tweet['user']
            for key in check_keys:
                old_value, new_value = user[key], userc[key]
                if old_value != new_value:
                    changes[key] = changes.get(key, 0) + 1
                    # Update newest profiles in user database
                    sample_user.update_one({'id': user['id']}, {'$set': {key: new_value}}, upsert=False)
                    if 'count' in key:
                        if old_value < new_value:
                            changes[key + '_inc'] = changes.get(key + '_inc', 0) + 1
                        elif old_value > new_value:
                            changes[key + '_dec'] = changes.get(key + '_dec', 0) + 1

        # check following changes among users
        count = sample_net.count({'scraped_times': time_index})-sample_net.count({'scraped_times': time_index-1})
        changes['net_changes'] = count
        changes['statis_at'] = datetime.datetime.now()
        try:
            changedb.insert(changes)
        except pymongo.errors.DuplicateKeyError:
            # A summary for this (dataset, statis_index) already exists.
            pass
예제 #7
0
파일: re_collect.py 프로젝트: wtgme/ohsn
def check_change(time_index):
    """Summarise profile and network changes for one scraping round.

    For each of the datasets 'ded', 'drd' and 'dyg', users in 'com' with
    ``timeline_scraped_times == time_index`` and a positive
    ``timeline_count`` are compared against the ``user`` object embedded in
    their newest tweet in 'timeline'.  Differences in description,
    friends_count, followers_count and statuses_count are counted (with
    ``_inc`` / ``_dec`` counters for the count fields) and written back to
    the stored profile.  The per-dataset summary, including the change in
    the number of 'net' records between ``time_index - 1`` and
    ``time_index`` and a formatted timestamp, is inserted into
    ``monitor.changes``; duplicates of (dataset, statis_index) are ignored.

    time_index: integer index of the scraping round being checked.
    """
    db = dbt.db_connect_no_auth("monitor")
    changedb = db["changes"]
    changedb.create_index([("dataset", pymongo.ASCENDING), ("statis_index", pymongo.DESCENDING)], unique=True)
    datasets = ["ded", "drd", "dyg"]
    check_keys = ["description", "friends_count", "followers_count", "statuses_count"]
    for dataset in datasets:
        dbs = dbt.db_connect_no_auth(dataset)
        sample_user = dbs["com"]
        sample_time = dbs["timeline"]
        sample_net = dbs["net"]
        changes = {"dataset": dataset, "statis_index": time_index}
        # check prof changes, 'description', 'friends_count', 'followers_count', 'statuses_count'
        for user in sample_user.find({"timeline_scraped_times": time_index, "timeline_count": {"$gt": 0}}):
            # find_one yields None for a user without stored tweets, whereas
            # the former `.limit(1)[0]` raised IndexError before the guard.
            # sort: 1 = ascending, -1 = descending
            last_tweet = sample_time.find_one(
                {"user.id": user["id"]},
                {"id": 1, "user": 1, "created_at": 1},
                sort=[("id", -1)],
            )
            if not last_tweet:
                continue
            userc = last_tweet["user"]
            for key in check_keys:
                old_value, new_value = user[key], userc[key]
                if old_value != new_value:
                    changes[key] = changes.get(key, 0) + 1
                    # Store the newest profile value for this user.
                    sample_user.update_one({"id": user["id"]}, {"$set": {key: new_value}}, upsert=False)
                    if "count" in key:
                        if old_value < new_value:
                            changes[key + "_inc"] = changes.get(key + "_inc", 0) + 1
                        elif old_value > new_value:
                            changes[key + "_dec"] = changes.get(key + "_dec", 0) + 1

        # check following changes among users
        count = sample_net.count({"scraped_times": time_index}) - sample_net.count({"scraped_times": time_index - 1})
        changes["net_changes"] = count
        changes["statis_at"] = datetime.datetime.now().strftime("%a %b %d %H:%M:%S +0000 %Y")
        try:
            changedb.insert(changes)
        except pymongo.errors.DuplicateKeyError:
            # A summary for this (dataset, statis_index) already exists.
            pass
예제 #8
0
파일: statis.py 프로젝트: abiraja2004/ohsn
def active_user_list(dbname, comname, timename):
    """Plot, per (year, month), how many predicted users last tweeted then.

    Loads user ids from the pickle file 'data/ed-rel.pick'.  Users that are
    missing from ``comname``, whose ``level`` is 1, or that have no tweet in
    ``timename`` are skipped.  For the rest, the day of the newest tweet
    (highest ``id``) is collected, the number of collected days is printed,
    and a bar chart of counts grouped by year and month is shown.
    """
    db = dbt.db_connect_no_auth(dbname)
    tweets = db[timename]
    com = db[comname]
    last_days = []
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # trusted file here.  The `with` block closes the handle the original
    # left open.
    with open('data/ed-rel.pick', 'r') as fin:
        pred_users = pickle.load(fin)
    for uid in pred_users:
        user = com.find_one({'id': int(uid)})
        # find_one returns None for an unknown id; skip instead of crashing.
        if user is None or user['level'] == 1:
            continue
        # sort: 1 = ascending, -1 = descending; None when no tweet exists.
        last_tweet = tweets.find_one({'user.id': int(uid)},
                                     {'id': 1, 'user': 1, 'created_at': 1},
                                     sort=[('id', -1)])
        if last_tweet is None:
            continue
        datev = last_tweet['created_at']
        if isinstance(datev, basestring):
            datev = datetime.strptime(datev, '%a %b %d %H:%M:%S +0000 %Y')
        # Truncate to the calendar day.
        last_days.append(datetime(datev.year, datev.month, datev.day))
    print(len(last_days))
    df = pd.DataFrame({'PredictED_nonED': last_days}, index=last_days)
    df.groupby([df.PredictED_nonED.dt.year,
                df.PredictED_nonED.dt.month]).count().plot(kind="bar")
    plt.xlabel('(Year, Month)')
    plt.ylabel('Count')
    plt.show()
예제 #9
0
파일: ed_snowball.py 프로젝트: wtgme/ohsn
def re_snowball_friends(olddbname, oldcomname, newdbname, newcomname):
    newdb = dbt.db_connect_no_auth(newdbname)
    newcom = newdb[newcomname]
    newnet = newdb['net']
    newcom.create_index("id", unique=True)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newnet.create_index([("user", pymongo.ASCENDING),
                         ("follower", pymongo.ASCENDING)],
                        unique=True)

    '''Reteive ED core users'''
    ed_users = iot.get_values_one_field(olddbname, oldcomname, 'id', {'level': 1})
    list_size = len(ed_users)
    length = int(math.ceil(list_size/100.0))
    for index in xrange(length):
        index_begin = index*100
        index_end = min(list_size, index_begin+100)
        lookup.lookup_user_list(ed_users[index_begin:index_end], newcom, 1, 'N')

    level = 1
    while True:
        # Each call of snowball_following and snowball_follower only process up to 200 users
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(newcom, newnet, level, 'N')
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followees of seeds for sample db', level
        follower_flag = follower.snowball_follower(newcom, newnet, level, 'N')
        if following_flag == False and follower_flag == False:
            break
        else:
            continue
예제 #10
0
def process_tweet(dbname, comname, timename, label, filename):
    """Print labelled per-user text and pickle the ids of the users kept.

    For every English-language user of ``comname`` with a positive
    ``timeline_count``, the user's tweets in ``timename`` are read in
    ascending ``id`` order.  Retweets and quoted tweets are skipped; the
    remaining texts are passed through ``process`` and concatenated.  Users
    whose concatenated text has more than 50 whitespace-separated tokens are
    printed as '__label__<label> , <text>' and their ids are pickled to
    'data/<filename>.pick'.
    """
    db = dbt.db_connect_no_auth(dbname)
    times = db[timename]
    user_list = iot.get_values_one_field(dbname, comname, 'id', {
        "timeline_count": {
            '$gt': 0
        },
        'lang': 'en'
    })
    target_users = []
    for user in user_list:
        context = ''
        for tweet in times.find({'user.id': user}).sort([('id', 1)]):
            # Only original tweets contribute text.
            if 'retweeted_status' in tweet or 'quoted_status' in tweet:
                continue
            text = process(tweet['text'])
            if text:
                context += text + ' '
        if len(context.split()) > 50:
            target_users.append(user)
            print('__label__' + label + ' , ' + context)
    # Close (and thereby flush) the output file; the original left the
    # handle open.
    with open('data/' + filename + '.pick', 'w') as fout:
        pickle.dump(target_users, fout)
예제 #11
0
파일: com_det.py 프로젝트: wtgme/ohsn
def core_ed():
    """Return the set of ``id_str`` values of level-1 users in 'fed.com'."""
    com = dbt.db_connect_no_auth('fed')['com']
    return {user['id_str'] for user in com.find({'level': 1})}
예제 #12
0
파일: activity.py 프로젝트: wtgme/ohsn
def timeline_time(dbname, colname, timename):
    """Collect tweet counts and tweet timestamps per user.

    Only users that have both 'text_anal.cw.value' and 'text_anal.h.value'
    are considered.  Returns ``(posts, dates)`` where ``posts`` maps user id
    to the stored ``timeline_count`` and ``dates`` maps user id to the list
    of parsed ``created_at`` datetimes of the user's tweets in ``timename``
    (users without tweets get no entry in ``dates``).
    """
    created_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    tweets = db[timename]
    posts = {}
    dates = {}
    required_fields = ['text_anal.cw.value', 'text_anal.h.value']
    query = {"$and": [{name: {'$exists': True}} for name in required_fields]}
    for user in com.find(query):
        uid = user['id']
        posts[uid] = user['timeline_count']
        for tw in tweets.find({'user.id': uid}):
            stamp = datetime.strptime(tw['created_at'], created_fmt)
            dates.setdefault(uid, []).append(stamp)
    return posts, dates
예제 #13
0
파일: link_ed.py 프로젝트: wtgme/ohsn
def ed_user(dbname, colname):
    """Return the ids of level-1 users in ``dbname[colname]`` as strings."""
    com = dbt.db_connect_no_auth(dbname)[colname]
    return [str(user['id']) for user in com.find({'level': 1}, ['id'])]
예제 #14
0
파일: activity.py 프로젝트: wtgme/ohsn
def lifetime(dbname, comname, timename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    time = db[timename]
    during = []
    for user in com.find({"timeline_count": {'$gt': 0}}):
        newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
        last = datetime.strptime(newtweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        account = datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        print user['id'], last, account, (last.date() - account.date()).days + 1
        during.append((last.date() - account.date()).days + 1)
    pt.plot_config()
    plt.figure(1)
    plt.subplot(211)
    pt.sns.distplot(during)
    print np.mean(during), np.std(during)
    plt.axvline(np.mean(during), linestyle='--', color='k',
                label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()

    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0,2700)
    plt.show()
예제 #15
0
파일: activity.py 프로젝트: wtgme/ohsn
def create_time(dbname, colname):
    """Map each user id in ``dbname[colname]`` to its parsed creation time.

    Every document is read (no filter) and its ``created_at`` string is
    parsed with the format '%a %b %d %H:%M:%S +0000 %Y'.
    """
    created_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    com = dbt.db_connect_no_auth(dbname)[colname]
    created_time = {}
    # An earlier, now disabled, variant restricted the query to users having
    # certain 'text_anal.*' fields and a 'status' field.
    for user in com.find({}):
        parsed = datetime.strptime(user['created_at'], created_fmt)
        created_time[user['id']] = parsed
    return created_time
예제 #16
0
def timeline(dbname, timename):
    """Return the ``created_at`` value of every document in ``timename``."""
    statuses = dbt.db_connect_no_auth(dbname)[timename]
    return [status['created_at']
            for status in statuses.find(no_cursor_timeout=True)]
예제 #17
0
def load_behavior_network(db_name, collection='None', btype='communication'):
    '''Build a weighted directed graph from behaviour-network records.

    Type codes used in the ``type`` field of the records:
    Tweet: 0
    Retweet: 1;
    Reply: 2;
    Direct Mention: 3;
    undirect mention: 4

    db_name: database name, or -- when ``collection`` equals the string
        'None' -- an object with a ``find`` method that is queried directly.
    collection: collection name inside ``db_name``; the string 'None'
        (the default) means ``db_name`` is used as the collection itself.
    btype: 'retweet', 'reply', 'mention' or 'communication' (reply plus
        direct mention); any other value raises KeyError.

    Returns a DiGraph with an edge id0 -> id1 for every matching record
    whose two ids differ; ``weight`` counts the records per edge.
    '''
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    DG = DiGraph()
    # Compare by value: `is 'None'` only worked when the caller's string
    # happened to be the same interned object as the literal.
    if collection == 'None':
        cols = db_name
    else:
        db = dbt.db_connect_no_auth(db_name)
        cols = db[collection]
    for row in cols.find({"type": {'$in': btype_dic[btype]}}):
        n1 = row['id0']
        n2 = row['id1']
        if n1 == n2:
            # Ignore self-directed behaviour.
            continue
        # has_edge is already False when either node is absent, so no
        # separate has_node checks are needed.
        if DG.has_edge(n1, n2):
            DG[n1][n2]['weight'] += 1
        else:
            DG.add_edge(n1, n2, weight=1)
    return DG
예제 #18
0
def image_main_color(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    poi = db[colname]
    color_list = {}
    index = 0
    for user in poi.find(
        {
            'profile_banner_url': {
                '$exists': True
            },
            'liwc_anal.result.WC': {
                '$exists': True
            }
        }, ['id', 'profile_banner_url']):
        uid = user['id']
        url = user['profile_banner_url']
        index += 1
        if index % 100 == 0:
            print 'Have processed users:', index
        try:
            main_colors = ic.main_colors(url)
            color_list[uid] = main_colors
        except urllib2.HTTPError:
            continue
        # if len(color_list)>10000:
        #     break
    return color_list
예제 #19
0
def ed_user(dbname, colname):
    """List, as strings, the ids of all level-1 users in the collection."""
    collection = dbt.db_connect_no_auth(dbname)[colname]
    cursor = collection.find({'level': 1}, ['id'])
    return [str(doc['id']) for doc in cursor]
예제 #20
0
def timeline_time(dbname, colname, timename):
    """Return per-user tweet counts and parsed tweet timestamps.

    Users are selected when both 'text_anal.cw.value' and
    'text_anal.h.value' exist.  The result is ``(posts, dates)``: ``posts``
    maps user id to ``timeline_count``; ``dates`` maps user id to a list of
    datetimes parsed from the ``created_at`` of each of the user's tweets in
    ``timename`` (no entry for users without tweets).
    """
    twitter_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    db = dbt.db_connect_no_auth(dbname)
    users = db[colname]
    statuses = db[timename]
    posts, dates = {}, {}
    selector = {"$and": [
        {'text_anal.cw.value': {'$exists': True}},
        {'text_anal.h.value': {'$exists': True}},
    ]}
    for user in users.find(selector):
        uid = user['id']
        posts[uid] = user['timeline_count']
        for status in statuses.find({'user.id': uid}):
            when = datetime.strptime(status['created_at'], twitter_fmt)
            dates.setdefault(uid, []).append(when)
    return posts, dates
예제 #21
0
def create_time(dbname, colname):
    """Return a dict of user id -> account creation datetime.

    All documents of ``dbname[colname]`` are scanned; ``created_at`` is
    parsed with '%a %b %d %H:%M:%S +0000 %Y'.
    """
    twitter_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    users = dbt.db_connect_no_auth(dbname)[colname]
    created_time = {}
    # A previous, disabled query limited the scan to users carrying certain
    # 'text_anal.*' fields and a 'status' field; all users are read now.
    for doc in users.find({}):
        when = datetime.strptime(doc['created_at'], twitter_fmt)
        created_time[doc['id']] = when
    return created_time
예제 #22
0
def lifetime(dbname, comname, timename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    time = db[timename]
    during = []
    for user in com.find({"timeline_count": {'$gt': 0}}):
        newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
        last = datetime.strptime(newtweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        account = datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        print user['id'], last, account, (last.date() - account.date()).days + 1
        during.append((last.date() - account.date()).days + 1)
    pt.plot_config()
    plt.figure(1)
    plt.subplot(211)
    pt.sns.distplot(during)
    print np.mean(during), np.std(during)
    plt.axvline(np.mean(during), linestyle='--', color='k',
                label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()

    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0,2700)
    plt.show()
예제 #23
0
파일: topic_model.py 프로젝트: wtgme/ohsn
def read_document(dbname, colname, timecol, uset=None):
    """Build one lower-cased text document per user from their tweets.

    For every user of ``colname`` with a positive ``timeline_count``, all
    tweets in ``timecol`` are cleaned (retweet prefixes, mentions, hashtags
    and URLs removed, a final '.' added when the text does not already end
    in '.', '?' or '!'), lower-cased and joined with spaces.  Documents with
    more than 50 whitespace-separated words are kept.

    The ids of the kept users are pickled to 'data/doc_ids.pick'.

    uset: accepted for interface compatibility; not used by this function.

    Returns the list of kept documents, in the same order as the pickled ids.
    """
    db = dbt.db_connect_no_auth(dbname)
    col = db[colname]
    timelines = db[timecol]

    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for Retweet
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mention
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for url
    # Order matters: the retweet prefix must be removed before mentions.
    cleaners = (rtgrex, mgrex, hgrex, ugrex)
    documents = list()
    ids = list()

    for user in col.find({'timeline_count': {'$gt': 0}}, ['id']):
        uid = user['id']
        pieces = []
        for tweet in timelines.find({'user.id': uid}):
            text = tweet['text'].encode('utf8')
            # replace RT, @, # and Http://
            for pattern in cleaners:
                text = pattern.sub('', text)
            text = text.strip()
            if not text.endswith(('.', '?', '!')):
                text += '.'
            pieces.append(text.lower())
        # Same result as repeated `textmass + " " + text` but built in one
        # pass instead of re-copying the growing string per tweet.
        textmass = "".join(" " + piece for piece in pieces)
        # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
        if len(textmass.split()) > 50:
            ids.append(uid)
            documents.append(textmass)
    # Close (and flush) the id file; the original left the handle open.
    with open('data/doc_ids.pick', 'w') as fout:
        pickle.dump(ids, fout)
    return documents
예제 #24
0
파일: opinion.py 프로젝트: wtgme/ohsn
def data_4_opinionfinder(dbname, comname, timename, outpath, filter=None):
    """Write one cleaned-tweet file per user plus a list of those files.

    Users are the ``id_str`` values of ``comname`` matching ``filter``.  For
    each user, every tweet in ``timename`` is cleaned (retweet prefixes,
    mentions, hashtags and URLs removed, a final '.' added when missing,
    whitespace collapsed) and written, one per line followed by a tab, to
    '<outpath>/<user>.data'.  Users with no resulting lines get no file.
    Finally '<outpath>.doclist' is written with one
    'database/<outpath>/<user>.data' line per written file.

    filter: query dict forwarded to ``io.get_values_one_field``; ``None``
        (the default) means an empty query.  A fresh dict is created per
        call so the default is never shared between calls.
    """
    query = {} if filter is None else filter
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]

    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for Retweet
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mention
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for url
    # Order matters: the retweet prefix must be removed before mentions.
    cleaners = (rtgrex, mgrex, hgrex, ugrex)

    users = io.get_values_one_field(dbname, comname, 'id_str', query)
    userlist = list()
    for user in users:
        documents = list()
        for tweet in time.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # replace RT, @, # and Http://
            for pattern in cleaners:
                text = pattern.sub('', text)
            text = text.strip()
            if not text.endswith(('.', '?', '!')):
                text += '.'
            words = text.split()
            if words:
                documents.append(' '.join(words))
        if documents:
            with open(outpath+'/'+user+'.data', 'w') as fo:
                for document in documents:
                    fo.write(document+'\t\n')
            userlist.append(user)
    with open(outpath+'.doclist', 'w') as fo:
        for user in userlist:
            fo.write('database/'+outpath+'/'+ user+'.data\n')
예제 #25
0
def process_db(dbname, poicol, timecol, bnetcol, level):
    """Prepare the collections and start the network mining.

    Opens the POI, timeline and behaviour-network collections of
    ``dbname``, ensures the unique (id0, id1, type, statusid) index on the
    network collection, resets ``net_anal.tnmined`` to False on every POI
    where it is True, and then calls ``network_mining``.  A timestamped
    message is printed after each stage.
    """
    def report(message):
        # Timestamp, a tab, then the message.
        now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        print(now + "\t" + message)

    #### Connecting db and collections
    db = dbutil.db_connect_no_auth(dbname)
    sample_poi = db[poicol]
    report('Connecting POI dbs well')

    sample_time = db[timecol]
    report('Connecting timeline dbs well')

    sample_network = db[bnetcol]
    network_keys = [(field, pymongo.ASCENDING)
                    for field in ("id0", "id1", "type", "statusid")]
    sample_network.create_index(network_keys, unique=True)
    # set every poi to have not been analysed.
    sample_poi.update_many({"net_anal.tnmined": True},
                           {'$set': {"net_anal.tnmined": False}},
                           upsert=False)
    report('Connecting network dbs well')

    network_mining(sample_poi, sample_time, sample_network, level)
예제 #26
0
파일: keyplay.py 프로젝트: abiraja2004/ohsn
def extract_behavior_subnetwork(db_name,
                                comname,
                                bnetname,
                                sbnetname,
                                index=0):
    """Copy behaviour records that originate from known users.

    When ``index`` is non-zero, '_t<index>' is appended to all three
    collection names.  Every record of ``bnetname`` whose ``id0`` is the id
    of a user in ``comname`` is inserted into ``sbnetname``; records that
    violate its unique (id0, id1, type, statusid) index are skipped.
    """
    db = dbt.db_connect_no_auth(db_name)
    if index != 0:
        suffix = '_t' + str(index)
        comname = comname + suffix
        bnetname = bnetname + suffix
        sbnetname = sbnetname + suffix
    poi = db[comname]
    net = db[bnetname]
    subnet = db[sbnetname]  # subset of behavior network
    subnet.create_index([(field, pymongo.ASCENDING)
                         for field in ("id0", "id1", "type", "statusid")],
                        unique=True)
    known_ids = {user['id'] for user in poi.find({}, ['id'])}

    for uid in known_ids:
        for rel in net.find({'id0': uid}):
            try:
                subnet.insert(rel)
            except pymongo.errors.DuplicateKeyError:
                # Already copied; nothing to do.
                pass
예제 #27
0
파일: monitor.py 프로젝트: abiraja2004/ohsn
def getuid(dbname, colname):
    """Return the ids of all level-1 users in ``dbname[colname]``."""
    sample_user = dbt.db_connect_no_auth(dbname)[colname]
    return [user['id'] for user in sample_user.find({'level': 1}, ['id'])]
예제 #28
0
파일: friend_pro.py 프로젝트: wtgme/ohsn
def target_set(dbname, comname):
    """Return the set of ``id`` values of every user in ``dbname[comname]``."""
    com = dbt.db_connect_no_auth(dbname)[comname]
    return {user['id'] for user in com.find({}, ['id'])}
예제 #29
0
def bio_statis(dbname, colname, total_users=61580):
    """Print coverage and change statistics for the biography fields.

    For each field, prints '<field>, <coverage>, <changed>' where coverage
    is the number of distinct ``uid`` values having the field divided by
    ``total_users`` and changed is the share of those uids that have more
    than one record with the field (0.00 when no record has the field).
    Afterwards prints the share of records, relative to ``total_users``,
    that have at least one of the gw, cw, h, lw or hw fields.

    dbname, colname: database and collection holding the records.
    total_users: population size used as the denominator; defaults to the
        previously hard-coded 61580.  Must be non-zero.
    """
    db = dbutil.db_connect_no_auth(dbname)
    bio = db[colname]
    biolist =    ['results.gw.value',
                  'results.cw.value',
                  'results.edword_count.value',
                  'results.h.value',
                  'results.a.value',
                  'results.lw.value',
                  'results.hw.value']

    for name in biolist:
        # Number of records carrying this field, per user.
        user_count = {}
        for rec in bio.find({name: {'$exists': True}}):
            user_count[rec['uid']] = user_count.get(rec['uid'], 0) + 1
        change_count = sum(1 for seen in user_count.values() if seen > 1)
        percent = float(len(user_count)) / total_users
        # Guard the empty case, which used to raise ZeroDivisionError.
        if user_count:
            change_per = float(change_count) / len(user_count)
        else:
            change_per = 0.0
        print ('%s, %.2f, %.2f' % (name, percent, change_per))

    # edword_count (index 2) and a (index 4) are deliberately left out.
    any_fields = [biolist[i] for i in (0, 1, 3, 5, 6)]
    count = bio.count({"$or": [{field: {'$exists': True}}
                               for field in any_fields]})
    print ('Have anyone, %.2f' %(float(count) / total_users))
예제 #30
0
파일: com_det.py 프로젝트: abiraja2004/ohsn
def core_ed():
    """Collect the ``id_str`` of every level-1 user in 'fed.com' as a set."""
    users = dbt.db_connect_no_auth('fed')['com']
    cursor = users.find({'level': 1})
    return set(doc['id_str'] for doc in cursor)
예제 #31
0
파일: k_core.py 프로젝트: abiraja2004/ohsn
def ed_user(dbname, colname):
    """Return the ``id_str`` of every user in ``dbname[colname]`` as a list."""
    com = dbt.db_connect_no_auth(dbname)[colname]
    return [user['id_str'] for user in com.find()]
예제 #32
0
def get_users(dbname, colname, filter):
    """Return the set of ``id`` values of users matching ``filter``.

    ``filter`` is passed unchanged as the query to ``find`` on
    ``dbname[colname]``; only the ``id`` field is projected.
    """
    cols = dbt.db_connect_no_auth(dbname)[colname]
    return {user['id'] for user in cols.find(filter, ['id'])}
예제 #33
0
파일: doc2vec.py 프로젝트: abiraja2004/ohsn
def read_document(dbname, colname, timecol, uset=None):
    """Print one '<uid>\\t<text>' line per user, from bio or recent tweets.

    For each user of ``colname`` with a positive ``timeline_count`` the
    description is passed through ``process``; a non-empty result is
    printed with whitespace normalised.  Otherwise the five newest tweets
    from ``timecol`` are processed and concatenated, and the line is
    printed only when at least three tokens remain.  ``uset`` is unused.
    """
    db = dbt.db_connect_no_auth(dbname)
    users = db[colname]
    timelines = db[timecol]
    candidates = users.find({'timeline_count': {'$gt': 0}},
                            ['id', 'description'],
                            no_cursor_timeout=True)
    for user in candidates:
        uid = user['id']
        bio = process(user['description'])
        if bio:
            print(str(uid) + '\t' + ' '.join(bio.split()))
            continue

        # Fall back to the five newest tweets (highest ids first).
        newest = timelines.find({'user.id': uid}, no_cursor_timeout=True)
        newest = newest.sort([('id', -1)]).limit(5)
        textmass = ""
        for tweet in newest:
            cleaned = process(tweet['text'])
            if cleaned:
                textmass += cleaned + ' '
        tokens = textmass.split()
        if len(tokens) >= 3:
            print(str(uid) + '\t' + ' '.join(tokens))
예제 #34
0
파일: activity.py 프로젝트: wtgme/ohsn
def timeline(dbname, timename):
    """Collect the ``created_at`` field of all statuses in ``timename``."""
    db = dbt.db_connect_no_auth(dbname)
    cursor = db[timename].find(no_cursor_timeout=True)
    return [doc['created_at'] for doc in cursor]
예제 #35
0
파일: k_core.py 프로젝트: wtgme/ohsn
def ed_user(dbname, colname):
    """List the ``id_str`` field of all documents in ``dbname[colname]``."""
    collection = dbt.db_connect_no_auth(dbname)[colname]
    cursor = collection.find()
    return [doc['id_str'] for doc in cursor]
예제 #36
0
def bio_change(dbname, colname, timename):
    """Count users whose profile description changed across their timeline.

    For every user in ``colname`` that has both ``liwc_anal.result.i`` and
    ``new_liwc_anal.result.i``, the description embedded in the tweet with the
    highest ``id`` is compared with the one embedded in the tweet with the
    lowest ``id`` (both taken from ``timename``).  When the two differ, both
    are run through ``des_miner.process_text`` and their ``cw`` / ``gw``
    values are compared.

    Prints, space separated: the number of users whose ``cw`` value changed,
    whose ``gw`` value changed, and whose description changed at all.

    Users without any tweet in ``timename`` are skipped (previously they
    raised ``IndexError``).

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the user collection.
    :param timename: name of the tweet collection.
    """
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    timelines = db[timename]
    query = {
        'liwc_anal.result.i': {
            '$exists': True
        },
        'new_liwc_anal.result.i': {
            '$exists': True
        }
    }
    cw_changed, gw_changed, bio_changed = 0, 0, 0
    for user in com.find(query):
        by_user = {'user.id': user['id']}
        # find_one(sort=...) returns None instead of raising when the user
        # has no tweets, and leaves no open cursor behind.
        newtweet = timelines.find_one(by_user, sort=[('id', -1)])
        oldtweet = timelines.find_one(by_user, sort=[('id', 1)])
        if newtweet is None or oldtweet is None:
            continue
        newdes = newtweet['user']['description']
        olddes = oldtweet['user']['description']
        if newdes == olddes:
            continue
        bio_changed += 1
        newbio = des_miner.process_text(newdes)
        oldbio = des_miner.process_text(olddes)
        if 'cw' in newbio and 'cw' in oldbio:
            if newbio['cw']['value'] != oldbio['cw']['value']:
                cw_changed += 1
        if 'gw' in newbio and 'gw' in oldbio:
            if newbio['gw']['value'] != oldbio['gw']['value']:
                gw_changed += 1
    # Same output as the former "print cw, gw, all".
    print('%s %s %s' % (cw_changed, gw_changed, bio_changed))
예제 #37
0
파일: data_trans.py 프로젝트: wtgme/ohsn
def get_users(dbname, colname, filter):
    """Return the set of ``id`` values of the documents selected by a query.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the collection to query.
    :param filter: query document passed to ``find``.
    :return: set of ``id`` values.
    """
    database = dbt.db_connect_no_auth(dbname)
    matches = database[colname].find(filter, ['id'])
    return set(match['id'] for match in matches)
예제 #38
0
파일: re_collect.py 프로젝트: wtgme/ohsn
def getuid(dbname, colname):
    """Return the ``id`` of every document whose ``level`` equals 1.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the collection to query.
    :return: list of ``id`` values in cursor order.
    """
    collection = dbt.db_connect_no_auth(dbname)[colname]
    return [doc["id"] for doc in collection.find({"level": 1}, ["id"])]
예제 #39
0
파일: keyplay.py 프로젝트: wtgme/ohsn
def extract_behavior_subnetwork(db_name, comname, bnetname, sbnetname, index=0):
    """Copy the behaviour edges whose two endpoints are both listed users.

    Every document of ``bnetname`` whose ``id0`` and ``id1`` are both ``id``
    values found in ``comname`` is inserted into ``sbnetname``.  A unique
    index on (id0, id1, type, statusid) is ensured on the target, and edges
    that already exist there are skipped.

    :param db_name: database name handed to ``dbt.db_connect_no_auth``.
    :param comname: name of the user collection.
    :param bnetname: name of the source edge collection.
    :param sbnetname: name of the target edge collection.
    :param index: when non-zero, ``"_t<index>"`` is appended to all three
        collection names.
    """
    db = dbt.db_connect_no_auth(db_name)
    if index != 0:
        suffix = "_t" + str(index)
        comname = comname + suffix
        bnetname = bnetname + suffix
        sbnetname = sbnetname + suffix
    users = db[comname]
    edges = db[bnetname]
    subnet = db[sbnetname]  # subset of behavior network
    edge_key = [(field, pymongo.ASCENDING)
                for field in ("id0", "id1", "type", "statusid")]
    subnet.create_index(edge_key, unique=True)

    members = set(doc["id"] for doc in users.find({}, ["id"]))
    for member in members:
        for edge in edges.find({"id0": member}):
            if edge["id1"] not in members:
                continue
            try:
                subnet.insert(edge)
            except pymongo.errors.DuplicateKeyError:
                # The edge is already in the target collection.
                pass
예제 #40
0
def target_set(dbname, comname):
    """Return the set of ``id`` values of all documents in a collection.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param comname: name of the collection to scan.
    :return: set of ``id`` values.
    """
    collection = dbt.db_connect_no_auth(dbname)[comname]
    return set(doc['id'] for doc in collection.find({}, ['id']))
예제 #41
0
def word2vec_tweets(dbname, colname, timecol):
    """Store, per user, the mean word2vec vector of the words they tweeted.

    The model is loaded from ``word2vec/fed_w2v.model``.  For each user of
    ``colname`` with ``timeline_count > 0`` every tweet in ``timecol`` is
    lower-cased, stripped of "rt @user", "@user" and http(s) links, and
    tokenized; the vectors of all tokens known to the model are averaged.
    Users with at least one known token get ``w2v.mined = True`` and
    ``w2v.result`` (the mean vector as a list) set on their document.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the user collection (read and updated).
    :param timecol: name of the tweet collection.
    """
    w2v = gensim.models.Word2Vec.load('word2vec/fed_w2v.model')
    db = dbt.db_connect_no_auth(dbname)
    users = db[colname]
    tweets = db[timecol]
    with_timeline = {'timeline_count': {'$gt': 0}}
    for user in users.find(with_timeline, ['id'], no_cursor_timeout=True):
        uid = user['id']
        vectors = []
        for tweet in tweets.find({'user.id': uid}, no_cursor_timeout=True):
            raw = tweet['text'].encode('utf8').strip().lower()
            # Remove "rt @user", "@user" and links; hashtags are kept.
            cleaned = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", raw)
            tokens = tokenizer.tokenize(cleaned)
            vectors.extend(w2v[token] for token in tokens if token in w2v)
        if not vectors:
            # Nothing the model knows about: leave the user untouched.
            continue
        mean_vector = np.array(vectors).mean(axis=0)
        changes = {'w2v.mined': True, 'w2v.result': mean_vector.tolist()}
        users.update_one({'id': uid}, {'$set': changes}, upsert=False)
예제 #42
0
def read_tweets(dbname, timecol):
    '''Print the tokenized text and hashtags of every non-retweet.

    Documents of ``timecol`` without a ``retweeted_status`` field are read.
    The text is lower-cased, stripped of "rt @user", "@user" and http(s)
    links, and tokenized.  Tweets with more than three tokens are printed as
    four tab-separated columns: user id, tweet id, space-joined tokens,
    space-joined hashtags (lower-cased, with "_" and "-" removed,
    de-duplicated).

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param timecol: name of the tweet collection.
    '''
    tweets = dbt.db_connect_no_auth(dbname)[timecol]
    not_retweet = {'retweeted_status': {'$exists': False}}
    for tweet in tweets.find(not_retweet, no_cursor_timeout=True):
        tags = set(
            tag['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            for tag in tweet['entities']['hashtags'])

        raw = tweet['text'].encode('utf8')
        uid = tweet['user']['id']
        # Remove "rt @user", "@user" and links; hashtags are kept.
        cleaned = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "",
                         raw.strip().lower())
        words = tokenizer.tokenize(cleaned)
        if len(words) > 3:
            columns = (uid, tweet['id'], ' '.join(words), ' '.join(tags))
            print('%d\t%d\t%s\t%s' % columns)
예제 #43
0
파일: net_util.py 프로젝트: wtgme/ohsn
def load_behavior_network(db_name, collection='None', btype='communication'):
    '''Build a weighted directed graph from a behaviour-edge collection.

    Codes used in the ``type`` field of the edge documents:
    Tweet: 0
    Retweet: 1;
    Reply: 2;
    Direct Mention: 3;
    undirect mention: 4

    Each selected row contributes weight 1 to the edge ``id0 -> id1``
    (``id1 -> id0`` for ``btype='retweet'``); rows whose two ids are equal
    are ignored.

    :param db_name: database name, or -- when ``collection`` is left at its
        default -- an object with a ``find`` method that is queried directly.
    :param collection: collection name; the string ``'None'`` means
        "``db_name`` already is the collection".
    :param btype: 'retweet', 'reply', 'mention' or 'communication'
        (reply + mention).
    :return: ``DiGraph`` with a ``weight`` attribute on every edge.
    :raises KeyError: if ``btype`` is not one of the known names.
    '''
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    # Strings must be compared by value; "is" only matched here as a side
    # effect of CPython interning identical literals.
    if collection == 'None':
        cols = db_name
    else:
        db = dbt.db_connect_no_auth(db_name)
        cols = db[collection]
    reverse = (btype == 'retweet')
    DG = DiGraph()
    for row in cols.find({"type": {'$in': btype_dic[btype]}}):
        if reverse:
            n1, n2 = row['id1'], row['id0']
        else:
            n1, n2 = row['id0'], row['id1']
        if n1 == n2:
            continue
        # has_edge already implies that both endpoints exist.
        if DG.has_edge(n1, n2):
            DG[n1][n2]['weight'] += 1
        else:
            DG.add_edge(n1, n2, weight=1)
    return DG
예제 #44
0
파일: tag_network.py 프로젝트: wtgme/ohsn
def user_hashtag_profile(dbname, hash_com):
    '''
    Map the hashtags that a user has used to communities of hashtag network
    Get the <community: proportion> vector for users' hashtag profiles

    For every ``id`` returned by ``iot.get_values_one_field(dbname, 'scom',
    'id')`` the user's tweets with at least one hashtag are read from the
    ``timeline`` collection.  Each distinct normalised hashtag (lower-cased,
    "_" and "-" removed) counts once per tweet.  Counts of hashtags present
    in ``hash_com`` are summed per community and normalised to proportions;
    a user with no mapped hashtag keeps an all-zero vector.  The resulting
    ``{user id: numpy vector}`` dict is pickled to
    ``data/user-hash-profile.pick``.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param hash_com: dict mapping normalised hashtag -> community id.
        NOTE(review): ids are used as list indexes, so they are assumed to be
        0 .. (number of distinct communities - 1) -- confirm with the caller.
    :return: None
    '''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    db = dbt.db_connect_no_auth(dbname)
    com_length = len(set(hash_com.values()))
    times = db['timeline']
    user_hash_profile = {}
    for uid in ed_users:
        counter = {}
        for tweet in times.find({'user.id': uid, '$where': 'this.entities.hashtags.length>0'}):
            tags = set(
                tag['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
                for tag in tweet['entities']['hashtags'])
            for tag in tags:
                counter[tag] = counter.get(tag, 0) + 1
        vector = [0.0] * com_length
        for tag, freq in counter.items():
            if tag in hash_com:
                vector[hash_com[tag]] += freq
        total = sum(vector)
        profile = np.array(vector)
        if total != 0:
            profile = profile / total
        user_hash_profile[uid] = profile

    # Binary mode plus a context manager: the handle is closed (and flushed)
    # deterministically and the pickle stream is never newline-translated.
    with open('data/user-hash-profile.pick', 'wb') as out:
        pickle.dump(user_hash_profile, out)
예제 #45
0
def network_snowball(dbname, mode='N'):
    db = dbt.db_connect_no_auth(dbname)
    ed_poi = db['ccom']
    ed_net = db['cnet']
    stream_users = db['poi']
    # echelon = dbt.db_connect_no_auth('echelon')
    # echelon_poi = echelon['poi']

    ed_poi.create_index("id", unique=True)
    ed_poi.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    ed_poi.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    ed_net.create_index([("user", pymongo.ASCENDING),
                         ("follower", pymongo.ASCENDING),
                         ("type", pymongo.ASCENDING)],
                        unique=True)

    while True:
        ed_seed = profiles_check.seed_all_profile(stream_users)
        length = len(ed_seed)
        if length == 0:
            print datetime.datetime.now().strftime(
                "%Y-%m-%d-%H-%M-%S"), 'no seed users, finished!'
            break
        else:
            print 'seed users: ', length
            lookup.trans_seed_to_poi(ed_seed, ed_poi)
            continue

    statis = ''
    level = 1
    while level < 3:
        print datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S"
        ), 'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(ed_poi, ed_net, level,
                                                      mode)
        print datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S"
        ), 'Snowball followees of seeds for sample db', level
        follower_flag = follower.snowball_follower(ed_poi, ed_net, level, mode)
        # count = ed_poi.count()
        # try:
        #     # nsize, esize = nt.size_gaint_comp_net_db(ed_net)
        #     # s = 'Start_level: ' + str(level) + ' all_users: ' + \
        #     #           str(count) + ' size_gc:' + str(nsize) + ' ed_gc: ' + str(esize) + '\n'
        #     print s
        #     statis += s
        # except networkx.exception.NetworkXPointlessConcept:
        #     nsize = 0
        #     pass
        if (following_flag == False and follower_flag == False):
            return statis
        else:
            level += 1
            continue
예제 #46
0
파일: io_util.py 프로젝트: abiraja2004/ohsn
def print_tweets(dbname, timeline):
    """Print the ``text`` of every document in a collection.

    Texts that cannot be encoded for the output stream are skipped silently.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param timeline: name of the collection to print.
    """
    tweets = dbt.db_connect_no_auth(dbname)[timeline]
    for doc in tweets.find():
        try:
            print(doc['text'])
        except UnicodeEncodeError:
            # Best effort: an unprintable tweet must not stop the dump.
            continue
예제 #47
0
파일: k_core.py 프로젝트: wtgme/ohsn
def verify_core_user(dbname, colname, usetlist):
    """Print the screen name of every listed user whose ``level`` is not 1.

    Used to verify the users in the largest K-core.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the user collection.
    :param usetlist: iterable of user ids; each is converted with ``int``.
    """
    users = dbt.db_connect_no_auth(dbname)[colname]
    for raw_id in usetlist:
        profile = users.find_one({'id': int(raw_id)})
        if profile['level'] == 1:
            continue
        print(profile['screen_name'].encode('utf-8'))
예제 #48
0
def timeline(dbname, colname):
    """Return the parsed ``created_at`` timestamp of every document.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the collection to scan.
    :return: list of ``datetime`` objects in cursor order.
    :raises ValueError: if a ``created_at`` string does not match the
        expected ``'%a %b %d %H:%M:%S +0000 %Y'`` layout.
    """
    stamp_format = '%a %b %d %H:%M:%S +0000 %Y'
    statuses = dbt.db_connect_no_auth(dbname)[colname]
    return [datetime.strptime(doc['created_at'], stamp_format)
            for doc in statuses.find()]
예제 #49
0
파일: io_util.py 프로젝트: wtgme/ohsn
def print_tweets(dbname, timeline):
    """Dump the ``text`` field of each document of a collection to stdout.

    A document whose text raises ``UnicodeEncodeError`` on printing is
    skipped.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param timeline: name of the collection to dump.
    """
    database = dbt.db_connect_no_auth(dbname)
    for entry in database[timeline].find():
        try:
            print(entry['text'])
        except UnicodeEncodeError:
            # Deliberately ignored so one bad tweet does not abort the dump.
            continue
예제 #50
0
파일: trimuser.py 프로젝트: wtgme/ohsn
def trim_user(dbname, timename):
    """Replace the embedded user object of each tweet with just its ``id``.

    Only tweets that still carry ``user.screen_name`` are visited; each is
    updated (matched by its ``id``) so that ``user`` becomes
    ``{'id': <user id>}``.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param timename: name of the tweet collection (updated in place).
    """
    tweets = dbt.db_connect_no_auth(dbname)[timename]
    untrimmed = {'user.screen_name': {'$exists': True}}
    for tweet in tweets.find(untrimmed, no_cursor_timeout=True):
        full_user = tweet['user']
        slim_user = {'id': full_user['id']}
        tweets.update_one({'id': tweet['id']},
                          {'$set': {"user": slim_user}},
                          upsert=False)
예제 #51
0
파일: statistics.py 프로젝트: wtgme/ohsn
def get_period(dbname, timename, newtimename, start=None, end=None):
    """Copy the statuses created inside a time window to another collection.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param timename: name of the source collection.
    :param newtimename: name of the target collection (same database).
    :param start: inclusive lower bound for ``created_at_date``; defaults to
        ``datetime(2013, 7, 25)``, the value that used to be hard-coded.
    :param end: exclusive upper bound for ``created_at_date``; defaults to
        ``datetime(2013, 7, 29)``, the value that used to be hard-coded.
    """
    if start is None:
        start = datetime(2013, 7, 25)
    if end is None:
        end = datetime(2013, 7, 29)
    db = dbt.db_connect_no_auth(dbname)
    timeline = db[timename]
    newtimeline = db[newtimename]
    window = {'created_at_date': {'$gte': start, '$lt': end}}
    for status in timeline.find(window, no_cursor_timeout=True):
        newtimeline.insert(status)
예제 #52
0
파일: k_core.py 프로젝트: abiraja2004/ohsn
def verify_core_user(dbname, colname, usetlist):
    """Verify the users in the largest K-core.

    Looks up each id of ``usetlist`` (converted with ``int``) in ``colname``
    and prints the UTF-8 encoded screen name of those whose ``level`` differs
    from 1.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param colname: name of the user collection.
    :param usetlist: iterable of user ids.
    """
    database = dbt.db_connect_no_auth(dbname)
    collection = database[colname]
    for uid in usetlist:
        found = collection.find_one({'id': int(uid)})
        if found['level'] == 1:
            continue
        print(found['screen_name'].encode('utf-8'))
예제 #53
0
def beh_stat(dbname, comname, colname, filename=None):
    """Count tweeting behaviours per user and over all users.

    For every user ``id`` in ``comname`` the statuses of ``colname`` with a
    matching ``user.id`` are scanned.  Nine counters are kept, in this order:

    0. tweet      -- statuses without ``retweeted_status``
    1. retweet    -- statuses with ``retweeted_status``
    2. dmention   -- statuses with a mention that is neither a reply target
                     nor mentioned in the retweeted status
    3. udmention  -- statuses with a mention that also appears in the
                     retweeted status
    4. reply      -- statuses mentioning ``in_reply_to_user_id``
    5. hashtag    -- statuses with at least one hashtag entity
    6. url        -- statuses with at least one url entity
    7. quota      -- statuses carrying ``quoted_status``
    8. count_sum  -- all statuses

    A status increments each counter at most once.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param comname: name of the user collection.
    :param colname: name of the status collection.
    :param filename: optional; when truthy the result is pickled to
        ``data/<filename>.pick``.
    :return: dict mapping user id -> 9-tuple of counters; key ``-1`` holds
        the totals over all users.
    """
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    timeline = db[colname]
    totals = [0] * 9
    user_staits = {}
    for user in com.find({}, ['id'], no_cursor_timeout=True):
        tweet = retweet = dmention = udmention = reply = 0
        hashtag = url = quota = count_sum = 0
        for status in timeline.find({'user.id': user['id']}, no_cursor_timeout=True):
            count_sum += 1
            is_retweet = 'retweeted_status' in status
            if is_retweet:
                retweet += 1
            else:
                tweet += 1

            entities = status['entities']
            mentions = entities['user_mentions']
            if len(mentions) > 0:
                # Ids mentioned inside the retweeted status; a set keeps the
                # membership test below constant-time.
                retweeted_ids = set()
                if is_retweet:
                    retweeted_ids = set(
                        item['id'] for item in
                        status['retweeted_status']['entities']['user_mentions'])

                replyf, udmentionf, dmentionf = False, False, False
                for mention in mentions:
                    if ('in_reply_to_user_id' in status) and \
                            (mention['id'] == status['in_reply_to_user_id']):  # reply
                        replyf = True
                    elif mention['id'] in retweeted_ids:  # mention copied from the retweeted status
                        udmentionf = True
                    else:  # original mention
                        dmentionf = True
                if replyf:
                    reply += 1
                if udmentionf:
                    udmention += 1
                if dmentionf:
                    dmention += 1
            if len(entities['hashtags']) > 0:
                hashtag += 1
            if len(entities['urls']) > 0:
                url += 1
            if 'quoted_status' in status:
                quota += 1

        counts = (tweet, retweet, dmention, udmention, reply,
                  hashtag, url, quota, count_sum)
        user_staits[user['id']] = counts
        # Fold this user's counters into the overall totals.
        totals = [total + count for total, count in zip(totals, counts)]
    user_staits[-1] = tuple(totals)
    if filename:
        # Binary mode plus a context manager: the handle is closed
        # deterministically and the pickle stream is never newline-translated.
        with open('data/' + filename + '.pick', 'wb') as out:
            pickle.dump(user_staits, out)
    return user_staits
예제 #54
0
def beh_stat(dbname, comname, colname, filename=None):
    """Count tweeting behaviours per user and over all users.

    For every user ``id`` in ``comname`` the statuses of ``colname`` with a
    matching ``user.id`` are scanned.  Nine counters are kept, in this order:

    0. tweet      -- statuses without ``retweeted_status``
    1. retweet    -- statuses with ``retweeted_status``
    2. dmention   -- statuses with a mention that is neither a reply target
                     nor mentioned in the retweeted status
    3. udmention  -- statuses with a mention that also appears in the
                     retweeted status
    4. reply      -- statuses mentioning ``in_reply_to_user_id``
    5. hashtag    -- statuses with at least one hashtag entity
    6. url        -- statuses with at least one url entity
    7. quota      -- statuses carrying ``quoted_status``
    8. count_sum  -- all statuses

    A status increments each counter at most once.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param comname: name of the user collection.
    :param colname: name of the status collection.
    :param filename: optional; when truthy the result is pickled to
        ``data/<filename>.pick``.
    :return: dict mapping user id -> 9-tuple of counters; key ``-1`` holds
        the totals over all users.
    """
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    timeline = db[colname]
    totals = [0] * 9
    user_staits = {}
    for user in com.find({}, ['id'], no_cursor_timeout=True):
        tweet = retweet = dmention = udmention = reply = 0
        hashtag = url = quota = count_sum = 0
        for status in timeline.find({'user.id': user['id']}, no_cursor_timeout=True):
            count_sum += 1
            is_retweet = 'retweeted_status' in status
            if is_retweet:
                retweet += 1
            else:
                tweet += 1

            entities = status['entities']
            mentions = entities['user_mentions']
            if len(mentions) > 0:
                # Ids mentioned inside the retweeted status; a set keeps the
                # membership test below constant-time.
                retweeted_ids = set()
                if is_retweet:
                    retweeted_ids = set(
                        item['id'] for item in
                        status['retweeted_status']['entities']['user_mentions'])

                replyf, udmentionf, dmentionf = False, False, False
                for mention in mentions:
                    if ('in_reply_to_user_id' in status) and \
                            (mention['id'] == status['in_reply_to_user_id']):  # reply
                        replyf = True
                    elif mention['id'] in retweeted_ids:  # mention copied from the retweeted status
                        udmentionf = True
                    else:  # original mention
                        dmentionf = True
                if replyf:
                    reply += 1
                if udmentionf:
                    udmention += 1
                if dmentionf:
                    dmention += 1
            if len(entities['hashtags']) > 0:
                hashtag += 1
            if len(entities['urls']) > 0:
                url += 1
            if 'quoted_status' in status:
                quota += 1

        counts = (tweet, retweet, dmention, udmention, reply,
                  hashtag, url, quota, count_sum)
        user_staits[user['id']] = counts
        # Fold this user's counters into the overall totals.
        totals = [total + count for total, count in zip(totals, counts)]
    user_staits[-1] = tuple(totals)
    if filename:
        # Binary mode plus a context manager: the handle is closed
        # deterministically and the pickle stream is never newline-translated.
        with open('data/' + filename + '.pick', 'wb') as out:
            pickle.dump(user_staits, out)
    return user_staits
예제 #55
0
def transform():
    """Copy selected profile fields of level-3 users from ``rd.com`` to ``drd.com``.

    A unique index on ``id`` is ensured on the target collection; each
    level-3 user of the source is inserted with only the fields ``id``,
    ``screen_name``, ``description``, ``friends_count``, ``followers_count``
    and ``statuses_count``.
    """
    fields = [
        'id', 'screen_name', "description", "friends_count",
        "followers_count", "statuses_count"
    ]
    source = dbt.db_connect_no_auth('rd')['com']
    target = dbt.db_connect_no_auth('drd')['com']
    target.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in source.find({'level': 3}, fields):
        # Build a fresh document so the source "_id" is not carried over.
        target.insert(dict((name, user[name]) for name in fields))
예제 #56
0
def states_change(dbname1, dbname2, comname1, comname2):
    """Report level-1 users whose ``check_ed`` result differs between two collections.

    Every level-1 user of the first collection is looked up by ``id`` in the
    second one.  When it exists there and ``profiles_check.check_ed`` gives a
    different result for the two documents, the user id is printed.  Finally
    the number of differing users and the number of level-1 users visited
    are printed.

    :param dbname1: database holding the first collection.
    :param dbname2: database holding the second collection.
    :param comname1: name of the first collection.
    :param comname2: name of the second collection.
    """
    db1 = dbt.db_connect_no_auth(dbname1)
    db2 = dbt.db_connect_no_auth(dbname2)
    before = db1[comname1]
    after = db2[comname2]
    changed = 0
    visited = 0
    for old_profile in before.find({'level': 1}):
        visited += 1
        old_state = profiles_check.check_ed(old_profile)
        new_profile = after.find_one({'id': old_profile['id']})
        if not new_profile:
            continue
        if old_state != profiles_check.check_ed(new_profile):
            print(old_profile['id'])
            changed += 1
    print(changed)
    print(visited)
예제 #57
0
파일: keyplay.py 프로젝트: wtgme/ohsn
def get_retweeted_tweet(db_name):
    """Annotate each ``sbnet`` edge with the id of the retweeted status.

    For every edge in ``sbnet`` the status ``statusid`` is looked up in
    ``timeline`` and the ``id`` of its ``retweeted_status`` is stored on the
    edge as ``ostatusid``.

    Each edge is updated through its own ``_id``.  The previous
    single-document update filtered on ``statusid`` always hit the first
    matching edge, so further edges sharing that status never received
    ``ostatusid``.  Edges whose status is missing from ``timeline`` or is
    not a retweet are skipped instead of aborting the run.

    :param db_name: database name handed to ``dbt.db_connect_no_auth``.
    """
    db = dbt.db_connect_no_auth(db_name)
    bnet = db["sbnet"]
    timeline = db["timeline"]
    for net in bnet.find({}):
        orig = timeline.find_one({"id": net["statusid"]}, ["retweeted_status"])
        if orig is None or "retweeted_status" not in orig:
            # Nothing to link: unknown status or not a retweet.
            continue
        oid = orig["retweeted_status"]["id"]
        bnet.update_one({"_id": net["_id"]}, {"$set": {"ostatusid": oid}})
예제 #58
0
def netstatis(dbname, behavior_name, g, userlist, comname):
    """Compute a diversity score for the listed users and store it per user.

    The graph is first made undirected, summing the ``weight`` of merged
    edges.  For every vertex whose ``name`` is in ``userlist`` the score is
    ``g.diversity(vertex, 'weight') * log(degree)``.  Each score (0.0 when it
    is not finite) is written under the key ``<behavior_name>_div`` into the
    ``behavior`` sub-document of the user whose ``id`` equals ``int(name)``.

    :param dbname: database name handed to ``dbt.db_connect_no_auth``.
    :param behavior_name: prefix of the key stored in ``behavior``.
    :param g: graph with a ``name`` vertex attribute and ``weight`` edge
        attribute.
    :param userlist: iterable of vertex names to score.
    :param comname: name of the user collection (updated in place).
    :return: numpy array of scores with non-finite entries replaced by 0.0,
        ordered like the internally built target list.
    """
    db = dbt.db_connect_no_auth(dbname)
    users = db[comname]
    g = g.as_undirected(combine_edges=dict(weight="sum"))

    # Only users that are vertices of the graph can be scored.
    vertex_names = g.vs["name"]
    target_nodes = list(set(userlist).intersection(vertex_names))

    degrees = g.degree(target_nodes, mode='OUT', loops=False)
    divs = np.array(g.diversity(target_nodes, 'weight')) * np.log(degrees)

    # Store in DB
    for node, score in zip(target_nodes, divs):
        uid = int(node)
        profile = users.find_one({'id': uid})
        behavior = profile.get('behavior', {})
        behavior[behavior_name + '_div'] = score if np.isfinite(score) else 0.0
        users.update_one({'id': uid}, {'$set': {'behavior': behavior}},
                         upsert=False)

    # Give the caller the same sanitised values that were stored.
    divs[~np.isfinite(divs)] = 0.0
    return divs
예제 #59
0
파일: data_trans.py 프로젝트: wtgme/ohsn
def test_timline():
    """Reset the timeline counters of users whose stored timeline looks short.

    Users of ``rd.com`` with ``timeline_count < 3200`` are checked; when
    ``statuses_count`` exceeds ``timeline_count`` by more than 100 the user
    id is printed and ``timeline_count`` and ``timeline_scraped_times`` are
    set to 0.
    """
    users = dbt.db_connect_no_auth('rd')['com']
    below_limit = {'timeline_count': {'$lt': 3200}}
    wanted = ['id', 'timeline_count', 'statuses_count']
    for user in users.find(below_limit, wanted):
        missing = user['statuses_count'] - user['timeline_count']
        if missing > 100:
            print(user['id'])
            reset = {'$set': {"timeline_count": 0,
                              'timeline_scraped_times': 0}}
            users.update({'id': user['id']}, reset, upsert=False)
예제 #60
0
파일: keyplay.py 프로젝트: abiraja2004/ohsn
def get_retweeted_tweet(db_name):
    """Annotate each ``sbnet`` edge with the id of the retweeted status.

    For every edge in ``sbnet`` the status ``statusid`` is looked up in
    ``timeline`` and the ``id`` of its ``retweeted_status`` is stored on the
    edge as ``ostatusid``.

    Each edge is updated through its own ``_id``.  The previous
    single-document update filtered on ``statusid`` always hit the first
    matching edge, so further edges sharing that status never received
    ``ostatusid``.  Edges whose status is missing from ``timeline`` or is
    not a retweet are skipped instead of aborting the run.

    :param db_name: database name handed to ``dbt.db_connect_no_auth``.
    """
    db = dbt.db_connect_no_auth(db_name)
    bnet = db['sbnet']
    timeline = db['timeline']
    for net in bnet.find({}):
        orig = timeline.find_one({'id': net['statusid']}, ['retweeted_status'])
        if orig is None or 'retweeted_status' not in orig:
            # Nothing to link: unknown status or not a retweet.
            continue
        oid = orig['retweeted_status']['id']
        bnet.update_one({'_id': net['_id']}, {'$set': {"ostatusid": oid}})