Example #1
def filter_ed_tweets():
    """Filter ED-related tweets based on word2vec similarity to seed terms."""
    import pymongo
    from gensim import models
    from ohsn.edrelated import edrelatedcom
    # dbt below is the project's MongoDB helper; its import path is not
    # shown in this snippet.
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')

    times = dbt.db_connect_col('fed', 'timeline')
    ed_times = dbt.db_connect_col('fed', 'edtimeline')
    ed_times.create_index([('user.id', pymongo.ASCENDING),
                           ('id', pymongo.DESCENDING)])
    ed_times.create_index([('id', pymongo.ASCENDING)], unique=True)
    # word2vec seed terms for ED content
    ed_list = {
        'bmi', 'cw', 'ugw', 'gw', 'lbs', 'hw', 'lw', 'kg', 'ed',
        'eatingdisorder', 'anorexia', 'bulimia', 'anorexic', 'ana',
        'bulimic', 'mia', 'thinspo', 'bulemia', 'purge', 'binge',
        'selfharm', 'ednos', 'edprobs', 'edprob', 'proana', 'anamia',
        'promia', 'askanamia', 'bonespo', 'legspo'
    }
    model = models.word2vec.Word2Vec.load('data/word2vec')
    import ohsn.api.profiles_check as pc
    print 'Labeled users: %d' % len(prorec + proed)
    for user in prorec + proed:
        for tweet in times.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            keywords = pc.tokenizer_stoprm(text)
            sumsim = 0.0
            count = 0
            for word in keywords:
                if word in model:
                    for ed in ed_list:
                        sim = model.similarity(word, ed)
                        sumsim += sim
                        count += 1
            if count != 0 and (sumsim / count) > 0.26:  # mean similarity to ED seed terms
                try:
                    ed_times.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
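
The 0.26 cutoff above is applied to the mean pairwise similarity between a tweet's tokens and the seed terms. A minimal sketch of that scoring rule in isolation (assuming the same gensim word2vec model, and that every seed term is in the model's vocabulary, as the loop above does; the function name is illustrative, not from the project):

def mean_ed_similarity(tokens, model, seed_terms):
    # average model.similarity over every (token, seed) pair,
    # skipping out-of-vocabulary tokens
    total, count = 0.0, 0
    for token in tokens:
        if token in model:
            for seed in seed_terms:
                total += model.similarity(token, seed)
                count += 1
    return total / count if count else 0.0

# a tweet is kept when mean_ed_similarity(keywords, model, ed_list) > 0.26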
Example #2
def recover_proed_community():
    """Build the follow network of pro-recovery and pro-ED users and their
    out-linked communities."""
    from igraph import Graph
    from ohsn.edrelated import edrelatedcom
    # dbt (MongoDB helper) and gt (graph utilities) are project modules;
    # their import paths are not shown in this snippet.
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    cols = dbt.db_connect_col('fed', 'follownet')
    name_map, edges = {}, set()
    for row in cols.find({}, no_cursor_timeout=True):
        n1 = str(row['follower'])
        if n1 in prorec or n1 in proed:
            n2 = str(row['user'])
            n1id = name_map.setdefault(n1, len(name_map))
            n2id = name_map.setdefault(n2, len(name_map))
            edges.add((n1id, n2id))
    g = Graph(len(name_map), directed=True)
    g.vs["name"] = sorted(name_map, key=name_map.get)  # names ordered by assigned id
    g.add_edges(list(edges))
    g.es["weight"] = 1
    g.vs["set"] = 0
    for v in g.vs:
        if v['name'] in prorec:
            v['set'] = 1
        elif v['name'] in proed:
            v['set'] = -1
    gt.summary(g)

    g.vs['deg'] = g.indegree()
    nodes = []
    for v in g.vs:
        # keep labeled users, plus unlabeled users with indegree > 3
        if v['set'] != 0 or v['deg'] > 3:
            nodes.append(v)
    print 'Filtered nodes: %d' % len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('rec-proed-follow.graphml')

    # sbnet has been extended to include all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        nodes = [v for v in gb.vs if v['name'] in edusers]
        gb = gb.subgraph(nodes)
        for v in gb.vs:
            v['set'] = g.vs.find(name=v['name'])['set']
        gt.summary(gb)
        gb.write_graphml('rec-proed-'+btype+'.graphml')
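
The graph construction above relies on one idiom: assign each screen name a stable integer id on first sight, then build the igraph object in a single call rather than adding vertices and edges one at a time. A condensed sketch of that idiom (the function name is illustrative, not from the project):

from igraph import Graph

def build_follow_graph(pairs):
    # pairs is an iterable of (follower, followee) name strings
    name_map, edges = {}, set()
    for follower, followee in pairs:
        src = name_map.setdefault(follower, len(name_map))
        dst = name_map.setdefault(followee, len(name_map))
        edges.add((src, dst))
    g = Graph(len(name_map), directed=True)
    g.vs['name'] = sorted(name_map, key=name_map.get)  # names ordered by id
    g.add_edges(list(edges))
    return g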
Example #3
def recover_proed_interaction():
    """Build retweet/reply/mention interaction networks of pro-recovery and
    pro-ED users."""
    from igraph import Graph
    from ohsn.edrelated import edrelatedcom
    # dbt (MongoDB helper) and gt (graph utilities) are project modules;
    # their import paths are not shown in this snippet.
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    for btype in ['retweet', 'reply', 'mention']:
        cols = dbt.db_connect_col('fed', 'sbnet')
        name_map, edges = {}, {}
        for row in cols.find({'type': {'$in': btype_dic[btype]}}, no_cursor_timeout=True):
            n1 = str(row['id0'])
            n2 = str(row['id1'])
            if (n1 in prorec or n1 in proed) and n1 != n2:
                n1id = name_map.setdefault(n1, len(name_map))
                n2id = name_map.setdefault(n2, len(name_map))
                edges[(n1id, n2id)] = edges.get((n1id, n2id), 0) + 1
        g = Graph(len(name_map), directed=True)
        g.vs["name"] = sorted(name_map, key=name_map.get)  # names ordered by assigned id
        g.add_edges(edges.keys())
        g.es["weight"] = edges.values()
        g.vs["set"] = 0
        for v in g.vs:
            if v['name'] in prorec:
                v['set'] = 1
            elif v['name'] in proed:
                v['set'] = -1
        gt.summary(g)


        # endpoints of strong edges (more than 3 interactions)
        strong_edges = g.es.select(weight_gt=3)
        edge_nodes = set()
        for edge in strong_edges:
            edge_nodes.add(g.vs[edge.source]['name'])
            edge_nodes.add(g.vs[edge.target]['name'])

        nodes = []
        for v in g.vs:
            # keep labeled users, plus endpoints of strong edges
            if v['set'] != 0 or v['name'] in edge_nodes:
                nodes.append(v)
        print 'Filtered nodes: %d' % len(nodes)
        g = g.subgraph(nodes)
        gt.summary(g)
        g.write_graphml('rec-proed-'+btype+'.graphml')
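
The pruning step above keeps every labeled user plus any user incident to an edge with weight (interaction count) greater than 3. The same rule as a stand-alone helper (a sketch; the name and signature are illustrative, and it assumes the 'set' and 'weight' attributes set up above):

def prune_weak_nodes(g, min_weight=3):
    # collect endpoints of edges seen more than min_weight times
    keep = set()
    for e in g.es.select(weight_gt=min_weight):
        keep.add(g.vs[e.source]['name'])
        keep.add(g.vs[e.target]['name'])
    nodes = [v for v in g.vs if v['set'] != 0 or v['name'] in keep]
    return g.subgraph(nodes)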
Example #4
def classify_recovery_proed():
    """Classify pro-recovery vs. pro-ED users from profile text, using
    Doc2Vec features and logistic regression."""
    import multiprocessing
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    from sklearn import linear_model
    from ohsn.edrelated import edrelatedcom
    import ohsn.api.profiles_check as pc
    # dbt (MongoDB helper) and get_scores are project helpers; their import
    # paths are not shown in this snippet.
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    com = dbt.db_connect_col('fed', 'scom')
    documents = []
    for user in com.find():
        profile = user['description']
        if profile:
            tokens = pc.tokenizer_stoprm(profile)
            sentence = TaggedDocument(words=tokens, tags=[str(user['id'])])
            documents.append(sentence)
    cores = multiprocessing.cpu_count()
    size = 200
    window = 4
    simple_models = [
        # PV-DM with concatenation
        Doc2Vec(documents, dm=1, dm_concat=1, size=size, window=window,
                negative=5, hs=1, sample=1e-3, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=size, window=window, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
    ]
    model = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    X_train, y_train, X_test, y_test = [], [], [], []
    for doc in documents:
        tag = doc.tags[0]
        if tag in prorec:
            X_train.append(model.docvecs[tag])
            y_train.append(1)
        elif tag in proed:
            X_train.append(model.docvecs[tag])
            y_train.append(0)
        else:
            X_test.append(model.docvecs[tag])
            y_test.append(int(tag))
    print 'Train: %d, test: %d, total: %d' % (len(X_train), len(X_test), len(documents))
    logistic = linear_model.LogisticRegression(
        multi_class='multinomial', solver='newton-cg', n_jobs=cores)
    logistic.fit(X_train, y_train)
    y_tlin = logistic.predict(X_train)
    y_lin = logistic.predict(X_test)
    for average in ['micro', 'macro']:
        train_precision, train_recall, train_f1, train_acc = get_scores(
            y_train, y_tlin, average)
        print "Train Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" % (
            average, train_precision, train_recall, train_f1, train_acc)
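
get_scores is a project helper whose source is not shown here. A plausible equivalent built on scikit-learn (an assumption, not the project's code):

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_scores(y_true, y_pred, average):
    # precision/recall/F1 under the requested averaging, plus accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=average)
    return precision, recall, f1, accuracy_score(y_true, y_pred)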
Example #5
def potential_users(dbname, comname):
    """Return the union of ED users and pro-recovery users."""
    from ohsn.edrelated import edrelatedcom
    ed_users = edrelatedcom.ed_user(dbname, comname)
    rec_users = edrelatedcom.rec_user(dbname, comname)
    return list(set(ed_users).union(rec_users))