def filter_ed_tweets():
    # Filter ED-related tweets, based on word2vec similarity to ED keywords
    from ohsn.edrelated import edrelatedcom
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    # com = dbt.db_connect_col('fed', 'scom')
    times = dbt.db_connect_col('fed', 'timeline')
    ed_times = dbt.db_connect_col('fed', 'edtimeline')
    ed_times.create_index([('user.id', pymongo.ASCENDING),
                           ('id', pymongo.DESCENDING)])
    ed_times.create_index([('id', pymongo.ASCENDING)], unique=True)

    ed_list = set([
        'bmi', 'cw', 'ugw', 'gw', 'lbs', 'hw', 'lw', 'kg', 'ed',
        'eatingdisorder', 'anorexia', 'bulimia', 'anorexic', 'ana', 'bulimic',
        'mia', 'thinspo', 'bulemia', 'purge', 'binge', 'selfharm', 'ednos',
        'edprobs', 'edprob', 'proana', 'anamia', 'promia', 'askanamia',
        'bonespo', 'legspo'
    ])
    model = models.word2vec.Word2Vec.load('data/word2vec')
    # Rake = RAKE.Rake('/home/wt/Code/ohsn/ohsn/networkminer/stoplist/SmartStoplist.txt')
    import ohsn.api.profiles_check as pc

    print len(prorec + proed)
    for user in prorec + proed:
        for tweet in times.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # strip RT, @mentions and URLs before tokenizing
            # text = text.strip().lower()
            # text = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", text)
            # keywords = Rake.run(text)
            keywords = pc.tokenizer_stoprm(text)
            # average word2vec similarity between tweet tokens and ED keywords
            sumsim = 0.0
            count = 0
            for word in keywords:
                if word in model:
                    for ed in ed_list:
                        sim = model.similarity(word, ed)
                        sumsim += sim
                        count += 1
            # keep the tweet if the average similarity to ED keywords exceeds 0.26
            if count != 0 and (sumsim / count) > 0.26:
                try:
                    ed_times.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
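# Hedged usage sketch (not part of the pipeline above): illustrates the scoring rule
# from filter_ed_tweets() on a single hand-made token list. The model path
# 'data/word2vec' and the 0.26 threshold are copied from that function; the example
# tokens and keyword subset are hypothetical, and the extra vocabulary guard on the
# ED keywords is an assumption, not the author's code.
def _ed_score_example():
    from gensim import models
    model = models.word2vec.Word2Vec.load('data/word2vec')
    ed_subset = ['thinspo', 'proana', 'ednos', 'bmi']   # subset of ed_list above
    tokens = ['want', 'lose', 'weight', 'fasting']      # hypothetical tweet tokens
    sumsim, count = 0.0, 0
    for word in tokens:
        if word in model:
            for ed in ed_subset:
                if ed in model:  # guard against out-of-vocabulary keywords
                    sumsim += model.similarity(word, ed)
                    count += 1
    score = sumsim / count if count else 0.0
    print 'average ED similarity: %.3f -> %s' % (score, 'keep' if score > 0.26 else 'skip')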
def recover_proed_community():
    # Pro-recovery and pro-ED users, and the communities they link out to
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    cols = dbt.db_connect_col('fed', 'follownet')

    name_map, edges = {}, set()
    for row in cols.find({}, no_cursor_timeout=True):
        n1 = str(row['follower'])
        if n1 in prorec or n1 in proed:
            n2 = str(row['user'])
            n1id = name_map.get(n1, len(name_map))
            name_map[n1] = n1id
            n2id = name_map.get(n2, len(name_map))
            name_map[n2] = n2id
            edges.add((n1id, n2id))

    g = Graph(len(name_map), directed=True)
    g.vs["name"] = list(sorted(name_map, key=name_map.get))  # keys ordered by their ids
    g.add_edges(list(edges))
    g.es["weight"] = 1
    g.vs["set"] = 0
    for v in g.vs:
        if v['name'] in prorec:
            v['set'] = 1
        elif v['name'] in proed:
            v['set'] = -1
    gt.summary(g)

    # keep the labelled seed users plus followees with in-degree > 3
    g.vs['deg'] = g.indegree()
    nodes = []
    for v in g.vs:
        if v['set'] == 1 or v['set'] == -1:
            nodes.append(v)
        elif v['deg'] > 3:
            nodes.append(v)
    print 'Filtered nodes: %d' % len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('rec-proed-follow.graphml')

    # sbnet stores the extended set of all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        nodes = [v for v in gb.vs if v['name'] in edusers]
        gb = gb.subgraph(nodes)
        for v in gb.vs:
            v['set'] = g.vs.find(name=v['name'])['set']
        gt.summary(gb)
        gb.write_graphml('rec-proed-' + btype + '.graphml')
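# Hedged sketch: one possible sanity check on the exported follow network, assuming
# 'rec-proed-follow.graphml' was written by recover_proed_community() above. It counts
# the pro-recovery (set=1) and pro-ED (set=-1) seeds versus the remaining high
# in-degree followees kept by the filter.
def _inspect_follow_graph():
    from igraph import Graph
    g = Graph.Read_GraphML('rec-proed-follow.graphml')
    rec = len(g.vs.select(set_eq=1))
    pro = len(g.vs.select(set_eq=-1))
    other = g.vcount() - rec - pro
    print 'nodes: %d (rec %d, proed %d, other %d), edges: %d' % (
        g.vcount(), rec, pro, other, g.ecount())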
def recover_proed_interaction():
    # Interaction networks of pro-recovery and pro-ED users
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    for btype in ['retweet', 'reply', 'mention']:
        cols = dbt.db_connect_col('fed', 'sbnet')
        name_map, edges = {}, {}
        for row in cols.find({'type': {'$in': btype_dic[btype]}}, no_cursor_timeout=True):
            n1 = str(row['id0'])
            n2 = str(row['id1'])
            if n1 in prorec or n1 in proed:
                if n1 != n2:
                    n1id = name_map.get(n1, len(name_map))
                    name_map[n1] = n1id
                    n2id = name_map.get(n2, len(name_map))
                    name_map[n2] = n2id
                    wt = edges.get((n1id, n2id), 0)
                    edges[(n1id, n2id)] = wt + 1

        g = Graph(len(name_map), directed=True)
        g.vs["name"] = list(sorted(name_map, key=name_map.get))
        g.add_edges(edges.keys())
        g.es["weight"] = edges.values()
        g.vs["set"] = 0
        for v in g.vs:
            if v['name'] in prorec:
                v['set'] = 1
            elif v['name'] in proed:
                v['set'] = -1
        gt.summary(g)

        # endpoints of edges with weight > 3
        heavy_edges = g.es.select(weight_gt=3)
        edge_nodes = set()
        for edge in heavy_edges:
            edge_nodes.add(g.vs[edge.source]['name'])
            edge_nodes.add(g.vs[edge.target]['name'])

        # keep the labelled seed users plus endpoints of heavy edges
        nodes = []
        for v in g.vs:
            if v['set'] == 1 or v['set'] == -1:
                nodes.append(v)
            elif v['name'] in edge_nodes:
                nodes.append(v)
        print 'Filtered nodes: %d' % len(nodes)
        g = g.subgraph(nodes)
        gt.summary(g)
        g.write_graphml('rec-proed-' + btype + '.graphml')
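# Hedged illustration (toy data): the id-assignment and edge-weight accumulation used
# in recover_proed_interaction(), shown on a hand-made list of (id0, id1) pairs.
# Ids are handed out in first-seen order and repeated pairs increment the weight.
def _toy_weighted_edges():
    rows = [('a', 'b'), ('a', 'b'), ('b', 'c')]  # hypothetical interaction pairs
    name_map, edges = {}, {}
    for n1, n2 in rows:
        n1id = name_map.setdefault(n1, len(name_map))
        n2id = name_map.setdefault(n2, len(name_map))
        edges[(n1id, n2id)] = edges.get((n1id, n2id), 0) + 1
    print sorted(name_map.items())  # [('a', 0), ('b', 1), ('c', 2)]
    print sorted(edges.items())     # [((0, 1), 2), ((1, 2), 1)]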
def classify_recovery_proed():
    # Classify pro-recovery vs. pro-ED users from their profile descriptions
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    com = dbt.db_connect_col('fed', 'scom')

    documents = []
    for user in com.find():
        profile = user['description']
        if profile:
            tokens = pc.tokenizer_stoprm(profile)
            sentence = TaggedDocument(words=tokens, tags=[str(user['id'])])
            documents.append(sentence)

    cores = multiprocessing.cpu_count()
    size = 200
    window = 4
    simple_models = [
        # PV-DM with concatenation
        Doc2Vec(documents, dm=1, dm_concat=1, size=size, window=window,
                negative=5, hs=1, sample=1e-3, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=size, window=window,
                negative=5, hs=1, sample=1e-3, min_count=1, workers=cores),
    ]
    model = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

    # labelled users form the training set; the rest keep their numeric id in y_test
    X_train, y_train, X_test, y_test = [], [], [], []
    for doc in documents:
        tag = doc.tags[0]
        if tag in prorec:
            X_train.append(model.docvecs[tag])
            y_train.append(1)
        elif tag in proed:
            X_train.append(model.docvecs[tag])
            y_train.append(0)
        else:
            X_test.append(model.docvecs[tag])
            y_test.append(int(tag))
    print len(X_train)
    print len(X_test)
    print len(documents)

    logistic = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg',
                                               n_jobs=multiprocessing.cpu_count())
    # svc_lin = SVC(kernel='linear', class_weight='balanced')
    logistic.fit(X_train, y_train)
    y_tlin = logistic.predict(X_train)
    y_lin = logistic.predict(X_test)
    for average in ['micro', 'macro']:
        train_precision, train_recall, train_f1, train_acc = get_scores(y_train, y_tlin, average)
        print "Train Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" % (
            average, train_precision, train_recall, train_f1, train_acc)
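# Hedged follow-up sketch: classify_recovery_proed() computes y_lin (predictions for
# the unlabelled users) but never reports it. A helper like the one below could pair
# those predictions with the user ids stored in y_test, using the label convention
# from the training loop (1 = pro-recovery, 0 = pro-ED). This is an assumption about
# how the predictions would be consumed, not part of the original pipeline.
def report_predictions(user_ids, labels):
    for uid, label in zip(user_ids, labels):
        print '%d\t%s' % (uid, 'pro-recovery' if label == 1 else 'pro-ED')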
def potential_users(dbname, comname):
    # Union of ED users and pro-recovery users in a community collection
    ed_users = edrelatedcom.ed_user(dbname, comname)
    rec_users = edrelatedcom.rec_user(dbname, comname)
    return list(set(ed_users).union(set(rec_users)))
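# Hedged usage sketch: the database name 'fed' and the profile collection 'scom'
# follow the calls used throughout this module; running this requires the same
# MongoDB setup as the functions above.
def _potential_users_example():
    candidates = potential_users('fed', 'scom')
    print '%d candidate ED/recovery users' % len(candidates)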