def get_user_info(d):
    i, uid = d  # d is an (index, user id) tuple from enumerate(); only uid is used
    # load the pickled user object for this id and populate its tweets from the gzipped json
    u = pickle.load(open(os.path.join(INPUT_DIR, 'obj', uid), 'rb'))
    fname = os.path.join(INPUT_DIR, 'json', uid + '.json.gz')
    u.populate_tweets_from_file(fname,
                                store_json=False,
                                do_arabic_stemming=False,
                                do_parse_created_at=False,
                                do_lemmatize=False,
                                do_tokenize=False)

    of = open(os.path.join(OUTPUT_DIR, uid + ".txt"), "w")
    mention_counter = Counter()
    for tw in u.tweets:
        ment_len = len(tw.mentions) if tw.mentions else 0
        mention_counter[ment_len] += 1
        if tw.mentions and len(tw.mentions) >= 3:
            of.write(tsn([tw.id, len(tw.mentions)]))
    of.close()
    return uid, mention_counter
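# Hypothetical smoke test for get_user_info, assuming INPUT_DIR contains a pickled
# user object and a matching '<uid>.json.gz'; the id below is a placeholder, not a
# real file from this project:
# uid, counts = get_user_info((0, '1234567'))
# print uid, counts.most_common(5)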
def load_equality_constraint(self, identity_wordform, identity_id_children,
                             identity_sent_children, isa_wordform,
                             isa_id_children, isa_sent_children,
                             equality_is_identity, is_negated,
                             constraint_map, actually_add_constraint):
    """
    :param identity_wordform: wordform of the identity on the left side of the equality
    :param identity_id_children: identity-term modifiers of the identity wordform
    :param identity_sent_children: sentiment-term modifiers of the identity wordform
    :param isa_wordform: wordform of the term the identity is equated to
    :param isa_id_children: identity-term modifiers of the isa wordform
    :param isa_sent_children: sentiment-term modifiers of the isa wordform
    :param equality_is_identity: True if the isa term is itself an identity
    :param is_negated: True if the equality is negated
    :param constraint_map: map from identity ids to the constraints they participate in
    :param actually_add_constraint: if False, skip creating and registering the constraint
    :return: True
    """
    # map from wordforms to IDs
    identity_id = self.identity_ids_map[identity_wordform]
    isa_id = self.identity_ids_map[isa_wordform] if equality_is_identity \
        else self.sentiment_ids_map[isa_wordform]

    identity_mod_wfs = [self.identity_ids_map[i] for i in identity_id_children] + \
                       [self.sentiment_ids_map[i] for i in identity_sent_children]
    isa_mod_wfs = [self.identity_ids_map[i] for i in isa_id_children] + \
                  [self.sentiment_ids_map[i] for i in isa_sent_children]

    identities_in_constr = [identity_wordform] + identity_id_children + isa_id_children
    if equality_is_identity:
        identities_in_constr.append(isa_wordform)

    # finally, we can construct the constraint!
    if actually_add_constraint:
        constraint_string = tsn([
            'EQUALITY ', is_negated, identity_id_children,
            identity_sent_children, identity_wordform, ' ----> ',
            isa_id_children, isa_sent_children, isa_wordform
        ], False)
        self.constraint_string_list.append(constraint_string)

        constraint = EqualityConstraint(identity=identity_id,
                                        equality_term=isa_id,
                                        identity_modifiers=identity_mod_wfs,
                                        equality_modifiers=isa_mod_wfs,
                                        is_negation=is_negated)
        self.all_constraints.append(constraint)

        # add constraint to all identities
        for identity in identities_in_constr:
            constraint_map[self.identity_ids_map[identity]].append(constraint)

    return True
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and \
            user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")
        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',
                    [user.user_id, os.path.exists(dp_filename), os.path.exists(ptb_filename)]]

        penntreebank = {x[0]: x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        # keep original (non-retweet), URL-free, English tweets
        tweet_set = [(i, t) for i, t in enumerate(user.tweets)
                     if t.retweeted is None and
                     len(t.urls) == 0 and 'http:' not in t.text and
                     langid.classify(t.text)[0] == 'en']

        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:
            data_for_tweet = []
            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            # PTB and dependency-parse token streams disagree; skip this tweet
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'ahhhhh, weird stuff'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([p, tweet.id, user.user_id,
                         tweet.created_at.strftime("%m-%d-%y")], newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)

            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
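# Hypothetical single-file call for gen_conll_file; the paths below are placeholders
# (the real driver script for this function is not shown here):
# status, payload = gen_conll_file('json/12345.json.gz', 'ptb_parses/', 'dep_parses/')
# if status == 'success':
#     user_id, conll_data = payload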
D = np.concatenate((X_pub.todense(), w_vec_pub, head_vec_pub, last_vec_pub), axis=1)
predicted_prob = models[0].predict_proba(D[test_inds, :])

stopword_test_inds_0 = []
stopword_test_inds_1 = []
for x in stopword_test_inds:
    if y_pub[x] == 1:
        stopword_test_inds_1.append(x)
    else:
        stopword_test_inds_0.append(x)

if len(stopword_test_inds):
    # stopword tokens are always predicted negative ([1, 0]), so they contribute
    # extra true negatives (label 0) and false negatives (label 1) to the evaluation
    extra_tn = len(stopword_test_inds_0)
    extra_fn = len(stopword_test_inds_1)
    y_pub = np.concatenate((y_pub[test_inds],
                            np.array([0] * extra_tn),
                            np.array([1] * extra_fn)), axis=0)
    predicted_prob = np.concatenate((predicted_prob,
                                     [[1, 0]] * (extra_tn + extra_fn)), axis=0)
    test_inds = test_inds + stopword_test_inds_0 + stopword_test_inds_1

output_file = open("results/final_model_pub_res.tsv", "w")
eval_out = evaluate(.5, y_pub, predicted_prob, obj_inds_pub, test_inds, True, True, True)
output_file.write(tsn(["final_model"] + eval_out[1:]))
output_file.close()

from utility_code.dependency_parse_object import DependencyParseObject
test_data = {DependencyParseObject(x[0]).tweet_id: x
             for x in read_grouped_by_newline_file("test_data/final_conll_pub.txt")}
write_out_predictions("results/predictions_pub_data.txt", test_data,
                      obj_inds_pub, test_inds, y_pub, predicted_prob)
def event_constraint(self, identity_node, behavior_node, related_node,
                     map_to_head, nodes_map, constraint_map):
    # no event constraint here if already the same event ...
    if behavior_node.dp_obj.id in self.sentence_ids_to_constraints[identity_node.dp_obj.id] or \
            related_node.dp_obj.id in self.sentence_ids_to_constraints[identity_node.dp_obj.id]:
        return False

    beh_forms = get_forms_in_dict(behavior_node, self.sentiment_ids_map)
    is_negated = is_negated_node(behavior_node, map_to_head, nodes_map)

    if self.is_isa_relationship(behavior_node):
        self.equality_constraint_isa(identity_node, related_node, map_to_head,
                                     nodes_map, constraint_map, is_negated)
        return True

    # there are some behaviors in the list that really don't make sense for events
    # todo: until they are cleaned, just have a list of them and ignore events w/ them
    beh_forms = [b for b in beh_forms if b not in EXCLUDED_BEHAVIORS]
    if len(beh_forms) > 1:
        beh_forms = [b for b in beh_forms if b != "have"]
    if not len(beh_forms):
        if self.verbose:
            print 'excluded behavior only in event, returning'
        return False

    # if the identity node is being acted upon, let's only differentiate if the related node is
    # also an identity (right now, always the case)
    # todo: unhackify a bit
    if self.determine_is_actor(identity_node.dp_obj.id, related_node.dp_obj.id,
                               behavior_node.dp_obj.text):
        actor_node = identity_node
        object_node = related_node
    else:
        object_node = identity_node
        actor_node = related_node

    actor_is_identity, actor_wordform, actor_id_children, actor_sent_children = \
        self.get_wordform_and_mods_unsure(actor_node, nodes_map, map_to_head, beh_forms)
    object_is_identity, object_wordform, object_id_children, object_sent_children = \
        self.get_wordform_and_mods_unsure(object_node, nodes_map, map_to_head, beh_forms)

    # the actor and the object can't share the same wordform (the constraint wouldn't
    # be quadratic), so try to substitute one of their identity children instead
    if actor_wordform == object_wordform:
        l_id = len(actor_id_children)
        l_rel = len(object_id_children)
        if not l_id and not l_rel:
            return False
        if not l_id:
            object_wordform = object_id_children[-1]
            del object_id_children[-1]
        else:
            actor_wordform = actor_id_children[-1]
            del actor_id_children[-1]

    # okay, all square with event. load 'er up
    # convert to ids
    actor_id = self.identity_ids_map[actor_wordform] if actor_is_identity \
        else self.sentiment_ids_map[actor_wordform]
    object_id = self.identity_ids_map[object_wordform] if object_is_identity \
        else self.sentiment_ids_map[object_wordform]
    actor_mod_ids = [self.identity_ids_map[i] for i in actor_id_children] + \
                    [self.sentiment_ids_map[i] for i in actor_sent_children]
    object_mod_ids = [self.identity_ids_map[i] for i in object_id_children] + \
                     [self.sentiment_ids_map[i] for i in object_sent_children]
    behavior_ids = [self.sentiment_ids_map[b] for b in beh_forms]

    constraint_string = tsn([
        'EVENT ', is_negated, ' ', actor_id_children, actor_sent_children,
        actor_wordform, ' ----> ', beh_forms, ' -----> ', object_id_children,
        object_sent_children, object_wordform
    ], False)

    # add constraint to all identities
    identities_in_constr = actor_id_children + object_id_children
    if object_is_identity:
        identities_in_constr.append(object_wordform)
    if actor_is_identity:
        identities_in_constr.append(actor_wordform)

    if self.use_events:
        # create constraint
        constraint = EventConstraint(actor=actor_id,
                                     behavior_terms=behavior_ids,
                                     object=object_id,
                                     actor_mods=actor_mod_ids,
                                     object_mods=object_mod_ids,
                                     behavior_is_negated=is_negated)
        self.constraint_string_list.append(constraint_string)
        self.all_constraints.append(constraint)

        self.sentence_ids_to_constraints[actor_node.dp_obj.id].add(object_node.dp_obj.id)
        self.sentence_ids_to_constraints[actor_node.dp_obj.id].add(behavior_node.dp_obj.id)
        self.sentence_ids_to_constraints[object_node.dp_obj.id].add(actor_node.dp_obj.id)
        self.sentence_ids_to_constraints[object_node.dp_obj.id].add(behavior_node.dp_obj.id)

        for identity in identities_in_constr:
            constraint_map[self.identity_ids_map[identity]].append(constraint)
    # else:
    #     for identity in identities_in_constr:
    #         self.identities.append(self.identity_ids_map[identity])

    return True
def behavior_constraint(self, identity_node, behavior_node, nodes_map,
                        constraint_map, map_to_head):
    iden_id = identity_node.dp_obj.id
    beh_id = behavior_node.dp_obj.id
    beh_text = behavior_node.dp_obj.text

    identity_is_actor = self.determine_is_actor(iden_id, beh_id, beh_text)

    # only actions taken
    # if not identity_is_actor:
    #     return False

    # get behavior info
    # there are some behaviors in the list that really don't make sense for events
    beh_forms = [
        b for b in get_forms_in_dict(behavior_node, self.sentiment_ids_map)
        if b not in EXCLUDED_BEHAVIORS and b != 'have'
    ]
    # if len(beh_forms) > 1 then remove unnecessary behaviors
    if len(beh_forms) > 1:
        beh_forms = [b for b in beh_forms if b != 'have']

    is_negated = is_negated_node(behavior_node, map_to_head, nodes_map)

    if not len(beh_forms):
        return False

    # no behavior constraint if there is already a constraint with this behavior
    if beh_id in self.sentence_ids_to_constraints[identity_node.dp_obj.id] or \
            (len(self.sentence_ids_to_constraints[identity_node.dp_obj.id]) and
             beh_forms[0] in ['be', 'ain']):
        return False

    identity_info = self.get_wordform_and_mods_identity(identity_node, nodes_map,
                                                        map_to_head, beh_forms)
    actor_wordform = object_wordform = None
    if identity_is_actor:
        actor_wordform, actor_id_children, actor_sent_children = identity_info
        object_id_children = object_sent_children = []
    else:
        actor_id_children = actor_sent_children = []
        object_wordform, object_id_children, object_sent_children = identity_info

    # convert to ids
    actor_id = self.identity_ids_map[actor_wordform] if identity_is_actor \
        else self.sentiment_ids_map[ZERO_IDENTITY_INDICATOR]
    object_id = self.identity_ids_map[object_wordform] if not identity_is_actor \
        else self.sentiment_ids_map[ZERO_IDENTITY_INDICATOR]
    actor_mod_ids = [self.identity_ids_map[i] for i in actor_id_children] + \
                    [self.sentiment_ids_map[i] for i in actor_sent_children]
    object_mod_ids = [self.identity_ids_map[i] for i in object_id_children] + \
                     [self.sentiment_ids_map[i] for i in object_sent_children]
    behavior_ids = [self.sentiment_ids_map[b] for b in beh_forms]

    constraint_string = tsn([
        'BEHAVIOR ', is_negated, ' ', actor_id_children, actor_sent_children,
        actor_wordform, ' ----> ', beh_forms, ' -----> ', object_id_children,
        object_sent_children, object_wordform
    ], False)

    # add constraint to all identities
    identities_in_constr = actor_id_children + object_id_children
    if identity_is_actor:
        identities_in_constr.append(actor_wordform)
    else:
        identities_in_constr.append(object_wordform)

    if self.use_behaviors:
        # create constraint
        constraint = EventConstraint(actor=actor_id,
                                     behavior_terms=behavior_ids,
                                     object=object_id,
                                     actor_mods=actor_mod_ids,
                                     object_mods=object_mod_ids,
                                     behavior_is_negated=is_negated)
        self.constraint_string_list.append(constraint_string)
        self.all_constraints.append(constraint)
        self.sentence_ids_to_constraints[identity_node.dp_obj.id].add(behavior_node.dp_obj.id)

        for identity in identities_in_constr:
            constraint_map[self.identity_ids_map[identity]].append(constraint)

    return True
objfiles = listdir(os.path.join(INPUT_DIR, 'obj/'))
jsonfiles = set([
    os.path.basename(f)[:-8]
    for f in listdir(os.path.join(INPUT_DIR, 'json/'))
])
print list(jsonfiles)[:5]
print objfiles[:5]

# only process users that have both a pickled object and a json file
onlyfiles = [os.path.basename(o) for o in objfiles if o in jsonfiles]
print 'N FILES: ', len(onlyfiles)
print onlyfiles[:5]

#results = [get_user_info((0,onlyfiles[0]))]
pool = Pool(int(sys.argv[3]))
results = pool.map(get_user_info, enumerate(onlyfiles))
pool.close()
pool.terminate()

of = open(os.path.join(OUTPUT_DIR, "mention_counts_total.tsv"), "w")
for uid, mention_counter in results:
    for k, v in mention_counter.items():
        of.write(tsn([uid, k, v]))
of.close()
def run_baseline_on_conll_file(conll_filename, path_to_dicts, output_filename):
    features_from_conll, blah = get_all_features(conll_filename, None, None, None, None)
    labels, features, obj_inds = configure_features_for_wordvectors_and_remove_twitterner(
        features_from_conll)[0:3]

    ## for dictionary-based evaluation
    stopwords = get_stopwords()
    data = read_grouped_by_newline_file(conll_filename)
    dependency_parses = []
    for x in data:
        dependency_parses.append([DependencyParseObject(o) for o in x])

    # get all the dictionaries together
    p_look_in_dict = partial(look_in_dict, sets=[stopwords], set_names=["stopwords"])
    act_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'identities.txt')))
    wordnet_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'wordnet_identities.txt')))
    racial_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'racial_slur_identities.txt')))
    national_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'national_identities.txt')))
    job_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'job_identities.txt')))
    all_ds = Dictionaries(os.path.join(path_to_dicts, '*identities.txt'))
    all_dict = p_look_in_dict(dependency_parses, all_ds)

    # get the bootstrapped dictionary together
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        BOOTSTRAPPED_DICTIONARY_LOCATION)
    stopwords = get_stopwords()
    twit_sets = []
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = twit_id - stopwords
        twit_sets.append([twit_id, "twit_identities" + str(v)])

    all_random_ids = get_test_ids(conll_filename, 0, -1, -1)
    y = np.array(labels)
    output_file = open(output_filename, "w")

    # test all the basic dicts
    for d in [['act_dict', act_dict], ['racial_dict', racial_dict],
              ['nat_dict', national_dict], ['job_dict', job_dict],
              ['wordnet_dict', wordnet_dict], ['all_dict', all_dict]]:
        preds = get_isin_array(d[1], obj_inds)
        out = evaluate(.4, y, preds, obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([d[0]] + out[1:]))

    # test the bootstrapped dicts
    for twit_set, twit_set_id in twit_sets:
        d = look_in_dict(dependency_parses,
                         sets=[twit_set, stopwords],
                         set_names=["twit_identities", "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_alone"] + out[1:]))

        d = look_in_dict(dependency_parses, all_ds, [twit_set, stopwords],
                         [twit_set_id, "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_w_all"] + out[1:]))

    output_file.close()
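# Hypothetical invocation of run_baseline_on_conll_file; the dictionary and output
# paths below are placeholders, only the conll file name appears elsewhere in this repo:
# run_baseline_on_conll_file('test_data/final_conll_pub.txt',
#                            'dictionaries/',
#                            'results/baseline_pub_res.tsv')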