Example #1
def get_user_info(d):

    #i, uid = d
    #if i % 1000 == 0:
    #    print i
    #try:
    i, uid = d
    u = pickle.load(open(os.path.join(INPUT_DIR, 'obj', uid), 'rb'))
    fname = os.path.join(INPUT_DIR, 'json', uid + '.json.gz')

    u.populate_tweets_from_file(fname,
                                store_json=False,
                                do_arabic_stemming=False,
                                do_parse_created_at=False,
                                do_lemmatize=False,
                                do_tokenize=False)

    of = open(os.path.join(OUTPUT_DIR, uid + ".txt"), "w")
    mention_counter = Counter()
    for tw in u.tweets:
        ment_len = len(tw.mentions) if tw.mentions else 0
        mention_counter[ment_len] += 1
        if tw.mentions and len(tw.mentions) >= 3:
            of.write(tsn([tw.id, len(tw.mentions)]))
    of.close()
    return uid, mention_counter
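
# Hedged sketch (assumption, not shown in the original source): every example in
# this listing writes rows with a helper called tsn() that is never defined here.
# Judging from call sites such as tsn([tw.id, len(tw.mentions)]) and
# tsn([...], newline=False), it joins its arguments into a tab-separated string
# with an optional trailing newline.  A minimal stand-in consistent with that usage:
def tsn(fields, newline=True):
    row = "\t".join(str(f) for f in fields)
    return row + "\n" if newline else row
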
    def load_equality_constraint(self, identity_wordform, identity_id_children,
                                 identity_sent_children, isa_wordform,
                                 isa_id_children, isa_sent_children,
                                 equality_is_identity, is_negated,
                                 constraint_map, actually_add_constraint):
        """
        :param identity_wordform:
        :param identity_id_children:
        :param identity_sent_children:
        :param isa_wordform:
        :param isa_id_children:
        :param isa_sent_children:
        :param equality_is_identity:
        :param is_negated:
        :param constraint_map:
        :return:
        """

        # map from wordforms to IDs
        identity_id = self.identity_ids_map[identity_wordform]
        isa_id = self.identity_ids_map[
            isa_wordform] if equality_is_identity else self.sentiment_ids_map[
                isa_wordform]
        identity_mod_wfs = [self.identity_ids_map[i] for i in identity_id_children] + \
                           [self.sentiment_ids_map[i] for i in identity_sent_children]
        isa_mod_wfs = [self.identity_ids_map[i] for i in isa_id_children] + \
                      [self.sentiment_ids_map[i] for i in isa_sent_children]

        identities_in_constr = [identity_wordform
                                ] + identity_id_children + isa_id_children
        if equality_is_identity:
            identities_in_constr.append(isa_wordform)

        # finally, we can construct the constraint!
        if actually_add_constraint:
            constraint_string = tsn([
                'EQUALITY ', is_negated, identity_id_children,
                identity_sent_children, identity_wordform, ' ----> ',
                isa_id_children, isa_sent_children, isa_wordform
            ], False)

            self.constraint_string_list.append(constraint_string)
            constraint = EqualityConstraint(
                identity=identity_id,
                equality_term=isa_id,
                identity_modifiers=identity_mod_wfs,
                equality_modifiers=isa_mod_wfs,
                is_negation=is_negated)

            self.all_constraints.append(constraint)

            # add constraint to all identities
            for identity in identities_in_constr:
                constraint_map[self.identity_ids_map[identity]].append(
                    constraint)

            return True
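
# Hedged sketch (assumption, not from the source): EqualityConstraint is
# instantiated above but never defined in this listing.  Based solely on the
# keyword arguments at the call site, a minimal stand-in container could be:
class EqualityConstraint(object):
    def __init__(self, identity, equality_term, identity_modifiers,
                 equality_modifiers, is_negation):
        self.identity = identity                      # id of the identity term
        self.equality_term = equality_term            # id of the "is-a" term it equals
        self.identity_modifiers = identity_modifiers  # modifier ids on the identity side
        self.equality_modifiers = equality_modifiers  # modifier ids on the "is-a" side
        self.is_negation = is_negation                # True if the relation is negated
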
def gen_conll_file(fil,ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir,str(user.user_id)+".gz")
        ptb_filename = os.path.join(ptb_dir,str(user.user_id)+".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',[user.user_id,os.path.exists(dp_filename),os.path.exists(ptb_filename)]]

        penntreebank = {x[0] : x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse =  read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non-English speaker or spam
        if len(tweet_set) < 40:
            return ['notweets',user.user_id]


        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'warning: PTB tokens do not align with dependency parse tokens, skipping tweet'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(tsn([p,tweet.id,user.user_id,tweet.created_at.strftime("%m-%d-%y")],newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag,word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos="+penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id,data_to_return]]
    else:
        return ['baduser',user.user_id]
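
# Hedged sketch (assumption, not shown in the source): read_grouped_by_newline_file
# is used throughout these examples.  From its call sites it appears to read a
# plain or gzip-compressed text file and return a list of "groups", where each
# group is a list of consecutive non-blank lines and blank lines separate groups.
import gzip

def read_grouped_by_newline_file(filename):
    opener = gzip.open if filename.endswith(".gz") else open
    groups, current = [], []
    with opener(filename) as inp:
        for line in inp:
            line = line.rstrip("\r\n")
            if line:
                current.append(line)
            elif current:
                groups.append(current)
                current = []
    if current:  # flush the last group if the file has no trailing blank line
        groups.append(current)
    return groups
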
D = np.concatenate((X_pub.todense(),w_vec_pub,head_vec_pub,last_vec_pub),axis=1)
predicted_prob = models[0].predict_proba(D[test_inds,:])

stopword_test_inds_0 = []
stopword_test_inds_1 = []

for x in stopword_test_inds:
    if y_pub[x] == 1:
        stopword_test_inds_1.append(x)
    else:
        stopword_test_inds_0.append(x)


if len(stopword_test_inds):
    # stopword-matched tokens are not scored by the model; append them below as
    # predicted class 0 (probability [1, 0]) so they still count in the evaluation
    extra_tn = len(stopword_test_inds_0)
    extra_fn = len(stopword_test_inds_1)
    y_pub = np.concatenate((y_pub[test_inds],np.array([0]*extra_tn),np.array([1]*extra_fn)),axis=0)
    predicted_prob = np.concatenate((predicted_prob,[[1,0]]*(extra_tn+extra_fn)),axis=0)
    test_inds = test_inds + stopword_test_inds_0 + stopword_test_inds_1


output_file = open("results/final_model_pub_res.tsv","w")
eval_out = evaluate(.5, y_pub, predicted_prob,obj_inds_pub,test_inds,True,True,True)
output_file.write(tsn(["final_model"] + eval_out[1:]))
output_file.close()

from utility_code.dependency_parse_object import DependencyParseObject
test_data = {DependencyParseObject(x[0]).tweet_id : x for x in
                             read_grouped_by_newline_file("test_data/final_conll_pub.txt")}

write_out_predictions("results/predictions_pub_data.txt",test_data,obj_inds_pub,test_inds,y_pub,predicted_prob)
    def event_constraint(self, identity_node, behavior_node, related_node,
                         map_to_head, nodes_map, constraint_map):

        # no event constraint here if already the same event ...
        if behavior_node.dp_obj.id in self.sentence_ids_to_constraints[identity_node.dp_obj.id] or \
                        related_node.dp_obj.id in self.sentence_ids_to_constraints[identity_node.dp_obj.id]:
            return False

        beh_forms = get_forms_in_dict(behavior_node, self.sentiment_ids_map)
        is_negated = is_negated_node(behavior_node, map_to_head, nodes_map)

        if self.is_isa_relationship(behavior_node):
            self.equality_constraint_isa(identity_node, related_node,
                                         map_to_head, nodes_map,
                                         constraint_map, is_negated)
            return True

        # there are some behaviors in the list that really don't make sense for events
        # todo: until they are cleaned, just have a list of them and ignore events w/ them
        beh_forms = [b for b in beh_forms if b not in EXCLUDED_BEHAVIORS]
        if len(beh_forms) > 1:
            beh_forms = [b for b in beh_forms if b != "have"]
        if not len(beh_forms):
            if self.verbose:
                print 'excluded behavior only in event, returning'
            return False

        # if the identity node is being acted upon, let's only differentiate if the related node is
        # also an identity (right now, always the case)
        # todo: unhackify a bit
        if self.determine_is_actor(identity_node.dp_obj.id,
                                   related_node.dp_obj.id,
                                   behavior_node.dp_obj.text):
            actor_node = identity_node
            object_node = related_node
        else:
            object_node = identity_node
            actor_node = related_node

        actor_is_identity, actor_wordform, actor_id_children, actor_sent_children = \
            self.get_wordform_and_mods_unsure(actor_node, nodes_map, map_to_head, beh_forms)
        object_is_identity, object_wordform, object_id_children, object_sent_children = \
            self.get_wordform_and_mods_unsure(object_node, nodes_map, map_to_head, beh_forms)

        # can't have actor wordform == object wordform (the constraint wouldn't be quadratic); try to find a replacement
        if actor_wordform == object_wordform:
            l_id = len(actor_id_children)
            l_rel = len(object_id_children)
            if not l_id and not l_rel:
                # print 'nope, just returning'
                return False
            if not l_id:
                object_wordform = object_id_children[-1]
                del object_id_children[-1]
            else:
                actor_wordform = actor_id_children[-1]
                del actor_id_children[-1]

        # okay, all square with event. load 'er up

        # convert to ids
        actor_id = self.identity_ids_map[actor_wordform] if actor_is_identity \
            else self.sentiment_ids_map[actor_wordform]
        object_id = self.identity_ids_map[object_wordform] if object_is_identity \
            else self.sentiment_ids_map[object_wordform]
        actor_mod_ids = [self.identity_ids_map[i] for i in actor_id_children] + \
                        [self.sentiment_ids_map[i] for i in actor_sent_children]
        object_mod_ids = [self.identity_ids_map[i] for i in object_id_children] + \
                         [self.sentiment_ids_map[i] for i in object_sent_children]
        behavior_ids = [self.sentiment_ids_map[b] for b in beh_forms]

        constraint_string = tsn([
            'EVENT ', is_negated, '   ', actor_id_children,
            actor_sent_children, actor_wordform, ' ----> ', beh_forms,
            ' -----> ', object_id_children, object_sent_children,
            object_wordform
        ], False)

        # add constraint to all identities
        identities_in_constr = actor_id_children + object_id_children
        if object_is_identity:
            identities_in_constr.append(object_wordform)
        if actor_is_identity:
            identities_in_constr.append(actor_wordform)

        if self.use_events:
            # create constraint
            constraint = EventConstraint(actor=actor_id,
                                         behavior_terms=behavior_ids,
                                         object=object_id,
                                         actor_mods=actor_mod_ids,
                                         object_mods=object_mod_ids,
                                         behavior_is_negated=is_negated)
            self.constraint_string_list.append(constraint_string)
            self.all_constraints.append(constraint)

            self.sentence_ids_to_constraints[actor_node.dp_obj.id].add(
                object_node.dp_obj.id)
            self.sentence_ids_to_constraints[actor_node.dp_obj.id].add(
                behavior_node.dp_obj.id)
            self.sentence_ids_to_constraints[object_node.dp_obj.id].add(
                actor_node.dp_obj.id)
            self.sentence_ids_to_constraints[object_node.dp_obj.id].add(
                behavior_node.dp_obj.id)

            for identity in identities_in_constr:
                constraint_map[self.identity_ids_map[identity]].append(
                    constraint)
        # else:
        #    for identity in identities_in_constr:
        #        self.identities.append(self.identity_ids_map[identity])

        return True
    def behavior_constraint(self, identity_node, behavior_node, nodes_map,
                            constraint_map, map_to_head):
        iden_id = identity_node.dp_obj.id
        beh_id = behavior_node.dp_obj.id
        beh_text = behavior_node.dp_obj.text
        identity_is_actor = self.determine_is_actor(iden_id, beh_id, beh_text)

        # only actions taken
        # if not identity_is_actor:
        #      return False

        # get behavior info
        # there are some behaviors in the list that really don't make sense for events
        beh_forms = [
            b for b in get_forms_in_dict(behavior_node, self.sentiment_ids_map)
            if b not in EXCLUDED_BEHAVIORS and b != 'have'
        ]
        # if len(beh_forms) > 1, drop the generic 'have'
        if len(beh_forms) > 1:
            beh_forms = [b for b in beh_forms if b != 'have']
        is_negated = is_negated_node(behavior_node, map_to_head, nodes_map)
        if not len(beh_forms):
            return False

        # no behavior constraint if there is already a constraint with this behavior
        if beh_id in self.sentence_ids_to_constraints[identity_node.dp_obj.id] or \
                (len(self.sentence_ids_to_constraints[identity_node.dp_obj.id]) and beh_forms[0] in ['be', 'ain']):
            return False

        identity_info = self.get_wordform_and_mods_identity(
            identity_node, nodes_map, map_to_head, beh_forms)
        actor_wordform = object_wordform = None
        if identity_is_actor:
            actor_wordform, actor_id_children, actor_sent_children = identity_info
            object_id_children = object_sent_children = []
        else:
            actor_id_children = actor_sent_children = []
            object_wordform, object_id_children, object_sent_children = identity_info

        # convert wordforms to ids
        actor_id = self.identity_ids_map[actor_wordform] if identity_is_actor \
            else self.sentiment_ids_map[ZERO_IDENTITY_INDICATOR]
        object_id = self.identity_ids_map[object_wordform] if not identity_is_actor \
            else self.sentiment_ids_map[ZERO_IDENTITY_INDICATOR]
        actor_mod_ids = [self.identity_ids_map[i] for i in actor_id_children] + \
                        [self.sentiment_ids_map[i] for i in actor_sent_children]
        object_mod_ids = [self.identity_ids_map[i] for i in object_id_children] + \
                         [self.sentiment_ids_map[i] for i in object_sent_children]
        behavior_ids = [self.sentiment_ids_map[b] for b in beh_forms]

        constraint_string = tsn([
            'BEHAVIOR ', is_negated, '   ', actor_id_children,
            actor_sent_children, actor_wordform, ' ----> ', beh_forms,
            ' -----> ', object_id_children, object_sent_children,
            object_wordform
        ], False)

        # add constraint to all identities
        identities_in_constr = actor_id_children + object_id_children
        if identity_is_actor:
            identities_in_constr.append(actor_wordform)
        else:
            identities_in_constr.append(object_wordform)

        if self.use_behaviors:
            # create constraint
            constraint = EventConstraint(actor=actor_id,
                                         behavior_terms=behavior_ids,
                                         object=object_id,
                                         actor_mods=actor_mod_ids,
                                         object_mods=object_mod_ids,
                                         behavior_is_negated=is_negated)
            self.constraint_string_list.append(constraint_string)
            self.all_constraints.append(constraint)
            self.sentence_ids_to_constraints[identity_node.dp_obj.id].add(
                behavior_node.dp_obj.id)
            for identity in identities_in_constr:
                constraint_map[self.identity_ids_map[identity]].append(
                    constraint)

        return True
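
# Hedged sketch (assumption, not from the source): EventConstraint is constructed
# in both event_constraint() and behavior_constraint() above but never defined in
# this listing.  A minimal stand-in with the same keyword arguments ("object"
# deliberately mirrors the call sites, shadowing the builtin only locally):
class EventConstraint(object):
    def __init__(self, actor, behavior_terms, object, actor_mods,
                 object_mods, behavior_is_negated):
        self.actor = actor                              # id of the actor term
        self.behavior_terms = behavior_terms            # ids of the behavior terms
        self.object = object                            # id of the object term
        self.actor_mods = actor_mods                    # modifier ids on the actor
        self.object_mods = object_mods                  # modifier ids on the object
        self.behavior_is_negated = behavior_is_negated  # True if the behavior is negated
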
Example #7
objfiles = listdir(os.path.join(INPUT_DIR, 'obj/'))
jsonfiles = set([
    os.path.basename(f)[:-8] for f in listdir(os.path.join(INPUT_DIR, 'json/'))
])
print list(jsonfiles)[:5]
print objfiles[:5]
onlyfiles = [os.path.basename(o) for o in objfiles if o in jsonfiles]

print 'N FILES: ', len(onlyfiles)
print onlyfiles[:5]
#results = [get_user_info((0,onlyfiles[0]))]

pool = Pool(int(sys.argv[3]))
results = pool.map(get_user_info, enumerate(onlyfiles))
pool.close()
pool.join()

of = open(os.path.join(OUTPUT_DIR, "mention_counts_total.tsv"), "w")
for uid, mention_counter in results:
    for k, v in mention_counter.items():
        of.write(tsn([uid, k, v]))
of.close()
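
# Hedged usage note: the worker-pool size above is read from sys.argv[3];
# INPUT_DIR, OUTPUT_DIR and tsn are defined elsewhere in the original script.
# An illustrative (assumed, not documented) invocation would therefore look like:
#
#     python count_mentions.py <arg1> <arg2> 16
#
# Each row of the resulting mention_counts_total.tsv is
# "user_id <tab> n_mentions_in_tweet <tab> n_tweets_with_that_many_mentions".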
def run_baseline_on_conll_file(conll_filename, path_to_dicts, output_filename):

    features_from_conll, blah = get_all_features(conll_filename, None, None,None,None)
    labels, features, obj_inds = configure_features_for_wordvectors_and_remove_twitterner(features_from_conll)[0:3]
    
    ## for dictionary-based evaluation
    stopwords = get_stopwords()

    data = read_grouped_by_newline_file(conll_filename)
    dependency_parses = []
    for x in data:
        dependency_parses.append([DependencyParseObject(o) for o in x])

    # get all the dictionaries together
    p_look_in_dict = partial(look_in_dict, sets=[stopwords], set_names=["stopwords"])
    act_dict = p_look_in_dict(dependency_parses, Dictionaries(os.path.join(path_to_dicts,'identities.txt')))
    wordnet_dict = p_look_in_dict(dependency_parses, Dictionaries(os.path.join(path_to_dicts,'wordnet_identities.txt')))
    racial_dict = p_look_in_dict(dependency_parses, Dictionaries(os.path.join(path_to_dicts,'racial_slur_identities.txt')))
    national_dict = p_look_in_dict(dependency_parses, Dictionaries(os.path.join(path_to_dicts,'national_identities.txt')))
    job_dict = p_look_in_dict(dependency_parses, Dictionaries(os.path.join(path_to_dicts,'job_identities.txt')))
    
    all_ds = Dictionaries(os.path.join(path_to_dicts,'*identities.txt'))
    all_dict = p_look_in_dict(dependency_parses,all_ds)
    
    # get the bootstrapped dictionary together
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(BOOTSTRAPPED_DICTIONARY_LOCATION)
    stopwords = get_stopwords()
    twit_sets = []
    
    for v in [10, 100, 1000, 10000,50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = twit_id - stopwords
        twit_sets.append([twit_id,"twit_identities"+str(v)])
    
    all_random_ids = get_test_ids(conll_filename, 0, -1, -1)
    
    y = np.array(labels)

    output_file = open(output_filename, "w")

    #test all the basic dicts
    for d in [['act_dict',act_dict],
              ['racial_dict',racial_dict],
              ['nat_dict',national_dict],
              ['job_dict',job_dict],
              ['wordnet_dict',wordnet_dict],
              ['all_dict',all_dict]]:
        preds = get_isin_array(d[1],obj_inds)
        out = evaluate(.4, y, preds , obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([d[0]] + out[1:]))
    
    # test the bootstrapped dicts
    for twit_set, twit_set_id in twit_sets:
        d = look_in_dict(dependency_parses,sets=[twit_set,stopwords],set_names=["twit_identities", "stopwords"])
        out = evaluate(.4, y, get_isin_array(d,obj_inds), obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id+"_alone"] + out[1:]))
        d = look_in_dict(dependency_parses,
                     all_ds,[twit_set, stopwords],[twit_set_id,"stopwords"])
        out = evaluate(.4, y, get_isin_array(d,obj_inds), obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id+"_w_all"] + out[1:]))

    output_file.close()
Example #9
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return [
                'no_dp_ptb',
                [
                    user.user_id,
                    os.path.exists(dp_filename),
                    os.path.exists(ptb_filename)
                ]
            ]

        penntreebank = {
            x[0]: x[1:]
            for x in read_grouped_by_newline_file(ptb_filename)
        }
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non-English speaker or spam
        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(
                    dp_for_tweet[0]).text:
                print 'warning: PTB tokens do not align with dependency parse tokens, skipping tweet'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([
                        p, tweet.id, user.user_id,
                        tweet.created_at.strftime("%m-%d-%y")
                    ],
                        newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
for x in stopword_test_inds:
    if y_pub[x] == 1:
        stopword_test_inds_1.append(x)
    else:
        stopword_test_inds_0.append(x)

if len(stopword_test_inds):
    # stopword-matched tokens are not scored by the model; append them below as
    # predicted class 0 (probability [1, 0]) so they still count in the evaluation
    extra_tn = len(stopword_test_inds_0)
    extra_fn = len(stopword_test_inds_1)
    y_pub = np.concatenate(
        (y_pub[test_inds], np.array([0] * extra_tn), np.array([1] * extra_fn)),
        axis=0)
    predicted_prob = np.concatenate(
        (predicted_prob, [[1, 0]] * (extra_tn + extra_fn)), axis=0)
    test_inds = test_inds + stopword_test_inds_0 + stopword_test_inds_1

output_file = open("results/final_model_pub_res.tsv", "w")
eval_out = evaluate(.5, y_pub, predicted_prob, obj_inds_pub, test_inds, True,
                    True, True)
output_file.write(tsn(["final_model"] + eval_out[1:]))
output_file.close()

from utility_code.dependency_parse_object import DependencyParseObject
test_data = {
    DependencyParseObject(x[0]).tweet_id: x
    for x in read_grouped_by_newline_file("test_data/final_conll_pub.txt")
}

write_out_predictions("results/predictions_pub_data.txt", test_data,
                      obj_inds_pub, test_inds, y_pub, predicted_prob)
Example #11
def run_baseline_on_conll_file(conll_filename, path_to_dicts, output_filename):

    features_from_conll, blah = get_all_features(conll_filename, None, None,
                                                 None, None)
    labels, features, obj_inds = configure_features_for_wordvectors_and_remove_twitterner(
        features_from_conll)[0:3]

    ## for dictionary-based evaluation
    stopwords = get_stopwords()

    data = read_grouped_by_newline_file(conll_filename)
    dependency_parses = []
    for x in data:
        dependency_parses.append([DependencyParseObject(o) for o in x])

    # get all the dictionaries together
    p_look_in_dict = partial(look_in_dict,
                             sets=[stopwords],
                             set_names=["stopwords"])
    act_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'identities.txt')))
    wordnet_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'wordnet_identities.txt')))
    racial_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts,
                                  'racial_slur_identities.txt')))
    national_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'national_identities.txt')))
    job_dict = p_look_in_dict(
        dependency_parses,
        Dictionaries(os.path.join(path_to_dicts, 'job_identities.txt')))

    all_ds = Dictionaries(os.path.join(path_to_dicts, '*identities.txt'))
    all_dict = p_look_in_dict(dependency_parses, all_ds)

    # get the bootstrapped dictionary together
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        BOOTSTRAPPED_DICTIONARY_LOCATION)
    stopwords = get_stopwords()
    twit_sets = []

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[(
            tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = twit_id - stopwords
        twit_sets.append([twit_id, "twit_identities" + str(v)])

    all_random_ids = get_test_ids(conll_filename, 0, -1, -1)

    y = np.array(labels)

    output_file = open(output_filename, "w")

    #test all the basic dicts
    for d in [['act_dict', act_dict], ['racial_dict', racial_dict],
              ['nat_dict', national_dict], ['job_dict', job_dict],
              ['wordnet_dict', wordnet_dict], ['all_dict', all_dict]]:
        preds = get_isin_array(d[1], obj_inds)
        out = evaluate(.4, y, preds, obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([d[0]] + out[1:]))

    # test the bootstrapped dicts
    for twit_set, twit_set_id in twit_sets:
        d = look_in_dict(dependency_parses,
                         sets=[twit_set, stopwords],
                         set_names=["twit_identities", "stopwords"])
        out = evaluate(.4,
                       y,
                       get_isin_array(d, obj_inds),
                       obj_inds,
                       all_random_ids,
                       print_eval=True)
        output_file.write(tsn([twit_set_id + "_alone"] + out[1:]))
        d = look_in_dict(dependency_parses, all_ds, [twit_set, stopwords],
                         [twit_set_id, "stopwords"])
        out = evaluate(.4,
                       y,
                       get_isin_array(d, obj_inds),
                       obj_inds,
                       all_random_ids,
                       print_eval=True)
        output_file.write(tsn([twit_set_id + "_w_all"] + out[1:]))

    output_file.close()