def find_k_most_difficult_tweets(
        dbs, fig_dst, stat_dst, anno_coll_name="user", tweet_coll_name="tweets",
        cleaned=False, k=30, use_median=False):
    """
    Plots the average/median annotation time per tweet over all annotators
    (across all institutions) and likewise for each individual institution.
    In all cases the times are sorted in ascending order before plotting.
    Also stores the k most difficult tweets in a separate .txt file.

    Parameters
    ----------
    dbs: list of strings - names of the existing DBs
    fig_dst: str - directory in which the plot will be stored.
    stat_dst: str - directory in which the stats will be stored.
    anno_coll_name: str - name of the collection holding the annotator data.
    tweet_coll_name: str - name of the collection holding the tweet data.
    cleaned: bool - True if the data should be cleaned, i.e. if a tweet was
    labeled "Irrelevant", its remaining labels are ignored when computing
    average annotation times.
    k: int - number of most difficult tweets that should be stored.
    use_median: bool - True if median annotation times should be used instead
    of averages to determine difficult tweets. Otherwise average annotation
    times are used.

    """
    dataset_type = "raw"
    if cleaned:
        dataset_type = "cleaned"

    avg_type = "average"
    if use_median:
        avg_type = "median"

    # Different visualization for MD and SU
    SU_ALL = [0, 3]
    MD_ALL = [1, 2, 4, 5, 6]
    # Experiment run in MD after election in Jan 2017
    LATER_ALL = [4, 5, 6]
    # institute_dbs = [SU_ALL, MD_ALL, LATER_ALL]
    # institutions = ["su", "md", "later"]

    # {tweet_id1: [anno_time1, anno_time2, ...]}
    # Stores annotation times over all institutions
    anno_times = {}
    # Same as <anno_times>, but for each institution now
    anno_times_md = {}
    anno_times_su = {}
    anno_times_later = {}

    # This DB contains all 500 tweets used in the experiment
    db_all = "lturannotationtool"
    all_tweets_coll, anno_coll = utility.load_tweets_annotators_from_db(
            db_all, tweet_coll_name, anno_coll_name)
    # a = 0
    # For every tweet
    for tweet_no, tweet in enumerate(all_tweets_coll.find()):
        # Use Twitter ID because _id differs for the same tweet as it was
        # created in multiple DBs.
        twitter_id = tweet["id_str"]
        anno_times[twitter_id] = []
        # Search in every DB for annotations of this tweet
        for db_idx, db in enumerate(dbs):
            tweet_coll, anno_coll = utility.load_tweets_annotators_from_db(
                db, tweet_coll_name, anno_coll_name)
            DUMMY = 0
            # Search in every annotator
            for idx, anno in enumerate(anno_coll.find()):
                username = anno["username"]
                # print "Collect tweets for annotator '{}'".format(username)
                # Tweet IDs labeled by this annotator
                labeled = anno["annotated_tweets"]
                for tid in labeled:
                    t = utility.get_tweet(tweet_coll, tid)
                    twitter_id_other = t["id_str"]
                    # We found a matching entry, so add its annotation time
                    if twitter_id == twitter_id_other:
                        rel_label = t["relevance_label"][username]
                        l1 = t["relevance_time"][username]
                        # c1 = t["confidence_relevance_time"][username]

                        # Discard remaining labels if annotator chose
                        # "Irrelevant". Consider the remaining label sets
                        # only if either the cleaned dataset should be created
                        # and the label is not "Irrelevant", OR the raw
                        # dataset should be used.
                        if (cleaned and rel_label != "Irrelevant") \
                                or not cleaned:
                            l2 = t["fact_time"][username]
                            # c2 = t["confidence_fact_time"][username]
                            # Annotator labeled the 3rd set of labels as well
                            if username in tweet["opinion_label"]:
                                l3 = t["opinion_time"][username]
                                # c3 = t["confidence_opinion_time"][username]
                            else:
                                # 3rd set of labels might not have been
                                # assigned by the annotator, so use a low
                                # constant (0) that doesn't affect the
                                # summed annotation time
                                l3 = DUMMY
                                # c3 = DUMMY
                        else:
                            # Ignore remaining labels
                            l2 = DUMMY
                            # c2 = DUMMY
                            l3 = DUMMY
                            # c3 = DUMMY
                        ls = [l1, l2, l3]
                        # cs = [c1, c2, c3]
                        # Add up relative timers
                        total = sum(ls)
                        # Append time to all suitable data structures
                        # a) over all institutions
                        anno_times[twitter_id].append(total)
                        # b) SU
                        if db_idx in SU_ALL:
                            if twitter_id not in anno_times_su:
                                anno_times_su[twitter_id] = []
                            anno_times_su[twitter_id].append(total)
                        # c) MD (+LATER)
                        if db_idx in MD_ALL or db_idx in LATER_ALL:
                            if twitter_id not in anno_times_md:
                                anno_times_md[twitter_id] = []
                            anno_times_md[twitter_id].append(total)
                        # d) LATER
                        if db_idx in LATER_ALL:
                            if twitter_id not in anno_times_later:
                                anno_times_later[twitter_id] = []
                            anno_times_later[twitter_id].append(total)
        # a += 1
        # if a == 4:
        #     break
        print "Processed annotators:", (tweet_no + 1)
    names = ["total", "su", "md", "later"]
    datasets = [anno_times, anno_times_su, anno_times_md, anno_times_later]
    for dataset, dataset_name in zip(datasets, names):
        # Compute average annotation time per tweet
        avg_times = {}
        for tid, times in dataset.iteritems():
            # Use median or average of annotation times
            if use_median:
                avg = median(times)
            else:
                avg = 1.0 * sum(times) / len(times)
            votes = len(times)
            avg_times[tid] = (avg, votes)
        # Sort average annotation times ascendingly
        # THIS ONLY SORTS THE KEYS, so values must be retrieved from <avg_times>
        # http://stackoverflow.com/questions/6349296/sorting-a-dict-with-tuples-as-values
        sorted_avg_times_keys = sorted(avg_times.keys(),
                                       key=lambda x: avg_times[x][0])

        # Store k most difficult (= highest avg. annotation time) tweets in file
        fname = "{}_most_difficult_tweets_{}_{}.txt"\
            .format(dataset_name, avg_type, dataset_type)
        t = "Avg. time"
        if use_median:
            t = "Med. time"
        with open(stat_dst + fname, "w") as f:
            title = "Most difficult tweets"
            f.write(title + "\n")
            f.write("-" * len(title) + "\n\n")
            f.write("{:<19} | {:<9} | {:<6}\n"
                    .format("Twitter ID", t, "#Annotators who labeled tweet"))
            written = 0
            for tid in reversed(sorted_avg_times_keys):
                if written < k:
                    avg_time = avg_times[tid][0]
                    votes = avg_times[tid][1]
                    f.write("{:<21} {:9.2f}   {:<6}\n".format(tid, avg_time,
                                                              votes))
                written += 1

        # Plot the results
        fname = "{}_most_difficult_tweets_{}_{}.png"\
            .format(dataset_name, avg_type, dataset_type)
        fig = plt.figure(figsize=(20, 3))
        ax = fig.add_subplot(111)
        width = 0.02
        x = range(len(sorted_avg_times_keys))
        y = [avg_times[t][0] for t in sorted_avg_times_keys]
        ax.bar(x, y, width, color="black")
        # Title
        title = "{} annotation time per tweet".format(avg_type.title())
        if dataset_name == "su" or dataset_name == "md" \
                or dataset_name == "later":
            title = "{} annotation time per tweet in {}"\
                .format(avg_type.title(), dataset_name)
        plt.title(title)
        # Hide the right and top spines (lines)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        # Limits of axes
        plt.xlim(-0.1, x[-1] + 1)
        plt.ylim(0, y[-1] + 0.5)
        # Set labels of axes
        ax.set_xlabel("Tweet")
        ax.set_ylabel("{} annotation time in s".format(avg_type))
        # Add legend outside of plot
        legend = ax.legend(loc="best", shadow=True, bbox_to_anchor=(0.5, 1.5))
        # plt.tick_params(
        #     axis='x',          # changes apply to the x-axis
        #     which='both',      # both major and minor ticks are affected
        #     bottom='off',      # ticks along the bottom edge are off
        #     top='off',         # ticks along the top edge are off
        #     labelbottom='off') # labels along the bottom edge are off
        plt.savefig(fig_dst + fname, bbox_inches='tight')
        plt.close()
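
# A minimal usage sketch (illustrative, not from the original code): the DB
# names and destination paths below are assumptions; only "lturannotationtool"
# appears above. The index lists SU_ALL/MD_ALL/LATER_ALL imply that <dbs>
# holds 7 DB names in a fixed order.
#
#   DBS = ["db_%d" % i for i in range(7)]    # hypothetical DB names
#   find_k_most_difficult_tweets(
#       DBS, fig_dst="figures/", stat_dst="stats/",
#       cleaned=True, k=30, use_median=True)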
def read_dataset(dbs,
                 db_idxs,
                 anno_coll_name="user",
                 tweet_coll_name="tweets",
                 cleaned=False,
                 min_annos=3):
    """
    Read dataset.

    Parameters
    ----------
    dbs: list of strings - names of the existing DBs.
    db_idxs: list of ints - indices into <dbs> of the DBs from which data
    should be read.
    anno_coll_name: str - name of the collection holding the annotator data.
    tweet_coll_name: str - name of the collection holding the tweet data.
    cleaned: bool - True if the cleaned data is used as input.
    min_annos: int - minimum number of annotators who must have assigned a
    label to a tweet. Otherwise the tweet is discarded.

    Returns
    --------
    list, dict.
    List of Annotator objects in institution with their tweets.
    Dictionary storing for each tweet how many annotators labeled it.

    """
    # Store a list of annotators per group/institution
    inst_annos = []

    for db_idx in db_idxs:
        # Get DB name
        db = dbs[db_idx]
        tweet_coll, anno_coll = utility.load_tweets_annotators_from_db(
            db, tweet_coll_name, anno_coll_name)
        # For each anno
        for anno in anno_coll.find():
            username = anno["username"]
            group = anno["group"]
            # Use username + "_" + group because 2 annotators of MD
            # labeled for S and M (otherwise their entries are overridden)
            dict_key = username + "_" + group
            inst_anno = Annotator(dict_key, group)
            # Tweet IDs labeled by this annotator
            labeled = anno["annotated_tweets"]
            for tid in labeled:
                second_label = EMPTY
                third_label = EMPTY
                fac_time = ZERO
                opi_time = ZERO
                tweet = utility.get_tweet(tweet_coll, tid)
                # Use Twitter ID because _id differs for the same
                # tweet as it was created in multiple DBs.
                tweet_id = tweet["id_str"]
                text = tweet["text"]
                first_label = tweet["relevance_label"][username]
                rel_time = tweet["relevance_time"][username]
                # Discard remaining labels if annotator chose "Irrelevant".
                # Consider the remaining label sets only if either the cleaned
                # dataset should be created and the label is not "Irrelevant",
                # OR the raw dataset should be used.
                if (cleaned and first_label != "Irrelevant") or not \
                        cleaned:
                    second_label = tweet["fact_label"][username]
                    fac_time = tweet["fact_time"][username]
                    # Annotator labeled the 3rd set of labels as well
                    if username in tweet["opinion_label"]:
                        third_label = tweet["opinion_label"][username]
                        opi_time = tweet["opinion_time"][username]
                # Add annotation times and labels to annotator
                anno_time = sum([rel_time, fac_time, opi_time])
                labels = [first_label, second_label, third_label]
                inst_anno.add_tweet(tweet_id, anno_time, labels, text)
            # Store annotator
            inst_annos.append(inst_anno)

    # Count for each tweet how often it was labeled.  The reason for
    # NOT counting in the previous loop is that 3 annotators of MD (M) - see
    # anno.py for a detailed explanation at the top - labeled the same tweet
    # twice, so counting would be off by 1 for 3 annotators. Therefore,
    # anno.Annotator handles these exceptions and ignores the tweets that were
    # labeled a second time.
    inst_counts = count_annotators_per_tweet(inst_annos)

    # Now only keep tweets that were labeled sufficiently often by annotators
    # Create a list of tweet IDs that must be removed since they weren't labeled
    # by enough annotators
    removed_inst_tweets = [
        tid for tid in inst_counts if inst_counts[tid] < min_annos
    ]
    print "remove from INSTITUTION all:", len(removed_inst_tweets)

    # Delete tweets that weren't labeled by enough annotators
    for anno in inst_annos:
        anno.delete_tweets(removed_inst_tweets)

    # Test that all tweets were removed
    for anno in inst_annos:
        for tid in anno.all_tweets():
            if tid in removed_inst_tweets:
                raise Exception("can't happen")

    # Make sure that we only count tweets that were sufficiently often
    # labeled in the institution
    for tid in removed_inst_tweets:
        del inst_counts[tid]
    print "#tweets in dataset", len(inst_counts)
    print "#annos in dataset", len(inst_annos)
    return inst_annos, inst_counts
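
# For reference, a rough sketch (an assumption, not the original helper) of
# what count_annotators_per_tweet() used above presumably computes, given that
# Annotator.all_tweets() yields the tweet IDs kept for an annotator:
#
#   def count_annotators_per_tweet(annos):
#       counts = {}
#       for anno in annos:
#           for tid in anno.all_tweets():
#               counts[tid] = counts.get(tid, 0) + 1
#       return counts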
def read_dataset(dbs,
                 db_idxs,
                 inst_name,
                 thresholds,
                 anno_coll_name="user",
                 tweet_coll_name="tweets",
                 cleaned=False,
                 min_annos=1,
                 is_early=True,
                 both_stages=False):
    """
    Read dataset (tweet texts per annotator and their labels) per group
    and institution.
    Group L is ignored.

    Parameters
    ----------
    dbs: list of strings - names of the existing DBs.
    db_idxs: list of ints - indices into <dbs> of the DBs from which data
    should be read.
    inst_name: string - name of the institution.
    thresholds: dict - thresholds for the early/late annotation stage. Keys are
    the institution name or group names; values are the corresponding
    thresholds.
    anno_coll_name: str - name of the collection holding the annotator data.
    tweet_coll_name: str - name of the collection holding the tweet data.
    cleaned: bool - True if the cleaned data is used as input.
    min_annos: int - minimum number of annotators who must have assigned a
    label to a tweet. Otherwise the tweet is discarded.
    is_early: bool - True if only tweets from the early phase should be
    considered. Else only tweets from the late stage are considered.
    both_stages: bool - True if both stages should be used instead of just early
    or late stage. If True, <is_early> is ignored. If False, only one stage
    according to <is_early> is considered.

    Returns
    --------
    list, dict, dict, dict.
    List of Annotator objects in institution with their tweets separated into
    early and late stage.
    Dictionary (group names "S" and "M" are keys) with lists of Annotator
    objects per group as value with their tweets separated into early and late
    stage.
    Counters for institution, i.e. how often each tweet was labeled.
    Counters for groups, i.e. how often each tweet was labeled in the group.
    Counters only contain counts for tweets that were labeled sufficiently
    often. Counter values are tuples (raw_count, normalized_count), where
    normalized_count = raw_count / max(raw_count).

    """
    # Store a list of annotators per group/institution
    inst_annos = []
    group_annos = {"S": [], "M": []}

    for db_idx in db_idxs:
        # Get DB name
        db = dbs[db_idx]
        tweet_coll, anno_coll = utility.load_tweets_annotators_from_db(
            db, tweet_coll_name, anno_coll_name)
        # For each anno
        for anno in anno_coll.find():
            username = anno["username"]
            group = anno["group"]
            # Use username + "_" + group because 2 annotators of MD
            # labeled for S and M (otherwise their entries are overridden)
            dict_key = username + "_" + group
            # Ignore annotations from group L
            if group != "L":
                group_anno = Annotator(dict_key, group)
                inst_anno = Annotator(dict_key, group)
                # Tweet IDs labeled by this annotator
                labeled = anno["annotated_tweets"]
                for idx, tid in enumerate(labeled):
                    second_label = EMPTY
                    third_label = EMPTY
                    fac_time = ZERO
                    opi_time = ZERO
                    tweet = utility.get_tweet(tweet_coll, tid)
                    # Use Twitter ID because _id differs for the same tweet as
                    # it was created in multiple DBs.
                    tweet_id = tweet["id_str"]
                    text = tweet["text"]
                    first_label = tweet["relevance_label"][username]
                    rel_time = tweet["relevance_time"][username]
                    # Discard remaining labels if annotator chose "Irrelevant".
                    # Consider the remaining label sets only if either the
                    # cleaned dataset should be created and the label is not
                    # "Irrelevant", OR the raw dataset should be used.
                    if (cleaned
                            and first_label != "Irrelevant") or not cleaned:
                        second_label = tweet["fact_label"][username]
                        fac_time = tweet["fact_time"][username]
                        # Annotator labeled the 3rd set of labels as well
                        if username in tweet["opinion_label"]:
                            third_label = tweet["opinion_label"][username]
                            opi_time = tweet["opinion_time"][username]
                    # Add annotation times and labels to annotator
                    anno_time = sum([rel_time, fac_time, opi_time])
                    labels = [first_label, second_label, third_label]
                    # Use tweets from both stages
                    if both_stages:
                        group_anno.add_tweet(tweet_id, anno_time, labels, text)
                        inst_anno.add_tweet(tweet_id, anno_time, labels, text)
                    else:
                        # Use only tweets from early stage
                        if is_early:
                            if anno_time <= thresholds[group]:
                                group_anno.add_tweet(tweet_id, anno_time,
                                                     labels, text)
                            if anno_time <= thresholds[inst_name]:
                                inst_anno.add_tweet(tweet_id, anno_time,
                                                    labels, text)
                        # Use only tweets from late stage
                        else:
                            if anno_time > thresholds[group]:
                                group_anno.add_tweet(tweet_id, anno_time,
                                                     labels, text)
                            if anno_time > thresholds[inst_name]:
                                inst_anno.add_tweet(tweet_id, anno_time,
                                                    labels, text)
                # Store annotator in group/institution
                inst_annos.append(inst_anno)
                group_annos[group].append(group_anno)

    # Count for each tweet how often it was labeled - per group and institution
    # as the results could vary, e.g. some tweets are insufficiently labeled
    # in groups but sufficiently often in the whole institution. The reason for
    # NOT counting in the previous loop is that 3 annotators of MD (M) - see
    # anno.py for a detailed explanation at the top - labeled the same tweet
    # twice, so counting would be off by 1 for 3 annotators. Therefore,
    # anno.Annotator handles these exceptions and ignores the tweets that were
    # labeled a second time.

    inst_counts = count_annotators_per_tweet(inst_annos)
    group_counts = {
        "S": count_annotators_per_tweet(group_annos["S"]),
        "M": count_annotators_per_tweet(group_annos["M"])
    }

    # Now only keep tweets that were labeled sufficiently often by annotators
    # Create a list of tweet IDs that must be removed since they weren't labeled
    # by enough annotators
    removed_inst_tweets = [
        tid for tid in inst_counts if inst_counts[tid] < min_annos
    ]

    # Delete from each group/institution all tweets that weren't labeled
    # by enough annotators
    for anno in inst_annos:
        anno.delete_tweets(removed_inst_tweets)

    # Test that all tweets were removed
    for anno in inst_annos:
        for tid in anno.all_tweets():
            if tid in removed_inst_tweets:
                raise Exception("can't happen")

    # Make sure that we only count tweets that were sufficiently often
    # labeled in the institution
    for tid in removed_inst_tweets:
        del inst_counts[tid]

    # Delete tweets from groups that weren't labeled sufficiently often
    for group in group_annos:
        # Create a list of tweet IDs that must be removed since they
        # weren't labeled by enough annotators
        removed_group_tweets = [
            tid for tid in group_counts[group]
            if group_counts[group][tid] < min_annos
        ]

        for anno in group_annos[group]:
            anno.delete_tweets(removed_group_tweets)

        # Make sure that we only count tweets that were sufficiently often
        # labeled in the group
        for tid in removed_group_tweets:
            del group_counts[group][tid]

        # Test that all tweets were removed
        for anno in group_annos[group]:
            for tid in anno.all_tweets():
                if tid in removed_group_tweets:
                    raise Exception("can't happen")

    return inst_annos, group_annos, inst_counts, group_counts
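
# A minimal usage sketch (illustrative, not from the original code): DB names,
# indices, and threshold values are assumptions. Per the docstring,
# <thresholds> must contain the institution name as well as the group names
# ("S", "M") as keys.
#
#   thresholds = {"md": 60.0, "S": 55.0, "M": 65.0}    # hypothetical values
#   inst_annos, group_annos, inst_counts, group_counts = read_dataset(
#       DBS, db_idxs=[1, 2, 4, 5, 6], inst_name="md", thresholds=thresholds,
#       cleaned=True, min_annos=3, is_early=True)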