Example #1
def docs(dataset_name):
    p = utils.Progbar(target=(utils.lines_in_file(directories.RAW +
                                                  dataset_name)))
    for i, d in enumerate(utils.load_json_lines(directories.RAW +
                                                dataset_name)):
        p.update(i + 1)
        yield d
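
The generator above just streams documents from a raw JSON-lines file while updating a progress bar. A minimal consumer sketch, assuming the project's utils and directories modules are importable and a "train" split exists under directories.RAW; the mention tally itself is purely illustrative:

# Hypothetical consumer: tally how many mentions each raw document contains.
from collections import Counter

mention_counts = Counter()
for d in docs("train"):
    doc_id = d["document_features"]["doc_id"]
    mention_counts[doc_id] = len(d["mentions"])
print(sum(mention_counts.values()), "mentions across", len(mention_counts), "documents")
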
Example #2
def write_feature_names():
    utils.write_pickle(
        {
            f: i
            for i, f in enumerate(
                next(utils.load_json_lines(directories.RAW +
                                           'train'))["pair_feature_names"])
        }, directories.MISC + 'pair_feature_names.pkl')
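
The pickle written here maps every pairwise feature name to its column index in the pair-feature vectors. A short sketch of reading that mapping back with utils.load_pickle (used elsewhere in these examples); the feature name looked up is made up:

# Hypothetical lookup: recover the column index of one pairwise feature.
feature_indices = utils.load_pickle(directories.MISC + 'pair_feature_names.pkl')
column = feature_indices.get("same-speaker")  # made-up feature name; None if absent
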
Example #3
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = utils.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print("Building dataset", name + ("/tune" if tune_fraction > 0 else ""))
    p = utils.Progbar(target=(2 if reduced else utils.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(utils.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        # In reduced mode the first document goes to the main split and the rest to tune;
        # otherwise each document is assigned to tune with probability tune_fraction.
        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        # Remember the builder offsets before this document so its mention and pair
        # ranges can be recorded in the document builder below.
        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(), key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
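
One detail worth isolating: the label keys are strings of two mention numbers, and the sort in the loop above orders pairs by the second number first, then the first. A standalone illustration with made-up keys:

# Keys are "m1 m2" strings; sort by the second mention number, then the first.
keys = ["3 7", "1 7", "2 5", "1 5"]
ordered = sorted(keys, key=lambda k: (int(k.split()[1]), int(k.split()[0])))
print(ordered)  # ['1 5', '2 5', '1 7', '3 7']
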
Example #4
def load_gold(dataset_name):
    gold = {}
    mention_to_gold = {}
    for doc_gold in utils.load_json_lines(directories.GOLD + dataset_name):
        did = int(list(doc_gold.keys())[0])
        gold[did] = doc_gold[str(did)]
        mention_to_gold[did] = {}
        for gold_cluster in doc_gold[str(did)]:
            for m in gold_cluster:
                mention_to_gold[did][m] = tuple(gold_cluster)
    return gold, mention_to_gold
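
load_gold returns two structures per document id: the list of gold clusters and a lookup from each mention to its gold cluster as a tuple. A minimal sketch of those shapes, using a single made-up gold line:

# One gold JSON line: document id -> list of gold clusters (mention ids are made up).
doc_gold = {"12": [[0, 3, 5], [1, 4]]}
did = int(list(doc_gold.keys())[0])
mention_to_gold = {m: tuple(cluster)
                   for cluster in doc_gold[str(did)]
                   for m in cluster}
print(mention_to_gold)  # {0: (0, 3, 5), 3: (0, 3, 5), 5: (0, 3, 5), 1: (1, 4), 4: (1, 4)}
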
Example #5
def explore_pairwise_features():
    pos_sum, neg_sum = np.zeros(9), np.zeros(9)
    pos_count, neg_count = 0, 0
    for i, d in enumerate(utils.load_json_lines(directories.RAW + "train")):
        for key in d["labels"].keys():
            if d["labels"][key] == 1:
                pos_sum += d["pair_features"][key]
                pos_count += 1
            else:
                neg_sum += d["pair_features"][key]
                neg_count += 1
        print("positive counts", list(pos_sum))
        print("negative counts", list(neg_sum))
        print("feature odds", list(np.divide(pos_sum / pos_count,
                                             (pos_sum / pos_count + neg_sum / neg_count))))
        print()
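
For binary pair features, the printed ratio compares a feature's firing rate among coreferent pairs with the sum of its firing rates in both classes, so values near 1 indicate a feature that fires almost exclusively in coreferent pairs. A toy calculation with made-up counts:

import numpy as np

# Made-up tallies: two binary features over 100 positive and 900 negative pairs.
pos_sum, pos_count = np.array([30.0, 5.0]), 100
neg_sum, neg_count = np.array([10.0, 45.0]), 900
odds = np.divide(pos_sum / pos_count,
                 pos_sum / pos_count + neg_sum / neg_count)
print(odds)  # approximately [0.964 0.5]
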
Example #6
def write_feature_names():
    raw_train = directories.RAW + 'train'
    try:
        utils.write_pickle(
            {
                f: i
                for i, f in enumerate(
                    next(utils.load_json_lines(raw_train))
                    ["pair_feature_names"])
            }, directories.MISC + 'pair_feature_names.pkl')
    except FileNotFoundError as e:
        if e.filename == raw_train:
            raise FileNotFoundError(
                'Raw training data not found. Perhaps you need to copy the original dataset first: %s'
                % e.filename) from e
        else:
            raise
Example #7
def main(model_path, dataset_name):
    docs = utils.load_pickle(model_path + dataset_name + '_processed_docs.pkl')

    for doc_data in utils.load_json_lines(directories.RAW + dataset_name):
        sentences = doc_data["sentences"]
        mid_to_mention = {int(m["mention_id"]): m for m in doc_data["mentions"].values()}
        mid_to_position = {mid: int(m["mention_num"]) for mid, m in mid_to_mention.items()}

        doc = docs[doc_data["document_features"]["doc_id"]]
        clusters = [c for c in doc.clusters if len(c) > 1]

        cluster_to_endpoints = {}
        for c in clusters:
            positions = [mid_to_position[mid] for mid in c]
            cluster_to_endpoints[c] = (min(positions), max(positions))
        sorted_clusters = sorted(clusters, key=lambda c: cluster_to_endpoints[c])

        color_last_usage = {i: -1 for i in range(len(COLORS))}
        active_clusters = []
        cluster_to_color = {}
        for c in sorted_clusters:
            start, end = cluster_to_endpoints[c]
            # retire clusters whose last mention comes before this cluster's first mention
            for a in list(active_clusters):
                if cluster_to_endpoints[a][1] < start:
                    active_clusters.remove(a)

            used_colors = [cluster_to_color[a] for a in active_clusters]
            # reuse the least recently used color that no overlapping cluster currently holds
            sorted_colors = sorted((u, i) for i, u in color_last_usage.items())
            next_color = None
            for u, i in sorted_colors:
                if i not in used_colors:
                    next_color = i
                    break
            if next_color is None:
                next_color = sorted_colors[0][1]

            color_last_usage[next_color] = start
            cluster_to_color[c] = next_color
            active_clusters.append(c)

        annotations = defaultdict(lambda: defaultdict(list))
        for i, c in enumerate(sorted_clusters):
            color = COLORS[cluster_to_color[c]]
            for m in c:
                mention = mid_to_mention[m]
                start, end = mention["start_index"], mention["end_index"] - 1
                annotations[mention["sent_num"]][start].append(
                    (color + "[" + ENDC, 1 + end))
                annotations[mention["sent_num"]][end].append(
                    (color + "]" + subscript(i) + ENDC, -1 - start))

        for i, s in enumerate(sentences):
            for j, sentence_annotations in annotations[i].items():
                sentence_annotations = sorted(sentence_annotations, key=itemgetter(1))
                for (annotation, priority) in sentence_annotations:
                    if priority > 0:
                        s[j] = annotation + s[j]
                    else:
                        s[j] = s[j] + annotation
            print(" ".join(s))

        print()
        print(80 * "=")
        print()
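
The color assignment above is a small greedy scheme: clusters are visited in span order, clusters that have already ended are retired, and each new cluster takes the least recently used color not held by any still-active cluster. The same idea in isolation on made-up (start, end) spans, without the mention and COLORS machinery:

# Greedy color reuse over sorted intervals; two colors available.
intervals = [(0, 4), (2, 6), (5, 9), (7, 8)]
color_last_usage = {i: -1 for i in range(2)}
active, assigned = [], {}
for idx, (start, end) in enumerate(intervals):
    active = [a for a in active if intervals[a][1] >= start]  # drop intervals that ended earlier
    used = {assigned[a] for a in active}
    candidates = sorted((u, i) for i, u in color_last_usage.items())
    next_color = next((i for u, i in candidates if i not in used), candidates[0][1])
    color_last_usage[next_color] = start
    assigned[idx] = next_color
    active.append(idx)
print(assigned)  # {0: 0, 1: 1, 2: 0, 3: 1}
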