Example #1
import os

import nltk

import utils


def sentences(config, normalize, include_historic=True, include_modern=True):
    if normalize:
        names = utils.load_pickle(config.names)
        wordlike_names = utils.load_pickle(config.wordlike_names)

    for decade in os.listdir(config.data_dir):
        # Decade directories are named like "1810s"; strip the trailing "s".
        decade_int = int(decade[:-1])
        # Keep only the requested historic (< 1860) and/or modern (> 1980) decades.
        if not ((include_historic and decade_int < 1860) or
                (include_modern and decade_int > 1980)):
            continue

        print "DECADE:", decade
        path = os.path.join(config.data_dir, decade)
        fiction_files = [fname for fname in os.listdir(path) if fname.startswith("fic")]
        for fname in utils.logged_loop(fiction_files):
            with open(os.path.join(path, fname)) as f:
                for line in f:
                    line = line.strip().replace("<p>", "")
                    sentences = nltk.tokenize.sent_tokenize(line)
                    for s in sentences:
                        if "@" in s or "Page image" in s or "PRINTED" in s or "nbsp" in s:
                            continue
                        words = nltk.tokenize.word_tokenize(s)
                        if normalize:
                            words = normalize_sentence(words, names, wordlike_names)
                        yield words
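A minimal usage sketch for the generator above; `config` is assumed to provide the data_dir, names, and wordlike_names attributes referenced in the example and is not defined here.

# Hypothetical usage: stream normalized sentences and count tokens.
total_tokens = 0
for words in sentences(config, normalize=True):
    total_tokens += len(words)
print("total tokens:", total_tokens)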
Example #2
import numpy as np

import utils


def make_attn_word_level(data, tokenizer, cased):
    # Convert each example's token-level attention maps into word-level maps, in place.
    for features in utils.logged_loop(data):
        words_to_tokens = tokenize_and_align(tokenizer, features["words"],
                                             cased)
        # Sanity check: the word-to-token alignment must account for every token.
        assert sum(len(word)
                   for word in words_to_tokens) == len(features["tokens"])
        features["attns"] = np.stack([[
            get_word_word_attention(attn_head, words_to_tokens)
            for attn_head in layer_attns
        ] for layer_attns in features["attns"]])
Example #3
import shutil

import utils


def write_probable_pairs(dataset_name, action_space_path, scores):
    # SCORE_THRESHOLD and MARGIN_THRESHOLD are module-level constants (not shown here).
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in utils.logged_loop(scores):
        doc_scores = scores[did]
        # Rank candidate pairs by their score relative to a scaled "no antecedent"
        # score for the same mention; pairs whose first element is -1 encode
        # "no antecedent".
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] - (-1 - 0.3 * doc_scores[
                           (-1, pr[1])]),
                       reverse=True)

        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            # pairs is sorted by this score, so everything after the first miss also fails
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]],
                                          doc_scores[pair])
        # Margin filter: drop pairs whose score falls too far below the best score
        # recorded for their mention; margin_removals counts how many are dropped.
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [
            p for p in probable_pairs[did]
            if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD
        ]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs /
                                                   float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals /
                                                float(len(scores)))
    utils.write_pickle(
        probable_pairs,
        action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
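For intuition, here is a toy illustration of the ranking score computed above; the score values are made up, and SCORE_THRESHOLD / MARGIN_THRESHOLD do not appear in it.

# Toy illustration of the "score relative to no-antecedent" quantity used above.
doc_scores = {(-1, 2): -0.8,  # made-up "no antecedent" score for mention 2
              (0, 2): 0.5,    # made-up score for antecedent 0 of mention 2
              (1, 2): -0.2}
pair = (0, 2)
score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
print(score)  # 0.5 - (-1 + 0.24) = 1.26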
Example #4
import numpy as np

import utils


def make_attn_word_level(data, bert_attn_tokenizer, self_attn_tokenizer, args):
    # Word-level attention conversion with two separate tokenizer alignments
    # ("self" and "bert").
    for features in utils.logged_loop(data):
        self_words_to_tokens = tokenize_and_align(self_attn_tokenizer,
                                                  features["words"], args,
                                                  'self',
                                                  features['self_tokens'])
        bert_words_to_tokens = tokenize_and_align(bert_attn_tokenizer,
                                                  features["words"], args,
                                                  'bert',
                                                  features['bert_tokens'])
        assert sum(len(word) for word in self_words_to_tokens) == len(
            features["self_tokens"])
        assert sum(len(word) for word in bert_words_to_tokens) == len(
            features["bert_tokens"])
        features["attns"] = np.stack([[
            get_word_word_attention(attn_head, self_words_to_tokens,
                                    bert_words_to_tokens, args)
            for attn_head in layer_attns
        ] for layer_attns in features["attns"]])
Example #5
import argparse

import numpy as np

import utils


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--attn-data-file",
        required=True,
        help="Pickle file containing extracted attention maps.")
    parser.add_argument("--outfile",
                        required=True,
                        help="Where to write out the distances between heads.")
    args = parser.parse_args()

    print("Loading attention data")
    data = utils.load_pickle(args.attn_data_file)

    print("Computing head distances")
    js_distances = np.zeros([144, 144])
    for doc in utils.logged_loop(data, n_steps=None):
        if "attns" not in doc:
            continue
        tokens, attns = doc["tokens"], np.array(doc["attns"])

        attns_flat = attns.reshape([144, attns.shape[2], attns.shape[3]])
        for head in range(144):
            head_attns = np.expand_dims(attns_flat[head], 0)
            # Smooth both distributions slightly so the logs below are always defined.
            head_attns_smoothed = (0.001 / head_attns.shape[1]) + (head_attns *
                                                                   0.999)
            attns_flat_smoothed = (0.001 / attns_flat.shape[1]) + (attns_flat *
                                                                   0.999)
            # Jensen-Shannon divergence between this head and all 144 heads,
            # summed over token positions.
            m = (head_attns_smoothed + attns_flat_smoothed) / 2
            js = -head_attns_smoothed * np.log(m / head_attns_smoothed)
            js += -attns_flat_smoothed * np.log(m / attns_flat_smoothed)
            js /= 2
            js = js.sum(-1).sum(-1)
            js_distances[head] += js

        utils.write_pickle(js_distances, args.outfile)
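For reference, the quantity accumulated in the inner loop above is a smoothed Jensen-Shannon divergence. A standalone toy version for two attention distributions (values made up) behaves the same way:

import numpy as np

# Two toy attention distributions over 4 tokens (made-up values).
p = np.array([0.7, 0.1, 0.1, 0.1])
q = np.array([0.25, 0.25, 0.25, 0.25])

# Same smoothing as above: mix in a little uniform mass so the logs are defined.
p = 0.001 / p.shape[0] + 0.999 * p
q = 0.001 / q.shape[0] + 0.999 * q

m = (p + q) / 2
js = 0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m))
print(js)  # Jensen-Shannon divergence, in nats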
Example #6
import utils


def examples_in_batches(examples, batch_size):
    # Yields ceil(len(examples) / batch_size) batches; the last one may be smaller.
    for i in utils.logged_loop(range(1 + ((len(examples) - 1) // batch_size))):
        yield examples[i * batch_size:(i + 1) * batch_size]
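A small usage sketch; the data is made up, and utils.logged_loop is assumed to simply wrap its iterable with progress logging.

batches = list(examples_in_batches(list(range(10)), batch_size=4))
print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]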