Example No. 1
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text",
                        type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def",
                        type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal.
    # I also approximate greedy a bit by adding word by word; this is fine since the vocabs are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set()

    # Of course I could use binsearch
    for id in range(vocab_def.size() // args.step_size):
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0

        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")

            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

        print(
            "After adding {} words I covered {} of def and {} of text occurences"
            .format(
                len(current_vocab_mod),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe rechecking shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(
            w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurences"
        .format(len(current_vocab),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word)
         for word in current_vocab})
    vocab_result.save(args.target)
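The coverage bookkeeping above can be illustrated with a minimal, self-contained sketch. The frequency tables and the 90%/95% targets below are made up for illustration, and the sketch greedily grows a single ranked list instead of interleaving two vocabularies the way the script does:

import numpy as np

# Toy frequency tables, sorted by descending frequency (rank order); these
# stand in for vocab_def.frequencies and vocab_text.frequencies above.
def_freqs = np.array([50, 30, 10, 5, 3, 2])
text_freqs = np.array([40, 25, 20, 10, 4, 1])

target_def = 0.90 * def_freqs.sum()    # cover 90% of definition tokens
target_text = 0.95 * text_freqs.sum()  # cover 95% of text tokens

# Greedy: add words in rank order until both coverage targets are met.
cov_def = cov_text = 0.0
shortlist = []
for word_id in range(len(def_freqs)):
    shortlist.append(word_id)
    cov_def += def_freqs[word_id]
    cov_text += text_freqs[word_id]
    if cov_def >= target_def and cov_text >= target_text:
        break

print(len(shortlist), cov_def / def_freqs.sum(), cov_text / text_freqs.sum())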
Example No. 2
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument(
        "--vocab-text",
        default=None,
        help="Vocab corresponding to the main if text is a dictionary.")
    parser.add_argument(
        "--weight-dict-entries",
        action='store_true',
        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument(
        "--exclude-top-k",
        type=int,
        help="Ignore definitions of a number of most frequent words")
    parser.add_argument(
        "text",
        help="The text to use. Can be a text file, an .h5 file, or a .json "
        "dictionary, in which case you need to pass --vocab-text as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)
    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name, 'r') as h5_file:
                if 'text' not in h5_file.keys():
                    print("Missing text field from " + f_name)
                    continue
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            dict_ = json.load(open(f_name, "r"))
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)

                if (text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    continue

                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            with open(f_name) as file_:

                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                yield text_type(word, 'utf-8')
                            except Exception:
                                print("Skipped word " + word)

                text.extend(data())
        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
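For the plain text branch, the counting that `Vocabulary.build(text, args.top_k)` presumably performs can be approximated with `collections.Counter`. A rough sketch, with a placeholder corpus path and top-k value:

import collections

top_k = 10000  # placeholder, analogous to --top-k

counts = collections.Counter()
with open("corpus.txt", encoding="utf-8") as file_:  # placeholder path
    for line in file_:
        counts.update(line.strip().split())

# Keep the top_k most frequent words, most frequent first.
shortlist = [word for word, _ in counts.most_common(top_k)]
print(len(shortlist), shortlist[:5])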
Example No. 3
def evaluate(c, tar_path, *args, **kwargs):
    """
    Performs rudimentary evaluation of SNLI/MNLI run

    * Runs on valid and test given network
    * Saves all predictions
    * Saves embedding matrix
    * Saves results.json and predictions.csv
    """

    # Load and configure
    model = kwargs['model']
    assert c.endswith("json")
    c = json.load(open(c))

    # Very ugly absolute path fix
    ABS_PATHS = [
        "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/",
        "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/"
    ]
    from six import string_types
    for abs_path in ABS_PATHS:
        for k in c:
            if isinstance(c[k], string_types):
                if c[k].startswith(abs_path):
                    c[k] = c[k][len(abs_path):]

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    logging.info("Updating config with " + str(kwargs))
    c.update(**kwargs)

    # NOTE: This ensures we don't miss crucial definitions for some definition-heavy
    # words; usually it is a good idea.
    c['max_def_per_word'] = c['max_def_per_word'] * 2

    assert tar_path.endswith("tar")
    dest_path = os.path.dirname(tar_path)
    prefix = os.path.splitext(os.path.basename(tar_path))[0]

    s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')

    if model == 'simple':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    pred = model.apply(s1_decoded,
                       s1_mask,
                       s2_decoded,
                       s2_mask,
                       def_mask=def_mask,
                       defs=defs,
                       s1_def_map=s1_def_map,
                       s2_def_map=s2_def_map,
                       train_phase=False)
    cg = ComputationGraph([pred])
    if c.get("bn", True):
        bn_params = [
            p for p in VariableFilter(bricks=[BatchNormalization])(cg)
            if hasattr(p, "set_value")
        ]
    else:
        bn_params = []

    # Load model
    model = Model(cg.outputs)
    parameters = model.get_parameter_dict()  # Blocks version mismatch
    logging.info(
        "Trainable parameters" + "\n" +
        pprint.pformat([(key, parameters[key].get_value().shape)
                        for key in sorted([
                            get_brick(param).get_hierarchical_name(param)
                            for param in cg.parameters
                        ])],
                       width=120))
    logging.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape) for key in sorted([
                get_brick(param).get_hierarchical_name(param)
                for param in cg.parameters
            ])
        ])))
    with open(tar_path, 'rb') as src:
        params = load_parameters(src)

        loaded_params_set = set(params.keys())
        model_params_set = set([
            get_brick(param).get_hierarchical_name(param)
            for param in cg.parameters
        ])

        logging.info("Loaded extra parameters")
        logging.info(loaded_params_set - model_params_set)
        logging.info("Missing parameters")
        logging.info(model_params_set - loaded_params_set)
    model.set_parameter_values(params)

    if c.get("bn", True):
        logging.info("Loading " + str([
            get_brick(param).get_hierarchical_name(param)
            for param in bn_params
        ]))
        for param in bn_params:
            param.set_value(
                params[get_brick(param).get_hierarchical_name(param)])
        for p in bn_params:
            model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    # Read logs
    logs = pd.read_csv(os.path.join(dest_path, "logs.csv"))
    best_val_acc = logs['valid_misclassificationrate_apply_error_rate'].min()
    logging.info("Best measured valid acc: " + str(best_val_acc))

    # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores
    reference_vocab = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt'))
    vocab_all = Vocabulary(
        os.path.join(
            fuel.config.data_path[0], c['data_path'],
            'vocab_all.txt'))  # Can include OOV words, which is interesting
    retrieval_all = Retrieval(vocab_text=used_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])
    # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt")
    # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']:
    #     variables = VariableFilter(name=name)(cg)
    #     if len(variables):
    #         s1_emb = variables[0]
    #         # A bit sloppy about downcast
    #
    #         if "dict" in name:
    #             embedder = construct_dict_embedder(
    #                 theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab, retrieval=retrieval_all)
    #         else:
    #             embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab)
    #
    #         for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]:
    #             logging.info("Calculating {} embeddings for {}".format(name, v_name))

    # Predict
    predict_fnc = theano.function(cg.inputs, pred)
    results = {}
    batch_size = 14
    for subset in ['valid', 'test']:
        logging.info("Predicting on " + subset)
        stream = data.get_stream(subset, batch_size=batch_size, seed=778)
        it = stream.get_epoch_iterator()
        rows = []
        for ex in tqdm.tqdm(it, total=10000 / batch_size):
            ex = dict(zip(stream.sources, ex))
            inp = [ex[v.name] for v in cg.inputs]
            prob = predict_fnc(*inp)
            label_pred = np.argmax(prob, axis=1)

            for id in range(len(prob)):
                s1_decoded = used_vocab.decode(ex['sentence1'][id]).split()
                s2_decoded = used_vocab.decode(ex['sentence2'][id]).split()

                assert used_vocab == data.vocab

                s1_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s1_decoded
                ]
                s2_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s2_decoded
                ]

                # Different difficulty metrics

                # text_unk_percentage
                s1_no_pad = [w for w in ex['sentence1'][id] if w != 0]
                s2_no_pad = [w for w in ex['sentence2'][id] if w != 0]

                s1_unk_percentage = sum([
                    1. for w in s1_no_pad if w == used_vocab.unk
                ]) / len(s1_no_pad)
                s2_unk_percentage = sum([
                    1. for w in s2_no_pad if w == used_vocab.unk
                ]) / len(s2_no_pad)

                # mean freq word
                s1_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s1_no_pad
                ])
                s2_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s2_no_pad
                ])

                # mean rank word (UNK is max rank)
                # NOTE(kudkudak): Will break if we reindex unk between vocabs :P
                s1_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s1_no_pad
                ])

                s2_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s2_no_pad
                ])

                rows.append({
                    "pred": label_pred[id],
                    "true_label": ex['label'][id],
                    "s1": ' '.join(s1_decoded),
                    "s2": ' '.join(s2_decoded),
                    "s1_unk_percentage": s1_unk_percentage,
                    "s2_unk_percentage": s2_unk_percentage,
                    "s1_mean_freq": s1_mean_freq,
                    "s2_mean_freq": s2_mean_freq,
                    "s1_mean_rank": s1_mean_rank,
                    "s2_mean_rank": s2_mean_rank,
                    "p_0": prob[id, 0],
                    "p_1": prob[id, 1],
                    "p_2": prob[id, 2]
                })

        preds = pd.DataFrame(rows, columns=rows[0].keys())
        preds.to_csv(
            os.path.join(dest_path,
                         prefix + '_predictions_{}.csv'.format(subset)))
        results[subset] = {}
        results[subset]['misclassification'] = 1 - np.mean(
            preds.pred == preds.true_label)

        if subset == "valid" and np.abs(
            (1 - np.mean(preds.pred == preds.true_label)) -
                best_val_acc) > 0.001:
            logging.error("!!!")
            logging.error(
                "Found different best_val_acc. Probably due to changed specification of the model class."
            )
            logging.error("Discrepancy {}".format(
                (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc))
            logging.error("!!!")

        logging.info(results)

    json.dump(results,
              open(os.path.join(dest_path, prefix + '_results.json'), "w"))
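The per-sentence difficulty metrics computed inside the prediction loop (UNK percentage, mean word frequency, mean word rank) can be reproduced in isolation. A small sketch under toy assumptions (a padding id of 0, an UNK id of 1, and a made-up id-to-frequency table, none of which are taken from the repository):

import numpy as np

PAD = 0                                # assumed padding id
UNK = 1                                # assumed <unk> id
id_to_freq = {2: 1000, 3: 50, 4: 5}    # toy frequency table; lower id = more frequent

sentence_ids = [2, 3, 1, 4, 0, 0]      # a padded toy sentence

no_pad = [w for w in sentence_ids if w != PAD]
unk_percentage = sum(1.0 for w in no_pad if w == UNK) / len(no_pad)
mean_freq = np.mean([0 if w == UNK else id_to_freq[w] for w in no_pad])
# Rank: the word id itself (ids are assigned by frequency); UNK gets the worst rank.
max_rank = len(id_to_freq) + 2
mean_rank = np.mean([max_rank if w == UNK else w for w in no_pad])

print(unk_percentage, mean_freq, mean_rank)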
Example No. 4
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("--vocab",
                        default="",
                        help="Performs subsetting based on passed vocab")
    parser.add_argument("--dict",
                        default="",
                        help="Performs subsetting based on passed dict")

    # OOV handling
    parser.add_argument("--try-lemma", action="store_true", help="Try lemma")
    parser.add_argument("--try-lowercase", default="", help="Try lowercase")

    args = parser.parse_args()

    if args.dict and not args.vocab:
        # usually you'd want to use both, I suppose
        raise NotImplementedError("Not implemented")
    if args.try_lemma or args.try_lowercase:
        # TODO(kudkudak): Implement
        raise NotImplementedError("Not implemented yet")

    if args.vocab == "":
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = list(map(float, tokens[1:]))
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print(i)
        # Prepend all-zero rows for the vocabulary's special tokens.
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        numpy.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        if args.dict:
            dict_ = Dictionary(args.dict)

        print('Computing GloVe')

        # Loading: build a word -> vector index from the GloVe text file
        embeddings_index = {}
        print('Reading GloVe file')
        with open(args.txt) as f:
            for line in f:
                values = line.split(' ')
                word = values[0]
                dim = len(values[1:])
                coefs = numpy.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        # Embedding matrix
        print('Building embedding matrix')
        embedding_matrix = numpy.zeros((vocab.size(), dim))
        for word in vocab._word_to_id:
            embedding_vector = embeddings_index.get(word)
            in_glove = embedding_vector is not None
            if args.dict:
                in_dict = len(dict_.get_definitions(word)) > 0

            if in_glove and (not args.dict or in_dict):
                # Words not found in GloVe (or the dict) keep their all-zero rows.
                embedding_matrix[vocab.word_to_id(word)] = embedding_vector
            else:
                if not in_glove:
                    print(u'Missing from GloVe: {}'.format(word))
                else:
                    print(u'Missing from dict: {}'.format(word))

        numpy.save(args.npy, embedding_matrix)
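Stripped of the project's `Vocabulary` and `Dictionary` classes, the subsetting branch boils down to the following sketch; the GloVe path, the output path, and the word-to-id mapping are placeholders:

import numpy as np

word_to_id = {"the": 0, "cat": 1, "sat": 2}   # placeholder vocabulary
dim = 300                                     # GloVe vector dimensionality

# Build a word -> vector index from the GloVe text file.
embeddings_index = {}
with open("glove.6B.300d.txt", encoding="utf-8") as src:  # placeholder path
    for line in src:
        values = line.rstrip().split(' ')
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Rows for words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(word_to_id), dim))
for word, idx in word_to_id.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector

np.save("embeddings.npy", embedding_matrix)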