Example #1
0
def main():
    """
    Get a validation/test set, compute the compositional vectors of
    the noun compounds in the set, and save the embeddings file.

    Command-line arguments:
        composition_model_path: the composition model file (model.tar.gz)
        nc_vocab: the noun compound vocabulary file (one compound per line,
            constituents separated by a tab)
        out_vector_file: where to save the gzipped file (opened in append mode)
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('composition_model_path',
                    help='The composition model file (model.tar.gz)')
    ap.add_argument('nc_vocab', help='The noun compound vocabulary file')
    ap.add_argument('out_vector_file', help='Where to save the gzipped file')
    args = ap.parse_args()

    # Normalize entries: lowercase, tab -> underscore separator.
    # Skip blank lines so they don't yield empty, un-splittable entries.
    with codecs.open(args.nc_vocab, 'r', 'utf-8') as f_in:
        nc_vocab = [line.strip().lower().replace('\t', '_')
                    for line in f_in if line.strip()]

    logger.info(f'Loading model from {args.composition_model_path}')
    reader = NCDatasetReader()
    archive = load_archive(args.composition_model_path)
    model = archive.model
    predictor = Predictor(model, dataset_reader=reader)

    logger.info(f'Computing vectors for the noun compounds in {args.nc_vocab}')
    with codecs.open(args.out_vector_file, 'a', 'utf-8') as f_out:
        for nc in tqdm.tqdm(nc_vocab):
            # An entry with zero or more than one separator would make
            # `w1, w2 = nc.split('_')` raise ValueError and abort the whole
            # run; warn and skip instead.
            parts = nc.split('_')
            if len(parts) != 2:
                logger.warning(f'Skipping {nc}: expected exactly two constituents')
                continue
            w1, w2 = parts
            instance = reader.text_to_instance(nc, w1, w2)

            if instance is None:
                logger.warning(f'Instance is None for {nc}')
            else:
                curr_vector = predictor.predict_instance(instance)['vector']
                vector_text = ' '.join(map(str, curr_vector)).strip()
                f_out.write(f'comp_{nc} {vector_text}\n')
Example #2
0
def compute_compositional_vectors(model_path,
                                  terms,
                                  nc_reader=None,
                                  single_word_reader=None,
                                  single_word_composition_model=None):
    """
    Compute the vectors of the terms using the model.

    :param model_path: the path to the pre-trained composition model
    :param terms: a list of terms (noun compounds and single words) to compute
    :param nc_reader: dataset reader used for noun compounds; a fresh
        ``NCDatasetReaderForWords`` is created when omitted
    :param single_word_reader: dataset reader used for single words; a fresh
        ``NCDatasetReaderForWords`` is created when omitted
    :param single_word_composition_model: optional callable that wraps the
        loaded model for single-word prediction
    :return: a dictionary of term to vector
    """
    # Default readers are created per call. The original signature used
    # `nc_reader=NCDatasetReaderForWords()` — a mutable default evaluated once
    # at definition time and silently shared across every call.
    if nc_reader is None:
        nc_reader = NCDatasetReaderForWords()
    if single_word_reader is None:
        single_word_reader = NCDatasetReaderForWords()

    logger.info(f'Loading model from {model_path}')
    archive = load_archive(model_path)
    model = archive.model
    predictor = Predictor(model, dataset_reader=single_word_reader)

    # Single words may go through a dedicated (wrapped) model; fall back to
    # the main predictor when no wrapper is provided.
    single_word_predictor = predictor
    if single_word_composition_model:
        single_word_model = single_word_composition_model(model)
        single_word_predictor = Predictor(single_word_model,
                                          dataset_reader=single_word_reader)

    term_to_vec = {}

    for term in tqdm.tqdm(terms):
        # Noun compounds contain '_'; everything else is a single word.
        if '_' in term:
            curr_predictor = predictor
            curr_reader = nc_reader
        else:
            curr_predictor = single_word_predictor
            curr_reader = single_word_reader

        instance = curr_reader.text_to_instance(term)

        if instance is None:
            logger.warning(f'Instance is None for {term}')
        else:
            curr_vector = curr_predictor.predict_instance(instance)['vector']

            # Unwrap a singleton batch dimension.
            if len(curr_vector) == 1:
                curr_vector = curr_vector[0]

            term_to_vec[term] = curr_vector

    return term_to_vec
Example #3
0
    def predict(
        self,
        premise='',
        hypothesis='',
    ):
        """Return label probabilities for a premise/hypothesis pair.

        Results are memoized in ``self._cache`` (keyed on the pair) and the
        cache is persisted via ``self.save()`` after each new prediction.
        """
        key = '{}->{}'.format(premise, hypothesis)
        if key in self._cache:
            return self._cache[key]['label_probs']

        prediction = self.predictor.predict(
            premise=premise,
            hypothesis=hypothesis,
        )
        # Probabilities are ordered [entailment, contradiction, neutral].
        self._cache[key] = prediction
        self.save()
        return prediction['label_probs']

    def save(self):
        """Persist the prediction cache to ``self.cache_path`` with pickle.

        The original opened the file inline and never closed it; a context
        manager guarantees the handle is closed (and data flushed) even if
        pickling raises.
        """
        with open(self.cache_path, "wb") as f_out:
            pickle.dump(self._cache, f_out)


if __name__ == '__main__':
    # Smoke test: run the entailment predictor on one example pair and
    # print the resulting label probabilities.
    predictor = Predictor()
    example_premise = 'The sentence provides coherent, verifiable information.'
    example_hypothesis = 'The information is straight-forward, and probably easy to verify.'
    print(predictor.predict(premise=example_premise,
                            hypothesis=example_hypothesis))