Example #1
    def fit(self, dataset, epochs=10, dev=None):
        """
        Trains a BIST model on an annotated dataset in CoNLL file format.

        Args:
            dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
            epochs (int, optional): Number of learning iterations.
            dev (str, optional): Path to development dataset for conducting evaluations.
        """
        if dev:
            dev = validate_existing_filepath(dev)
        dataset = validate_existing_filepath(dataset)
        validate((epochs, int, 0, None))

        print("\nRunning fit on " + dataset + "...\n")
        words, w2i, pos, rels = utils.vocab(dataset)
        self.params = words, w2i, pos, rels, self.options

        from nlp_architect.models.bist.mstlstm import MSTParserLSTM

        self.model = MSTParserLSTM(*self.params)

        for epoch in range(epochs):
            print("Starting epoch", epoch + 1)
            self.model.train(dataset)
            if dev:
                ext = dev.rindex(".")
                res_path = dev[:ext] + "_epoch_" + str(epoch + 1) + "_pred" + dev[ext:]
                utils.write_conll(res_path, self.model.predict(dev))
                utils.run_eval(dev, res_path)
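A minimal usage sketch for this method; the wrapper class name (BISTModel), its import path and the file names below are assumptions for illustration, not taken from the snippet:

# Illustrative only: class name, import path and file names are assumptions.
from nlp_architect.models.bist_parser import BISTModel

parser = BISTModel()
parser.fit("train.conllu", epochs=5, dev="dev.conllu")
# With a dev set, each epoch writes dev_epoch_<n>_pred.conllu next to dev.conllu
# and runs the evaluation utility on it.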
Example #2
    def __init__(self,
                 model_path,
                 settings_path,
                 spacy_model="en",
                 batch_size=32,
                 use_cudnn=False):
        _model_path = path.join(path.dirname(path.realpath(__file__)),
                                model_path)
        validate_existing_filepath(_model_path)
        _settings_path = path.join(path.dirname(path.realpath(__file__)),
                                   settings_path)
        validate_existing_filepath(_settings_path)

        nlp = spacy.load(spacy_model)
        for p in nlp.pipe_names:
            if p not in ["tagger"]:
                nlp.remove_pipe(p)
        nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
        nlp.add_pipe(
            NPAnnotator.load(_model_path,
                             _settings_path,
                             batch_size=batch_size,
                             use_cudnn=use_cudnn),
            last=True,
        )
        self.nlp = nlp
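A hedged usage sketch for this constructor; the wrapper class name (SpacyNPAnnotator) and the model/settings file names are assumptions:

# Illustrative only: class and file names are assumptions; relative paths are
# resolved against the module's own directory, as in the constructor above.
annotator = SpacyNPAnnotator("chunker_model.h5", "chunker_model.params", spacy_model="en")
doc = annotator.nlp("The quick brown fox jumps over the lazy dog")
# The NPAnnotator component added last in the pipeline attaches noun-phrase
# annotations to the returned Doc.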
Example #3
    def load(cls,
             model_path: str,
             parameter_path: str,
             batch_size: int = 32,
             use_cudnn: bool = False):
        """
        Load an NPAnnotator annotator

        Args:
            model_path (str): path to trained model
            parameter_path (str): path to model parameters
            batch_size (int, optional): inference batch_size
            use_cudnn (bool, optional): use gpu for inference (cudnn cells)

        Returns:
            NPAnnotator class with loaded model
        """

        _model_path = path.join(path.dirname(path.realpath(__file__)),
                                model_path)
        validate_existing_filepath(_model_path)
        _parameter_path = path.join(path.dirname(path.realpath(__file__)),
                                    parameter_path)
        validate_existing_filepath(_parameter_path)

        model = SequenceChunker(use_cudnn=use_cudnn)
        model.load(_model_path)
        with open(_parameter_path, "rb") as fp:
            model_params = pickle.load(fp)
            word_vocab = model_params["word_vocab"]
            chunk_vocab = model_params["chunk_vocab"]
            char_vocab = model_params.get("char_vocab", None)
        return cls(model, word_vocab, char_vocab, chunk_vocab, batch_size)
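A short sketch of calling this classmethod directly and wiring the result into a spaCy 2.x pipeline, mirroring Example #2 (file names are placeholders):

# File names are placeholders; relative paths are resolved against this module's directory.
np_annotator = NPAnnotator.load("chunker_model.h5", "chunker_model.params",
                                batch_size=64, use_cudnn=False)
nlp = spacy.load("en")
nlp.add_pipe(np_annotator, last=True)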
Example #4
    def fit(self, dataset, epochs=10, dev=None):
        """
        Trains a BIST model on an annotated dataset in CoNLL file format.

        Args:
            dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
            epochs (int, optional): Number of learning iterations.
            dev (str, optional): Path to development dataset for conducting evaluations.
        """
        if dev:
            dev = validate_existing_filepath(dev)
        dataset = validate_existing_filepath(dataset)
        validate((epochs, int, 0, None))

        print('\nRunning fit on ' + dataset + '...\n')
        words, w2i, pos, rels = utils.vocab(dataset)
        self.params = words, w2i, pos, rels, self.options
        self.model = MSTParserLSTM(*self.params)

        for epoch in range(epochs):
            print('Starting epoch', epoch + 1)
            self.model.train(dataset)
            if dev:
                ext = dev.rindex('.')
                res_path = dev[:ext] + '_epoch_' + str(epoch + 1) + '_pred' + dev[ext:]
                utils.write_conll(res_path, self.model.predict(dev))
                utils.run_eval(dev, res_path)
Example #5
    def predict(self, dataset, evaluate=False):
        """
        Runs inference with the BIST model on a dataset in CoNLL file format.

        Args:
            dataset (str): Path to input CoNLL file.
            evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.
        Returns:
            res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
        """
        dataset = validate_existing_filepath(dataset)
        validate((evaluate, bool))

        print("\nRunning predict on " + dataset + "...\n")
        res = list(self.model.predict(conll_path=dataset))
        if evaluate:
            ext = dataset.rindex(".")
            pred_path = dataset[:ext] + "_pred" + dataset[ext:]
            utils.write_conll(pred_path, res)
            utils.run_eval(dataset, pred_path)
        return res
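A usage sketch pairing this method with fit from Example #1; the class name and file names are again assumptions:

# Illustrative only: BISTModel and the file names are assumptions.
parser = BISTModel()
parser.fit("train.conllu", epochs=5)
sentences = parser.predict("test.conllu", evaluate=True)
# evaluate=True also writes test_pred.conllu next to the input and runs the evaluation utility.
print(len(sentences), "sentences parsed")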
Example #6
    def predict(self, dataset, evaluate=False):
        """
        Runs inference with the BIST model on a dataset in CoNLL file format.

        Args:
            dataset (str): Path to input CoNLL file.
            evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.
        Returns:
            res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
        """
        dataset = validate_existing_filepath(dataset)
        validate((evaluate, bool))

        print('\nRunning predict on ' + dataset + '...\n')
        res = list(self.model.predict(conll_path=dataset))
        if evaluate:
            ext = dataset.rindex('.')
            pred_path = dataset[:ext] + '_pred' + dataset[ext:]
            utils.write_conll(pred_path, res)
            utils.run_eval(dataset, pred_path)
        return res
Example #7
        'word representations. '
        'Set `max_n` to be less than `min_n` to avoid char ngrams being used.')
    arg_parser.add_argument(
        '--word_ngrams',
        default=1,
        type=int,
        choices=[0, 1],
        help='fasttext training hyperparameter. If 1, enriches word vectors with subword '
        '(ngrams) information. If 0, this is equivalent to word2vec training.')

    args = arg_parser.parse_args()

    if args.corpus_format != 'conll2000':
        validate_existing_filepath(args.corpus)

    np2vec = NP2vec(
        args.corpus,
        args.corpus_format,
        args.mark_char,
        args.word_embedding_type,
        args.sg,
        args.size,
        args.window,
        args.alpha,
        args.min_alpha,
        args.min_count,
        args.sample,
        args.workers,
        args.hs,
Example #8
def validate_existing_filepath(prefix, suffix=None):
    """Validates existing file in the path constructed from prefix.suffix in case
    prefix is not None"""
    if prefix is not None and prefix:
        io.validate_existing_filepath(fix_path(add_suffix(prefix, suffix)))
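A brief sketch of how this wrapper behaves, assuming (per the docstring) that add_suffix joins prefix and suffix with a dot and fix_path only normalizes the result:

# Assumed behaviour: validates that "embeddings.txt" exists, failing validation otherwise.
validate_existing_filepath("embeddings", suffix="txt")

# With a None or empty prefix the check is skipped entirely.
validate_existing_filepath(None, suffix="txt")
validate_existing_filepath("", suffix="txt")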
Example #9
    def _validate_paths(self, data_path):
        validate_existing_directory(data_path)
        for f in self.dataset_files:
            _f_path = path.join(data_path, self.dataset_files[f])
            validate_existing_filepath(_f_path)
            self.dataset_files[f] = _f_path
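A minimal sketch of the dataset-loader pattern this method supports; the loader object and file names are hypothetical:

# Hypothetical loader whose dataset_files maps logical names to file names under data_path.
loader.dataset_files = {"train": "train.txt", "dev": "dev.txt"}
loader._validate_paths("/data/my_corpus")
# Afterwards dataset_files holds the validated, fully joined paths:
# {"train": "/data/my_corpus/train.txt", "dev": "/data/my_corpus/dev.txt"}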
Example #10
parser.add_argument('--output', type=str, help='location where to create the dump file', required=True)

args = parser.parse_args()


def vo_dump():
    vo_file = args.vo
    out_file = args.output
    mentions_event_gold_file = [args.mentions]
    vocab = load_mentions_vocab(mentions_event_gold_file, True)
    vo = VerboceanRelationExtraction.load_verbocean_file(vo_file)
    vo_for_vocab = {}
    for word in vocab:
        if word in vo:
            vo_for_vocab[word] = vo[word]

    logger.info('Found %d words from vocabulary', len(vo_for_vocab.keys()))
    logger.info('Preparing to save refDict output file')

    with open(out_file, 'w') as f:
        json.dump(vo_for_vocab, f)
    logger.info('Done, saved to %s', out_file)


if __name__ == '__main__':
    io.validate_existing_filepath(args.mentions)
    io.validate_existing_filepath(args.output)
    io.validate_existing_filepath(args.vo)
    vo_dump()
Example #11
                              target_names=data.labels_0.columns.values))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path",
                        type=str,
                        default="./",
                        help="file_path where the files to parse are located")
    parser.add_argument("--data_type",
                        type=str,
                        default="amazon",
                        choices=["amazon"],
                        help="dataset source")
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        help="Number of epochs for both models",
        action=check_size(1, 20000),
    )
    args_in = parser.parse_args()

    # Check file path
    if args_in.file_path:
        validate_existing_filepath(args_in.file_path)

    if args_in.data_type == "amazon":
        data_in = Amazon_Reviews(args_in.file_path)
    ensemble_models(data_in, args_in)
Example #12
    parser.add_argument('--model_name',
                        default='chunker_model',
                        type=str,
                        required=True,
                        help='Model name (used for saving the model)')
    parser.add_argument('-b',
                        type=int,
                        action=check_size(1, 9999),
                        default=1,
                        help='inference batch size')
    args = parser.parse_args()
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           '{}.h5'.format(str(args.model_name)))
    settings_path = path.join(path.dirname(path.realpath(__file__)),
                              '{}.params'.format(str(args.model_name)))
    validate_existing_filepath(model_path)
    validate_existing_filepath(settings_path)

    # load model and parameters
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len
    with open(settings_path, 'rb') as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params['word_vocab']
        chunk_vocab = model_params['chunk_vocab']
        char_vocab = model_params.get('char_vocab', None)

    # parse documents and get tokens
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
Example #13
    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)

    with open(out_file, 'wb') as f:
        pickle.dump(elmo_ecb_embeddings, f)

    logger.info('Saved dump to file %s', out_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Create Elmo Embedding dataset only dump')
    parser.add_argument('--mentions',
                        type=str,
                        help='mentions file',
                        required=True)
    parser.add_argument('--output',
                        type=str,
                        help='location where to create the dump file',
                        required=True)

    args = parser.parse_args()

    if os.path.isdir(args.mentions):
        io.validate_existing_directory(args.mentions)
    else:
        io.validate_existing_filepath(args.mentions)

    elmo_dump()
    print('Done!')
Example #14
    type=float,
    default=1e-8,
    help='epsilon used to avoid divide by zero in softmax renormalization.',
    action=check_size(1e-100, 1e-2))
parser.add_argument('--model_file',
                    default='memn2n_weights.npz',
                    help='File to load model weights from.',
                    type=str)

parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000), (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(path=data_dir,
                   task=args.task,
                   oov=args.use_oov,
                   use_match_type=args.use_match_type,
                   cache_match_type=args.cache_match_type,
                   cache_vectorized=args.cache_vectorized)

weight_saver = Saver()

# Set num iterations to 1 epoch since we loop over epochs & shuffle
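The validate() calls above take (value, expected type, lower bound, upper bound) tuples, with None meaning no bound; a brief illustrative sketch (the concrete values, and whether the bounds are inclusive, are assumptions):

# Each tuple is (value, expected type, lower bound, upper bound); None means no bound.
validate((32, int, 1, 10000))           # emb_size-style check: passes
validate((1e-8, float, 1e-15, 1e-2))    # eps-style check: passes
validate((0, int, 1, None))             # expected to fail validation: below the lower bound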
Example #15
        'word representations. '
        'Set `max_n` to be less than `min_n` to avoid char ngrams being used.')
    arg_parser.add_argument(
        '--word_ngrams',
        default=1,
        type=int,
        choices=[0, 1],
        help='fasttext training hyperparameter. If 1, enriches word vectors with subword '
        '(ngrams) information. If 0, this is equivalent to word2vec training.')

    args = arg_parser.parse_args()

    if args.corpus_format != 'conll2000':
        validate_existing_filepath(args.corpus)

    np2vec = NP2vec(
        args.corpus,
        args.corpus_format,
        args.mark_char,
        args.word_embedding_type,
        args.sg,
        args.size,
        args.window,
        args.alpha,
        args.min_alpha,
        args.min_count,
        args.sample,
        args.workers,
        args.hs,
Example #16
    help='epsilon used to avoid divide by zero in softmax renormalization.',
    action=check_size(1e-100,1e-2))
parser.add_argument(
    '--model_file',
    default='memn2n_weights.npz',
    help='File to load model weights from.',
    type=str)

parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000),
         (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(
    path=data_dir,
    task=args.task,
    oov=args.use_oov,
    use_match_type=args.use_match_type,
    cache_match_type=args.cache_match_type,
    cache_vectorized=args.cache_vectorized)

weight_saver = Saver()