# Imports needed by the functions below. The project-specific classes
# (NerReader, PosReader, TaggerReader, ConllReader, ConllWriter, Embeddings,
# Converter, CapsExtractor, AttributeExtractor, SuffixExtractor,
# PrefixExtractor, GazetteerExtractor, NerTagger, Tagger, create_trainer)
# are assumed to come from the surrounding package; their exact module paths
# are not shown in the original source.
import argparse
import logging
import os
from collections import Counter, OrderedDict
from configparser import ConfigParser

import numpy as np


def main():
    # set the seed for replicability
    np.random.seed(89)          # (42)
    defaults = {}

    parser = argparse.ArgumentParser(
        description="Train or use a Named Entity tagger.")
    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # args, remaining_argv = parser.parse_known_args()
    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))
    #     parser.set_defaults(**defaults)
    parser.add_argument('model', type=str,
                        help='Model file to train/use.')

    # training options
    train = parser.add_argument_group('Train')
    train.add_argument('-t', '--train', type=str, default='',
                       help='File with annotated data for training.')
    train.add_argument('-w', '--window', type=int, default=2,
                       help='Size of the word window (default %(default)s)')
    train.add_argument('-s', '--embeddings-size', type=int, default=50,
                       help='Number of features per word (default %(default)s)',
                       dest='embeddings_size')
    train.add_argument('-e', '--epochs', type=int, default=100,
                       help='Number of training epochs (default %(default)s)',
                       dest='iterations')
    train.add_argument('-l', '--learning_rate', type=float, default=0.001,
                       help='Learning rate for network weights (default %(default)s)',
                       dest='learning_rate')
    train.add_argument('-n', '--hidden', type=int, default=200,
                       help='Number of hidden neurons (default %(default)s)',
                       dest='hidden')
    train.add_argument('--eps', type=float, default=1e-6,
                       help='Epsilon value for AdaGrad (default %(default)s)')
    train.add_argument('--ro', type=float, default=0.95,
                       help='Ro value for AdaDelta (default %(default)s)')
    train.add_argument('-o', '--output', type=str, default='',
                       help='File where to save embeddings')

    # Embeddings
    embeddings = parser.add_argument_group('Embeddings')
    embeddings.add_argument('--vocab', type=str, default='',
                            help='Vocabulary file, either read or created')
    embeddings.add_argument('--vocab-size', type=int, default=0,
                            help='Maximum size of vocabulary from corpus (default %(default)s)')
    embeddings.add_argument('--vectors', type=str, default='',
                            help='Embeddings file, either read or created')
    embeddings.add_argument('--min-occurr', type=int, default=3,
                            help='Minimum occurrences for inclusion in vocabulary (default %(default)s)',
                            dest='minOccurr')
    embeddings.add_argument('--load', type=str, default='',
                            help='Load previously saved model')
    embeddings.add_argument('--variant', type=str, default='',
                            help='Either "senna" (default), "polyglot" or "word2vec".')

    # Extractors:
    extractors = parser.add_argument_group('Extractors')
    extractors.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                            help='Include capitalization features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--pos', const=1, type=int, nargs='?', default=None,
                            help='Use POS tag. Optionally supply the POS token field index (default %(default)s)')
    extractors.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                            help='Include suffix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffixes', type=str, default='',
                            help='Load suffixes from this file')
    extractors.add_argument('--prefix', const=5, nargs='?', type=int, default=None,
                            help='Include prefix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--prefixes', type=str, default='',
                            help='Load prefixes from this file')
    extractors.add_argument('--gazetteer', type=str,
                            help='Load gazetteer from this file')
    extractors.add_argument('--gsize', type=int, default=5,
                            help='Size of gazetteer features (default %(default)s)')

    # reader
    parser.add_argument('--form-field', type=int, default=0,
                        help='Token field containing form (default %(default)s)',
                        dest='formField')

    # common
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default %(default)s)')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    if args.train:
        reader = NerReader(args.formField)
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            if args.vectors and os.path.exists(args.vectors):
                # use supplied embeddings
                embeddings = Embeddings(vectors=args.vectors,
                                        vocab_file=args.vocab,
                                        variant=args.variant)
            else:
                # create random embeddings
                embeddings = Embeddings(args.embeddings_size,
                                        vocab_file=args.vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            if args.vocab_size:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
                logger.info("Overriding vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
            else:
                tagset = reader.create_tagset(sentence_iter)
        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings = Embeddings(vocab=vocab, variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
        elif not args.vocab_size:
            logger.error("Missing parameter --vocab-size")
            return
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        # pass just the formField from tokens to the extractor
        converter.add(embeddings, reader.formField)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps), reader.formField)

        if args.pos:
            logger.info("Creating POS features...")
            postags = frozenset(token[args.pos]
                                for sent in sentence_iter for token in sent)
            # tell the extractor which field to use
            converter.add(AttributeExtractor(postags), args.pos)  # no variant, preserve case

        if ((args.suffix and not os.path.exists(args.suffixes)) or
                (args.prefix and not os.path.exists(args.prefixes))):
            # collect the forms once (needed whenever a suffix/prefix list
            # has to be built from the corpus)
            words = (tok[reader.formField]
                     for sent in sentence_iter for tok in sent)

        if args.suffix:
            if os.path.exists(args.suffixes):
                logger.info("Loading suffix list...")
                extractor = SuffixExtractor(args.suffix, args.suffixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating suffix list...")
                extractor = SuffixExtractor(args.suffix, None, words)
                converter.add(extractor, reader.formField)
                if args.suffixes:
                    logger.info("Saving suffix list to: %s", args.suffixes)
                    extractor.write(args.suffixes)

        if args.prefix:
            if os.path.exists(args.prefixes):
                logger.info("Loading prefix list...")
                extractor = PrefixExtractor(args.prefix, args.prefixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating prefix list...")
                extractor = PrefixExtractor(args.prefix, None, words)
                converter.add(extractor, reader.formField)
                if args.prefixes:
                    logger.info("Saving prefix list to: %s", args.prefixes)
                    extractor.write(args.prefixes)

        if args.gazetteer:
            if os.path.exists(args.gazetteer):
                logger.info("Loading gazetteers")
                for extractor in GazetteerExtractor.create(args.gazetteer,
                                                           args.gsize):
                    # tell the extractor which field to use
                    converter.add(extractor, reader.formField)
            else:
                logger.info("Creating gazetteer")
                tries = GazetteerExtractor.build(sentence_iter,
                                                 reader.formField,
                                                 reader.tagField)
                for tag, trie in tries.items():
                    # tell the extractor which field to use
                    converter.add(GazetteerExtractor(trie, args.gsize),
                                  reader.formField)
                logger.info("Saving gazetteer list to: %s", args.gazetteer)
                with open(args.gazetteer, 'w', encoding='UTF-8') as file:
                    for tag, trie in tries.items():
                        for ngram in trie:
                            print('%s\t%s' % (tag, ' '.join(ngram)), file=file)

        # if args.pos:
        #     converter.add(POS(arg.pos))

        # obtain the tags for each sentence
        tag_index = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert(sent))
            tags.append(np.array([tag_index[token[reader.tagField]]
                                  for token in sent], np.int32))

        logger.info("Vocabulary size: %d" % embeddings.dict.size())
        logger.info("Tagset size: %d" % len(tagset))

        trainer = create_trainer(args, converter, tag_index)
        logger.info("Starting training with %d sentences" % len(sentences))
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)
    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = TaggerReader()
        for sent in reader:
            ConllWriter.write(tagger.tag(sent, reader.tagField))
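# Hypothetical invocations of the parser defined above; the script name
# "dl-ner.py" and all file names are illustrative, not taken from the source:
#
#   training:  python dl-ner.py ner.model --train train.conll \
#                  --window 2 --caps --suffix 5 --vocab vocab.txt --vocab-size 50000
#   tagging:   python dl-ner.py ner.model < input.conll
#
# Without --train, the second branch of main() loads the saved model and tags
# the sentences produced by TaggerReader().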
def main():
    # set the seed for replicability
    np.random.seed(42)
    defaults = {}

    parser = argparse.ArgumentParser(description="Learn word embeddings.")
    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # args, remaining_argv = parser.parse_known_args()
    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))
    #     parser.set_defaults(**defaults)
    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default=None,
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                        help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str,
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str,
                        help='Load prefixes from this file')
    parser.add_argument('--gazetteer', type=str,
                        help='Load gazetteer from this file')
    parser.add_argument('--gsize', type=int, default=5,
                        help='Size of gazetteer features (default 5)')

    # common
    parser.add_argument('--vocab', type=str, default=None,
                        help='Vocabulary file, either read or created')
    # --vocab-size is referenced below when the vocabulary is built from the
    # corpus, so it is declared here as in the sibling scripts
    parser.add_argument('--vocab-size', type=int, default=0, dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors', type=str, default=None,
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default=None,
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default=None,
                        help='Either "senna" (default), "polyglot", "word2vec" or "gensim".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    if args.train:
        reader = NerReader()
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            #tagset = Plain.read_vocabulary('ner-tag-dict.txt') # DEBUG
        elif args.variant == 'word2vec':
            embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if args.suffix:
            logger.info("Creating suffix features...")
            # collect the forms
            words = (tok[0] for sent in sentence_iter for tok in sent)
            extractor = SuffixExtractor(args.suffix, args.suffixes, words)
            converter.add(extractor)

        if args.prefix:
            logger.info("Creating prefix features...")
            extractor = PrefixExtractor(args.prefix, args.prefixes,
                                        sentence_iter)
            converter.add(extractor)

        if args.gazetteer:
            logger.info("Creating gazetteer features")
            for extractor in GazetteerExtractor.create(args.gazetteer,
                                                       args.gsize):
                converter.add(extractor)

        # if args.pos:
        #     converter.add(POS(arg.pos))

        # obtain the tags for each sentence
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[0] for token in sent]))
            tags.append(np.array([tags_dict[token[-1]] for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)
    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[0] for x in sent]  # extract form
            ConllWriter.write(tagger.tag(sent))
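# Illustrative input only: the reader above yields sentences as lists of
# tokens, and the code indexes token[0] for the word form and token[-1] for
# the tag, which matches CoNLL-style column input such as (tags invented):
#
#   John     B-PER
#   Smith    I-PER
#   visited  O
#   Paris    B-LOC
#   .        O
#
# with one token per line and a blank line between sentences.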
def main():
    # set the seed for replicability
    np.random.seed(42)
    defaults = {}

    parser = argparse.ArgumentParser(
        description="POS tagger using word embeddings.")
    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # args, remaining_argv = parser.parse_known_args()
    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))
    #     parser.set_defaults(**defaults)
    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default %(default)s)')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # training options
    train = parser.add_argument_group('Train')
    train.add_argument('-t', '--train', type=str, default=None,
                       help='File with annotated data for training.')
    train.add_argument('-w', '--window', type=int, default=5,
                       help='Size of the word window (default %(default)s)')
    train.add_argument('-s', '--embeddings-size', type=int, default=50,
                       help='Number of features per word (default %(default)s)',
                       dest='embeddings_size')
    train.add_argument('-e', '--epochs', type=int, default=100,
                       help='Number of training epochs (default %(default)s)',
                       dest='iterations')
    train.add_argument('-l', '--learning_rate', type=float, default=0.001,
                       help='Learning rate for network weights (default %(default)s)',
                       dest='learning_rate')
    train.add_argument('-n', '--hidden', type=int, default=200,
                       help='Number of hidden neurons (default %(default)s)',
                       dest='hidden')
    train.add_argument('--eps', type=float, default=1e-8,
                       help='Epsilon value for AdaGrad (default %(default)s)')
    train.add_argument('--ro', type=float, default=0.95,
                       help='Ro value for AdaDelta (default %(default)s)')
    train.add_argument('-o', '--output', type=str, default='',
                       help='File where to save embeddings')

    # Embeddings
    embeddings = parser.add_argument_group('Embeddings')
    embeddings.add_argument('--vocab', type=str, default='',
                            help='Vocabulary file, either read or created')
    embeddings.add_argument('--vocab-size', type=int, default=0,
                            help='Maximum size of vocabulary (default %(default)s)')
    embeddings.add_argument('--vectors', type=str, default='',
                            help='Embeddings file, either read or created')
    embeddings.add_argument('--min-occurr', type=int, default=3,
                            help='Minimum occurrences for inclusion in vocabulary',
                            dest='minOccurr')
    embeddings.add_argument('--load', type=str, default='',
                            help='Load previously saved model')
    embeddings.add_argument('--variant', type=str, default='',
                            help='Either "senna" (default), "polyglot" or "word2vec".')

    # Extractors:
    extractors = parser.add_argument_group('Extractors')
    extractors.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                            help='Include capitalization features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                            help='Include suffix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffixes', type=str, default='',
                            help='Load suffixes from this file')
    extractors.add_argument('--prefix', const=5, nargs='?', type=int, default=None,
                            help='Include prefix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--prefixes', type=str, default='',
                            help='Load prefixes from this file')

    # reader
    parser.add_argument('--form-field', type=int, default=0,
                        help='Token field containing form (default %(default)s)',
                        dest='formField')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    if args.train:
        reader = PosReader(args.formField)
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            # start with the given vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if args.vectors and os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        vocab=base_vocab,
                                        variant=args.variant)
            else:
                # create random embeddings
                embeddings = Embeddings(args.embeddings_size,
                                        vocab=base_vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            if args.vocab_size:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
                logger.info("Overriding vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
            else:
                vocab = base_vocab
                tagset = reader.create_tagset(sentence_iter)
        elif args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                tagset = reader.create_tagset(sentence_iter)
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            # save the vocabulary only after the embeddings exist
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if ((args.suffix and not os.path.exists(args.suffixes)) or
                (args.prefix and not os.path.exists(args.prefixes))):
            # collect the forms once (needed whenever a suffix/prefix list
            # has to be built from the corpus)
            words = (tok[reader.formField]
                     for sent in sentence_iter for tok in sent)

        if args.suffix:
            if os.path.exists(args.suffixes):
                logger.info("Loading suffix list...")
                extractor = SuffixExtractor(args.suffix, args.suffixes)
                converter.add(extractor)
            else:
                logger.info("Creating suffix list...")
                extractor = SuffixExtractor(args.suffix, None, words)
                converter.add(extractor)
                if args.suffixes:
                    logger.info("Saving suffix list to: %s", args.suffixes)
                    extractor.write(args.suffixes)

        if args.prefix:
            if os.path.exists(args.prefixes):
                logger.info("Loading prefix list...")
                extractor = PrefixExtractor(args.prefix, args.prefixes)
                converter.add(extractor)
            else:
                logger.info("Creating prefix list...")
                extractor = PrefixExtractor(args.prefix, None, words)
                converter.add(extractor)
                if args.prefixes:
                    logger.info("Saving prefix list to: %s", args.prefixes)
                    extractor.write(args.prefixes)

        # obtain the tags for each sentence
        tag_index = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[reader.formField]
                                                for token in sent]))
            tags.append(np.array([tag_index[token[reader.tagField]]
                                  for token in sent]))

        trainer = create_trainer(args, converter, tag_index)
        logger.info("Starting training with %d sentences" % len(sentences))
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)
    else:
        with open(args.model) as file:
            tagger = Tagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[args.formField] for x in sent]  # extract form
            ConllWriter.write(tagger.tag_sequence(sent, return_tokens=True))
def main():
    # set the seed for replicability
    np.random.seed(42)
    defaults = {}

    parser = argparse.ArgumentParser(
        description="Train or use a Named Entity tagger.")
    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # args, remaining_argv = parser.parse_known_args()
    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))
    #     parser.set_defaults(**defaults)
    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default='',
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default='',
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                        help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str, default='',
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str, default='',
                        help='Load prefixes from this file')
    parser.add_argument('--gazetteer', type=str,
                        help='Load gazetteer from this file')
    parser.add_argument('--gsize', type=int, default=5,
                        help='Size of gazetteer features (default 5)')

    # reader
    parser.add_argument('--form-field', type=int, default=0, dest='formField',
                        help='Token field containing form (default 0)')

    # common
    parser.add_argument('--vocab', type=str, default='',
                        help='Vocabulary file, either read or created')
    parser.add_argument('--vocab-size', type=int, default=0, dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors', type=str, default='',
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default='',
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default='',
                        help='Either "senna" (default), "polyglot" or "word2vec".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    if args.train:
        reader = NerReader(args.formField)
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if os.path.exists(args.vocab):
            # start with the given vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        vocab=base_vocab,
                                        variant=args.variant)
            else:
                embeddings = Embeddings(args.embeddings_size,
                                        vocab=base_vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
        elif args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            # save the vocabulary only after the embeddings exist
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if ((args.suffixes and not os.path.exists(args.suffixes)) or
                (args.prefixes and not os.path.exists(args.prefixes))):
            # collect the forms once
            words = (tok[reader.formField]
                     for sent in sentence_iter for tok in sent)

        if os.path.exists(args.suffixes):
            logger.info("Loading suffix list...")
            extractor = SuffixExtractor.create(args.suffix, args.suffixes)
            converter.add(extractor)
        elif args.suffixes:
            logger.info("Creating suffix list...")
            # collect the forms
            words = (tok[reader.formField]
                     for sent in sentence_iter for tok in sent)
            extractor = SuffixExtractor(args.suffix, None, words)
            converter.add(extractor)
            logger.info("Saving suffix list to: %s", args.suffixes)
            extractor.write(args.suffixes)

        if os.path.exists(args.prefixes):
            logger.info("Loading prefix list...")
            extractor = PrefixExtractor.create(args.prefix, args.prefixes)
            converter.add(extractor)
        elif args.prefixes:
            logger.info("Creating prefix list...")
            extractor = PrefixExtractor(args.prefix, None, words)
            converter.add(extractor)
            if args.prefixes:
                logger.info("Saving prefix list to: %s", args.prefixes)
                extractor.write(args.prefixes)

        if args.gazetteer and os.path.exists(args.gazetteer):
            logger.info("Loading gazetteers")
            for extractor in GazetteerExtractor.create(args.gazetteer,
                                                       args.gsize):
                converter.add(extractor)
        elif args.gazetteer:
            logger.info("Creating gazetteer")
            # strip B-/I-
            classes = sorted([tag[2:] or tag for tag in tagset])
            # gazetteers must be kept in the same order as tags
            gazs = OrderedDict()
            for tag in classes:
                if tag != 'O':
                    gazs[tag] = Counter()  # we might want to keep the most frequent
            for sent in sentence_iter:
                for tok in sent:
                    tag = tok[reader.tagField]  # last field
                    if tag != 'O':
                        tag = tag[2:]  # strip B-/I-
                        form = tok[reader.formField].lower()  # lowercase
                        gazs[tag][form] += 1  # FORM
            for tag, counter in gazs.items():
                converter.add(GazetteerExtractor(counter.keys(), args.gsize))
            logger.info("Saving gazetteer list to: %s", args.gazetteer)
            with open(args.gazetteer, 'w', encoding='UTF-8') as file:
                for tag, counter in gazs.items():
                    for w in counter.keys():
                        print('\t'.join((tag, w)), file=file)

        # if args.pos:
        #     converter.add(POS(arg.pos))

        # obtain the tags for each sentence
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[reader.formField]
                                                for token in sent]))
            tags.append(np.array([tags_dict[token[reader.tagField]]
                                  for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)
    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[args.formField] for x in sent]  # extract form
            ConllWriter.write(tagger.tag(sent))
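# The branch above saves the generated gazetteer as one "<class>\t<form>"
# entry per line, with the B-/I- prefix stripped from the class and the form
# lowercased. A minimal sketch of a reader for that layout (illustrative only;
# GazetteerExtractor.create is the project's own loader):
def read_gazetteer_sketch(path):
    """Return a dict mapping entity class to its set of lowercased forms."""
    entries = {}
    with open(path, encoding='UTF-8') as gaz_file:
        for line in gaz_file:
            entity_class, form = line.rstrip('\n').split('\t', 1)
            entries.setdefault(entity_class, set()).add(form)
    return entries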
def main():
    # set the seed for replicability
    np.random.seed(42)          # DEBUG
    defaults = {}

    parser = argparse.ArgumentParser(description="Learn word embeddings.")
    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    #args, remaining_argv = parser.parse_known_args()
    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))
    #     parser.set_defaults(**defaults)
    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default=None,
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                        help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str,
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str,
                        help='Load prefixes from this file')

    # common
    parser.add_argument('--vocab', type=str, default=None,
                        help='Vocabulary file, either read or created')
    # --vocab-size is referenced below when the vocabulary is built from the
    # corpus, so it is declared here as in the sibling scripts
    parser.add_argument('--vocab-size', type=int, default=0, dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors', type=str, default=None,
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default=None,
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default=None,
                        help='Either "senna" (default), "polyglot" or "word2vec".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    if args.train:
        reader = PosReader()
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            #tagset = Plain.read_vocabulary('wsj.nlpnet/pos-tags.txt') # DEBUG
        elif args.variant == 'word2vec':
            embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if args.suffix:
            logger.info("Creating suffix features...")
            # collect the forms
            words = (tok[0] for sent in sentence_iter for tok in sent)
            extractor = SuffixExtractor(args.suffix, args.suffixes, words)
            converter.add(extractor)

        if args.prefix:
            logger.info("Creating prefix features...")
            extractor = PrefixExtractor(args.prefix, args.prefixes,
                                        sentence_iter)
            converter.add(extractor)

        # obtain the tags for each sentence
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[0] for token in sent]))
            tags.append(np.array([tags_dict[token[-1]] for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)
    else:
        with open(args.model) as file:
            tagger = Tagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[0] for x in sent]  # extract form
            ConllWriter.write(tagger.tag_sequence(sent, return_tokens=True))
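# Each main() variant above would normally live in its own script; a standard
# entry-point guard for the last one (assuming this module is run directly):
if __name__ == '__main__':
    main()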