import logging
import os
from itertools import chain

import numpy as np

# ConfigParser import differs between Python 2 and 3; the Python 3 form is shown.
from configparser import ConfigParser

# TweetReader, Embeddings, Converter and create_trainer are assumed to be
# provided by the surrounding package; their imports are not part of this excerpt.


def sswe_trainer(model_parameters):
    # set the seed for replicability
    np.random.seed(42)

    # args = parser.parse_args()
    args = model_parameters

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config
    reader = TweetReader(text_field=args.textField,
                         label_field=args.tagField,
                         ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(
        reader.sentences, args.vocab_size, min_occurrences=args.minOccurr)

    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        # reuse existing word2vec vectors and extend them with the corpus vocabulary
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # start with the given vocabulary, dropping the ngram entries at its tail
        b_vocab = reader.load_vocabulary(args.vocab)
        bound = len(b_vocab) - len(bigrams) - len(trigrams)
        base_vocab = b_vocab[:bound]
        if os.path.exists(args.vectors):
            # load embeddings
            embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                    variant=args.variant)
        else:
            # create embeddings
            embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                    variant=args.variant)
        # add the ngrams from the corpus
        embeddings.merge(vocab)
        logger.info("Overriding vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    else:
        embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                variant=args.variant)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)

    # Assume bigrams are prefixes of trigrams, or else we should put a terminator
    # on the trie
    trie = {}
    for b in chain(bigrams, trigrams):
        tmp = trie
        for w in b:
            tmp = tmp.setdefault(embeddings.dict[w], {})

    converter = Converter()
    converter.add(embeddings)

    trainer = create_trainer(args, converter)

    report_intervals = max(args.iterations // 200, 1)
    # report_intervals = 10000  # DEBUG override, left disabled

    logger.info("Starting training")

    # a generator expression (can be iterated several times)
    # It caches converted sentences, avoiding repeated conversions
    converted_sentences = converter.generator(reader.sentences, cache=True)
    trainer.train(converted_sentences, reader.polarities, trie,
                  args.iterations, report_intervals)

    logger.info("Overriding vectors to %s" % args.vectors)
    embeddings.save_vectors(args.vectors, args.variant)
    if args.model:
        logger.info("Saving trained model to %s" % args.model)
        trainer.save(args.model)
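# Illustration (not part of the original script): the trie built above encodes
# each bigram/trigram as a path of token ids in nested dicts, so a bigram that
# is a prefix of a trigram shares the same branch.  A minimal standalone
# sketch, using a hypothetical word-to-id mapping in place of embeddings.dict:
def _ngram_trie_example():
    word_id = {'new': 0, 'york': 1, 'city': 2}   # hypothetical id mapping
    trie = {}
    for ngram in [('new', 'york'), ('new', 'york', 'city')]:
        node = trie
        for w in ngram:
            node = node.setdefault(word_id[w], {})
    # the bigram and the trigram share the branch 0 -> 1
    assert trie == {0: {1: {2: {}}}}
    return trie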
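# Example invocation (a sketch, not part of the original script): sswe_trainer
# expects an argparse-style namespace.  The attribute names below mirror those
# accessed in the function; the values are hypothetical, and create_trainer may
# read further attributes (e.g. learning rate) defined by the full command-line
# parser, which is not shown in this excerpt.
if __name__ == '__main__':
    from argparse import Namespace

    params = Namespace(
        train='tweets.tsv',        # hypothetical training corpus
        textField=3,               # column holding the tweet text (hypothetical)
        tagField=2,                # column holding the polarity label (hypothetical)
        ngrams=2,
        vocab='vocab.txt',
        vocab_size=100000,
        minOccurr=3,
        vectors='vectors.txt',
        variant=None,
        embeddings_size=50,
        iterations=1000,
        model='sswe.dnn',
        config_file=None,
        verbose=True,
    )
    sswe_trainer(params)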