def build_models(sentences, delta=0.5, sigma=0.1):
    trs_counts, ems_counts, tags, words = count_grams(sentences)
    # print('Number of tags:', len(tags))
    # print('Number of words:', len(words))
    trs_model = LanguageModel(trs_counts, tags, delta)
    ems_model = LanguageModel(ems_counts, words, sigma)
    return trs_model, ems_model
def build_lang_models(self, delta=0.1):
    pos_counts = Counter()
    neg_counts = Counter()
    for review in self.corpus:
        if review.label:
            pos_counts.update(review.features)
        else:
            neg_counts.update(review.features)
    self.pos_model = LanguageModel(pos_counts, self.vocab, delta)
    self.neg_model = LanguageModel(neg_counts, self.vocab, delta)
def main():
    # path = input ("Where is the dataset(assignment1-dataset.txt)? Give me folder path : ")
    path = "E:\\Universite\\8yy\\497\\NLP_Homeworks\\generateSentence"
    """
    NN_grams = int(input('''Which grams model you want?
    \nUnigram -> 1, Bigram -> 2, Trigram ->3 and so on.
    \nGive me your wanted '''))
    """
    # count_of_sentence = int(input("How many sentences you want : "))
    count_of_sentence = 3
    # maxlength_of_sentence = int(input("How many words does a sentence consist of : "))
    maxlength_of_sentence = 30
    NN_grams = 3

    sentences = dataset(path)
    ngram_language_model = LanguageModel(NN_grams)
    ngram_language_model.LoadDataset2Model(sentences)
    generated_sentences = ngram_language_model.Generate(maxlength_of_sentence, count_of_sentence)
    for i in range(count_of_sentence):
        print(str(i) + ".sentence :", generated_sentences[i])
        ngram_language_model.PPL(generated_sentences[i])
def __init__(self, opt, reuse=False):
    # Build the model.
    # Keep the inference op separate from the train op, especially in train and validation steps.
    with tf.name_scope('inference'):
        LM = LanguageModel(opt, 'test', reuse=reuse)
        LM.build()
        self.model = LM
def run_comprehension_experiment(dataset, experiment_paths, experiment_config, image_ids=None):
    if experiment_config.exp_name == 'baseline' or experiment_config.exp_name.startswith('max_margin'):
        captioner = LanguageModel(experiment_config.test.lstm_model_file,
                                  experiment_config.test.lstm_net_file,
                                  experiment_config.vocab_file, device_id=0)
    elif experiment_config.exp_name.startswith('mil_context'):
        captioner = MILContextLanguageModel(experiment_config.test.lstm_model_file,
                                            experiment_config.test.lstm_net_file,
                                            experiment_config.vocab_file, device_id=0)
    else:
        raise StandardError("Unknown experiment name: %s" % experiment_config.exp_name)

    if experiment_config.exp_name == 'baseline' or experiment_config.exp_name.startswith('max_margin'):
        experimenter = ComprehensionExperiment(captioner, dataset, image_ids=image_ids)
    elif experiment_config.exp_name.startswith('mil_context'):
        experimenter = MILContextComprehensionExperiment(captioner, dataset, image_ids=image_ids)
    else:
        raise StandardError("Unknown experiment name: %s" % experiment_config.exp_name)

    results = experimenter.comprehension_experiment(experiment_paths,
                                                    proposal_source=experiment_config.test.proposal_source,
                                                    visualize=experiment_config.test.visualize)
    if isinstance(results, dict):
        for method in results:
            print "Results for method: %s" % method
            results_filename = '%s/%s_%s_%s_results.json' % (experiment_paths.retrieval_results,
                                                             dataset.dataset_name,
                                                             experiment_config.test.tag, method)
            with open(results_filename, 'w') as f:
                json.dump(results[method], f)
    else:
        results_filename = '%s/%s_%s_results.json' % (experiment_paths.retrieval_results,
                                                      dataset.dataset_name,
                                                      experiment_config.test.tag)
        with open(results_filename, 'w') as f:
            json.dump(results, f)
def main():
    if parameters.FULL_DATA_MODE:
        lm = LanguageModel()
        lm.load_data(train_data_path=parameters.SMALL_DATA_PATH)
        lm.define_model()
        lm.compile_model()
        lm.fit_model()
        lm.evaluate_model()
        print(lm.generate_seq('女', 5))
    else:
        lm = LanguageModel()
        lm.prepare_for_generator(train_data_path=parameters.TRAIN_DATA_PATH,
                                 val_data_path=parameters.VAL_DATA_PATH,
                                 test_data_path=parameters.TEST_DATA_PATH)
        lm.define_model()
        lm.compile_model()
        lm.fit_model_with_generator()
        lm.evaluate_model_with_generator()
def setUp(self):
    self.doc = {'hello': 1, 'world': 2, 'help': 1}
    self.col = {
        'hello': 20, 'world': 5, 'good': 5, 'bye': 15, 'free': 1,
        'code': 1, 'source': 1, 'compile': 1, 'error': 1
    }
    self.colLM = LanguageModel(term_dict=self.col)
    self.docLM = LanguageModel(term_dict=self.doc)
def get_model():
    # Load max_len, chars, mapping
    for_server = load(open('saved_models/for_server.pkl', 'rb'))
    sequence_max_len, chars, chars_mapping = for_server[0], for_server[1], for_server[2]
    language_model = LanguageModel(sequence_max_len, chars, chars_mapping)
    model = language_model.load_model()
    return language_model, model
def __init__(self, raw_segments, observation_sequences, label_sequences):
    super(Retrainer, self).__init__()
    self.raw_segments = raw_segments
    self.observation_sequences = observation_sequences
    self.label_sequences = label_sequences
    self.hmm_new = None
    self.feature_entity_list = FeatureEntityList()
    self.lm = LanguageModel()
    self.boosting_feature_generator = BoostingFeatureGenerator()
    self.DOMINANT_RATIO = 0.85  # dominant label ratio: set empirically
    self.retrain_with_boosting_features()
def __init__(self, threshold=0.96):
    basename = os.path.dirname(os.path.realpath(__file__))
    self.lm = LanguageModel()
    # Load spaCy
    self.nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    self.gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    self.gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    self.determiners = {"", "the", "a", "an"}
    # List of common prepositions
    self.prepositions = {
        "", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"
    }
    self.threshold = threshold
def __init__(self, opt, dict):
    super(Trainer, self).__init__()
    self.opt = opt
    self.dict = dict
    self.hier = opt.hier

    if opt.restore:
        if opt.hier:
            self.mlp, self.encoder = torch.load(opt.model)
        else:
            self.mlp = torch.load(opt.model)
            self.encoder = self.mlp.encoder
        self.mlp.epoch += 1
        print("Restoring MLP {} with epoch {}".format(opt.model, self.mlp.epoch))
    else:
        if opt.hier:
            glove_weights = build_glove(dict["w2i"]) if opt.glove else None
            self.encoder = apply_cuda(HierAttnEncoder(len(dict["i2w"]), opt.bowDim, opt.hiddenSize, opt, glove_weights))
            self.mlp = apply_cuda(HierAttnDecoder(len(dict["i2w"]), opt.bowDim, opt.hiddenSize, opt, glove_weights))
        else:
            self.mlp = apply_cuda(LanguageModel(self.dict, opt))
            self.encoder = self.mlp.encoder
        self.mlp.epoch = 0

    self.loss = apply_cuda(nn.NLLLoss(ignore_index=0))
    self.decoder_embedding = self.mlp.context_embedding

    if opt.hier:
        c = 0.9
        self.encoder_optimizer = torch.optim.RMSprop(
            filter(lambda p: p.requires_grad, self.encoder.parameters()),
            self.opt.learningRate, momentum=c, weight_decay=c)
        self.optimizer = torch.optim.RMSprop(
            filter(lambda p: p.requires_grad, self.mlp.parameters()),
            self.opt.learningRate, momentum=c, weight_decay=c)
    else:
        self.optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, self.mlp.parameters()),
            self.opt.learningRate)  # Half learning rate
def train(train_x, train_y, word_dict, args):
    with tf.compat.v1.Session() as sess:
        # model = AutoEncoder(word_dict, MAX_DOCUMENT_LEN)
        model = LanguageModel(word_dict, MAX_DOCUMENT_LEN)

        global_steps = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()
        gradients = tf.gradients(model.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, params))

        loss_summary = tf.summary.scalar("loss", model.loss)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter("AutoEncoder", sess.graph)

        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step(batch_x):
            feed_dict = {model.x: batch_x}
            _, step, summaries, loss = sess.run([train_op, global_steps, summary_op, model.loss],
                                                feed_dict=feed_dict)
            summary_writer.add_summary(summaries, step)
            if step % 100 == 0:
                print("step {0} : loss = {1}".format(step, loss))

        batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)
        for batch_x, _ in batches:
            train_step(batch_x)
            step = tf.train.global_step(sess, global_steps)
            if step % 5000 == 0:
                saver.save(sess, os.path.join("AutoEncoder", "model", "model.ckpt"), global_step=step)
def language_modeling(self):
    decoder = self.create_decoder()
    assert (os.path.exists(self.options.result_dir + 'model_dec'))
    self.load_decoder(decoder)
    encoder = self.create_encoder()
    assert (os.path.exists(self.options.result_dir + 'model_enc'))
    self.load_encoder(encoder)

    print('computing language model score...')
    test = self.reader.next_example(2)
    lm = LanguageModel(encoder, decoder)
    total_ll = 0
    total_tokens = 0
    for dataid, data in enumerate(test):
        s1, s2, s3, pos, act = data[0], data[1], data[2], data[3], data[4]
        if len(s1) <= 1:
            continue
        total_ll += lm(s1, s2, s3, pos, self.options.nsamples)
        total_tokens += len(s1)
    perp = compute_perplexity(total_ll, total_tokens)
    print('perplexity: {}'.format(perp))
def main(): """This function implements the command-line interface.""" # Parse input configuration. if argv[1] == "prepare": logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) # Prepare the language model. language_model = LanguageModel() # Produce the gold results for the dev dataset. produce_gold_results([DEV_DATASET_FNAME], "%s/%s" % (TEST2016_DIRNAME, DEV_GOLD_BASE_FNAME)) raise SystemExit else: logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.WARNING) config = argv[1].split('-') method = config[0] assert method in ("unsegmented", "segmented_ml", "segmented_aggregation") year = argv[2] assert year in ("dev", "2016", "2017") segment_filtering_method = config[1] assert segment_filtering_method in \ ("none", "kolczetal00_title", "kolczetal00_firstpara", "kolczetal00_parawithmosttitlewords", "kolczetal00_firsttwopara", "kolczetal00_firstlastpara") \ or re.match(r"kolczetal00_bestsentence[0-5]", segment_filtering_method) segment_filtering = segment_filtering_method \ if segment_filtering_method != "none" else None base_term_weighting = config[2] assert re.match(r"(bm25|tfidf)_", base_term_weighting) extra_term_weighting_method = config[3] assert extra_term_weighting_method in ("none", "godwin", "murataetal00_A", "murataetal00_B") extra_term_weighting = extra_term_weighting_method \ if extra_term_weighting_method != "none" else None if method == "segmented_aggregation": aggregate_tier1_segments_method = config[4] assert aggregate_tier1_segments_method in AGGREGATION_METHOD_MAP.keys() aggregate_tier1_segments = AGGREGATION_METHOD_MAP[ aggregate_tier1_segments_method] aggregate_tier2_segments_method = config[5] assert aggregate_tier2_segments_method in AGGREGATION_METHOD_MAP.keys() aggregate_tier2_segments = AGGREGATION_METHOD_MAP[ aggregate_tier2_segments_method] order = config[6] assert order in ("result_first", "query_first") thread_first = order == "result_first" # Determine directory and file names if year == "dev": test_dirname = TEST2016_DIRNAME test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME gold_base_fname = DEV_GOLD_BASE_FNAME test_dataset_fname = DEV_DATASET_FNAME train_dataset_fnames = TRAIN2016_DATASET_FNAMES elif year == "2016": test_dirname = TEST2016_DIRNAME test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME gold_base_fname = TEST2016_GOLD_BASE_FNAME test_dataset_fname = TEST2016_DATASET_FNAME train_dataset_fnames = TRAIN2016_DATASET_FNAMES + [DEV_DATASET_FNAME] elif year == "2017": test_dirname = TEST2017_DIRNAME test_predictions_dirname = TEST2017_PREDICTIONS_DIRNAME gold_base_fname = TEST2017_GOLD_BASE_FNAME test_dataset_fname = TEST2017_DATASET_FNAME train_dataset_fnames = TRAIN2017_DATASET_FNAMES + [DEV_DATASET_FNAME] output_fname = "%s/subtask_B_%s-%s.txt" % (test_predictions_dirname, argv[1], argv[2]) base_output_fname = "%s/subtask_B_%s-%s.txt" % ( TEST_PREDICTIONS_BASE_DIRNAME, argv[1], argv[2]) LOGGER.info("Producing %s ...", output_fname) # Perform training language_model = LanguageModel(base_term_weighting=base_term_weighting, extra_term_weighting=extra_term_weighting) if method == "segmented_ml": classifier = train_segmented_ml(language_model, train_dataset_fnames, segment_filtering=segment_filtering) elif method == "segmented_aggregation": classifier = train_segmented_aggregation( language_model, train_dataset_fnames, aggregate_tier1_segments, aggregate_tier2_segments, thread_first=thread_first, segment_filtering=segment_filtering) elif method == "unsegmented": classifier = 
train_nonsegmented(language_model, train_dataset_fnames, segment_filtering=segment_filtering) # Perform evaluation if method == "segmented_ml": evaluate_segmented_ml(language_model, classifier, [test_dataset_fname], output_fname, segment_filtering=segment_filtering) elif method == "segmented_aggregation": evaluate_segmented_aggregation(language_model, classifier, [test_dataset_fname], output_fname, aggregate_tier1_segments, aggregate_tier2_segments, thread_first=thread_first, segment_filtering=segment_filtering) elif method == "unsegmented": evaluate_nonsegmented(language_model, classifier, [test_dataset_fname], output_fname, segment_filtering=segment_filtering) print("%s %s %s" % (test_dirname, gold_base_fname, base_output_fname))
def __init__(self):
    print('----- Loading data -----')
    self.train_set = Amazon('train', False)
    self.test_set = Amazon('test', False)
    self.val_set = Amazon('dev', False)
    print('The train set has {} items'.format(len(self.train_set)))
    print('The test set has {} items'.format(len(self.test_set)))
    print('The val set has {} items'.format(len(self.val_set)))
    self.vocab = self.train_set.vocab

    # Load pretrained classifier for evaluation.
    self.clseval = ClassifierEval(config.test_cls, config.dataset)
    self.clseval.restore_model()

    print('----- Loading model -----')
    embedding = self.vocab.embedding
    self.Emb = nn.Embedding.from_pretrained(embedding.clone(), freeze=False)
    self.Classifier = AttenClassifier(emb_dim=config.emb_dim, dim_h=config.dim_h,
                                      n_layers=config.n_layers, dropout=config.dropout,
                                      bi=config.bidirectional)
    self.Gate0 = Gate(dim_h=config.dim_h, n_layers=config.n_layers, dropout=config.dropout,
                      bi=config.bidirectional, temperature=config.temp_gate,
                      embedding=embedding.clone())
    self.Gate1 = Gate(dim_h=config.dim_h, n_layers=config.n_layers, dropout=config.dropout,
                      bi=config.bidirectional, temperature=config.temp_gate,
                      embedding=embedding.clone())
    self.InsFront0 = InsFront(dim_h=config.dim_h, embedding=embedding.clone(),
                              n_layers=config.n_layers, dropout=config.dropout,
                              bi=config.bidirectional, voc_size=self.vocab.size,
                              temperature=config.temp_sub)
    self.InsFront1 = InsFront(dim_h=config.dim_h, embedding=embedding.clone(),
                              n_layers=config.n_layers, dropout=config.dropout,
                              bi=config.bidirectional, voc_size=self.vocab.size,
                              temperature=config.temp_sub)
    self.InsBehind0 = InsBehind(dim_h=config.dim_h, embedding=embedding.clone(),
                                n_layers=config.n_layers, dropout=config.dropout,
                                bi=config.bidirectional, voc_size=self.vocab.size,
                                temperature=config.temp_sub)
    self.InsBehind1 = InsBehind(dim_h=config.dim_h, embedding=embedding.clone(),
                                n_layers=config.n_layers, dropout=config.dropout,
                                bi=config.bidirectional, voc_size=self.vocab.size,
                                temperature=config.temp_sub)
    self.Replace0 = Replace(dim_h=config.dim_h, embedding=embedding.clone(),
                            n_layers=config.n_layers, dropout=config.dropout,
                            bi=config.bidirectional, voc_size=self.vocab.size,
                            temperature=config.temp_sub)
    self.Replace1 = Replace(dim_h=config.dim_h, embedding=embedding.clone(),
                            n_layers=config.n_layers, dropout=config.dropout,
                            bi=config.bidirectional, voc_size=self.vocab.size,
                            temperature=config.temp_sub)
    self.Del0 = Delete()  # Not a module.
    self.Del1 = Delete()  # Not a module.

    # Language models.
    if config.train_mode == 'pto':
        self.LMf0 = LanguageModel(config.dataset, direction='forward', sentiment=0)
        self.LMf0.model_load()
        self.LMf1 = LanguageModel(config.dataset, direction='forward', sentiment=1)
        self.LMf1.model_load()
        self.LMb0 = LanguageModel(config.dataset, direction='backward', sentiment=0)
        self.LMb0.model_load()
        self.LMb1 = LanguageModel(config.dataset, direction='backward', sentiment=1)
        self.LMb1.model_load()

    # Auxiliary classifier.
    self.Aux_Emb = nn.Embedding.from_pretrained(embedding.clone(), freeze=False)
    self.Aux_Classifier = AttenClassifier(emb_dim=config.emb_dim, dim_h=config.dim_h,
                                          n_layers=config.n_layers, dropout=config.dropout,
                                          bi=config.bidirectional)

    self.modules = ['Emb', 'Classifier', 'Gate0', 'Gate1', 'InsFront0', 'InsFront1',
                    'InsBehind0', 'InsBehind1', 'Replace0', 'Replace1',
                    'Aux_Classifier', 'Aux_Emb']
    for module in self.modules:
        print('--- {}: '.format(module))
        print(getattr(self, module))
        setattr(self, module, gpu_wrapper(getattr(self, module)))

    self.scopes = {
        'emb': ['Emb'],
        'cls': ['Classifier'],
        'aux_cls': ['Aux_Classifier', 'Aux_Emb'],
        'gate': ['Gate0', 'Gate1'],
        'oprt': ['InsFront0', 'InsFront1', 'InsBehind0', 'InsBehind1', 'Replace0', 'Replace1'],
    }
    for scope in self.scopes.keys():
        setattr(self, scope + '_lr', getattr(config, scope + '_lr'))

    self.iter_num = -1
    self.logger = None
    if config.train_mode == 'pto':
        pass
    elif config.train_mode == 'aux-cls-only':
        self.train_set = Amazon('train', True)
        self.test_set = Amazon('test', True)
        self.val_set = Amazon('dev', True)
        self.best_acc = 0
    elif config.train_mode == 'cls-only':
        self.best_acc = 0

    self.criterionSeq, self.criterionCls, self.criterionRL, self.criterionBack = None, None, None, None
    parser.add_argument('-m', '--model', help='LSTM type', required=True, type=str, **share_param)
    parser.add_argument('-e', '--epoch', help='LSTM type', required=True, type=int, **share_param)
    parser.add_argument('-v', '--version', help='', default=None, type=int, **share_param)
    return parser.parse_args()


if __name__ == '__main__':
    # Ignore warning messages from TensorFlow.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # checkpoint
    _parser = argparse.ArgumentParser(description='This script is ...',
                                      formatter_class=argparse.RawTextHelpFormatter)
    args = get_options(_parser)
    if args.version is not None:
        _checkpoint_dir, _parameter = \
            checkpoint_version('./checkpoint/%s' % args.model, version=args.version)
    else:
        _parameter = toml.load(open('./hyperparameters/%s.toml' % args.model))
        _checkpoint_dir, _ = checkpoint_version('./checkpoint/%s' % args.model, _parameter)

    # data
    raw_train, raw_validation, raw_test, vocab = ptb_raw_data("./simple-examples/data")
    iterators = dict()
    for raw_data, key in zip([raw_train, raw_validation, raw_test],
                             ["batcher_train", "batcher_valid", "batcher_test"]):
        iterators[key] = BatchFeeder(batch_size=_parameter['batch_size'],
                                     num_steps=_parameter['config']['num_steps'],
                                     sequence=raw_data)

    model = LanguageModel(max_max_epoch=args.epoch, checkpoint_dir=_checkpoint_dir, **_parameter)
    model.train(verbose=True, **iterators)
def setUp(self):
    self.expected = {'hello': 10, 'world': 20, 'goodbye': 10}
    self.logger = logging.getLogger("TestLanguageModel")
    self.bk_model = LanguageModel(file='term_occurrences.txt')
    self.doc_model = LanguageModel(term_dict=self.expected)
def main(): """This function implements the command-line interface.""" # Parse input configuration. year = argv[2] assert year in ("dry_run", "dev", "2016", "2017") config = argv[1].split('-', 8) technique_string = config[0] assert technique_string in ("hard_terms", "soft_terms", "hard_topics", "soft_topics") technique = technique_string # Set up the document similarity model. if technique == "hard_topics" or technique == "soft_topics": similarity_model = TopicCosineSimilarity() if technique == "soft_topics" or technique == "soft_terms": term_similarity_string = config[1] assert term_similarity_string in ("w2v.ql", "w2v.googlenews", "glove.enwiki_gigaword5", "glove.common_crawl", "glove.twitter", "fasttext.enwiki") term_similarity = term_similarity_string soft_matrices_string = config[2] assert soft_matrices_string in ("mrel", "mlev", "mrel_mlev") if soft_matrices_string == "mrel": soft_matrices = [("mrel", 1.0)] elif soft_matrices_string == "mlev": soft_matrices = [("mlev", 1.0)] else: soft_matrices = [("mrel", 0.5), ("mlev", 0.5)] if technique == "hard_terms": similarity_model = TermHardCosineSimilarity() kwargs = {} elif technique == "hard_topics": kwargs = {} elif technique == "soft_terms": weighting_string = config[3] assert weighting_string in ("early", "late", "none") if weighting_string == "none": weighting = None else: weighting = weighting_string normalization_string = config[4] assert normalization_string in ("soft", "hard", "none") if normalization_string == "none": normalization = None else: normalization = normalization_string rounding_string = config[5] assert rounding_string in ("none", "round", "floor", "ceil") if rounding_string == "none": rounding = None else: rounding = rounding_string similarity_model = TermSoftCosineSimilarity(weighting=weighting, rounding=rounding, \ normalization=normalization) w2v_min_count = int(config[6]) m_knn = int(config[7]) m_threshold = float(config[8]) kwargs = {"soft_matrices": soft_matrices, "w2v_min_count": w2v_min_count, "m_knn": m_knn, \ "m_threshold": m_threshold, "term_similarity": term_similarity } elif technique == "soft_topics": w2v_min_count = int(config[3]) m_knn = int(config[4]) m_threshold = float(config[5]) kwargs = {"soft_matrices": soft_matrices, "w2v_min_count": w2v_min_count, "m_knn": m_knn, \ "m_threshold": m_threshold, "term_similarity": term_similarity } if year == "dry_run": # Prepare the language model and exit prematurely. LanguageModel(similarity=similarity_model, technique=technique, **kwargs) return # Determine directory and file names. if year == "dev": test_dirname = TEST2016_DIRNAME test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME gold_base_fname = DEV_GOLD_BASE_FNAME test_dataset_fname = DEV_DATASET_FNAME # train_dataset_fnames = TRAIN2016_DATASET_FNAMES elif year == "2016": test_dirname = TEST2016_DIRNAME test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME gold_base_fname = TEST2016_GOLD_BASE_FNAME test_dataset_fname = TEST2016_DATASET_FNAME # train_dataset_fnames = TRAIN2016_DATASET_FNAMES + [DEV_DATASET_FNAME] elif year == "2017": test_dirname = TEST2017_DIRNAME test_predictions_dirname = TEST2017_PREDICTIONS_DIRNAME gold_base_fname = TEST2017_GOLD_BASE_FNAME test_dataset_fname = TEST2017_DATASET_FNAME # train_dataset_fnames = TRAIN2017_DATASET_FNAMES + [DEV_DATASET_FNAME] output_fname = "%s/subtask_B_%s-%s.txt" % (test_predictions_dirname, argv[1], argv[2]) base_output_fname = "%s/subtask_B_%s-%s.txt" % ( TEST_PREDICTIONS_BASE_DIRNAME, argv[1], argv[2]) # Perform the evaluation. 
if not path.exists(output_fname): LOGGER.info("Producing %s ...", output_fname) file_handler = logging.FileHandler("%s.log" % output_fname, encoding='utf8') logging.getLogger().addHandler(file_handler) start_time = time() language_model = LanguageModel(similarity=similarity_model, technique=technique, **kwargs) evaluate(language_model, [test_dataset_fname], output_fname) LOGGER.info("Time elapsed: %s" % timedelta(seconds=time() - start_time)) logging.getLogger().removeHandler(file_handler) print("%s %s %s" % (test_dirname, gold_base_fname, base_output_fname))
cfg.logger.debug('Learning rate: {}'.format(cfg.lr))
cfg.logger.debug('Schedular factor: {}'.format(cfg.sch_factor))
cfg.logger.debug('Schedular patience: {}'.format(cfg.sch_patience))
cfg.logger.debug('Schedular verbose: {}'.format(cfg.sch_verbose))
cfg.logger.debug('Device: {}'.format(cfg.device))
cfg.logger.debug('Embedding model directory: {}'.format(cfg.emb_model_dir))
cfg.logger.debug('Lyrics data directory: {}'.format(cfg.lyrics_dir))
if cfg.pretrained_lm_dir:
    cfg.logger.debug('Pre-trained language model: {}'.format(cfg.pretrained_lm_dir))
else:
    cfg.logger.debug('Pre-trained language model: initial training')

# Training
language_model = LanguageModel(wv_dict, cfg.hidden_dim).to(cfg.device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(language_model.parameters(), lr=cfg.lr)
schedular = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                 factor=cfg.sch_factor,
                                                 patience=cfg.sch_patience,
                                                 verbose=cfg.sch_verbose)
if cfg.pretrained_lm_dir:
    lm_loading_res = language_model.load_state_dict(torch.load(cfg.pretrained_lm_dir))
    cfg.logger.debug('Loading language model: {}'.format(lm_loading_res))

train_losses, train_accs = [], []  # losses & accuracies to save
if cfg.test_ratio > 0:
    test_losses, test_accs = [], []
fixed_queries_to_words = pd.Series(fixed_queries).replace(
    '[' + punctuation + ']', '', regex=True).str.split()
fixed_words = flatten_list(fixed_queries_to_words)
original_queries_to_words = pd.Series(original_queries).replace(
    '[' + punctuation + ']', '', regex=True).str.split()
original_words = flatten_list(original_queries_to_words)

error_model = ErrorModel()
for original, fixed in zip(original_queries_to_words, fixed_queries_to_words):
    number_of_words = min(len(original), len(fixed))
    for i in range(number_of_words):
        error_model.update_statistics(original[i], fixed[i])
error_model.calculate_weights()

language_model = LanguageModel()
for fixed in fixed_queries_to_words:
    for word in fixed:
        language_model.update_statistics(word)
language_model.calculate_weights()

error_model.store_json('error.json')
language_model.store_json('language.json')
def gradients_clipping(grads_params):
    new_grads_params = []
    for g, p in grads_params:
        clipped_g = tf.clip_by_value(g, -FLAGS.clip_value, FLAGS.clip_value)
        new_grads_params.append((clipped_g, p))
    return new_grads_params


models = []
grads = []
with g.as_default():
    # build the model
    for i in xrange(FLAGS.ngpu):
        with tf.device('/device:GPU:{:d}'.format(i)), tf.name_scope('model{:d}'.format(i)):
            reuse = i > 0
            models.append(LanguageModel(opts, 'train', reuse))
            models[i].build()

    # create a function to validate
    val_fns, generators = [], []
    with tf.device('/gpu:0'.format(i)):
        # don't use the numpy version generator, use tensorflow version generator instead
        val_fn, _ = create_val_fn(batch_size=100)
        val_fns.append(val_fn)
        # generators.append(generator)

    batch_size = FLAGS.batch_size * FLAGS.ngpu
    start_decay_steps = int(opts.nImgs // batch_size * opts.start_decay_epoches)
    decay_steps = int(opts.nImgs // batch_size * opts.decay_epoches)
    decayed_learning_rate = tf.train.exponential_decay(
        opts.learning_rate,
        tf.maximum(models[0].step - start_decay_steps, 0),
    instance.serialize_class_data()
    instance.log('Run %d of %d:' % (i + 1, n_runs))
    instance.create_model()
    instance.compile_model()
    instance.train()
    instance.results()
    instance.serialize_model()
    instance.serialize_results()


intervening = lambda dep: dep['n_intervening'] >= 1

models = {
    'grammaticality': CorruptAgreement(filenames.deps, prop_train=0.1),
    'predict_number': PredictVerbNumber(filenames.deps, prop_train=0.1),
    'language_model': LanguageModel(filenames.deps, prop_train=0.1),
    'inflect_verb': InflectVerb(filenames.deps, prop_train=0.1),
    'predict_number_targeted': PredictVerbNumber(filenames.deps, prop_train=0.2,
                                                 criterion=intervening),
    'predict_number_only_nouns': PredictVerbNumberOnlyNouns(filenames.deps, prop_train=0.1),
    'predict_number_only_generalized_nouns':
        PredictVerbNumberOnlyGeneralizedNouns(filenames.deps, prop_train=0.1),
    'predict_number_srn': PredictVerbNumber(filenames.deps, prop_train=0.1,
                                            rnn_class=SimpleRNN),
}
vocab.save_to_files(args.serialization_path + "/vocabulary")

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=args.drop))
lstm_model = LanguageModel(contextualizer=lstm,
                           text_field_embedder=word_embeddings,
                           vocab=vocab)

transformer = MultiHeadSelfAttention(attention_dim=16,
                                     input_dim=EMBEDDING_DIM,
                                     num_heads=2,
                                     values_dim=16,
                                     attention_dropout_prob=args.drop)
transformer_model = LanguageModel(contextualizer=transformer,
                                  text_field_embedder=word_embeddings,
                                  vocab=vocab)

stacked_transformer = StackedSelfAttentionEncoder(
    input_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
URIs = config['URI_' + language]
stopwords = utils.load_stopwords(path_to_resources + URIs['stopwords'])
filler_words = utils.load_filler_words(path_to_resources + URIs['filler_words'])
word_vectors = KeyedVectors.load_word2vec_format(path_to_resources + URIs['word_vectors'],
                                                 binary=True)
language_model = LanguageModel(path_to_resources + URIs['language_model'])
pos_tagger = StanfordPOSTagger(
    model_filename=path_to_resources + URIs['pos_tagger_model'],
    path_to_jar=path_to_resources + URIs['pos_tagger_jar']
)
print "time_cost = %.2fs" % (time.time() - start)
resources[language] = {
    'stopwords': stopwords,
    'filler_words': filler_words,
    'pos_tagger': pos_tagger,
    'word_vectors': word_vectors,
    'language_model': language_model
path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
print "loading GoogleNews..."
start = time.time()
# vectors = Word2Vec(size=3e2, min_count=1)
# vectors.build_vocab([item for sublist in lists_of_tokens.values() for item in sublist])
# vectors.intersect_word2vec_format(path_to_wv, binary=True)
wv = gensim.models.KeyedVectors.load_word2vec_format(path_to_wv, binary=True)
# vectors = Word2Vec.load_word2vec_format(path_to_wv, binary=True)
print "finish loading GoogleNews, time_cost = %.2fs" % (time.time() - start)

# Load language model (takes approx. 8G RAM)
print "loading language model..."
start = time.time()
lm = LanguageModel(model_path=path_to_lm)
print "finish loading language model, time_cost = %.2fs" % (time.time() - start)

# ######################
# ### PARAMETER GRID ###
# ######################
system_name_list = ['filippova', 'boudin', 'mehdad', 'tixier']
system_params_dict = {}
for system_name in system_name_list:
    # pos_filtering_grid = [True, False] if system_name == 'tixier' or system_name == 'mehdad' else [False]
    # cr_w_grid = [3, 10, 20] if system_name == 'tixier' else [3]
    cr_w_grid = [6, 12] if system_name == 'tixier' else [3]
    cr_overspanning_grid = [True, False] if system_name == 'tixier' else [False]
def print_or_value(id, calculated, value):
    if value == calculated:
        print(True)
        # print()
    else:
        print(id, calculated)
        print()


sentance_pairs = [(["la", "casa"], ["the", "big", "house"]),
                  (["casa", "pez", "verde"], ["green", "house"]),
                  (["casa"], ["shop"])]
t_f_given_e = ibmmodel1.train(sentance_pairs, 100)
reversed = [(x, y) for y, x in sentance_pairs]
t_e_given_f = ibmmodel1.train(reversed, 100)
alignments = [ibmmodel1.get_phrase_alignment(t_f_given_e, t_e_given_f, fs, es)
              for fs, es in sentance_pairs]
phrase_table = ibmmodel1.get_phrase_probabilities(alignments, sentance_pairs)
lang_model = LanguageModel([e for _, e in sentance_pairs], n=2)
ibmmodel1.print_phrase_table(phrase_table)

# Tests:
foreign_sentence = "la casa".split(" ")
print_or_value(1, cur_cost([], foreign_sentence, phrase_table, lang_model), 1)
print_or_value(2, cur_cost([(0, 0, "the big")], foreign_sentence, phrase_table, lang_model), 0.041666666666666664)
print_or_value(3, cur_cost([(1, 1, "shop")], foreign_sentence, phrase_table, lang_model), 0.125)
print_or_value(4, cur_cost([(0, 0, "the big house")], foreign_sentence, phrase_table, lang_model), 0.013888888888888888)
print_or_value(5, cur_cost([(0, 0, "the big"), (1, 1, "shop")], foreign_sentence, phrase_table, lang_model), 0.003472222222222222)

phrase_to_max_prob = get_phrase_to_max_prob(phrase_table)
print_or_value(6, future_cost([], foreign_sentence, phrase_to_max_prob), 0.25)
print_or_value(7, future_cost([(0, 0, "the big")], foreign_sentence, phrase_to_max_prob), 0.5)
print_or_value(8, future_cost([(1, 1, "shop")], foreign_sentence, phrase_to_max_prob), 0.5)
print_or_value(9, future_cost([(0, 0, "the big house")], foreign_sentence, phrase_to_max_prob), 0.5)
print_or_value(10, future_cost([(0, 0, "the big"), (1, 1, "shop")], foreign_sentence, phrase_to_max_prob), 1)