Example #1
def build_models(sentences, delta=0.5, sigma=0.1):
    trs_counts, ems_counts, tags, words = count_grams(sentences)

    # print('Number of tags:', len(tags))
    # print('Number of words:', len(words))

    trs_model = LanguageModel(trs_counts, tags, delta)
    ems_model = LanguageModel(ems_counts, words, sigma)

    return trs_model, ems_model
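
A minimal, self-contained sketch of the additive (add-delta) smoothing a LanguageModel built as above presumably applies; the function name and arguments are illustrative, not the original class's API.

def smoothed_prob(count, context_total, vocab_size, delta=0.5):
    # Add-delta smoothing: every event in the vocabulary gets delta pseudo-counts.
    return (count + delta) / (context_total + delta * vocab_size)

# e.g. a tag bigram seen 3 times in a context seen 10 times, over a 12-tag vocabulary:
print(smoothed_prob(3, 10, 12))  # 0.21875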
Example #2
  def build_lang_models(self, delta=0.1):
    pos_counts = Counter()
    neg_counts = Counter()

    for review in self.corpus:
      if review.label:
        pos_counts.update(review.features)
      else:
        neg_counts.update(review.features)

    self.pos_model = LanguageModel(pos_counts, self.vocab, delta)
    self.neg_model = LanguageModel(neg_counts, self.vocab, delta)
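
A self-contained sketch of how two class-conditional models like pos_model and neg_model are typically compared at prediction time (Naive-Bayes-style log-odds); the dict-based counts and smoothing here are illustrative stand-ins for the real LanguageModel internals.

import math

def log_odds(features, pos_counts, neg_counts, vocab_size, delta=0.1):
    # Sum of per-feature log probability ratios under the two smoothed models.
    pos_total, neg_total = sum(pos_counts.values()), sum(neg_counts.values())
    score = 0.0
    for f in features:
        p_pos = (pos_counts.get(f, 0) + delta) / (pos_total + delta * vocab_size)
        p_neg = (neg_counts.get(f, 0) + delta) / (neg_total + delta * vocab_size)
        score += math.log(p_pos) - math.log(p_neg)
    return score  # > 0 leans positive, < 0 leans negative

print(log_odds(["great", "plot"], {"great": 5, "plot": 2}, {"dull": 4, "plot": 2}, vocab_size=6))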
Example #3
def main():

    # path = input("Where is the dataset (assignment1-dataset.txt)? Enter the folder path: ")
    path = "E:\\Universite\\8yy\\497\\NLP_Homeworks\\generateSentence"
    # NN_grams = int(input("Which n-gram model do you want? "
    #                      "Unigram -> 1, Bigram -> 2, Trigram -> 3, and so on: "))

    # count_of_sentence = int(input("How many sentences do you want: "))
    count_of_sentence = 3

    # maxlength_of_sentence = int(input("How many words should a sentence consist of: "))
    maxlength_of_sentence = 30

    NN_grams = 3

    sentences = dataset(path)

    ngram_language_model = LanguageModel(NN_grams)
    ngram_language_model.LoadDataset2Model(sentences)
    generated_sentences = ngram_language_model.Generate(
        maxlength_of_sentence, count_of_sentence)

    for i in range(count_of_sentence):
        print("{}. sentence: {}".format(i, generated_sentences[i]))
        ngram_language_model.PPL(generated_sentences[i])
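
A minimal, self-contained sketch of the kind of n-gram generation Generate presumably performs (here a bigram chain with weighted sampling); the table and names are illustrative only, not the class's actual data structures.

import random

def generate(bigrams, max_len, start="<s>"):
    word, out = start, []
    for _ in range(max_len):
        followers = bigrams.get(word)
        if not followers:
            break
        word = random.choices(list(followers), weights=list(followers.values()))[0]
        if word == "</s>":
            break
        out.append(word)
    return " ".join(out)

bigrams = {"<s>": {"the": 2, "a": 1}, "the": {"cat": 1}, "a": {"dog": 1},
           "cat": {"</s>": 1}, "dog": {"</s>": 1}}
print(generate(bigrams, max_len=30))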
Example #4
 def __init__(self, opt, reuse=False):
     # build the model
     # separate inference op with train op, especially in train and validation steps
     with tf.name_scope('inference'):
         LM = LanguageModel(opt, 'test', reuse=reuse)
         LM.build()
     self.model = LM
Example #5
def run_comprehension_experiment(dataset, experiment_paths, experiment_config, image_ids=None):
  if experiment_config.exp_name == 'baseline' or experiment_config.exp_name.startswith('max_margin'):
    captioner = LanguageModel(experiment_config.test.lstm_model_file, experiment_config.test.lstm_net_file,
                              experiment_config.vocab_file, device_id=0)
  elif experiment_config.exp_name.startswith('mil_context'):
    captioner = MILContextLanguageModel(experiment_config.test.lstm_model_file, experiment_config.test.lstm_net_file,
                                        experiment_config.vocab_file, device_id=0)
  else:
    raise StandardError("Unknown experiment name: %s" % experiment_config.exp_name)

  if experiment_config.exp_name == 'baseline' or experiment_config.exp_name.startswith('max_margin'):
    experimenter = ComprehensionExperiment(captioner, dataset, image_ids=image_ids)
  elif experiment_config.exp_name.startswith('mil_context'):
    experimenter = MILContextComprehensionExperiment(captioner, dataset, image_ids=image_ids)
  else:
    raise StandardError("Unknown experiment name: %s" % experiment_config.exp_name)

  results = experimenter.comprehension_experiment(experiment_paths, proposal_source=experiment_config.test.proposal_source,
                                                  visualize=experiment_config.test.visualize)

  if isinstance(results,dict):
    for method in results:
      print "Results for method: %s" % method
      results_filename = '%s/%s_%s_%s_results.json' % (experiment_paths.retrieval_results, dataset.dataset_name,
                                                       experiment_config.test.tag, method)
      with open(results_filename,'w') as f: json.dump(results[method], f)
  else:
    results_filename = '%s/%s_%s_results.json' % (experiment_paths.retrieval_results, dataset.dataset_name,
                                                  experiment_config.test.tag)
    with open(results_filename,'w') as f: json.dump(results, f)
Example #6
def main():
    if parameters.FULL_DATA_MODE:
        lm = LanguageModel()
        lm.load_data(train_data_path=parameters.SMALL_DATA_PATH)
        lm.define_model()
        lm.compile_model()
        lm.fit_model()
        lm.evaluate_model()
        print(lm.generate_seq('女', 5))
    else:
        lm = LanguageModel()
        lm.prepare_for_generator(train_data_path=parameters.TRAIN_DATA_PATH,
                                 val_data_path=parameters.VAL_DATA_PATH,
                                 test_data_path=parameters.TEST_DATA_PATH)
        lm.define_model()
        lm.compile_model()
        lm.fit_model_with_generator()
        lm.evaluate_model_with_generator()
Example #7
    def setUp(self):

        self.doc = {'hello': 1, 'world': 2, 'help': 1}

        self.col = {
            'hello': 20,
            'world': 5,
            'good': 5,
            'bye': 15,
            'free': 1,
            'code': 1,
            'source': 1,
            'compile': 1,
            'error': 1
        }

        self.colLM = LanguageModel(term_dict=self.col)
        self.docLM = LanguageModel(term_dict=self.doc)
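
Document and collection models like these are the usual ingredients of query-likelihood retrieval; below is a self-contained sketch with Jelinek-Mercer smoothing, where lam and the dict-based interface are assumptions, not the tested class's API.

def jm_prob(term, doc_counts, col_counts, lam=0.5):
    # Mix the document model with the collection model to avoid zero probabilities.
    p_doc = doc_counts.get(term, 0) / max(sum(doc_counts.values()), 1)
    p_col = col_counts.get(term, 0) / max(sum(col_counts.values()), 1)
    return lam * p_doc + (1 - lam) * p_col

doc = {'hello': 1, 'world': 2, 'help': 1}
col = {'hello': 20, 'world': 5, 'good': 5}
print(jm_prob('hello', doc, col))  # 0.5 * 1/4 + 0.5 * 20/30 ~= 0.458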
Example #8
def get_model():
    # Load max_len, chars, mapping
    for_server = load(open('saved_models/for_server.pkl', 'rb'))
    sequence_max_len, chars, chars_mapping = for_server[0], for_server[1], for_server[2]

    language_model = LanguageModel(sequence_max_len, chars, chars_mapping)
    model = language_model.load_model()

    return language_model, model
Example #9
    def __init__(self, raw_segments, observation_sequences, label_sequences):
        super(Retrainer, self).__init__()
        self.raw_segments = raw_segments
        self.observation_sequences = observation_sequences
        self.label_sequences = label_sequences
        self.hmm_new = None
        self.feature_entity_list = FeatureEntityList()
        self.lm = LanguageModel()
        self.boosting_feature_generator = BoostingFeatureGenerator()

        self.DOMINANT_RATIO = 0.85  # dominant label ratio: set empirically

        self.retrain_with_boosting_features()
Example #10
 def __init__(self, threshold=0.96):
     basename = os.path.dirname(os.path.realpath(__file__))
     self.lm = LanguageModel()
     # Load spaCy
     self.nlp = spacy.load("en")
     # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
     # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
     self.gb = Hunspell("en_GB-large",
                        hunspell_data_dir=basename + '/resources/spelling/')
     # Inflection forms: http://wordlist.aspell.net/other/
     self.gb_infl = loadWordFormDict(basename +
                                     "/resources/agid-2016.01.19/infl.txt")
     # List of common determiners
     self.determiners = {"", "the", "a", "an"}
     # List of common prepositions
     self.prepositions = {
         "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
         "with"
     }
     self.threshold = threshold
Example #11
    def __init__(self, opt, dict):
        super(Trainer, self).__init__()
        self.opt = opt
        self.dict = dict
        self.hier = opt.hier

        if opt.restore:
            if opt.hier:
                self.mlp, self.encoder = torch.load(opt.model)
            else:
                self.mlp = torch.load(opt.model)
                self.encoder = self.mlp.encoder

            self.mlp.epoch += 1
            print("Restoring MLP {} with epoch {}".format(
                opt.model, self.mlp.epoch))
        else:
            if opt.hier:
                glove_weights = build_glove(dict["w2i"]) if opt.glove else None

                self.encoder = apply_cuda(HierAttnEncoder(len(dict["i2w"]), opt.bowDim, opt.hiddenSize, opt, glove_weights))
                self.mlp = apply_cuda(HierAttnDecoder(len(dict["i2w"]), opt.bowDim, opt.hiddenSize, opt, glove_weights))
            else:
                self.mlp = apply_cuda(LanguageModel(self.dict, opt))
                self.encoder = self.mlp.encoder

            self.mlp.epoch = 0

        self.loss = apply_cuda(nn.NLLLoss(ignore_index=0))
        self.decoder_embedding = self.mlp.context_embedding

        if opt.hier:
            c = 0.9
            self.encoder_optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, self.encoder.parameters()), self.opt.learningRate,
                                                                    momentum=c, weight_decay=c)
            self.optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, self.mlp.parameters()), self.opt.learningRate,
                                                                    momentum=c, weight_decay=c)
        else:
            self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, self.mlp.parameters()), self.opt.learningRate)  # Half learning rate
Example #12
def train(train_x, train_y, word_dict, args):
    with tf.compat.v1.Session() as sess:
        # model = AutoEncoder(word_dict, MAX_DOCUMENT_LEN)
        model = LanguageModel(word_dict, MAX_DOCUMENT_LEN)
        global_steps = tf.Variable(0, trainable=False)
        # Use the tf.compat.v1 API consistently, matching the v1 Session above.
        params = tf.compat.v1.trainable_variables()
        gradients = tf.gradients(model.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        optimizer = tf.compat.v1.train.AdamOptimizer(0.001)
        # Pass global_step so the counter actually advances with each update.
        train_op = optimizer.apply_gradients(zip(clipped_gradients, params),
                                             global_step=global_steps)

        loss_summary = tf.compat.v1.summary.scalar("loss", model.loss)
        summary_op = tf.compat.v1.summary.merge_all()
        summary_writer = tf.compat.v1.summary.FileWriter("AutoEncoder", sess.graph)

        saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

        sess.run(tf.compat.v1.global_variables_initializer())

        def train_step(batch_x):
            feed_dict = {model.x: batch_x}
            _, step, summaries, loss = sess.run([train_op, global_steps, summary_op, model.loss],
                                                feed_dict=feed_dict)
            summary_writer.add_summary(summaries, step)

            if step % 100 == 0:
                print("step {0} : loss = {1}".format(step, loss))

        batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)

        for batch_x, _ in batches:
            train_step(batch_x)
            step = tf.compat.v1.train.global_step(sess, global_steps)

            if step % 5000 == 0:
                saver.save(sess, os.path.join("AutoEncoder", "model", "model.ckpt"), global_step=step)
Example #13
    def language_modeling(self):
        decoder = self.create_decoder()
        assert (os.path.exists(self.options.result_dir + 'model_dec'))
        self.load_decoder(decoder)

        encoder = self.create_encoder()
        assert (os.path.exists(self.options.result_dir + 'model_enc'))
        self.load_encoder(encoder)

        print('computing language model score...')

        test = self.reader.next_example(2)
        lm = LanguageModel(encoder, decoder)

        total_ll = 0
        total_tokens = 0
        for dataid, data in enumerate(test):
            s1, s2, s3, pos, act = data[0], data[1], data[2], data[3], data[4]
            if len(s1) <= 1:
                continue
            total_ll += lm(s1, s2, s3, pos, self.options.nsamples)
            total_tokens += len(s1)
        perp = compute_perplexity(total_ll, total_tokens)
        print('perplexity: {}'.format(perp))
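
A minimal sketch of the computation compute_perplexity presumably performs, assuming total_ll is a sum of natural-log likelihoods; the function name here is hypothetical.

import math

def compute_perplexity_sketch(total_ll, total_tokens):
    # perplexity = exp(-average log-likelihood per token)
    return math.exp(-total_ll / total_tokens)

print(compute_perplexity_sketch(-120.0, 50))  # ~11.02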
Example #14
def main():
    """This function implements the command-line interface."""
    # Parse input configuration.
    if argv[1] == "prepare":
        logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                            level=logging.INFO)
        # Prepare the language model.
        language_model = LanguageModel()
        # Produce the gold results for the dev dataset.
        produce_gold_results([DEV_DATASET_FNAME],
                             "%s/%s" % (TEST2016_DIRNAME, DEV_GOLD_BASE_FNAME))
        raise SystemExit
    else:
        logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                            level=logging.WARNING)
    config = argv[1].split('-')
    method = config[0]
    assert method in ("unsegmented", "segmented_ml", "segmented_aggregation")
    year = argv[2]
    assert year in ("dev", "2016", "2017")
    segment_filtering_method = config[1]
    assert segment_filtering_method in \
        ("none", "kolczetal00_title", "kolczetal00_firstpara",
         "kolczetal00_parawithmosttitlewords", "kolczetal00_firsttwopara",
         "kolczetal00_firstlastpara") \
        or re.match(r"kolczetal00_bestsentence[0-5]", segment_filtering_method)
    segment_filtering = segment_filtering_method \
                        if segment_filtering_method != "none" else None
    base_term_weighting = config[2]
    assert re.match(r"(bm25|tfidf)_", base_term_weighting)
    extra_term_weighting_method = config[3]
    assert extra_term_weighting_method in ("none", "godwin", "murataetal00_A",
                                           "murataetal00_B")
    extra_term_weighting = extra_term_weighting_method \
                           if extra_term_weighting_method != "none" else None
    if method == "segmented_aggregation":
        aggregate_tier1_segments_method = config[4]
        assert aggregate_tier1_segments_method in AGGREGATION_METHOD_MAP.keys()
        aggregate_tier1_segments = AGGREGATION_METHOD_MAP[
            aggregate_tier1_segments_method]
        aggregate_tier2_segments_method = config[5]
        assert aggregate_tier2_segments_method in AGGREGATION_METHOD_MAP.keys()
        aggregate_tier2_segments = AGGREGATION_METHOD_MAP[
            aggregate_tier2_segments_method]
        order = config[6]
        assert order in ("result_first", "query_first")
        thread_first = order == "result_first"

    # Determine directory and file names
    if year == "dev":
        test_dirname = TEST2016_DIRNAME
        test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME
        gold_base_fname = DEV_GOLD_BASE_FNAME
        test_dataset_fname = DEV_DATASET_FNAME
        train_dataset_fnames = TRAIN2016_DATASET_FNAMES
    elif year == "2016":
        test_dirname = TEST2016_DIRNAME
        test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME
        gold_base_fname = TEST2016_GOLD_BASE_FNAME
        test_dataset_fname = TEST2016_DATASET_FNAME
        train_dataset_fnames = TRAIN2016_DATASET_FNAMES + [DEV_DATASET_FNAME]
    elif year == "2017":
        test_dirname = TEST2017_DIRNAME
        test_predictions_dirname = TEST2017_PREDICTIONS_DIRNAME
        gold_base_fname = TEST2017_GOLD_BASE_FNAME
        test_dataset_fname = TEST2017_DATASET_FNAME
        train_dataset_fnames = TRAIN2017_DATASET_FNAMES + [DEV_DATASET_FNAME]
    output_fname = "%s/subtask_B_%s-%s.txt" % (test_predictions_dirname,
                                               argv[1], argv[2])
    base_output_fname = "%s/subtask_B_%s-%s.txt" % (
        TEST_PREDICTIONS_BASE_DIRNAME, argv[1], argv[2])
    LOGGER.info("Producing %s ...", output_fname)

    # Perform training
    language_model = LanguageModel(base_term_weighting=base_term_weighting,
                                   extra_term_weighting=extra_term_weighting)
    if method == "segmented_ml":
        classifier = train_segmented_ml(language_model,
                                        train_dataset_fnames,
                                        segment_filtering=segment_filtering)
    elif method == "segmented_aggregation":
        classifier = train_segmented_aggregation(
            language_model,
            train_dataset_fnames,
            aggregate_tier1_segments,
            aggregate_tier2_segments,
            thread_first=thread_first,
            segment_filtering=segment_filtering)
    elif method == "unsegmented":
        classifier = train_nonsegmented(language_model,
                                        train_dataset_fnames,
                                        segment_filtering=segment_filtering)

    # Perform evaluation
    if method == "segmented_ml":
        evaluate_segmented_ml(language_model,
                              classifier, [test_dataset_fname],
                              output_fname,
                              segment_filtering=segment_filtering)
    elif method == "segmented_aggregation":
        evaluate_segmented_aggregation(language_model,
                                       classifier, [test_dataset_fname],
                                       output_fname,
                                       aggregate_tier1_segments,
                                       aggregate_tier2_segments,
                                       thread_first=thread_first,
                                       segment_filtering=segment_filtering)
    elif method == "unsegmented":
        evaluate_nonsegmented(language_model,
                              classifier, [test_dataset_fname],
                              output_fname,
                              segment_filtering=segment_filtering)

    print("%s %s %s" % (test_dirname, gold_base_fname, base_output_fname))
Example #15
    def __init__(self):

        print('----- Loading data -----')
        self.train_set = Amazon('train', False)
        self.test_set = Amazon('test', False)
        self.val_set = Amazon('dev', False)
        print('The train set has {} items'.format(len(self.train_set)))
        print('The test set has {} items'.format(len(self.test_set)))
        print('The val set has {} items'.format(len(self.val_set)))

        self.vocab = self.train_set.vocab

        # Load pretrained classifier for evaluation.
        self.clseval = ClassifierEval(config.test_cls, config.dataset)
        self.clseval.restore_model()

        print('----- Loading model -----')
        embedding = self.vocab.embedding
        self.Emb = nn.Embedding.from_pretrained(embedding.clone(), freeze=False)
        self.Classifier = AttenClassifier(emb_dim=config.emb_dim,
                                          dim_h=config.dim_h,
                                          n_layers=config.n_layers,
                                          dropout=config.dropout,
                                          bi=config.bidirectional)
        self.Gate0 = Gate(dim_h=config.dim_h, n_layers=config.n_layers, dropout=config.dropout, bi=config.bidirectional,
                          temperature=config.temp_gate, embedding=embedding.clone())
        self.Gate1 = Gate(dim_h=config.dim_h, n_layers=config.n_layers, dropout=config.dropout, bi=config.bidirectional,
                          temperature=config.temp_gate, embedding=embedding.clone())
        self.InsFront0 = InsFront(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                  dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                  temperature=config.temp_sub)
        self.InsFront1 = InsFront(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                  dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                  temperature=config.temp_sub)
        self.InsBehind0 = InsBehind(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                    dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                    temperature=config.temp_sub)
        self.InsBehind1 = InsBehind(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                    dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                    temperature=config.temp_sub)
        self.Replace0 = Replace(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                temperature=config.temp_sub)
        self.Replace1 = Replace(dim_h=config.dim_h, embedding=embedding.clone(), n_layers=config.n_layers,
                                dropout=config.dropout, bi=config.bidirectional, voc_size=self.vocab.size,
                                temperature=config.temp_sub)
        self.Del0 = Delete()  # Not a module.
        self.Del1 = Delete()  # Not a module.
        # Language models.
        if config.train_mode == 'pto':
            self.LMf0 = LanguageModel(config.dataset, direction='forward', sentiment=0)
            self.LMf0.model_load()
            self.LMf1 = LanguageModel(config.dataset, direction='forward', sentiment=1)
            self.LMf1.model_load()
            self.LMb0 = LanguageModel(config.dataset, direction='backward', sentiment=0)
            self.LMb0.model_load()
            self.LMb1 = LanguageModel(config.dataset, direction='backward', sentiment=1)
            self.LMb1.model_load()
        # Auxiliary classifier.
        self.Aux_Emb = nn.Embedding.from_pretrained(embedding.clone(), freeze=False)
        self.Aux_Classifier = AttenClassifier(emb_dim=config.emb_dim,
                                              dim_h=config.dim_h,
                                              n_layers=config.n_layers,
                                              dropout=config.dropout,
                                              bi=config.bidirectional)

        self.modules = ['Emb', 'Classifier',
                        'Gate0', 'Gate1',
                        'InsFront0', 'InsFront1',
                        'InsBehind0', 'InsBehind1',
                        'Replace0', 'Replace1',
                        'Aux_Classifier', 'Aux_Emb']
        for module in self.modules:
            print('--- {}: '.format(module))
            print(getattr(self, module))
            setattr(self, module, gpu_wrapper(getattr(self, module)))

        self.scopes = {
            'emb': ['Emb'],
            'cls': ['Classifier'],
            'aux_cls': ['Aux_Classifier', 'Aux_Emb'],
            'gate': ['Gate0', 'Gate1'],
            'oprt': ['InsFront0', 'InsFront1', 'InsBehind0', 'InsBehind1', 'Replace0', 'Replace1'],
        }
        for scope in self.scopes.keys():
            setattr(self, scope + '_lr', getattr(config, scope + '_lr'))

        self.iter_num = -1
        self.logger = None
        if config.train_mode == 'pto':
            pass
        elif config.train_mode == 'aux-cls-only':
            self.train_set = Amazon('train', True)
            self.test_set = Amazon('test', True)
            self.val_set = Amazon('dev', True)
            self.best_acc = 0
        elif config.train_mode == 'cls-only':
            self.best_acc = 0
        self.criterionSeq, self.criterionCls, self.criterionRL, self.criterionBack = None, None, None, None
Example #16
    parser.add_argument('-m', '--model', help='LSTM type', required=True, type=str, **share_param)
    parser.add_argument('-e', '--epoch', help='number of training epochs', required=True, type=int, **share_param)
    parser.add_argument('-v', '--version', help='', default=None, type=int, **share_param)
    return parser.parse_args()


if __name__ == '__main__':
    # Ignore warning message by tensor flow
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    # checkpoint
    _parser = argparse.ArgumentParser(description='This script is ...', formatter_class=argparse.RawTextHelpFormatter)
    args = get_options(_parser)
    if args.version is not None:
        _checkpoint_dir, _parameter = \
            checkpoint_version('./checkpoint/%s' % args.model, version=args.version)
    else:
        _parameter = toml.load(open('./hyperparameters/%s.toml' % args.model))
        _checkpoint_dir, _ = checkpoint_version('./checkpoint/%s' % args.model, _parameter)

    # data
    raw_train, raw_validation, raw_test, vocab = ptb_raw_data("./simple-examples/data")

    iterators = dict()
    for raw_data, key in zip([raw_train, raw_validation, raw_test], ["batcher_train", "batcher_valid", "batcher_test"]):
        iterators[key] = BatchFeeder(batch_size=_parameter['batch_size'],
                                     num_steps=_parameter['config']['num_steps'],
                                     sequence=raw_data)

    model = LanguageModel(max_max_epoch=args.epoch, checkpoint_dir=_checkpoint_dir, **_parameter)
    model.train(verbose=True, **iterators)
Example #17
 def setUp(self):
     self.expected = {'hello': 10, 'world': 20, 'goodbye': 10}
     self.logger = logging.getLogger("TestLanguageModel")
     self.bk_model = LanguageModel(file='term_occurrences.txt')
     self.doc_model = LanguageModel(term_dict=self.expected)
Example #18
def main():
    """This function implements the command-line interface."""
    # Parse input configuration.
    year = argv[2]
    assert year in ("dry_run", "dev", "2016", "2017")
    config = argv[1].split('-', 8)
    technique_string = config[0]
    assert technique_string in ("hard_terms", "soft_terms", "hard_topics",
                                "soft_topics")
    technique = technique_string

    # Set up the document similarity model.
    if technique == "hard_topics" or technique == "soft_topics":
        similarity_model = TopicCosineSimilarity()
    if technique == "soft_topics" or technique == "soft_terms":
        term_similarity_string = config[1]
        assert term_similarity_string in ("w2v.ql", "w2v.googlenews",
                                          "glove.enwiki_gigaword5",
                                          "glove.common_crawl",
                                          "glove.twitter", "fasttext.enwiki")
        term_similarity = term_similarity_string

        soft_matrices_string = config[2]
        assert soft_matrices_string in ("mrel", "mlev", "mrel_mlev")
        if soft_matrices_string == "mrel":
            soft_matrices = [("mrel", 1.0)]
        elif soft_matrices_string == "mlev":
            soft_matrices = [("mlev", 1.0)]
        else:
            soft_matrices = [("mrel", 0.5), ("mlev", 0.5)]

    if technique == "hard_terms":
        similarity_model = TermHardCosineSimilarity()
        kwargs = {}
    elif technique == "hard_topics":
        kwargs = {}
    elif technique == "soft_terms":
        weighting_string = config[3]
        assert weighting_string in ("early", "late", "none")
        if weighting_string == "none":
            weighting = None
        else:
            weighting = weighting_string

        normalization_string = config[4]
        assert normalization_string in ("soft", "hard", "none")
        if normalization_string == "none":
            normalization = None
        else:
            normalization = normalization_string

        rounding_string = config[5]
        assert rounding_string in ("none", "round", "floor", "ceil")
        if rounding_string == "none":
            rounding = None
        else:
            rounding = rounding_string

        similarity_model = TermSoftCosineSimilarity(weighting=weighting, rounding=rounding, \
                                                    normalization=normalization)

        w2v_min_count = int(config[6])
        m_knn = int(config[7])
        m_threshold = float(config[8])
        kwargs = {"soft_matrices": soft_matrices, "w2v_min_count": w2v_min_count, "m_knn": m_knn, \
                  "m_threshold": m_threshold, "term_similarity": term_similarity }
    elif technique == "soft_topics":
        w2v_min_count = int(config[3])
        m_knn = int(config[4])
        m_threshold = float(config[5])
        kwargs = {"soft_matrices": soft_matrices, "w2v_min_count": w2v_min_count, "m_knn": m_knn, \
                  "m_threshold": m_threshold, "term_similarity": term_similarity }

    if year == "dry_run":
        # Prepare the language model and exit prematurely.
        LanguageModel(similarity=similarity_model,
                      technique=technique,
                      **kwargs)
        return

    # Determine directory and file names.
    if year == "dev":
        test_dirname = TEST2016_DIRNAME
        test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME
        gold_base_fname = DEV_GOLD_BASE_FNAME
        test_dataset_fname = DEV_DATASET_FNAME
#       train_dataset_fnames = TRAIN2016_DATASET_FNAMES
    elif year == "2016":
        test_dirname = TEST2016_DIRNAME
        test_predictions_dirname = TEST2016_PREDICTIONS_DIRNAME
        gold_base_fname = TEST2016_GOLD_BASE_FNAME
        test_dataset_fname = TEST2016_DATASET_FNAME
#       train_dataset_fnames = TRAIN2016_DATASET_FNAMES + [DEV_DATASET_FNAME]
    elif year == "2017":
        test_dirname = TEST2017_DIRNAME
        test_predictions_dirname = TEST2017_PREDICTIONS_DIRNAME
        gold_base_fname = TEST2017_GOLD_BASE_FNAME
        test_dataset_fname = TEST2017_DATASET_FNAME
#       train_dataset_fnames = TRAIN2017_DATASET_FNAMES + [DEV_DATASET_FNAME]
    output_fname = "%s/subtask_B_%s-%s.txt" % (test_predictions_dirname,
                                               argv[1], argv[2])
    base_output_fname = "%s/subtask_B_%s-%s.txt" % (
        TEST_PREDICTIONS_BASE_DIRNAME, argv[1], argv[2])

    # Perform the evaluation.
    if not path.exists(output_fname):
        LOGGER.info("Producing %s ...", output_fname)
        file_handler = logging.FileHandler("%s.log" % output_fname,
                                           encoding='utf8')
        logging.getLogger().addHandler(file_handler)
        start_time = time()

        language_model = LanguageModel(similarity=similarity_model,
                                       technique=technique,
                                       **kwargs)
        evaluate(language_model, [test_dataset_fname], output_fname)

        LOGGER.info("Time elapsed: %s" %
                    timedelta(seconds=time() - start_time))
        logging.getLogger().removeHandler(file_handler)
    print("%s %s %s" % (test_dirname, gold_base_fname, base_output_fname))
Example #19
    cfg.logger.debug('Learning rate: {}'.format(cfg.lr))
    cfg.logger.debug('Scheduler factor: {}'.format(cfg.sch_factor))
    cfg.logger.debug('Scheduler patience: {}'.format(cfg.sch_patience))
    cfg.logger.debug('Scheduler verbose: {}'.format(cfg.sch_verbose))
    cfg.logger.debug('Device: {}'.format(cfg.device))
    cfg.logger.debug('Embedding model directory: {}'.format(cfg.emb_model_dir))
    cfg.logger.debug('Lyrics data directory: {}'.format(cfg.lyrics_dir))

    if cfg.pretrained_lm_dir:
        cfg.logger.debug('Pre-trained language model: {}'.format(
            cfg.pretrained_lm_dir))
    else:
        cfg.logger.debug('Pre-trained language model: initial training')

    # Training
    language_model = LanguageModel(wv_dict, cfg.hidden_dim).to(cfg.device)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(language_model.parameters(), lr=cfg.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
                                                     factor=cfg.sch_factor,
                                                     patience=cfg.sch_patience,
                                                     verbose=cfg.sch_verbose)
    if cfg.pretrained_lm_dir:
        lm_loading_res = language_model.load_state_dict(
            torch.load(cfg.pretrained_lm_dir))
        cfg.logger.debug('Loading language model: {}'.format(lm_loading_res))

    train_losses, train_accs = [], []  # losses & accuracies to save
    if cfg.test_ratio > 0:
        test_losses, test_accs = [], []
Example #20
fixed_queries_to_words = pd.Series(fixed_queries).replace(
    '[' + punctuation + ']', '', regex=True).str.split()
fixed_words = flatten_list(fixed_queries_to_words)

original_queries_to_words = pd.Series(original_queries).replace(
    '[' + punctuation + ']', '', regex=True).str.split()
original_words = flatten_list(original_queries_to_words)

error_model = ErrorModel()

for original, fixed in zip(original_queries_to_words, fixed_queries_to_words):
    number_of_words = min(len(original), len(fixed))
    for i in range(number_of_words):
        error_model.update_statistics(original[i], fixed[i])

error_model.calculate_weights()

language_model = LanguageModel()

for fixed in fixed_queries_to_words:
    for word in fixed:
        language_model.update_statistics(word)

language_model.calculate_weights()

error_model.store_json('error.json')
language_model.store_json('language.json')
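
A self-contained sketch of how an error model and a language model like the ones serialized above are typically combined at query time (noisy-channel ranking); the dict-based probabilities are illustrative stand-ins for whatever the two classes store.

def best_correction(word, candidates, p_error, p_word):
    # argmax over candidates of P(candidate) * P(word | candidate)
    return max(candidates, key=lambda c: p_word.get(c, 1e-9) * p_error.get((word, c), 1e-9))

p_word = {'hello': 0.01, 'help': 0.005}
p_error = {('helo', 'hello'): 0.8, ('helo', 'help'): 0.1}
print(best_correction('helo', ['hello', 'help'], p_error, p_word))  # 'hello'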

Example #21
def gradients_clipping(grads_params):
    new_grads_params = []
    for g,p in grads_params:
        clipped_g = tf.clip_by_value(g,-FLAGS.clip_value,FLAGS.clip_value)
        new_grads_params.append((clipped_g,p))
    return new_grads_params

models = []
grads = []
with g.as_default():
         
    # build the model
    for i in range(FLAGS.ngpu):
        with tf.device('/device:GPU:{:d}'.format(i)),tf.name_scope('model{:d}'.format(i)):
            reuse = i>0
            models.append(LanguageModel(opts,'train',reuse))
            models[i].build()
                  
    # create a function to validate
    val_fns, generators = [],[]
    with tf.device('/gpu:0'):
        # don't use the numpy version generator, use tensorflow version generator instead
        val_fn, _ = create_val_fn(batch_size = 100)
        val_fns.append(val_fn)
        #generators.append(generator) 
    
    batch_size = FLAGS.batch_size*FLAGS.ngpu
    start_decay_steps = int(opts.nImgs//batch_size*opts.start_decay_epoches)
    decay_steps = int(opts.nImgs//batch_size*opts.decay_epoches)
    decayed_learning_rate = tf.train.exponential_decay(opts.learning_rate,
                                                       tf.maximum(models[0].step-start_decay_steps,0),
Example #22
            instance.serialize_class_data()
        instance.log('Run %d of %d:' % (i + 1, n_runs))
        instance.create_model()
        instance.compile_model()
        instance.train()
        instance.results()
        instance.serialize_model()
        instance.serialize_results()


intervening = lambda dep: dep['n_intervening'] >= 1

models = {
    'grammaticality':
    CorruptAgreement(filenames.deps, prop_train=0.1),
    'predict_number':
    PredictVerbNumber(filenames.deps, prop_train=0.1),
    'language_model':
    LanguageModel(filenames.deps, prop_train=0.1),
    'inflect_verb':
    InflectVerb(filenames.deps, prop_train=0.1),
    'predict_number_targeted':
    PredictVerbNumber(filenames.deps, prop_train=0.2, criterion=intervening),
    'predict_number_only_nouns':
    PredictVerbNumberOnlyNouns(filenames.deps, prop_train=0.1),
    'predict_number_only_generalized_nouns':
    PredictVerbNumberOnlyGeneralizedNouns(filenames.deps, prop_train=0.1),
    'predict_number_srn':
    PredictVerbNumber(filenames.deps, prop_train=0.1, rnn_class=SimpleRNN),
}
Example #23
vocab.save_to_files(args.serialization_path + "/vocabulary")

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM,
                  HIDDEN_DIM,
                  batch_first=True,
                  dropout=args.drop))

lstm_model = LanguageModel(contextualizer=lstm,
                           text_field_embedder=word_embeddings,
                           vocab=vocab)

transformer = MultiHeadSelfAttention(attention_dim=16,
                                     input_dim=EMBEDDING_DIM,
                                     num_heads=2,
                                     values_dim=16,
                                     attention_dropout_prob=args.drop)

transformer_model = LanguageModel(contextualizer=transformer,
                                  text_field_embedder=word_embeddings,
                                  vocab=vocab)

stacked_transformer = StackedSelfAttentionEncoder(
    input_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
Example #24
    URIs = config['URI_' + language]
    stopwords = utils.load_stopwords(
        path_to_resources + URIs['stopwords']
    )

    filler_words = utils.load_filler_words(
        path_to_resources + URIs['filler_words']
    )

    word_vectors = KeyedVectors.load_word2vec_format(
        path_to_resources + URIs['word_vectors'],
        binary=True
    )

    language_model = LanguageModel(
        path_to_resources + URIs['language_model']
    )

    pos_tagger = StanfordPOSTagger(
        model_filename=path_to_resources + URIs['pos_tagger_model'],
        path_to_jar=path_to_resources + URIs['pos_tagger_jar']
    )

    print "time_cost = %.2fs" % (time.time() - start)

    resources[language] = {
        'stopwords': stopwords,
        'filler_words': filler_words,
        'pos_tagger': pos_tagger,
        'word_vectors': word_vectors,
        'language_model': language_model
    }
Example #25
path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
print "loading GoogleNews..."
start = time.time()
# vectors = Word2Vec(size=3e2, min_count=1)
# vectors.build_vocab([item for sublist in lists_of_tokens.values() for item in sublist])
# vectors.intersect_word2vec_format(path_to_wv, binary=True)
wv = gensim.models.KeyedVectors.load_word2vec_format(path_to_wv, binary=True)
# vectors = Word2Vec.load_word2vec_format(path_to_wv, binary=True)
print "finish loading GoogleNews, time_cost = %.2fs" % (time.time() - start)

# Load language model (takes approx. 8G RAM)
print "loading language model..."
start = time.time()
lm = LanguageModel(model_path=path_to_lm)
print "finish loading language model, time_cost = %.2fs" % (time.time() -
                                                            start)

# ######################
# ### PARAMETER GRID ###
# ######################
system_name_list = ['filippova', 'boudin', 'mehdad', 'tixier']
system_params_dict = {}

for system_name in system_name_list:
    # pos_filtering_grid = [True, False] if system_name == 'tixier' or system_name == 'mehdad' else [False]
    # cr_w_grid = [3, 10, 20] if system_name == 'tixier' else [3]
    cr_w_grid = [6, 12] if system_name == 'tixier' else [3]
    cr_overspanning_grid = [True, False] if system_name == 'tixier' else [False]
Example #26
    def print_or_value(test_id, calculated, value):
        if value == calculated:
            print(True)
            # print()
        else:
            print(test_id, calculated)
            print()

    sentence_pairs = [(["la", "casa"], ["the", "big", "house"]), (["casa", "pez", "verde"], ["green", "house"]), (["casa"], ["shop"])]
    t_f_given_e = ibmmodel1.train(sentence_pairs, 100)
    reversed_pairs = [(x, y) for y, x in sentence_pairs]
    t_e_given_f = ibmmodel1.train(reversed_pairs, 100)
    alignments = [ibmmodel1.get_phrase_alignment(t_f_given_e, t_e_given_f, fs, es) for fs, es in sentence_pairs]

    phrase_table = ibmmodel1.get_phrase_probabilities(alignments, sentence_pairs)
    lang_model = LanguageModel([e for _, e in sentence_pairs], n=2)
    ibmmodel1.print_phrase_table(phrase_table)
    # Tests:
    foreign_sentence = "la casa".split(" ")
    print_or_value(1, cur_cost([], foreign_sentence, phrase_table, lang_model), 1)
    print_or_value(2, cur_cost([(0,0,"the big")], foreign_sentence, phrase_table, lang_model), 0.041666666666666664)
    print_or_value(3, cur_cost([(1,1,"shop")], foreign_sentence, phrase_table, lang_model), 0.125)
    print_or_value(4, cur_cost([(0,0,"the big house")], foreign_sentence, phrase_table, lang_model), 0.013888888888888888)
    print_or_value(5, cur_cost([(0,0,"the big"),(1,1,"shop")], foreign_sentence, phrase_table, lang_model), 0.003472222222222222)

    phrase_to_max_prob = get_phrase_to_max_prob(phrase_table)
    print_or_value(6, future_cost([], foreign_sentence, phrase_to_max_prob), 0.25)
    print_or_value(7, future_cost([(0,0,"the big")], foreign_sentence, phrase_to_max_prob), 0.5)
    print_or_value(8, future_cost([(1,1,"shop")], foreign_sentence, phrase_to_max_prob), 0.5)
    print_or_value(9, future_cost([(0,0,"the big house")], foreign_sentence, phrase_to_max_prob), 0.5)
    print_or_value(10, future_cost([(0,0,"the big"),(1,1,"shop")], foreign_sentence, phrase_to_max_prob), 1)