def test_d2_1_gp():
    global food_corpus, natr_corpus

    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)

    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])

    food_test = sum([['<s>'] + x + ['</s>']
                     for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]],
                    [])
    natr_test = sum([['<s>'] + x + ['</s>']
                     for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]],
                    [])

    food_lm = Laplace(3)
    natr_lm = Laplace(3)

    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    eq_(int(evaluate.get_perplexity(food_lm, food_test[:2500])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:2500])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:2500])), 5354)
Example #2
def main(argv):
    """Trains an nltk language model.

    Loads in files of normalized text, partitions them into a train partition
    (3/4 of data) and a test partition (last 1/4 of data). Uses Laplace
    smoothing for unseen ngrams.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    normalized_data = load_normalized_data(FLAGS.language, FLAGS.data_source,
                                           FLAGS.pass_valid, FLAGS.experiment)
    train_partition, test_partition = partition_data(normalized_data)
    train_ngrams, vocab = padded_everygram_pipeline(2, train_partition)
    test_ngrams, _ = padded_everygram_pipeline(2, test_partition)
    language_model = Laplace(2)
    language_model.fit(train_ngrams, vocab)

    avg_perp, count = compute_avg_perplexity(test_ngrams, language_model)
    print("\n----------------------------\n"
          "Language Model Parameters:\n"
          f"\tLanguage={FLAGS.language}\n"
          f"\tData Sources={FLAGS.data_source}\n"
          f"\tPass Valid={FLAGS.pass_valid}\n"
          f"\tExperiment={FLAGS.experiment}\n"
          "----------------------------\n")
    print(f"Average perplexity across {count} ngrams:\t{avg_perp}")
Example #3
 def test_d2_1_gp(self):
     nltk.download('punkt')
     food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
     natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)
     food_train, food_vocab = padded_everygram_pipeline(
         3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
     natr_train, natr_vocab = padded_everygram_pipeline(
         3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])
     food_test = sum(
         [['<s>'] + x + ['</s>']
          for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
     natr_test = sum(
         [['<s>'] + x + ['</s>']
          for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])
     food_lm = Laplace(3)
     natr_lm = Laplace(3)
     food_lm.fit(food_train, food_vocab)
     natr_lm.fit(natr_train, natr_vocab)
     self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])),
                      7318)
     self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])),
                      7309)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])),
                      5222)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])),
                      5354)
Example #4
File: q2.py Project: nazaninsbr/NLP-UT
def calculate_word_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                all_news_texts.extend(sent)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
Example #5
    def compute_pp(self, n, tokenized_train, tokenized_test):
        train_data, train_vocab = padded_everygram_pipeline(
            n, tokenized_train)
        test_data, _ = padded_everygram_pipeline(n, tokenized_test)
        model = Laplace(1)
        model.fit(train_data, train_vocab)

        s = 0
        for i, test in enumerate(test_data):
            p = model.perplexity(test)
            s += p

        perplexity = s / (i + 1)
        return perplexity
Example #6
def train_ngram_lm(tokenized_text, models, n=3, a=0.0015, unk_cutoff=10, discount=0.1):
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    vocab = Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
    lms = []
    for model in models:
        training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
        if model == 'Kneser Ney':
            lm = MKneserNeyInterpolated(order=n, discount=discount, vocabulary=vocab)
        elif model == 'WBI':
            lm = MWittenBellInterpolated(order=n, vocabulary=vocab)
        elif model == 'Lidstone':
            lm = MLidstone(gamma=a, order=n, vocabulary=vocab)
        lm.fit(training_ngrams)
        lms += [lm]
    return lms
Example #7
File: ex_ngram.py Project: mfkiwl/hdlp
def train_ngram_model(src_dict: dict, ngram_order=N_GRAM_ORDER):
    print(f"Training {ngram_order}-gram model on train dataset...")
    train_data, padded_sents = padded_everygram_pipeline(
        ngram_order, src_dict["train"])
    model = MLE(ngram_order)
    model.fit(train_data, padded_sents)
    return model
Example #8
File: format.py Project: minhptx/spade
    def fit(self, dirty_df: pd.DataFrame, col):
        tokenized_text = [
            word_tokenize(value) for value in dirty_df[col].values
        ]

        train_data, padded_sents = padded_everygram_pipeline(2, tokenized_text)
        self.model.fit(train_data, padded_sents)
Example #9
    def __init__(self,
                 training_set: List[str],
                 n_param: int = 3,
                 max_predict=4):
        super().__init__()
        '''
        Initialize the completions for the test phrase
        '''

        # convert sentence to list[words] using tokenizer
        # self.tokenizer = ToktokTokenizer()

        training_ngrams, padded_sentences = padded_everygram_pipeline(
            n_param,
            #list(map(self.tokenizer.tokenize, training_set)),
            list(map(wordpunct_tokenize, training_set)),
        )

        # print(len(training_ngrams))
        # temp = list(training_ngrams)
        # for i in range(10):
        #     print(list(temp[i]))

        self.model_obj = MLE(order=n_param)
        self.model_obj.fit(training_ngrams, padded_sentences)
        print('Vocab length: {}'.format(len(self.model_obj.vocab)))
        print('Counts: ', self.model_obj.counts)

        self.max_predict = max_predict
Example #10
def generate_sentence(LM3_MLE, text):
    min_per = float('inf')
    min_text = ""
    for i in range(5):
        starting_text = ["<s>"]
        starting_text.append(text)
        generated = generate_sent(LM3_MLE, starting_text)
        test_tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(generated)]
        test_data, _ = padded_everygram_pipeline(LM3_MLE.order, test_tokenized_text)

        sentences = []
        for test in test_data:
            for each in list(test):
                sentences.append(each)

        ngram_list = []
        for each in sentences:
            if len(each) == LM3_MLE.order and (each[0] != "<s>" and each[-1] != "</s>"):
                ngram_list.append(each)

        if len(ngram_list) > 0:
            perplexity = LM3_MLE.perplexity(ngram_list)
            if perplexity < min_per:
                min_per = perplexity
                min_text = generated
            elif perplexity == min_per and len(generated) > len(min_text):
                min_text = generated
    return text + " " + min_text, min_per
Example #11
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=1):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """

    train, words = padded_everygram_pipeline(n, corpus.copy())
    vocab = Vocabulary(words, unk_cutoff)

    if (model == Lidstone) and (gamma is not None):
        model = Lidstone(gamma, n, vocab)
        model.fit(train)
    elif model == MLE:
        model = mle.train_MLE_model(corpus, n)
    elif model == Laplace:
        model = Laplace(n, vocab)
        model.fit(train)

    return model
Example #12
File: q2.py Project: nazaninsbr/NLP-UT
def calculate_characters_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                for word in sent:
                    all_chars = [c for c in word]+[' ']
                    all_news_texts.extend(all_chars)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
Example #13
File: nextWord.py Project: arrgee23/ml
def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except:  # Use a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Pad the sentences and build everygrams up to order n
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train an MLE model of order n (here n=5)

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
Example #14
def create_LanguageModel(docs, model_type="MLE", ngram=3):
    global _ngram
    _ngram = ngram
    tokenized_text = []

    new_docs = preprocess(docs)

    for d in new_docs:
        text = sent_tokenize(d, language="turkish")
        for sent in text:
            temp = []
            for i in word_tokenize(sent, language="turkish"):
                temp.append(i.lower())
            tokenized_text.append(temp)

    training_ngrams, vocab = padded_everygram_pipeline(ngram, tokenized_text)

    if model_type == "MLE":
        model = MLE(ngram)  #, vocabulary=Vocabulary(vocab))
        model.fit(training_ngrams, vocab)
        # print(model.vocab)
        return model
    elif model_type == "KneserNeyInterpolated":
        model = KneserNeyInterpolated(ngram)
        model.fit(training_ngrams, vocab)  # padded_sents)
        # print(model.vocab)
        return model
    else:
        print("Unkown Model Type")
        return 0
Example #15
 def _parallel_load_genre_to_datadict(self, genre):
     """ DOESN'T WORK """
     scripts = self.all_scripts_for_genre(self.df, genre)
     # print("processing :", genre, len(scripts))
     tokenized = self.tokenize_scripts(scripts, genre)
     ngrams, vocab = padded_everygram_pipeline(self.n, tokenized)
     self.data_dict[genre] = (ngrams, vocab)
Example #16
 def fit(self, sequences: List[List]):
     train, vocab = padded_everygram_pipeline(self.config.GRAM_SIZE, sequences)
     model = MLE(self.config.GRAM_SIZE)
     model.fit(train, vocab)
     self.model = model
     if self.config.SAVE_PATH:
         self.save_model(self.config.SAVE_PATH)
Example #17
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """
    lm = None
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)
    if model == MLE:
        lm = model(n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Lidstone:
        if gamma is None:
            raise Exception('Please enter a value for gamma')
        else:
            lm = Lidstone(gamma, order=n, vocabulary=vocab)
            lm.fit(ngrams)
    elif model == Laplace:
        lm = Laplace(order=n, vocabulary=vocab)
        lm.fit(ngrams)
    else:
        raise Exception('Wrong model in train_LM_model')
    return lm
Example #18
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only). If model=Lidstone, this
    argument must be provided
    :param unk_cutoff: the threshold below which a word is considered unknown and replaced by <UNK>
    :return: a trained model
    """
    if model not in [MLE, Laplace, Lidstone]:
        raise TypeError("Unknown model type! Supported types: (MLE, Lidstone, Laplace)")

    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    params = {
        "order": n,
        "vocabulary": vocab,
    }
    if model == Lidstone:
        params["gamma"] = gamma
    ist_model = model(**params)
    ist_model.fit(ngrams)

    return ist_model
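For reference, an illustrative call to train_LM_model above might look like the following; the toy corpus and gamma value are made up for the example, and the imports are the standard nltk.lm classes the function already relies on:

from nltk.lm import MLE, Laplace, Lidstone, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

toy_corpus = [["the", "cat", "sat"], ["the", "dog", "sat"]]
bigram_lm = train_LM_model(toy_corpus, Lidstone, n=2, gamma=0.1)
print(bigram_lm.perplexity([("the", "cat")]))  # perplexity of a single bigram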
Example #19
    def tokenize_text(self, text):
        tokenized_text = [
            list(word_tokenize(sent)) for sent in sent_tokenize(text)
        ]

        train_data, padded_sents = padded_everygram_pipeline(
            NGRAM, tokenized_text)
        return padded_sents
Example #20
    def train_P(self):
        n = 3
        train_data, padded_sents = padded_everygram_pipeline(n, self.X_train)

        language_model = MLE(n)
        language_model.fit(train_data, padded_sents)
        language_model.vocab()
        return language_model
Example #21
    def build_ngram_lm(self, train):
        if not train: return None

        n = 5  # up to 5 gram language model
        train, vocab = padded_everygram_pipeline(n, train)
        model = KneserNeyInterpolated(n)
        model.fit(train, vocab)
        return model
Example #22
def train_ngram_lm(dataset, data, ngram=3, gamma=0.5):
    print(f'[!] max {ngram}-gram, Lidstone smoothing with gamma {gamma}')
    train, vocab = padded_everygram_pipeline(ngram, data)
    lm = Lidstone(gamma, ngram)
    lm.fit(train, vocab)
    with open(f'./data/{dataset}/lm.pkl', 'wb') as f:
        pickle.dump(lm, f)
    print(f'[!] ngram language model saved into ./data/{dataset}/lm.pkl')
Example #23
File: query_lms.py Project: halolimat/work
    def score(self, patient_id):

        # txt="Cancer refers to any one of a large number of diseases characterized by the development of abnormal cells that divide uncontrollably and have the ability to infiltrate and destroy normal body tissue. Cancer often has the ability to spread throughout your body. Cancer is the second-leading cause of death in the world."

        # ===================================
        import json
        if False:
            hits = self.es.search_list("patient_id", [patient_id])
            with open("txt", "w") as f:
                json.dump(hits, f)
        else:
            with open("txt") as f:
                hits = json.load(f)

        seq = []
        for hit in hits:
            for page in hit["_source"]["doc_pages"]:
                doc = self.nlp(page["page_contents"])
                seq += [tuple(token.text for token in sent) for sent in doc.sents]

        m_names = list(self.models)
        scores = []

        test_data, _ = padded_everygram_pipeline(2, seq)

        seq = []
        for i, test in enumerate(test_data):
            seq.append(tuple([x for x in test]))

        for mname in self.models:
            # print(mname)
            # int_scores=[]
            # for i, test in enumerate(test_data):
            #     pp=self.models[mname].perplexity(test)
            #     scores.append(pp)
            #     int_scores.append(pp)
            #
            # print(min(int_scores))
            # scores.append(min(int_scores))

            start = timeit.default_timer()

            # the winning model is the model with the lowest perplexity
            pp = self.models[mname].perplexity(seq)
            scores.append(pp)
            print(mname)
            print(pp)
            print("----")
            print('Time: ', timeit.default_timer() - start)

        m = min(scores)

        pred_class = []
        for idx in range(len(scores)):
            if scores[idx] == m:
                pred_class.append(m_names[idx])

        return pred_class
Example #24
def getEveryModel(n: int, text: List, ngrams):
    """ get mixed-n model """
    lm = MLE(n)

    train, vocab = padded_everygram_pipeline(n, text)

    lm.fit(train, vocab)

    return lm
Example #25
    def create_model_as_dict(self, corpus_name):
        # USAGE: dict(model["vale", "la"])['revancha']
        #                   previous_words    word

        # Create a placeholder for model
        # model = defaultdict(lambda: defaultdict(lambda: 0))

        print('reading corpus')
        reader = PlaintextCorpusReader(CORPUS_DIR, corpus_name)
        print('corpus read')
        train, vocab = padded_everygram_pipeline(self.ngram, reader.sents())
        print('everygram completed')

        model = dict()
        appearences = dict()

        # Each element of list(train) is the list of all the n-grams (1, 2, 3, ...) of one sentence
        i = 1
        print('initializing')
        for everygram in train:
            # print('i:', i)
            # i += 1
            # j = 1
            for gram in everygram:
                # print('j:', j)
                # j += 1
                if len(gram) == 1:
                    if gram[0] not in appearences:
                        appearences[gram[0]] = 1
                    else:
                        appearences[gram[0]] += 1
                elif len(gram) == 2:
                    if not gram[0] in model:
                        model[gram[0]] = dict()
                    if gram[1] not in model[gram[0]]:
                        model[gram[0]][gram[1]] = 1
                    else:
                        model[gram[0]][gram[1]] += 1
                elif len(gram) == 3:
                    if not (gram[0], gram[1]) in model:
                        model[(gram[0], gram[1])] = dict()
                    if gram[2] not in model[(gram[0], gram[1])]:
                        model[(gram[0], gram[1])][gram[2]] = 1
                    else:
                        model[(gram[0], gram[1])][gram[2]] += 1

        # print('counted everything', model)
        for w1 in model:
            # print('summing', w1)
            total_count = float(sum(model[w1].values()))
            for w3 in model[w1]:
                # print('computing', w3)
                model[w1][w3] /= total_count

        print('converted counts to probabilities')

        return appearences, model
Example #26
 def test_padded_everygram_pipeline(self):
     expected_train = [[("<s>", ), ("<s>", "a"), ("a", ), ("a", "b"),
                        ("b", ), ("b", "c"), ("c", ), ("c", "</s>"),
                        ("</s>", )]]
     expected_vocab = ["<s>", "a", "b", "c", "</s>"]
     train_data, vocab_data = padded_everygram_pipeline(
         2, [["a", "b", "c"]])
     self.assertEqual([list(sent) for sent in train_data], expected_train)
     self.assertEqual(list(vocab_data), expected_vocab)
Example #27
    def fit(self, text: List[str], order: int):
        self.model = Laplace(order)
        self.order = order
        train_data, padded_sents = padded_everygram_pipeline(order, text)

        print('Fitting n-gram model', file=sys.stderr)
        self.model.fit(train_data, padded_sents)
        print(f'Vocabulary size: {len(self.model.vocab)}', file=sys.stderr)
        return self
Example #28
def create_LanguageModel(Docs, model_type, ngram):
    text = " ".join(Docs)
    text = text.replace("\\n", " ")
    tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)]
    train_data, padded_sents = padded_everygram_pipeline(ngram, tokenized_text)
    model = MLE(ngram)
    if model_type != "MLE":
        model = KneserNeyInterpolated(ngram)
    model.fit(train_data, padded_sents)
    return model
Example #29
def create_and_fit_model(corpus):
    # Receives a corpus tokenized by sentence and by word.
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, corpus)

    # Create the model
    model = MLE(NGRAM)

    # Fit it to the data
    model.fit(train_data, padded_sents)

    return model
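As a quick usage sketch (assuming the create_and_fit_model function above and an NGRAM constant, e.g. NGRAM = 3, are in scope; the tiny corpus is invented for illustration), the fitted MLE model exposes the standard nltk.lm scoring and generation API:

corpus = [["the", "cat", "sleeps"], ["the", "dog", "barks"]]
model = create_and_fit_model(corpus)

print(model.score("cat", ("<s>", "the")))           # P(cat | <s> the) under the MLE model
print(model.generate(3, text_seed=["<s>", "the"]))  # sample a few words from the model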
Example #30
 def create_model(self, model_nm):
     self.model = {
         "lidstone": Lidstone(0.5, self.ngram_order),
         "kneserney": KneserNeyInterpolated(self.ngram_order),
         "wittenbell": WittenBellInterpolated(self.ngram_order)
     }[model_nm]
     train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
     vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
     print("Creating ngram...")
     self.model.fit(train, vocab)
     print("done")
Example #31
 def test_padded_everygram_pipeline(self):
     expected_train = [
         [
             ("<s>",),
             ("a",),
             ("b",),
             ("c",),
             ("</s>",),
             ("<s>", "a"),
             ("a", "b"),
             ("b", "c"),
             ("c", "</s>"),
         ]
     ]
     expected_vocab = ["<s>", "a", "b", "c", "</s>"]
     train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
     self.assertEqual([list(sent) for sent in train_data], expected_train)
     self.assertEqual(list(vocab_data), expected_vocab)