from nltk import sent_tokenize, word_tokenize
from nltk.util import pad_sequence


def next_word(model, text):
    # Lower-case and tokenize the input; use the first sentence as the context.
    tokenized_test = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(text)
    ]
    context = list(
        pad_sequence(tokenized_test[0],
                     pad_left=True,
                     left_pad_symbol="<s>",
                     pad_right=False,
                     right_pad_symbol="</s>",
                     n=2))
    # NLTK language models expect a context of length order - 1.
    context = tuple(context[-(model.order - 1):])

    # Track the three highest-scoring candidate words.
    best_score, best_word = float("-inf"), '<unk>'
    second_score, second_word = float("-inf"), '<unk>'
    third_score, third_word = float("-inf"), '<unk>'
    excluded = {'<s>', ',', ';', ':', "’"}
    for word in model.vocab:
        if word in excluded:
            continue
        score = model.logscore(word, context)
        if score > best_score:
            third_score, third_word = second_score, second_word
            second_score, second_word = best_score, best_word
            best_score, best_word = score, word
        elif score > second_score:
            third_score, third_word = second_score, second_word
            second_score, second_word = score, word
        elif score > third_score:
            third_score, third_word = score, word
    return [best_word, second_word, third_word]


def padding(line, start, end):
    # Strip zero-width non-joiners, split on spaces, and pad both ends of the sentence.
    line = line.rstrip().replace('\u200c', '').split(" ")
    line = list(
        pad_sequence(line,
                     pad_left=True,
                     left_pad_symbol=start,
                     pad_right=True,
                     right_pad_symbol=end,
                     n=2))
    return line
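A minimal usage sketch for next_word; the toy corpus, bigram order, and variable names below are illustrative assumptions, not part of the original snippet:

# Illustrative only: fit a bigram MLE model on a toy corpus, then ask for next-word candidates.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

corpus = [["this", "is", "a", "test"], ["this", "is", "another", "test"]]
train, vocab = padded_everygram_pipeline(2, corpus)
lm = MLE(2)
lm.fit(train, vocab)
print(next_word(lm, "This is"))  # three candidate continuations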
Example #3
    def log_score(self, progression):
        # Pad the sequence, then sum the log-probability of each item given its (order-1)-gram context.
        progression = list(pad_sequence(progression, self.order, pad_left=True,
                                        left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>"))
        contexts = list(ngrams(progression, self.order-1))[:-1]
        words = progression[self.order-1:]

        total_log_score = 0
        for word, context in zip(words, contexts):
            score = self.model.score(word, context)
            log_score = np.log(score)
            total_log_score += log_score

        return total_log_score
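The method above reads self.order and self.model; a hypothetical wrapper class it could belong to (the class name, constructor, and training call are assumptions for illustration, not the original author's code):

# Hypothetical wrapper assumed by log_score above: an n-gram order plus a fitted nltk.lm model.
import numpy as np
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.util import ngrams, pad_sequence

class ProgressionModel:
    def __init__(self, sequences, order=2):
        # sequences: a list of token lists (e.g. chord progressions)
        self.order = order
        train, vocab = padded_everygram_pipeline(order, sequences)
        self.model = MLE(order)
        self.model.fit(train, vocab)

    # log_score(self, progression) from the snippet above would sit here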
Example #4
import os
import re

from nltk import word_tokenize
from nltk.lm import WittenBellInterpolated
from nltk.util import everygrams, pad_sequence


def train_texts(train_files, exclude, extension, n_ngram):
    # Training data file
    # train_data_file = "./train/treino.txt"

    # read training data
    #train_data_files = glob.glob('./train/*' + extension)
    train_data_files = train_files.copy()

    if exclude:
        print("Training files before removing the test item:", train_data_files)
        train_data_files.remove(exclude)

    print("Arquivos utilizados no treino: ", train_data_files)

    train_texts = ''

    for train_data_file in train_data_files:

        try:
            with open(os.path.join("./train", train_data_file), encoding='utf-8') as f:
                train_text = f.read().lower()
        except OSError:
            print("Could not read the training file " + train_data_file + " (extension ." + extension + ") in the train directory.")
            continue

        # apply preprocessing (remove text inside square and curly brackets and remove punctuation)
        train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
        train_text = re.sub(r'[^\w\s]', "", train_text)
        train_texts += train_text

    # pad the text and tokenize
    training_data = list(pad_sequence(word_tokenize(train_texts), n_ngram,
                                      pad_left=True,
                                      left_pad_symbol="<s>"))

    print("training_data", training_data)

    # generate ngrams
    ngrams = list(everygrams(training_data, max_len=n_ngram))
    print("Number of ngrams:", len(ngrams))

    # build ngram language models
    model = WittenBellInterpolated(n_ngram)
    model.fit([ngrams], vocabulary_text=training_data)
    print(model.vocab)

    return model
Example #5
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.util import everygrams, pad_sequence

if __name__ == '__main__':
    text = ['ධිර පරීක්ෂණ පෞද්ගලික අංශයෙන් සිදුකිරීම']
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    words = tokenizer.tokenize(text[0])

    padded_sent = list(
        pad_sequence(words,
                     pad_left=True,
                     left_pad_symbol="<s>",
                     pad_right=True,
                     right_pad_symbol="</s>",
                     n=2))
    all_grams = list(everygrams(padded_sent, max_len=2))
    #bi = list(bigrams(words))
    # Preprocess the tokenized text for 3-gram language modelling
    n = 3
    # padded_everygram_pipeline expects a list of tokenized sentences, so wrap the single sentence
    train_data, padded_sents = padded_everygram_pipeline(n, [words])
    model = MLE(n)
    model.fit(train_data, padded_sents)
    print(len(model.vocab))
    print(model.vocab.lookup(padded_sent))
    print(model.counts)
    print(padded_sent)
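As a short, hedged follow-up using the model fitted above (the seed value is arbitrary), the MLE model can also sample new tokens:

# Illustrative: generate a few tokens from the fitted MLE model
print(model.generate(5, random_seed=42))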
Example #6
from nltk.util import ngrams, pad_sequence


def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')
    # This method was an empty stub in the assignment skeleton;
    # the implementation follows below.

    '''
    Label the training data:
    split each sentence into 4-grams,
    with padding at the beginning and the end.
    '''
    malay_corpus = list()
    indon_corpus = list()
    tamil_corpus = list()
    vocabulary = dict() # total vocabulary
    
    fp = open(in_file, "r")
    while True:
        line = fp.readline()
        if not line: 
            break

        line_clean = preprocess(line)

        # Exclude label, split sentence
        label = line_clean.split(" ", 1)[0]
        sentence = line_clean.split(" ", 1)[1]

        # remove excess space due to deletion of punctuations and digits
        integrated = ' '.join(sentence.split())

        # Split sentence into single characters
        text = list(integrated)
        
        # 4-gram LM with beginning and end padding
        padded_sent = list(pad_sequence(text,
                        pad_left=True, left_pad_symbol="<s>",
                        pad_right=True, right_pad_symbol="</s>",
                        n=4))
        padded_list = list(ngrams(padded_sent, n=4))
            
        # Add-one smoothing: start every unseen 4-gram with a count of 1 for each language
        for item in padded_list:
            if item not in vocabulary:
                vocabulary[item] = [1, 1, 1]
        
        if (label == "malaysian"):
            malay_corpus.append(padded_list)
        elif (label == "indonesian"):
            indon_corpus.append(padded_list)
        elif (label == "tamil"):
            tamil_corpus.append(padded_list)
    
    fp.close()

    # size of distinct 4-grams in training data
    tot = len(vocabulary)

    '''
    vocabulary dictionary
    key: a 4-gram such as ('S', 'a', 'm', 'e')
    value: counts [#(malay), #(indon), #(tamil)]; a fourth entry for the
           uniform random/other model is added during normalization below
    '''
    cnt_m = tot
    cnt_i = tot
    cnt_t = tot
    for sentence in malay_corpus:
        for chars in sentence:
            cnt_m += 1
            vocabulary[chars][0] += 1   
    for sentence in indon_corpus:
        for chars in sentence:
            cnt_i += 1
            vocabulary[chars][1] += 1
    for sentence in tamil_corpus:
        for chars in sentence:
            cnt_t += 1
            vocabulary[chars][2] += 1

    # Normalize counts to add-one-smoothed probabilities; the last entry (1/tot) is the uniform "other" model
    for key in vocabulary.keys():
        lst = vocabulary[key]
        new_lst = [lst[0]/cnt_m, lst[1]/cnt_i, lst[2]/cnt_t, 1/tot]
        vocabulary[key] = new_lst
    return vocabulary
Example #7
import math

from nltk.util import ngrams, pad_sequence


def test_LM(in_file, out_file, LM):
    """
    test the language models on new strings
    each line of in_file contains a string
    you should print the most probable label for each string into out_file
    """
    print("testing language models...")
    # This is an empty method
    # Pls implement your code in below

    '''
    Multiply the probabilities of the 4-grams in each string and return the
    label (i.e., malaysian, indonesian, or tamil) that gives the highest product.
    Ignore a four-gram if it is not found in the LMs.
    '''
    fp = open(in_file, "r")
    fo = open(out_file, "w")
    while True:
        line = fp.readline()
        if not line: break

        line_clean = preprocess(line)

        # remove excess space due to deletion of punctuations and digits
        integrated = ' '.join(line_clean.split())

        # Split sentence into single characters
        text = list(integrated)
        # 4-gram LM with beginning and end padding
        padded_sent = list(pad_sequence(text,
                                        pad_left=True, left_pad_symbol="<s>",
                                        pad_right=True, right_pad_symbol="</s>",
                                        n=4))
        padded_list = list(ngrams(padded_sent, n=4))
        
        # The initial probability would be 1, but we work in log space
        # (products of many small probabilities underflow), so start at log(1) = 0.
        prob_m = 0  # log-probability under the Malaysian model
        prob_i = 0  # log-probability under the Indonesian model
        prob_t = 0  # log-probability under the Tamil model
        prob_r = 0  # log-probability under the uniform "random/other" model, used to detect other languages
        num_r = 0   # number of unseen four-grams
        num = 0     # total number of four-grams
        for four_gram in padded_list:
            num += 1
            if four_gram not in LM.keys():
                # Ignore the four-gram if it is not found in the LMs.
                # pass
                num_r += 1
            else:
                # multiply the probabilities of the 4-grams for this string
                # but in practice we calculate logarithm by addition
                prob_m += math.log(LM[four_gram][0])
                prob_i += math.log(LM[four_gram][1])
                prob_t += math.log(LM[four_gram][2])
                prob_r += math.log(LM[four_gram][3])
            
        '''
        Return the label (malaysian, indonesian, or tamil) that gives the
        highest product, or "other" when too many 4-grams are unseen.
        The label is prepended to the sentence.
        '''
        line_w = ""
        if ((num_r/num > 0.7) or prob_r >= max(prob_m, prob_i, prob_t)):
            line_w = "other " + line
        elif (prob_m >= prob_i) and (prob_m >= prob_t):
            line_w = "malaysian " + line
        elif (prob_i >= prob_m) and (prob_i >= prob_t):
            line_w = "indonesian " + line
        elif (prob_t >= prob_m) and (prob_t >= prob_i):
            line_w = "tamil " + line
        # else:
        #     line_w = "other " + line
        
        # write predicted result into output file
        fo.write(line_w)
    fp.close()
    fo.close()
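A minimal driver sketch for build_LM and test_LM above (the file names are placeholders, and the preprocess helper used by both functions is assumed to be defined elsewhere):

# Illustrative driver; file names are placeholders.
if __name__ == "__main__":
    LM = build_LM("input.train.txt")
    test_LM("input.test.txt", "input.predict.txt", LM)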
Example #8
import os
import re

import numpy as np
import plotly.graph_objects as go
from nltk import word_tokenize
from nltk.util import pad_sequence
from scipy.ndimage import gaussian_filter


def test_text(model, extension, n_ngram, test_data_file, all_files):
    print("File used for testing:", test_data_file)
    if (all_files):
        path_file = os.path.join("./train",test_data_file)
    else:
        path_file = os.path.join("./test",test_data_file)
    # Read testing data
    with open(path_file, encoding='utf-8') as f:
        test_text = f.read().lower()

    test_text = re.sub(r'[^\w\s]', "", test_text)

    # Tokenize and pad the text
    testing_data = list(pad_sequence(word_tokenize(test_text), n_ngram,
                                     pad_left=True,
                                     left_pad_symbol="<s>"))
    print("Length of test data:", len(testing_data))
    print("testing_data", testing_data)

    # assign scores
    scores = []
    for i, item in enumerate(testing_data[n_ngram - 1:]):
        s = model.score(item, testing_data[i:i + n_ngram - 1])
        scores.append(s)

    scores_np = np.array(scores)

    # set width and height
    width = 8
    height = np.ceil(len(testing_data) / width).astype("int32")
    print("Width, Height:", width, ",", height)

    # copy scores to rectangular blank array
    a = np.zeros(width * height)
    a[:len(scores_np)] = scores_np
    diff = len(a) - len(scores_np)

    # apply gaussian smoothing for aesthetics
    a = gaussian_filter(a, sigma=1.0)

    # reshape to fit rectangle
    a = a.reshape(-1, width)

    # format labels
    labels = [" ".join(testing_data[i:i + width]) for i in range(n_ngram - 1, len(testing_data), width)]
    labels_individual = [x.split() for x in labels]
    labels_individual[-1] += [""] * diff
    labels = [f"{x:60.60}" for x in labels]

    # create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=a, x0=0, dx=1,
        y=labels, zmin=0, zmax=1,
        customdata=labels_individual,
        hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
        colorscale="burg"))
    fig.update_layout({"height": height * 28, "width": 1000, "font": {"family": "Courier New"}})
    fig['layout']['yaxis']['autorange'] = "reversed"
    #fig.show()
    fig.write_html(file='./public/'+test_data_file+'.html')
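A hedged usage sketch pairing test_text with train_texts from Example #4 (the file names and the n-gram order are illustrative assumptions):

# Illustrative: train on the files in ./train except the held-out one, then score that file.
train_files = ["doc1.txt", "doc2.txt", "doc3.txt"]
model = train_texts(train_files, exclude="doc2.txt", extension="txt", n_ngram=4)
test_text(model, "txt", 4, "doc2.txt", all_files=True)  # writes ./public/doc2.txt.html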
Example #9
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams
from nltk import FreqDist
from nltk.util import pad_sequence

#Create a corpus
sentCorpus = ['I am Sam','Sam I am', 'I do not like green eggs and ham']

#Pad each sentence and collect the words into one flat list
words = []
for sentence in sentCorpus:
    #Split the string on whitespace
    sen = sentence.split(' ')
    #Pad either end of the sentence with start/end markers
    sent = list(pad_sequence(sen, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=2))
    #Extend the flat word list with the padded sentence
    words.extend(sent)
print(words)

#Unigrams, bigrams, trigrams, quadgrams
listU = words
listB = list(bigrams(words))
listT = list(trigrams(words))
listQ = list(ngrams(words,4))

#Get total number of unigrams, bigrams, trigrams and quadgrams
cntU = len(listU)
cntB = len(listB)
cntT = len(listT)
cntQ = len(listQ)
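FreqDist is imported above but not used; an illustrative follow-up showing how it could count the collected bigrams:

#Illustrative follow-up: count how often each bigram occurs
fdistB = FreqDist(listB)
print(fdistB.most_common(3))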
Example #10
# Flask view function (the @app.route decorator is not shown in this excerpt)
def success():
    if request.method == 'POST':
        # get the uploaded file and save it to the uploads folder
        a = request.form.getlist('ano')
        f = request.files['file']
        f.save(os.path.join(app.config['UPLOAD_FOLDER'], f.filename))

        # analyzing the stored files:
        ano_salvo = str(a[0])
        resultado_analise = next(
            os.walk("/home/{nome_de_usuario}/mysite/resultados/"))
        path, dirs, files = next(
            os.walk("/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo +
                    "/"))
        file_count = len(files)
        texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
        valores_maximos = []
        valores_medios = []
        valores_arquivo = []

        j = 0
        while j < file_count:
            # Variables:
            texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
            texto_salvo = parser.from_file(
                "/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo + "/" +
                files[j])
            texto_fornecido = parser.from_file(texto)

            # Fill "train_text" with the content of "texto_salvo" so it can be analyzed later
            train_text = texto_salvo['content']
            # apply preprocessing (remove text inside square and curly brackets and remove punctuation)
            train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
            train_text = re.sub(r'[^\w\s]', "", train_text)

            # set the n-gram order
            n = 5

            # pad the text and tokenize
            training_data = list(
                pad_sequence(word_tokenize(train_text),
                             n,
                             pad_left=True,
                             left_pad_symbol="<s>"))

            # generate ngrams
            ngrams = list(everygrams(training_data, max_len=n))

            # build ngram language models
            model = WittenBellInterpolated(n)
            model.fit([ngrams], vocabulary_text=training_data)

            # Fill "test_text" with the content of the submitted file so it can be compared with the training file
            test_text = texto_fornecido['content']
            test_text = re.sub(r'[^\w\s]', "", test_text)

            # Tokenize and pad the text
            testing_data = list(
                pad_sequence(word_tokenize(test_text),
                             n,
                             pad_left=True,
                             left_pad_symbol="<s>"))

            # assign scores
            scores = []
            for i, item in enumerate(testing_data[n - 1:]):
                s = model.score(item, testing_data[i:i + n - 1])
                scores.append(s)

            scores_np = np.array(scores)

            # set width and height
            width = 8
            height = np.ceil(len(testing_data) / width).astype("int64")

            # copy scores to a rectangular blank array
            a = np.zeros(width * height)
            a[:len(scores_np)] = scores_np
            diff = len(a) - len(scores_np)

            # apply gaussian smoothing for aesthetics
            a = gaussian_filter(a, sigma=1.0)

            # reshape to fit the rectangle
            a = a.reshape(-1, width)

            # format labels
            labels = [
                " ".join(testing_data[i:i + width])
                for i in range(n - 1, len(testing_data), width)
            ]
            labels_individual = [x.split() for x in labels]
            labels_individual[-1] += [""] * diff
            labels = [f"{x:60.60}" for x in labels]

            # create the heatmap for the visual result
            fig = go.Figure(data=go.Heatmap(
                z=a,
                x0=0,
                dx=1,
                y=labels,
                zmin=0,
                zmax=1,
                customdata=labels_individual,
                hovertemplate=
                '%{customdata} <br><b>Pontuacao:%{z:.3f}<extra></extra>',
                colorscale="burg"))
            fig.update_layout({
                "height": height * 40,
                "width": 1000,
                "font": {
                    "family": "Courier New"
                }
            })
            # creating the visual result:
            #plotly.offline.plot(fig, filename='/home/Allberson/mysite/resultados/resultado.html', auto_open=False)

            # Store the score data to show later
            valores_scores = np.array(scores)

            # Thresholds for classifying the scores:
            buscar_max = 0.9  # high level of plagiarism

            buscar_med = 0.8  # above-average level

            # count the positions with the highest copy scores
            maximo = np.where(valores_scores > buscar_max)[0]
            medio = np.where(
                valores_scores > buscar_med)[0]  # not used at the moment
            valores_maximos.insert(j, len(maximo))
            valores_medios.insert(j, len(medio))  # not used at the moment
            valores_arquivo.insert(j, files[j])

            j = j + 1

        # find the file with the highest level of similarity:
        val_maximo = np.array(valores_maximos)
        val_medio = np.array(valores_medios)
        busc_val_max = 1090
        busc_val_med = 500
        maxx = np.where(val_maximo > busc_val_max)[0]
        medd = np.where(val_medio > busc_val_med)[0]

        # Building the web page response
        if len(maxx) == 0:
            ano = ano_salvo
            resultado_false = "No file was found that matches yours"
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' +
                      f.filename)  # remove the file uploaded by the user
            return render_template("resultado_page.html",
                                   name=f.filename,
                                   resultado_neg=resultado_false,
                                   valor_ano=ano)
        elif len(maxx) > 0:
            ano = ano_salvo
            tot_projetos = file_count
            resultado_mensagem = 'We found a project with a high degree of similarity.'
            valor = "80%"
            enc = "Some projects had positive results in our analysis. See the table below"
            projetos_nomes_ok = files[int(maxx[0])]  # take the first matching file
            mens = "The analyzed project(s) may have a copy value equal to or higher than the one shown in the 'copy value' column: "
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' + f.filename)
            return render_template("resultado_page.html",
                                   name=f.filename,
                                   mensagem=mens,
                                   resultado_men=resultado_mensagem,
                                   resultado_proj=projetos_nomes_ok,
                                   resultado_max=valor,
                                   encontrado=enc,
                                   valor_ano=ano,
                                   tot_proj=tot_projetos)