from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import pad_sequence


def next_word(model, text):
    # Lowercase and tokenize the prompt, then left-pad the first sentence
    # so it can be used as the context passed to the language model.
    tokenized_test = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)]
    context = list(pad_sequence(tokenized_test[0],
                                pad_left=True, left_pad_symbol="<s>",
                                pad_right=False, right_pad_symbol="</s>",
                                n=2))
    best_score, best_word = -200, '<unk>'
    second_score, second_word = -202, '<unk>'
    third_score, third_word = -203, '<unk>'
    skip = {'<s>', ',', ';', ':', "’"}
    # Keep the three highest-scoring candidate words.
    for word in model.vocab:
        if word in skip:
            continue
        score = model.logscore(word, context)
        if score > best_score:
            third_score, third_word = second_score, second_word
            second_score, second_word = best_score, best_word
            best_score, best_word = score, word
        elif score > second_score:
            third_score, third_word = second_score, second_word
            second_score, second_word = score, word
        elif score > third_score:
            third_score, third_word = score, word
    choices = [best_word, second_word, third_word]
    # return "First word", best_score, best_word, "Second word", second_score, second_word, "Third word", third_score, third_word
    return choices
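# Hedged usage sketch for next_word() above; the toy corpus, prompt, and 4-gram
# order are made up for illustration. next_word() passes the whole left-padded
# first sentence as the context, so the model order here is chosen to be one
# more than that padded length.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

corpus = [["green", "eggs", "and", "ham"], ["green", "eggs", "are", "tasty"]]
train, vocab = padded_everygram_pipeline(4, corpus)
toy_model = MLE(4)
toy_model.fit(train, vocab)

# the prompt "green eggs" pads to ['<s>', 'green', 'eggs'], matching order - 1
print(next_word(toy_model, "green eggs"))  # e.g. ['and', 'are', '<unk>']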
from nltk.util import pad_sequence


def pading(line, start, end):
    # Strip trailing whitespace and zero-width non-joiners, split on spaces,
    # then pad the token list on both sides with the given boundary symbols.
    line = line.rstrip().replace('\u200c', '').split(" ")
    line = list(pad_sequence(line,
                             pad_left=True, left_pad_symbol=start,
                             pad_right=True, right_pad_symbol=end,
                             n=2))
    return line
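# Hedged usage sketch for pading() above; the input string is made up for
# illustration. With n=2 the call adds one start and one end symbol.
print(pading("hello world", "<s>", "</s>"))
# expected: ['<s>', 'hello', 'world', '</s>']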
import numpy as np
from nltk.util import pad_sequence, ngrams


def log_score(self, progression):
    # Pad the progression to the model order, then sum the log-probability
    # of each symbol given its (order - 1)-gram context.
    progression = list(pad_sequence(progression, self.order,
                                    pad_left=True, left_pad_symbol="<s>",
                                    pad_right=True, right_pad_symbol="</s>"))
    contexts = list(ngrams(progression, self.order - 1))[:-1]
    words = progression[self.order - 1:]
    total_log_score = 0
    for word, context in zip(words, contexts):
        score = self.model.score(word, context)
        total_log_score += np.log(score)
    return total_log_score
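# Minimal sketch of an object the log_score() method above could be attached to.
# The class name, corpus, and order are assumptions for illustration, not taken
# from the original project; self.model is an NLTK MLE model and self.order its order.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


class ProgressionLM:
    def __init__(self, order, corpus):
        self.order = order
        train, vocab = padded_everygram_pipeline(order, corpus)
        self.model = MLE(order)
        self.model.fit(train, vocab)


ProgressionLM.log_score = log_score  # reuse the method defined above

lm = ProgressionLM(2, [["c", "g", "am", "f"]])
# every bigram of the training progression has probability 1 under MLE,
# so the total log score should come out as 0.0
print(lm.log_score(["c", "g", "am", "f"]))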
import os
import re

from nltk import word_tokenize
from nltk.util import pad_sequence, everygrams
from nltk.lm import WittenBellInterpolated


def train_texts(train_files, exclude, extension, n_ngram):
    # Training data file
    # train_data_file = "./train/treino.txt"

    # read training data
    # train_data_files = glob.glob('./train/*' + extension)
    train_data_files = train_files.copy()
    if exclude:
        print("Files in the training directory before removing the test item: ", train_data_files)
        train_data_files.remove(exclude)
    print("Files used for training: ", train_data_files)

    train_texts = ''
    for train_data_file in train_data_files:
        try:
            with open(os.path.join("./train", train_data_file), encoding='utf-8') as f:
                train_text = f.read().lower()
        except OSError:
            print("Could not access the training files with extension ." + extension + " in the train directory.")
            continue  # skip files that cannot be read
        # apply preprocessing (remove text inside square and curly brackets and remove punctuation)
        train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
        train_text = re.sub(r'[^\w\s]', "", train_text)
        train_texts += train_text

    # pad the text and tokenize
    training_data = list(pad_sequence(word_tokenize(train_texts), n_ngram,
                                      pad_left=True, left_pad_symbol="<s>"))
    print("training_data", training_data)

    # generate ngrams
    ngrams = list(everygrams(training_data, max_len=n_ngram))
    print("Number of ngrams:", len(ngrams))

    # build ngram language model
    model = WittenBellInterpolated(n_ngram)
    model.fit([ngrams], vocabulary_text=training_data)
    print(model.vocab)
    return model
import os
import io

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.util import pad_sequence, everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

if __name__ == '__main__':
    text = ['ධිර පරීක්ෂණ පෞද්ගලික අංශයෙන් සිදුකිරීම']

    # Tokenize on whitespace and pad the sentence on both sides.
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    words = tokenizer.tokenize(text[0])
    padded_sent = list(pad_sequence(words,
                                    pad_left=True, left_pad_symbol="<s>",
                                    pad_right=True, right_pad_symbol="</s>",
                                    n=2))
    all_grams = list(everygrams(padded_sent, max_len=2))
    # bi = list(bigrams(words))

    # Preprocess the tokenized text for 3-gram language modelling
    n = 3
    train_data, padded_sents = padded_everygram_pipeline(n, words)

    model = MLE(n)
    model.fit(train_data, padded_sents)
    print(len(model.vocab))
    print(model.vocab.lookup(padded_sent))
    print(model.counts)
    print(padded_sent)
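# Separate minimal sketch of querying a fitted NLTK MLE model once training is
# done; the toy corpus here is made up and independent of the script above.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

toy_corpus = [["i", "am", "sam"], ["sam", "i", "am"]]
toy_train, toy_vocab = padded_everygram_pipeline(2, toy_corpus)
toy_lm = MLE(2)
toy_lm.fit(toy_train, toy_vocab)

print(toy_lm.score("am", ["i"]))       # P(am | i) = 1.0 for this toy corpus
print(toy_lm.logscore("am", ["i"]))    # log2 of the same probability
print(toy_lm.generate(3, text_seed=["<s>"], random_seed=7))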
from nltk.util import pad_sequence, ngrams


def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')
    '''
    Labeling the training data
    split sentence to 4-gram
    padding at the beginning and the end
    '''
    malay_corpus = list()
    indon_corpus = list()
    tamil_corpus = list()
    vocabulary = dict()  # total vocabulary

    fp = open(in_file, "r")
    while True:
        line = fp.readline()
        if not line:
            break
        line_clean = preprocess(line)
        # Exclude label, split sentence
        label = line_clean.split(" ", 1)[0]
        sentence = line_clean.split(" ", 1)[1]
        # remove excess space due to deletion of punctuation and digits
        integrated = ' '.join(sentence.split())
        # Split sentence into single characters
        text = list(integrated)
        # 4-gram LM with beginning and end padding
        padded_sent = list(pad_sequence(text,
                                        pad_left=True, left_pad_symbol="<s>",
                                        pad_right=True, right_pad_symbol="</s>",
                                        n=4))
        padded_list = list(ngrams(padded_sent, n=4))
        # Add-one smoothing
        for item in padded_list:
            if item not in vocabulary:
                vocabulary[item] = [1, 1, 1]
        if label == "malaysian":
            malay_corpus.append(padded_list)
        elif label == "indonesian":
            indon_corpus.append(padded_list)
        elif label == "tamil":
            tamil_corpus.append(padded_list)
    fp.close()

    # number of distinct 4-grams in the training data
    tot = len(vocabulary)
    '''
    dictionary vocabulary
    key: such as ('S', 'a', 'm', 'e')
    value: [#(malay), #(indon), #(tamil), #(random/other)]
    '''
    cnt_m = tot
    cnt_i = tot
    cnt_t = tot
    for sentence in malay_corpus:
        for chars in sentence:
            cnt_m += 1
            vocabulary[chars][0] += 1
    for sentence in indon_corpus:
        for chars in sentence:
            cnt_i += 1
            vocabulary[chars][1] += 1
    for sentence in tamil_corpus:
        for chars in sentence:
            cnt_t += 1
            vocabulary[chars][2] += 1
    # Normalize with add-one smoothing
    for key in vocabulary.keys():
        lst = vocabulary[key]
        new_lst = [lst[0] / cnt_m, lst[1] / cnt_i, lst[2] / cnt_t, 1 / tot]
        vocabulary[key] = new_lst
    return vocabulary
import math

from nltk.util import pad_sequence, ngrams


def test_LM(in_file, out_file, LM):
    """
    test the language models on new strings
    each line of in_file contains a string
    you should print the most probable label for each string into out_file
    """
    print("testing language models...")
    '''
    multiply the probabilities of the 4-grams for this string, and return the
    label (i.e., malaysian, indonesian, and tamil) that gives the highest
    product. Ignore the four-gram if it is not found in the LMs.
    '''
    fp = open(in_file, "r")
    fo = open(out_file, "w")
    while True:
        line = fp.readline()
        if not line:
            break
        line_clean = preprocess(line)
        # remove excess space due to deletion of punctuation and digits
        integrated = ' '.join(line_clean.split())
        # Split sentence into single characters
        text = list(integrated)
        # 4-gram LM with beginning and end padding
        padded_sent = list(pad_sequence(text,
                                        pad_left=True, left_pad_symbol="<s>",
                                        pad_right=True, right_pad_symbol="</s>",
                                        n=4))
        padded_list = list(ngrams(padded_sent, n=4))

        # The initial probability should be 1, but we work in log space
        # because the products can be very small floating-point numbers.
        prob_m = 0  # log-prob of malaysian
        prob_i = 0  # log-prob of indonesian
        prob_t = 0  # log-prob of tamil
        prob_r = 0  # random-model log-prob, used to detect other languages
        num_r = 0   # number of unseen four-grams
        num = 0     # total number of four-grams
        for four_gram in padded_list:
            num += 1
            if four_gram not in LM.keys():
                # Ignore the four-gram if it is not found in the LMs.
                num_r += 1
            else:
                # multiply the probabilities of the 4-grams for this string,
                # which in log space becomes a sum
                prob_m += math.log(LM[four_gram][0])
                prob_i += math.log(LM[four_gram][1])
                prob_t += math.log(LM[four_gram][2])
                prob_r += math.log(LM[four_gram][3])
        '''
        return the label (i.e., malaysian, indonesian, and tamil) that gives
        the highest product. Add the label at the beginning of the sentence.
        '''
        line_w = ""
        if (num_r / num > 0.7) or prob_r >= max(prob_m, prob_i, prob_t):
            line_w = "other " + line
        elif (prob_m >= prob_i) and (prob_m >= prob_t):
            line_w = "malaysian " + line
        elif (prob_i >= prob_m) and (prob_i >= prob_t):
            line_w = "indonesian " + line
        elif (prob_t >= prob_m) and (prob_t >= prob_i):
            line_w = "tamil " + line
        # else:
        #     line_w = "other " + line

        # write predicted result into output file
        fo.write(line_w)
    fp.close()
    fo.close()
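# Hedged driver sketch for build_LM()/test_LM() above. The preprocess() helper
# is not shown in these snippets, so this is a hypothetical minimal version,
# and the file names are assumptions for illustration.
import re


def preprocess(line):
    # keep letters and whitespace (including the trailing newline),
    # drop digits and punctuation
    return re.sub(r"[^a-zA-Z\s]", "", line)


LM = build_LM("input.train.txt")
test_LM("input.test.txt", "input.predict.txt", LM)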
import os
import re

import numpy as np
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter
from nltk import word_tokenize
from nltk.util import pad_sequence


def test_text(model, extension, n_ngram, test_data_file, all_files):
    print("File used for testing: ", test_data_file)
    print(test_data_file)
    if all_files:
        path_file = os.path.join("./train", test_data_file)
    else:
        path_file = os.path.join("./test", test_data_file)

    # Read testing data
    with open(path_file, encoding='utf-8') as f:
        test_text = f.read().lower()
    test_text = re.sub(r'[^\w\s]', "", test_text)

    # Tokenize and pad the text
    testing_data = list(pad_sequence(word_tokenize(test_text), n_ngram,
                                     pad_left=True, left_pad_symbol="<s>"))
    print("Length of test data:", len(testing_data))
    print("testing_data", testing_data)

    # assign scores
    scores = []
    for i, item in enumerate(testing_data[n_ngram - 1:]):
        s = model.score(item, testing_data[i:i + n_ngram - 1])
        scores.append(s)
    scores_np = np.array(scores)

    # set width and height
    width = 8
    height = np.ceil(len(testing_data) / width).astype("int32")
    print("Width, Height:", width, ",", height)

    # copy scores into a rectangular blank array
    a = np.zeros(width * height)
    a[:len(scores_np)] = scores_np
    diff = len(a) - len(scores_np)

    # apply gaussian smoothing for aesthetics
    a = gaussian_filter(a, sigma=1.0)

    # reshape to fit rectangle
    a = a.reshape(-1, width)

    # format labels
    labels = [" ".join(testing_data[i:i + width])
              for i in range(n_ngram - 1, len(testing_data), width)]
    labels_individual = [x.split() for x in labels]
    labels_individual[-1] += [""] * diff
    labels = [f"{x:60.60}" for x in labels]

    # create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=a, x0=0, dx=1,
        y=labels, zmin=0, zmax=1,
        customdata=labels_individual,
        hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
        colorscale="burg"))
    fig.update_layout({"height": height * 28, "width": 1000,
                       "font": {"family": "Courier New"}})
    fig['layout']['yaxis']['autorange'] = "reversed"
    # fig.show()
    fig.write_html(file='./public/' + test_data_file + '.html')
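# Hedged usage sketch combining the train_texts() function shown earlier with
# test_text() above. The file names, extension, and n-gram order are assumptions;
# the training files are expected under ./train and the heatmap is written to
# ./public/<test_data_file>.html, so that directory must exist.
model = train_texts(["projeto_a.txt", "projeto_b.txt"], exclude="projeto_a.txt",
                    extension="txt", n_ngram=2)
test_text(model, extension="txt", n_ngram=2,
          test_data_file="projeto_a.txt", all_files=True)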
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams
from nltk import FreqDist
from nltk.util import pad_sequence

# Create a corpus
sentCorpus = ['I am Sam', 'Sam I am', 'I do not like green eggs and ham']

# Split the padded sentences into words
words = []
for i in range(0, len(sentCorpus)):
    # Split the strings on blank space
    sen = sentCorpus[i].split(' ')
    # Pad either end of the sentence with boundary markers
    sent = list(pad_sequence(sen,
                             pad_left=True, left_pad_symbol="<s>",
                             pad_right=True, right_pad_symbol="</s>",
                             n=2))
    # Extend the running token list with the padded sentence
    words.extend(sent)
print(words)

# Unigrams, bigrams, trigrams, quadgrams
listU = words
listB = list(bigrams(words))
listT = list(trigrams(words))
listQ = list(ngrams(words, 4))

# Get the total number of unigrams, bigrams, trigrams and quadgrams
cntU = len(listU)
cntB = len(listB)
cntT = len(listT)
cntQ = len(listQ)
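# Hedged follow-up sketch: FreqDist is imported above but never used, so this
# shows one way the counted lists could feed relative-frequency estimates.
fdU = FreqDist(listU)
fdB = FreqDist(listB)

# maximum-likelihood estimate P(am | I) = count(I, am) / count(I)
print(fdB[("I", "am")] / fdU["I"])  # 2 / 3 for this corpus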
def success():
    if request.method == 'POST':
        # grab the uploaded file and save it to the upload folder
        a = request.form.getlist('ano')
        f = request.files['file']
        f.save(os.path.join(app.config['UPLOAD_FOLDER'], f.filename))

        # analyse the stored files:
        ano_salvo = str(a[0])
        resultado_analise = next(os.walk("/home/{nome_de_usuario}/mysite/resultados/"))
        path, dirs, files = next(os.walk("/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo + "/"))
        file_count = len(files)
        texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
        valores_maximos = []
        valores_medios = []
        valores_arquivo = []
        j = 0
        while j < file_count:
            # variables:
            texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
            texto_salvo = parser.from_file("/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo + "/" + files[j])
            texto_fornecido = parser.from_file(texto)

            # Feed "train_text" with the content of "texto_salvo" so it can be analysed later
            train_text = texto_salvo['content']
            # apply preprocessing (remove text inside square and curly brackets and remove punctuation)
            train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
            train_text = re.sub(r'[^\w\s]', "", train_text)

            # set the ngram order
            n = 5
            # pad the text and tokenize
            training_data = list(pad_sequence(word_tokenize(train_text), n,
                                              pad_left=True, left_pad_symbol="<s>"))
            # generate ngrams
            ngrams = list(everygrams(training_data, max_len=n))

            # build ngram language model
            model = WittenBellInterpolated(n)
            model.fit([ngrams], vocabulary_text=training_data)

            # Feed "test_text" with the content of the uploaded file so it can be compared with the training file
            test_text = texto_fornecido['content']
            test_text = re.sub(r'[^\w\s]', "", test_text)

            # tokenize and pad the text
            testing_data = list(pad_sequence(word_tokenize(test_text), n,
                                             pad_left=True, left_pad_symbol="<s>"))

            # assign scores
            scores = []
            for i, item in enumerate(testing_data[n - 1:]):
                s = model.score(item, testing_data[i:i + n - 1])
                scores.append(s)
            scores_np = np.array(scores)

            # set width and height
            width = 8
            height = np.ceil(len(testing_data) / width).astype("int64")

            # copy scores into a rectangular blank array
            a = np.zeros(width * height)
            a[:len(scores_np)] = scores_np
            diff = len(a) - len(scores_np)

            # apply gaussian smoothing for aesthetics
            a = gaussian_filter(a, sigma=1.0)

            # reshape to fit the rectangle
            a = a.reshape(-1, width)

            # format labels
            labels = [" ".join(testing_data[i:i + width])
                      for i in range(n - 1, len(testing_data), width)]
            labels_individual = [x.split() for x in labels]
            labels_individual[-1] += [""] * diff
            labels = [f"{x:60.60}" for x in labels]

            # create the heatmap for the visual result
            fig = go.Figure(data=go.Heatmap(
                z=a, x0=0, dx=1,
                y=labels, zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Pontuacao:%{z:.3f}<extra></extra>',
                colorscale="burg"))
            fig.update_layout({"height": height * 40, "width": 1000,
                               "font": {"family": "Courier New"}})
            # creating the visual result:
            # plotly.offline.plot(fig, filename='/home/Allberson/mysite/resultados/resultado.html', auto_open=False)

            # store the scores to display later
            valores_scores = np.array(scores)

            # thresholds for the score conditions:
            buscar_max = 0.9  # high plagiarism level
            buscar_med = 0.8  # above-average level

            # positions with the highest copy scores
            maximo = np.where(valores_scores > buscar_max)[0]
            medio = np.where(valores_scores > buscar_med)[0]  # not used at the moment

            valores_maximos.insert(j, len(maximo))
            valores_medios.insert(j, len(medio))  # not used at the moment
            valores_arquivo.insert(j, files[j])
            j = j + 1

        # find the file with the highest similarity:
        val_maximo = np.array(valores_maximos)
        val_medio = np.array(valores_medios)
        busc_val_max = 1090
        busc_val_med = 500
        maxx = np.where(val_maximo > busc_val_max)[0]
        medd = np.where(val_medio > busc_val_med)[0]

        # build the results web page
        if len(maxx) == 0:
            ano = ano_salvo
            resultado_false = "Nenhum arquivo encontrado que se iguale com o seu"
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' + f.filename)  # remove the file uploaded by the user
            return render_template("resultado_page.html", name=f.filename,
                                   resultado_neg=resultado_false, valor_ano=ano)
        elif len(maxx) > 0:
            ano = ano_salvo
            tot_projetos = file_count
            resultado_mensagem = 'Encontramos um projeto com uma grande similaridade.'
            valor = "80%"
            enc = "Encontramos alguns projetos tiveram resultados positivos no momento de nossa análise. Veja a tabela abaixo"
            projetos_nomes_ok = files[int(maxx)]
            mens = "O(s) projeto(s) analisado(s) pode/podem ter um valor igual ou superior ao mostrado na coluna 'valor de cópia' : "
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' + f.filename)
            return render_template("resultado_page.html", name=f.filename,
                                   mensagem=mens, resultado_men=resultado_mensagem,
                                   resultado_proj=projetos_nomes_ok, resultado_max=valor,
                                   encontrado=enc, valor_ano=ano, tot_proj=tot_projetos)
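# Hedged sketch of the Flask scaffolding the success() route above appears to
# rely on. The route path, methods, and upload folder are assumptions (the
# "{nome_de_usuario}" placeholder comes from the original paths); only the
# imports match what the route body actually calls.
import os
import re

import numpy as np
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter
from flask import Flask, request, render_template
from tika import parser
from nltk import word_tokenize
from nltk.util import pad_sequence, everygrams
from nltk.lm import WittenBellInterpolated

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = '/home/{nome_de_usuario}/mysite/uploads/'

# the URL rule and methods are assumptions based on request.method == 'POST'
app.add_url_rule('/success', view_func=success, methods=['GET', 'POST'])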