def teste1N(self, diretorioSusp, nomeSusp):
    '''Test a suspect document against every source file in the class directory.'''
    corp = c.Corpus(self.diretorio)
    corp.carregarDiretorio()
    doc = self.buscarArquivo(diretorioSusp, nomeSusp)
    docsBasePlagio = corp.verificaPlagio(doc, 0.01)
    return self.salvarLogSaida(docsBasePlagio, nomeSusp)
def teste11(self, diretorioSusp, nomeSusp, nomeFonte):
    '''Test a suspect document against a single source file, identified by name, in the class directory.'''
    corp = c.Corpus(self.diretorio)
    docFonte = corp.carregarDoc(self.diretorio + nomeFonte, nomeSusp)
    corp.lDocumentos.anexar(docFonte)
    doc = self.carregarDoc(diretorioSusp + nomeSusp, nomeSusp)
    docsBasePlagio = corp.verificaPlagio(doc, 0.01)
    return self.salvarLogSaida(docsBasePlagio, nomeSusp)
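# Usage sketch for the two test methods above. The enclosing tester class is
# not shown in this excerpt, so "Testador" and the paths below are hypothetical
# placeholders, not names from the project.
# tester = Testador('./fontes/')
# tester.teste1N('./suspeitos/', 'suspeito01.txt')
# tester.teste11('./suspeitos/', 'suspeito01.txt', 'fonte03.txt')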
def score_authors(author_list, abstract):
    """
    Scores a list of authors against a given abstract.

    :param author_list: A list of authors populated with papers
    :param abstract: Abstract to be scored against
    :return: None; scores are stored on the authors and their papers in place
    """
    # create corpus from query words
    docs = {}
    cachedStopWords = stopwords.words("english")
    query = TextBlob(abstract.lower())
    docs[-1] = query
    corpWords = []
    for word in query.words:
        if word not in cachedStopWords and word not in corpWords:
            corpWords.append(word)

    # construct tf-idf vectors from documents
    maxCitations = 0
    for author in author_list:
        for paper in author.papers:
            if paper.citations > maxCitations:
                maxCitations = paper.citations
            if paper.id not in docs.keys():
                docs[paper.id] = TextBlob(paper.desc.lower())
    corpus = Corpus(docs, corpWords)
    corpus.constructVectors()

    # cosine similarity: the original query document has id -1
    query = corpus.scoredDocs[0].vector
    for doc in corpus.scoredDocs:
        if doc.id == -1:
            query = doc.vector
    docDict = {}
    for document in corpus.scoredDocs:
        sim = cosine_sim(query, document.vector)
        document.addScore(sim)
        docDict[document.id] = sim

    # rank each author's papers and compute the final author score
    for author in author_list:
        author.setCosineSimilarity(docDict)
        author.scorePapers(maxCitations)
        author.papers.sort(key=lambda paper: paper.finalScore, reverse=True)
        author.scoreAuthor()
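# The snippet above calls a cosine_sim helper that is not defined in this
# excerpt. A minimal sketch follows, assuming each document vector is a dict
# mapping terms to tf-idf weights; the actual representation produced by
# Corpus.constructVectors may differ.
import math


def cosine_sim(vec_a, vec_b):
    """Cosine similarity between two sparse term -> weight dictionaries."""
    shared_terms = set(vec_a) & set(vec_b)
    dot = sum(vec_a[t] * vec_b[t] for t in shared_terms)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)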
def re_gen(dataset, type, id):
    corpus = Corpus(config['CORPUS'][dataset], dataset)
    tmp_dir = f'./tmp/{dataset}/{type}/{id}'
    create_dir(tmp_dir)

    def get_random_corpus_file(type):
        # pick one random "-orig.java" file from the dataset repository and
        # copy it into the temporary working directory
        original_file_path = random.sample(
            glob.glob(
                os.path.join(get_repo_dir(dataset), f'./{type}/*/*-orig.java')),
            1)[0]
        original_file_name = original_file_path.split('/')[-1].split(
            '-orig')[0] + '.java'
        tmp_original_path = os.path.join(tmp_dir, original_file_name)
        shutil.copy(original_file_path, tmp_original_path)
        return (original_file_name, '', tmp_original_path)

    gen_errored(corpus, get_random_corpus_file, dataset, type, id,
                get_repo_dir(dataset))
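# create_dir above is a project helper not shown in this excerpt. A minimal
# sketch, assuming it only needs to ensure the target directory exists:
import os


def create_dir(path):
    """Create the directory (and any missing parents) if it does not exist."""
    os.makedirs(path, exist_ok=True)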
def analys(corpus_name):
    corpus = Corpus(corpus_path[corpus_name], corpus_name)
    corpus.get_data()
    path = "plot/"
    X, Y = corpus.data
    print("size", len(Y))
    circle_plot(Histograme(corpus), path + "/" + corpus_name + "/",
                title=corpus_name + " : distribution of relationships")

    st = get_stop_words('en')
    st.extend(string.punctuation)
    st.extend([str(i) for i in range(10)])

    def rm_stop_words(dic):
        # zero out counts for stop words, punctuation and digits
        for i in st:
            if i in dic:
                dic[i] = 0
        return dic

    vocab, vocab_rel = get_vocab(corpus)
    vocab[''] = 0
    vocab = rm_stop_words(vocab)
    H = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20, 0).to_dict()[0]
    histo(H, path + "/" + corpus_name + "/", title=corpus_name + " Histo")
    for i in get_rel_class(corpus):
        vocab = vocab_rel[i]
        vocab[''] = 0
        vocab = rm_stop_words(vocab)
        # drop the corpus-wide top words so each relation shows its own terms
        for k in H:
            if k in vocab:
                vocab[k] = 0
        Hi = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20, 0).to_dict()[0]
        histo(Hi, path + "/" + corpus_name + "/",
              title=corpus_name + " relation " + i + " Histo")

    dist = Dist(corpus)
    box(dist, path + "/" + corpus_name + "/", title=corpus_name + " distances")

    dist = Dist(corpus)
    mean_frame, std = [], []
    for rel in dist.keys():
        df = pd.DataFrame.from_dict({rel: dist[rel]})
        mean_frame.append(df.mean())
        std.append(df.std())
    mean = pd.DataFrame(pd.concat(mean_frame), columns=["mean"])
    std = pd.DataFrame(pd.concat(std), columns=["std"])
    res = pd.concat((mean, std), axis=1)

    # compare BERT and SciBERT tokenisation length against sentence length
    data = {'sentence length': [], 'Vocab': [], 'tokenisation length': []}
    tokenizer_bert, _ = get_bert()
    tokenizer_scibert, _ = get_bert(bert_type='scibert')
    for x in X:
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('BERT VOCAB')
        data['tokenisation length'].append(len(tokenizer_bert.tokenize(x[0])))
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('SciBERT VOCAB')
        data['tokenisation length'].append(
            len(tokenizer_scibert.tokenize(x[0])))
    data = pd.DataFrame(data)
    data = data.sort_values(by=['sentence length'])
    print(data)

    title = corpus_name + " tokenisation analysis"
    plt.rcParams["figure.figsize"] = (9, 9)
    pylab.mpl.style.use('seaborn')
    g = sns.relplot(x="sentence length", y="tokenisation length",
                    hue="Vocab", style="Vocab",
                    hue_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    kind="line", data=data,
                    col_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    style_order=['SciBERT VOCAB', 'BERT VOCAB'])
    sns.despine()
    plt.title(title)
    # save before show(), otherwise the figure is cleared and an empty file is written
    plt.savefig(title + ".png")
    plt.show()
def train():
    print("train")
    start_time = time.time()
    config = SiameseTCNNConfig()
    corpus = Corpus(train_file, vocab_file, 0.0, config.seq_length,
                    config.vocab_size)
    testcorpus = Corpus(test_file, vocab_file, 1.0, config.seq_length,
                        config.vocab_size)
    print(corpus)
    print(testcorpus)
    config.vocab_size = len(corpus.words)

    train_data = TensorDataset(torch.LongTensor(corpus.x_train1),
                               torch.LongTensor(corpus.x_train2),
                               torch.FloatTensor(corpus.y_train))
    test_data = TensorDataset(torch.LongTensor(testcorpus.x_test1),
                              torch.LongTensor(testcorpus.x_test2),
                              torch.FloatTensor(testcorpus.y_test))

    print('Configuring CNN model...')
    model = SiameseTextCNN(config)
    print(model)

    # optimizer and loss function
    # criterion = nn.CrossEntropyLoss(size_average=False)
    # criterion = torch.nn.BCELoss(reduce=False, size_average=False)
    if config.contra_loss:
        criterion = ContrastiveLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # set the mode to train
    print("Training and evaluating...")
    best_F1 = 0.0
    for epoch in range(config.num_epochs):
        # load the training data in batches
        model.train()
        train_loader = DataLoader(train_data, batch_size=config.batch_size)
        ii = 0
        for x1_batch, x2_batch, y_batch in train_loader:
            ii += 1
            if ii % 100 == 0:
                print(epoch, "batch", ii)
            inputs1, inputs2, targets = Variable(x1_batch), Variable(
                x2_batch), Variable(y_batch)
            optimizer.zero_grad()
            outputs1, outputs2 = model(inputs1, inputs2)  # forward computation
            loss = criterion(outputs1, outputs2, targets)
            # TODO
            # backward propagation and update parameters
            loss.backward()
            optimizer.step()

        # evaluate on both training and test datasets
        print("epoch", epoch)
        train_loss, train_F1 = evaluate(train_data, model, criterion)
        test_loss, test_F1 = evaluate(test_data, model, criterion)
        # print("train_loss:", train_loss)
        if test_F1 > best_F1:
            # store the best result
            best_F1 = test_F1
            improved_str = '*'
            torch.save(model.state_dict(), model_file)
        else:
            improved_str = ''

        time_dif = get_time_dif(start_time)
        msg = "Epoch {0:3}, Train_loss: {1:>7.3}, Train_F1 {2:>6.3%}, " \
              + "Test_loss: {3:>6.3}, Test_F1 {4:>6.3%}, Time: {5} {6}"
        print(msg.format(epoch + 1, train_loss, train_F1, test_loss, test_F1,
                         time_dif, improved_str))
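# The training loop above relies on a ContrastiveLoss class defined elsewhere.
# A minimal sketch follows, assuming the standard margin-based contrastive loss
# over the two branch embeddings, with target 1 marking a matching pair; the
# project's actual implementation and label convention may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, target):
        # Euclidean distance between the two branch embeddings
        distance = F.pairwise_distance(output1, output2)
        # matching pairs are pulled together; non-matching pairs are pushed
        # apart up to the margin
        loss = target * distance.pow(2) + \
            (1 - target) * F.relu(self.margin - distance).pow(2)
        return loss.mean()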
        # (continuation of the try/except that extracts the author name inside
        # the Arxiv loading loop; the matching try is not shown in this excerpt)
        except:
            author = i['author']['name']
        txt = i['title'] + ". " + i['summary']
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        try:
            coAuth = [aut['name'] for aut in i['author']][1:]
        except:
            coAuth = "Pas de Co-Auteur"
        doc = Document.ArxivDocument(datet, i['title'], author, txt, i['id'],
                                     coAuth)
        corpus_Arxiv.add_doc(doc)


# Create the corpora
corpus_Reddit = Corpus.Corpus("Corona_red")
corpus_Arxiv = Corpus.Corpus("Corona_arx")

# Load the data into the corpora
loadArxiv(corpus_Arxiv)
loadReddit(corpus_Reddit)

# Print the number of documents and authors
print("Création du corpus Reddit, %d documents et %d auteurs" %
      (corpus_Reddit.ndoc, corpus_Reddit.naut))
print("Création du corpus Arxiv, %d documents et %d auteurs" %
      (corpus_Arxiv.ndoc, corpus_Arxiv.naut))
print()

# Save the corpora
import Corpus
import numpy as np
import IBM1
import IBM2
import HMM

print("loading the corpus...")
corpus = Corpus.Corpus("eutrans/training", separator="#")
# corpus = Corpus.Corpus("corpus.txt", separator="---")
corpus.print_corpus_description()
print("...done")

# %% Testing IBM1
# print(" ")
# print("*" * 50)
# print(" ")
# print("Building IBM1 item...")
# ibm1 = IBM1.IBM1(corpus)
# print("...done")
# print("starting to train IBM1...")
# ibm1_nb_training_step = 10
# imb1perplexityevol = ibm1.train(ibm1_nb_training_step, verbose=True)
# print("...done")
#
# print("\nIBM1 perplexity : ", ibm1.get_perplexity(), "\n")
#
# f2e = np.argmax(ibm1.proba_f_knowing_e, axis=1)
# print("IBM1 Translations :")
# for i in range(len(corpus.french_words)):
#     print(corpus.french_words[i], " --> ", corpus.english_words[f2e[i]])
# Corpus here is the co-occurrence builder from the glove package,
# not the local Corpus module used in the other snippets.
from glove import Glove, Corpus

corpus = Corpus()
sent_token = [["안녕", "하세요"], ["지니티토리", "입니다"]]
corpus.fit(sent_token, window=20)

# model
glove = Glove(no_components=128, learning_rate=0.01)
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=False)
glove.add_dictionary(corpus.dictionary)

# save (DATA_DIR must point to an existing output directory)
glove.save(DATA_DIR + '/glove_w20_epoch50.model')
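# A quick check of the saved model, assuming the glove_python package
# (Glove.load and most_similar are part of its API); the query word is one of
# the tokens from the toy corpus above, and DATA_DIR is the same path as above.
loaded = Glove.load(DATA_DIR + '/glove_w20_epoch50.model')
print(loaded.most_similar("안녕", number=3))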
from Corpus import *

if __name__ == '__main__':
    # Read corpus
    # corp_path = raw_input("Please input the path of the corpus:\n")
    # train_file = raw_input("Please input the filename of the training data:\n")
    # gold_file = raw_input("Please input the filename of the gold label:\n")
    train_file = "trail.csv"
    gold_file = "trial.labels"
    corpus = Corpus(train_file)
    # corpus.readCourpus()

    # Training part
    '''..to be complete '''
    predict_file = "trial.predict"

    # Evaluation
    if (corpus.gold_file != gold_file):
        corpus.readGold(gold_file)
    if (corpus.predict_file != predict_file):
        corpus.readPrediction(predict_file)
    corpus.evaluation()
    corpus.print_result()
if __name__ == '__main__':
    if sys.argv[2] == 'all':
        dataset_list = list_folders(get_repo_dir(''))
    else:
        dataset_list = sys.argv[2:]

    if len(sys.argv) >= 2 and sys.argv[1] == 'run':
        corpora = []
        for corpus in sys.argv[2:]:
            corpora.append(Corpus(config['CORPUS'][corpus], corpus))
        share = {
            key: config['DATASHARE'].getint(key)
            for key in ['learning', 'validation', 'testing']
        }
        for corpus in corpora:
            gen_dataset(corpus, share)

    if len(sys.argv) >= 2 and sys.argv[1] == 'exp':
        for dataset in tqdm(dataset_list, desc='datasets'):
            target = get_experiment_dir(dataset)
            if not os.path.exists(target):
                gen_experiment(dataset)
            run_experiment(dataset)

    if len(sys.argv) >= 2 and sys.argv[1] == 'exp-cs':
        results = {}
        for dataset in dataset_list:
np.random.seed(SEED)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from gensim.models import word2vec

custom_embedding = WordEmbeddings('pathtotoEmbeddings.vec')

# now create the StackedEmbeddings object that combines all embeddings
stacked_embeddings = StackedEmbeddings(embeddings=[
    custom_embedding
])  # , charlm_embedding_forward, charlm_embedding_backward])

dataset_dict = Dataset_load.load()
corp = Corpus.Corpus(dataset_dict,
                     embeddings_file_path=None,
                     stacked_embeddings=stacked_embeddings)

model_params = {
    "filter_width": 3,
    "embeddings_dropout": True,
    "n_filters": [256],
    "dense_dropout": True,
    "token_embeddings_dim": 300,
    "char_embeddings_dim": 50,
    "cell_type": 'lstm',
    "use_batch_norm": True,
    "concat_embeddings": True,
    "use_crf": True,
    "use_char_embeddins": True,
    "net_type": 'rnn',