def __init__(self, loc=MOHX_LOCATION):
    self.instances, self.words = [], []
    c = 0
    for line in open(loc).readlines()[1:]:
        sentence = Corpus.Sentence()
        data = line.split(",")
        sentence.id = str(c)
        c += 1
        word_data = data[3].split()
        for i in range(len(word_data)):
            met = "N"
            if i == int(data[-2]):
                met = "tag-" + data[-1].strip()
            w = Corpus.Word(text=word_data[i], met=met, sentence=sentence, index=i)
            sentence.words.append(w)
            self.words.append(w)
        self.instances.append(sentence)
    Corpus.add_dependencies(self.instances, MOHX_DEPS, lex_field=1)
def do_filter(sample_url_path, corpus_path, sample_corpus_path):
    import Corpus
    name_set = set(map(lambda line: line.strip().split()[0].split('/')[-1],
                       open(sample_url_path).readlines()))
    trec_reader = Corpus.TRECReader()
    trec_reader.open(corpus_path)
    trec_writer = Corpus.TRECWriter(sample_corpus_path)
    doc = trec_reader.next()
    start_title_tag = '<title>'
    start_title_tag_len = len(start_title_tag)
    end_title_tag = '</title>'
    count = 0
    while doc:
        text = doc.text
        start = text.find(start_title_tag)
        end = text.find(end_title_tag)
        title = ''
        if start >= 0 and end >= 0:
            title = text[start + start_title_tag_len:end]
        if title in name_set:
            trec_writer.write(doc)
            count += 1
            if count % 1000 == 0:
                print count
        doc = trec_reader.next()
    trec_reader.close()
    trec_writer.close()
def __init__(self):
    self.model = NMT_Model.NMT_Model()
    self.srcVocab = Corpus.Vocabulary()
    self.trgVocab = Corpus.Vocabulary()
    self.srcVocab.loadDict(Config.srcVocabF)
    self.trgVocab.loadDict(Config.trgVocabF)
    self.trainData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                     Config.trainSrcF, Config.trainTrgF)
    self.valData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                   Config.valSrcF, Config.valTrgF)
    self.buckets = self.trainData.getBuckets()
    self.networkBucket = {}
    self.bestValCE = 999999
    self.bestBleu = 0
    self.badValCount = 0
    self.maxBadVal = 5
    self.learningRate = Config.LearningRate
    self.inputSrc = tf.placeholder(
        tf.int32, shape=[Config.MaxLength, Config.BatchSize], name='srcInput')
    self.maskSrc = tf.placeholder(
        tf.float32, shape=[Config.MaxLength, Config.BatchSize], name='srcMask')
    self.inputTrg = tf.placeholder(
        tf.int32, shape=[Config.MaxLength, Config.BatchSize], name='trgInput')
    self.maskTrg = tf.placeholder(
        tf.float32, shape=[Config.MaxLength, Config.BatchSize], name='trgMask')
    self.optimizer = tf.train.AdamOptimizer()
    self.createBucketNetworks()
def do_match(infobox_path, text_path, out_path):
    import Corpus
    import time
    print 'loading......'
    infobox = load_infobox(infobox_path)
    reader = Corpus.TRECReader()
    reader.open(text_path)
    writer = Corpus.TRECWriter(out_path)
    matcher = InfoBoxMatcher()
    t0 = time.time()
    count = 0
    doc = reader.next()
    while doc:
        text = doc.text
        lines = text.split('\n')
        newlines = lines[:3]
        title_line = lines[1]
        title_begin_index = title_line.find('>')
        title_end_index = title_line.find('<', title_begin_index + 1)
        title = ''
        if title_begin_index >= 0 and title_end_index >= 0:
            title = title_line[title_begin_index + 1:title_end_index].strip()
        if infobox.has_key(title):
            tagged_text = matcher.match(infobox[title], lines[3:])
            doc.text = '\n'.join(lines[:3]) + '\n'
            doc.text += tagged_text
            writer.write(doc)
        doc = reader.next()
        count += 1
        if count % 100 == 0:
            print count, time.time() - t0
    writer.close()
def __init__(self, threshhold=1):
    super().__init__()
    self.instances, self.words = [], []

    def merge_sents(sent1, sent2):
        for i in range(len(sent1.words)):
            if sent1.words[i].met == "N" and sent2.words[i].met != "N":
                sent1.words[i].met = sent2.words[i].met
            if sent1.words[i].met != "N" and sent2.words[i].met != "N" \
                    and sent1.words[i].met != sent2.words[i].met:
                if sent2.words[i].met not in sent1.words[i].met:
                    sent1.words[i].met += "-" + sent2.words[i].met

    lcc_data = etree.parse(LCC_LOCATION)
    instances = lcc_data.findall(".//LmInstance")
    metaphors = set()
    for instance in instances:
        metaphor = LCCMetaphor(instance)
        if metaphor.met_score >= threshhold:
            metaphors.add(metaphor)
    Corpus.add_dependencies(metaphors, LCC_DEPS, lex_field=0)
    Corpus.add_vn_parse(metaphors, LCC_VN)
    #Corpus.add_allen_parse(metaphors, "C:/Users/Kevin/PycharmProjects/metaphor/corpora/lcc_metaphor_dataset/lcc_allen.tagged")
    #Constructions.predict_constructions(metaphors)
    for met in metaphors:
        self.instances.append(met)
        self.words.extend(met.words)
    super().build_lexicon()
def __init__(self):
    self.model = RNNLM_Model.LM_Model()
    self.trainData = Corpus.MonoCorpus(Config.trgVocabF, Config.trainTrgF)
    self.valData = Corpus.MonoCorpus(Config.trgVocabF, Config.valTrgF)
    self.networkBucket = {}
    self.exampleNetwork = self.getNetwork(Config.BucketGap)
    if os.path.isfile(Config.initModelF):
        self.model.loadModel(Config.initModelF)
def F1scores(y_true, y_pred):
    cs_positive_correct = 0.0
    cs_positive_total = 0.0
    cs_positive_true = 0.0
    mono_positive_correct = 0.0
    mono_positive_total = 0.0
    mono_positive_true = 0.0
    for i in range(0, len(y_pred)):
        tag_true = Corpus.globalTag(y_true[i])
        tag_pred = Corpus.globalTag(y_pred[i])
        if tag_pred == "CS":
            cs_positive_total += 1.0
        elif tag_pred == "ES" or tag_pred == "EUS":
            mono_positive_total += 1.0
        if tag_pred == "CS" and tag_true == "CS":
            cs_positive_correct += 1.0
        elif tag_pred == "ES" and tag_true == "ES":
            mono_positive_correct += 1.0
        elif tag_pred == "EUS" and tag_true == "EUS":
            mono_positive_correct += 1.0
        if tag_true == "CS":
            cs_positive_true += 1.0
        elif tag_true == "ES" or tag_true == "EUS":
            mono_positive_true += 1.0
    cs_precision = float(cs_positive_correct / cs_positive_total)
    cs_recall = float(cs_positive_correct / cs_positive_true)
    if cs_precision + cs_recall > 0:
        cs_f1 = float(2.0 * ((cs_precision * cs_recall) / (cs_precision + cs_recall)))
    else:
        cs_f1 = 0.0
    mono_precision = float(mono_positive_correct / mono_positive_total)
    mono_recall = float(mono_positive_correct / mono_positive_true)
    if mono_precision + mono_recall > 0:
        mono_f1 = float(2.0 * ((mono_precision * mono_recall) / (mono_precision + mono_recall)))
    else:
        mono_f1 = 0.0
    print ""
    print "Precision for code-switched tweets: %.5f" % cs_precision
    print "Recall for code-switched tweets: %.5f" % cs_recall
    print "F1-score for code-switched tweets: %.5f" % cs_f1
    print "Code-switched tagged tweets: %d" % cs_positive_total
    print "Precision for monolingual tweets: %.5f" % mono_precision
    print "Recall for monolingual tweets: %.5f" % mono_recall
    print "F1-score for monolingual tweets: %.5f" % mono_f1
    print "Monolingual tagged tweets: %d" % mono_positive_total
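# A minimal cross-check sketch for the CS scores above, assuming y_true/y_pred
# have already been mapped to the global tags ("CS", "ES", "EUS"). scikit-learn
# is not used by the original code; this is only an illustrative sanity check.
from sklearn.metrics import precision_recall_fscore_support

y_true = ["CS", "ES", "CS", "EUS", "ES"]
y_pred = ["CS", "CS", "CS", "EUS", "ES"]
p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred,
                                              labels=["CS"], average="micro")
print("CS precision %.5f recall %.5f F1 %.5f" % (p, r, f1))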
def __init__(self):
    self.model = RnnlmModel.LM_Model()
    self.trainData = Corpus.MonoCorpus(Config.trgVocabF, Config.trainTrgF)
    self.valData = Corpus.MonoCorpus(Config.trgVocabF, Config.valTrgF)
    self.networkBucket = {}
    self.inputTrg = tf.placeholder(
        tf.int32, shape=[Config.MaxLength, Config.BatchSize], name='input')
    self.maskTrg = tf.placeholder(
        tf.float32, shape=[Config.MaxLength, Config.BatchSize], name='inputMask')
    self.optimizer = tf.train.AdamOptimizer()
    self.createBucketNetworks(Config.MaxLength)
def trainCRF(corpus_file_name):
    X_set = []
    Y_set = []
    global options
    # Read the corpus
    CS_Corpus = open(corpus_file_name, 'rb')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    CS_Reader.next()  # Skip first line
    lines = 0
    for row in CS_Reader:
        (X_set_part, Y_set_part) = TrainTweetToCRF(tweet=Corpus.getTweetTokensTags(row),
                                                   token_prev_next=token_prev_next,
                                                   options=options,
                                                   y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
            lines += 1
    CS_Corpus.close()
    print "Tweets read: %d" % lines
    print "Train amount: %d" % lines
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_set, Y_set)  # Train CRF
    return crf
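# A minimal evaluation sketch for the CRF returned by trainCRF, assuming X_test
# and Y_test were built with the same TrainTweetToCRF feature pipeline
# (the names here are placeholders, not part of the original project).
from sklearn_crfsuite import metrics


def evaluate_crf(crf, X_test, Y_test):
    Y_pred = crf.predict(X_test)
    labels = list(crf.classes_)  # label set learned during crf.fit
    print(metrics.flat_f1_score(Y_test, Y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(Y_test, Y_pred, labels=labels, digits=3))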
def __init__(self):
    self.corpus = Corpus()
    self.featurematrix = []
    self.doctermmatrix = []
    self.classlabels = []
    self.spacename = ""
def do_convert_mallet(match_path, mallet_path, tag_path, num):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    converter_type = 'token'
    converter = get_converter(converter_type)
    converter.open(mallet_path)
    tag_set = set(map(lambda s: s.strip(), open(tag_path).readlines()))
    num = int(num)
    doc_count = 0
    t0 = time.time()
    total_count = 0
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string(doc.text)
        convert_mallet(tagged_text, converter, tag_set)
        doc = reader.next()
        doc_count += 1
        if doc_count % 10 == 0:
            print doc_count, time.time() - t0
        if doc_count > num:
            break
    converter.close()
    reader.close()
def do_count_length(in_trec, out_path):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
    with codecs.open(out_path, encoding='utf8', mode='w') as writer:
        while doc:
            length = len(doc.text)
            if '#redirect' in doc.text.lower():
                doc = reader.next()
                continue
            plain = Wiki2Plain(get_main_section(doc.text))
            text = plain.text
            body_start_pos = text.find('\n')
            if body_start_pos > 0:
                title = text[:body_start_pos]
                writer.write(u'%s\t%d\n' % (title, length))
                writer.flush()
            doc = reader.next()
    reader.close()
def do_batch(in_trec, out_dir):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
    while doc:
        plain = Wiki2Plain(get_main_section(doc.text))
        text = plain.text
        body_start_pos = text.find('\n')
        if body_start_pos > 0:
            title = text[:body_start_pos]
            body = text[body_start_pos:]
            if not title.count(':') or not re.match(invalid_title_pattern, title.split(':')[0]):
                json_list.append({'id': str(count),
                                  'title': title.strip(),
                                  'body': body.strip()})
                if count % entry_per_file == 0:
                    # integer division keeps the shard index a clean file name
                    out_path = os.path.join(out_dir, str(count // entry_per_file) + '.json')
                    print('writing', out_path)
                    with codecs.open(out_path, encoding='utf-8', mode='w') as writer:
                        json.dump(json_list, writer, indent=2, ensure_ascii=False)
                    json_list = []
                    print(count, title, time.time() - start_time)
                count += 1
        doc = reader.next()
    reader.close()
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    pattern_set = set(map(lambda line: line.split()[0],
                          open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    command = ['java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
               stanford_tag_program, '--batch-trec', trec_path,
               base_tag_trec_path]
    print ' '.join(command)
    subprocess.call(command)
    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indecies = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'), doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        tagged_text = apply_tag(trec_path, tagged_text, model_dir, pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)
    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        doc = Corpus.Document(ids[i],
                              tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    global prune_t, label_t
    print time.time() - t, prune_t, label_t
def score_authors(author_list, abstract):
    """
    Scores a list of authors against a given abstract
    :param author_list: A list of authors populated with papers
    :param abstract: Abstract to be scored against
    :return:
    """
    # create corpus from query words
    docs = {}
    cachedStopWords = stopwords.words("english")
    query = TextBlob(abstract.lower())
    docs[-1] = query
    corpWords = []
    for word in query.words:
        if word not in cachedStopWords and word not in corpWords:
            corpWords.append(word)

    # construct tf-idf vectors from documents
    maxCitations = 0
    for author in author_list:
        for paper in author.papers:
            if paper.citations > maxCitations:
                maxCitations = paper.citations
            if paper.id not in docs.keys():
                docs[paper.id] = TextBlob(paper.desc.lower())
    corpus = Corpus(docs, corpWords)
    corpus.constructVectors()

    # cosine similarity
    query = corpus.scoredDocs[0].vector
    # original doc has id of -1
    for doc in corpus.scoredDocs:
        if doc.id == -1:
            query = doc.vector
    docDict = {}
    for document in corpus.scoredDocs:
        sim = cosine_sim(query, document.vector)
        document.addScore(sim)
        docDict[document.id] = sim
    for author in author_list:
        author.setCosineSimilarity(docDict)
        author.scorePapers(maxCitations)
        author.papers.sort(key=lambda paper: paper.finalScore, reverse=True)
        author.scoreAuthor()
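# cosine_sim is called above but not shown; a minimal sketch, assuming each
# tf-idf vector is a dict mapping term -> weight (the project's actual vector
# representation may differ).
import math


def cosine_sim(vec_a, vec_b):
    # dot product over shared terms, normalised by both vector lengths
    dot = sum(w * vec_b[t] for t, w in vec_a.items() if t in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)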
def __init__(self):
    super().__init__()
    self.instances, self.words = [], []
    lemmatizer = WordNetLemmatizer()
    cur_verb, cluster = "", ""
    for line in open(TROFI_LOCATION).readlines():
        if re.match(r"\*\*\*[a-z]", line):
            cur_verb = line.split("***")[1]
            continue
        elif "*" in line or not line.strip():
            # check "nonliteral" first: "literal" is a substring of it
            if "nonliteral" in line:
                cluster = "nonliteral"
            elif "literal" in line:
                cluster = "literal"
            continue
        sentence = Corpus.Sentence()
        data = line.strip().split("\t")
        sentence.id = data[0]
        met = ""
        if "N" in data[1]:
            met = "met"
        if "L" in data[1]:
            met = "N"
        if "U" in data[1]:
            met = "?"
        for i in range(len(data[2].split())):
            word = data[2].split()[i]
            v_lem = lemmatizer.lemmatize(word, "v")
            cur_met = "N"
            if v_lem == cur_verb:
                cur_met = "tag-" + met
            w = Corpus.Word(text=word, met=cur_met, sentence=sentence, index=i)
            sentence.words.append(w)
            self.words.append(w)
        self.instances.append(sentence)
    Corpus.add_dependencies(self.instances, TROFI_DEPS, lex_field=1)
def process(self, title, text):
    import Corpus
    self.count += 1
    title = title.replace(' ', '_').encode('utf8')
    text = text.encode('utf8')
    if title in self.name_set:
        self.writer.write(Corpus.Document(str(self.id),
                                          '<title>%s</title>\n%s' % (title, text)))
        print self.count, self.id, title
        self.id += 1
def teste1N(self, diretorioSusp, nomeSusp):
    '''
    Tests a suspect document against every source document in the class directory
    '''
    corp = c.Corpus(self.diretorio)
    corp.carregarDiretorio()
    doc = self.buscarArquivo(diretorioSusp, nomeSusp)
    docsBasePlagio = corp.verificaPlagio(doc, 0.01)
    return self.salvarLogSaida(docsBasePlagio, nomeSusp)
def teste11(self, diretorioSusp, nomeSusp, nomeFonte):
    '''
    Tests a suspect document against a single source document, given by name,
    located in the class directory
    '''
    corp = c.Corpus(self.diretorio)
    docFonte = corp.carregarDoc(self.diretorio + nomeFonte, nomeSusp)
    corp.lDocumentos.anexar(docFonte)
    doc = self.carregarDoc(diretorioSusp + nomeSusp, nomeSusp)
    docsBasePlagio = corp.verificaPlagio(doc, 0.01)
    return self.salvarLogSaida(docsBasePlagio, nomeSusp)
def do_batch(in_trec, out_trec):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    writer = Corpus.TRECWriter(out_trec)
    doc = reader.next()
    count = 1
    while doc:
        plain = Wiki2Plain(doc.text)
        text = plain.text
        pos = text.find('\n')
        if pos > 0:
            text = '<title>%s</title>%s' % (text[:pos], text[pos:])
        doc.text = text
        writer.write(doc)
        doc = reader.next()
        if count % 1000 == 0:
            print count
        count += 1
    reader.close()
    writer.close()
def __init__(self, corpus_location):
    self.instances, self.words = [], []
    data = csv.reader(open(corpus_location))
    next(data)  # skip the header row
    for line in data:
        sentence = Corpus.Sentence()
        sentence.id = line[1]
        index = int(line[-2])
        tag = int(line[-1])
        sent_data = line[3].split()
        for i in range(len(sent_data)):
            word = sent_data[i]
            met = "N"
            if i == index:
                met = "met"
            w = Corpus.Word(text=word, sentence=sentence, met=met, index=i)
            sentence.words.append(w)
            self.words.append(w)
        self.instances.append(sentence)
def __init__(self):
    self.model = NMT_Model.NMT_Model()
    self.srcVocab = Corpus.Vocabulary()
    self.trgVocab = Corpus.Vocabulary()
    self.srcVocab.loadDict(Config.srcVocabF)
    self.trgVocab.loadDict(Config.trgVocabF)
    self.trainData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                     Config.trainSrcF, Config.trainTrgF)
    self.valData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                   Config.valSrcF, Config.valTrgF)
    self.valBleuData = Corpus.ValCorpus(self.srcVocab, self.trgVocab,
                                        Config.valFile, Config.refCount)
    self.decoder = NMT_Decoder.NMT_Decoder(self.model, self.srcVocab, self.trgVocab)
    self.networkBucket = {}
    self.exampleNetwork = self.getNetwork(1, 1)
    self.bestValCE = 999999
    self.bestBleu = 0
    self.badValCount = 0
    self.maxBadVal = 5
    self.learningRate = Config.LearningRate
    if os.path.isfile(Config.initModelF):
        self.model.loadModel(Config.initModelF)
def __init__(self, lcc_instance_node):
    super().__init__()
    self.target_cm = [lcc_instance_node.get('targetConcept')]
    annotations_element = lcc_instance_node.find(".//Annotations")
    met_anns = annotations_element.find(".//MetaphoricityAnnotations")
    self.met_score = sum([float(m.get('score')) for m in met_anns]) / len(met_anns)
    cm_source_anns = annotations_element.find(".//CMSourceAnnotations")
    self.source_cm = []
    if cm_source_anns is not None:
        self.source_cm = set([(cm.get("sourceConcept"), float(cm.get("score")))
                              for cm in cm_source_anns if float(cm.get('score')) >= 0])
    self.chain = lcc_instance_node.get('chain')
    self.id = lcc_instance_node.get('id')
    all_text = lcc_instance_node.find(".//TextContent")
    self.current_text = all_text.find(".//Current")
    self.prev_text = all_text.find(".//Prev")
    self.next_text = all_text.find(".//Next")
    self.source_lm = self.current_text.find(".//LmSource").text.strip()
    self.target_lm = self.current_text.find(".//LmTarget").text.strip()
    i = 0
    all_words = []
    for word_group in self.current_text.itertext():
        if word_group.strip() == self.source_lm:
            met = ["source", self.source_cm, self.met_score]
        elif word_group.strip() == self.target_lm:
            met = ["target", self.target_cm, self.met_score]
        else:
            met = ["N", "", ""]
        for w in [w for w in re.findall(r"[\w']+|[.,?!;:\"']", word_group) if w != "="]:
            self.words.append(Corpus.Word(text=w, met=met, index=i, sentence=self))
            i += 1
def re_gen(dataset, type, id):
    corpus = Corpus(config['CORPUS'][dataset], dataset)
    tmp_dir = f'./tmp/{dataset}/{type}/{id}'
    create_dir(tmp_dir)

    def get_random_corpus_file(type):
        original_file_path = random.sample(
            glob.glob(os.path.join(get_repo_dir(dataset), f'./{type}/*/*-orig.java')), 1)[0]
        original_file_name = original_file_path.split('/')[-1].split('-orig')[0] + '.java'
        tmp_original_path = os.path.join(tmp_dir, original_file_name)
        shutil.copy(original_file_path, tmp_original_path)
        return (original_file_name, '', tmp_original_path)

    gen_errored(corpus, get_random_corpus_file, dataset, type, id, get_repo_dir(dataset))
def do_stat(match_path):
    import Corpus
    counts = {}
    conflicts = set()
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    doc_count = 0
    t0 = time.time()
    total_count = 0
    while doc:
        for token in doc.text.split():
            pos = token.find('/')
            if pos > 0:
                tag_string = token[pos + 1:]
                if tag_string.startswith('[') and tag_string.endswith(']'):
                    conflict_set = set()
                    for tag_token in tag_string[1:-1].split(','):
                        if tag_token.startswith('wiki:'):
                            conflict_set.add(tag_token)
                            total_count += 1
                            if tag_token in counts:
                                counts[tag_token] += 1
                            else:
                                counts[tag_token] = 1
                    if len(conflict_set) > 1:
                        conflicts.add(' '.join(list(conflict_set)))
        doc = reader.next()
        doc_count += 1
        if doc_count % 1000 == 0:
            print doc_count, time.time() - t0, total_count, len(counts), len(conflicts)
    count_array = map(lambda tag_count: (tag_count[1], tag_count[0]), counts.items())
    count_array.sort(reverse=True)
    for count, tag in count_array:
        print count, tag
    for conflict in conflicts:
        print conflict
from Corpus import *

if __name__ == '__main__':
    # Read corpus
    #corp_path = raw_input("Please input the path of the corpus:\n")
    #train_file = raw_input("Please input the filename of the training data:\n")
    #gold_file = raw_input("Please input the filename of the gold label:\n")
    train_file = "trail.csv"
    gold_file = "trial.labels"
    corpus = Corpus(train_file)
    #corpus.readCourpus()

    # training part.
    '''..to be complete '''
    predict_file = "trial.predict"

    # Evaluation
    if corpus.gold_file != gold_file:
        corpus.readGold(gold_file)
    if corpus.predict_file != predict_file:
        corpus.readPrediction(predict_file)
    corpus.evaluation()
    corpus.print_result()
def tag(self, corpus):
    """Tag each sentence of the corpus with the Viterbi decoder."""
    res = Corpus()
    res.sentences = [self.viterbi(s) for s in corpus.sentences]
    return res
def analys(corpus_name):
    corpus = Corpus(corpus_path[corpus_name], corpus_name)
    corpus.get_data()
    path = "plot/"
    X, Y = corpus.data
    print("size", len(Y))
    circle_plot(Histograme(corpus), path + "/" + corpus_name + "/",
                title=corpus_name + " : distribution of relationships")
    st = get_stop_words('en')
    st.extend(string.punctuation)
    st.extend([str(i) for i in range(10)])

    def rm_stop_words(dic):
        for i in st:
            if i in dic:
                dic[i] = 0
        return dic

    vocab, vocab_rel = get_vocab(corpus)
    vocab[''] = 0
    vocab = rm_stop_words(vocab)
    H = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20, 0).to_dict()[0]
    histo(H, path + "/" + corpus_name + "/", title=corpus_name + " Histo")
    for i in get_rel_class(corpus):
        vocab = vocab_rel[i]
        vocab[''] = 0
        vocab = rm_stop_words(vocab)
        for k in H:
            if k in vocab:
                vocab[k] = 0
        Hi = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20, 0).to_dict()[0]
        histo(Hi, path + "/" + corpus_name + "/",
              title=corpus_name + " relation " + i + " Histo")
    dist = Dist(corpus)
    box(dist, path + "/" + corpus_name + "/", title=corpus_name + " distances")
    dist = Dist(corpus)
    mean_frame, std = [], []
    for rel in dist.keys():
        df = pd.DataFrame.from_dict({rel: dist[rel]})
        mean_frame.append(df.mean())
        std.append(df.std())
    mean = pd.DataFrame(pd.concat(mean_frame), columns=["mean"])
    std = pd.DataFrame(pd.concat(std), columns=["std"])
    res = pd.concat((mean, std), axis=1)
    data = {'sentence length': [], 'Vocab': [], 'tokenisation length': []}
    tokenizer_bert, _ = get_bert()
    tokenizer_scibert, _ = get_bert(bert_type='scibert')
    for x in X:
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('BERT VOCAB')
        data['tokenisation length'].append(len(tokenizer_bert.tokenize(x[0])))
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('SciBERT VOCAB')
        data['tokenisation length'].append(len(tokenizer_scibert.tokenize(x[0])))
    data = pd.DataFrame(data)
    data = data.sort_values(by=['sentence length'])
    print(data)
    title = corpus_name + " tokenisation analysis"
    plt.rcParams["figure.figsize"] = (9, 9)
    pylab.mpl.style.use('seaborn')
    g = sns.relplot(x="sentence length", y="tokenisation length",
                    hue="Vocab", style="Vocab",
                    hue_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    kind="line", data=data,
                    col_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    style_order=['SciBERT VOCAB', 'BERT VOCAB'])
    sns.despine()
    plt.title(title)
    plt.show()
    plt.savefig(title + ".png")
def train():
    print "train"
    start_time = time.time()
    config = SiameseTCNNConfig()
    corpus = Corpus(train_file, vocab_file, 0.0, config.seq_length, config.vocab_size)
    testcorpus = Corpus(test_file, vocab_file, 1.0, config.seq_length, config.vocab_size)
    print(corpus)
    print(testcorpus)
    config.vocab_size = len(corpus.words)
    train_data = TensorDataset(torch.LongTensor(corpus.x_train1),
                               torch.LongTensor(corpus.x_train2),
                               torch.FloatTensor(corpus.y_train))
    test_data = TensorDataset(torch.LongTensor(testcorpus.x_test1),
                              torch.LongTensor(testcorpus.x_test2),
                              torch.FloatTensor(testcorpus.y_test))
    print('Configuring CNN model...')
    model = SiameseTextCNN(config)
    print(model)

    # optimizer and loss function
    # criterion = nn.CrossEntropyLoss(size_average=False)
    # criterion = torch.nn.BCELoss(reduce=False, size_average=False)
    if config.contra_loss:
        criterion = ContrastiveLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # set the mode to train
    print("Training and evaluating...")
    best_F1 = 0.0
    for epoch in range(config.num_epochs):
        # load the training data in batch
        model.train()
        train_loader = DataLoader(train_data, batch_size=config.batch_size)
        ii = 0
        for x1_batch, x2_batch, y_batch in train_loader:
            ii += 1
            if ii % 100 == 0:
                print epoch, "batch", ii
            inputs1, inputs2, targets = Variable(x1_batch), Variable(x2_batch), Variable(y_batch)
            optimizer.zero_grad()
            outputs1, outputs2 = model(inputs1, inputs2)  # forward computation
            loss = criterion(outputs1, outputs2, targets)
            """ todo """
            # backward propagation and update parameters
            loss.backward()
            optimizer.step()

        # evaluate on both training and test dataset
        print "epoch", epoch
        train_loss, train_F1 = evaluate(train_data, model, criterion)
        test_loss, test_F1 = evaluate(test_data, model, criterion)
        #print "train_loss:",train_loss
        if test_F1 > best_F1:
            # store the best result
            best_F1 = test_F1
            improved_str = '*'
            torch.save(model.state_dict(), model_file)
        else:
            improved_str = ''
        time_dif = get_time_dif(start_time)
        msg = "Epoch {0:3}, Train_loss: {1:>7.3}, Train_F1 {2:>6.3%}, " \
              + "Test_loss: {3:>6.3}, Test_F1 {4:>6.3%}, Time: {5} {6}"
        print(msg.format(epoch + 1, train_loss, train_F1, test_loss, test_F1,
                         time_dif, improved_str))
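# ContrastiveLoss is referenced above but not defined in this snippet; a common
# formulation (Hadsell-style margin loss) as a sketch -- the project's own class
# may differ in its margin value or label convention.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, out1, out2, target):
        # target = 1 for similar pairs, 0 for dissimilar pairs (assumed convention)
        dist = F.pairwise_distance(out1, out2)
        loss_similar = target * dist.pow(2)
        loss_dissimilar = (1 - target) * torch.clamp(self.margin - dist, min=0).pow(2)
        return 0.5 * (loss_similar + loss_dissimilar).mean()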
        except:
            author = i['author']['name']
        txt = i['title'] + ". " + i['summary']
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        try:
            coAuth = [aut['name'] for aut in i['author']][1:]
        except:
            coAuth = "No co-author"
        doc = Document.ArxivDocument(datet, i['title'], author, txt, i['id'], coAuth)
        corpus_Arxiv.add_doc(doc)


# Initialise the corpora
corpus_Reddit = Corpus.Corpus("Corona_red")
corpus_Arxiv = Corpus.Corpus("Corona_arx")

# Load the data into the corpora
loadArxiv(corpus_Arxiv)
loadReddit(corpus_Reddit)

# Print the number of documents and authors
print("Reddit corpus created, %d documents and %d authors"
      % (corpus_Reddit.ndoc, corpus_Reddit.naut))
print("Arxiv corpus created, %d documents and %d authors"
      % (corpus_Arxiv.ndoc, corpus_Arxiv.naut))
print()

# Save the corpora
    del content[:]
    return all_docs


# Returns the maximum X terms of a list
def get_top_terms(dict, number_of_terms):
    sorted_tf_idf_list = sorted(dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_tf_idf_list[0:number_of_terms]


print os.listdir(".")

# Read text files from the base folder
for file in os.listdir(base_folder):
    print "Now grabbing contents from file", file
    cor = Corpus(file)
    f = codecs.open(base_folder + file, encoding="utf-8")
    cor.list_of_documents = import_file(f)
    print "Number of documents is", len(cor.list_of_documents)
    f.close()
    corpses.append(cor)

# Print stats for all corpuses
for corpse in corpses:
    corpse.print_stats()

while True:
    input = raw_input("Enter word: ")
    count = 0
class DataSpace:
    corpus = Corpus()
    featurematrix = []
    doctermmatrix = []
    classlabels = []
    spacename = ""   # numoffiles_classtask

    def __init__(self):
        self.corpus = Corpus()
        self.featurematrix = []
        self.doctermmatrix = []
        self.classlabels = []
        self.spacename = ""

    def __getfeaturematrix(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_featurematrix.p"
        return self.featurematrix, fname

    def __getdoctermmatrix(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_doctermmatrix.p"
        return self.doctermmatrix, fname

    def __getcorpora(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_corpora.p"
        return self.corpora, fname

    def __dumpfeaturematrix(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_featurematrix.p"
        pickle.dump(self.featurematrix, open(fname, "wb"))

    def __dumpdoctermmatrix(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_doctermmatrix.p"
        pickle.dump(self.doctermmatrix, open(fname, "wb"))

    def __dumpcorpora(self):
        fname = IOtools.picklepath + os.sep + self.spacename + "_corpora.p"
        pickle.dump(self.corpora, open(fname, "wb"))

    ''' nfile is a dict storing the number of files per classlabel to be read
        nfile default value 0 rec.'''
    def buildcorpus(self, nfile, resourcepath, classlabels, corpusname, taskname,
                    plaintext, nostopwords):
        labelwisepathlist = {}
        for classlabel in classlabels:
            labelwisepathlist[classlabel] = []
        for classlabel in classlabels:
            p = resourcepath + os.sep + classlabel + os.sep
            fileids = []
            fileids = IOtools.getfilenames_of_dir(p, removeextension=False)[:nfile[classlabel]]
            labelwisepathlist[classlabel].extend(fileids)
        self.corpus.setname(corpusname)
        self.corpus.read_corpus(labelwisepathlist, plaintext, nostopwords)
        ncat = len(classlabels)
        # nfile is a dict, so the total number of texts is the sum over classlabels
        self.spacename = taskname + "-" + str(sum(nfile.values())) + "texts"

    def compute_tfidf(self):
        ''' fix the matrices. save as csv '''

    def build_featurematrix(self):
        for corpus in self.corpora:
            datapoints = corpus.build_featurematrix()
            for k, v in datapoints.iteritems():
                self.featurematrix.append([k] + v + [corpus.label])
        self.record_matrix(self.featurematrix, "featureMATRIX")

    def build_termdocmatrix(self):
        cfdDocTerm = nltk.ConditionalFreqDist()
        #docs = []
        labelleddocs = []
        for corpus in self.corpora:
            cfd = corpus.build_termmatrix()
            label = corpus.label
            print label
            for term in cfd.conditions():
                #docs.extend(list(cfd[term]))
                #labelleddocs = [(doc, label) for doc in docs]
                #print list(cfd[term])
                for fileid in list(cfd[term]):
                    cfdDocTerm[term].inc(fileid)
                    labelleddocs.append((fileid, label))
        print labelleddocs
        labelleddocs = list(set(labelleddocs))
        print labelleddocs
        CFDhelpers.recordCFD(cfdDocTerm, self.spacename + "CFDdocterm")
        matrix = []
        matrix.append(cfdDocTerm.conditions())
        for fileid, label in labelleddocs:
            row = []
            for term in cfdDocTerm.conditions():
                numofoccurrences = cfdDocTerm[term][fileid]
                row.append(numofoccurrences)
            self.doctermmatrix.append([fileid] + row + [label])
            matrix.append([fileid] + row + [label])
        self.record_matrix(matrix, "DocTermMATRIXn")
        self.record_matrix(self.doctermmatrix, "DocTermMatrix")
        self.__dumpdoctermmatrix()

    def record_matrix(self, matrix, mname):
        fname = IOtools.matrixpath + os.sep + mname + "-" + self.spacename + "MATRIX.m"
        IOtools.todisc_matrix(matrix, fname)
import Corpus
import numpy as np
import IBM1
import IBM2
import HMM

print("loading the corpus...")
corpus = Corpus.Corpus("eutrans/training", separator="#")
#corpus = Corpus.Corpus("corpus.txt", separator="---")
corpus.print_corpus_description()
print("...done")

#%% Testing IBM1
# print(" ")
# print("*"*50)
# print(" ")
# print("Building IBM1 item...")
# ibm1 = IBM1.IBM1(corpus)
# print("...done")
# print("starting to train IBM1...")
# ibm1_nb_training_step = 10
# imb1perplexityevol = ibm1.train(ibm1_nb_training_step, verbose=True)
# print("...done")
#
# print "\nIBM1 perplexity : ", ibm1.get_perplexity(), "\n"
#
# f2e = np.argmax(ibm1.proba_f_knowing_e, axis=1)
# print "IBM1 Translations :"
# for i in range(len(corpus.french_words)):
#     print corpus.french_words[i], " --> ", corpus.english_words[f2e[i]]
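# The IBM1 block above is commented out; this is a self-contained toy sketch of
# the IBM Model 1 EM update it refers to, independent of the project's IBM1 class
# (no NULL word, uniform initialisation).
from collections import defaultdict


def train_ibm1_toy(pairs, iterations=10):
    # pairs: list of (french_tokens, english_tokens) sentence pairs
    f_vocab = {f for fs, _ in pairs for f in fs}
    t = defaultdict(lambda: 1.0 / len(f_vocab))  # t[(f, e)] ~ P(f | e), uniform init
    for _ in range(iterations):
        count = defaultdict(float)
        total = defaultdict(float)
        for fs, es in pairs:
            for f in fs:
                norm = sum(t[(f, e)] for e in es)
                for e in es:
                    frac = t[(f, e)] / norm
                    count[(f, e)] += frac  # expected alignment counts (E-step)
                    total[e] += frac
        for (f, e) in count:
            t[(f, e)] = count[(f, e)] / total[e]  # re-estimate P(f | e) (M-step)
    return t


# toy usage
t = train_ibm1_toy([(["la", "maison"], ["the", "house"]),
                    (["la", "fleur"], ["the", "flower"])])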
from glove import Glove, Corpus

corpus = Corpus()
sent_token = [["안녕", "하세요"], ["지니티토리", "입니다"]]
corpus.fit(sent_token, window=20)

# model
glove = Glove(no_components=128, learning_rate=0.01)
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=False)
glove.add_dictionary(corpus.dictionary)

# save
glove.save(DATA_DIR + '/glove_w20_epoch50.model')
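# A usage sketch for the saved model, assuming the glove_python API
# (Glove.load / most_similar) and the same DATA_DIR as above.
from glove import Glove

glove = Glove.load(DATA_DIR + '/glove_w20_epoch50.model')
print(glove.most_similar("안녕", number=5))            # nearest neighbours
vector = glove.word_vectors[glove.dictionary["안녕"]]  # raw 128-dim embedding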