def load_data_vlsp2018(folder='../data/vlsp2018'):
    # Read the VLSP 2018 train/test splits, both in four-column CoNLL format.
    train_sents = ConllCorpusReader(
        folder, 'train.conll', ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    test_sents = ConllCorpusReader(
        folder, 'test.conll', ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    # Drop empty sentences produced by blank separator lines.
    train_sents = [x for x in train_sents if len(x) > 0]
    test_sents = [x for x in test_sents if len(x) > 0]
    print("#train_sents", len(train_sents))
    print("#test_sents", len(test_sents))
    return train_sents, test_sents

def load_data_conll2003():
    train_sents = ConllCorpusReader(
        '../data/conll2003', 'train.txt',
        ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    test_sents = ConllCorpusReader(
        '../data/conll2003', 'valid.txt',
        ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    train_sents = [x for x in train_sents if len(x) > 0]
    test_sents = [x for x in test_sents if len(x) > 0]
    print("#train_sents", len(train_sents))
    print("#test_sents", len(test_sents))
    return train_sents, test_sents

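# A minimal usage sketch for the loaders above, assuming the CoNLL-format files
# exist at the hard-coded paths: iob_sents() yields one list of
# (word, pos, iob_tag) triples per sentence.
train_sents, test_sents = load_data_conll2003()
word, pos, iob = train_sents[0][0]
print(word, pos, iob)  # first token of the first training sentence
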
def _get_corpus(self, file_path: str) -> ConllCorpusReader:
    path = PurePath(file_path)
    return ConllCorpusReader(
        root=str(path.parents[0]),
        fileids=str(path.name),
        columntypes=["words", "pos", "ignore", "chunk"],
    )

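# A usage sketch for _get_corpus (illustrative only): `loader` is a hypothetical
# instance of whatever class the method belongs to, and the path is assumed to
# point at a four-column CoNLL file.
reader = loader._get_corpus("../data/conll2003/train.txt")
print(len(reader.iob_sents()))
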
def main():
    if cf.EMBEDDING_MODEL == "Elmo":
        raise Exception("Please use build_data_elmo instead.")
    # if cf.MODEL_TYPE == S2S:
    corpusReader = ConllCorpusReader(cf.DATA_FOLDER,
                                     [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
                                     ['words', 'pos'])
    # elif cf.MODEL_TYPE == S21:
    #     corpusReader = TabbedCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME, cf.TEST_FILENAME])
    tagged_sents = corpusReader.tagged_sents()
    test_unique_wordtags, test_unique_chartags = get_unique_test_tag_set()
    logger.info("%d sentences loaded." % len(tagged_sents))
    # tagged_sents = clean_sentences(tagged_sents)
    # logger.info("%d sentences after cleaning (removing short/long sentences)." % len(tagged_sents))
    word_to_ix, ix_to_word, wtag_to_ix, ix_to_wtag = get_word_and_wordtag_ids(
        tagged_sents)  # , test_unique_wordtags)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = get_char_and_chartag_ids(
        tagged_sents)  # , test_unique_chartags)
    save_data_to_files(tagged_sents, word_to_ix, wtag_to_ix, ix_to_word,
                       ix_to_wtag, char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag)
    if cf.USE_PRETRAINED_WORD_EMBEDDINGS:
        # Get all words in the embedding vocab
        emb_vocab = get_emb_vocab(cf.EMB_VEC_FILENAME)
        # Generate OOV embeddings for any words in ix_to_word that aren't in emb_vocab
        # generate_oov_embeddings(ix_to_word, emb_vocab, cf.EMB_BIN_FILENAME, cf.OOV_TOKENS_FILENAME, cf.EMB_OOV_FILENAME)
        # Combine OOV embeddings with IV embeddings and export them to a file
        export_trimmed_embedding_vectors(word_to_ix, cf.WORD_EMBEDDING_DIM,
                                         cf.EMB_OOV_FILENAME, cf.EMB_VEC_FILENAME,
                                         cf.EMB_TRIMMED_FILENAME, cf.WORD_EMBEDDING_DIM)
    if cf.USE_PRETRAINED_CHAR_EMBEDDINGS:
        char_emb_vocab = get_emb_vocab(cf.CHAR_EMB_VEC_FILENAME)
        generate_oov_embeddings(ix_to_char, char_emb_vocab, cf.CHAR_EMB_BIN_FILENAME,
                                cf.CHAR_OOV_TOKENS_FILENAME, cf.CHAR_EMB_OOV_FILENAME)
        export_trimmed_embedding_vectors(char_to_ix, cf.CHAR_EMBEDDING_DIM,
                                         cf.CHAR_EMB_OOV_FILENAME, cf.CHAR_EMB_VEC_FILENAME,
                                         cf.CHAR_EMB_TRIMMED_FILENAME, cf.CHAR_EMBEDDING_DIM)
    logger.info("Data building complete.")

def read_turkish_corpus(self):
    tagged_sentences_raw = []
    conll_reader = ConllCorpusReader('path/to/languages-corpora',
                                     'path/to/turkish-pos-conll-file',
                                     ('words', 'pos'), encoding='UTF-8')
    tagged_sentences_raw_map = conll_reader.tagged_sents(
        'path/to/turkish-pos-conll-file')
    for sent in tagged_sentences_raw_map:
        tagged_sentences_raw.append(sent)
    tagged_sentences = [[(w.lower(), t) for (w, t) in s]
                        for s in tagged_sentences_raw]
    return tagged_sentences

def get_unique_test_tag_set():
    logger.info("Building set of testset-unique tags...")
    corpusReaderTrain = ConllCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME],
                                          ['words', 'pos'])
    corpusReaderTest = ConllCorpusReader(cf.DATA_FOLDER, [cf.TEST_FILENAME],
                                         ['words', 'pos'])
    tagged_sents_train = corpusReaderTrain.tagged_sents()
    tagged_sents_test = corpusReaderTest.tagged_sents()

    # Collect every tag (and every character of every tag) seen in training.
    train_wordtags = set()
    train_chartags = set()
    for sent in tagged_sents_train:
        for word, tag in sent:
            if tag != "<PAD>" and tag != "<SELF>":
                train_wordtags.add(tag)
                for char in tag:
                    train_chartags.add(char)

    # Keep only tags/characters that appear in the test set but never in training.
    test_unique_wordtags = set()
    test_unique_chartags = set()
    for sent in tagged_sents_test:
        for word, tag in sent:
            if tag != "<PAD>" and tag != "<SELF>":
                if tag not in train_wordtags:
                    test_unique_wordtags.add(tag)
                for char in tag:
                    if char not in train_chartags:
                        test_unique_chartags.add(char)

    logger.info(
        "%d unique word tags and %d unique char tags found in the test dataset."
        % (len(test_unique_wordtags), len(test_unique_chartags)))
    return test_unique_wordtags, test_unique_chartags

def main():
    corpusReader = ConllCorpusReader(cf.DATA_FOLDER,
                                     [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
                                     ['words', 'pos'])
    tagged_sents = corpusReader.tagged_sents()
    test_unique_wordtags, test_unique_chartags = get_unique_test_tag_set()
    logger.info("%d sentences loaded." % len(tagged_sents))
    ix_to_word, word_to_ix, wtag_to_ix, ix_to_wtag, embedding_vectors = \
        get_ids_and_elmo_embeddings(tagged_sents)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = \
        get_char_and_chartag_ids(tagged_sents)
    save_data_to_files(tagged_sents, word_to_ix, wtag_to_ix, ix_to_word,
                       ix_to_wtag, char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag)
    logger.info("Data building complete.")

def main() -> None:
    """Application entry point."""
    corpus_root = Path('corpus')

    # Set up result logging
    global _logger
    setup_logger(_logger, corpus_root / 'collocations.log')

    # Load the stopwords
    nltk.download('stopwords', '.env/share/nltk_data')
    stop_words = set(stopwords.words('russian'))

    # Import the corpus
    tags_root = corpus_root / 'pos_tagging'
    reader = ConllCorpusReader(
        str(tags_root),
        [f.name for f in tags_root.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    _logger.info('Documents: %d', len(reader.fileids()))
    _logger.info('Tokens in the first document (%s): %d',
                 reader.fileids()[0],
                 len(reader.words(reader.fileids()[0])))

    _logger.info('Loading sentences')
    sentences = reader.sents()

    # Build contingency tables for all words in the corpus
    _logger.info('Computing the contingency table over all words')
    bigram_finder = BigramCollocationFinder.from_documents(
        [w.lower() for w in sent] for sent in tqdm(sentences))
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)

    # Now filter by frequency and drop punctuation and stopwords
    _logger.info('Filtering punctuation and stopwords, applying a frequency threshold')
    bigram_finder.apply_freq_filter(5)
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w in stop_words)
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)

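# print_samples is not shown in this snippet; a minimal sketch of what such a
# helper could look like, assuming it just prints the top bigrams under a couple
# of association measures (the body is an assumption, not the original code):
from nltk.collocations import BigramAssocMeasures

def print_samples(finder, n=10):
    measures = BigramAssocMeasures()
    # Highest-PMI and highest-likelihood-ratio bigrams found so far.
    print('PMI:', finder.nbest(measures.pmi, n))
    print('Likelihood ratio:', finder.nbest(measures.likelihood_ratio, n))
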
def load_datasets(word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix, ix_to_char, ix_to_word):
    data_iterators = {"train": None, "dev": None}
    test_dataset = []
    word_index = 1  # Used for elmo models only
    for i, dataset in enumerate(["train", "test"]):
        # if cf.MODEL_TYPE == S2S:
        corpusReader = ConllCorpusReader(cf.DATA_FOLDER,
                                         [[cf.TRAIN_FILENAME, cf.TEST_FILENAME][i]],
                                         ['words', 'pos'])
        # elif cf.MODEL_TYPE == S21:
        #     corpusReader = TabbedCorpusReader(cf.DATA_FOLDER, [[cf.TRAIN_FILENAME, cf.TEST_FILENAME][i]])
        tagged_sents = corpusReader.tagged_sents()
        (data_w, data_x, data_y, data_f, rejected_sents, rejected_words,
         filtered_words, non_alphabetical_words, rejected_tags, word_index) = \
            tagged_sents_to_numpy(tagged_sents, word_to_ix, wtag_to_ix, char_to_ix,
                                  ctag_to_ix, ix_to_char, ix_to_word, dataset, word_index)
        if cf.WORD_LEVEL_WITH_FLAGGER:
            myDataset = MyDatasetWithFlags(data_w, data_x, data_y, data_f)
        else:
            myDataset = MyDataset(data_w, data_x, data_y)
        data_iterator = DataLoader(myDataset, batch_size=cf.BATCH_SIZE, pin_memory=True)
        data_iterators[dataset] = data_iterator
        # for d in data_iterator:
        #     torch.set_printoptions(threshold=5000000)
        #     print(d)
        #     exit()
        logger.info("Loaded %d %s batches.\n" % (len(data_iterator), dataset)
                    + " (%d x %d = ~%d %s total)"
                    % (len(data_iterator), cf.BATCH_SIZE,
                       len(data_iterator) * cf.BATCH_SIZE,
                       "words" if cf.GRANULARITY in [CHAR_LEVEL, CHAR_AND_WORD_LEVEL]
                       else "sentences"))
        if len(rejected_sents) > 0:
            logger.warning("%d of %d sentences from the %s set were trimmed due to being too long or short."
                           % (len(rejected_sents), len(tagged_sents) + len(rejected_sents), dataset))
        if len(rejected_words) > 0:
            logger.warning("%d words from the %s set were trimmed due to being too long."
                           % (len(rejected_words), dataset))
        if len(rejected_tags) > 0:
            logger.warning("%d labels from the %s set were trimmed due to being too long."
                           % (len(rejected_tags), dataset))
        if len(filtered_words) > 0:
            logger.info("%d words were filtered from the %s set due to beginning with undesirable character sequences."
                        % (len(filtered_words), dataset))
        if len(non_alphabetical_words) > 0:
            logger.info("%d words were filtered from the %s set due to being entirely non-alphabetical."
                        % (len(non_alphabetical_words), dataset))
    return data_iterators

# Fragment of the reader internals this example relies on: iob_words() lazily
# builds (word, pos, ne) triples from the column grid.
def iob_words(self, fileids=None, tagset=None):
    def get_iob_words(grid):
        return self._get_iob_words(grid, tagset)
    return LazyMap(get_iob_words, self._grids(fileids))

def _get_iob_words(self, grid, tagset=None):
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(
        zip(self._get_column(grid, self._colmap['words']),
            pos_tags,
            self._get_column(grid, self._colmap['ne'])))

bject = ConllCorpusReader("/home/subham", 'train_ner.txt',
                          ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP'))
train_sents = bject.iob_sents('train_ner.txt')
bject1 = ConllCorpusReader("/home/subham", 'test_accuracy.txt',
                           ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP'))
# train_sents = bject.iob_sents('conll.txt')
test_sents = bject1.iob_sents('test_accuracy.txt')
# train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
# print(test_sents[0])
# print(train_sents[0])

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

for i in range(0, len(list_train)):
    lst.append(list_train[i].split())
lst_train = []
for i in range(0, len(lst)):
    lst2 = []
    for j in range(0, len(lst[i])):
        lst2.append(lst[i][j].split('|'))
    lst_train.append(lst2)
print('Train Set Read')

## Reads the WikiGold Test Set
reader = ConllCorpusReader('/home/nicor/Documents', '.conll', ('words', 'pos'))
list_test = reader.tagged_sents('wikigold.txt')
lst_test = []
for i in range(0, len(list_test)):
    lst1 = []
    lst2 = []
    lst3 = []
    for j in range(0, len(list_test[i])):
        lst1.append(list_test[i][j][0])
    # Re-tag the raw tokens with NLTK's POS tagger, then attach the gold label.
    list2 = nltk.pos_tag(lst1)
    for j in range(0, len(list2)):
        lst3.append([list2[j][0], list2[j][1], list_test[i][j][1]])
    lst_test.append(lst3)

### Defines the Features to be obtained from every sentence

# Copyright
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
from itertools import chain

import pycrfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

train = ConllCorpusReader("datasets/conll2003", "eng.train",
                          ["words", "pos", "ignore", "chunk"])
test = ConllCorpusReader("datasets/conll2003", "eng.testb",
                         ["words", "pos", "ignore", "chunk"])
train_sents = list(train.iob_sents())
test_sents = list(test.iob_sents())

def word2features(sent, i):
    # remove postag
    word = sent[i][0]
    # postag = sent[i][1]
    features = [
        "bias",
        "word.lower=" + word.lower(),
        "word[-3:]=" + word[-3:],
        "word[-2:]=" + word[-2:],
        "word.isupper=%s" % word.isupper(),
        "word.istitle=%s" % word.istitle(),
        "word.isdigit=%s" % word.isdigit(),
        # 'postag=' + postag,
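        # The original snippet is cut off above; a plausible continuation,
        # following the linked python-crfsuite CoNLL-2002 notebook (an
        # assumption, not the original author's code):
    ]
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]
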
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import (FreqDist, DictionaryProbDist, LaplaceProbDist,
                              SimpleGoodTuringProbDist, MLEProbDist)

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # training corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
          'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # the 12 universal POS tags

sentslen = len(conllreader.tagged_sents())  # number of sentences
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())  # tag frequencies
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # sentence-initial tag frequencies

# Initial-state probabilities: P(tag starts a sentence), with smoothing variants.
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Transition probabilities: P(next tag | current tag) from adjacent tag pairs.
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))
TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# Emission probabilities from (word, tag) counts.
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)

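# A quick usage sketch (illustrative, not part of the original): querying the
# Laplace-smoothed estimates built above. The de-test.t snippet further down
# apparently imports these as HMM parameters via a Train module.
print(A0jLap.prob('NOUN'))           # P(a sentence starts with NOUN)
print(AijLap.prob(('DET', 'NOUN')))  # P(NOUN follows DET)
print(BiwLap.prob(('der', 'DET')))   # P of the (word, tag) pair; 'der' is just an example token
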
from nltk.corpus.reader import ConllCorpusReader

a = {}

## Function to add an adjective to a noun key
def add_adj(noun_param, adj_param):
    if noun_param in a:
        a[noun_param].append(adj_param)
    else:
        a[noun_param] = [adj_param]

filedir = '/Users/fnascime/Documents/Sicily_Project/texts/'
filename = 'ilgattopardo_prima'
mycorpus = ConllCorpusReader(filedir, filename + '.conll',
                             ('ignore', 'words', 'ignore', 'pos',
                              'ignore', 'ignore', 'ignore', 'ignore'))
words = mycorpus.tagged_words()
list_len = len(words)

## Loop through file and retrieve adjectives directly associated with nouns (adjunct words)
for i in range(list_len):
    if words[i][1] == 'S':  # 'S' = noun in the Italian tagset
        if (i > 0) and (words[i - 1][1] == 'A'):  # adjective before the noun
            add_adj(words[i][0], words[i - 1][0])
        elif (i < list_len - 1) and (words[i + 1][1] == 'A'):  # adjective after the noun
            add_adj(words[i][0], words[i + 1][0])

## Loop through the list of words and find the ones with the most adjectives
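## A minimal sketch of that final step (an illustration, not the original code):
## rank the noun keys in `a` by how many adjectives were collected for each.
for noun, adjs in sorted(a.items(), key=lambda kv: len(kv[1]), reverse=True)[:10]:
    print(noun, len(adjs), adjs)
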
from nltk.corpus.reader import ConllCorpusReader
import Train

conllreader = ConllCorpusReader(".", "de-test.t", ('words', 'pos'))
states = Train.states

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    path = {}

    # Initialize base cases (t == 0)
    for y in states:
        if sum(emit_p.prob((obs[0], y1)) for y1 in states) != 0:
            V[0][y] = start_p.logprob(y) + emit_p.logprob((obs[0], y))
        else:
            # Unseen word: fall back to the start probability alone.
            V[0][y] = start_p.logprob(y)
        path[y] = [y]

    # Run Viterbi for t > 0
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            if sum(emit_p.prob((obs[t], y1)) for y1 in states) != 0:
                (prob, state) = max((V[t - 1][y0] + trans_p.logprob((y0, y))
                                     + emit_p.logprob((obs[t], y)), y0)
                                    for y0 in states)
            else:
                # Unseen word: score by the transition probability alone.
                (prob, state) = max((V[t - 1][y0] + trans_p.logprob((y0, y)), y0)
                                    for y0 in states)
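            # The original snippet ends around here; a plausible completion,
            # following the standard Viterbi recurrence (an assumption, not the
            # original author's code): record the best score, extend the path.
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath

    # Back-trace: the best final state gives the most likely tag sequence.
    (prob, state) = max((V[len(obs) - 1][y], y) for y in states)
    return (prob, path[state])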