def main():
    """Build vocabulary/tag index files and trimmed embeddings (non-ELMo).

    Reads the train and test CoNLL files named in ``cf``, builds the
    word/word-tag and char/char-tag index maps, saves them through
    ``save_data_to_files`` and, depending on the ``cf`` flags, exports
    trimmed pretrained word and/or character embeddings.

    Raises:
        Exception: if ``cf.EMBEDDING_MODEL`` is ``"Elmo"``.
    """
    if cf.EMBEDDING_MODEL == "Elmo":
        raise Exception("Please use build_data_elmo instead.")

    # Train and test files are read through a single reader.
    reader = ConllCorpusReader(
        cf.DATA_FOLDER,
        [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
        ['words', 'pos'],
    )
    sentences = reader.tagged_sents()

    # The returned sets are not used below; the call is kept because it
    # logs how many tags occur only in the test set.
    _test_only_wordtags, _test_only_chartags = get_unique_test_tag_set()

    sentence_count = len(sentences)
    logger.info("%d sentences loaded." % sentence_count)

    word_ids = get_word_and_wordtag_ids(sentences)
    word_to_ix, ix_to_word, wtag_to_ix, ix_to_wtag = word_ids
    char_ids = get_char_and_chartag_ids(sentences)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = char_ids

    save_data_to_files(
        sentences,
        word_to_ix, wtag_to_ix, ix_to_word, ix_to_wtag,
        char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag,
    )

    if cf.USE_PRETRAINED_WORD_EMBEDDINGS:
        # The embedding vocabulary is loaded but not consumed here.
        # NOTE(review): word-level OOV generation is disabled in this
        # function, yet the export below still reads cf.EMB_OOV_FILENAME -
        # confirm that file is produced elsewhere.
        emb_vocab = get_emb_vocab(cf.EMB_VEC_FILENAME)
        export_trimmed_embedding_vectors(
            word_to_ix,
            cf.WORD_EMBEDDING_DIM,
            cf.EMB_OOV_FILENAME,
            cf.EMB_VEC_FILENAME,
            cf.EMB_TRIMMED_FILENAME,
            cf.WORD_EMBEDDING_DIM,
        )

    if cf.USE_PRETRAINED_CHAR_EMBEDDINGS:
        char_emb_vocab = get_emb_vocab(cf.CHAR_EMB_VEC_FILENAME)
        # Generate embeddings for characters missing from the pretrained
        # vocabulary, then export them together with the in-vocab ones.
        generate_oov_embeddings(
            ix_to_char,
            char_emb_vocab,
            cf.CHAR_EMB_BIN_FILENAME,
            cf.CHAR_OOV_TOKENS_FILENAME,
            cf.CHAR_EMB_OOV_FILENAME,
        )
        export_trimmed_embedding_vectors(
            char_to_ix,
            cf.CHAR_EMBEDDING_DIM,
            cf.CHAR_EMB_OOV_FILENAME,
            cf.CHAR_EMB_VEC_FILENAME,
            cf.CHAR_EMB_TRIMMED_FILENAME,
            cf.CHAR_EMBEDDING_DIM,
        )

    logger.info("Data building complete.")
def read_turkish_corpus(self):
    """Read the Turkish POS CoNLL file and return lower-cased sentences.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples in which
        the word has been lower-cased and the tag is left unchanged.
    """
    corpus_file = 'path/to/turkish-pos-conll-file'
    reader = ConllCorpusReader(
        'path/to/languages-corpora',
        corpus_file,
        ('words', 'pos'),
        encoding='UTF-8',
    )

    lowered_sentences = []
    for sentence in reader.tagged_sents(corpus_file):
        lowered_sentences.append(
            [(word.lower(), tag) for (word, tag) in sentence]
        )
    return lowered_sentences
def get_unique_test_tag_set():
    """Find word tags and tag characters that occur only in the test set.

    The ``"<PAD>"`` and ``"<SELF>"`` tags are ignored in both corpora.

    Returns:
        A ``(test_unique_wordtags, test_unique_chartags)`` tuple of sets:
        the tags present in the test file but not in the training file, and
        the characters of test tags that never appear in a training tag.
    """
    logger.info("Building set of testset-unique tags...")

    train_reader = ConllCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME], ['words', 'pos'])
    test_reader = ConllCorpusReader(cf.DATA_FOLDER, [cf.TEST_FILENAME], ['words', 'pos'])
    train_sents = train_reader.tagged_sents()
    test_sents = test_reader.tagged_sents()

    placeholders = ("<PAD>", "<SELF>")

    train_wordtags = {
        tag
        for sentence in train_sents
        for _word, tag in sentence
        if tag not in placeholders
    }
    train_chartags = {symbol for tag in train_wordtags for symbol in tag}

    test_wordtags = {
        tag
        for sentence in test_sents
        for _word, tag in sentence
        if tag not in placeholders
    }
    # Characters of a tag already seen in training are all in
    # train_chartags, so the differences below only keep test-only items.
    test_unique_wordtags = test_wordtags - train_wordtags
    test_unique_chartags = {
        symbol for tag in test_wordtags for symbol in tag
    } - train_chartags

    counts = (len(test_unique_wordtags), len(test_unique_chartags))
    logger.info(
        "%d unique word tags and %d unique char tags found in the test dataset."
        % counts)
    return test_unique_wordtags, test_unique_chartags
def main():
    """Build vocabulary/tag index files for the ELMo variant of the model.

    Reads the train and test CoNLL files named in ``cf``, obtains the
    word/word-tag index maps through ``get_ids_and_elmo_embeddings`` and the
    char/char-tag maps through ``get_char_and_chartag_ids``, then saves them
    with ``save_data_to_files``.
    """
    reader = ConllCorpusReader(
        cf.DATA_FOLDER,
        [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
        ['words', 'pos'],
    )
    sentences = reader.tagged_sents()

    # The returned sets are not used below; the call is kept because it
    # logs how many tags occur only in the test set.
    _test_only_wordtags, _test_only_chartags = get_unique_test_tag_set()

    sentence_count = len(sentences)
    logger.info("%d sentences loaded." % sentence_count)

    # Note the return order: ix_to_word comes before word_to_ix here.
    # The embedding vectors returned last are not used in this function.
    elmo_ids = get_ids_and_elmo_embeddings(sentences)
    ix_to_word, word_to_ix, wtag_to_ix, ix_to_wtag, _embedding_vectors = elmo_ids

    char_ids = get_char_and_chartag_ids(sentences)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = char_ids

    save_data_to_files(
        sentences,
        word_to_ix, wtag_to_ix, ix_to_word, ix_to_wtag,
        char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag,
    )
    logger.info("Data building complete.")
def load_datasets(word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix, ix_to_char, ix_to_word):
    """Load the train and test corpora and wrap each in a ``DataLoader``.

    All index maps are passed through unchanged to ``tagged_sents_to_numpy``.

    Returns:
        A dict of ``DataLoader`` objects stored under ``"train"`` and
        ``"test"``. The dict also carries a ``"dev"`` key whose value stays
        ``None``.
    """
    # NOTE(review): "dev" is initialised but never filled, while "test" is
    # added by the loop below - confirm which keys callers expect.
    data_iterators = {"train": None, "dev": None}
    word_index = 1  # Used for elmo models only

    splits = zip(["train", "test"], [cf.TRAIN_FILENAME, cf.TEST_FILENAME])
    for dataset, filename in splits:
        reader = ConllCorpusReader(cf.DATA_FOLDER, [filename], ['words', 'pos'])
        tagged_sents = reader.tagged_sents()

        # word_index is threaded through consecutive calls so the value
        # returned for "train" is passed in again for "test".
        (data_w, data_x, data_y, data_f,
         rejected_sents, rejected_words, filtered_words,
         non_alphabetical_words, rejected_tags,
         word_index) = tagged_sents_to_numpy(
            tagged_sents, word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix,
            ix_to_char, ix_to_word, dataset, word_index)

        if cf.WORD_LEVEL_WITH_FLAGGER:
            torch_dataset = MyDatasetWithFlags(data_w, data_x, data_y, data_f)
        else:
            torch_dataset = MyDataset(data_w, data_x, data_y)

        data_iterator = DataLoader(torch_dataset, batch_size=cf.BATCH_SIZE, pin_memory=True)
        data_iterators[dataset] = data_iterator

        batch_count = len(data_iterator)
        if cf.GRANULARITY in [CHAR_LEVEL, CHAR_AND_WORD_LEVEL]:
            unit = "words"
        else:
            unit = "sentences"
        summary = "Loaded %d %s batches.\n" % (batch_count, dataset)
        summary = summary + " (%d x %d = ~%d %s total)" % (
            batch_count, cf.BATCH_SIZE, batch_count * cf.BATCH_SIZE, unit)
        logger.info(summary)

        # Report everything that was dropped while converting this split.
        if len(rejected_sents) > 0:
            total_sents = len(tagged_sents) + len(rejected_sents)
            logger.warning(
                "%d of %d sentences from the %s set were trimmed due to being too long or short."
                % (len(rejected_sents), total_sents, dataset))
        if len(rejected_words) > 0:
            logger.warning(
                "%d words from the %s set were trimmed due to being too long."
                % (len(rejected_words), dataset))
        if len(rejected_tags) > 0:
            logger.warning(
                "%d labels from the %s set were trimmed due to being too long."
                % (len(rejected_tags), dataset))
        if len(filtered_words) > 0:
            logger.info(
                "%d words were filtered from the %s set due to beginning with undesirable character sequences."
                % (len(filtered_words), dataset))
        if len(non_alphabetical_words) > 0:
            logger.info(
                "%d words were filtered from the %s set due to being entirely non-alphabetical."
                % (len(non_alphabetical_words), dataset))

    return data_iterators
def tagged_sents(self, fileids=None, categories=None):
    """Return tagged sentences selected by file ids and/or categories.

    ``fileids`` and ``categories`` are passed to ``self._resolve`` and its
    result is forwarded to ``ConllCorpusReader.tagged_sents``.
    """
    resolved = self._resolve(fileids, categories)
    return ConllCorpusReader.tagged_sents(self, resolved)
def tagged_sents(self, fileids=None, categories=None):
    """Return tagged sentences selected by file ids and/or categories.

    The two selectors are combined by ``self._resolve``; whatever it returns
    is handed to ``ConllCorpusReader.tagged_sents`` as the file selection.
    """
    selection = self._resolve(fileids, categories)
    return ConllCorpusReader.tagged_sents(self, selection)
lst.append(list_train[i].split()) lst_train = [] for i in range(0,len(lst)): lst2 = [] for j in range(0,len(lst[i])): lst2.append(lst[i][j].split('|')) lst_train.append(lst2) print('Train Set Readed') ## Reads the WikiGold Test Set reader = ConllCorpusReader('/home/nicor/Documents','.conll',('words','pos')) list_test = reader.tagged_sents('wikigold.txt') lst_test = [] for i in range(0,len(list_test)): lst1 = [] lst2 = [] lst3 = [] for j in range(0, len(list_test[i])): lst1.append(list_test[i][j][0]) list2 = nltk.pos_tag(lst1) for j in range(0, len(list2)): lst3.append([list2[j][0], list2[j][1], list_test[i][j][1]]) lst_test.append(lst3) ### Defines the Features to be obtained from every sentence
from __future__ import division

from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import (
    DictionaryProbDist,
    FreqDist,
    LaplaceProbDist,
    MLEProbDist,
    SimpleGoodTuringProbDist,
)

# Estimates HMM-style start, transition and emission distributions from a
# tagged training corpus. Dict comprehensions over ``.items()`` are used so
# the script runs on Python 3 as well as 2.7: tuple-parameter lambdas are a
# SyntaxError on Python 3 and ``iteritems()`` no longer exists.

# Training corpus: one "word<sep>tag" token per line in ./de-train.tt.
conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))

# The 12 POS tags.
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')

# Read the (word, tag) stream once and reuse it below.
words = conllreader.tagged_words()
_tags = [pair[1] for pair in words]

sentslen = len(conllreader.tagged_sents())  # number of sentences

# Frequency of each tag over all tokens.
tagfdist = FreqDist(_tags)

# Frequency of the tag carried by the first token of each sentence.
firsttagfdist = FreqDist(sent[0][1] for sent in conllreader.tagged_sents())

# Start distribution: count(first tag) / number of sentences, plus the
# Laplace, Simple Good-Turing and MLE estimates over the same counts.
A0j = DictionaryProbDist(
    {tag: count / sentslen for tag, count in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Consecutive tag pairs over the whole token stream. As in the original
# index loop, pairs also span sentence boundaries.
TagPair = list(zip(_tags, _tags[1:]))
TagPairfdist = FreqDist(TagPair)

# Transition distribution: count(tag, next tag) / count(tag).
Aij = DictionaryProbDist(
    {pair: count / tagfdist.get(pair[0]) for pair, count in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# Emission distribution: count(word, tag) / count(tag).
TagWordfdist = FreqDist(words)
Biw = DictionaryProbDist(
    {pair: count / tagfdist.get(pair[1]) for pair, count in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)