def load_internal_conll(files, data_path="data/wikipedia2/"):
    """Load the internal CoNLL corpus, going through a pickle cache.

    The cache lives at ``data_path + "_".join(files) + ".p"``. If that file
    exists it is unpickled and returned untouched; otherwise the corpus files
    are parsed, the parsed data is pickled to that path and then returned.

    Args:
        files: Sequence of corpus file names (joined with ``_`` to name the
            cache and handed to ``ConllCorpusReader`` as file ids).
        data_path: Prefix for both the corpus root and the cache file; it is
            concatenated as-is, so it should end with a path separator.

    Returns:
        A list with one ``(words, ne_tags, links, stats)`` tuple per non-empty
        sentence. ``stats`` maps each of the six labels to the number of tags
        in the sentence that equal ``"O"`` (for ``"O"``) or contain the label
        (for every other label).
    """
    began = time()
    cache_file = data_path + "_".join(files) + ".p"

    if os.path.exists(cache_file):
        with open(cache_file, "rb") as cached:
            data = pickle.load(cached)
    else:
        reader = ConllCorpusReader(data_path, files, ["words", "pos", "chunk"])
        labels = ["O", "ORG", "PER", "LOC", "VESSEL", "MISC"]
        data = []
        for sent in reader.iob_sents():
            if sent == []:
                continue
            tokens, ne_tags, links = zip(*sent)
            counts = {}
            for label in labels:
                if label == "O":
                    # "O" must match exactly; the other labels match as substrings
                    hits = [int(tag == "O") for tag in ne_tags]
                else:
                    hits = [int(label in tag) for tag in ne_tags]
                counts[label] = np.sum(hits)
            data.append((tokens, ne_tags, links, counts))
        with open(cache_file, "wb") as cached:
            pickle.dump(data, cached)

    print("Loaded %s in %s seconds" % ('_'.join(files), time() - began))
    return data
def load_ner_data_label(root, filename, batchsize, labels2y=None):
    """Yield the batches ``process_tagged_sents`` builds for one CoNLL file.

    A ``ConllCorpusReader`` is opened on ``root`` with the column layout
    ``('words', 'pos', 'ne', 'chunk')``; the tagged sentences of ``filename``
    are passed, together with ``batchsize`` and ``labels2y``, to
    ``process_tagged_sents`` and every 3-item result is re-yielded as a tuple.

    Args:
        root: Corpus root directory.
        filename: File id (relative to ``root``) whose sentences are read.
        batchsize: Forwarded unchanged to ``process_tagged_sents``.
        labels2y: Forwarded unchanged to ``process_tagged_sents``.

    Yields:
        ``(masked_word_lsts, pos_lst, tags)`` tuples.
    """
    reader = ConllCorpusReader(
        root=root,
        fileids='.conll',
        columntypes=('words', 'pos', 'ne', 'chunk'),
    )
    sentences = reader.tagged_sents(filename)
    batches = process_tagged_sents(sentences, batchsize, labels2y)
    for masked_words, pos_tags, labels in batches:
        yield masked_words, pos_tags, labels
def demo(corpus, num_sents):
    """
    Loads a few sentences from a tagged corpus, trains a maximum entropy POS
    tagger on 90% of them, evaluates it on the remaining 10% and pickles the
    trained tagger to C{test.pkl} in the current working directory.

    If C{corpus} is not one of the supported names, a hint is printed and the
    function returns without training anything.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load (case-insensitive): C{brown},
        C{treebank}, C{floresta} or C{cintil}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    name = corpus.lower()
    if name == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif name == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif name == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]
    elif name == "cintil":
        print("Loading CINTIL")
        column_types = ['words', 'pos', 'ignore']
        cintil = ConllCorpusReader(
            '/home/dsbatista/extract-publico-relationships/pos-tagger',
            'cintil-fixed-reduced.conll',
            column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        # No corpus was loaded: stop here rather than crash with a NameError
        # on `tagged_sents` below.
        return

    # The first 10% of the sentences are held out for evaluation.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    # The accuracy is computed but not reported.
    maxent_tagger.evaluate(test_sents)

    # Protocol 1 is kept so that the pickle format stays the same as before.
    with open('test.pkl', "wb") as model_file:
        pickle.dump(maxent_tagger, model_file, 1)
def get_corpus_reader(fileids,
                      columntypes=(ConllCorpusReader.WORDS,
                                   ConllCorpusReader.POS,
                                   ConllCorpusReader.NE,
                                   ConllCorpusReader.IGNORE),
                      root=root):
    """Create a ``ConllCorpusReader`` for the given file ids.

    Args:
        fileids: File ids passed straight to ``ConllCorpusReader``.
        columntypes: Column layout of the files; defaults to words, POS tags,
            named-entity tags and one ignored column.
        root: Corpus root directory. The default is the module-level ``root``
            as it was when this function was defined.

    Returns:
        The newly constructed ``ConllCorpusReader``.
    """
    return ConllCorpusReader(root, fileids, columntypes)
def load_to_nltk(folder):
    """Open every file under a folder whose id matches ``.*\\.conll``.

    Only the first column (words) and the fifth column (POS tags) are read;
    columns two to four are ignored.

    :param folder: the root folder for the corpus
    :returns: a ``ConllCorpusReader`` rooted at ``folder``
    """
    column_layout = ('words', 'ignore', 'ignore', 'ignore', 'pos')
    return ConllCorpusReader(folder, r".*\.conll", column_layout)
def conllReader(corpus):
    '''
    Data reader for CoNLL format data.

    Reads the file id `corpus` from the "data/" directory with the column
    layout ('words', 'pos', 'tree').

    :param corpus: file id (relative to "data/") of the CoNLL file to read
    :returns: a pair `(tagged, sentences)`: `tagged` is the reader's
        tagged-sentence view (also printed to stdout) and `sentences` is a
        list holding, per sentence, a one-element list with the detokenized
        sentence string
    '''
    root = "data/"
    ccorpus = ConllCorpusReader(root, ".conll", ('words', 'pos', 'tree'))

    # One detokenizer serves every sentence; there is no need to rebuild it
    # on each iteration.
    detokenizer = TreebankWordDetokenizer()
    sentences = [[detokenizer.detokenize(sent)] for sent in ccorpus.sents(corpus)]

    tagged = ccorpus.tagged_sents(corpus)
    print(tagged)
    return tagged, sentences
def test_output(y_pred, args, LABELS, data_fname):
    """Merge predicted labels back into the tagged sentences of a CoNLL file.

    For every sentence, ``return_tag_position`` supplies index pairs; for each
    pair, the tags at positions ``pair[0] .. pair[1] - 1`` are rewritten as the
    first two characters of the original tag followed by
    ``LABELS[y_pred[k]]``, where ``k`` counts the pairs seen so far across the
    whole file.

    Args:
        y_pred: Indexable of label indices, consumed in order.
        args: Object whose ``data`` attribute is the corpus root directory.
        LABELS: Indexable mapping a label index to its label string.
        data_fname: File id of the CoNLL file to read (also printed).

    Returns:
        A list with one entry per sentence, each a list of
        ``(token, original_tag, predicted_tag)`` tuples.
    """
    reader = ConllCorpusReader(root=args.data,
                               fileids='.conll',
                               columntypes=('words', 'pos', 'ne', 'chunk'))
    print(data_fname)

    pred_conll = []
    span_idx = 0
    for sentence in reader.tagged_sents(data_fname):
        spans = return_tag_position(sentence)
        tokens, gold_tags = zip(*sentence)
        predicted = deepcopy(list(gold_tags))
        for span in spans:
            for pos in range(span[0], span[1]):
                predicted[pos] = gold_tags[pos][:2] + LABELS[y_pred[span_idx]]
            # NOTE(review): one prediction is consumed per span, not per
            # token - confirm against the code that produces y_pred.
            span_idx += 1
        pred_conll.append(list(zip(tokens, gold_tags, predicted)))
    return pred_conll
def __init__(self, directory=None, replace_rare_tokens=True, **kwargs):
    """Initialise the containers; touch the corpus only if a directory is given.

    Args:
        directory: Path to the corpus directory, or None to defer loading.
            When given, ``self.directory`` is replaced by the result of
            ``data_utils.get_filepaths(directory)`` and a CoNLL reader is
            stored in ``self.conll_parser``.
        replace_rare_tokens: Stored as-is on ``self.replace_rare_tokens``.
        **kwargs: Every extra keyword is set as an attribute of the same name
            (after the defaults, so it can override them).
    """
    self.directory = directory
    # the corpus is not loaded unless a directory was passed at construction
    if directory is not None:
        self.directory = data_utils.get_filepaths(directory)
        self.conll_parser = ConllCorpusReader(directory, '.conll', ('words', 'pos'))

    self.replace_rare_tokens = replace_rare_tokens

    partitions = ('train', 'valid', 'test')
    # word, character and tag sequences of the dataset, one slot per partition
    self.type_seq = dict.fromkeys(partitions)
    # word / character / tag type -> unique integer ID
    self.type_to_idx = dict.fromkeys(('word', 'char', 'tag'))
    # unique integer ID -> tag type
    self.idx_to_tag = None
    # like type_seq, but with everything mapped to integer IDs
    self.idx_seq = dict.fromkeys(partitions)

    for name in kwargs:
        setattr(self, name, kwargs[name])
def load_conll03(files=("eng.train", "eng.testa", "eng.testb"), max_len=200):
    """Load words, POS tags, chunk tags and NE tags from ``data/CoNLL2003/``.

    Args:
        files: File ids to read, relative to ``data/CoNLL2003/``. Any sequence
            of names is accepted (the default is an immutable tuple).
        max_len: Sentences longer than this many tokens are skipped.

    Returns:
        Four parallel lists ``(words, poses, chunkes, nes)`` with one inner
        list per kept sentence.
    """
    start = time()
    file_ids = list(files)

    # `iob_sents` yields triples built from the columns labelled words, pos
    # and chunk. Two readers with different labels for columns three and four
    # are used so that each of those columns can be read as the "chunk"
    # element in turn.
    chunk_reader = ConllCorpusReader(
        "data/CoNLL2003/", file_ids, ["words", "pos", "chunk", "ne"])
    ne_reader = ConllCorpusReader(
        "data/CoNLL2003/", file_ids, ["words", "pos", "ne", "chunk"])

    words = []
    poses = []
    chunkes = []
    nes = []
    # Both readers parse the same files, so their sentences line up pairwise.
    for chunk_sent, ne_sent in zip(chunk_reader.iob_sents(), ne_reader.iob_sents()):
        if chunk_sent and len(chunk_sent) <= max_len:
            w, pos, chunk = zip(*chunk_sent)
            _, _, ne = zip(*ne_sent)
            words.append(list(w))
            poses.append(list(pos))
            chunkes.append(list(chunk))
            nes.append(list(ne))

    print("Loaded CoNLL03 in %s seconds" % (time() - start))
    return words, poses, chunkes, nes
def load_conll00(files=("train.txt", "test.txt"), max_len=200):
    """Load words, POS tags and chunk tags from ``data/CoNLL2000/``.

    Args:
        files: File ids to read, relative to ``data/CoNLL2000/``. The default
            names the two files ``train.txt`` and ``test.txt`` separately
            (previously they were fused into the single string
            ``"train.txt, test.txt"``, which names no file).
        max_len: Sentences longer than this many tokens are skipped.

    Returns:
        Three parallel lists ``(words, poses, chunkes)`` with one inner list
        per kept sentence.
    """
    start = time()
    columntypes = ["words", "pos", "chunk"]
    conll_reader = ConllCorpusReader("data/CoNLL2000/", list(files), columntypes)

    words = []
    poses = []
    chunkes = []
    for sent in conll_reader.iob_sents():
        if sent and len(sent) <= max_len:
            w, pos, chunk = zip(*sent)
            words.append(list(w))
            poses.append(list(pos))
            chunkes.append(list(chunk))

    print("Loaded CoNLL00 in %s seconds" % (time() - start))
    return words, poses, chunkes
class Dataset(object):
    """A class for handling datasets.

    Expects datasets to be in tab-separated CoNLL format, where each line contains a token and its
    tag (separated by a tab) and each sentence is separated by a blank line.

    Example corpus:
    '''
    The O
    transcription O
    of O
    most O
    RP B-PRGE
    genes I-PRGE
    ...
    '''

    Args:
        directory (str): Path to directory containing CoNLL formatted dataset(s).
        replace_rare_tokens (bool): True if rare tokens should be replaced with a special unknown
            token. Threshold for considering tokens rare can be found at `saber.constants.NUM_RARE`.
    """
    def __init__(self, directory=None, replace_rare_tokens=True, **kwargs):
        self.directory = directory
        # don't load corpus unless directory was passed on object construction
        if self.directory is not None:
            # from here on `self.directory` is iterated with `.items()` as
            # (partition, filepath) pairs, not used as the raw directory path
            self.directory = data_utils.get_filepaths(directory)
            self.conll_parser = ConllCorpusReader(directory, '.conll', ('words', 'pos'))

        self.replace_rare_tokens = replace_rare_tokens

        # word, character and tag sequences from dataset (per partition)
        self.type_seq = {'train': None, 'valid': None, 'test': None}
        # mappings of word, characters, and tag types to unique integer IDs
        self.type_to_idx = {'word': None, 'char': None, 'tag': None}
        # reverse mapping of unique integer IDs to tag types
        self.idx_to_tag = None
        # same as type_seq but all words, characters and tags have been mapped to unique integer IDs
        self.idx_seq = {'train': None, 'valid': None, 'test': None}

        # extra keyword arguments become attributes and may override the defaults above
        for key, value in kwargs.items():
            setattr(self, key, value)

    def load(self):
        """Coordinates the loading of a given data set at `self.directory`.

        For a given dataset in CoNLL format at `self.directory`, coordinates the loading of data and
        updates the appropriate instance attributes. Expects `self.directory` to be a directory
        containing a single file, `train.*` and optionally two additional files, `valid.*` and
        `test.*`.

        Raises:
            ValueError if `self.directory` is None.
        """
        if self.directory is None:
            err_msg = "`Dataset.directory` is None; must be provided before call to `Dataset.load`"
            LOGGER.error('ValueError %s', err_msg)
            raise ValueError(err_msg)

        # unique words, chars and tags from CoNLL formatted dataset
        types = self._get_types()
        # map each word, char, and tag type to a unique integer
        self._get_idx_maps(types)

        # get word, char, and tag sequences from CoNLL formatted dataset
        self._get_type_seq()
        # get final representation used for training
        self.get_idx_seq()

        # useful during prediction / annotation
        self.idx_to_tag = generic_utils.reverse_dict(self.type_to_idx['tag'])

    def _get_types(self):
        """Collects the sets of all words, characters and tags in a CoNLL formatted dataset.

        For the CoNLL formatted dataset given at `self.directory`, collects all words (word types),
        characters (character types) and tags (tag types). All types are shared across all
        partitions, that is, word, char and tag types are collected from the train and, if
        provided, valid/test partitions found at `self.directory/train.*`,
        `self.directory/valid.*` and `self.directory/test.*`.

        Returns:
            A dict with keys 'word', 'char' and 'tag', each mapping to a list of unique types. The
            lists also contain `constants.PAD` (all three) and `constants.UNK` (word and char).
            Nothing is stored on the instance.
        """
        types = {
            'word': [constants.PAD, constants.UNK],
            'char': [constants.PAD, constants.UNK],
            'tag': [constants.PAD],
        }

        for filepath in self.directory.values():
            if filepath is None:
                continue
            conll_file = os.path.basename(filepath)  # get name of conll file
            # read the words once and derive both word and character types from them
            words = list(self.conll_parser.words(conll_file))
            types['word'].extend(set(words))
            types['char'].extend(set(chain.from_iterable(words)))
            # the tag is the last element of every tagged word
            types['tag'].extend(
                {tagged[-1] for tagged in self.conll_parser.tagged_words(conll_file)})

        # ensure that we have only unique types
        types['word'] = list(set(types['word']))
        types['char'] = list(set(types['char']))
        types['tag'] = list(set(types['tag']))

        return types

    def _get_type_seq(self):
        """Loads sequence data from a CoNLL format data set given at `self.directory`.

        For the CoNLL formatted dataset given at `self.directory`, updates `self.type_seq` with
        lists containing the word, character and tag sequences for the train and, if provided,
        valid/test partitions found at `self.directory/train.*`, `self.directory/valid.*` and
        `self.directory/test.*`.

        The word sequence is passed through `Preprocessor.replace_rare_tokens` when
        `self.replace_rare_tokens` is true; the character sequence is always built from the
        unmodified sentences.
        """
        for partition, filepath in self.directory.items():
            if filepath is None:
                continue
            conll_file = os.path.basename(filepath)  # get name of conll file

            # collect sequence data
            sents = list(self.conll_parser.sents(conll_file))
            tagged_sents = list(self.conll_parser.tagged_sents(conll_file))

            if self.replace_rare_tokens:
                word_seq = Preprocessor.replace_rare_tokens(sents)
            else:
                word_seq = sents
            char_seq = [[list(word) for word in sent] for sent in sents]
            tag_seq = [[tagged[-1] for tagged in sent] for sent in tagged_sents]

            # update the class attributes
            self.type_seq[partition] = {'word': word_seq, 'char': char_seq, 'tag': tag_seq}

    def _get_idx_maps(self, types, initial_mapping=None):
        """Updates `self.type_to_idx` with mappings from word, char and tag types to unique int IDs.

        Args:
            types (dict): Dict with keys 'word', 'char' and 'tag' mapping to lists of types, as
                returned by `_get_types`.
            initial_mapping (dict): Dict with (at least) the keys 'word' and 'tag' used to seed
                the mappings. Defaults to `constants.INITIAL_MAPPING`.
        """
        initial_mapping = constants.INITIAL_MAPPING if initial_mapping is None else initial_mapping
        # generate type to index mappings
        self.type_to_idx['word'] = Preprocessor.type_to_idx(types['word'], initial_mapping['word'])
        # NOTE(review): the char map is seeded with the 'word' initial mapping, not a 'char' one;
        # presumably intentional (both lists start with PAD and UNK) - confirm.
        self.type_to_idx['char'] = Preprocessor.type_to_idx(types['char'], initial_mapping['word'])
        self.type_to_idx['tag'] = Preprocessor.type_to_idx(types['tag'], initial_mapping['tag'])

    def get_idx_seq(self):
        """Updates `self.idx_seq` with the final representation of the data used for training.

        Updates `self.idx_seq` by using `self.type_to_idx` to map all elements in `self.type_seq`
        to their corresponding integer IDs, for the train and, if provided, valid/test partitions
        found at `self.directory/train.*`, `self.directory/valid.*` and `self.directory/test.*`.
        The tag sequences are additionally passed through `to_categorical`.
        """
        for partition, filepath in self.directory.items():
            if filepath is None:
                continue
            type_seq = self.type_seq[partition]
            self.idx_seq[partition] = {
                'word': Preprocessor.get_type_idx_sequence(
                    type_seq['word'], self.type_to_idx['word'], type_='word'),
                # NOTE(review): the char IDs are derived from the *word* sequence with
                # type_='char'; `type_seq['char']` is not read here - confirm this is intended.
                'char': Preprocessor.get_type_idx_sequence(
                    type_seq['word'], self.type_to_idx['char'], type_='char'),
                'tag': Preprocessor.get_type_idx_sequence(
                    type_seq['tag'], self.type_to_idx['tag'], type_='tag'),
            }
            # one-hot encode our targets
            self.idx_seq[partition]['tag'] = to_categorical(self.idx_seq[partition]['tag'])
"""
Script to append end token to every sentence in eval set in order to test properly.
New file is created.
"""
from nltk.corpus.reader.conll import ConllCorpusReader

END_TOKEN = "<END>"

corpus = ConllCorpusReader("data", ".tt", ["words", "pos"])

# Copy each sentence and add the end marker as a final (word, tag) pair; the
# objects handed out by the reader are left untouched.
result = []
for sent in corpus.tagged_sents("de-eval.tt"):
    result.append(list(sent) + [(END_TOKEN, END_TOKEN)])

try:
    # The output is written as UTF-8 explicitly instead of depending on the
    # platform's default encoding.
    with open("./data/de-eval_end.tt", 'w', encoding="utf-8") as conll_file:
        for sent in result:
            conll_file.writelines("\t".join(pair) + '\n' for pair in sent)
            # a blank line separates sentences
            conll_file.write('\n')
except FileNotFoundError:
    print("Not able to open the file for test writing!")
def features(self, dict):
    # Build the feature mapping for one question row.
    #
    # `dict` is read with the keys 'QANTA Scores', 'IR_Wiki Scores',
    # 'Sentence Position', 'Question Text' and 'Question ID'.
    # Returns a defaultdict(int) of feature name -> value.
    # NOTE(review): the parameter name shadows the builtin `dict`; it is part
    # of the signature, so it is left as is.
    d = defaultdict(int)
    """ for ii in kTOKENIZER.tokenize(dict['Question Text']): d[morphy_stem(ii)] += 1 """
    # presumably form_dict turns each score field into a {guess: score}
    # mapping - confirm against form_dict
    qd = form_dict(dict['QANTA Scores'])
    wd = form_dict(dict['IR_Wiki Scores'])
    sp = int(dict['Sentence Position'])
    qtext = dict['Question Text']
    list_words = []
    # (key, value) pairs ordered by value, largest first
    sorted_qd = sorted(qd.items(), key=operator.itemgetter(1), reverse=True)
    sorted_wd = sorted(wd.items(), key=operator.itemgetter(1), reverse=True)
    overlap = 0
    # Never changed below (the code that set it is disabled in the string that
    # follows), so the `consider_qanta != 0` breaks never fire.
    consider_qanta = 0
    """ if sorted_qd[0][0] == sorted_wd[0][0] and sorted_qd[0][1] > 0.5: consider_qanta = 1 """
    # Stemmed question tokens, minus English stopwords and tokens whose first
    # character is in PUNC.
    # NOTE(review): stopwords.words('english') is evaluated for every token.
    for w in kTOKENIZER.tokenize(qtext):
        if consider_qanta != 0:
            break
        if morphy_stem(w) not in stopwords.words('english') and w[0] not in PUNC:
            list_words.append(morphy_stem(w))
    #bigrams_qtext = list(bigrams(list_words))
    trigrams_qtext = list(trigrams(list_words))
    highest = 0
    rnk = 0
    # Look at the ten highest-scored entries of sorted_qd.
    # NOTE(review): sorted_qd[i] raises IndexError when there are fewer than
    # ten entries.
    for i in xrange(10):
        if consider_qanta != 0:
            break
        # path: ./wikipedia/<first character of the key>/<key>
        fid = "./wikipedia/"+sorted_qd[i][0][0]+"/"+sorted_qd[i][0]
        v = 0  # number of trigram matches found for entry i
        if os.path.exists(fid):
            fids = [fid]
            cr = ConllCorpusReader("", fids, COLUMN_TYPES)
            c = 0  # sentences processed so far; only the first four are used
            for r in cr.iob_sents():
                if c > 3:
                    break
                wiki_words = []
                for j in xrange(len(r)):
                    # element 1 of every triple is lower-cased and stemmed
                    w = morphy_stem(r[j][1].lower())
                    if w not in stopwords.words('english') and w[0] not in PUNC:
                        wiki_words.append(w)
                # Slide a three-word window over the sentence. The feature key
                # says "Bigrams" although the windows are trigrams.
                for j in xrange(len(wiki_words)-2):
                    bgrm = (wiki_words[j],wiki_words[j+1],wiki_words[j+2])
                    if bgrm in trigrams_qtext:
                        ky = "Bigrams"+str(i)
                        d[ky] += TOPGUESSES - i
                        v += 1
                        print "QID:",dict['Question ID'],"SP=",sp,"\t", rnk,v, bgrm[0],bgrm[1],bgrm[2]
                c += 1
        # remember the entry index with the most matches so far
        if v > highest:
            highest = v
            rnk = i
    d['bigrams'] = rnk
    """ highest = 0 rnk = 0 qwords = kTOKENIZER.tokenize(qtext) for i in xrange(10): fid = "./wikipedia/"+sorted_qd[i][0][0]+"/"+sorted_qd[i][0] if os.path.exists(fid): fids = [fid] cr = ConllCorpusReader("", fids, COLUMN_TYPES) c = 0 v = 0 for r in cr.iob_sents(): if c > 4: break for j in xrange(len(r)): if r[j][1].lower() not in stopwords.words('english') and r[j][1][0] not in PUNC and r[j][1].lower() in qwords: v += 1 c += 1 if v > 
highest: highest = v rnk = i d['unigram'] = rnk """
    # Position in sorted_qd of the top entry of sorted_wd, if it occurs there.
    for qg in xrange(len(sorted_qd)):
        if sorted_qd[qg][0] == sorted_wd[0][0]:
            overlap = 1
            d['Top IR Overlap'] += qg
            """ if (sorted_qd[0][1] - sorted_qd[1][1]) < 0.15: d['Top IR Overlap'] += qg else: d['Top IR Overlap'] -= qg """
            break
    if overlap == 0:
        if sorted_qd[0][1] < 0.05 and sorted_wd[0][1] > 4.0:
            d['Top IR'] += 1
        # NOTE(review): this writes 'Top_IR' (underscore), a different key
        # from 'Top IR' above - confirm that two separate features are meant.
        if sp == 0 and sorted_qd[0][1] < 0.1:
            d['Top_IR'] += int(sorted_wd[0][1])
    # Reward ranks at which both lists name the same key.
    # NOTE(review): i is checked against len(sorted_qd) only; sorted_wd[i] is
    # indexed without a bound check.
    for i in xrange(4*sp):
        if i < len(sorted_qd) and sorted_qd[i][0] == sorted_wd[i][0]:
            d['Equal Rank'] += i + 2
    # Bucket the highest value in sorted_qd.
    if sorted_qd[0][1] > 0.75:
        d['Q Score'] = 0
    elif sorted_qd[0][1] > 0.39:
        d['Q Score'] = 1
    elif sorted_qd[0][1] > 0.24:
        d['Q Score'] = 2
    else:
        d['Q Score'] = -1
    if sorted_wd[0][1] > 20.0 and sorted_qd[0][1] < 0.25:
        d['IR Score'] = TOPGUESSES
    d['Sentence Position'] = sp
    return d