def for_all_in_ptb_scts(scts, call):
    for sct in scts:
        print("Section " + sct)
        fs = [f for f in ptb.fileids() if f.startswith("WSJ/" + sct)]
        for f in fs:
            print(" File " + f + "...", end="")
            call(f)
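A minimal usage sketch for the helper above; the section labels and the count_sents callback are illustrative assumptions, not part of the original:

from nltk.corpus import ptb

def count_sents(fileid):
    # Assumed callback: any one-argument callable over a fileid works.
    print(len(ptb.sents(fileid)))

for_all_in_ptb_scts(["00", "01"], count_sents)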
def get_fileids(min_section, max_section):
    for fileid in ptb.fileids():
        corpus, section, filename = fileid.split('/')
        if corpus == 'WSJ':
            section = int(section)
            if min_section <= section <= max_section:
                yield fileid
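The conventional WSJ training split (sections 02-21) falls out directly; a small sketch assuming the generator above is in scope:

train_files = list(get_fileids(2, 21))
print(len(train_files), train_files[:2])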
def test_news_fileids(self):
    self.assertEqual(
        ptb.fileids('news')[:3],
        [
            'WSJ/00/WSJ_0001.MRG',
            'WSJ/00/WSJ_0002.MRG',
            'WSJ/00/WSJ_0003.MRG',
        ],
    )
from nltk.tag.mapping import _UNIVERSAL_TAGS

# Assumed helper: map each universal tag to a position in the count vector.
tag_to_index = {tag: i for i, tag in enumerate(_UNIVERSAL_TAGS)}

def get_word_to_posvec():
    word_to_posvec = {}
    for fileid in ptb.fileids('news'):
        for (word, tag) in ptb.tagged_words(fileid, tagset='universal'):
            if word not in word_to_posvec:
                word_to_posvec[word] = [0] * len(_UNIVERSAL_TAGS)
            word_to_posvec[word][tag_to_index[tag]] += 1
    return word_to_posvec
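A quick way to inspect the result; each entry counts how often a word appears under each of the 12 universal tags:

word_to_posvec = get_word_to_posvec()
print(word_to_posvec.get('the'))  # e.g. a vector dominated by the DET slot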
def test_fileids(self):
    self.assertEqual(
        ptb.fileids()[:4],
        [
            'BROWN/CF/CF01.MRG',
            'BROWN/CF/CF02.MRG',
            'BROWN/CF/CF03.MRG',
            'BROWN/CF/CF04.MRG',
        ],
    )
def get_ptb_data(w2id):
    all_words = []
    for item in ptb.fileids():
        all_words.extend(list(map(str.lower, ptb.words(item))))
    all_words_id = []
    for w in all_words:
        word_id = w2id.get(w)
        if word_id is None:
            word_id = w2id.get("<unk>")
        all_words_id.append(word_id)
    return all_words_id
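A sketch of a vocabulary to feed in; the construction is hypothetical, but the "<unk>" key matches the fallback used above:

vocab = {"<unk>": 0}
for w in ptb.words():
    vocab.setdefault(w.lower(), len(vocab))
ids = get_ptb_data(vocab)
print(len(ids), ids[:10])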
def get_batches(scts):
    batch_xs, batch_ys = empty_batch()
    ex_cnt = 0
    if OOV_ONLY:
        iv_set = set()

        def add_iv(f):
            for sent in ptb.sents(f):
                for tok in sent:
                    iv_set.add(tok)

        common.for_all_in_ptb_scts(TRAIN_SCTS, add_iv)
    for sct in scts:
        print("Section " + sct)
        fs = [f for f in ptb.fileids() if f.startswith("WSJ/" + sct)]
        for f in fs:
            print(" File " + f + "...", end="")
            # For each word in the sentences of the file,
            # create an example and add it to the batch.
            for sent in ptb.tagged_sents(f):
                for i in range(len(sent)):
                    # Ignore "-NONE-" tags (not overt linguistic elements).
                    if sent[i][1] == "-NONE-":
                        continue
                    # In OOV-only mode, skip in-vocabulary tokens.
                    if OOV_ONLY and sent[i][0] in iv_set:
                        continue
                    x, y = get_example(sent, i)
                    batch_xs[ex_cnt] = x
                    batch_ys[ex_cnt] = y
                    # Once we have enough examples to form a batch,
                    # yield it now, then start a new batch.
                    ex_cnt += 1
                    if ex_cnt == BATCH_SIZE:
                        yield (batch_xs, batch_ys)
                        batch_xs, batch_ys = empty_batch()
                        ex_cnt = 0
    # If an incomplete batch remains at the end, pad it with empty
    # examples and yield it.
    if ex_cnt != 0:
        while ex_cnt < BATCH_SIZE:
            x, y = empty_example()
            batch_xs[ex_cnt] = x
            batch_ys[ex_cnt] = y
            ex_cnt += 1
        yield (batch_xs, batch_ys)
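The generator leans on module-level settings (BATCH_SIZE, OOV_ONLY, TRAIN_SCTS) and helpers (empty_batch, empty_example, get_example) defined elsewhere in its source module; a hypothetical consumer looks like:

for batch_xs, batch_ys in get_batches(["02", "03"]):
    train_step(batch_xs, batch_ys)  # assumed per-batch training function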
def load_wsj_pos_tagging():
    """
    Returns a full list representing the WSJ PTB2 dataset for POS-tagging.
    Each item is a sentence built of (word, tag) tuples, one per token.
    """
    dataset = HilbertDataset('wsj-pos', is_unsupervised=False)
    files = list(ptb.fileids())
    test_dirs = ['22', '23', '24']
    train_d, test_d = [], []
    for f in files:
        data_list = test_d if f[4:6] in test_dirs else train_d
        data_list += [[(w.lower(), t) for w, t in s] for s in ptb.tagged_sents(f)]
    dataset.add_train(train_d)
    dataset.add_test(test_d)
    return dataset
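One detail worth checking: f[4:6] picks the two-digit section out of fileids shaped like 'WSJ/22/WSJ_2200.MRG', which is what routes sections 22-24 to the test split:

fileid = 'WSJ/22/WSJ_2200.MRG'
assert fileid[4:6] == '22'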
def verify_ptb_install():
    # Download PTB metadata
    assert nltk.download('ptb')
    import hashlib
    from nltk.corpus import ptb
    # Be sure we have the category list
    assert 'news' in ptb.categories()
    m = hashlib.md5()  # NOT SECURE! (fine here: only a content fingerprint)
    m.update(','.join(ptb.fileids()).encode('utf8'))
    if m.hexdigest() == 'e3b49c6df5529560b2945e6a4715f9b0':
        print('Penn Treebank successfully installed!')
        return True
    else:
        print('Error installing Penn Treebank (hash mismatch).')
        print('It may still work - try loading it in NLTK.')
        return False
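A hypothetical call site; the corpus location mentioned in the message is the usual nltk_data layout, not something the function checks itself:

if not verify_ptb_install():
    raise SystemExit('PTB missing; place the LDC treebank under nltk_data/corpora/ptb')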
def __init__(self, dict_path):
    """Initialization.

    Args:
        dict_path: path to dictionary folder

    Raises:
        Exception: missing dictionary
    """
    dict_file_name = os.path.join(dict_path, 'dict.pkl')
    if os.path.exists(dict_file_name):
        self.dictionary = pickle.load(open(dict_file_name, 'rb'))
    else:
        raise Exception('missing dictionary: %s' % dict_file_name)
    all_file_ids = ptb.fileids()
    train_file_ids = []
    valid_file_ids = []
    test_file_ids = []
    rest_file_ids = []
    for file_id in all_file_ids:
        # Standard WSJ split: sections 02-21 train, 22 valid, 23 test;
        # sections 00-01 and 24 are held out as "rest".
        if 'WSJ/00/WSJ_0200.MRG' <= file_id <= 'WSJ/21/WSJ_2199.MRG':
            train_file_ids.append(file_id)
        if 'WSJ/22/WSJ_2200.MRG' <= file_id <= 'WSJ/22/WSJ_2299.MRG':
            valid_file_ids.append(file_id)
        if 'WSJ/23/WSJ_2300.MRG' <= file_id <= 'WSJ/23/WSJ_2399.MRG':
            test_file_ids.append(file_id)
        elif ('WSJ/00/WSJ_0000.MRG' <= file_id <= 'WSJ/01/WSJ_0199.MRG') or \
                ('WSJ/24/WSJ_2400.MRG' <= file_id <= 'WSJ/24/WSJ_2499.MRG'):
            rest_file_ids.append(file_id)
    self.train, self.train_sens, self.train_trees, self.train_nltktrees \
        = self.tokenize(train_file_ids)
    self.valid, self.valid_sens, self.valid_trees, self.valid_nltktrees \
        = self.tokenize(valid_file_ids)
    self.test, self.test_sens, self.test_trees, self.test_nltktrees \
        = self.tokenize(test_file_ids)
    self.rest, self.rest_sens, self.rest_trees, self.rest_nltktrees \
        = self.tokenize(rest_file_ids)
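Hypothetical construction; the enclosing class name and data path are assumptions, since the snippet shows only __init__:

corpus = Corpus('./data/ptb_dict')  # assumed class name and dictionary path
print(len(corpus.train_sens), len(corpus.valid_sens), len(corpus.test_sens))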
def get_raw_data():
    raw_data = {}
    fileids = ptb.fileids()
    obj_sofar = 0
    for fileid in fileids:
        corpus, section, _ = fileid.split('/')
        if corpus.lower() != 'wsj':
            continue
        section = int(section)
        if 2 <= section <= 21:
            split = 'train'
        elif section == 22:
            split = 'valid'
        elif section == 23:
            split = 'test'
        else:
            split = None
        sent_sofar = 0
        for y in ptb.parsed_sents(fileid):
            words, part_of_speech = zip(*y.pos())
            constituency_parse = tree_to_tuple(y)
            obj = collections.OrderedDict()
            obj['example_id'] = 'ptb{}'.format(obj_sofar)
            obj['file_id'] = fileid
            obj['sent_id'] = sent_sofar
            obj['words'] = words
            obj['part_of_speech'] = part_of_speech
            obj['constituency_parse'] = constituency_parse
            sent_sofar += 1
            obj_sofar += 1
            raw_data.setdefault('all', []).append(obj)
            if split is not None:
                raw_data.setdefault(split, []).append(obj)
    return raw_data
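A sketch of consuming the splits (tree_to_tuple comes from elsewhere in the source module):

raw = get_raw_data()
print({k: len(v) for k, v in raw.items()})  # sizes of all/train/valid/test
first = raw['train'][0]
print(first['example_id'], first['words'][:5])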
@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):

    def test_fileids(self):
        self.assertEqual(
            ptb.fileids()[:4],
            [
                'BROWN/CF/CF01.MRG',
                'BROWN/CF/CF02.MRG',
                'BROWN/CF/CF03.MRG',
                'BROWN/CF/CF04.MRG',
            ],
        )

    def test_words(self):
        self.assertEqual(
            ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
            ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'],
        )

    def test_tagged_words(self):
        self.assertEqual(
            ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3],
            [('A', 'DT'), ('form', 'NN'), ('of', 'IN')],
        )
    '.', ',', ':', '-LRB-', '-RRB-', "''", '``', '--', ';', '-', '?', '!',
    '...', '-LCB-', '-RCB-'
]
P = re.compile(r"[-+]?\d*\.\d+|[-+]\d+|[-+]?\d*\,\d+|\d+|\d+:\d+")
PY = re.compile(
    r"\d+[%]?-[a-zA-Z]|\d+[%]?[a-zA-Z]"
    r"|[-+]?\d*\.\d+[%]?-[a-zA-Z]|[-+]?\d*\.\d+[%]?[a-zA-Z]"
    r"|[-+]?\d*\,\d+-[%]?[a-zA-Z]|[-+]?\d*\,\d+[%]?[a-zA-Z]"
)
PA = re.compile(r"\d+-\d+-\d+")
PB = re.compile(r"\d+\\\/\d+-[A-Za-z]|\d+\\\/\d+[A-Za-z]")
L = [
    "a310-300s", "747-100s", "747-400s", "45,000-$60,000", "767-300er",
    "747-400s"
]
file_ids = ptb.fileids()
train_file_ids = []
valid_file_ids = []
test_file_ids = []
rest_file_ids = []
train_lm_file_ids = []
for file_id in file_ids:
    if 'WSJ/00/WSJ_0000.MRG' <= file_id <= 'WSJ/24/WSJ_2499.MRG':
        train_file_ids.append(file_id)
    if 'WSJ/00/WSJ_0000.MRG' <= file_id <= 'WSJ/20/WSJ_2099.MRG':
        train_lm_file_ids.append(file_id)
    if 'WSJ/22/WSJ_2200.MRG' <= file_id <= 'WSJ/22/WSJ_2299.MRG':
        valid_file_ids.append(file_id)
    if 'WSJ/23/WSJ_2300.MRG' <= file_id <= 'WSJ/23/WSJ_2399.MRG':
        test_file_ids.append(file_id)
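For orientation, the first pattern P recognizes bare numbers in several shapes; a quick illustrative check with made-up tokens:

for tok in ['3.14', '-42', '1,000', '12:30', 'abc']:
    print(tok, bool(P.match(tok)))  # True for the numeric shapes, False for 'abc'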
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import treebank

print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])

from nltk.corpus import ptb

print(ptb.fileids())  # doctest: +SKIP