Example #1
def for_all_in_ptb_scts(scts, call):
    # Apply `call` to every WSJ file in the given sections.
    for sct in scts:
        print("Section " + sct)
        fs = [f for f in ptb.fileids() if f.startswith("WSJ/" + sct)]
        for f in fs:
            print("  File " + f + "...", end="")
            call(f)
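A minimal usage sketch (the section list and the no-op callback are hypothetical; WSJ section names are zero-padded two-digit strings):

for_all_in_ptb_scts(['00', '01'], lambda f: None)  # visit every file in sections 00-01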
Example #2
File: ptb.py Project: danieldeutsch/gcd
def get_fileids(min_section, max_section):
    for fileid in ptb.fileids():
        corpus, section, filename = fileid.split('/')
        if corpus == 'WSJ':
            section = int(section)
            if min_section <= section <= max_section:
                yield fileid
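A usage sketch, assuming the conventional WSJ split in which sections 02-21 form the training set:

train_ids = list(get_fileids(2, 21))  # e.g. 'WSJ/02/WSJ_0200.MRG', ...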
Example #4
def test_news_fileids(self):
    self.assertEqual(
        ptb.fileids('news')[:3],
        [
            'WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG',
            'WSJ/00/WSJ_0003.MRG'
        ],
    )
Example #5
def test_news_fileids(self):
    self.assertEqual(
        ptb.fileids("news")[:3],
        [
            "WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG",
            "WSJ/00/WSJ_0003.MRG"
        ],
    )
Example #6
def get_word_to_posvec():
    word_to_posvec = {}
    for fileid in ptb.fileids('news'):
        for (word, tag) in ptb.tagged_words(fileid, tagset='universal'):
            if word not in word_to_posvec:
                word_to_posvec[word] = [0] * len(_UNIVERSAL_TAGS)
            word_to_posvec[word][tag_to_index[tag]] += 1
    return word_to_posvec
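The snippet assumes two module-level globals that are not shown. A plausible reconstruction (the definitions below are assumptions based on NLTK's 12-tag universal tagset, not the project's actual code):

_UNIVERSAL_TAGS = ['VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ',
                   'DET', 'NUM', 'PRT', 'X', '.']  # assumed: NLTK's universal tagset
tag_to_index = {tag: i for i, tag in enumerate(_UNIVERSAL_TAGS)}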
Example #7
def test_fileids(self):
    self.assertEqual(
        ptb.fileids()[:4],
        [
            'BROWN/CF/CF01.MRG',
            'BROWN/CF/CF02.MRG',
            'BROWN/CF/CF03.MRG',
            'BROWN/CF/CF04.MRG',
        ],
    )
Example #8
def get_ptb_data(w2id):
    all_words = []
    for item in ptb.fileids():
        all_words.extend(list(map(str.lower, ptb.words(item))))
    # print(all_words)
    all_words_id = []
    for w in all_words:
        wid = w2id.get(w)
        if wid is None:
            wid = w2id.get("<unk>")
        all_words_id.append(wid)
    return all_words_id
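A usage sketch (the vocabulary below is hypothetical; w2id must map tokens to integer ids and contain an "<unk>" entry, as the fallback lookup above requires):

w2id = {"<unk>": 0, "the": 1, "company": 2}  # hypothetical vocabulary
token_ids = get_ptb_data(w2id)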
Example #9
def get_batches(scts):
    batch_xs, batch_ys = empty_batch()
    ex_cnt = 0

    if OOV_ONLY:
        # Collect the in-vocabulary token set from the training sections.
        iv_set = set()
        def add_iv(f):
            for sent in ptb.sents(f):
                for tok in sent:
                    iv_set.add(tok)
        common.for_all_in_ptb_scts(TRAIN_SCTS, add_iv)

    for sct in scts:
        print("Section " + sct)
        fs = [f for f in ptb.fileids() if f.startswith("WSJ/" + sct)]
        for f in fs:
            print("  File " + f + "...", end="")
            # For each word in the sentences of the file,
            # create an example and add it to the batch.
            for sent in ptb.tagged_sents(f):
                for i in range(len(sent)):
                    # Ignore "-NONE-" tags (not overt linguistic elements).
                    if sent[i][1] == "-NONE-":
                        continue

                    # If we're in OOV mode, skip known tokens.
                    if OOV_ONLY and sent[i][0] in iv_set:
                        continue

                    x, y = get_example(sent, i)
                    batch_xs[ex_cnt] = x
                    batch_ys[ex_cnt] = y

                    # If we reach enough examples to form a batch, yield it now,
                    # then start a new batch.
                    ex_cnt += 1
                    if ex_cnt == BATCH_SIZE:
                        yield (batch_xs, batch_ys)
                        batch_xs, batch_ys = empty_batch()
                        ex_cnt = 0

    # If we have an incomplete batch at the end, pad it with empty examples
    # and yield it. (No explicit StopIteration: under PEP 479 raising it
    # inside a generator becomes a RuntimeError; the generator just returns.)
    if ex_cnt != 0:
        while ex_cnt < BATCH_SIZE:
            x, y = empty_example()
            batch_xs[ex_cnt] = x
            batch_ys[ex_cnt] = y
            ex_cnt += 1
        yield (batch_xs, batch_ys)
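A usage sketch for the generator, assuming TRAIN_SCTS, BATCH_SIZE, and the helper functions are configured as elsewhere in the file:

for batch_xs, batch_ys in get_batches(TRAIN_SCTS):
    run_training_step(batch_xs, batch_ys)  # run_training_step is hypothetical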
Example #10
def load_wsj_pos_tagging():
    """
    Returns a full list representing the WSJ PTB2 dataset for POS-tagging.
    Each item is a sentence built of (word, tag) tuples for each token.
    """
    dataset = HilbertDataset('wsj-pos', is_unsupervised=False)
    files = list(ptb.fileids())
    test_dirs = ['22', '23', '24']
    train_d, test_d = [], []
    for f in files:
        data_list = test_d if f[4:6] in test_dirs else train_d
        data_list += [[(w.lower(), t) for w, t in s]
                       for s in ptb.tagged_sents(f)]
    dataset.add_train(train_d)
    dataset.add_test(test_d)
    return dataset
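The f[4:6] slice relies on the fixed fileid layout, e.g.:

assert 'WSJ/22/WSJ_2200.MRG'[4:6] == '22'  # the two-digit section number

Note that if the installed corpus also exposes non-WSJ fileids (e.g. the BROWN files), they would land in the training list too, since only the section digits are checked.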
Example #11
def verify_ptb_install():
    # Download PTB metadata
    assert(nltk.download('ptb'))

    import hashlib
    from nltk.corpus import ptb
    # Be sure we have the category list
    assert('news' in ptb.categories())

    m = hashlib.md5()  # NOT SECURE!
    m.update(','.join(ptb.fileids()).encode('utf8'))
    if m.hexdigest() == 'e3b49c6df5529560b2945e6a4715f9b0':
        print('Penn Treebank successfully installed!')
        return True
    else:
        print('Error installing Penn Treebank (hash mismatch).')
        print('It may still work - try loading it in NLTK.')
        return False
Example #13
    def __init__(self, dict_path):
        """Initialization.

        Args:
            dict_path: path to dictionary folder
        Raises:
            Exception: missing dictionary
        """

        dict_file_name = os.path.join(dict_path, 'dict.pkl')
        if os.path.exists(dict_file_name):
            self.dictionary = pickle.load(open(dict_file_name, 'rb'))
        else:
            raise Exception('missing dictionary: ' + dict_file_name)

        all_file_ids = ptb.fileids()
        train_file_ids = []
        valid_file_ids = []
        test_file_ids = []
        rest_file_ids = []
        for file_id in all_file_ids:
            if 'WSJ/00/WSJ_0200.MRG' <= file_id <= 'WSJ/21/WSJ_2199.MRG':
                train_file_ids.append(file_id)
            if 'WSJ/22/WSJ_2200.MRG' <= file_id <= 'WSJ/22/WSJ_2299.MRG':
                valid_file_ids.append(file_id)
            if 'WSJ/23/WSJ_2300.MRG' <= file_id <= 'WSJ/23/WSJ_2399.MRG':
                test_file_ids.append(file_id)
            elif ('WSJ/00/WSJ_0000.MRG' <= file_id <= 'WSJ/01/WSJ_0199.MRG') or \
                ('WSJ/24/WSJ_2400.MRG' <= file_id <= 'WSJ/24/WSJ_2499.MRG'):
                rest_file_ids.append(file_id)

        self.train, self.train_sens, self.train_trees, self.train_nltktrees \
            = self.tokenize(train_file_ids)
        self.valid, self.valid_sens, self.valid_trees, self.valid_nltktrees \
            = self.tokenize(valid_file_ids)
        self.test, self.test_sens, self.test_trees, self.test_nltktrees \
            = self.tokenize(test_file_ids)
        self.rest, self.rest_sens, self.rest_trees, self.rest_nltktrees \
            = self.tokenize(rest_file_ids)
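The range checks above work because PTB fileids are zero-padded, so plain string comparison orders them exactly like the numeric section and file numbers:

assert 'WSJ/00/WSJ_0200.MRG' <= 'WSJ/07/WSJ_0700.MRG' <= 'WSJ/21/WSJ_2199.MRG'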
Example #14
def get_raw_data():
    raw_data = {}
    fileids = ptb.fileids()

    obj_sofar = 0

    for fileid in fileids:
        corpus, section, _ = fileid.split('/')
        if corpus.lower() != 'wsj':
            continue
        section = int(section)
        if 2 <= section <= 21:
            split = 'train'
        elif section == 22:
            split = 'valid'
        elif section == 23:
            split = 'test'
        else:
            split = None
        sent_sofar = 0
        for y in ptb.parsed_sents(fileid):
            words, part_of_speech = zip(*y.pos())
            constituency_parse = tree_to_tuple(y)
            obj = collections.OrderedDict()
            obj['example_id'] = 'ptb{}'.format(obj_sofar)
            obj['file_id'] = fileid
            obj['sent_id'] = sent_sofar
            obj['words'] = words
            obj['part_of_speech'] = part_of_speech
            obj['constituency_parse'] = constituency_parse
            sent_sofar += 1
            obj_sofar += 1

            raw_data.setdefault('all', []).append(obj)
            if split is not None:
                raw_data.setdefault(split, []).append(obj)

    return raw_data
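tree_to_tuple is not shown in the snippet. A minimal sketch of one plausible implementation over nltk.Tree, nesting each subtree as a (label, children...) tuple (an assumption, not the project's actual helper):

def tree_to_tuple(tree):
    # Leaves of an nltk.Tree are plain strings.
    if isinstance(tree, str):
        return tree
    return (tree.label(),) + tuple(tree_to_tuple(child) for child in tree)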
Example #15
# ... (truncated: tail of an assertEqual over a deeply nested Tree) ...


@skipIf(not ptb.fileids(),
        "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    def test_fileids(self):
        self.assertEqual(
            ptb.fileids()[:4],
            [
                'BROWN/CF/CF01.MRG',
                'BROWN/CF/CF02.MRG',
                'BROWN/CF/CF03.MRG',
                'BROWN/CF/CF04.MRG',
            ],
        )

    def test_words(self):
        self.assertEqual(
Example #16
def test_fileids(self):
    self.assertEqual(
        ptb.fileids()[:4], ["BROWN/CF/CF01.MRG", "BROWN/CF/CF02.MRG", "BROWN/CF/CF03.MRG", "BROWN/CF/CF04.MRG"]
    )
Example #17
# ... (truncated: tail of an assertEqual over a deeply nested Tree) ...


@pytest.mark.skipif(
    not ptb.fileids(),
    reason="A full installation of the Penn Treebank is not available"
)
class TestPTB(unittest.TestCase):
    def test_fileids(self):
        self.assertEqual(
            ptb.fileids()[:4],
            [
                'BROWN/CF/CF01.MRG',
                'BROWN/CF/CF02.MRG',
                'BROWN/CF/CF03.MRG',
                'BROWN/CF/CF04.MRG',
            ],
        )

    def test_words(self):
Example #18
# ... (truncated: tail of an assertEqual over a deeply nested Tree) ...


@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    def test_fileids(self):
        self.assertEqual(
            ptb.fileids()[:4],
            [
                'BROWN/CF/CF01.MRG',
                'BROWN/CF/CF02.MRG',
                'BROWN/CF/CF03.MRG',
                'BROWN/CF/CF04.MRG',
            ],
        )

    def test_words(self):
        self.assertEqual(
            ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
Example #19
def test_news_fileids(self):
    self.assertEqual(
        ptb.fileids('news')[:3],
        ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG']
    )
Example #20
def test_fileids(self):
    self.assertEqual(
        ptb.fileids()[:4],
        ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG']
    )
Example #21
# ... (truncated: tail of an assertEqual over a nested Spanish-language Tree) ...


@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    def test_fileids(self):
        self.assertEqual(
            ptb.fileids()[:4],
            ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG']
        )

    def test_words(self):
        self.assertEqual(
            ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
            ['A', 'form', 'of', 'asbestos', 'once', 'used', '*']
        )

    def test_tagged_words(self):
        self.assertEqual(
Example #22
def test_news_fileids(self):
    self.assertEqual(ptb.fileids("news")[:3], ["WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG", "WSJ/00/WSJ_0003.MRG"])
Example #23
File: data.py Project: Noahs-ARK/PaLM
# ... (truncated: head of a punctuation-token list) ...
    '.', ',', ':', '-LRB-', '-RRB-', '\'\'', '``', '--', ';', '-', '?', '!',
    '...', '-LCB-', '-RCB-'
]

P = re.compile(r"[-+]?\d*\.\d+|[-+]\d+|[-+]?\d*\,\d+|\d+|\d+:\d+")
PY = re.compile(
    r"\d+[%]?-[a-zA-Z]|\d+[%]?[a-zA-Z]|[-+]?\d*\.\d+[%]?-[a-zA-Z]|[-+]?\d*\.\d+[%]?[a-zA-Z]|[-+]?\d*\,\d+-[%]?[a-zA-Z]|[-+]?\d*\,\d+[%]?[a-zA-Z]"
)
PA = re.compile(r"\d+-\d+-\d+")
PB = re.compile(r"\d+\\\/\d+-[A-Za-z]|\d+\\\/\d+[A-Za-z]")
L = [
    "a310-300s", "747-100s", "747-400s", "45,000-$60,000", "767-300er",
    "747-400s"
]

file_ids = ptb.fileids()

train_file_ids = []
valid_file_ids = []
test_file_ids = []
rest_file_ids = []
train_lm_file_ids = []
for fid in file_ids:
    if 'WSJ/00/WSJ_0000.MRG' <= fid <= 'WSJ/24/WSJ_2499.MRG':
        train_file_ids.append(fid)
    if 'WSJ/00/WSJ_0000.MRG' <= fid <= 'WSJ/20/WSJ_2099.MRG':
        train_lm_file_ids.append(fid)
    if 'WSJ/22/WSJ_2200.MRG' <= fid <= 'WSJ/22/WSJ_2299.MRG':
        valid_file_ids.append(fid)
    if 'WSJ/23/WSJ_2300.MRG' <= fid <= 'WSJ/23/WSJ_2399.MRG':
        test_file_ids.append(fid)
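For reference, the numeric pattern P matches plain integers, signed integers, decimals, and comma-grouped numbers (spot checks on hypothetical tokens):

assert P.match('42') and P.match('-42') and P.match('3.14') and P.match('1,000')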
Example #24
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import treebank
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])

from nltk.corpus import ptb
print(ptb.fileids())  # doctest: +SKIP
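Note: nltk.corpus.treebank is the small WSJ sample distributed with NLTK's data packages, so the first four calls work after nltk.download('treebank'). By contrast, nltk.corpus.ptb only yields fileids once the full, licensed Penn Treebank has been installed under NLTK's corpora/ptb directory; otherwise ptb.fileids() comes back empty, which is exactly what the skipIf guards in the test examples above check for.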