# Builds train/validation/test datasets from the Penn Treebank splits.
# TextData is a dataset wrapper defined elsewhere in this project.
from collections import Counter

from nltk.corpus import BracketParseCorpusReader


def retrieve_data():
    train_data = BracketParseCorpusReader("data", "02-21.10way.clean")
    val_data = BracketParseCorpusReader("data", "22.auto.clean")
    test_data = BracketParseCorpusReader("data", "23.auto.clean")

    # Build the vocabulary over all three splits, lowercased.
    train_words = [x.lower() for x in train_data.words()]
    val_words = [x.lower() for x in val_data.words()]
    test_words = [x.lower() for x in test_data.words()]
    all_words = train_words + val_words + test_words
    word_counter = Counter(all_words)

    vocab = ['PAD', 'SOS', 'EOS'] + list(word_counter.keys())
    vocab_size = len(vocab)
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for i, w in enumerate(vocab)}

    train_sents = [[w.lower() for w in sent] for sent in train_data.sents()]
    val_sents = [[w.lower() for w in sent] for sent in val_data.sents()]
    test_sents = [[w.lower() for w in sent] for sent in test_data.sents()]

    train_dataset = TextData(train_sents, word2idx, idx2word, vocab_size)
    val_dataset = TextData(val_sents, word2idx, idx2word, vocab_size)
    test_dataset = TextData(test_sents, word2idx, idx2word, vocab_size)
    return train_dataset, val_dataset, test_dataset
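# Illustrative sketch (not from the source) of the index mappings the
# function builds: the special tokens occupy indices 0-2, and idx2word
# inverts word2idx. The toy vocabulary below is hypothetical.
vocab_demo = ['PAD', 'SOS', 'EOS', 'the', 'cat']
w2i = {w: i for i, w in enumerate(vocab_demo)}
i2w = {i: w for i, w in enumerate(vocab_demo)}
assert w2i['the'] == 3 and i2w[3] == 'the'  # round-trip through the mapping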
# Demonstrates loading two local corpora: a plain-text word list and the
# Penn Treebank WSJ section. The bare expressions are illustrative; the
# original keeps none of their results.
from nltk.corpus import PlaintextCorpusReader, BracketParseCorpusReader


def loadCorpora():
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
# Trains (or reloads) a gensim LDA model over the WSJ treebank.
# Corpus, clean_text, DOC_LEN_THRESHOLD, DICTIONARY_FILE, LDA_MODEL_FILE,
# N_TOPICS and logger are defined elsewhere in this project.
import os
from itertools import chain

from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import BracketParseCorpusReader


def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        # WSJ sections 00-19, plus 20-24.
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]
        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()
        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')
        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)
        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')
        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary,
                       num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')
        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)
    return lda, dictionary
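# Usage sketch (an assumption, not in the source): once the project
# constants above are configured, the learned topics can be inspected
# with gensim's standard print_topics call.
if __name__ == '__main__':
    lda, dictionary = train(refresh=True)
    for topic_id, topic in lda.print_topics(num_topics=5, num_words=8):
        print(topic_id, topic)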
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # path to your own files
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'], or a
# pattern that matches all fileids.
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# 2.2 ====================
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'
file_pattern = r'wsj_.*\.mrg'  # the combined directory is flat, so no subdirectory prefix
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency
#    distributions, one per "condition"; a (condition, word) pair counts
#    how often word occurs under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
print(len(genre_word))
print(genre_word[:4])
print(genre_word[-4:])
# DATA_DIR and FILE_PATTERN are module-level constants defined elsewhere.
from nltk.corpus import BracketParseCorpusReader


def get_sents_by_field_ids(field_ids):
    # Accept a single fileid or a list of fileids.
    if not isinstance(field_ids, list):
        field_ids = [field_ids]
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    return ptb.sents(fileids=field_ids)
sents[1:20]

# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'], or a
# pattern that matches all fileids, like '[abc]/.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional frequency distributions:
# A conditional frequency distribution is a collection of frequency
# distributions, each one for a different "condition". The condition will
# often be the category of the text. A frequency distribution counts
# observable events, such as the appearance of words in a text. A
# conditional frequency distribution needs to pair each event with a
# condition, so instead of processing a sequence of words we process a
# sequence of pairs:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', """..."""]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), '''...''']
# Each pair has the form (condition, event). If we were processing the
# entire Brown Corpus by genre, there would be 15 conditions (one per genre).
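# A short runnable illustration (not in the source) of the
# (condition, event) pairing described above, using a hand-built pair
# list instead of the full Brown Corpus:
import nltk

demo_pairs = [('news', 'The'), ('news', 'Fulton'), ('romance', 'The')]
cfd_demo = nltk.ConditionalFreqDist(demo_pairs)
print(cfd_demo.conditions())    # the two conditions, e.g. ['news', 'romance']
print(cfd_demo['news']['The'])  # 1 -- count of 'The' under the 'news' condition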
# Chinese is stored as characters, so the word-level reader words() does not apply:
# chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])
# Likewise, the sentence-level reader sents() does not apply:
# chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This directory sits under the drive root; place some files in it first.
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
print("Files in my own corpus = ", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

# show_subtitle is a display helper defined elsewhere in this project.
corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
show_subtitle("File list")
print(ptb.fileids()[:13])
show_subtitle("Sentence list")
print(ptb.sents()[:3])
show_subtitle("Sentences from a specified file")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
# Node and SplitNode are defined elsewhere in this project.
from nltk.corpus import BracketParseCorpusReader


class PTBReader(object):

    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)
        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        # Re-read only when the node points at a different treebank file.
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False
        self.read_ptb_file(parent)
        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)
        # child descends from parent iff parent's tree position is a
        # prefix of child's.
        return child_subtree_pos[:len(parent_subtree_pos)] == parent_subtree_pos

    def parse_node(self, node):
        if node.__class__ == SplitNode:
            # Parse each node in the split node.
            for n in node.node_list:
                self.parse_node(n)
            # Combine the ptb_surface of each node.
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])
        else:
            self.read_ptb_file(node)
            node.subtree_pos = self.get_subtree_pos(node)
            parsed_sent = self.all_parsed_sents[node.sent_id]
            node.ptb_idx_list = []
            for idx in range(len(parsed_sent.leaves())):
                if parsed_sent.leaf_treeposition(idx)[:len(node.subtree_pos)] \
                        == node.subtree_pos:
                    node.ptb_idx_list.append(idx)
            # The leaves of a subtree must form a contiguous index range;
            # list(...) is required in Python 3, where a bare range never
            # compares equal to a list.
            assert node.ptb_idx_list == \
                list(range(node.ptb_idx_list[0], node.ptb_idx_list[-1] + 1)), \
                'Error in matching indices for subtree leaves: {0}'.format(node)
            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0] for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
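# Side demonstration (not from the source) of the NLTK primitive the class
# relies on: leaf_treeposition(i) returns the tree position of the i-th
# leaf, and a prefix of that tuple addresses an ancestor subtree.
import nltk

t = nltk.Tree.fromstring('(S (NP (DT the) (NN cat)) (VP (V sat)))')
pos = t.leaf_treeposition(1)  # position of the leaf 'cat'
print(pos)                    # (0, 1, 0)
print(t[pos[:-1]])            # (NN cat) -- drop one level for the POS subtree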
# Preprocess the WSJ sections into train/test/dev files of
# "sentence <TAB> linearized tree" lines. wsj, get_vocab, normalize,
# linearize, MAXLEN, SECTIONS and the *_FILE constants come from
# elsewhere in this project.
file_pattern = r".*/WSJ_.*\.MRG"
ptb = BracketParseCorpusReader(wsj, file_pattern)
print('Gathered %d files...' % len(ptb.fileids()))

print('Generating vocabulary...')
vocab = get_vocab()
print('Done.')

print('Preprocessing all sections...')
for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
    print('Preprocessing %s...' % fn)
    h = open(fn, 'wt')
    for section in range(sections[0], sections[1] + 1):
        # File ids for a section start with its zero-padded number.
        fileids = [
            i for i in ptb.fileids() if i.startswith(str(section).zfill(2))
        ]
        for sent, tree in zip(ptb.sents(fileids), ptb.parsed_sents(fileids)):
            # Map out-of-vocabulary words to <unk>.
            sent = [
                normalize(word) if normalize(word) in vocab else '<unk>'
                for word in sent
            ]
            lin = linearize(tree, token=True, label=False)
            if len(sent) < MAXLEN and len(lin.split()) < MAXLEN:
                h.write('%s\t%s\n' % (' '.join(sent), lin))
    h.close()
    print('Done.')
print('Done.')
# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('blake-poems.txt')

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'
file_pattern = r'wsj_.*\.mrg'  # the combined directory is flat, so no subdirectory prefix
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='wsj_0199.mrg')[1]

# 2. Conditional frequency distributions: a collection of frequency
#    distributions, one per "condition"; a (condition, word) pair counts
#    how often word occurs under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
len(genre_word)
genre_word[:4]
genre_word[-4:]
cfd = nltk.ConditionalFreqDist(genre_word)
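# Follow-up sketch (not in the source): with cfd built from genre_word,
# NLTK's standard tabulate() prints a per-condition count table.
cfd.tabulate(conditions=['news', 'romance'],
             samples=['the', 'love', 'could', 'said'])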
import sys

from nltk.corpus import BracketParseCorpusReader


def ToTrimmedBinaryTreeStr(tree):
    # (def line reconstructed from the call site below; the source shows
    # only the tail of this helper -- earlier trimming steps are elided.)
    tree.chomsky_normal_form()
    return ToString(tree), ToPlainString(tree)


# corpus_root = "/home/jihuni/ptb/treebank_3/parsed/mrg/wsj/train/"
# output = "/home/jihuni/ptb/treebank_3/wsj.train"
corpus_root = sys.argv[1]
output = sys.argv[2]
file_pattern = r".*/wsj_.*\.mrg"
if len(sys.argv) > 3:  # the original compared sys.argv itself to 3
    file_pattern = sys.argv[3]
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
# ptb.fileids()
sents = [' '.join(words) for words in ptb.sents()]
parsed_sents = ptb.parsed_sents()
# binary_trees = [ToBinaryTreeStr(tree) for tree in parsed_sents]
trimmed_binary_trees = [ToTrimmedBinaryTreeStr(tree) for tree in parsed_sents]
# with open(output, 'w') as f:
#     for sent in sents:
#         f.write(sent + '\n')
# with open(output + '.tree', 'w') as f:
#     for sent in binary_trees:
#         f.write(sent + '\n')
with open(output + '.trim', 'w') as f:
    with open(output + '.trim.tree', 'w') as f2:
        for sent, plain_sent in trimmed_binary_trees:
            # Text-mode files take str in Python 3; the .encode('utf-8')
            # calls from the Python 2 original are dropped.
            f.write(plain_sent + '\n')
            f2.write(sent + '\n')
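# Minimal illustration (not from the source) of what chomsky_normal_form()
# does; Tree.fromstring and chomsky_normal_form are standard NLTK calls.
import nltk

t = nltk.Tree.fromstring('(S (NP I) (VP (V saw) (NP him) (PP (P with) (NP it))))')
t.chomsky_normal_form()
print(t)  # the ternary VP is binarized via an introduced VP|<NP-PP> node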
import pickle
from collections import defaultdict
from itertools import tee

from nltk.corpus import BracketParseCorpusReader


def window(iterable, size, left_nulls=False):
    # (def line reconstructed from the call site window(sent, 5) below;
    # only the body appeared in the source.)
    # Pad left with None's so that the first iteration is
    # [None, ..., None, iterable[0]].
    if left_nulls:
        iterable = [None] * (size - 1) + iterable
    # Make `size` independent iterators and advance the i-th one i steps,
    # so zipping them yields sliding windows.
    iters = tee(iterable, size)
    for i in range(1, size):
        for each in iters[i:]:
            next(each, None)
    return zip(*iters)


corpus_root = "wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# counts[offset][center][context] = how often `context` occurs at `offset`
# relative to `center` (offsets -2, -1, +1, +2 within a 5-word window).
counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for sent in ptb.sents():
    for word1, word2, word3, word4, word5 in window(sent, 5):
        counts[-2][word3][word1] += 1
        counts[-1][word3][word2] += 1
        counts[1][word3][word4] += 1
        counts[2][word3][word5] += 1

# Convert the nested defaultdicts to plain dicts before pickling.
counts = dict(counts)
for index, outer_dict in counts.items():
    for word, inner_dict in outer_dict.items():
        counts[index][word] = dict(inner_dict)
    counts[index] = dict(outer_dict)

pickle.dump(counts, open('semantic_counts.pickle', 'wb'))
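# Quick check of the reconstructed helper (illustrative only): a size-2
# window over four tokens yields three adjacent pairs.
print(list(window(['a', 'b', 'c', 'd'], 2)))
# -> [('a', 'b'), ('b', 'c'), ('c', 'd')]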
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())