from nltk.corpus import PlaintextCorpusReader, BracketParseCorpusReader


def loadCorpora():
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
def open_flod(self, root_path, file_type):
    ptb = BracketParseCorpusReader(root_path, file_type)
    files_list = ptb.fileids()
    files_path = []
    for f in files_list:
        files_path.append(os.path.join(root_path, f))
    return (files_path, files_list)
import re

from nltk.corpus import BracketParseCorpusReader


def tree_reader():
    d = {}
    trees = BracketParseCorpusReader("parsed_sentences/", ".*")
    for name in trees.fileids():
        d_name = re.sub(r"\.tree", "", name)
        d[d_name] = list(trees.parsed_sents(name))
    return d
from nltk.corpus import BracketParseCorpusReader


def extracting_cfg(corpus_root, file_pattern):
    # returns a CFG with only 2 non-terminals on the right-hand side
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cfg_dict = {}
    unite_productions = {}
    lexicon = {}
    for file in ptb.fileids():
        # file = ptb.fileids()[0]
        print(file)
        for sentence in ptb.parsed_sents(file):  # iterating through sentences
            # sentence = ptb.parsed_sents(file)[some_i]
            if len(sentence.leaves()) <= 8:
                # print(sentence.leaves())
                for subtree in sentence.subtrees():  # extracting subtree
                    left_side = subtree.label()
                    right_side = []
                    for children in subtree:
                        if isinstance(children, str):  # reached leaf node
                            right_side.append(children)
                            if left_side in lexicon:
                                lexicon[left_side].add(children)
                            else:
                                lexicon[left_side] = set()
                                lexicon[left_side].add(children)
                        else:  # still not a leaf node
                            right_side.append(children.label())
                    while len(right_side) > 2:  # making only 2 non-terminals on the right side
                        new_head = '_'.join(right_side[1:])  # generating new left side of the rule
                        new_right_side = right_side[:1] + [new_head]  # generating new right side of the rule
                        tup = tuple(new_right_side)
                        if left_side not in cfg_dict:  # new key
                            cfg_dict[left_side] = set()
                            cfg_dict[left_side].add(tup)
                        else:
                            cfg_dict[left_side].add(tup)
                        left_side = new_head
                        right_side = right_side[1:]
                    if len(right_side) == 1:  # unit production
                        if left_side in unite_productions:
                            unite_productions[left_side].add(tuple(right_side))
                        else:
                            unite_productions[left_side] = set()
                            unite_productions[left_side].add(tuple(right_side))
                    if left_side in cfg_dict:  # adding rule to the dict
                        cfg_dict[left_side].add(tuple(right_side))
                    else:
                        cfg_dict[left_side] = set()
                        cfg_dict[left_side].add(tuple(right_side))
    return cfg_dict, lexicon, unite_productions
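# A minimal usage sketch for extracting_cfg() above; the corpus_root and
# file_pattern values are illustrative assumptions, not paths from the source.
cfg_rules, lexicon, unit_rules = extracting_cfg(
    r"C:\corpora\penntreebank\parsed\mrg\wsj", r".*/wsj_.*\.mrg")
# cfg_rules maps a left-hand-side label to a set of right-hand-side tuples with
# at most two non-terminals; lexicon maps pre-terminal tags to observed words.
for lhs in list(cfg_rules)[:3]:
    print(lhs, '->', list(cfg_rules[lhs])[:2])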
def extracting_cnf(corpus_root, file_pattern):
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cnf_dict = {}
    cnf_dict['lexicon'] = set()
    # for file in ptb.fileids():
    file = ptb.fileids()[0]
    print(file)
    for s in range(1, len(ptb.parsed_sents(file))):
        tree = ptb.parsed_sents(file)[s]
        for sub in tree.subtrees():
            return_rule(sub, cnf_dict, file)
    return cnf_dict
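# return_rule() is not included in this snippet. Purely as an illustration of
# the kind of helper the loop above expects, a hypothetical version might
# record lexical and non-terminal rules into cnf_dict; this is an assumption,
# not the original implementation (the file argument is accepted but unused).
def return_rule(sub, cnf_dict, file):
    lhs = sub.label()
    rhs = tuple(c if isinstance(c, str) else c.label() for c in sub)
    if all(isinstance(c, str) for c in sub):
        # pre-terminal rule: store (tag, word) pairs in the shared lexicon set
        for word in rhs:
            cnf_dict['lexicon'].add((lhs, word))
    else:
        # non-terminal rule: lhs -> tuple of child labels
        cnf_dict.setdefault(lhs, set()).add(rhs)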
def print_corpus_metrics(corpus_dir='data'):
    # Note: corpus_dir is unused here; the reader is built from the
    # module-level DATA_DIR and FILE_PATTERN constants defined elsewhere.
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    words = ptb.words()
    print('Total number of words', len(words))
    print('Total number of unique words', len(set(words)))
    print('Total number of documents', len(ptb.fileids()))
def load_reader_and_filedids(lang, data_type):
    assert data_type in ('train', 'val', 'test')

    def filter_trees(tree, data_type):
        def _is_control(char):
            """Checks whether `chars` is a control character."""
            # These are technically control characters but we count them as
            # whitespace characters.
            if char == "\t" or char == "\n" or char == "\r":
                return False
            cat = unicodedata.category(char)
            if cat.startswith("C"):
                return True
            return False

        sent = tree.leaves()
        if data_type == 'wsj' and len(sent) > 10:
            return False
        if data_type != 'wsj' and len(sent) > 128:
            return False
        try:
            for c in ' '.join(sent):
                cp = ord(c)
                if cp == 0 or cp == 0xfffd or _is_control(c):
                    return False
            return True
        except:
            return False

    def filt_id(fileids, lang):
        assert lang in ('en', 'fr', 'zh')
        train_file_ids, valid_file_ids, test_file_ids = [], [], []
        for id in fileids:
            prefix = id.split('.')[0]
            if lang == 'en':
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(id)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            elif lang == 'zh':
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(id)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <= '1147':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            else:
                if prefix in ('flmf3_12500_12999co', 'flmf7ab2ep', 'flmf7ad1co', 'flmf7ae1ep'):
                    valid_file_ids.append(id)
                elif prefix in ('flmf3_12000_12499ep', 'flmf7aa1ep', 'flmf7aa2ep', 'flmf7ab1co'):
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
        return train_file_ids, valid_file_ids, test_file_ids

    assert lang in ('en', 'zh', 'fr', 'il', 'jp', 'sp', 'ca', 'sw', 'de')
    lang_dir = treebank_dir + '/' + lang
    reader = BracketParseCorpusReader(lang_dir, '.*')
    fileids = reader.fileids()
    if data_type == 'wsj10':
        return [t for t in reader.parsed_sents(fileids) if filter_trees(t, data_type)]

    train_file_ids = []
    valid_file_ids = []
    test_file_ids = []
    if lang in ('en', 'zh', 'fr'):
        train_file_ids, valid_file_ids, test_file_ids = filt_id(fileids, lang)
        train_trees = reader.parsed_sents(train_file_ids)
        val_trees = reader.parsed_sents(valid_file_ids)
        test_trees = reader.parsed_sents(test_file_ids)
    else:
        for fid in fileids:
            if 'train' in fid:
                train_trees = reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees = reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees = reader.parsed_sents(fid)

    if data_type == 'train':
        train_trees = [t for t in train_trees if filter_trees(t, data_type)]
        print(f'train:{len(train_trees)}')
        return train_trees
    elif data_type == 'val':
        val_trees = [t for t in val_trees if filter_trees(t, data_type)]
        print(f'val:{len(val_trees)}')
        return val_trees
    else:
        test_trees = [t for t in test_trees if filter_trees(t, data_type)]
        print(f'test:{len(test_trees)}')
        return test_trees
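# A minimal usage sketch for load_reader_and_filedids() above, assuming the
# module-level variable treebank_dir points at a directory laid out as
# <treebank_dir>/<lang>/ with bracketed parse files. For 'en', the splits follow
# the code above: WSJ section 22 -> val, section 23 -> test, the rest -> train.
train_trees = load_reader_and_filedids('en', 'train')
val_trees = load_reader_and_filedids('en', 'val')
test_trees = load_reader_and_filedids('en', 'test')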
def revertPOS(symbol):
    return symbol[1:-1]


###### Main #########################################################################
if __name__ == '__main__':
    clArgs = createArgParser().parse_args()

    # Check if any arguments are given. If not, display help
    active = False
    if clArgs.penn is not None and clArgs.grammar is not None:
        active = True

        ## Set up the treebank reader
        ptb = BracketParseCorpusReader(path.dirname(clArgs.penn),
                                       [path.basename(clArgs.penn)])

        ## Collect all terminal and nonterminals
        for tree in ptb.parsed_sents(ptb.fileids()[0]):
            # Also set the start symbol to the root of the first tree
            if len(start_symbol) == 0:
                start_symbol = tree.label()  # tree.node in older NLTK versions
            findSymbolsInTree(tree)

        ## Find ambiguous symbols and map them to a unique alternative
        for symbol in nonterminals.intersection(pos):
            replacement = "_" + symbol + "_"
            symbolMap[symbol] = replacement
            if replacement in pos or replacement in nonterminals:
                print("Cannot make nonterminal unambiguous: ", symbol)
                sys.exit(-1)

        ## Iterate over all trees and replace ambiguous nonterminals with their unique alternative
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'],
# or a pattern that matches all fileids, like '[abc]/.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional Frequency Distributions:
# A conditional frequency distribution is a collection of frequency
# distributions, each one for a different "condition". The condition will often
# be the category of the text. A frequency distribution counts observable
# events, such as the appearance of words in a text. A conditional frequency
# distribution needs to pair each event with a condition. So instead of
# processing a sequence of words, we have to process a sequence of pairs:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', """..."""]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), '''...''']
# Each pair has the form (condition, event).
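# A minimal sketch of how such (condition, event) pairs feed a conditional
# frequency distribution; the toy pairs below are illustrative only.
import nltk
toy_pairs = [('news', 'The'), ('news', 'Fulton'), ('romance', 'The'), ('romance', 'said')]
cfd = nltk.ConditionalFreqDist(toy_pairs)
cfd.conditions()       # the conditions seen so far, e.g. 'news' and 'romance'
cfd['news']['The']     # 1: how often 'The' occurred under the 'news' condition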
from os.path import join

from bs4 import BeautifulSoup
from nltk.corpus import stopwords  # new
from nltk.probability import FreqDist  # new
from nltk.tokenize import word_tokenize  # new
from nltk.tokenize import RegexpTokenizer  # new
from nltk.corpus import BracketParseCorpusReader  # new
from itertools import zip_longest

default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(('mln', 'reuter', 'dlrs', 'pct', 'the', 'bc', 'reute',
                        'cts', 'shr', 'feb', 'vs', 'would', 'will', 'inc',
                        'corp', 'ltd', 'net', 'billion'))
stops = default_stopwords | custom_stopwords

corpus_root = r"articles/"
file_pattern = r"[A-Za-z0-9-]+.sgm"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

filtered_words = []
onlyfiles = [join(corpus_root, f) for f in ptb.fileids()]
# file = 'reut2-000.sgm'
for file in onlyfiles:
    with open(file) as file:
        soup = BeautifulSoup(file, 'html.parser')
        tokenizer = RegexpTokenizer(r'\w+')
        words = tokenizer.tokenize(soup.getText())
        for word in words:
            word = word.lower()
            if word not in stops and not word.isnumeric() and len(word) > 1:
                filtered_words.append(word)

fdist = FreqDist(filtered_words)
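# A small follow-up sketch: inspect the most frequent surviving tokens.
# FreqDist.most_common() comes from collections.Counter; the cutoff of 20 is arbitrary.
for token, count in fdist.most_common(20):
    print(token, count)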
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # path to your own corpus
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# 2.2 ====================
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
    vocab = [i[0] for i in vocab]
    return vocab


if __name__ == '__main__':
    TRAIN_FILE = 'data/wsj_2-21'
    TEST_FILE = 'data/wsj_23'
    DEV_FILE = 'data/wsj_24'
    SECTIONS = [(2, 21), (23, 23), (24, 24)]
    MAXLEN = 50

    wsj = '/data/penn_tb_3.0/TREEBANK_3/PARSED/MRG/WSJ/'
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
            for sent, tree in zip(ptb.sents(fileids),
class PropBank(DataLoader):
    """Load PropBank data."""

    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')

        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')

            logging.info('Found {} treebank files.'.format(
                len(self.wsj_treebank.fileids())))

        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )

        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for inst in self.propbank.instances():
            docid = inst.fileid.split('/')[-1]
            self.propbank_annos[docid].append(inst)

        self.stats = {
            'predicate_count': 0,
            'argument_count': 0,
        }

    def add_all_annotations(self, doc):
        logging.info("Adding propbank annotations for " + doc.docid)

        instances = self.propbank_annos[doc.docid]

        for inst in instances:
            parsed_sents = doc.get_parsed_sents()

            tree = parsed_sents[inst.sentnum]
            p_word_idx = utils.make_words_from_pointer(tree, inst.predicate)
            pred_span = utils.get_nltk_span(doc.get_token_spans(),
                                            inst.sentnum, p_word_idx)
            pred_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum,
                                           inst.predicate)
            self.stats['predicate_count'] += 1

            for argloc, arg_slot in inst.arguments:
                a_word_idx = utils.make_words_from_pointer(tree, argloc)
                arg_span = utils.get_nltk_span(doc.get_token_spans(),
                                               inst.sentnum, a_word_idx)

                if len(arg_span) == 0:
                    continue

                self.stats['argument_count'] += 1

                p = doc.add_predicate(None, pred_span, frame_type='PROPBANK')
                arg_em = doc.add_entity_mention(None, arg_span)
                arg_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum, argloc)

                if p and arg_em:
                    p.add_meta('node', pred_node_repr)

                    arg_mention = doc.add_argument_mention(
                        p, arg_em.aid, arg_slot.lower())
                    arg_mention.add_meta('node', arg_node_repr)

    def print_stats(self):
        logging.info("Corpus statistics from Propbank")

        for key, value in self.stats.items():
            logging.info(f"{key} : {value}")
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())
# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader
corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency
#    distributions, each for a different "condition". A (condition, word) pair
#    counts the frequency of the word under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
print(len(genre_word))
print(genre_word[:4])
class NomBank(DataLoader):
    """Loading Nombank data and implicit argument annotations."""

    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus, with_doc)
        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            [len(preds) for (d, preds) in self.gc_annos.items()])
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }

        self.stat_dir = params.stat_dir

    class NomElement:
        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")

            read_id = parts[0]
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'

            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (self.article_id, self.sent_num,
                                      self.pointer.__repr__())

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, self.pointer.__repr__()))

        def __eq__(self, other):
            return other and other.__str__() == self.__str__()

        __repr__ = __str__

    def load_gc_annotations(self):
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()

        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            all_pointers = []
            split_pointers = []

            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)

            if len(split_pointers) > 0:
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))

            return all_pointers

        total_implicit_count = 0
        total_preds = 0

        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)

            article_id = predicate.article_id
            total_preds += 1

            explicit_roles = set()
            arg_annos = defaultdict(list)

            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']

                (arg_article_id, arg_sent_id, arg_terminal_id,
                 arg_height) = arg_node_pos.split(':')

                is_split = False
                is_explicit = False

                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True

                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True

                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))

            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignoring annotations after the sentence.
                    continue

                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)

                        if not predicate.pointer == arg_element.pointer:
                            # Ignoring incorporated ones.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)

            gc_annotations[article_id.split('/')[-1]][predicate] = all_args

            total_implicit_count += len(implicit_role_here)

        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")

        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        pred_node_repr = "%s:%d:%s" % (doc.docid, predicate_node.sent_num,
                                       predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(p_tree,
                                                   predicate_node.pointer)
        predicate_span = utils.get_nltk_span(doc.token_spans,
                                             predicate_node.sent_num,
                                             p_word_idx)

        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found")
            return

        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')

        if p:
            p.add_meta('node', pred_node_repr)

        return p

    def add_nombank_arg(self, doc, parsed_sents, wsj_spans, arg_type,
                        predicate, arg_node, implicit=False):
        arg_type = arg_type.lower()

        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)

        arg_node_repr = "%s:%d:%s" % (doc.docid, arg_node.sent_num,
                                      arg_node.pointer)
        argument_span = utils.get_nltk_span(wsj_spans, arg_node.sent_num,
                                            a_word_idx)

        if len(argument_span) == 0:
            # Some arguments are empty nodes, they will be ignored.
            return

        em = doc.add_entity_mention(None, argument_span)

        if em:
            if implicit:
                arg_type = 'i_' + arg_type

            arg_mention = doc.add_argument_mention(predicate, em.aid,
                                                   arg_type)
            arg_mention.add_meta('node', arg_node_repr)

            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)

            return arg_mention

    def get_predicate_text(self, p):
        p_text = p.text.lower()

        if p_text == 'losses' or p_text == 'loss' or p_text == 'tax-loss':
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')

        if p_text == 'savings-and-loan':
            p_text = 'loan'

        if '-' in p_text:
            p_text = p_text.split('-')[1]

        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        logging.info("Adding Nombank annotation for " + doc.docid)

        nb_instances = self.nombank_annos[doc.docid]

        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(doc.docid,
                                                nb_instance.sentnum,
                                                nb_instance.predicate)

            p = self.add_predicate(doc, parsed_sents, predicate_node)

            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                              argloc)
                arg = self.add_nombank_arg(doc, parsed_sents, doc.token_spans,
                                           argid, p, arg_node)

                if arg_node.pointer == predicate_node.pointer:
                    arg.add_meta('incorporated', True)

        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)

                p = self.add_predicate(doc, parsed_sents, predicate_node)
                p_text = utils.normalize_pred_text(p.text)

                p.add_meta('from_gc', True)

                self.stats['target_pred_count'][p_text] += 1

                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(doc, parsed_sents,
                                                   doc.token_spans, arg_type,
                                                   p, arg_node, True)
                        added_args[arg_type].append(arg)

                        # The following should be useless already.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)

                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)

                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        text = ''
        w_start = 0

        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []

            for word, tag in tagged_sent:
                if not tag == '-NONE-':
                    text += word + ' '
                    word_spans.append((w_start, w_start + len(word)))
                    w_start += len(word) + 1
                else:
                    # Ignoring these words.
                    word_spans.append(None)

            text += '\n'
            w_start += 1

            spans.append(word_spans)

        doc.set_text(text)

        return spans

    def load_nombank(self):
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        for docid, instances in self.nombank_annos.items():
            if self.params.gc_only and docid not in self.gc_annos:
                continue

            doc = DEDocument(self.corpus)
            doc.set_id(docid)

            fileid = docid.split('_')[-1][:2] + '/' + docid

            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)

            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)

            self.add_all_annotations(doc, parsed_sents)

            yield doc

    def print_stats(self):
        logging.info("Corpus statistics from Nombank")

        keys = self.stats.keys()
        headline = 'predicate\t' + '\t'.join(keys)

        sums = Counter()

        if not os.path.exists(self.stat_dir):
            os.makedirs(self.stat_dir)

        preds = sorted(self.stats['predicates_with_implicit'].keys())

        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')

            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]
                print(line)
                out.write(f'{line}\n')

            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
# Chinese is character-based, so the word-level reader words() cannot be used
# chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])

# Chinese is character-based, so the sentence-level reader sents() cannot be used
# chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This directory sits under the C: drive root; place some files in the subdirectory
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
print("File list of my own corpus = ", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
# show_subtitle is a helper defined elsewhere in this tutorial
show_subtitle("File list")
print(ptb.fileids()[:13])
show_subtitle("Sentence list")
print(ptb.sents()[:3])
show_subtitle("Sentences from a specific file")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
import nltk
import random
from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        # tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
# print(grammar.productions())
# print(grammar._lhs_index)
# print(grammar.productions(lhs=grammar.start()))
# print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
# print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

used_symbols = []


def generate_symbols(symbol):
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())