def read_ptb():
    sys.stderr.write("\nReading PTB data from " + PTB_DATA_DIR + " ...\n")
    sentences = []
    senno = 0
    with codecs.open("ptb.sents", "w", "utf-8") as ptbsf:
        for constitfile in os.listdir(PTB_DATA_DIR):
            reader = BracketParseCorpusReader(PTB_DATA_DIR, constitfile)
            parses = reader.parsed_sents()
            # TODO: map from parses to sentences
            for p in parses:
                ptbsf.write(" ".join(p.leaves()) + "\n")
                tokpos = p.pos()
                tokens = [VOCDICT.addstr(tok) for tok, pos in tokpos]
                postags = [POSDICT.addstr(pos) for tok, pos in tokpos]
                s = Sentence(
                    "constit",
                    sentnum=senno,
                    tokens=tokens,
                    postags=postags,
                )
                s.get_all_parts_of_ctree(p, CLABELDICT, False)
                sentences.append(s)
                senno += 1
    sys.stderr.write("# PTB sentences: %d\n" % len(sentences))
    ptbsf.close()
    return sentences
def open_flod(self, root_path, file_type):
    ptb = BracketParseCorpusReader(root_path, file_type)
    files_list = ptb.fileids()
    files_path = []
    for f in files_list:
        files_path.append(os.path.join(root_path, f))
    return (files_path, files_list)
def tree_reader():
    d = {}
    trees = BracketParseCorpusReader("parsed_sentences/", ".*")
    for name in trees.fileids():
        d_name = re.sub(r"\.tree", "", name)
        d[d_name] = list(trees.parsed_sents(name))
    return d
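A minimal usage sketch for tree_reader() above, assuming a local parsed_sentences/ directory of bracketed .tree files (the directory contents are an assumption, not part of the original snippet):

# Hypothetical usage of tree_reader(); assumes parsed_sentences/*.tree exist locally.
trees_by_file = tree_reader()
for name, parsed in trees_by_file.items():
    print(name, len(parsed))      # file stem and number of trees it holds
    parsed[0].pretty_print()      # nltk.Tree can render itself as ASCII art
    break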
def get_tagger():
    dirname = os.path.dirname(__file__)
    corpus_root = os.path.join(dirname, 'training_data')
    testcaselists = BracketParseCorpusReader(corpus_root, [
        'click.txt', 'enter_text.txt', 'browser.txt', 'load_url.txt',
        'keyboard_actions.txt'
    ])
    tagger = ConsecutivePosTagger(testcaselists.tagged_sents())
    return tagger
def extracting_cfg(corpus_root, file_pattern):
    # Returns a CFG with only 2 non-terminals on the right-hand side.
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cfg_dict = {}
    unite_productions = {}
    lexicon = {}
    for file in ptb.fileids():
        # file = ptb.fileids()[0]
        print(file)
        for sentence in ptb.parsed_sents(file):  # iterating through sentences
            # sentence = ptb.parsed_sents(file)[some_i]
            if len(sentence.leaves()) <= 8:
                # print(sentence.leaves())
                for subtree in sentence.subtrees():  # extracting subtree
                    left_side = subtree.label()
                    right_side = []
                    for children in subtree:
                        if isinstance(children, str):  # reached leaf node
                            right_side.append(children)
                            if left_side in lexicon:
                                lexicon[left_side].add(children)
                            else:
                                lexicon[left_side] = set()
                                lexicon[left_side].add(children)
                        else:  # still not a leaf node
                            right_side.append(children.label())
                    while len(right_side) > 2:  # keep only 2 non-terminals on the right side
                        new_head = '_'.join(right_side[1:])  # generating new left side of the rule
                        new_right_side = right_side[:1] + [new_head]  # generating new right side of the rule
                        tup = tuple(new_right_side)
                        if left_side not in cfg_dict:  # new key
                            cfg_dict[left_side] = set()
                            cfg_dict[left_side].add(tup)
                        else:
                            cfg_dict[left_side].add(tup)
                        left_side = new_head
                        right_side = right_side[1:]
                    if len(right_side) == 1:  # unit production
                        if left_side in unite_productions:
                            unite_productions[left_side].add(tuple(right_side))
                        else:
                            unite_productions[left_side] = set()
                            unite_productions[left_side].add(tuple(right_side))
                    if left_side in cfg_dict:  # adding rule to the dict
                        cfg_dict[left_side].add(tuple(right_side))
                    else:
                        cfg_dict[left_side] = set()
                        cfg_dict[left_side].add(tuple(right_side))
    return cfg_dict, lexicon, unite_productions
def extracting_cnf(corpus_root, file_pattern):
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cnf_dict = {}
    cnf_dict['lexicon'] = set()
    # for file in ptb.fileids():
    file = ptb.fileids()[0]
    print(file)
    for s in range(1, len(ptb.parsed_sents(file))):
        tree = ptb.parsed_sents(file)[s]
        for sub in tree.subtrees():
            return_rule(sub, cnf_dict, file)
    return cnf_dict
def seg_pos_ctb(ctb_dir, fileids):
    reader = BracketParseCorpusReader(ctb_dir, fileids)
    # Produce (word, POS) tuples:
    # tree = reader.tagged_sents()
    # Produce the parse tree of each sentence; some data cannot be parsed
    # correctly, e.g. the sentence "五年来" in 40.nw.
    tree = reader.parsed_sents()
    print('tree len: {}'.format(len(tree)))
    seg_pos_sentences = []
    broken_parses = []
    for s in tree:
        s = s.pos()
        if s and s != [] and type(s[0]) == tuple:
            s = [j if j[1] != '-NONE-' else (' NONE ', 'NONE') for j in s]
            seg_pos_sentences.append(s)
        else:
            broken_parses.append(s)
    return seg_pos_sentences, broken_parses
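A hedged usage sketch for seg_pos_ctb(); the directory path and the chtb_* file pattern below are assumptions about a local Chinese Treebank layout, not values from the original code:

# Hypothetical paths; adjust to wherever the bracketed CTB files actually live.
sentences, broken = seg_pos_ctb('/path/to/ctb/bracketed', r'chtb_.*\.nw')
print('{} sentences segmented, {} parses skipped'.format(len(sentences), len(broken)))
if sentences:
    print(sentences[0][:10])  # first ten (word, POS) pairs of the first sentence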
def __init__(self, params, corpus, with_doc=False):
    super().__init__(params, corpus, with_doc)
    self.wsj_treebank = BracketParseCorpusReader(
        root=params.wsj_path,
        fileids=params.wsj_file_pattern,
        tagset="wsj",
        encoding="ascii",
    )
    logging.info("Found {} treebank files.".format(
        len(self.wsj_treebank.fileids())))
    self.nombank = NombankCorpusReader(
        root=FileSystemPathPointer(params.nombank_path),
        nomfile=params.nomfile,
        framefiles=params.frame_file_pattern,
        nounsfile=params.nombank_nouns_file,
        parse_fileid_xform=lambda s: s[4:],
        parse_corpus=self.wsj_treebank,
    )
    logging.info("Loading G&C annotations.")
    self.gc_annos = self.load_gc_annotations()
    num_gc_preds = sum(
        [len(preds) for (d, preds) in self.gc_annos.items()])
    logging.info(f"Loaded {num_gc_preds} predicates")
    logging.info("Loading Nombank annotations")
    self.nombank_annos = defaultdict(list)
    for nb_instance in self.nombank.instances():
        docid = nb_instance.fileid.split("/")[-1]
        self.nombank_annos[docid].append(nb_instance)
    self.stats = {
        "target_pred_count": Counter(),
        "predicates_with_implicit": Counter(),
        "implicit_slots": Counter(),
    }
    self.stat_dir = params.stat_dir
def read_wsj(article_count):
    wsj_root = '/Users/chbrown/Dropbox/ut/nlp/data/penn-treebank3/parsed/mrg/wsj'
    articles = []
    for section in range(25):
        for article_path in os.listdir('%s/%02d' % (wsj_root, section)):
            reader = BracketParseCorpusReader(wsj_root, '%02d/%s' % (section, article_path))
            sentences = []
            for tagged_sent in reader.tagged_sents():
                # token_postag_pairs = sentence
                token_postag_pairs = [
                    (token.lower(), pos_tag)
                    for token, pos_tag in tagged_sent
                    if pos_tag not in ('-LRB-', '-RRB-', '-NONE-')]
                sentence = DefinitenessDocument.from_token_postag_pairs(token_postag_pairs)
                sentences.append(sentence)
            articles.append(sentences)
            if len(articles) >= article_count:
                return articles
    return articles
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]
        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()
        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')
        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)
        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')
        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')
        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)
    return lda, dictionary
def loadCorpora():
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
def __init__(self, config_path):
    conf = load_file_config(config_path)
    logging.info(json.dumps(conf, indent=2))
    params = GCDataSet.GCConfig(config=conf)
    super().__init__(params)
    wsj_treebank = BracketParseCorpusReader(
        root=params.wsj_path,
        fileids=params.wsj_file_pattern,
        tagset='wsj',
        encoding='ascii')
    self.nombank = NombankCorpusReader(
        root=FileSystemPathPointer(params.nombank_path),
        nomfile=params.nomfile,
        framefiles=params.frame_file_pattern,
        nounsfile=params.nombank_nouns_file,
        parse_fileid_xform=lambda s: s[4:],
        parse_corpus=wsj_treebank)
def get_sents_by_field_ids(field_ids):
    if not isinstance(field_ids, list):
        field_ids = [field_ids]
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    return ptb.sents(fileids=field_ids)
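A small sketch of how get_sents_by_field_ids() might be called; DATA_DIR/FILE_PATTERN and the '20/wsj_2013.mrg' fileid are assumptions about a local PTB layout:

# Hypothetical fileid; works only if DATA_DIR/FILE_PATTERN point at PTB .mrg files.
sents = get_sents_by_field_ids('20/wsj_2013.mrg')
print(len(sents), 'sentences; first one:', ' '.join(sents[0]))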
def load_reader_and_filedids(lang, data_type):
    assert data_type in ('train', 'val', 'test')

    def filter_trees(tree, data_type):
        def _is_control(char):
            """Checks whether `char` is a control character."""
            # These are technically control characters but we count them as
            # whitespace characters.
            if char == "\t" or char == "\n" or char == "\r":
                return False
            cat = unicodedata.category(char)
            if cat.startswith("C"):
                return True
            return False

        sent = tree.leaves()
        if data_type == 'wsj' and len(sent) > 10:
            return False
        if data_type != 'wsj' and len(sent) > 128:
            return False
        try:
            for c in ' '.join(sent):
                cp = ord(c)
                if cp == 0 or cp == 0xfffd or _is_control(c):
                    return False
            return True
        except:
            return False

    def filt_id(fileids, lang):
        assert lang in ('en', 'fr', 'zh')
        train_file_ids, valid_file_ids, test_file_ids = [], [], []
        for id in fileids:
            prefix = id.split('.')[0]
            if lang == 'en':
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(id)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            elif lang == 'zh':
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(id)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <= '1147':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            else:
                if prefix in ('flmf3_12500_12999co', 'flmf7ab2ep', 'flmf7ad1co', 'flmf7ae1ep'):
                    valid_file_ids.append(id)
                elif prefix in ('flmf3_12000_12499ep', 'flmf7aa1ep', 'flmf7aa2ep', 'flmf7ab1co'):
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
        return train_file_ids, valid_file_ids, test_file_ids

    assert lang in ('en', 'zh', 'fr', 'il', 'jp', 'sp', 'ca', 'sw', 'de')
    lang_dir = treebank_dir + '/' + lang
    reader = BracketParseCorpusReader(lang_dir, '.*')
    fileids = reader.fileids()
    if data_type == 'wsj10':
        return [t for t in reader.parsed_sents(fileids) if filter_trees(t, data_type)]
    train_file_ids = []
    valid_file_ids = []
    test_file_ids = []
    if lang in ('en', 'zh', 'fr'):
        train_file_ids, valid_file_ids, test_file_ids = filt_id(fileids, lang)
        train_trees = reader.parsed_sents(train_file_ids)
        val_trees = reader.parsed_sents(valid_file_ids)
        test_trees = reader.parsed_sents(test_file_ids)
    else:
        for fid in fileids:
            if 'train' in fid:
                train_trees = reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees = reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees = reader.parsed_sents(fid)
    if data_type == 'train':
        train_trees = [t for t in train_trees if filter_trees(t, data_type)]
        print(f'train:{len(train_trees)}')
        return train_trees
    elif data_type == 'val':
        val_trees = [t for t in val_trees if filter_trees(t, data_type)]
        print(f'val:{len(val_trees)}')
        return val_trees
    else:
        test_trees = [t for t in test_trees if filter_trees(t, data_type)]
        print(f'test:{len(test_trees)}')
        return test_trees
def retrieve_data():
    train_data = BracketParseCorpusReader("data", "02-21.10way.clean")
    val_data = BracketParseCorpusReader("data", "22.auto.clean")
    test_data = BracketParseCorpusReader("data", "23.auto.clean")

    train_words = [x.lower() for x in train_data.words()]
    val_words = [x.lower() for x in val_data.words()]
    test_words = [x.lower() for x in test_data.words()]
    all_words = train_words + val_words + test_words

    word_counter = Counter(all_words)
    vocab = ['PAD', 'SOS', 'EOS'] + list(word_counter.keys())
    vocab_size = len(vocab)
    word2idx = {ch: i for i, ch in enumerate(vocab)}
    idx2word = {i: ch for i, ch in enumerate(vocab)}

    train_sents = [[w.lower() for w in sent] for sent in train_data.sents()]
    val_sents = [[w.lower() for w in sent] for sent in val_data.sents()]
    test_sents = [[w.lower() for w in sent] for sent in test_data.sents()]

    train_dataset = TextData(train_sents, word2idx, idx2word, vocab_size)
    val_dataset = TextData(val_sents, word2idx, idx2word, vocab_size)
    test_dataset = TextData(test_sents, word2idx, idx2word, vocab_size)
    return train_dataset, val_dataset, test_dataset
class PTBReader(object):
    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)
        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False
        self.read_ptb_file(parent)
        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)
        if child_subtree_pos[:len(parent_subtree_pos)] == parent_subtree_pos:
            return True
        else:
            return False

    def parse_node(self, node):
        if node.__class__ == SplitNode:
            # parse each node in the split node
            for n in node.node_list:
                self.parse_node(n)
            # combine the ptb_surface of each node
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])
        else:
            self.read_ptb_file(node)
            node.subtree_pos = self.get_subtree_pos(node)
            parsed_sent = self.all_parsed_sents[node.sent_id]
            node.ptb_idx_list = []
            for idx in range(len(parsed_sent.leaves())):
                if parsed_sent.leaf_treeposition(idx)[:len(node.subtree_pos)] \
                        == node.subtree_pos:
                    node.ptb_idx_list.append(idx)
            assert node.ptb_idx_list == \
                list(range(node.ptb_idx_list[0], node.ptb_idx_list[-1] + 1)), \
                'Error in matching indices for subtree leaves: {0}'.format(node)
            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0] for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
class NomBank(DataLoader):
    """Loading Nombank data and implicit argument annotations."""

    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus, with_doc)
        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')
        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))
        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)
        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            [len(preds) for (d, preds) in self.gc_annos.items()])
        logging.info(f"Loaded {num_gc_preds} predicates")
        logging.info("Loading Nombank annotations")
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)
        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }
        self.stat_dir = params.stat_dir

    class NomElement:
        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")
            read_id = parts[0]
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'
            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (self.article_id, self.sent_num,
                                      self.pointer.__repr__())

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, self.pointer.__repr__()))

        def __eq__(self, other):
            return other and other.__str__() == self.__str__()

        __repr__ = __str__

    def load_gc_annotations(self):
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()
        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            all_pointers = []
            split_pointers = []
            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)
            if len(split_pointers) > 0:
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))
            return all_pointers

        total_implicit_count = 0
        total_preds = 0
        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)
            article_id = predicate.article_id
            total_preds += 1
            explicit_roles = set()
            arg_annos = defaultdict(list)
            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']
                (arg_article_id, arg_sent_id, arg_terminal_id,
                 arg_height) = arg_node_pos.split(':')
                is_split = False
                is_explicit = False
                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True
                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True
                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))
            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignoring annotations after the sentence.
                    continue
                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)
                        if not predicate.pointer == arg_element.pointer:
                            # Ignoring incorporated ones.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)
            gc_annotations[article_id.split('/')[-1]][predicate] = all_args
            total_implicit_count += len(implicit_role_here)
        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")
        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        pred_node_repr = "%s:%d:%s" % (doc.docid, predicate_node.sent_num,
                                       predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(p_tree,
                                                   predicate_node.pointer)
        predicate_span = utils.get_nltk_span(doc.token_spans,
                                             predicate_node.sent_num,
                                             p_word_idx)
        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found")
            return
        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')
        if p:
            p.add_meta('node', pred_node_repr)
        return p

    def add_nombank_arg(self, doc, parsed_sents, wsj_spans, arg_type,
                        predicate, arg_node, implicit=False):
        arg_type = arg_type.lower()
        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)
        arg_node_repr = "%s:%d:%s" % (doc.docid, arg_node.sent_num,
                                      arg_node.pointer)
        argument_span = utils.get_nltk_span(wsj_spans, arg_node.sent_num,
                                            a_word_idx)
        if len(argument_span) == 0:
            # Some arguments are empty nodes, they will be ignored.
            return
        em = doc.add_entity_mention(None, argument_span)
        if em:
            if implicit:
                arg_type = 'i_' + arg_type
            arg_mention = doc.add_argument_mention(predicate, em.aid,
                                                   arg_type)
            arg_mention.add_meta('node', arg_node_repr)
            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)
            return arg_mention

    def get_predicate_text(self, p):
        p_text = p.text.lower()
        if p_text == 'losses' or p_text == 'loss' or p_text == 'tax-loss':
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')
        if p_text == 'savings-and-loan':
            p_text = 'loan'
        if '-' in p_text:
            p_text = p_text.split('-')[1]
        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        logging.info("Adding Nombank annotation for " + doc.docid)
        nb_instances = self.nombank_annos[doc.docid]
        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(doc.docid,
                                                nb_instance.sentnum,
                                                nb_instance.predicate)
            p = self.add_predicate(doc, parsed_sents, predicate_node)
            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                              argloc)
                arg = self.add_nombank_arg(doc, parsed_sents, doc.token_spans,
                                           argid, p, arg_node)
                if arg_node.pointer == predicate_node.pointer:
                    arg.add_meta('incorporated', True)
        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)
                p = self.add_predicate(doc, parsed_sents, predicate_node)
                p_text = utils.normalize_pred_text(p.text)
                p.add_meta('from_gc', True)
                self.stats['target_pred_count'][p_text] += 1
                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(doc, parsed_sents,
                                                   doc.token_spans, arg_type,
                                                   p, arg_node, True)
                        added_args[arg_type].append(arg)
                        # The following should be useless already.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)
                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)
                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        text = ''
        w_start = 0
        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []
            for word, tag in tagged_sent:
                if not tag == '-NONE-':
                    text += word + ' '
                    word_spans.append((w_start, w_start + len(word)))
                    w_start += len(word) + 1
                else:
                    # Ignoring these words.
                    word_spans.append(None)
            text += '\n'
            w_start += 1
            spans.append(word_spans)
        doc.set_text(text)
        return spans

    def load_nombank(self):
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        for docid, instances in self.nombank_annos.items():
            if self.params.gc_only and docid not in self.gc_annos:
                continue
            doc = DEDocument(self.corpus)
            doc.set_id(docid)
            fileid = docid.split('_')[-1][:2] + '/' + docid
            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)
            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)
            self.add_all_annotations(doc, parsed_sents)
            yield doc

    def print_stats(self):
        logging.info("Corpus statistics from Nombank")
        keys = self.stats.keys()
        headline = 'predicate\t' + '\t'.join(keys)
        sums = Counter()
        if not os.path.exists(self.stat_dir):
            os.makedirs(self.stat_dir)
        preds = sorted(self.stats['predicates_with_implicit'].keys())
        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')
            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]
                print(line)
                out.write(f'{line}\n')
            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print ptb.fileids()
print len(ptb.sents())
print ptb.sents()
class PropBank(DataLoader):
    """Load PropBank data."""

    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')
        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')
            logging.info('Found {} treebank files.'.format(
                len(self.wsj_treebank.fileids())))
        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )
        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for inst in self.propbank.instances():
            docid = inst.fileid.split('/')[-1]
            self.propbank_annos[docid].append(inst)
        self.stats = {
            'predicate_count': 0,
            'argument_count': 0,
        }

    def add_all_annotations(self, doc):
        logging.info("Adding propbank annotations for " + doc.docid)
        instances = self.propbank_annos[doc.docid]
        for inst in instances:
            parsed_sents = doc.get_parsed_sents()
            tree = parsed_sents[inst.sentnum]
            p_word_idx = utils.make_words_from_pointer(tree, inst.predicate)
            pred_span = utils.get_nltk_span(doc.get_token_spans(),
                                            inst.sentnum, p_word_idx)
            pred_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum,
                                           inst.predicate)
            self.stats['predicate_count'] += 1
            for argloc, arg_slot in inst.arguments:
                a_word_idx = utils.make_words_from_pointer(tree, argloc)
                arg_span = utils.get_nltk_span(doc.get_token_spans(),
                                               inst.sentnum, a_word_idx)
                if len(arg_span) == 0:
                    continue
                self.stats['argument_count'] += 1
                p = doc.add_predicate(None, pred_span, frame_type='PROPBANK')
                arg_em = doc.add_entity_mention(None, arg_span)
                arg_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum, argloc)
                if p and arg_em:
                    p.add_meta('node', pred_node_repr)
                    arg_mention = doc.add_argument_mention(
                        p, arg_em.aid, arg_slot.lower())
                    arg_mention.add_meta('node', arg_node_repr)

    def print_stats(self):
        logging.info("Corpus statistics from Propbank")
        for key, value in self.stats.items():
            logging.info(f"{key} : {value}")
# Loading our own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/usr/share/dict'  # directory where the file lives
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# Now we have the names of all the files.
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader

corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

""" 2.2 """
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
nombank_root = join(corpus_root, 'nombank.1.0')
nombank_file = 'nombank.1.0_sorted'
nombank_nouns_file = 'nombank.1.0.words'
frame_file_pattern = 'frames/.*\.xml'


def fileid_xform_function(filename):
    result = re.sub(r'^wsj/', '', filename)
    # result = re.sub(r'^wsj/\d\d/', '', filename)
    # result = re.sub(r'\.mrg$', '', result)
    return result


treebank = BracketParseCorpusReader(root=treebank_root,
                                    fileids=treebank_file_pattern,
                                    tagset='wsj',
                                    encoding='ascii')

propbank = PropbankCorpusReader(root=FileSystemPathPointer(propbank_root),
                                propfile=propbank_file,
                                framefiles=frame_file_pattern,
                                verbsfile=propbank_verbs_file,
                                parse_fileid_xform=fileid_xform_function,
                                parse_corpus=treebank)

nombank = NombankCorpusReader(root=FileSystemPathPointer(nombank_root),
                              nomfile=nombank_file,
                              framefiles=frame_file_pattern,
                              nounsfile=nombank_nouns_file,
                              parse_fileid_xform=fileid_xform_function,
                              parse_corpus=treebank)
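A brief sketch of reading instances back from the readers configured above; the attribute names follow NLTK's PropbankInstance API, and the [0] index is only for illustration:

# Inspect the first PropBank instance and its aligned parse tree.
inst = propbank.instances()[0]
print(inst.fileid, inst.sentnum, inst.roleset)
print(inst.predicate, list(inst.arguments))
print(inst.tree)  # available because parse_corpus=treebank was supplied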
import nltk
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import scipy
from scipy import spatial
import matplotlib.pyplot as plt
import math
import re
import sys
import csv

corpus_root = r"all/"
file_pattern = r".*\.mrg"
sw = BracketParseCorpusReader(corpus_root, file_pattern)
trees = sw.parsed_sents()


def give(t):
    return t.label() == 'VP'


all_vp = []
for tree in trees:
    for vp in tree.subtrees(give):
        children = []
        pps = []
        pp = []
def print_corpus_metrics(corpus_dir='data'):
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    words = ptb.words()
    print 'Total number of words', len(words)
    print 'Total number of unique words', len(set(words))
    print 'Total number of documents', len(ptb.fileids())
print(sents[1:20])

# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'
file_pattern = r'wsj_.*.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency
#    distributions, one per "condition"; (condition, word) pairs count
#    word frequencies under each condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
""" # Pad left with None's so that the the first iteration is [None, ..., None, iterable[0]] if left_nulls: iterable = [None] * (size - 1) + iterable iters = tee(iterable, size) for i in range(1, size): for each in iters[i:]: next(each, None) return zip(*iters) corpus_root = "wsj" file_pattern = ".*/wsj_.*\.mrg" ptb = BracketParseCorpusReader(corpus_root, file_pattern) counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) for sent in ptb.sents(): for word1, word2, word3, word4, word5 in window(sent, 5): counts[-2][word3][word1] += 1 counts[-1][word3][word2] += 1 counts[1][word3][word4] += 1 counts[2][word3][word5] += 1 counts = dict(counts) for index, outer_dict in counts.items(): for word, inner_dict in outer_dict.items(): counts[index][word] = dict(inner_dict) counts[index] = dict(outer_dict)
def parse_trees(dir, fileid):
    # reader = BracketParseCorpusReader('/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/new_ctb', fileid)
    reader = BracketParseCorpusReader(dir, fileid)
    tree = reader.parsed_sents()
    return tree
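A possible way to exercise parse_trees(); the directory and file name below are placeholders, not paths from the original project:

# Hypothetical CTB location and file name.
trees = parse_trees('/path/to/new_ctb', 'chtb_0001.nw')
for t in trees[:3]:
    print(' '.join(t.leaves()))   # flat token sequence of each parsed sentence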
            replaceSymbolsInTree(tree[i], sent)


## Turns a _A_ symbol back to A
def revertPOS(symbol):
    return symbol[1:-1]


###### Main #########################################################################
if __name__ == '__main__':
    clArgs = createArgParser().parse_args()

    # Check if any arguments are given. If not, display help
    active = False
    if clArgs.penn != None and clArgs.grammar != None:
        active = True

        ## Set up the treebank reader
        ptb = BracketParseCorpusReader(path.dirname(clArgs.penn),
                                       [path.basename(clArgs.penn)])

        ## Collect all terminal and nonterminals
        for tree in ptb.parsed_sents(ptb.fileids()[0]):
            # Also set the start symbol to the root of the first tree
            if len(start_symbol) == 0:
                start_symbol = tree.node
            findSymbolsInTree(tree)

        ## Find ambiguous symbols and map them to a unique alternative
        for symbol in nonterminals.intersection(pos):
            replacement = "_" + symbol + "_"
            symbolMap[symbol] = replacement
            if replacement in pos or replacement in nonterminals:
                print "Cannot make nonterminal unambiguous: ", symbol
raw[1:20]
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # your own directory
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# 2.2 ====================
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    h.close()
    vocab = [i[0] for i in vocab]
    return vocab


if __name__ == '__main__':
    TRAIN_FILE = 'data/wsj_2-21'
    TEST_FILE = 'data/wsj_23'
    DEV_FILE = 'data/wsj_24'
    SECTIONS = [(2, 21), (23, 23), (24, 24)]
    MAXLEN = 50

    wsj = '/data/penn_tb_3.0/TREEBANK_3/PARSED/MRG/WSJ/'
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
import nltk
import random
from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        # tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
# print(grammar.productions())
# print(grammar._lhs_index)
# print(grammar.productions(lhs=grammar.start()))
# print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
# print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

used_symbols = []


def generate_symbols(symbol):
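One way to try out the induced PCFG is with NLTK's ViterbiParser; this is a sketch, and the token list below is a made-up placeholder that must consist of words covered by the treebank's lexicon:

# Hypothetical follow-up: parse a token list with the induced PCFG.
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
tokens = ['mogę', '.']  # placeholder tokens; replace with words from the treebank
for parse in parser.parse(tokens):
    print(parse)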
tags.append("EOS") while(sentence[i] != "" or len(sentence) <= 3 ): tags.append(get_next_tag(pos_dist, tags[i])) sentence.append(get_next_word(t2w_dist, tags[i+1])) i += 1 return (sentence, tags) # In[ ]: # Import and parse the corpus corpus_root = './corpus_clean/' corpus = BracketParseCorpusReader(corpus_root, ".*") tagged_sentences = corpus.tagged_sents() ngram_input = [] pos_input = [] legal_tags = ["EOS","$","#", "GW", "CC", "CD", "DT", "EX", "FW", "IN", "JJ","JJR","JJS","LS","MD", "NN","NNS","NNP",'NNPS','PDT','POS','PRP','PRP$','RB','RBR','RBS','RP','TO', "UH",'VB', 'VBD',"VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB", "\"", "\'", ",", ".", "AFX"] single_letter_words = ["a", "i", ",", ".", "!", "?", "\'", "\"", ":", ';', '0', '1', '2', "3", '4', '5', "6", '7', '8', "9", "=", "&", "#", '/', '>', "$", '<', '+', '%',] # tags_removed = ["-NONE-","SYM", "CODE", "ADD", "HYPH","-LSB-", "-RSB-",":", "NFP", "XX", "-LRB-", "-RRB-"] # Remove -NONE- and SYM tags from the training data and create a list of tokens and a list of tags. for sentence in tagged_sentences:
def read_brackets(constitfile):
    sys.stderr.write("\nReading constituents from " + constitfile + " ...\n")
    reader = BracketParseCorpusReader(PARSER_DATA_DIR + "rnng/", constitfile)
    parses = reader.parsed_sents()
    return parses
# Chinese text is character-based, so the word-level reader words() cannot be used.
# chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])

# Chinese text is character-based, so the sentence-level reader sents() cannot be used.
# chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This directory sits under the C: root; place a few files in the subdirectory.
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
print("File list of our own corpus = ", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
show_subtitle("File list")
print(ptb.fileids()[:13])
show_subtitle("Sentence list")
print(ptb.sents()[:3])
show_subtitle("Sentences from a specific file")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
        usage()

    preprocess = True
    if preprocess == False:
        inpath = sys.argv[1]
        instring = open(inpath).read()
        inargs = instring.split('\t')
        ptb_path = inargs[0]
        stringAddresses = inargs[1]
        argAddresses = getGalFromString(stringAddresses)
        if argAddresses == None:
            print 'no address provided'
            sys.exit(1)
        docSents = BracketParseCorpusReader(
            os.path.dirname(ptb_path),
            os.path.basename(ptb_path)).parsed_sents()
        (prods, head, processedArgTree) = wellner_head_extraction(docSents, argAddresses)
        if prods == None:
            sys.exit(1)
        give_output(processedArgTree,
                    os.path.splitext(os.path.basename(ptb_path))[0],
                    prods, head)
        outfile = open(sys.argv[2], 'w')
        outfile.write(head + '\n')
        outfile.close()
    else:
        reqpath = sys.argv[2]
        outpath = sys.argv[3]
        outfile = open(outpath, 'w')
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own Corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'], or a pattern
# that matches all fileids, like '[abc]/.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional Frequency Distributions:
# a collection of frequency distributions, each one for a different "condition".
# The condition will often be the category of the text.
# A frequency distribution counts observable events,
# such as the appearance of words in a text.
# A conditional frequency distribution needs to pair each event with a condition.
# So instead of processing a sequence of words,
# we have to process a sequence of pairs:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', """..."""]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), '''...''']