Example #1
    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')

        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')

            logging.info('Found {} treebank files.'.format(
                len(self.wsj_treebank.fileids())))

        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )

        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for inst in self.propbank.instances():
            docid = inst.fileid.split('/')[-1]
            self.propbank_annos[docid].append(inst)

        self.stats = {
            'predicate_count': 0,
            'argument_count': 0,
        }
Example #2
def read_ptb():
    sys.stderr.write("\nReading PTB data from " + PTB_DATA_DIR + " ...\n")
    sentences = []
    senno = 0
    with codecs.open("ptb.sents", "w", "utf-8") as ptbsf:
        for constitfile in os.listdir(PTB_DATA_DIR):
            reader = BracketParseCorpusReader(PTB_DATA_DIR, constitfile)
            parses = reader.parsed_sents()
            # TODO: map from parses to sentences
            for p in parses:
                ptbsf.write(" ".join(p.leaves()) + "\n")
                tokpos = p.pos()
                tokens = [VOCDICT.addstr(tok) for tok, pos in tokpos]
                postags = [POSDICT.addstr(pos) for tok, pos in tokpos]
                s = Sentence(
                    "constit",
                    sentnum=senno,
                    tokens=tokens,
                    postags=postags,
                )
                s.get_all_parts_of_ctree(p, CLABELDICT, False)
                sentences.append(s)
                senno += 1
        sys.stderr.write("# PTB sentences: %d\n" % len(sentences))
        ptbsf.close()
    return sentences
Example #3
    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)

        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''
Example #4
    def open_flod(self, root_path, file_type):
        ptb = BracketParseCorpusReader(root_path, file_type)
        files_list = ptb.fileids()
        files_path = []
        for f in files_list:
            files_path.append(os.path.join(root_path, f))
        return (files_path, files_list)
Example #5
def tree_reader():
    d = {}
    trees = BracketParseCorpusReader("parsed_sentences/", ".*")
    for name in trees.fileids():
        d_name = re.sub(r"\.tree", "", name)
        d[d_name] = list(trees.parsed_sents(name))

    return d
Example #6
def tree_reader():
    d = {}
    trees = BracketParseCorpusReader("parsed_sentences/", ".*")
    for name in trees.fileids():
        d_name = re.sub(r"\.tree", "", name)
        d[d_name] = list(trees.parsed_sents(name))

    return d
Example #7
def get_tagger():
    dirname = os.path.dirname(__file__)
    corpus_root = os.path.join(dirname, 'training_data')
    testcaselists = BracketParseCorpusReader(corpus_root, [
        'click.txt', 'enter_text.txt', 'browser.txt', 'load_url.txt',
        'keyboard_actions.txt'
    ])
    tagger = ConsecutivePosTagger(testcaselists.tagged_sents())
    return tagger
Example #8
def extracting_cfg(
        corpus_root,
        file_pattern):  # returns a CFG with at most 2 non-terminals on the right-hand side
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cfg_dict = {}
    unite_productions = {}
    lexicon = {}
    for file in ptb.fileids():
        #file = ptb.fileids()[0]
        print(file)
        for sentence in ptb.parsed_sents(file):  # iterating through sentences
            #sentence =ptb.parsed_sents(file)[some_i]
            if len(sentence.leaves()) <= 8:
                #print(sentence.leaves())
                for subtree in sentence.subtrees():  # extracting subtree
                    left_side = subtree.label()
                    right_side = []
                    for children in subtree:
                        if isinstance(children, str):  # reached leaf node
                            right_side.append(children)
                            if left_side in lexicon:
                                lexicon[left_side].add(children)
                            else:
                                lexicon[left_side] = set()
                                lexicon[left_side].add(children)
                        else:  # not a leaf node yet
                            right_side.append(children.label())
                    while len(right_side) > 2:  # keep only 2 non-terminals on the right side
                        new_head = '_'.join(right_side[1:])  # generate the new left side of the rule
                        new_right_side = right_side[:1] + [new_head]  # generate the new right side of the rule
                        tup = tuple(new_right_side)
                        if left_side not in cfg_dict:  # new key
                            cfg_dict[left_side] = set()
                            cfg_dict[left_side].add(tup)
                        else:
                            cfg_dict[left_side].add(tup)
                        left_side = new_head
                        right_side = right_side[1:]
                    if len(right_side) == 1:  # unit production
                        if left_side in unite_productions:
                            unite_productions[left_side].add(tuple(right_side))
                        else:
                            unite_productions[left_side] = set()
                            unite_productions[left_side].add(tuple(right_side))
                    if left_side in cfg_dict:  # adding rule to the dict
                        cfg_dict[left_side].add(tuple(right_side))
                    else:
                        cfg_dict[left_side] = set()
                        cfg_dict[left_side].add(tuple(right_side))
    return cfg_dict, lexicon, unite_productions
Example #9
def extracting_cnf(corpus_root, file_pattern):
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cnf_dict = {}
    cnf_dict['lexicon'] = set()
    #for file in ptb.fileids():
    #for file in ptb.fileids():
    file = ptb.fileids()[0]
    print(file)
    for s in range(1, len(ptb.parsed_sents(file))):
        tree = ptb.parsed_sents(file)[s]
        for sub in tree.subtrees():
            return_rule(sub, cnf_dict, file)
    return cnf_dict
Example #10
def seg_pos_ctb(ctb_dir, fileids):
    reader = BracketParseCorpusReader(ctb_dir, fileids)
    # Generate (word, POS) tuples
    # tree=reader.tagged_sents()
    # Build each sentence's tree structure; some data, e.g. one sentence in 40.nw, cannot be parsed correctly
    tree = reader.parsed_sents()
    print('tree len: {}'.format(len(tree)))

    seg_pos_sentences = []
    broken_parses = []
    for s in tree:
        s = s.pos()

        if s and s != [] and type(s[0]) == tuple:
            s = [j if j[1] != '-NONE-' else (' NONE ', 'NONE') for j in s]
            seg_pos_sentences.append(s)
        else:
            broken_parses.append(s)

    return seg_pos_sentences, broken_parses
Example #11
    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus, with_doc)

        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset="wsj",
            encoding="ascii",
        )

        logging.info("Found {} treebank files.".format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank,
        )

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            [len(preds) for (d, preds) in self.gc_annos.items()])
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split("/")[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            "target_pred_count": Counter(),
            "predicates_with_implicit": Counter(),
            "implicit_slots": Counter(),
        }

        self.stat_dir = params.stat_dir
Example #12
def read_wsj(article_count):
    wsj_root = '/Users/chbrown/Dropbox/ut/nlp/data/penn-treebank3/parsed/mrg/wsj'
    articles = []
    for section in range(25):
        for article_path in os.listdir('%s/%02d' % (wsj_root, section)):
            reader = BracketParseCorpusReader(wsj_root, '%02d/%s' % (section, article_path))

            sentences = []
            for tagged_sent in reader.tagged_sents():
                # token_postag_pairs = sentence
                token_postag_pairs = [
                    (token.lower(), pos_tag)
                    for token, pos_tag in tagged_sent
                    if pos_tag not in ('-LRB-', '-RRB-', '-NONE-')]
                sentence = DefinitenessDocument.from_token_postag_pairs(token_postag_pairs)
                sentences.append(sentence)

            articles.append(sentences)

            if len(articles) >= article_count:
                return articles
    return articles
Example #13
File: LDA.py  Project: pbamotra/cgrnnlm
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]

        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()

        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')

        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)

        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')

        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')

        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)

    return lda, dictionary
Example #14
def loadCorpora():

    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg" 
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
Example #15
def read_wsj(article_count):
    wsj_root = '/Users/chbrown/Dropbox/ut/nlp/data/penn-treebank3/parsed/mrg/wsj'
    articles = []
    for section in range(25):
        for article_path in os.listdir('%s/%02d' % (wsj_root, section)):
            reader = BracketParseCorpusReader(
                wsj_root, '%02d/%s' % (section, article_path))

            sentences = []
            for tagged_sent in reader.tagged_sents():
                # token_postag_pairs = sentence
                token_postag_pairs = [
                    (token.lower(), pos_tag) for token, pos_tag in tagged_sent
                    if pos_tag not in ('-LRB-', '-RRB-', '-NONE-')
                ]
                sentence = DefinitenessDocument.from_token_postag_pairs(
                    token_postag_pairs)
                sentences.append(sentence)

            articles.append(sentences)

            if len(articles) >= article_count:
                return articles
    return articles
Example #16
    def __init__(self, config_path):
        conf = load_file_config(config_path)
        logging.info(json.dumps(conf, indent=2))

        params = GCDataSet.GCConfig(config=conf)
        super().__init__(params)

        wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=wsj_treebank)
Example #17
def get_sents_by_field_ids(field_ids):
    if not isinstance(field_ids, list):
        field_ids = [field_ids]
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    return ptb.sents(fileids=field_ids)
Example #18
def load_reader_and_filedids(lang,data_type):
    assert data_type in ('train','val','test','wsj10')
    def filter_trees(tree, data_type):
        def _is_control(char):
            """Checks whether `chars` is a control character."""
            # These are technically control characters but we count them as whitespace
            # characters.
            if char == "\t" or char == "\n" or char == "\r":
                return False
            cat = unicodedata.category(char)
            if cat.startswith("C"):
                return True
            return False
        
        sent=tree.leaves()
        if data_type=='wsj10' and len(sent)>10: return False
        if data_type!='wsj10' and len(sent)>128: return False
        try:
            for c in ' '.join(sent):
                cp=ord(c)
                if cp == 0 or cp == 0xfffd or _is_control(c):
                    return False
            return True
        except:
            return False

    def filt_id(fileids,lang):
        assert lang in ('en','fr','zh')
        train_file_ids,valid_file_ids,test_file_ids=[],[],[]
        for id in fileids:
            prefix=id.split('.')[0]
            if lang=='en':
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(id)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)        
            elif lang=='zh':
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(id)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <='1147':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)        
            else:
                if prefix in ('flmf3_12500_12999co','flmf7ab2ep','flmf7ad1co','flmf7ae1ep'):
                    valid_file_ids.append(id) 
                elif prefix in ('flmf3_12000_12499ep','flmf7aa1ep','flmf7aa2ep','flmf7ab1co'):
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
        return train_file_ids,valid_file_ids,test_file_ids

    assert lang in ('en','zh','fr','il','jp','sp','ca','sw','de')
    lang_dir=treebank_dir+'/'+lang
    reader=BracketParseCorpusReader(lang_dir, '.*')
    fileids=reader.fileids()
    if data_type=='wsj10':
        return [t for t in reader.parsed_sents(fileids) if filter_trees(t,data_type)]
    train_file_ids = []
    valid_file_ids = []
    test_file_ids = []
    if lang in ('en','zh','fr'):
        train_file_ids,valid_file_ids,test_file_ids=filt_id(fileids,lang)
        train_trees=reader.parsed_sents(train_file_ids)
        val_trees=reader.parsed_sents(valid_file_ids)
        test_trees=reader.parsed_sents(test_file_ids)
    else:
        for fid in fileids:
            if 'train' in fid:
                train_trees=reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees=reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees=reader.parsed_sents(fid)
    if data_type=='train':
        train_trees=[t for t in train_trees if filter_trees(t,data_type)]
        print(f'train:{len(train_trees)}')
        return train_trees
    elif data_type=='val':
        val_trees=[t for t in val_trees if filter_trees(t,data_type)]
        print(f'val:{len(val_trees)}')
        return val_trees
    else:
        test_trees=[t for t in test_trees if filter_trees(t,data_type)]
        print(f'test:{len(test_trees)}')
        return test_trees     
Example #19
def retrieve_data():
    train_data = BracketParseCorpusReader("data", "02-21.10way.clean")
    val_data = BracketParseCorpusReader("data", "22.auto.clean")
    test_data = BracketParseCorpusReader("data", "23.auto.clean")

    train_words = [x.lower() for x in train_data.words()]
    val_words = [x.lower() for x in val_data.words()]
    test_words = [x.lower() for x in test_data.words()]

    all_words = train_words + val_words + test_words

    word_counter = Counter(all_words)

    vocab = ['PAD', 'SOS', 'EOS'] + list(word_counter.keys())
    vocab_size = len(vocab)

    word2idx = {ch: i for i, ch in enumerate(vocab)}
    idx2word = {i: ch for i, ch in enumerate(vocab)}

    train_sents = [[w.lower() for w in sent] for sent in train_data.sents()]
    val_sents = [[w.lower() for w in sent] for sent in val_data.sents()]
    test_sents = [[w.lower() for w in sent] for sent in test_data.sents()]

    train_dataset = TextData(train_sents, word2idx, idx2word, vocab_size)
    val_dataset = TextData(val_sents, word2idx, idx2word, vocab_size)
    test_dataset = TextData(test_sents, word2idx, idx2word, vocab_size)

    return train_dataset, val_dataset, test_dataset
Example #20
class PTBReader(object):
    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)

        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False

        self.read_ptb_file(parent)
        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)
        if child_subtree_pos[:len(parent_subtree_pos)] == parent_subtree_pos:
            return True
        else:
            return False

    def parse_node(self, node):
        if node.__class__ == SplitNode:
            # parse each node in the split node
            for n in node.node_list:
                self.parse_node(n)

            # combine the ptb_surface of each node
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])

        else:
            self.read_ptb_file(node)

            node.subtree_pos = self.get_subtree_pos(node)

            parsed_sent = self.all_parsed_sents[node.sent_id]
            node.ptb_idx_list = []
            for idx in range(len(parsed_sent.leaves())):
                if parsed_sent.leaf_treeposition(idx)[:len(node.subtree_pos)] \
                        == node.subtree_pos:
                    node.ptb_idx_list.append(idx)

            assert node.ptb_idx_list == \
                list(range(node.ptb_idx_list[0], node.ptb_idx_list[-1] + 1)), \
                'Error in matching indices for subtree leaves: {0}'.format(node)

            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0]
                for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
Example #21
class NomBank(DataLoader):
    """Loading Nombank data and implicit argument annotations."""
    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus, with_doc)

        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            [len(preds) for (d, preds) in self.gc_annos.items()])
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }

        self.stat_dir = params.stat_dir

    class NomElement:
        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")

            read_id = parts[0]
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'

            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (self.article_id, self.sent_num,
                                      self.pointer.__repr__())

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, self.pointer.__repr__()))

        def __eq__(self, other):
            return other and other.__str__() == self.__str__()

        __repr__ = __str__

    def load_gc_annotations(self):
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()

        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            all_pointers = []
            split_pointers = []

            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)

            if len(split_pointers) > 0:
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))

            return all_pointers

        total_implicit_count = 0
        total_preds = 0

        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)

            article_id = predicate.article_id

            total_preds += 1

            explicit_roles = set()

            arg_annos = defaultdict(list)

            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']

                (arg_article_id, arg_sent_id, arg_terminal_id,
                 arg_height) = arg_node_pos.split(':')

                is_split = False
                is_explicit = False

                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True

                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True

                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))

            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignoring annotations after the sentence.
                    continue

                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)

                        if not predicate.pointer == arg_element.pointer:
                            # Ignoring incorporated ones.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)

            gc_annotations[article_id.split('/')[-1]][predicate] = all_args

            total_implicit_count += len(implicit_role_here)

        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")

        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        pred_node_repr = "%s:%d:%s" % (doc.docid, predicate_node.sent_num,
                                       predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(p_tree,
                                                   predicate_node.pointer)
        predicate_span = utils.get_nltk_span(doc.token_spans,
                                             predicate_node.sent_num,
                                             p_word_idx)

        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found")
            return

        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')

        if p:
            p.add_meta('node', pred_node_repr)

        return p

    def add_nombank_arg(self,
                        doc,
                        parsed_sents,
                        wsj_spans,
                        arg_type,
                        predicate,
                        arg_node,
                        implicit=False):
        arg_type = arg_type.lower()

        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)

        arg_node_repr = "%s:%d:%s" % (doc.docid, arg_node.sent_num,
                                      arg_node.pointer)
        argument_span = utils.get_nltk_span(wsj_spans, arg_node.sent_num,
                                            a_word_idx)

        if len(argument_span) == 0:
            # Some arguments are empty nodes, they will be ignored.
            return

        em = doc.add_entity_mention(None, argument_span)

        if em:
            if implicit:
                arg_type = 'i_' + arg_type

            arg_mention = doc.add_argument_mention(predicate, em.aid, arg_type)
            arg_mention.add_meta('node', arg_node_repr)

            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)

            return arg_mention

    def get_predicate_text(self, p):
        p_text = p.text.lower()
        if p_text == 'losses' or p_text == 'loss' or p_text == 'tax-loss':
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')

        if p_text == 'savings-and-loan':
            p_text = 'loan'

        if '-' in p_text:
            p_text = p_text.split('-')[1]
        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        logging.info("Adding Nombank annotation for " + doc.docid)
        nb_instances = self.nombank_annos[doc.docid]

        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                                nb_instance.predicate)

            p = self.add_predicate(doc, parsed_sents, predicate_node)

            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                              argloc)
                arg = self.add_nombank_arg(doc, parsed_sents, doc.token_spans,
                                           argid, p, arg_node)

                if arg_node.pointer == predicate_node.pointer:
                    arg.add_meta('incorporated', True)

        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)

                p = self.add_predicate(doc, parsed_sents, predicate_node)
                p_text = utils.normalize_pred_text(p.text)

                p.add_meta('from_gc', True)

                self.stats['target_pred_count'][p_text] += 1

                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(doc, parsed_sents,
                                                   doc.token_spans, arg_type,
                                                   p, arg_node, True)
                        added_args[arg_type].append(arg)

                        # The following should be useless already.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)

                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)

                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        text = ''
        w_start = 0

        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []

            for word, tag in tagged_sent:
                if not tag == '-NONE-':
                    text += word + ' '
                    word_spans.append((w_start, w_start + len(word)))
                    w_start += len(word) + 1
                else:
                    # Ignoring these words.
                    word_spans.append(None)

            text += '\n'
            w_start += 1

            spans.append(word_spans)

        doc.set_text(text)

        return spans

    def load_nombank(self):
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        for docid, instances in self.nombank_annos.items():
            if self.params.gc_only and docid not in self.gc_annos:
                continue

            doc = DEDocument(self.corpus)
            doc.set_id(docid)

            fileid = docid.split('_')[-1][:2] + '/' + docid

            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)

            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)

            self.add_all_annotations(doc, parsed_sents)

            yield doc

    def print_stats(self):
        logging.info("Corpus statistics from Nombank")

        keys = self.stats.keys()
        headline = 'predicate\t' + '\t'.join(keys)
        sums = Counter()

        if not os.path.exists(self.stat_dir):
            os.makedirs(self.stat_dir)

        preds = sorted(self.stats['predicates_with_implicit'].keys())

        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')

            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]
                print(line)
                out.write(f'{line}\n')

            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
Example #22
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"

ptb = BracketParseCorpusReader(corpus_root, file_pattern)

print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())
Example #23
class PropBank(DataLoader):
    """Load PropBank data."""
    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')

        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')

            logging.info('Found {} treebank files.'.format(
                len(self.wsj_treebank.fileids())))

        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )

        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for inst in self.propbank.instances():
            docid = inst.fileid.split('/')[-1]
            self.propbank_annos[docid].append(inst)

        self.stats = {
            'predicate_count': 0,
            'argument_count': 0,
        }

    def add_all_annotations(self, doc):
        logging.info("Adding propbank annotations for " + doc.docid)

        instances = self.propbank_annos[doc.docid]

        for inst in instances:
            parsed_sents = doc.get_parsed_sents()

            tree = parsed_sents[inst.sentnum]

            p_word_idx = utils.make_words_from_pointer(tree, inst.predicate)
            pred_span = utils.get_nltk_span(doc.get_token_spans(),
                                            inst.sentnum, p_word_idx)

            pred_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum,
                                           inst.predicate)

            self.stats['predicate_count'] += 1

            for argloc, arg_slot in inst.arguments:
                a_word_idx = utils.make_words_from_pointer(tree, argloc)
                arg_span = utils.get_nltk_span(doc.get_token_spans(),
                                               inst.sentnum, a_word_idx)

                if len(arg_span) == 0:
                    continue

                self.stats['argument_count'] += 1

                p = doc.add_predicate(None, pred_span, frame_type='PROPBANK')
                arg_em = doc.add_entity_mention(None, arg_span)
                arg_node_repr = "%s:%d:%s" % (doc.docid, inst.sentnum, argloc)

                if p and arg_em:
                    p.add_meta('node', pred_node_repr)

                    arg_mention = doc.add_argument_mention(
                        p, arg_em.aid, arg_slot.lower())
                    arg_mention.add_meta('node', arg_node_repr)

    def print_stats(self):
        logging.info("Corpus statistics from Propbank")

        for key, value in self.stats.items():
            logging.info(f"{key} : {value}")
Example #24
# load our own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'  # directory where the file lives
wordlists = PlaintextCorpusReader(corpus_root, '.*')

# now we have the names of all the files.
wordlists.fileids()
wordlists.words('connectives')
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"



ptb = BracketParseCorpusReader(corpus_root, file_pattern)




"""
	2.2
"""
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
	(genre, word)
	for genre in brown.categories()
	for word in brown.words(categories=genre))


Example #25
nombank_root = join(corpus_root, 'nombank.1.0')
nombank_file = 'nombank.1.0_sorted'
nombank_nouns_file = 'nombank.1.0.words'

frame_file_pattern = r'frames/.*\.xml'


def fileid_xform_function(filename):
    result = re.sub(r'^wsj/', '', filename)
    # result = re.sub(r'^wsj/\d\d/', '', filename)
    # result = re.sub(r'\.mrg$', '', result)
    return result


treebank = BracketParseCorpusReader(root=treebank_root,
                                    fileids=treebank_file_pattern,
                                    tagset='wsj',
                                    encoding='ascii')

propbank = PropbankCorpusReader(root=FileSystemPathPointer(propbank_root),
                                propfile=propbank_file,
                                framefiles=frame_file_pattern,
                                verbsfile=propbank_verbs_file,
                                parse_fileid_xform=fileid_xform_function,
                                parse_corpus=treebank)

nombank = NombankCorpusReader(root=FileSystemPathPointer(nombank_root),
                              nomfile=nombank_file,
                              framefiles=frame_file_pattern,
                              nounsfile=nombank_nouns_file,
                              parse_fileid_xform=fileid_xform_function,
                              parse_corpus=treebank)
Example #26
import nltk
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import scipy
from scipy import spatial
import matplotlib.pyplot as plt
import math
import re
import sys
import csv

corpus_root = r"all/"
file_pattern = r".*\.mrg"

sw = BracketParseCorpusReader(corpus_root, file_pattern)

trees = sw.parsed_sents()


def give(t):
    return t.label() == 'VP'


all_vp = []

for tree in trees:
    for vp in tree.subtrees(give):
        children = []
        pps = []
        pp = []
Example #27
def print_corpus_metrics(corpus_dir='data'):
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    words = ptb.words()
    print('Total number of words', len(words))
    print('Total number of unique words', len(set(words)))
    print('Total number of documents', len(ptb.fileids()))
Example #28
print(sents[1:20])

# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'
file_pattern = r'wsj_.*.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency distributions, each with a different "condition"; (condition, word) pairs count the frequency of each word under its condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
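A quick illustration of how the conditional frequency distribution built above can be queried (a minimal sketch; the genre and sample words chosen here are only illustrative):

import nltk
from nltk.corpus import brown

# Same (genre, word) conditional frequency distribution as above.
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))

# Each condition (genre) maps to a FreqDist over the words seen in that genre.
print(cfd.conditions())            # e.g. ['adventure', 'belles_lettres', ...]
print(cfd['news'].most_common(5))  # most frequent words in the 'news' genre

# Tabulate counts of a few hand-picked modal verbs across two genres.
cfd.tabulate(conditions=['news', 'romance'],
             samples=['can', 'could', 'may', 'might', 'must', 'will'])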
    """

    # Pad left with None's so that the first iteration is [None, ..., None, iterable[0]]
    if left_nulls:
        iterable = [None] * (size - 1) + iterable

    iters = tee(iterable, size)
    for i in range(1, size):
        for each in iters[i:]:
            next(each, None)
    return zip(*iters)


corpus_root = "wsj"
file_pattern = ".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for sent in ptb.sents():
    for word1, word2, word3, word4, word5 in window(sent, 5):
        counts[-2][word3][word1] += 1
        counts[-1][word3][word2] += 1
        counts[1][word3][word4] += 1
        counts[2][word3][word5] += 1
counts = dict(counts)

for index, outer_dict in counts.items():
    for word, inner_dict in outer_dict.items():
        counts[index][word] = dict(inner_dict)
    counts[index] = dict(outer_dict)
Example #30
def parse_trees(dir, fileid):
    # reader = BracketParseCorpusReader('/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/new_ctb', fileid)
    reader = BracketParseCorpusReader(dir, fileid)
    tree = reader.parsed_sents()
    return tree
Example #31
            replaceSymbolsInTree(tree[i], sent)

## Turns a _A_ symbol back to A
def revertPOS(symbol):
    return symbol[1:-1]

###### Main #########################################################################
if __name__ == '__main__':
    clArgs = createArgParser().parse_args()
    #Check if any arguments are given. If not, display help
    active = False

    if clArgs.penn != None and clArgs.grammar != None:
        active = True
        ## Set up the treebank reader
        ptb = BracketParseCorpusReader(path.dirname(clArgs.penn), [path.basename(clArgs.penn)])

        ## Collect all terminal and nonterminals
        for tree in ptb.parsed_sents(ptb.fileids()[0]):
            # Also set the start symbol to the root of the first tree
            if len(start_symbol) == 0:
                start_symbol = tree.node
            findSymbolsInTree(tree)


        ## Find ambiguous symbols and map them to a unique alternative
        for symbol in nonterminals.intersection(pos):
            replacement = "_" + symbol + "_"
            symbolMap[symbol] = replacement
            if replacement in pos or replacement in nonterminals:
                print "Cannot make nonterminal unambiguous: ", symbol
Example #32
File: NLP.py  Project: Toma-L/NLP
raw[1:20]
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # your own corpus directory
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids = '20/wsj_2013.mrg')[19]


#2.2====================

text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
Example #33
    h.close()

    vocab = [i[0] for i in vocab]
    return vocab


if __name__ == '__main__':
    TRAIN_FILE = 'data/wsj_2-21'
    TEST_FILE = 'data/wsj_23'
    DEV_FILE = 'data/wsj_24'
    SECTIONS = [(2, 21), (23, 23), (24, 24)]
    MAXLEN = 50

    wsj = '/data/penn_tb_3.0/TREEBANK_3/PARSED/MRG/WSJ/'
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
Example #34
import nltk
import random

from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        #tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
#print(grammar.productions())
#print(grammar._lhs_index)
#print(grammar.productions(lhs=grammar.start()))

#print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
#print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

used_symbols = []


def generate_symbols(symbol):
    tags.append("EOS")

    while(sentence[i] != "" or len(sentence) <= 3 ):
        tags.append(get_next_tag(pos_dist, tags[i]))
        sentence.append(get_next_word(t2w_dist, tags[i+1]))
        i += 1

    return (sentence, tags)


# In[ ]:

# Import and parse the corpus

corpus_root = './corpus_clean/'
corpus = BracketParseCorpusReader(corpus_root, ".*")

tagged_sentences = corpus.tagged_sents()
ngram_input = []
pos_input = []
legal_tags = ["EOS","$","#", "GW", "CC", "CD", "DT", "EX", "FW", "IN", "JJ","JJR","JJS","LS","MD",
             "NN","NNS","NNP",'NNPS','PDT','POS','PRP','PRP$','RB','RBR','RBS','RP','TO', "UH",'VB',
             'VBD',"VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB", "\"", "\'", ",", ".", "AFX"]

single_letter_words = ["a", "i", ",", ".", "!", "?", "\'", "\"", ":", ';', '0', '1', '2', "3", '4',
                       '5', "6", '7', '8', "9", "=", "&", "#", '/', '>', "$", '<', '+', '%',]

# tags_removed = ["-NONE-","SYM", "CODE", "ADD", "HYPH","-LSB-", "-RSB-",":", "NFP", "XX", "-LRB-", "-RRB-"]

#  Remove -NONE- and  SYM tags from the training data and create a list of tokens and a list of tags.
for sentence in tagged_sentences:
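The loop body is cut off here; a minimal sketch of the filtering step described in the comment above (everything inside the loop is an assumption about the missing code, only tagged_sentences, ngram_input, and pos_input come from the snippet):

# Sketch: drop (word, tag) pairs tagged -NONE- or SYM and collect parallel
# token/tag lists per sentence (assumed reconstruction, not the original body).
for sentence in tagged_sentences:
    words, tags = [], []
    for word, tag in sentence:
        if tag in ('-NONE-', 'SYM'):
            continue
        words.append(word)
        tags.append(tag)
    if words:
        ngram_input.append(words)
        pos_input.append(tags)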
Example #36
def read_brackets(constitfile):
    sys.stderr.write("\nReading constituents from " + constitfile + " ...\n")
    reader = BracketParseCorpusReader(PARSER_DATA_DIR + "rnng/", constitfile)
    parses = reader.parsed_sents()
    return parses
Example #37
# Chinese is character-based, so the word-level reader words() cannot be used
# chinese_mandarin_words=udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])

# Chinese is character-based, so the sentence-level reader sents() cannot be used
# chinese_mandarin_sents=udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This sits under the C drive root; the subdirectory needs to contain some files
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
print("File list of my own corpus = ", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

show_subtitle("文件列表")
print(ptb.fileids()[:13])

show_subtitle("句子列表")
print(ptb.sents()[:3])

show_subtitle("指定文件中的句子")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
            usage()
        preprocess = True

if preprocess == False:
    inpath = sys.argv[1]
    instring = open(inpath).read()
    inargs = instring.split('\t')

    ptb_path = inargs[0]
    stringAddresses = inargs[1]
    argAddresses = getGalFromString(stringAddresses)
    if argAddresses == None:
        print('no address provided')
        sys.exit(1)

    docSents = BracketParseCorpusReader(
        os.path.dirname(ptb_path), os.path.basename(ptb_path)).parsed_sents()
    (prods, head,
     processedArgTree) = wellner_head_extraction(docSents, argAddresses)
    if prods == None:
        sys.exit(1)
    give_output(processedArgTree,
                os.path.splitext(os.path.basename(ptb_path))[0], prods, head)

    outfile = open(sys.argv[2], 'w')
    outfile.write(head + '\n')
    outfile.close()

else:
    reqpath = sys.argv[2]
    outpath = sys.argv[3]
    outfile = open(outpath, 'w')
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

#Loading your own Corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'], or a pattern that matches all fileids, like '[abc]/.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')
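To make the comment above concrete, the fileids argument can also be an explicit list instead of a pattern (a minimal sketch; the 'words' file name is only assumed to exist under /usr/share/dict):

from nltk.corpus import PlaintextCorpusReader

corpus_root = '/usr/share/dict'
# An explicit list of fileids instead of the '.*' pattern used above.
wordlists = PlaintextCorpusReader(corpus_root, ['connectives', 'words'])
print(wordlists.fileids())                  # ['connectives', 'words']
print(wordlists.words('connectives')[:10])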

from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional Frequency Distributions: 
# is a collection of frequency distributions, each one for a different "condition".
# The condition will often be the category of the text. 

# A frequency distribution counts observable events,
# such as the appearance of words in a text.
# A conditional frequency distribution needs to pair each event with a condition.
# So instead of processing a sequence of words,
# we have to process a sequence of pairs:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', """..."""]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), '''...''']
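A minimal sketch of feeding such (condition, word) pairs into a conditional frequency distribution (the tiny hand-written pair list here is only illustrative):

import nltk

# A conditional frequency distribution consumes (condition, event) pairs,
# so genre-tagged word pairs like the ones above can be counted directly.
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'),
         ('romance', 'They'), ('romance', 'neither')]
cfd = nltk.ConditionalFreqDist(pairs)

print(cfd.conditions())    # ['news', 'romance']
print(cfd['news']['The'])  # 1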