Example #1
    def tokenize(self, file_ids):
        def tree2list(tree):
            if isinstance(tree, nltk.Tree):
                if tree.label() in word_tags:
                    return tree.leaves()[0]
                else:
                    root = []
                    for child in tree:
                        c = tree2list(child)
                        if c != []:
                            root.append(c)
                    if len(root) > 1:
                        return root
                    elif len(root) == 1:
                        return root[0]
            return []

        sens_idx = []
        sens = []
        trees = []
        for id in file_ids:
            sentences = ptb.parsed_sents(id)
            for sen_tree in sentences:
                words = self.filter_words(sen_tree)
                words = ['<s>'] + words + ['</s>']
                # if len(words) > 50:
                #     continue
                sens.append(words)
                idx = []
                for word in words:
                    idx.append(self.dictionary[word])
                sens_idx.append(torch.LongTensor(idx))
                trees.append(tree2list(sen_tree))

        return sens_idx, sens, trees
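
A rough illustration (not part of the snippet above) of what tree2list produces on a toy parse; word_tags here is a stand-in for the POS-tag set that filter_words keeps, and the helper is a condensed but behaviour-equivalent copy of the one defined above:

import nltk

# Stand-in for the tag set assumed by the example above.
word_tags = {'DT', 'NN', 'NNP', 'VBZ'}

def tree2list(tree):
    # Condensed version of the helper in the example above.
    if isinstance(tree, nltk.Tree):
        if tree.label() in word_tags:
            return tree.leaves()[0]
        root = [c for c in (tree2list(child) for child in tree) if c != []]
        return root if len(root) > 1 else (root[0] if root else [])
    return []

t = nltk.Tree.fromstring(
    "(S (NP (NNP Pierre) (NNP Vinken)) (VP (VBZ is) (NP (DT a) (NN director))))")
print(tree2list(t))  # [['Pierre', 'Vinken'], ['is', ['a', 'director']]]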
Example #2
def addTrees(sec, trees):
   secNum = ("" if sec >= 10 else "0") + str(sec)

   files = os.listdir("/u/scr/corpora/ldc/1999/LDC99T42/parsed/mrg/wsj/"+secNum)
   for name in files:
      for tree in ptb.parsed_sents("WSJ/"+secNum+"/"+name):
         trees.append(tree)
Example #3
    def trees(self, file_ids):
        def tree2list(tree):
            if isinstance(tree, nltk.Tree):
                if tree.label() in word_tags:
                    w = tree.leaves()[0].lower()
                    w = re.sub('[0-9]+', 'N', w)
                    return w
                else:
                    root = []
                    for child in tree:
                        c = tree2list(child)
                        if c != []:
                            root.append(c)
                    if len(root) > 1:
                        return root
                    elif len(root) == 1:
                        return root[0]
            return []

        trees = []
        nltk_trees = []
        sens = []
        for id in file_ids:
            sentences = ptb.parsed_sents(id)

            for sen_tree in sentences:
                words = self.filter_words(sen_tree)
                words = words + ['<eos>']
                sens.append(words)
                nltk.treetransforms.chomsky_normal_form(sen_tree)
                trees.append(tree2list(sen_tree))
                nltk_trees.append(sen_tree)

        return sens, trees, nltk_trees
Example #4
 def add_words(self, file_ids):
     for file_id_i in file_ids:
         sentences = ptb.parsed_sents(file_id_i)
         for sen_tree in sentences:
             words = self.filter_words(sen_tree)
             for word in words:
                 self.dictionary.add_word(word)
Example #5
 def add_words(self, file_ids):
     # Add words to the dictionary
     for id in file_ids:
         sentences = ptb.parsed_sents(id)
         for sen_tree in sentences:
             words = self.filter_words(sen_tree)
             words = ['<eos>'] + words + ['<eos>']
             for word in words:
                 self.dictionary.add_word(word)
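
The Corpus methods above assume a dictionary object with an add_word method and index lookup. A minimal sketch of such a helper, purely as an assumption (the projects these snippets come from define their own variants):

class Dictionary:
    """Minimal word <-> index mapping assumed by the Corpus examples."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __getitem__(self, word):
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)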
Example #6
def save(fileids, filename):
    with open(filename, 'w') as out:
        for fileid in tqdm(list(fileids)):
            for tree in ptb.parsed_sents(fileid):
                tokens, parse = [], []
                if _filter_none:
                    tree = drop_none(tree)
                flatten(tree, tokens, parse)
                sanity_checks(tokens, parse)
                data = {
                    'tokens': ' '.join(tokens),
                    'parse': ' '.join(parse)
                }
                out.write(json.dumps(data) + '\n')
Example #7
    def tokenize(self, file_ids, wsj10):
        def tree2list(tree):
            if isinstance(tree, nltk.Tree):
                if tree.label() in word_tags:
                    return tree.leaves()[0]
                else:
                    root = []
                    for child in tree:
                        c = tree2list(child)
                        if c != []:
                            root.append(c)
                    if len(root) > 1:
                        return root
                    elif len(root) == 1:
                        return root[0]
            return []

        def tree2tree_wo_punc(old_tree):
            tree = ParentedTree.convert(old_tree)
            for sub in reversed(list(tree.subtrees())):
                # drop leaf subtrees whose tag is not a word tag (punctuation, traces)
                if sub.height() == 2 and sub.label() not in word_tags:
                    parent = sub.parent()
                    while parent and len(parent) == 1:
                        sub = parent
                        parent = sub.parent()
                    print(sub, "will be deleted")
                    del tree[sub.treeposition()]
            return tree

        sens = []
        trees = []
        nltk_trees = []
        for id in tqdm(file_ids):
            sentences = ptb.parsed_sents(id)
            for sen_tree in sentences:
                words = self.filter_words(sen_tree)
                if len(words) > 10 and wsj10:
                    continue
                sens.append(words)
                trees.append(tree2list(sen_tree))
                nltk_trees.append(sen_tree)

        return sens, trees, nltk_trees


# corpus = Corpus('./data/WSJ', 'WSJ23')
Example #8
def addTrees(sec, trees):
    secNum = ("" if sec >= 10 else "0") + str(sec)

    files = os.listdir("/u/scr/corpora/ldc/1999/LDC99T42/parsed/mrg/wsj/" +
                       secNum)
    for name in files:
        for tree in ptb.parsed_sents("WSJ/" + secNum + "/" + name):
            leaves = " ".join([
                ("(" if x == "-LRB-" else (")" if x == "-RRB-" else x.replace(
                    "\/", "/").replace("\*", "*"))) for x in tree.leaves()
                if "*-" not in x and (
                    not x.startswith("*")) and x not in ["0", "*U*", "*?*"]
            ])
            if leaves not in deps:  # only applies to one sentence in the training partition
                print(leaves)
                continue
            trees.append((tree, deps[leaves]))
Example #9
def node2span(node, offsets):
    section_id, article_id, sentence_id, head_token_id, tree_height = parse_node_id(
        node)

    ptree = ParentedTree.convert(
        ptb.parsed_sents(f"wsj/{section_id}/wsj_{article_id}.mrg")[sentence_id])

    # Index each leaf node with its offset into the document
    offset = offsets[article_id][sentence_id]
    ptree = index_tree(ptree, offset)

    leaf_position = ptree.leaf_treeposition(head_token_id)
    span_position = leaf_position[:-(tree_height + 1)]
    span_tokens = ptree[span_position].leaves()

    return section_id, article_id, sentence_id, span_tokens, ptree
Example #10
    def tokenize(self, file_ids):

        def tree2list(tree):
            if isinstance(tree, nltk.Tree):
                if tree.label() in word_tags:
                    w = tree.leaves()[0].lower()
                    w = re.sub('[0-9]+', 'N', w)
                    return w
                else:
                    root = []
                    for child in tree:
                        c = tree2list(child)
                        if c != []:
                            root.append(c)
                    if len(root) > 1:
                        return root
                    elif len(root) == 1:
                        return root[0]
            return []

        sens_idx = []
        sens = []
        trees = []
        nltk_trees = []
        for id in file_ids:
            sentences = ptb.parsed_sents(id)
            for sen_tree in sentences:
                words = self.filter_words(sen_tree)
                words = ['<eos>'] + words + ['<eos>']
                sens.append(words)
                if self.wvec:
                    word2idx = tools.pkl_loader(os.path.join('data/wordvec', self.wvec, 'words2idx'))
                    idx = tools.indexesFromSentence(words, word2idx)
                    sens_idx.append(idx)
                else:
                    idx = []
                    for word in words:
                        idx.append(self.dictionary[word])
                    sens_idx.append(torch.LongTensor(idx))
                trees.append(tree2list(sen_tree))
                nltk_trees.append(sen_tree)

        return sens_idx, sens, trees, nltk_trees
Example #11
    def tokenize(self, file_ids):
        def tree2list(tree):
            if isinstance(tree, nltk.Tree):
                if tree.label() in word_tags:
                    w = tree.leaves()[0].lower()
                    w = re.sub('[0-9]+', 'N', w)
                    return w
                else:

                    root = []
                    for child in tree:
                        c = tree2list(child)
                        if c != []:
                            root.append(c)
                    if len(root) > 1:
                        return root
                    elif len(root) == 1:
                        return root[0]
            return []

        sens_idx = []
        sens = []
        trees = []
        nltk_trees = []
        N = 0
        for id in file_ids:
            sentences = ptb.parsed_sents(id)
            for sen_tree in sentences:
                words = self.filter_words(sen_tree)
                words = self.filter_words_tag(sen_tree)
                words = words + ['<eos>']

                sens.append(words)
                idx = []
                for word in words:
                    idx.append(self.dictionary[word])
                sens_idx.append(idx)
                trees.append(tree2list(sen_tree))
                nltk_trees.append(sen_tree)
                N += len(words)

        return sens_idx, sens, trees, nltk_trees
Example #12
def get_nltk_sents(sent_ids):
    raw = {}
    tagged = {}
    trees = {}
    data = {}

    for i in tqdm(sent_ids, desc="Collecting sentences and trees"):
        file_num, sent_num = i.split('_')
        subdir = file_num[:2]
        sent_num = int(sent_num)
        path = f'WSJ/{subdir}/WSJ_{file_num}.MRG'
        raw[i] = ptb.sents(path)[sent_num]
        tagged[i] = ptb.tagged_sents(path)[sent_num]
        trees[i] = ptb.parsed_sents(path)[sent_num]

    data['raw'] = raw
    data['tagged'] = tagged
    data['trees'] = trees

    return data
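
get_nltk_sents expects ids of the form <file number>_<sentence index>; a hypothetical call (the ids below are made up for illustration):

# Sentences 0 and 1 of wsj_0001.mrg (hypothetical ids).
data = get_nltk_sents(['0001_0', '0001_1'])
print(data['raw']['0001_0'])     # list of word tokens
print(data['tagged']['0001_0'])  # list of (word, tag) pairs
print(data['trees']['0001_1'])   # nltk.Tree for that sentence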
Example #13
 def save_file(file_ids, out_file):
     sens = []
     trees = []
     tags = []
     f_out = open(out_file, 'w')
     for f in file_ids:
         sentences = ptb.parsed_sents(f)
         for sen_tree in sentences:
             orig = sen_tree.pformat(margin=sys.maxsize).strip()
             c = 0
             while not all([tag in word_tags for _, tag in sen_tree.pos()]):
                 del_tags(sen_tree, word_tags)
                 c += 1
                 if c > 10:
                     assert False
             out = sen_tree.pformat(margin=sys.maxsize).strip()
             # drop constituents left empty after tag deletion, e.g. "(NP-SBJ )"
             empty_const = r'\(([A-Z0-9]{1,})((-|=)[A-Z0-9]*)*\s{1,}\)'
             while re.search(empty_const, out) is not None:
                 out = re.sub(empty_const, '', out)
             out = out.replace(' )', ')')
             out = re.sub(r'\s{2,}', ' ', out)
             f_out.write(out + '\n')
     f_out.close()
Example #14
def get_raw_data():
    raw_data = {}
    fileids = ptb.fileids()

    obj_sofar = 0

    for fileid in fileids:
        corpus, section, _ = fileid.split('/')
        if corpus.lower() != 'wsj':
            continue
        section = int(section)
        if section >= 2 and section <= 21:
            split = 'train'
        elif section == 22:
            split = 'valid'
        elif section == 23:
            split = 'test'
        else:
            split = None
        sent_sofar = 0
        for y in ptb.parsed_sents(fileid):
            words, part_of_speech = zip(*y.pos())
            constituency_parse = tree_to_tuple(y)
            obj = collections.OrderedDict()
            obj['example_id'] = 'ptb{}'.format(obj_sofar)
            obj['file_id'] = fileid
            obj['sent_id'] = sent_sofar
            obj['words'] = words
            obj['part_of_speech'] = part_of_speech
            obj['constituency_parse'] = constituency_parse
            sent_sofar += 1
            obj_sofar += 1

            raw_data.setdefault('all', []).append(obj)
            if split is not None:
                raw_data.setdefault(split, []).append(obj)

    return raw_data
Example #15
  def tokenize(self, file_ids):
    """Tokenizes a mrg file."""

    def tree2list(tree):
      if isinstance(tree, nltk.Tree):
        if tree.label() in WORD_TAGS:
          w = tree.leaves()[0].lower()
          w = re.sub('[0-9]+', 'N', w)
          return w
        else:
          root = []
          for child in tree:
            c = tree2list(child)
            if c:
              root.append(c)
          if len(root) > 1:
            return root
          elif len(root) == 1:
            return root[0]
      return []

    sens_idx = []
    sens = []
    trees = []
    nltk_trees = []
    for file_id_i in file_ids:
      sentences = ptb.parsed_sents(file_id_i)
      for sen_tree in sentences:
        words = self.filter_words(sen_tree)
        sens.append(words)
        idx = []
        for word in words:
          idx.append(self.dictionary[word])
        sens_idx.append(idx)
        trees.append(tree2list(sen_tree))
        nltk_trees.append(sen_tree)

    return sens_idx, sens, trees, nltk_trees
Example #16
"""process chunk"""
path = '/Users/pengwu5501/Downloads/wsj-2'
files = os.listdir(path)
l = []
grammar = {}
i = 0
dict_word = {}
dict_unit_rule = {}
unit = 0
ter = 0
for file in files:
    sub_path = os.path.join(path, file)
    mrg = os.listdir(sub_path)
    for item in mrg:
        name = os.path.join(sub_path, item)
        tree = ptb.parsed_sents(name)
        for tre in tree:
            list = []
            p = traversal(tre, list)
            for item in p:
                it = item.split('->')
                late = it[1].split()
                if len(late) > 2:
                    for item_ in late:
                        if item_.find('\'') == 0:
                            break
                    it[0] += ' '
                    it[0] += '->'
                    it[0] += ' '
                    it[0] += late[0]
                    it[0] += ' '
Example #17
from nltk.corpus import ptb
from nltk.grammar import CFG, Nonterminal
import os


def tree2prod(trees):
    prods = []
    for t in trees:
        prods += t.productions()
    return prods


path = '/Users/pengwu5501/nltk_data/corpora/ptb/wsj'
files = os.listdir(path)
productions = []
cnt = 0
for file in files:
    sub_path = os.path.join(path, file)
    sub_file = os.listdir(sub_path)
    for item in sub_file:
        name = os.path.join(sub_path, item)
        tbank = ptb.parsed_sents(name)
        productions += tree2prod(tbank)
        productions = list(set(productions))  # deduplicate productions
        cnt += 1
        print(len(productions))
gramm = CFG(Nonterminal('S'), productions)
print(gramm)
Example #18
        num_files_in_dir = len(
            os.listdir('/Users/morischick/nltk_data/corpora/ptb/WSJ/' +
                       dir_num))
        #print(dir_num, num_files_in_dir)
        print("Beginning WSJ/", dir_num, "...")

        # loop through each file
        for j in range(0, num_files_in_dir):
            file_num = str(j)

            if j < 10:
                file_num = "0" + str(j)

            try:
                file_name = 'WSJ/' + dir_num + '/WSJ_' + dir_num + file_num + '.MRG'
                num_sentences = len(ptb.parsed_sents(file_name))
                genre = genre_dict[file_name]
                #print(file_name, i , j, num_sentences, genre)

            except:
                print(
                    "This file does not exist and a genre cannot be found for it"
                )

            if genreCount[genre] < NUM_EXAMPLES:

                try:
                    # loop through each sentence
                    for x in range(0, num_sentences):

                        if genreCount[genre] < NUM_EXAMPLES:
Example #19
from nltk.corpus import ptb
from nltk.tree import Tree
t = ptb.parsed_sents(
    '/Users/pengwu5501/nltk_data/corpora/ptb/wsj/00/wsj_0001.mrg')


def getCFG(tree):
    line = ''
    if isinstance(tree, Tree):
        line += tree.label()
        line += ' '
        line += '->'
        for subtree in tree:
            if isinstance(subtree, Tree):
                line += ' '
                line += subtree.label()
            else:
                line += ' '
                line += '"'
                line += subtree
                line += '"'
        return line


lis = []


def traversal(tree):
    lis.append(getCFG(tree))
    for subtree in tree:
        if isinstance(subtree, Tree):
Example #20
ap = argparse.ArgumentParser()
ap.add_argument("--ptbfiles", help="PennTreebank files")
ap.add_argument("--trees", help="Output trees")
ap.add_argument("--words", help="Output words, sentence per line")
args = ap.parse_args()

tree_file = open(args.trees, 'w')
word_file = open(args.words, 'w')

#TODO can not -> cannot

for filename in sorted(
        glob.glob('/home/marecek/nltk_data/corpora/ptb/' + args.ptbfiles)):
    #print("Processing " + filename)
    trees = ptb.parsed_sents(filename)
    for i in range(len(trees)):
        # remove traces and other empty nodes
        for sub in trees[i].subtrees():
            for n, child in enumerate(sub):
                if isinstance(child, str):
                    continue
                if (all(leaf.startswith("*") for leaf in child.leaves())
                        or child.label() == '-NONE-'):
                    del sub[n]
        # extract list of POS tags and remove POS tags from the trees
        sent_tags = list()
        for sub in trees[i].subtrees():
            sub.set_label("X")
            for n, child in enumerate(sub):
                if isinstance(child, str):
Example #21
def main(test=False):
    """
    Makes a big dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0 ## ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):

        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:

                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))

                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')

                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')

                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')

                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())

                production._rhs = tuple(rhs)

                if not is_key(add_dict, production.unicode_repr()):
                    add_dict[production.unicode_repr()] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        filename = open('parsers/grammar.pkl', 'rb')
        grammar = pickle.load(filename)
        viterbi_parser = ViterbiParser(grammar, trace=0) # cubic time

        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        filename = open('parsers/grammar.pkl', 'rb')
        grammar = pickle.load(filename)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)     # linear time

        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        for sample in [1, 23, 20330, 20332, 443]:

            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi      = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))
Example #22
import nltk
from nltk import Nonterminal as NT
from nltk.grammar import PCFG
from nltk.corpus import ptb
from nltk import induce_pcfg
import pickle

productions = []
for i, tree in enumerate(ptb.parsed_sents()):
    tree.collapse_unary(collapsePOS=False)
    tree.chomsky_normal_form(horzMarkov=2)
    productions += tree.productions()

S = NT('S')
grammar = induce_pcfg(S, productions)

with open('ptb_grammar.pcfg', 'wb') as w:
    pickle.dump(grammar, w)
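
A minimal sketch of how the pickled grammar could be used afterwards, assuming ptb_grammar.pcfg was written by the script above (parsing with the full treebank grammar is slow, and the tokens must be covered by its lexical productions):

import pickle

from nltk.parse import ViterbiParser

with open('ptb_grammar.pcfg', 'rb') as f:
    grammar = pickle.load(f)

parser = ViterbiParser(grammar)
tokens = ['the', 'company', 'bought', 'the', 'shares']
for tree in parser.parse(tokens):
    tree.pretty_print()
    break  # keep only the most likely parse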
Example #23
#!/usr/bin/env python3
from nltk.corpus import ptb
import re
"""
Slice the Penn Treebank to extract sentences of fewer than N words and
output the gold-standard trees.
The treebank's mrg directory must be in ~/nltk_data/ptb.
This outputs trees; one can then use `make item` to convert them into tagwords or words.
"""
sent_length = 20  # the maximum number of words in a sentence

parsed_sents = ptb.parsed_sents()
with open('../test.txt', 'w') as fi:
    for index, sent in enumerate(ptb.tagged_sents()):
        count = 0
        for word, tag in sent:
            if 'NONE' not in tag:
                count += 1
        if count <= sent_length:
            tree = parsed_sents[index]
            for pos in tree.treepositions('leaves'):
                tree[pos] = tree[pos].lower()
            # tree.collapse_unary(collapsePOS=True)
            tree = str(tree).replace('\n', '')
            tree = re.sub(r'\s+', ' ', tree)
            print(tree, file=fi)
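
The flattened trees written to ../test.txt can be read back with nltk.Tree.fromstring; a small sketch (note the trees still contain -NONE- traces, since the script above only uses the tags for the length count):

from nltk.tree import Tree

with open('../test.txt') as fi:
    for line in fi:
        tree = Tree.fromstring(line)
        print(tree.leaves())  # the lower-cased words, traces included
        print(tree.pos())     # (word, tag) pairs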
Example #24
def get_bnp_from_ptb(ptb_dir):
    #train_fileids = [ptb_dir+"%02d"%x for x in range(2, 22)]
    #valid_fileids = [ptb_dir+"%02d"%x for x in range(22, 23)]
    #test_fileids = [ptb_dir+"%02d"%x for x in range(23, 24)]

    train_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(2, 22)]
    valid_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(22, 23)]
    test_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(23, 24)]

    train = []
    valid = []
    test = []
    basenp_count_train = 0
    basenp_count_valid = 0
    basenp_count_test = 0
    token_count_train = 0
    token_count_valid = 0
    token_count_test = 0
    np_lens_train = []
    np_lens_valid = []
    np_lens_test = []

    all_lens_train = []

    print_every = 1000

    for split_fileids, split_label in zip(
        [train_fileids, valid_fileids, test_fileids],
        ["train", "valid", "test"]):
        sent_ctr = 0
        for wsj_section_folderpath in split_fileids:
            all_mrg_files_in_split = glob.glob(wsj_section_folderpath +
                                               "/*.mrg")
            for mrg_file in all_mrg_files_in_split:
                parsed_sents = ptb.parsed_sents(mrg_file)
                parsed_sents_ctr = 0
                for parsed_tree in parsed_sents:
                    if sent_ctr % print_every == 0:
                        print "Currently processing %d in %s" % (sent_ctr,
                                                                 split_label)
                    sent_ctr += 1
                    base_np_delineated_tokens = traverse_tree(
                        parsed_tree, True)

                    parsed_sents_ctr += 1
                    if split_label == "train":

                        basenp_count_split, token_count_split, np_lens_split, all_lens_split = count_basenps(
                            base_np_delineated_tokens)
                        basenp_count_train += basenp_count_split
                        token_count_train += token_count_split
                        np_lens_train.extend(np_lens_split)

                        all_lens_train.extend(all_lens_split)

                        train.append(base_np_delineated_tokens)
                    elif split_label == "valid":

                        basenp_count_split, token_count_split, np_lens_split, _ = count_basenps(
                            base_np_delineated_tokens)
                        basenp_count_valid += basenp_count_split
                        token_count_valid += token_count_split
                        np_lens_valid.extend(np_lens_split)

                        valid.append(base_np_delineated_tokens)
                    elif split_label == "test":

                        basenp_count_split, token_count_split, np_lens_split, _ = count_basenps(
                            base_np_delineated_tokens)
                        basenp_count_test += basenp_count_split
                        token_count_test += token_count_split
                        np_lens_test.extend(np_lens_split)

                        test.append(base_np_delineated_tokens)

    print "Train"
    print "Total bag size: %d, Average size of item in bag %f" % (
        len(all_lens_train), np.mean(all_lens_train))
    print "Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" % (
        basenp_count_train, np.mean(np_lens_train), token_count_train,
        len(train))

    print "Valid"
    print "Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" % (
        basenp_count_valid, np.mean(np_lens_valid), token_count_valid,
        len(valid))

    print "Test"
    print "Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" % (
        basenp_count_test, np.mean(np_lens_test), token_count_test, len(test))

    #startswith("NP"): Number of base NPs in the training set: 228399
    #Train
    #Base NP count: 228399, Average Base NP length: 2.212983, Token count: 949938, Sentence count: 39832
    #Valid
    #Base NP count: 9536, Average Base NP length: 2.273700, Token count: 40104, Sentence count: 1700
    #Test
    #Base NP count: 13457, Average Base NP length: 2.192465, Token count: 56674, Sentence count: 2416
    #Size of vocab (i.e., tokens that appear in training, not including the additional <unk>) 10000

    raw_train = remove_base_np_syms(train)
    raw_valid = remove_base_np_syms(valid)
    raw_test = remove_base_np_syms(test)

    return raw_train, raw_valid, raw_test, train, valid, test
Example #25
def _read_document(file_path: str):
    print(f"Reading GC2012 instances from dataset file at: {file_path}")

    xml_tree = ET.parse(file_path)
    root = xml_tree.getroot()

    # Read in all relevant documents to get token offsets
    # sentence_offsets[doc][sentence] = starting_token_idx
    sentence_offsets = dict()

    # Remove special parse tokens (e.g., "*RNR*-1") from text and reindex tokens
    # It appears that all special characters have a parent of '-NONE-'.
    # token_map[doc][original_token_idx] = remapped_token_idx
    token_map = dict()
    texts = dict()
    filtered_texts = dict()
    for annotations in root.getchildren():
        trigger_node = annotations.get('for_node')  # wsj_xxxx:a:b:c
        section_id, article_id, _, _, _ = parse_node_id(trigger_node)

        if article_id in sentence_offsets.keys():
            # we've already processed this document
            continue
        else:
            sentence_offsets[article_id] = dict()
            token_map[article_id] = dict()

        parse_trees = ptb.parsed_sents(
            f"wsj/{section_id}/wsj_{article_id}.mrg")
        total_valid_tokens_seen = 0
        total_tokens_seen = 0
        text = []
        filtered_text = []
        for sent_id, parse_tree in enumerate(parse_trees):
            ptree = ParentedTree.convert(parse_tree)

            tokens = ptree.leaves()
            valid_token_indices = [
                i if valid_token(ptree, i) else None
                for i, x in enumerate(tokens)
            ]
            valid_tokens = [
                x for i, x in enumerate(tokens) if valid_token(ptree, i)
            ]

            sentence_offsets[article_id][sent_id] = total_tokens_seen
            for i, x in enumerate(valid_token_indices):
                if x is None:
                    # special token that should be removed (e.g. *RNR*-1)
                    token_map[article_id][total_tokens_seen] = None
                else:
                    token_map[article_id][
                        total_tokens_seen] = total_valid_tokens_seen
                    total_valid_tokens_seen += 1

                total_tokens_seen += 1

            text.append(tokens)
            filtered_text.append(valid_tokens)

        texts[article_id] = text
        filtered_texts[article_id] = filtered_text

    # See `http://lair.cse.msu.edu/projects/implicit_annotations.html` for details.
    packets = []
    for annotations in root.getchildren():
        trigger_node = annotations.get('for_node')  # wsj_xxxx:a:b:c
        trigger_section_id, trigger_article_id, trigger_sentence_id, trigger_span_tokens, _ = node2span(
            trigger_node, sentence_offsets)

        # Readjust token indices since we removed special tokens (this probably doesn't happen in the data, but just to be safe)
        trigger_text, original_trigger_span = indices2range(
            trigger_span_tokens)
        trigger_span = (
            token_map[trigger_article_id][original_trigger_span[0]],
            token_map[trigger_article_id][original_trigger_span[1]])

        packet = {
            "document_id": f"wsj_{trigger_article_id}",
            "document": filtered_texts[
                trigger_article_id],  # filtered document (does not include special parse tokens)
            "trigger": {
                "node_id": trigger_node,
                "span": trigger_span,  # offset into filtered document
                "text": trigger_text
            },
            "arguments": defaultdict(list)
        }

        printed_trigger = False
        for annotation in annotations.getchildren():
            argument_node = annotation.attrib.get('node')
            argument_section_id, argument_article_id, argument_sentence_id, argument_span_tokens, _ = node2span(
                argument_node, sentence_offsets)

            if trigger_section_id != argument_section_id:
                raise ValueError(
                    f"Trigger and argument should be in same section: got trigger_section_id={trigger_section_id}, argument_section_id={argument_section_id}"
                )
            if trigger_article_id != argument_article_id:
                raise ValueError(
                    f"Trigger and argument should be in same article: got trigger_article_id={trigger_article_id}, argument_article_id={argument_article_id}"
                )

            argn = annotation.attrib.get('value')

            # get `attribute`
            assert len(annotation.getchildren()) == 1
            assert annotation.getchildren()[0].tag == 'attributes'

            assert len(annotation.getchildren()[0].getchildren()) <= 1
            if len(annotation.getchildren()[0].getchildren()) == 1:
                attribute = annotation.getchildren()[0].getchildren()[0].text
            else:
                attribute = ""

            # Readjust token indices since we removed special tokens
            argument_text, original_argument_span = indices2range(
                argument_span_tokens)
            argument_span = (
                token_map[argument_article_id][original_argument_span[0]],
                token_map[argument_article_id][original_argument_span[1]])

            if attribute == "Split":
                if not printed_trigger:
                    print("Trigger", trigger_node, trigger_sentence_id,
                          trigger_text, trigger_span)
                    printed_trigger = True
                print(argn, attribute, argument_node, argument_sentence_id,
                      argument_text, argument_span)

            packet["arguments"][argn].append({
                "node_id": argument_node,
                "span": argument_span,  # offset into filtered document
                "attribute": attribute,
                "text": argument_text
            })

        if printed_trigger:
            print(packet)
            print()

        packets.append(packet)

    return packets
Example #26
    def _create_data(self):

        # hard coding of the number of samples for train and valid
        # n_train = 42069
        # n_valid = 7139
        # n_total = 49208

        if self.split == 'train':
            self._create_vocab()
        else:
            self._load_vocab()

        #tokenizer = TweetTokenizer(preserve_case=False)
        # we build the dataset by looping through these inds of parsed_sents()
        if self.split == 'train':
            n_begin = 0
            n_end = 42069
        else:
            n_begin = 42069
            n_end = 49208

        data = defaultdict(dict)

        # collect all treebank sentences and nonterminals for multi-processing
        t1 = time.time()
        all_sentences = ptb.sents()
        all_sentences = all_sentences[n_begin:n_end]
        all_parses = ptb.parsed_sents()
        all_parses = all_parses[n_begin:n_end]
        t2 = time.time()
        print('read all sentences in {} sec'.format(t2 - t1))

        # preprocess all sentences in parallel
        pool = Pool()  # required for multicore
        try:
            t1 = time.time()
            preprocessed_sentences = pool.map_async(self._preprocess,
                                                    all_sentences).get(9999999)
            pool.close()
            t2 = time.time()
            print('preprocessed all sentences in {} min'.format(
                (t2 - t1) / 60.0))
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            sys.exit(1)

        # get all phrase tags in parallel
        pool = Pool()
        try:
            t1 = time.time()
            phrase_tags = pool.map_async(self._get_phrase_tags,
                                         all_parses).get(9999999)
            pool.close()
            t2 = time.time()
            print('phrase tags for all sentences collected in {} min'.format(
                (t2 - t1) / 60.0))
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            sys.exit(1)

        # now, finish things up by adding start/end tags
        t1 = time.time()
        tag_count = np.zeros(len(PHRASE_TAGS))
        for i, words in enumerate(preprocessed_sentences):

            inputs = ['<sos>'] + words
            inputs = inputs[:self.max_sequence_length]

            target = words[:self.max_sequence_length - 1]
            target = target + ['<eos>']

            assert len(inputs) == len(target), "%i, %i" % (len(inputs),
                                                           len(target))
            length = len(inputs)

            inputs.extend(['<pad>'] * (self.max_sequence_length - length))
            target.extend(['<pad>'] * (self.max_sequence_length - length))

            inputs = [self.w2i.get(w, self.w2i['<unk>']) for w in inputs]
            target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

            tag_count += phrase_tags[i]

            data[i]['input'] = inputs
            data[i]['target'] = target
            data[i]['length'] = length
            data[i]['tags'] = phrase_tags[i]

        t2 = time.time()
        print('sentences loaded into dict in {} sec'.format(t2 - t1))
        for i, tag in enumerate(PHRASE_TAGS):
            print('+ tag {}, n={}'.format(tag, tag_count[i]))

        with io.open(os.path.join(self.data_dir, self.data_file),
                     'wb') as data_file:
            data = json.dumps(data, ensure_ascii=False)
            data_file.write(data.encode('utf8', 'replace'))

        self._load_data(vocab=False)
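
The map_async(...).get(9999999) pattern above is a common workaround so that a KeyboardInterrupt actually reaches the parent process while it waits on the pool; a stripped-down sketch of just that idiom (the worker below is a placeholder for _preprocess / _get_phrase_tags):

import sys
from multiprocessing import Pool


def _work(x):
    # Placeholder worker; the real code maps self._preprocess over sentences.
    return x * x


def run(items):
    pool = Pool()
    try:
        # .get() with a large timeout (unlike a plain pool.map) lets Ctrl-C interrupt the wait.
        results = pool.map_async(_work, items).get(9999999)
        pool.close()
        pool.join()
        return results
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit(1)


if __name__ == '__main__':
    print(run(range(10)))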