Example #1
import os
from collections import defaultdict as dd
from tempfile import NamedTemporaryFile

import ujson
from nltk.corpus.reader import BracketParseCorpusReader
from nltk.stem import WordNetLemmatizer


def get_stats_from_snli_dataset(files, tagset=("NN", "NNS"), use_lemmas=False):
    """Count tokens (optionally lemmatized) whose POS tags are in `tagset`."""
    lemmatizer = None
    if use_lemmas:
        lemmatizer = WordNetLemmatizer()

    stats = dd(int)
    num_of_tokens = 0

    for filename in files:
        # Dump the bracketed parses into a temporary file so that
        # BracketParseCorpusReader can read them back as a corpus.
        f = NamedTemporaryFile(mode="w", dir="/tmp")
        fields_to_read = {"sentence1_parse", "sentence2_parse"}
        with open(filename) as infile:
            for line in infile:
                sent = ujson.loads(line)
                for field in fields_to_read:
                    f.write("%s\n" % sent[field])
        f.flush()  # make sure the parses are on disk before the reader opens them

        reader = BracketParseCorpusReader("/tmp", os.path.basename(f.name))
        for word, tag in reader.tagged_words():
            if tagset is None or tag in tagset:
                if use_lemmas:
                    # WordNet wants a single-letter POS: 'n' for NN/NNS.
                    word = lemmatizer.lemmatize(word, pos=tag.lower()[0])
                stats[word] += 1
                num_of_tokens += 1

    return stats, num_of_tokens
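
A minimal usage sketch, assuming SNLI has been unpacked locally (the path below is a placeholder):

files = ["snli_1.0/snli_1.0_dev.jsonl"]  # hypothetical location of an SNLI split
stats, total = get_stats_from_snli_dataset(files, use_lemmas=True)
print("%d noun tokens, %d distinct nouns" % (total, len(stats)))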
Example #2
        in_domain = BracketParseCorpusReader(args.data,
                                             "en-wsj-dev.2.mrg")

        print("Loading out-of-domain validation trees")
        out_of_domain = BracketParseCorpusReader(args.data,
                                                 "en-web-dev.3.mrg")

    if args.test:
        print("Loading test sentences")
        test_sentences = [line.strip().split()
                          for line in open(os.path.join(args.data,
                                                        "en-web-weblogs-test"
                                                        ".sentences"))]

    print("Building grammar and lexicon")
    lexicon = hw5.Lexicon(train_corpus.tagged_words())
    grammar = hw5.FullGrammar(train_corpus, lexicon, max_train=args.maxTrain,
                              horizontal=args.horizontal,
                              vertical=args.vertical)
    lexicon.finalize()

# Set up and train the parser.
parser = hw5.Parser(grammar, lexicon)
# pickle.dump(grammar, open("grammar.pkl", "wb"), -1)

if args.mini:
    tree = parser.generate_parse_tree(["fish", "people", "fish", "tanks"],
                                      root_tag="S", theta=args.theta)
    tree.draw()

else:
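
For reference, the reader pattern from this excerpt in isolation; the paths and file names below are placeholders for any directory of Penn-Treebank-style .mrg files:

from nltk.corpus.reader import BracketParseCorpusReader

train_corpus = BracketParseCorpusReader("data", "en-wsj-train.1.mrg")
print(train_corpus.parsed_sents()[0])    # first parse tree in the file
print(train_corpus.tagged_words()[:10])  # (word, tag) pairs, e.g. for a lexicon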
Example #3
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """

    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return a tokenized document.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.tkd_tokens.extend(result)

        return doc

    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                                tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = []
        doc.lex_heads.append(None)
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
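
A hedged usage sketch of the class above; the corpus path is a placeholder, and `doc` is assumed to be an already-loaded educe DocumentPlus:

parser = PtbParser("ptb3/parsed/mrg/wsj")  # placeholder PTB location
doc = parser.tokenize(doc)  # align gold PTB tokens with the RST-DT text
doc = parser.parse(doc)     # attach educified constituency trees
trees = doc.tkd_trees       # trees whose leaves carry RST-DT text spans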
Example #4
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """
    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with tokenization.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text,
                                    tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc

    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, attach a list of educified PTB parse trees
        (one per sentence) to it and return the enriched document.

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc
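
One portability note: both PtbParser examples call izip, which exists only in Python 2; in Python 3 the builtin zip is already lazy. A minimal shim if you need the code to run under either version:

try:
    from itertools import izip  # Python 2
except ImportError:            # Python 3: zip already returns an iterator
    izip = zip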