Example #1
import nltk
from nltk.parse.stanford import StanfordParser

class SyntaxTreeParser:
    def __init__(self):
        self.parser = StanfordParser()
        if not self.parser:
            raise RuntimeError('Stanford Parser could not be initialized.')
    
    def raw_parse(self, sent):
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        # Tag the sentence first unless it is already a list of (word, tag) pairs.
        one_sent = sent
        if not isinstance(sent[0], tuple):
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
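# A minimal usage sketch (hypothetical sentence; assumes the Stanford parser
# jars are discoverable so StanfordParser() can start without explicit paths):
#
# stp = SyntaxTreeParser()
# tree = stp.raw_parse("The quick brown fox jumps over the lazy dog.")
# tree.pretty_print()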
Example #2
sentences = []
for x in range(len(df['sentences'])):
    sentence = df.at[x, 'sentences']
    tagged = tagger.tag(sentence.split())
    sentences.append(tagged)
df['tagged'] = sentences
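# `tagger` and `df` above are assumed to come from the surrounding script.
# A hypothetical tagger setup with NLTK's StanfordPOSTagger might look like:
#
# from nltk.tag import StanfordPOSTagger
# tagger = StanfordPOSTagger(
#     r"D:\...\english-bidirectional-distsim.tagger",  # assumed model path
#     path_to_jar=r"D:\...\stanford-postagger.jar",    # assumed jar path
# )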

## Constituent Parser
from nltk.parse.stanford import StanfordParser
path_to_model_1 = r"D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\model.ser.gz"
path_to_jar_1 = r"D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-parser.jar"
# pass the paths by keyword: the first positional parameter is path_to_jar
parser = StanfordParser(path_to_jar=path_to_jar_1, model_path=path_to_model_1)
# parser.java_options = '-mx4096m'  # raise the JVM memory limit for long sentences
parse_string = []
for y in range(len(df['tagged'])):
    tagged = df.at[y, 'tagged']
    cons = next(parser.tagged_parse(tagged))
    cons = ' '.join(str(cons).split())  # flatten the bracketed tree onto one line
    parse_string.append(cons)
df['Parse_String'] = parse_string

# Export the dataframe to a CSV file
import pandas as pd
df_new = df
df_new.to_csv('coba.csv')

# cons.pretty_print()  # pretty_print() prints the Tree itself; works before the tree is flattened to a string
Example #3
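# Fragment from a larger loop over parsed documents and sentences; `sent`,
# `token`, `tokend_dic` (a token -> index map), `parser`, `words`,
# `dependencies`, `sentences`, `doc_id`, and `parsejson` are assumed to be
# defined by the enclosing code, which is not shown here.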
                if not token.is_punct and not token.head.is_punct and not token.is_space:
                    first = token.dep_
                    if token.dep_ == "ROOT":
                        second = "ROOT-0"
                    else:
                        second = "{0}-{1}".format(token.head, tokend_dic[token.head])
                    third = "{0}-{1}".format(token, tokend_dic[token])
                    dependencies.append([first, second, third])

            # constituency parser: rebuild the raw sentence from non-space tokens
            rsentence = ""

            for t in sent:
                if not t.is_space:
                    rsentence = rsentence + " " + t.text

            rsentence = rsentence.strip().replace('\t', ' ').replace('\n', ' ')

            if re.match("[a-z]+|[A-Z]+", rsentence) is None:
                continue

            tokens = list(filter(None, rsentence.split(' ')))
            sys.stderr.write(str(tokens) + '\n')
            pcfg = parser.tagged_parse(nltk.pos_tag(tokens))
            #pcfg = parser.raw_parse(rsentence)
            # strip the outer (ROOT ) wrapper and collapse the tree onto one line
            parsetree = re.sub(r"\s+", " ", str(list(pcfg)[0][0]).replace("\n", "").replace("ROOT", "")) + "\n"

            sentences.append({"dependencies": dependencies, "parsetree": parsetree, "words": words})

        parsejson.update({doc_id: {"sentences": sentences}})

print(json.dumps(parsejson, sort_keys=True))
Example #4
def _extract_syntags_features(text_list, tagsets, use_stanford_tagger=False, stanford_parser_instance=None):
    """

    :param text_list:
    :param tagsets:
    :param use_stanford_tagger: POS-tagging is done by StanfordParser instead of NLTK
    :param stanford_parser_instance:
    :return:
    """

    if not use_stanford_tagger:
        text_4iteration_list, text_postags_txt_list = do_extract_postags_word_level(text_list)
    else:
        nltk.internals.config_java(options='-xmx4G')
        text_4iteration_list = text_list

    # e.g. StanfordParser(path_to_jar="/Users/zkey/tools/stanford-parser/stanford-parser.jar")
    if stanford_parser_instance is None:
        sp = StanfordParser(verbose=True, java_options="-xmx4G")
    else:
        sp = stanford_parser_instance

    treerep_of_sentences = []
    logging.debug("extract treerep of sentences...")
    for i, st in enumerate(text_4iteration_list):
        try:
            logging.debug("sentence %s", i)

            if use_stanford_tagger:
                tmp = sp.raw_parse(st)
            else:
                tmp = sp.tagged_parse(st)

            treerep_of_sentences.append(tmp)
        except Exception:
            logging.error(sys.exc_info())
            logging.error("sentence: %s", st)
            treerep_of_sentences.append(None)

    # prepare an accumulator string for each tagset
    syntags_lists = ["" for _ in tagsets]

    syntactic_tree_height_list = []
    syntactic_subtree_count_list = []

    logging.debug("sentence list len: %s", len(text_4iteration_list))
    logging.debug("tree list len: %s", len(treerep_of_sentences))

    for j, tree in enumerate(treerep_of_sentences):
        logging.debug("Tree %s" % j)

        if tree:
            s = next(tree)
            s = nltk.ParentedTree.convert(s)

            for tagset_i, tagset in enumerate(tagsets):

                # collect the labels of all subtrees whose label is in this tagset
                sentence_tags = [st.label() for st in s.subtrees(filter=lambda x: x.label() in tagset)]

                logging.debug(sentence_tags)
                syntags_lists[tagset_i] += " " + " ".join(sentence_tags)

                logging.debug("tagset %s: %s" % (tagset_i, syntags_lists[tagset_i]))

            # record the height of the tree
            syntactic_tree_height_list.append(float(s.height()))
            logging.debug("tree height: %s", s.height())

            # count subtrees with height greater than 2
            subtree_count = len(list(s.subtrees(filter=lambda x: x.height() > 2)))
            syntactic_subtree_count_list.append(subtree_count)
            logging.debug("syn. subtree count: %s", subtree_count)
        else:
            # failed parse: contribute empty/zero features so list lengths stay aligned
            for tagset_i in range(len(tagsets)):
                syntags_lists[tagset_i] += " "

            syntactic_tree_height_list.append(0)
            syntactic_subtree_count_list.append(0)

    return syntags_lists, syntactic_tree_height_list, syntactic_subtree_count_list
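# A minimal usage sketch (hypothetical tagsets and sentences; assumes a
# working StanfordParser setup on the classpath):
#
# tagsets = [{"NP", "VP"}, {"SBAR", "PP"}]
# tags, heights, counts = _extract_syntags_features(
#     ["The dog barked.", "It rained all day."], tagsets, use_stanford_tagger=True)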
Example #5
from nltk.parse.stanford import StanfordParser
import re, nltk

parser = StanfordParser()
sentence = "In the 3rd level I would place my little brother in. because my little brother is a very greedy little bot he always wants something."
#print(nltk.pos_tag(sentence.split(' ')))
#pcfg = parser.raw_parse(sentence)
pcfg = parser.tagged_parse(nltk.pos_tag(sentence.split(' ')))
# strip the outer (ROOT ) wrapper and collapse the tree onto one line
parsetree = re.sub(r"\s+", " ", str(list(pcfg)[0][0]).replace("\n", "").replace("ROOT", "")) + "\n"

print(parsetree)