Example #1
def main():
    parser = StanfordParser(
        path_to_jar=script_wrapper.stanford_parser_jar,
        path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename=
        '../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.cur_tag(
        sent
    )  # need to write an interface for tokenized sentences (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple

    print parser.raw_parse(raw_sent).next()

    return
    # find named entities
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
    ####### my issue here: 1. don't know how to get the NP. 2. is there a quicker way to find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output([
        'bash',  ###add bash !!!!
        tregex_path,
        '-s',
        pattern,
        init_tree_file
    ])
    print head

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output([
            'bash',  ###add bash !!!!
            tregex_path,
            '-s',
            pattern,
            init_tree_file
        ])
        print main_verbs
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs
Example #2
class Parser(object):
    """
    A natural language parser is a program that works out the grammatical 
    structure of sentences, for instance, which groups of words go together 
    (as “phrases”) and which words are the subject or object of a verb. 
    Probabilistic parsers use knowledge of language gained from hand-parsed 
    sentences to try to produce the most likely analysis of new sentences. 
    These statistical parsers still make some mistakes, but commonly work 
    rather well. Their development was one of the biggest breakthroughs in 
    natural language processing in the 1990s.
    """
    def __init__(self, model_path, path_to_jar, path_to_models_jar):
        # nltk package
        from nltk.parse.stanford import StanfordParser
        self.__model_path = model_path
        self.__path_to_jar = path_to_jar
        self.__path_to_model_jar = path_to_models_jar
        self.__stf_parser = StanfordParser(
            path_to_jar=path_to_jar,
            path_to_models_jar=path_to_models_jar,
            model_path=model_path,
            encoding='utf-8')

    def parse_sentence(self, text):
        """
        Arguments:
            text -- input text string to be parsed
        
        Returns:
            list of the parsed result in the form (parent_tag(tag, word))
        """
        self.__text = text
        return list(self.__stf_parser.raw_parse(text))

    def tree_print(self):
        """
        Arguments:
            -- None
        Returns:
            -- None
        """
        for line in self.__stf_parser.raw_parse(self.__text):
            for sentence in line:
                print(sentence)

    def tree_draw(self):
        """
        Arguments:
            -- None
        Returns:
            -- None
        """
        for line in self.__stf_parser.raw_parse(self.__text):
            for sentence in line:
                sentence.draw()
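
A minimal usage sketch for the Parser wrapper above, assuming the Stanford parser and models jars have been downloaded locally (the paths below are placeholders):

parser = Parser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    path_to_jar='/opt/stanford-parser/stanford-parser.jar',                # placeholder path
    path_to_models_jar='/opt/stanford-parser/stanford-parser-models.jar')  # placeholder path
trees = parser.parse_sentence('The quick brown fox jumps over the lazy dog')
parser.tree_print()  # prints the constituency tree of the text parsed last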
Example #3
def sdfprocess(rvdata):
    parser = StanfordParser(
        path_to_jar=
        '/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar=
        '/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx15000m')
    sdfdata = []
    cnn = 0
    widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker=RotatingMarker()), ' ',
        ETA(), ' ',
        FileTransferSpeed()
    ]
    pbar = ProgressBar(widgets=widgets, maxval=len(rvdata)).start()
    for eg in rvdata:
        # if cnn%100 == 0: print "%f%% of document %d finished" % (cnn*100*1.0/len(rvdata), partidx+1)
        cmt = eg[3].decode('utf-8')  #3 is the idx of comment
        sentences = nltk.sent_tokenize(cmt)
        parsedls = []
        for snt in sentences:
            sntparsed = parser.raw_parse(snt)
            parsedls.append(sntparsed)
        sdfdata.append(eg[:3] + [parsedls])
        # print cnn
        # print sdfparsed
        # print sdfdata
        # if cnn > 5: break
        pbar.update(cnn + 1)
        cnn += 1
    pbar.finish()
    return sdfdata
Example #4
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset. Each row of x_matrix is a sentence.
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Get Stanford model
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')
    # Get parse trees.
    parsed_matrix = []
    for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for paragraph_index, paragraph in enumerate(document):
            parsed_paragraph = []
            for sentence_index, sentence in enumerate(paragraph):
                try:
                    parsed_paragraph.append(
                        list(
                            parser.raw_parse(
                                six.text_type(sentence.decode('utf-8')))))
                except UnicodeDecodeError:
                    logging.warning(
                        'Skip sentence {}-{}-{} for unicode error'.format(
                            index, paragraph_index, sentence_index))
                    y_vector[index].pop(sentence_index)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Save output
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
Example #5
def cStructure():
    print '######## C Structure'
    parser = StanfordParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse("Who were the CEO of IBM?")
#    example = parser.raw_parse("Steve Jobs was Founder of Apple. He was born in United States of America.")

    #for line in example:
        #for sentence in line:
            #sentence.draw()

    #print type(example)

    example = list(example)
    #print example
    abcabc = example[0]
    abcabc1 = abcabc[0]
    print type(abcabc)
    hello = str(abcabc)
    print type(abcabc)
    print hello
    #print abcabc1.label()

    for a in abcabc:
        #print a.height()
        if a.height() > 1:
            extractNP(a)


    print myNounPhrasesTree
Example #6
def check(sent):

    parser = StanfordParser()

    # Parse the example sentence

    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()
    try:
        subj = find_subject(t)
    except:
        subj = []
    try:
        pred = find_predicate(t)
    except:
        pred = []
    try:
        obj = find_object(t)
    except:
        obj = []

    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
Example #7
    def ConstituencyParser(sentence):

        from nltk.parse.stanford import StanfordParser
        # create parser object
        scp = StanfordParser(path_to_jar='/path/to/stanford-parser.jar', path_to_models_jar='path/to/stanford-parser-models.jar')
        # get parse tree
        result = list(scp.raw_parse(sentence))
Example #8
 def extract_h4_parser(self, sentence):
     list = []
     parser = StanfordParser(model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
     t = parser.raw_parse(str(sentence))
     for i in t:
         for j in i.subtrees(lambda i: i.height() == 4):
             list.append(str(j))
     return list
Example #9
def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar, path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.cur_tag(sent)  # need to write an interface for tokenized sentences (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple
    
    print parser.raw_parse(raw_sent).next()

    return
    # find named entities
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
    ####### my issue here: 1. don't know how to get the NP. 2. is there a quicker way to find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  ###add bash !!!!
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print head

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output(['bash',  ###add bash !!!!
                                   tregex_path,
                                   '-s',
                                   pattern,
                                   init_tree_file])
        print main_verbs
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs
Example #10
def parseSentence(inputSentence):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parsedSentence = parser.raw_parse(inputSentence)
    sent = printSentence(parsedSentence)
    ret = str(sent).replace("\n", "").replace('    ',
                                              "").replace("(", "{").replace(
                                                  ")", "}").replace(" {", "{")
    return ret
Example #11
def cn_parse(sent):
    """
    Parse a Chinese sentence; remember that model_path must be changed to the Chinese model.
    """
    parser = StanfordParser(
        'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    return list(parser.raw_parse(sent))[0]
Example #12
def en_parse(sent):
    """
    Parse an English sentence.
    """
    parser = StanfordParser(
        'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar'
    )
    return list(parser.raw_parse(sent))[0]
Example #13
def parseThisSent(sentences):
    parser = StanfordParser(path_to_models_jar=my_path_to_models_jar3,
                            path_to_jar=my_path_to_jar3)
    c = list(parser.raw_parse(sentences))
    B = c[0].copy()
    #get all  NPs and VPs
    phrasesList = []
    for s in B.subtrees(lambda B: B.label() == 'NP' or B.label() == 'VP'):
        phrasesList += [(" ".join(s.leaves()))]
    return phrasesList
Example #14
class SyntaxTreeParser:
    def __init__(self):
        self.parser = StanfordParser()
        if not self.parser:
            raise RuntimeError('Stanford Parser could not be initialized.')
    
    def raw_parse(self, sent):
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        one_sent = sent
        if len(sent[0]) == 1:
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
Example #15
def parse(text, normalize=True) : #ToDo: change behavior
    """Parses string, iterable of strings or nested iterables of strings"""

    # saves stanford_parser as global variable,
    # such that it is not recreated everytime parse is executed
    if not 'stanford_parser' in globals() :
        global stanford_parser
        stanford_parser = StanfordParser(conf.stanford_parser,conf.stanford_models)
    
    if hasattr(text, '__iter__') :
        return [parse(t) for t in text]
    else :
        if normalize:
            text = canonicalize(text)
        trees = stanford_parser.raw_parse(text)
    return trees
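
A brief usage sketch for the parse helper above. It assumes Python 2 (where plain strings do not expose __iter__, which is how the function tells a single sentence from a collection of sentences) and the project's conf module providing conf.stanford_parser and conf.stanford_models:

single = parse("The quick brown fox jumps over the lazy dog.")  # iterator of parse trees
batch = parse(["First placeholder sentence.", "Second placeholder sentence."])  # list of per-sentence results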
Example #16
class Parser(object):
    """
    Parse sentence structure
    """
    def __init__(self, jar_path, model_path):
        self.parser = StanfordParser(jar_path, model_path)
        self.dep_parser = StanfordDependencyParser(jar_path, model_path)

    def __call__(self, doc):
        doc['parse'] = ParentedTree.convert(self.parse(doc['text']))
        doc['dep_parse'] = self.dep_parse(doc['text'])

    def parse(self, statement):
        return next(self.parser.raw_parse(statement))

    def dep_parse(self, statement):
        return next(self.dep_parser.raw_parse(statement))
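
A short usage sketch for the callable Parser above, assuming local jar paths (placeholders) and a document dict with a 'text' key:

p = Parser('/opt/stanford-parser/stanford-parser.jar',         # placeholder jar path
           '/opt/stanford-parser/stanford-parser-models.jar')  # placeholder models jar path
doc = {'text': 'The quick brown fox jumps over the lazy dog.'}
p(doc)  # fills doc['parse'] with a ParentedTree and doc['dep_parse'] with a dependency graph
doc['parse'].pretty_print()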
Example #17
def average_parse_tree_height(doc):
    remove_url = clean_formula(doc)
    parser=StanfordParser()
    sentence = remove_url.replace(';','.').replace('?','.').replace('!','.').split('.')
    sentence = [item for item in sentence if item]
    sentence = filter(operator.methodcaller('strip'), sentence)
    depth = lambda L: isinstance(L, list) and (max(map(depth, L)) + 1) if L else 1
    total_level = 0
    total_count = 0
    for s in sentence:
        if len(s.split())< 20: 
            total_level += depth(list(parser.raw_parse(s)))
            total_count += 1
    if total_count >0:
        average = total_level / total_count
    else:
        average = 0
    return average
Example #18
class OldStanfordLibParser(Parser):
    """For StanfordParser < 3.6.0"""
    def __init__(self):
        self.parser = StanfordParser()

    def parse(self, line):
        """Returns tree objects from a sentence

        Args:
            line: Sentence to be parsed into a tree

        Returns:
            Tree object representing parsed sentence
            None if parse fails
        """
        tree = list(self.parser.raw_parse(line))[0]
        tree = tree[0]
        return tree
Example #19
File: parser.py Project: BioGeek/Lango
class OldStanfordLibParser(Parser):
    """For StanfordParser < 3.6.0"""

    def __init__(self):
        self.parser = StanfordParser()

    def parse(self, line):
        """Returns tree objects from a sentence

        Args:
            line: Sentence to be parsed into a tree

        Returns:
            Tree object representing parsed sentence
        """
        tree = list(self.parser.raw_parse(line))[0]
        tree = tree[0]
        return tree
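
A quick usage sketch covering both identical wrappers above. It assumes the Stanford parser jars are discoverable through the CLASSPATH and STANFORD_MODELS environment variables, which is how a bare StanfordParser() locates them; note that parse() drops the outer ROOT node and returns its first child, typically the S subtree:

p = OldStanfordLibParser()
tree = p.parse('The quick brown fox jumps over the lazy dog.')
print(tree.label())             # typically 'S'
print(' '.join(tree.leaves()))  # the tokenized sentence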
Example #20
def analyzing_sentence_structure(sentence):
    scp = StanfordParser(path_to_jar=LoadCommonSense.STF_PATH +
                         "stanford-parser.jar",
                         path_to_models_jar=LoadCommonSense.STF_PATH +
                         "stanford-parser-4.2.0-models.jar")
    sentence = " ".join(clean_text(sentence, False))
    try:
        result = list(scp.raw_parse(sentence))
    except BaseException:
        return {"subject": "", "verb": "", "object": ""}, sentence, 1
    tree_result = result[0].subtrees()
    sentence_json = {"subject": "", "verb": "", "object": ""}
    temp_object = []
    for each in tree_result:
        flag = 0
        tree_label = each.label()

        find_verb = re.findall(r"VB.*", tree_label)
        if tree_label == "NP" and flag != 1:
            flag += 1
            sentence_json["subject"] = each.leaves()
        elif find_verb:
            sentence_json["verb"] = each.leaves()
            break
        elif tree_label == 'ROOT':
            temp_object = each.leaves()

    if not sentence_json["subject"]:
        sentence_json["subject"] = ["i"]
        temp_object = sentence_json["subject"] + temp_object
        sentence_new = "i " + sentence
    else:
        sentence_new = sentence

    error = 0
    try:
        sentence_json["object"] = [
            word for word in temp_object
            if word not in (sentence_json["subject"] + sentence_json["verb"])
        ]
    except BaseException:
        error = 1

    return sentence_json, sentence_new, error
Example #21
def get_parse_tree(df):
    print('\ntraversing phrase...')

    # path setting for stanford parser
    # Windows version
    # java_path = r'C:\Program Files\Java\jdk1.8.0_151\bin'
    # os.environ['JAVAHOME'] = java_path
    # stanford_parser = StanfordParser(path_to_jar='c:/stanford-parser-full/stanford-parser.jar',
    #                       path_to_models_jar='c:/stanford-parser-full/stanford-parser-3.5.2-models.jar')

    # Linux version
    java_path = r'/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin'
    os.environ['JAVAHOME'] = java_path
    stanford_parser = StanfordParser(
        path_to_jar=
        '/home/akunaefi/PycharmProjects/StanfordParser/stanford-parser-full-2015-04-20/stanford-parser.jar',
        path_to_models_jar=
        '/home/akunaefi/PycharmProjects/StanfordParser/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar'
    )

    for index, words in enumerate(df['lemmatized_review']):
        # list_of_frases = []
        print('--------------------- ', index)
        print('review: ', words)
        # sent = text_cleaner(words)
        # df_clause.loc[index,'Review'] = sent
        parsed_sent = stanford_parser.raw_parse(words)
        tree = parsed_sent.__next__()
        list_of_nps = traverse_tree(tree, 'NP')
        num_of_np = len(list_of_nps)
        print('NOUN = ', num_of_np)
        list_of_vps = traverse_tree(tree, 'VP')
        num_of_vp = len(list_of_vps)
        print('VERB = ', num_of_vp)
        list_of_mds = traverse_tree(tree, 'MD')
        num_of_md = len(list_of_mds)
        print('MODAL = ', num_of_md)
        df.loc[index, 'num_np'] = num_of_np
        df.loc[index, 'num_vp'] = num_of_vp
        df.loc[index, 'num_md'] = num_of_md

    return df.copy()
Example #22
def average_number_of_subordinate_clauses_per_sentence(doc):
    remove_url = clean_formula(doc)
    parser=StanfordParser()
    sentence = remove_url.replace(';','.').replace('?','.').replace('!','.').split('.')
    sentence = [item for item in sentence if item]
    sentence = filter(operator.methodcaller('strip'), sentence)
    subtexts = []
    total_count = 0
    for s in sentence:
        if len(s.split())< 20: 
            t = list(parser.raw_parse(s))[0]
            total_count += 1
            for subtree in t.subtrees():
                if subtree.label()=="S" or subtree.label()=="SBAR":
                    subtexts.append(' '.join(subtree.leaves()))
    if total_count > 0:
        average = len(subtexts) / total_count
    else:
        average = 0
    return average
Example #23
class Stanford:
    def __init__(self):
        """ The Stanford Parser is required, download from http://nlp.stanford.edu/software/lex-parser.shtml and unpack somewhere """
        # insert path to java home
        if os.name == "nt":
            os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_66\bin\java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        elif os.name != "posix":
            os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        else:
            os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
            # insert path to the directory containing stanford-parser.ja and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

    def get_sent_depth(self, s):
        # remove linebreaks for syntax tree
        s = s.replace('\n', ' ').replace('\r', ' ')

        sentence = self.english_parser.raw_parse(s)
        current_tree = None
        depth = 0

        for line in sentence:
            current_tree = line
            depth = line.height() - 1

        sent_depth_feature_value = (depth - 4) / 20

        if sent_depth_feature_value < 0: return current_tree, 0
        if sent_depth_feature_value > 1: return current_tree, 1
        return current_tree, round(sent_depth_feature_value, 2)
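
The returned feature rescales the parse-tree depth into [0, 1]: depths of 4 or less map to 0 and depths of 24 or more map to 1. A minimal usage sketch, assuming the jars sit at one of the locations the constructor expects:

stanford = Stanford()
tree, depth_feature = stanford.get_sent_depth("The quick brown fox jumps over the lazy dog.")
print(depth_feature)  # a small value for a short, shallow sentence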
Example #24
def cStructure(user_input):
    #    print '######## C Structure ########'
    parser = StanfordParser(path_to_jar=path_to_jar,
                            path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse(user_input)
    example = list(example)
    #print example
    getTree = example[0]
    #    print getTree

    getTreeTwo = getTree[0]
    #    print getTreeTwo
    #    print type(getTree)
    treeToString = str(getTree)
    #    print type(treeToString)
    #    print treeToString
    #print abcabc1.label()

    for element in getTree:
        #print a.height()
        if element.height() > 1:
            extractPhrases(element)
Example #25
class Parser(object):

    def __init__(self, jar_path, model_path):
        self.parser = StanfordParser(jar_path, model_path)
        self.dep_parser = StanfordDependencyParser(jar_path, model_path)


    def __call__(self, doc):
        doc['parse'] = ParentedTree.convert(self.parse(doc['text']))
        doc['dep_parse'] = self.dep_parse(doc['text'])


    def parse(self, statement):
        return next(self.parser.raw_parse(statement))


    # (raw_parse) Use StanfordParser to parse a sentence. Takes a sentence as a string;
    # before parsing, it will be automatically tokenized and tagged by
    # the Stanford Parser.

    def dep_parse(self, statement):
        return next(self.dep_parser.raw_parse(statement))
Example #26
class Parser:
    def __init__(self):
        self.stanford_parser = StanfordParser(model_path=MODEL_PATH)

    def fill_in_the_blank(self, text):
        parse_tree = list(self.stanford_parser.raw_parse(text))[0]
        ans_list = self.leaves(parse_tree)
        with_blanks = text
        for ans in ans_list:
            for word in ans:
                with_blanks = with_blanks.replace(word, "_" * len(word))
        print with_blanks
        return (with_blanks, ans_list)

    def leaves(self, tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        answers = []
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            nnp_exists = list(
                subtree.subtrees(filter=lambda t: t.label() == 'NNP'))
            if nnp_exists:
                answers.append(subtree.leaves())
        return answers
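
A short usage sketch for the fill-in-the-blank helper above; MODEL_PATH is whatever englishPCFG path the surrounding module defines, and the sentence is a placeholder:

p = Parser()
blanked, answers = p.fill_in_the_blank("Barack Obama was born in Hawaii.")
# blanked -> the sentence with every word of an NNP-bearing noun phrase masked by underscores
# answers -> list of leaf lists, e.g. [['Barack', 'Obama'], ['Hawaii']]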
Example #27
def stanfordParser():
    # add the environment variables
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"

    # path to the Stanford Parser model
    parser = StanfordParser(model_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # input / output paths
    path_input = "ressources/TBAQ-txt-annot-StanfordParser/TimeBank-txt-annot/TimeBank/"    
    path_output = 'ressources/TBAQ-txt-annot-StanfordParser/TimeBank-txt-annot/TimeBank_StanfordParser/' 
     
    for filename in os.listdir(path_input):
        print(filename)
        file = codecs.open(path_input+filename, 'r', 'utf8').read()
        # replace line breaks at the start of the file -> they were preventing sentence tagging
        file = file.lstrip().replace('\r\n\r\n\r\n', ' ').replace('\r\n\r\n', ' ').replace('\r\n', ' ')
        # sentence tokenization
        sents = nltk.sent_tokenize(file)
        # empty string
        parsedText = ""
        # run the Stanford parser on each sentence
        for sent in sents:
            constituancies = list(parser.raw_parse(sent))
#            print(constituancies)
            # regexes to reshape the output into the format expected by addDiscourse
            constituancies = re.sub(r'''(\[)(Tree)(\()('ROOT')(\,)( )(\[)(Tree)''', '(S1 (S ', str(constituancies))
            constituancies = re.sub(r'''(((\,)( )(\[)(Tree))|((\,)( )(Tree)))''', ' ', str(constituancies))
            constituancies = re.sub(r'''(\[)((')|("))|((')|("))(\])''', "'", str(constituancies))
            constituancies = re.sub(r'''(\])''', ')', str(constituancies))
            constituancies = re.sub(r'''(')''', '', str(constituancies))
            # append the tagged sentences to the string
            parsedText += str(constituancies)
            
        # open the output files
        with open((path_output+filename), 'w', encoding='utf8') as fileW:
            # write the files
            fileW.write(parsedText)
Example #28
                            sentence = line.strip()
                        #    print sentence
                       else:
                           sentence = sentence + " " + line.strip()
                       l= words[-1]
                     #  print 'awesome' in 'wheather is awesome here dude'
                    #   print l

                       edus.append(line.strip())
                    #   print line.strip()
                      # print ".\""
                   #    print line.strip()[-2:]
                       if line.strip()[-1]=="." or line.strip()[-2:]==".\"":
                         #  print sentence
                        #   print edus
                           rootWord = _parse_output(english_parser.raw_parse(sentence),edus,dep)
                       #    print 'end'
                           sentence =None
                           edus = deque()
                     #  for sentence in sentences:
                      #    rootWord = _parse_output(english_parser.raw_parse(sentence))
                     #     dep.write(str(sentence).split())
                      #    dep.write("@#%^&*")
                      #    dep.write(str(rootWord))
                      #    dep.write("\n")
                      #    print i
                      #    i=i+1
                if sentence!=None:
                     rootWord = _parse_output(english_parser.raw_parse(sentence),edus,dep)
                       #    print 'end'
                     sentence =None
Example #29
File: nlquery.py Project: codejitsu/labr
class NLQueryEngine(LoggingInterface):
    """
    Grammar mapping for knowledge queries of the form:
    - What is the X of Y
    - What is X's Y
    """
    def __init__(self, properties={'lang': 'en'}):
        LoggingInterface.__init__(self)
        self.parser = StanfordParser(
            model_path=MODELS_PATHS[properties['lang']])
        self.wd = WikiData()
        self.wd.set_properties(properties)
        self.properties = properties

    def subject_query(self,
                      qtype,
                      subject,
                      action,
                      jj=None,
                      prop=None,
                      prop2=None,
                      prop3=None):
        """Transforms matched context into query parameters and performs query

        Args:
            qtype: Matched type of query (what, who, where, etc.)
            subject: Matched subject (Obama)
            action: Matched verb action (is, was, ran)
            jj (optional): Matched adverb
            prop (optional): Matched prop
            prop2 (optional): Matched prop
            prop3 (optional): Matched prop

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        if (self.properties['lang'] == 'en'):
            if jj == 'old':
                # How old is Obama?
                prop = 'age'

            if jj in ['tall', 'high']:
                # How tall is Yao Ming / Eifel tower?
                prop = 'height'
        elif (self.properties['lang'] == 'de'):
            if jj == 'alt':
                # Wie alt ist Obama?
                prop = 'age'

            if jj in ['hoch', 'groß']:
                # Wie hoch ist die Zugspitze?
                prop = 'height'

            if prop in ['sprache', 'sprachen']:
                # Welche Sprache spricht man in Sweden?
                prop = 'language official'

        if prop2:
            prop = prop + ' ' + prop2

        if prop3 and not prop:
            prop = prop3

        if not prop:
            if self.properties['lang'] == 'en' and action not in ['is', 'was']:
                prop = action
            elif self.properties['lang'] == 'de' and action not in [
                    'ist', 'sind', 'war', 'hat', 'wurde', 'bedeutet'
            ]:
                prop = action

        ans = self.get_property(qtype, subject, prop)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'subject': subject,
            'prop': prop,
        }
        return ans

    def get_prop_tuple(self,
                       prop=None,
                       value=None,
                       op=None,
                       value_units=None,
                       pp_t=None):
        """Returns a property tuple (prop, value, op). E.g. (population, 1000000, >)

        Args:
            prop (str): Property to search for (e.g. population)
            value (str): Value property should equal (e.g. 10000000)
            op (str): Operator for value of property (e.g. >)

        Returns:
            tuple: Property tuple, e.g: (population, 10000000, >)
        """

        self.info('Prop tuple: {0},{1},{2},{3},{4}', prop, value, op,
                  value_units, pp_t)

        if op in ['in', 'by', 'of', 'from']:
            oper = op
        elif op in ['over', 'above', 'more', 'greater']:
            oper = '>'
        elif op in ['under', 'below', 'less']:
            oper = '<'
        else:
            self.error('NO OP {0}', op)
            return None

        # Infer property to match value
        if prop is None:
            if value_units is not None:
                if value_units in ['people']:
                    prop = 'population'
                if not prop:
                    return None

        props = [(prop, value, oper)]

        if pp_t:
            prop_tuple = match_rules(pp_t,
                                     RULES[self.properties['lang']]['prop_rules'],
                                     self.get_prop_tuple)
            if not prop_tuple:
                return None
            props += prop_tuple

        return props

    def find_entity_query(self,
                          qtype,
                          inst,
                          prop_match_t=None,
                          prop_match2_t=None):
        """Transforms matched context into query parameters and performs query for
        queries to find entities

        Args:
            qtype (str): Matched type of query (what, who, where, etc.)
            inst (str): Matched instance of entity to match (Obama)
            action (str): Matched verb action (is, was, ran)
            prop_match_t (Tree): Matched property Tree
            prop_match2_t (Tree): Matched property Tree

        Returns:
            Answer: Answer from query, or empty Answer if None
        """

        props = []
        if prop_match_t:
            prop = match_rules(prop_match_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)

            if not prop:
                return

            props += prop

        if prop_match2_t:
            prop = match_rules(prop_match2_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)

            if not prop:
                return

            props += prop

        if not inst.isupper():
            inst = singularize(inst)

        ans = self.wd.find_entity(qtype, inst, props)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'inst': inst,
            'props': props,
        }
        return ans

    def get_property(self, qtype, subject, prop):
        """Gets property of a subject
        Example:
            get_property('who', 'Obama', 'wife') = 'Michelle Obama'

        Args:
            subject: Subject to get property of
            prop: Property to get of subject

        Todo:
            * Add other APIs here

        Returns:
            Answer: Answer from query
        """
        return self.wd.get_property(qtype, subject, prop)

    def preprocess(self, sent):
        """Preprocesses a query by adding punctuation"""
        if sent[-1] != '?':
            sent = sent + '?'
        return sent

    def cleanup(self, sent):
        """Remove some stop words"""
        stopwords = ['der', 'die', 'das', 'ein', 'eine', 'einen']
        words = sent.split()

        result = [word for word in words if word.lower() not in stopwords]

        return ' '.join(result)

    def query(self, sent, format_='plain'):
        """Answers a query

        If format is plain, will return the answer as a string
        If format is raw, will return the raw context of query

        Args:
            sent: Query sentence
            format_: Format of answer to return (Default to plain)

        Returns:
            dict: Answer context
            str: Answer as a string

        Raises:
            ValueError: If format_ is incorrect
        """

        sent = self.preprocess(sent)
        sent = self.cleanup(sent)
        tree = next(self.parser.raw_parse(sent))

        pos = [tag for word, tag in tree.pos()]

        if self.properties['lang'] == 'de':
            if len(set(['PWS', 'PWAV', 'PWAT']) & set(pos)) == 0:
                print("Tree before:")
                for e in tree:
                    print(str(e))

                sent = "Was ist " + sent
                tree = next(self.parser.raw_parse(sent))
        # TODO
        #elif self.properties['lang'] == 'en':
        #    if len(set(['WHNP']) & set(pos)) == 0:
        #        print("Tree before:")
        #        for e in tree:
        #            print(str(e))
        #
        #        sent = "What is " + sent
        #        tree = next(self.parser.raw_parse(sent))

        context = {'query': sent, 'tree': tree}

        for e in tree:
            print(str(e))

        ans = first([
            match_rules(tree,
                        RULES[self.properties['lang']]['find_entity_rules'],
                        self.find_entity_query),
            match_rules(tree,
                        RULES[self.properties['lang']]['subject_prop_rules'],
                        self.subject_query),
        ])

        print("-> " + str(ans))

        if not ans:
            ans = Answer()

        ans.query = sent
        ans.tree = str(tree)

        if format_ == 'raw':
            return ans.to_dict()
        elif format_ == 'plain':
            return ans.to_plain()
        else:
            raise ValueError('Undefined format: %s' % format_)
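
A minimal usage sketch for the query engine above; it assumes the surrounding project supplies MODELS_PATHS, RULES, WikiData and Answer, and that the WikiData backend is reachable:

engine = NLQueryEngine({'lang': 'en'})
print(engine.query('Who is the president of France?'))   # plain-text answer
print(engine.query('How old is Obama?', format_='raw'))  # raw answer context as a dict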
Example #30
class SentenceParser:
    __parser = None
    __alpha  = 1.0
    __beta   = 1.0
    __gamma  = 0.1

    __var_d  = 0.0
    __var_s  = 0.0

    def __init__(self):
        self.__parser = StanfordParser()
        self.__var_d  = 12.0/math.log(2.0)
        self.__var_s  = 4.0 * 1.0/math.log(2)

    def __parse_sent(self, sentence):
        result = self.__parser.raw_parse(sentence) 
        return result.next()

    def __obtain_nps(self, sentence):
        parse_tree = self.__parse_sent(sentence)
        nps = set()
        for phrase in parse_tree.subtrees():
            if phrase.label() != "NP": continue
            nps.add(' '.join(phrase.leaves()))

        sent_tokens = " ".join(parse_tree.leaves())
        
        #Get the smallest NPs
        nps_smallest = set()
        for np1 in nps:
            if all(np2 not in np1 for np2 in nps if np2 != np1): 
                nps_smallest.add(np1)
        return sent_tokens, nps_smallest

    def __gaussian_weight(self, distance, variance):
        return math.exp(-0.5 * (distance**2)/variance)

    def __weight_tokens(self, mid, nps, sentences, sent_id):
        st          = PorterStemmer()
        sent_target = sentences[sent_id]
        token_id    = [idx for idx, token in enumerate(sent_target.strip().split(" ")) if mid in token][0]

        sent_lengths= [len(s.split(" ")) for s in sentences]

        nps_base = {np:" ".join(st.stem(token) for token in np.split(" ")) for np in nps}
        nps_proc = {}

        for sent_idx, sent in enumerate(sentences):
            sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
            for np_ori, np in nps_base.iteritems():
                if np_ori not in nps_proc: nps_proc[np_ori] = {}

                if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                    #always update the info
                    if np not in sent_stem: 
                        continue
                    np_idx      = sent_stem.rindex(np)
                    np_token_idx= len(sent_target[:np_idx].strip().split(" "))
                    dist_start  = len(sent_stem[:np_idx].strip().split(" "))
                    dist_end    = len(sent_stem[np_idx+len(np):].strip().split(" "))

                    dist_sent   = abs(sent_idx - sent_id)
                    dist_token  = -1

                    if dist_sent == 0:
                        if mid in np_ori:
                            dist_token = 0
                        elif np_token_idx < token_id:
                            dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                        elif np_token_idx > token_id:
                            dist_token = np_token_idx - token_id - 1
                    elif sent_idx < sent_id: 
                        dist_token = dist_end + sum(sent_lengths[sent_idx+1:sent_id]) + token_id
                    elif sent_idx > sent_id:
                        dist_token = (len(sent_target.strip().split(" "))-1-token_id) + sum(sent_lengths[sent_id+1:sent_idx]) + dist_start

                    nps_proc[np_ori]["dist_sent"]  = dist_sent
                    nps_proc[np_ori]["dist_token"] = dist_token

                np_count = sent_stem.count(np)
                nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count

        nps_weight = {}
        for np, vals in nps_proc.iteritems():
            term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
            term2 = self.__beta  * self.__gaussian_weight(vals["dist_sent"],  self.__var_s)
            term3 = self.__gamma * vals["tf"]
            nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
        return nps_weight

    def obtain_nps_from_sentences(self, mid, text):
        lst_sentences = sent_tokenize(text)
        lst_sent_pr  = []
        set_nps      = set()

        sent_match_id= -1
        for sent_idx, sent in enumerate(lst_sentences):
            if sent_match_id == -1 and mid in sent: 
                sent_match_id = sent_idx

            sent_tokens, nps = self.__obtain_nps(sent)
            lst_sent_pr.append(sent_tokens)
            set_nps.update(nps)

        dct_nps_weight = self.__weight_tokens(mid, set_nps, lst_sent_pr, sent_match_id)
        return lst_sent_pr, dct_nps_weight
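
A usage sketch for the noun-phrase weighting above (Python 2, matching the iteritems calls in the class); mid is a mention string expected to occur in the text, and both arguments are placeholders:

sp = SentenceParser()
sentences, np_weights = sp.obtain_nps_from_sentences(
    "Einstein",
    "Einstein developed the theory of relativity. The theory changed physics.")
print np_weights  # maps each smallest noun phrase to its proximity/frequency weight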
Example #31
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
import en
import utils
sentences = utils.get_tokenized_sentences("data/set1/a1.txt")
parser=StanfordParser()

print len(sentences)
print len([ x for x in sentences if "is" in x])

[parser.raw_parse((x)) for x in sentences]
Example #32
def simplify(sent):
    from anytree import NodeMixin, Node, AnyNode, RenderTree
    from nltk.parse.stanford import StanfordParser

    def make_tree(tree, t, sent_list):
        #this fn. converts nltk tree to anytree
        if tree not in sent_list:
            ttt = AnyNode(id=str(tree.label()), parent=t)
            for tt in tree:
                make_tree(tt, ttt, sent_list)
        else:
            AnyNode(id=str(tree), parent=t)

    parser = StanfordParser()

    #SBAR CASE
    def find_sbar(t):
        if t.id == 'SBAR':
            global sbar
            sbar = t
        for tt in t.children:
            find_sbar(tt)

    def find_vp_in_sbar(t):
        if t.id == 'VP':
            global vp_sbar
            vp_sbar.append(t)
        for tt in t.children:
            find_vp_in_sbar(tt)

    def find_np_in_sbar(t):
        global f
        global ff
        if t.id == 'VP':
            ff = False
        if (t.id == 'NP') and f == True and ff == True:
            global np_sbar
            np_sbar = t
            f = False
        for tt in t.children:
            find_np_in_sbar(tt)

    def find_vp(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VP' and f == True:
            global vp
            vp = t
            f = False
        for tt in t.children:
            find_vp(tt)

    def find_np(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'NP' and f == True:
            global np
            np = t
            f = False
        for tt in t.children:
            find_np(tt)

    def find_vbz(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VBZ' and f == True:
            global vbz
            vbz = t.children[0].id
            f = False
        for tt in t.children:
            find_vbz(tt)

    def make_sent(t):
        global simple_sentences
        if t.id in sent_list:
            simple_sentences[-1].append(t.id)
        for tt in t.children:
            make_sent(tt)

    #sent=sent8

    parse_trees = parser.raw_parse(sent)
    global sent_list
    sent_list = [s for s in sent.split()]
    tree = next(parse_trees)[0]
    #tree.draw()
    t = AnyNode(id='ROOT')
    make_tree(tree, t, sent_list)
    global sbar
    sbar = t
    global vp_sbar
    global f
    global ff
    global np_sbar
    global vp
    global np
    global vbz
    vp_sbar = []
    vp = t
    np = t
    vbz = 'bn2'
    np_sbar = t
    find_sbar(t)
    find_vp_in_sbar(sbar)
    f = True
    ff = True
    find_np_in_sbar(sbar)
    f = True
    find_vp(t)
    f = True
    find_np(t)
    f = True
    find_vbz(t)
    global simple_sentences
    simple_sentences = []
    simple_sentences.append([])
    make_sent(np)
    make_sent(vp)
    for i in range(len(vp_sbar)):
        simple_sentences.append([])
        if np_sbar == t:
            make_sent(np)
        else:
            make_sent(np_sbar)
        if vbz != 'bn2':
            simple_sentences[-1].append(vbz)
        make_sent(vp_sbar[i])
    #print (simple_sentences)
    simple = []
    for sentence in simple_sentences:
        string = ''
        for word in sentence:
            string += word + ' '
        string += '.'
        simple.append(string)

    def is_any_sbar(t):
        if t.id == 'SBAR':
            global f
            f = True
            return
        for tt in t.children:
            is_any_sbar(tt)

    f = False
    is_any_sbar(t)
    if f == False:
        simple = [sent]
    return simple
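
A brief usage sketch for simplify above; it assumes the anytree package is installed and that StanfordParser() can locate the parser jars via the environment (the sentence is a placeholder):

for simple_sentence in simplify("The dog, which was barking loudly, chased the cat."):
    print(simple_sentence)
# when an SBAR clause is found, it is split out into its own simple sentence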
Example #33
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
Example #34
from nltk.tokenize import word_tokenize

import script_wrapper as stanford_parser 


sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."
st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

prev_tag_name = tags[0][1]
cur_entity = tags[0][0]
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        cur_entity = cur_entity + " " + cur_token
    else:
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
    prev_tag_name = cur_tag_name
del entities['O']
print(entities)

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar, path_to_models_jar=stanford_parser.stanford_model_jar)
print(parser.raw_parse("Dempsey was drafted by Major League Soccer club New England Revolution.").next())
Example #35
with open("example_article.txt") as f:
	tokenizer = PunktSentenceTokenizer()
	sentences = tokenizer.tokenize(f.read().decode('utf-8').replace("\n"," "))
	parser=StanfordParser()	

	print len(sentences)
	print len([ x for x in sentences if "is" in x])

	sentences[0] = "I am going to watch a movie in the evening."
	sentences[0] = "I have always wondered how I have always been so good on the guitar."
	sentences[0] =  "Our dinner has been eaten by the dog."
	sentences[0] = "Playing golf is my favorite pastime"
	sentences[0] = "He plays golf for a living"
	
	sentences[0] = sentences[0].rstrip('.')
	parseTree = list(parser.raw_parse((sentences[0])))
	print sentences[0] 
	
	# the parse tree for the entire sentence
	root = parseTree[0]
	print type(root)
	print root
	print root.pretty_print()
	print root.label()
	
	print ' '.join(root.leaves())

	posTags = {}
	posTags['phrases'] = ['ADJP','ADVP','CONJP','FRAG','INTJ','LST','NAC','NP','NX','PP','PRN','PRT','QP','RRC','UCP','VP','WHADJP','WHAVP','WHNP','WHPP','X','WHADVP']

Example #36
File: parse.py Project: radi9/python_talk
import os
import sys
from nltk.parse.stanford import StanfordParser

if __name__ == '__main__':
	if not os.environ.has_key('STANFORD_PARSE_CLASSPATH'):
		if not len(sys.argv) == 2:
			print 'no stanford parse folder identify'
			stanford_path = raw_input('please give stanford parse folder path : ')
	else:
		stanford_path = os.environ['STANFORD_PARSE_CLASSPATH']
	parser = StanfordParser(stanford_path+'/stanford-parser-3.5.1-models.jar',
							stanford_path+'/stanford-parser.jar')
	#sentence = 'A man previously convicted of harassing Yahoo CEO Marissa Mayer has been arrested by Austin police on suspicion of sending her sexually graphic emails, according to police records released on Friday.'
# sentence = "Type 2 diabetes (T2D) and Alzheimer's disease (AD) are two major health issues nowadays. T2D is an ever-increasing epidemic, affecting millions of elderly people worldwide, with major repercussions in the patients' daily life."
	#sentence = 'MiR-145 is reported to be significantly down-regulated in ovarian cancer.'
	#sentence = 'In this report, we find out that up-regulation of miR-145 in OVCAR-3 and SKOV-3 cells inhibit cell proliferation and promote cell apoptosis.'
	sentence = 'promoted the proliferation of ovarian cancer cells'
	parse_result = list(parser.raw_parse(sentence))
	print parse_result
	print 'print out sentence structure'
	print parse_result[0].draw()
Example #37
# -*- coding: utf-8 -*-
"""
Created on Sat May 13 01:29:33 2017

@author: DIP
"""

from nltk.parse.stanford import StanfordParser

sentence = 'The quick brown fox jumps over the lazy dog'

# create parser object
scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                   path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')


# get parse tree
result = list(scp.raw_parse(sentence)) 
tree = result[0]

# print the constituency parse tree
print(tree) 

# visualize constituency parse tree
tree.draw() 
Example #38
 def parse(self, sentence):
     """Set the parse tree property for the given sentence."""
     parser=StanfordParser(model_path=self.esp_model_path, path_to_models_jar=self.path_to_models_jar, path_to_jar=self.path_to_jar, encoding='utf8')
     self.parse_tree = parser.raw_parse(sentence)
     return self.parse_tree
Example #39
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)


for fileid in childStoryCorpus.fileids():

    print (fileid)
    file_path = os.path.join(childStoryCorpusDir, fileid)

    with open(file_path, 'r') as orgf:
        for line in orgf:
            for s in tokenize.sent_tokenize(line):
                print(s)
                #print(st.tag(tokenize.word_tokenize(s)))
                #print(st.tag(s.split()))
                print(list(parser.raw_parse(s)))

                # for line in parser.raw_parse(s):
                #     for sentence in line:
                #         sentence.draw()

    #s = robotStoryCorpus.sents(fileid))
    # for s in robotStoryCorpus.sents(fileid):
    #     print (s)
    #     sentences = parser.parse_sents(s)
    #
    #     for tree in sentences:
    #         list(tree)

        # for line in sentences:
        #     for sentence in line:
Example #40
File: converter.py Project: codejitsu/labr
class Converter(Dialog):
    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)

        self.inflect = inflect.engine()
        self.stemmer = SnowballStemmer('english')
        self.parser = StanfordParser(model_path=MODELS_PATH)

    def parse(self, text):
        parsed = self.parser.raw_parse(text)
        return list(parsed)

    def interpret(self, sents, **kwargs):
        measures = []
        confidence = 0
        results = dict()

        root = sents[0]

        if "WRB" in [tag for word, tag in root.pos()]:
            confidence += .2

            for clause in breadth_first(root, maxdepth=8):
                if isinstance(clause, Tree):
                    if clause.label() in ["S", "SQ", "WHNP"]:
                        for token, tag in clause.pos():
                            if tag in ["NN", "NNS"]:
                                measures.append(token)
                            elif tag in ["CD"]:
                                results["quantity"] = token

            measures = list(set([self.stemmer.stem(mnt) for mnt in measures]))

            if len(measures) == 2:
                confidence += .4
                results["src"] = measures[0]
                results["dst"] = measures[1]

                if results["src"] in self.metrics.keys():
                    confidence += .2
                    if results["dst"] in self.metrics[results["src"]]['Destination']:
                        confidence += .2

        return results, confidence, kwargs


    def convert(self, src, dst, quantity=1.0):
        src, dst = tuple(map(self.stemmer.stem, (src,dst)))

        if dst not in self.metrics:
            raise KeyError("cannot convert to '{}' units".format(src))
        if src not in self.metrics[dst]['Destination']:
            raise KeyError("cannot convert from {} to '{}'".format(src, dst))

        units = self.metrics.get(dst).get('Units')[
          self.metrics.get(dst).get('Destination').index(src)
        ]

        return units * float(quantity), src, dst

    def round(self, num):
        num = round(float(num), 4)
        if num.is_integer():
            return int(num)
        return num

    def pluralize(self, noun, num):
        return self.inflect.plural_noun(noun, num)

    def numericalize(self, amt):
        if amt > 100.0 and amt < 1e6:
            return humanize.intcomma(int(amt))
        if amt >= 1e6:
            return humanize.intword(int(amt))
        elif isinstance(amt, int) or amt.is_integer():
            return humanize.apnumber(int(amt))
        else:
            return humanize.fractional(amt)

    def respond(self, sents, confidence, **kwargs):
        if confidence < 0.5:
            return "Sorry, I don't know that one."

        try:
            quantity = sents.get('quantity', 1)
            amount, source, target = self.convert(**sents)

            amount = self.round(amount)
            quantity = self.round(quantity)

            source = self.pluralize(source, quantity)
            target = self.pluralize(target, amount)
            verb = self.inflect.plural_verb("is", amount)

            quantity = self.numericalize(quantity)
            amount = self.numericalize(amount)

            return "There {} {} {} in {} {}".format(
                verb, amount, target, quantity, source
            )
        except KeyError as e:
            return "I'm sorry I {}".format(str(e))
Example #41
    if thresh != -1 and p < thresh:
        return

    for i in range(0, len(r)):
        decoder(r[i], r.label(), k + 1)


dep_parser = StanfordParser(path_to_jar="./stanford-parser.jar",
                            path_to_models_jar="./stanford-models.jar")

load_model()

import sys
filename = sys.argv[1]
text = list(open(filename).readlines())
text = [s.strip() for s in text]

for i in range(len(text)):

    s1 = clean_str(text[i])
    if s1 == "":
        continue

    print 201
    print text[i]

    a = list(dep_parser.raw_parse(s1))
    for _ in range(200):
        decoder(a[0], "root", 0)
        print
Example #42
                        wordsSplit[f]=wordsClean
                        break

        ## choose tagger and tag sentence
        sentClean=str(wordsSplit)
        sentTagged=st.tag(sentClean)


        ## Feature 2: Completeness (capital word initial pos, punct. mark final)
        if wordsSplit[0][0].isupper() and (sent1.endswith(".") or sent1.endswith("!") or sent1.endswith("?")) :
                comp=1
        class_arrays.append(comp)        

        ## Feature 5: Complexity (Stanford): how deeply embedded is the sentence?
        ##parse sentence with the Stanford parser
        parse=list(parser.raw_parse(sentClean.decode("utf-8")))
        sentParse=str(parse).split(" ")
        for i in range(0, len(sentParse)):
                if "Tree('S'" in sentParse[i]:
                        complexity=complexity+1

        class_arrays.append(complexity)

        ## Feature 6: position of target word - is it at the end?
        sentWOPunc=sentClean.translate(string.maketrans("",""), string.punctuation)
        if sentWOPunc.endswith(target1):
                posT=1

        class_arrays.append(posT)

        ## Make sentence without punctuation and target word lower case 
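
The Feature 5 complexity count above works by string-matching "Tree('S'" in the repr of the parse; an equivalent, more direct way to count embedded S clauses with the tree API, sketched against the same parser and sentClean variables, is:

parse_tree = next(parser.raw_parse(sentClean.decode("utf-8")))
complexity = sum(1 for subtree in parse_tree.subtrees() if subtree.label() == 'S')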
Example #43
import os
import sys
import nltk
from nltk.parse.stanford import StanfordParser


f = open(sys.argv[1])

text = f.read()
text = text.decode('utf-8')


sents = nltk.sent_tokenize(text)

print sents

modelPath = 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'

parser = StanfordParser(model_path = modelPath)


for s in sents:
	print list(parser.raw_parse(s))




Example #44
@author: DIP
"""

sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.parse.stanford import StanfordParser

scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                   path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

result = list(scp.raw_parse(sentence))
print result[0]

result[0].draw()

import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

training_set = treebank.parsed_sents()

print training_set[1]

# extract the productions for all annotated training sentences
treebank_productions = list(
    set(production for sent in training_set for production in sent.productions())
Example #45
File: rnn.py Project: giahy2507/rnn
    with open("data/rt-polarity.neg.txt",mode="r") as f:
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())

    with open("data/rt-polarity.pos.txt",mode="r") as f:
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())


    trees = []
    labels = [0]*3 + [1]*3
    sents = pos_sent + neg_sent
    for sent in sents:
        a = list(parser.raw_parse(sent))
        hytree = a[0]
        chomsky_normal_form(hytree)
        trees.append(hytree[0])

    rnn = RecursiveNeuralNetworl(embsize=300,mnb_size=6,wordvector=wordvector)

    trees[0].pretty_print()

    for tree,label in zip(trees,labels):
        root_node, softmax_layer, cost, pred = rnn.forward(tree,label)
        print("correct {0}, predict {1}, cost {2}".format(label,pred,cost))



Example #46
st_ner = StanfordNERTagger(model_filename=stanford_ner_model,
                           path_to_jar=stanford_ner_jar)
#print st_ner.tag('Rami Eid is studying at Stony Brook University in New York'.split())
print st_ner.tag(
    "Gandalf deduces Sauron will attack Gondor 's capital Minas Tirith , riding there with Pippin?"
    .split())

print "========= Checking PARSER ========="
stanford_parser = 'stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser + "englishPCFG.caseless.ser.gz"
stanford_parser_model = stanford_parser + 'stanford-parser-3.5.2-models.jar'
stanford_parser_jar = stanford_parser + 'stanford-parser.jar'
st_parser = StanfordParser(model_path=eng_model_path,
                           path_to_models_jar=stanford_parser_model,
                           path_to_jar=stanford_parser_jar)
parser_result = (st_parser.raw_parse(
    'Rami Eid is studying at Stony Brook University in Los Angeles'))

for S in parser_result:
    if S[0][0].label() == 'NP' and S[0][1].label() == 'VP':
        subject_words = S[0][0].leaves()
        print subject_words
        print st_ner.tag(subject_words)
    '''
        if type(node) is nltk.Tree:
            if node.label() == ROOT:
                print "======== Sentence ========="
                print "Sentence:", " ".join(node.leaves())
            else:
                print "Label:", node.label()
                print "Leaves:", node.leaves()
from nltk.tokenize.stanford import StanfordTokenizer

'''
from practnlptools.tools import Annotator

#Testing the abilities of nltk
testString = "I made a poop in my pants"

#part of speech tagging
print(nltk.pos_tag(nltk.word_tokenize(testString)))

print(nltk.__version__)

annotator=Annotator()
notes = annotator.getAnnotations("There are people dying make this world a better place for you and for me.")
print notes['syntax_tree']
'''

import os
from nltk.parse.stanford import StanfordParser
parserPath = "C:\Users\Roger Liu\Desktop\NLP-Final-Project\Libraries\stanford-parser-full-2015-04-20"
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_40/bin'  #or your java path
os.environ['CLASSPATH'] = parserPath + '/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = parserPath + '/stanford-parser-3.5.2-models.jar'  
# I am using version 3.5.2 because apparently it is the more stable version, you should replace 3.5.2 with whatever version you're using

sentence = "Stanford parser is slow"
parser=StanfordParser()
print list(parser.raw_parse(sentence))
print "what"