Example #1
def parse_sentence(sentence):
    parser = StanfordDependencyParser(path_to_jar=PATH_TO_JAR, path_to_models_jar=PATH_TO_MODELS)
    trees = list(parser.parse(sentence))
    if not trees:
        return None
    parsed_tree = trees[0]
    return list(parsed_tree.triples())
Example #2
File: Parse.py  Project: kbroten14/Picard
 def get_parse_tree(self, tagged_sent):
     tree = []
     eng_parser = StanfordDependencyParser(PARSER_PATH, ENGLISH_MODELS_PATH)
     fr_parser = StanfordDependencyParser(PARSER_PATH, FRENCH_MODELS_PATH)
     if self.src_lang == 'eng':
         tree = fr_parser.tagged_parse(tagged_sent)
     elif self.src_lang == 'fr':
         tree = eng_parser.tagged_parse(tagged_sent)
     return tree
Example #3
def NLTKparserfordependancies(sentence):

    path_to_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar'
    path_to_models_jar = '/home/jalaj/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    result = dependency_parser.raw_parse(sentence)
    dep = result.next()
    print "\n------Dependencies------\n"
    print list(dep.triples())
Example #4
def sentToTriples(sent):
    #returns a list of triples
    sent = ''.join([i if i.isalpha() else ' ' for i in sent])
    eng_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/englishPCFG.ser.gz")

    parsed = eng_parser.parse(sent.split())
    result = list(parsed)
    #print parsed;

    #   for row in result[0].triples():
    #       print(row[0]);
    return result[0].triples()
Example #5
def lambda_function(event, context):
    #STANFORD

    from nltk.parse.stanford import StanfordDependencyParser
    path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
    path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    result = dependency_parser.raw_parse(event)
    dep = result.next()
    a = list(dep.triples())
    #print a
    #print len(a)
    a = get_b_q(a)
    make_graph(a[0], a[1])
Example #6
    def set_dependency_parser(self, config):
        if isinstance(config, dict):
            helpers.cond_print("Dependency Parser: " + config["name"],
                               self.verbose)
            self.dependency_parser = config["name"]
            if config["name"] == "spacy":
                """
                    Sets the model and returns the Spacy NLP instance. Example ways from the Spacy docs:
                    spacy.load("en") # shortcut link
                    spacy.load("en_core_web_sm") # package
                    spacy.load("/path/to/en") # unicode path
                    spacy.load(Path("/path/to/en")) # pathlib Path
                """
                self.dependency_parser_instance = spacy.load(config["model"])

            elif config["name"] == "corenlp":
                if 'CLASSPATH' not in os.environ:
                    os.environ['CLASSPATH'] = ""

                cpath = config["model"] + os.pathsep + config["parser"]
                if cpath not in os.environ['CLASSPATH']:
                    os.environ['CLASSPATH'] = cpath + os.pathsep + os.environ[
                        'CLASSPATH']

                # TODO:- DEPRECATED
                self.dependency_parser_instance = StanfordDependencyParser(
                    path_to_models_jar=config["model"], encoding='utf8')
            elif config["name"] == "corenlp-server":
                # Requires the CoreNLPServer running in the background at the below URL (generally http://localhost:9000)
                # Start server by running the following command in the JARs directory.
                # `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000`
                self.dependency_parser_instance = CoreNLPDependencyParser(
                    url=config["url"])
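For reference, a minimal sketch of the "corenlp-server" branch above, assuming a CoreNLP server has already been started at http://localhost:9000 with the java command shown in the comment (the sentence is made up):

from nltk.parse.corenlp import CoreNLPDependencyParser

# Connect to the already-running server rather than spawning a JVM per call.
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
print(list(parse.triples()))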
Example #7
    def __init__(self, url, testrun):
        """Initialize the ShallowPipeline.

        Args:
            url (String)       The Solr URL for the collection
            testrun (Boolean)  True if it is a test run, False if need
                               to index full corpus
        """
        self.solr = index.SolrSearch(url)
        self.testrun = testrun
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.tagger = PerceptronTagger()
        self.dep_parser = StanfordDependencyParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            java_options=u'-mx4g')
Example #8
class DependencyParser():
    def __init__(self):

        path2jar = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
        path2model = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
        self.dep_parser = StanfordDependencyParser(path_to_jar=path2jar, path_to_models_jar=path2model, java_options='-mx100g')

    def parse_sents(self, sents):
        """
        Parameters:
        sents: list of string

        Returns: list of list of triples
        """
        parsed_sents = self.dep_parser.raw_parse_sents(sents)
        return [[list(parse.triples()) for parse in parsed_sent]for parsed_sent in parsed_sents]

    def get_SVOM(self, sents):
        parsed_sents = self.parse_sents(sents)
        output=[]
        for sent in parsed_sents:
            tmp={'V':('<empty>','<empty>'), 'S':('<empty>','<empty>'),
                    'O':('<empty>','<empty>'), 'M':('<empty>','<empty>')}
            for triple in sent[0]:
                t1, t2, t3 = triple[0], triple[1], triple[2]
                if t2[0:5]=='nsubj' and t1[1][0]=='V':
                    if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1
                    if tmp['S'][0]=='<empty>': tmp['S']=t3
                elif t2=='nsubj' and t1[1][0] in 'VJNP':
                    if tmp['O'][0]=='<empty>': tmp['O']=t1
                    if tmp['S'][0]=='<empty>': tmp['S']=t3
                elif t2=='cop':
                    if tmp['O'][0]=='<empty>': tmp['O']=t1
                    if tmp['V'][0]=='<empty>': tmp['V']=t3
                elif t2=='dobj':
                    if tmp['V'][0]=='<empty>': tmp['V']=t1
                    if tmp['O'][0]=='<empty>': tmp['O']=t3
                elif t2=='ccomp' or t2=='iobj' or t2=='pobj' or t2=='xcomp':
                    #if tmp['S'][0]=='<empty>':
                       # tmp['S']=t3
                    if tmp['M'][0]=='<empty>':
                        tmp['M']=t3
                elif t2 == 'auxpass':
                    if tmp['V'][0]=='<empty>': tmp['V']=t1
                    if tmp['S'][0]!='<empty>':
                        tmp['O']=tmp['S']
                        tmp['S']=('<empty>','<empty>')
                #elif t2[0:3] == 'acl':
                #    if tmp['S'][0]=='<empty>': tmp['S']=t1
                elif t2[0:4] == 'nmod':
                   # if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1
                    if tmp['O'][0]=='<empty>': tmp['O']=t3
                elif t2 == 'dep':
                    if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['M']=t1
                #elif t2 == 'xcomp':
                 #   if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['S']=t1
                else:
                    continue
            output.append([tmp['S'], tmp['V'], tmp['O'], tmp['M']])
        return output, parsed_sents
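A short usage sketch of the class above, assuming the hard-coded CoreNLP JAR paths exist on the machine (the sentence is made up): parse_sents returns one list of (governor, relation, dependent) triples per input sentence, and get_SVOM additionally reduces each sentence to rough [S, V, O, M] slots.

parser = DependencyParser()
triples = parser.parse_sents(['The cat chased the mouse.'])
print(triples[0][0])   # dependency triples of the first parse of the first sentence
svom, _ = parser.get_SVOM(['The cat chased the mouse.'])
print(svom[0])         # [S, V, O, M] tuples filled in from the triples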
Example #9
class MongoConnection:

    java_path = r"C:\Program Files (x86)\Java\jdk1.8.0_111\bin\java.exe"
    os.environ['JAVAHOME'] = java_path

    MONGO_CONNECTION_STRING = "mongodb://127.0.0.1:27017/"
    REVIEWS_DATABASE = "Dataset_Challenge"
    TAGS_DATABASE = "Tags"
    REVIEWS_COLLECTION = "Reviews"
    BUSINESS_COLLECTION = "Business"
    CORPUS_COLLECTION = "Corpus"

    reviews_collection = MongoClient(
        MONGO_CONNECTION_STRING)[REVIEWS_DATABASE][REVIEWS_COLLECTION]
    business_collection = MongoClient(
        MONGO_CONNECTION_STRING)[REVIEWS_DATABASE][BUSINESS_COLLECTION]

    path_to_jar = r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar'
    path_to_models_jar = r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-english-corenlp-2016-10-31-models.jar'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    st = StanfordNERTagger(
        r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-ner-2015-12-09\stanford-ner-2015-12-09\classifiers\english.all.3class.distsim.crf.ser.gz',
        r'D:\Masters\Fall 2016\iNLP\Final Project\stanford-ner-2015-12-09\stanford-ner-2015-12-09\stanford-ner.jar'
    )
Example #10
def parser():
	os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
	os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

	eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',java_options="-mx2048m")
	for x in content:
		a = list(eng_parser.parse(x.split()))[0]
		print(a)
		# a.draw()

	eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
	for x in content:
		a = list(eng_dep_parser.parse(x.split()))[0]
		for row in a.triples():
			print(row)
Example #11
    def dependency_graphs_pairs(self):
        # Part of Dataset
        from fowler.corpora.wsd.datasets import tag_mappings

        df = self.read_file()
        parser = StanfordDependencyParser()

        def parse(string):
            dg = next(parser.raw_parse(string))

            for node in dg.nodes.values():
                if not node['address']:
                    continue

                node['original_tag'] = node['tag']
                node['tag'] = tag_mappings[self.tagset][node['tag'][0]]

            return dg

        for _, row in df.iterrows():
            yield (
                parse(row['#1 String']),
                parse(row['#2 String']),
                row['Quality'],
                row['split'],
            )
Example #12
File: Evaluate.py  Project: mwalker9/Alvin
	def __init__(self):
		self.data = None
		self.rules = []
		self.tree = None
		self.nodeList = []
		self.landmarks = []
		self.s = None
		self.t = None
		self.dependencies = []
		self.rebuiltDependencies = []
		self.minPath = []
		self.metaPath = []
		self.minPathLength = 999
		self.path = '.\InspirationSet\Paths.txt'
		self.ruleList = []
		self.rulePath = '.\InspirationSet\Rules.txt'
		self.learnedPaths = self.parsePaths(self.path)		
		self.pathCountsPath = '.\InspirationSet\PathCounts.txt'
		f = open(self.pathCountsPath,'r')
		self.trainingPathCounts = cPickle.load(f)
		self.pathCounts = np.zeros(len(self.learnedPaths))
		
		# load in rules
		f = open(self.rulePath, 'r')
		self.knownRules = cPickle.load(f)
		f.close()
		
		# dependency parsers to build parse tree
		#os.environ['JAVA_HOME'] = 'C:/Program Files (x86)/Java/jre1.8.0_65/bin/java.exe'
		self.path_to_jar = 'stanford-parser-full-2015-12-09/stanford-parser.jar'
		self.path_to_models_jar = 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
		self.dependencyParser = StanfordDependencyParser(path_to_jar=self.path_to_jar, path_to_models_jar=self.path_to_models_jar)
Example #13
  def __init__(self):
    self.dep_parser = StanfordDependencyParser(model_path=MODEL_PATH)
    self.dep_parser.java_options = '-mx3052m'

    self.dependency_tool = DependencyTool()

    self.nodes = list()
Example #14
    def build_dict(self, key_name):
        from nltk.parse.stanford import StanfordDependencyParser
        core = '/Users/fengwf/stanford/stanford-corenlp-3.7.0.jar'
        model = '/Users/fengwf/stanford/english-models.jar'
        self.parser = StanfordDependencyParser(path_to_jar=core,
                                               path_to_models_jar=model,
                                               encoding='utf8',
                                               java_options='-mx2000m')
        print('Loading data ...')
        data = pickle.load(open('RecipeDatasets/all_mm_recipes.pkl'))
        objs = {}
        adjs = {}
        vbds = {}
        all_sents = []
        print('Processing %s ...' % key_name)
        #ipdb.set_trace()
        for i in tqdm(xrange(len(data))):
            text = data[i]
            sents = [transform_digits(i.lower()) for i in text[key_name]]
            try:
                if key_name == 'Steps':
                    self.parse_steps(sents, all_sents)
                else:
                    self.parse_ingredients(sents, all_sents)
            except AssertionError:
                continue
            except KeyboardInterrupt:
                break
            except:
                continue

        if key_name == 'Steps':
            with open('RecipeDatasets/steps_dependency.pkl', 'w') as f:
                print('\n Saving file ...')
                pickle.dump(all_sents, f)
                print(' Success!\n')
        else:
            with open('RecipeDatasets/obj_dict.pkl', 'w') as f:
                print('\n Saving file ...')
                pickle.dump(
                    {
                        'objs': objs,
                        'adjs': adjs,
                        'vbds': vbds,
                        'all_sents': all_sents
                    }, f)
                print(' Success!\n')
Example #15
def getObj(sentence, verb):
    parser = StanfordDependencyParser()
    lemmatizer = WordNetLemmatizer()
    dependency_tree = [
        list(line.triples()) for line in parser.raw_parse(sentence)
    ]
    dependencies = dependency_tree[0]
    verbLemma = lemmatizer.lemmatize(verb, wordnet.VERB)
    obj = ""
    for dep in dependencies:
        if "VB" in dep[0][1]:
            depVerbLemma = lemmatizer.lemmatize(dep[0][0], wordnet.VERB)
            if (verbLemma == depVerbLemma
                    and ("obj" in dep[1] or "nsubjpass" in dep[1])):
                obj = dep[2][0]  #lemmatize the noun

    return lemmatizer.lemmatize(obj, wordnet.NOUN)
Example #16
    def __init__(self, path_to_parsers):

        self.dependency_parser = StanfordDependencyParser(
            '%s/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar' %
            path_to_parsers,
            '%s/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
            % path_to_parsers)
        self.ner_tagger = StanfordNERTagger(
            '%s/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz'
            % path_to_parsers,
            '%s/stanford-ner-2018-02-27/stanford-ner.jar' % path_to_parsers,
            encoding='utf-8')
        ''' set label values '''

        self.NO_SOURCES = 0  # no sources
        self.NO_REAL_ATTRIBUTION = 1  # unnamed source is no real attribution
        self.REAL_ATTRIBUTION = 2  # real attribution is named the quoted organization, activist, or source
Example #17
 def define_stanford_dependency_parser(self,path_to_models_jar=
                                            '/Library/Tools/stanford/stanford-corenlp-full/' \
                                            'stanford-chinese-corenlp-2017-06-09-models.jar',
                                            model_path=
                                            u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'):
     _stanford_dependency_parser = StanfordDependencyParser(
         path_to_models_jar=path_to_models_jar, model_path=model_path)
     return _stanford_dependency_parser
Example #18
def moreMoney(dep,doc,pattern,unknown):
	import os 
	os.getcwd()
	import numpy as np
	import pandas as pd
	import spacy
	from . import formula
	nlp = spacy.load('en_core_web_sm')
	from difflib import SequenceMatcher
	import re
	path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar'
	path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar'

	jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar'
	model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger'
	import nltk
	import pprint
	pp = pprint.PrettyPrinter(indent=4)
	from nltk import word_tokenize
	from nltk.corpus import stopwords
	from nltk.parse.corenlp import CoreNLPParser
	from nltk.tag import StanfordNERTagger
	from nltk.parse.stanford import StanfordParser
	from nltk.parse.stanford import StanfordDependencyParser
	from nltk.stem import PorterStemmer
	from nltk.tokenize import sent_tokenize
	from nltk.tag import StanfordPOSTagger
	
	dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
        #print "moreMoney"
	q_dep=[]
	pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
	ratios=[]
	answer=""
	for ent in doc.ents:
		ratios=[]
		if(ent.label_=="MONEY" or ent.label_=="CARDINAL"):
	
			q_dep=[]
			for triple in dep.triples():
				pq=clean(ent.text)
				com=clean(triple[0][0])
				com1=clean(triple[2][0])
				if(com==pq or com1==pq):
					money=pq
					q_dep.append(triple)
			q_dep = str(q_dep)
			#print "###################"
			#print q_dep
			for i in range(len(pattern)):
				m=SequenceMatcher(None,pattern["pattern"][i],q_dep)
				q=m.ratio()
				ratios.append(q)
			mx=max(ratios)
			#print len(ratios)
			ino=ratios.index(mx)
			#print ino
			answer=pattern["tag"][ino]
Example #19
def main(fb_path, mid2key_path, data_dir, out_dir):
    HAS_DEP = False
    if HAS_DEP:
        dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") # Set CLASSPATH and STANFORD_MODELS environment variables beforehand
    kb = load_ndjson(fb_path, return_type='dict')
    mid2key = load_json(mid2key_path)
    all_split_questions = []
    split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json', 'factoid_webqa/test.json']
    files = [os.path.join(data_dir, x) for x in split]
    missing_mid2key = []

    for f in files:
        data_type = os.path.basename(f).split('.')[0]
        num_unanswerable = 0
        all_questions = []
        data = load_json(f)
        for q in data:
            questions = {}
            questions['answers'] = q['answers']
            questions['entities'] = q['entities']
            questions['qText'] = q['qText']
            questions['qId'] = q['qId']
            questions['freebaseKey'] = q['freebaseKey']
            questions['freebaseKeyCands'] = [q['freebaseKey']]
            for x in q['freebaseMids']:
                if x['mid'] in mid2key:
                    fbkey = mid2key[x['mid']]
                    if fbkey != q['freebaseKey']:
                        questions['freebaseKeyCands'].append(fbkey)
                else:
                    missing_mid2key.append(x['mid'])

            qtext = tokenize(q['qText'])
            if HAS_DEP:
                qw = list(set(qtext).intersection(question_word_list))
                question_word = qw[0] if len(qw) > 0 else ''
                topic_ent = q['freebaseKey']
                dep_path = extract_dep_feature(dep_parser, ' '.join(qtext), topic_ent, question_word)
            else:
                dep_path = []
            questions['dep_path'] = dep_path
            all_questions.append(questions)

            if not q['freebaseKey'] in kb:
                num_unanswerable += 1
                continue
            cand_ans = fetch_ans_cands(kb[q['freebaseKey']])
            norm_cand_ans = set([normalize_answer(x) for x in cand_ans])
            norm_gold_ans = [normalize_answer(x) for x in q['answers']]
            # Check if we can find the gold answer from the candidate answers.
            if len(norm_cand_ans.intersection(norm_gold_ans)) == 0:
                num_unanswerable += 1
                continue
        all_split_questions.append(all_questions)
        print('{} set: Num of unanswerable questions: {}'.format(data_type, num_unanswerable))

    for i, each in enumerate(all_split_questions):
        dump_ndjson(each, os.path.join(out_dir, split[i].split('/')[-1]))
Example #20
 def __init__(self):
     path_to_model_tagger = "../lib/stanford/stanford-postagger-full-2016-10-31/models/english-caseless-left3words-distsim.tagger"
     path_to_jar_tagger = "../lib/stanford/stanford-postagger-full-2016-10-31/stanford-postagger.jar"
     NLParser.tagger = StanfordPOSTagger(path_to_model_tagger, path_to_jar_tagger)
     NLParser.tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
     NLParser.parser = StanfordDependencyParser(
         path_to_jar='../lib/stanford/stanford-parser-full-2016-10-31/stanford-parser.jar')
     print "Parser Initialized........."
     NLParser.parser.raw_parse(self.sentence)
Example #21
def format(sentence, jar_location):
    path_to_jar = jar_location + '/stanford-parser.jar'
    path_to_models_jar = jar_location + '/stanford-parser-3.9.2-models.jar'

    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    tokens = word_tokenize(sentence)
    result = dependency_parser.raw_parse(sentence)

    for dep in result:
        # print(dep.tree())
        cf = CanvasFrame()
        t = dep.tree()
        tc = TreeWidget(cf.canvas(), t)
        cf.add_widget(tc, 10, 10)
        cf.print_to_file('tree.ps')
        cf.destroy()
        return (dep, tokens)
Example #22
    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools'][
            'stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')
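The constructor above only reads the 'common_tools' block of the JSON config file; a hypothetical sketch of that structure (every path and value below is invented for illustration), written here as the equivalent Python dict:

xmie_config_sketch = {
    'common_tools': {
        'stanford_url': 'http://localhost',
        'stanford_port': 9000,
        'stanford_pos_model': '/opt/stanford/postagger/models/english-bidirectional-distsim.tagger',
        'stanford_pos_jar': '/opt/stanford/postagger/stanford-postagger.jar',
        'stanford_parser_model': '/opt/stanford/parser/stanford-parser-3.9.2-models.jar',
        'stanford_parser_jar': '/opt/stanford/parser/stanford-parser.jar',
    }
}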
Example #23
    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events
Example #24
def dependencies(): # create the files - articles passed through the Stanford parser / dependency parsing
    (filenameDep, inputDependencies) = createId()
    
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"
    path_parser = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser.jar"
    path_model = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar"
    dependency_parser = StanfordDependencyParser(path_to_jar = path_parser, path_to_models_jar = path_model)

    texts_dependencies = {}
    for i in range(len(inputDependencies)):
        parsedText = ""
        dependencies = dependency_parser.parse_sents(inputDependencies[i])
        
        for dep in dependencies:
            for d in dep:
                parsedText += str(d)
        texts_dependencies[filenameDep[i]] = parsedText
    return texts_dependencies
Example #25
class feature_cal():
    def __init__(self, text_collector):
        # wn.ensure_loaded()
        self.text_collector = text_collector
        self.dep_parser = StanfordDependencyParser(
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar',
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar',
            model_path=
            '/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz')
        self.tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
        self.nlp = spacy.load("en_core_web_sm")

    def get_feature(self, words_list, wn):

        raw_words_list = [
            self.tokenizer.tokenize(word)[0] for word in words_list
        ]

        fea_num_letter = [len(word) for word in raw_words_list]
        fea_start_capital = [word.istitle() for word in raw_words_list]
        fea_capital_only = [word.isupper() for word in raw_words_list]
        fea_have_num = [
            True if re.match(r'[+-]?\d+$', word) else False
            for word in raw_words_list
        ]
        fea_abbre = [
            word.isupper() and len(word) >= 2 for word in raw_words_list
        ]
        fea_entity_critical = cal_entity_critical(self.nlp, words_list)

        # use nlp method

        doc = self.nlp()
        res = self.dep_parser.parse(words_list)
        deps = res.__next__()
        traverse(deps, 0)  # 0 is always the root node
        fea_domi_nodes = []
        for i in range(1, len(words_list) + 1):
            this_dominate = cal_dominate(deps, i)
            fea_domi_nodes.append(this_dominate)

        fea_max_d = cal_max_d(deps, len(words_list))

        fea_idf = cal_idf(self.text_collector, raw_words_list)
        if len(fea_max_d) != len(fea_have_num):
            print('length error')
        # fea_num_wordnet = [len(wn.synsets(word)) for word in raw_words_list]
        fea_complexity = [
            textstat.flesch_kincaid_grade(str(word)) for word in words_list
        ]
        return [
            fea_num_letter, fea_start_capital, fea_capital_only, fea_have_num,
            fea_abbre, fea_entity_critical, fea_domi_nodes, fea_max_d, fea_idf,
            fea_complexity
        ]
Example #26
def getDepenParser():
    path_to_jar = '../../data/stanford/stanford-parser.jar'
    path_to_models_jar = '../../data/stanford/stanford-parser-3.5.2-models.jar'
    # path_to_models_jar = '../../data/standord/stanford-chinese-corenlp-2016-01-19-models.jar'
    model_path = '../../data/stanford/chinesePCFG.ser.gz'
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar,
        path_to_models_jar=path_to_models_jar,
        model_path=model_path)

    return dependency_parser
Example #27
def dependencyParsing(sentence_list):
    
    parsing = []
    sentences_parse = []
    path_to_jar = './StanfordParser/stanford-parser.jar'
    path_to_models_jar = './StanfordParser/stanford-parser-3.9.1-models.jar'
    dependency_parser = StanfordDependencyParser(path_to_jar, path_to_models_jar)
    
    for sent in sentence_list:
        try: 
            result = dependency_parser.raw_parse(sent)
            dep = result.next()        
            for triple in dep.triples():
                #print triple[1],"(",triple[0][0],", ",triple[2][0],")"
                parsing.append(triple[0][0]+'_'+triple[2][0])
            
        except:
            pass
        sentences_parse.append(' '.join(parsing))
    return sentences_parse
Example #28
	def __init__(self):
		
		# print "Inside ntlk util"
		self.constituent_parse_tree = StanfordParser()
		self.stanford_dependency = StanfordDependencyParser()
		self.lemma = WordNetLemmatizer()
		self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd'
		self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
		self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
		self.CharacterOffsetEnd = 0 
		self.CharacterOffsetBegin = 0
		self.contractions = {"'nt":"not", "'ll": " will", "'re":"are", "'ve":"have", "'m":"am"}
Example #29
    def __init__(self):

        # user needs to download the Stanford Parser, NER and POS tagger from the Stanford website
        self.constituent_parse_tree = StanfordParser(
        )  # user needs to set it as an environment variable
        self.stanford_dependency = StanfordDependencyParser(
        )  # user needs to set it as an environment variable
        self.lemma = WordNetLemmatizer()
        self.home = '/home/ramesh'
        #user needs to download stanford packages and change directory
        self.ner = StanfordNERTagger(
            self.home +
            '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
            self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
        self.pos_tag = StanfordPOSTagger(
            self.home +
            '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',
            self.home +
            '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
        self.CharacterOffsetEnd = 0
        self.CharacterOffsetBegin = 0
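The comments above assume the Stanford tools can be located through environment variables, since StanfordParser() and StanfordDependencyParser() are constructed with no JAR paths. A hedged sketch of that setup (the unpack location is an assumption, not part of the original example; compare Example #32 later in this list):

import os

# Hypothetical directory where the Stanford parser archive was unpacked.
stanford_dir = '/home/ramesh/stanford-parser-full-2017-06-09'
os.environ['STANFORD_PARSER'] = stanford_dir   # lets NLTK find stanford-parser.jar
os.environ['STANFORD_MODELS'] = stanford_dir   # lets NLTK find the *-models.jar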
Example #30
 def test_dependency_parse(self):
     sent = [
         'First', 'In', 'the', 'Beckman', 'Ti45', '1', 'hr', '35K', 'and',
         'second', 'PEG', 'ppt', 'using', '1111', '11111111'
         'PEG', '6K', '0.5', 'M', 'NaCl', 'Yamamoto', '1970', 'Virology',
         'aswdf', 'asdf'
     ]
     dep_p = StanfordDependencyParser(
         path_to_jar=cfg.STANFORD_PARSER_JAR,
         path_to_models_jar=cfg.STANFORD_PARSER_MODEL_JAR)
     dep = DepGraphFeatures(dep_p)
     dep.dep_parser.raw_parse(" ".join(sent))
Example #31
def stanfordDP(sentence, displayTree=0, showTriples=0):
    '''Stanford dependency parsing. Set displayTree=1 to display the dependency tree.
    '''
    #print(repr(sentence),'\n')
    parser = StanfordDependencyParser()
    res = list(parser.parse(sentence.split()))

    #print(res[0].tree(),'\n')
    #print(*res[0].tree(),'\n')

    rels = [rel for rel in res[0].triples()]
    if (showTriples != 0):
        for row in res[0].triples():
            print(row)

    if (displayTree != 0):
        for row in res[0].tree():
            #print(row)
            if type(row) is not str:
                #row.draw()
                display(row)
    return rels
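A one-line usage sketch of the function above (the sentence is made up; the parser JARs still have to be discoverable through CLASSPATH or the Stanford environment variables, since the constructor is called with no paths):

rels = stanfordDP('The quick brown fox jumps over the lazy dog', showTriples=1)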
Example #32
def createQuery(question):
    import os
    from nltk.parse.stanford import StanfordDependencyParser
    os.environ[
        'STANFORD_PARSER'] = r'C:\Users\pramod\Desktop\CodeWeek\nlp\stanford-parser-full-2018-10-17'
    os.environ[
        'STANFORD_MODELS'] = r'C:\Users\pramod\Desktop\CodeWeek\nlp\stanford-parser-full-2018-10-17'
    os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_151\bin'
    dep_parser = StanfordDependencyParser(
        model_path=
        r"C:\Users\pramod\Desktop\CodeWeek\nlp\en_ewt_models\edu\stanford\nlp\models\lexparser\englishPCFG.ser.gz"
    )
    parsedata = list(dep_parser.raw_parse(question))
    strees = list(parsedata[0].tree())
    string = '\"%s\"' % strees[0].flatten().label()
    for t in strees[1:]:
        string = string + '\"%s\"' % (' '.join(t.flatten().leaves()) + ' ' +
                                      t.flatten().label())
        print('\"%s\"' % t.flatten().label(),
              '\"%s\"' % ' '.join(t.flatten().leaves()))
    print(string)
    return string
Example #33
def findDependencies_batched(sentences):
    #try :
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    results = dependency_parser.raw_parse_sents(sentences)
    results = list(results)
    if (len(results) != len(sentences)):
        print("#######WARNINING: Len(results) != Len(sentences) - ",
              len(results), len(sentences))

#except :
    #       print("Error in parsing the tree")
    # exit(-1)

    all_pos_tagging = []
    all_roots = []
    all_dependencyList = []
    all_Words = []

    for parsetree in results:
        pos_tagging, roots, dependencyList, Words = findDependencies(
            list(parsetree)[0])
        all_pos_tagging.append(pos_tagging)
        all_roots.append(roots)
        all_dependencyList.append(dependencyList)
        all_Words.append(Words)

    if len(all_pos_tagging) != len(sentences):
        print("#####WARNINING: Len(all_pos_tagging) < Len(sentences) - ",
              len(all_pos_tagging), len(sentences))

    while (len(all_pos_tagging) < len(sentences)):
        all_pos_tagging.append([])
        all_roots.append([])
        all_dependencyList.append([])
        all_Words.append([])

    return all_pos_tagging, all_roots, all_dependencyList, all_Words
Example #34
        def decorated(*args, **kwargs):
            try:
                dep = StanfordDependencyParser(
                    path_to_jar=path_to_jar,
                    path_to_models_jar=path_to_models_jar,
                    java_options="-mx3000m")
            except (pickle.UnpicklingError, EOFError, FileNotFoundError,
                    TypeError, LookupError):
                print("Downloading Stanford Parser ...")
                url = "https://nlp.stanford.edu/software/stanford-parser-full-2017-06-09.zip"
                r = requests.get(url, stream=True)
                total_size = int(r.headers.get('content-length', 0))
                block_size = 1024
                pbar = tqdm(r.iter_content(chunk_size=block_size),
                            total=total_size,
                            unit_divisor=1024,
                            unit='B',
                            unit_scale=True)
                with io.BytesIO() as buf:
                    for chunk in pbar:
                        buf.write(chunk)
                        buf.flush()
                        pbar.update(block_size)

                    buf.seek(0, 0)

                    z = zipfile.ZipFile(buf)
                    dirpath = os.path.dirname(os.path.dirname(path_to_jar))
                    z.extractall(dirpath)
                    z.close()

                dep = StanfordDependencyParser(
                    path_to_jar=path_to_jar,
                    path_to_models_jar=path_to_models_jar,
                    java_options="-mx3000m")

            kwargs['dep_parser'] = dep
            return fn(*args, **kwargs)
Example #35
def init_parsers():
    print("initializing parsers...")
    spacy_parser = spacy.load('en')
    path_to_jar = './stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
    path_to_models_jar = './stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
    dep_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    tree_parser = StanfordParser(path_to_jar=path_to_jar,
                                 path_to_models_jar=path_to_models_jar)
    annotators = "tokenize, ssplit, pos, lemma, ner, parse, dcoref"
    options = {}
    nlp = StanfordCoreNLP(annotators=annotators, options=options)
    verb_cats_json = json.load(open("verb_cats.json", "r"))
    return nlp, spacy_parser, dep_parser, tree_parser, verb_cats_json
Example #36
    def workflow_resources(self):
        corpus_encoding = self.task_config["CORPUS_ENCODING"]
        stanford_dependency_model_path = self.task_config["STANFORD_DEPENDENCY_MODEL_PATH"]
        stanford_corenlp_models_path = self.task_config["STANFORD_CORENLP_MODELS_PATH"]

        dependency_parser = StanfordDependencyParser(
            stanford_dependency_model_path, stanford_corenlp_models_path, encoding=corpus_encoding
        )

        workflow_resources = {
            "dependency_parser": dependency_parser
        }

        return workflow_resources
Example #37
def get_links(queries):
    os.environ['CLASSPATH']="/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser.jar:"
    os.environ['CLASSPATH'] += "/infolab/node4/lukuang/Stanford/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
    parser=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    
    links = {}

    for day in queries:
        links[day] = {}
        print "Process day %s" %(day)
        for qid in queries[day]:
            print "\tProcess query %s" %(qid)
            query_text = queries[day][qid]
            # print query_text
            triples = [list(parse.triples()) for parse in parser.raw_parse(query_text)][0]
            # print triples
            query_links = []
            for t in triples:
                a_link = "%s %s" %(procss_unit(t[0][0]),procss_unit(t[2][0]))
                query_links.append(a_link)
                # print "add link %s to query %s" %(a_link,qid)
            links[day][qid] = query_links
    return links
Example #38
	if not G.has_key(root):
		return 0
	nodeList = []
	for child in G[root]:
		node = getLCA(G, child, e1, e2)
		if node != 0:
			nodeList.append(node)
	if len(nodeList) > 1:
		return root
	elif len(nodeList) == 1:
		return nodeList[0]
	else:
		return 0

if __name__ == '__main__':
	dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

	entityfile = open("../output/entity.txt")
	entityDict = {}
	for entity in entityfile.readlines():
		label, instance = entity.strip().split('\t')
		entityDict[label] = instance

	#outfile = open("../output/cmput690w16a2_Xu.tsv", "w")
	outfile = open("../output/raw_relation.tsv", "w")
	doc_count = 826
	#doc_count = 1
	#reg = r'((PERSON)(\d)+)|(LOCATION(\d)+)|(ORGANIZATION(\d)+)'
	reg = r'PERSON\d+|LOCATION\d+|ORGANIZATION\d+'
	regex = re.compile(reg)
	noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])
Example #39
File: parse.py  Project: ruchir594/pokego
import time

'''
millis1 = int(round(time.time() * 1000))
#MALT

from nltk.parse import malt
mp = malt.MaltParser('../lib/maltparser-1.9.0', '../lib/engmalt.linear-1.7.mco')
print mp.parse_one('I shot an elephant in my pajamas .'.split()).tree()

millis2 = int(round(time.time() * 1000))
print millis2-millis1'''
millis2 = int(round(time.time() * 1000))
#STANFORD

from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.6.0-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

result = dependency_parser.raw_parse('I shot an elephant in my sleep')
dep = result.next()
a = list(dep.triples())

print a
print a[0]
print a[0][0]
print a[0][0][0]

millis3 = int(round(time.time() * 1000))
print millis3-millis2


Example #40
File: stanford.py  Project: aadah/nlp_proj
 def __init__(self):
     self.parser = StanfordDependencyParser(path_to_jar=config.STANFORD_PARSER_JAR,
                                            path_to_models_jar=config.STANFORD_PARSER_MODEL)
Example #41
File: stanford.py  Project: aadah/nlp_proj
class DepParser:
    def __init__(self):
        self.parser = StanfordDependencyParser(path_to_jar=config.STANFORD_PARSER_JAR,
                                               path_to_models_jar=config.STANFORD_PARSER_MODEL)

    def get_entity_pairs(self, text):
        pairs = []
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            pairs.extend(self._get_entity_pairs(sent))
        return pairs
        
    def _get_entity_pairs(self, sent):
        #words = nltk.word_tokenize(sent)
        relations = [list(parse.triples()) for parse in self.parser.raw_parse(sent)]
        """
        print '***RELATIONS***'
        for r in relations[0]:
            print r
        """
        nnp_relations = self.filter_for_NNP(relations)

        print '***ONLY NAMED ENTITIES***'
        for r in nnp_relations:
            print r

        pairs = self.build_relation_pairs(nnp_relations, sent)
        return pairs

    def build_compound_dict(self, relations, words):
        compound_dict = collections.defaultdict(list)
        # works on the assumption that there are usually not many shared last names
        # so we can use the last name as the anchor for a compound NNP
        in_progress = False
        current = ''
        for r in relations:
            if r[1] == 'compound':
                # To prevent "Taipei, Taiwan" from being considered a compound entity
                if r[0][0] in words and words[words.index(r[0][0]) - 1] == ',':                    
                    continue
                if r[2][0] in TITLES:
                    continue
                current = r[0]
                compound_dict[r[0]].append(r[2][0])
                in_progress = True
            elif in_progress:
                in_progress = False
                if current[1] != 'NNS':
                    # We want to keep NNS entities because the compound modifiers preceding them
                    # could be important, but we don't want them being a part of set of named entities
                    compound_dict[current].append(current[0])
                current = ''
        # To catch ending compound entities
        if in_progress:
            if current[1] != 'NNS':                
                compound_dict[current].append(current[0])
        return compound_dict

    def normalize(self, entity, compound_dict):
        if entity in compound_dict:
            return ' '.join(compound_dict[entity])
        if type(entity) is tuple:
            entity = entity[0]
        return entity

    def build_relation_dict(self, relations, words):
        relation_dict = collections.defaultdict(set)
        related = set()
        for r in relations:
            if r[1] == 'compound' and r[0][0] in words:
                i = words.index(r[0][0])
                if words[i-1] == ',':
                    relation_dict[r[0]].add(r[2])
                    relation_dict[r[2]].add(r[0])
                continue
            #if r[1] in KEY_RELATIONS:
            relation_dict[r[0]].add(r[2])
            relation_dict[r[2]].add(r[0])
            related.add(r[2])
        return relation_dict

    def build_relation_pairs(self, relations, sent):
        pairs = set()
        words = nltk.word_tokenize(sent)
        relation_dict = self.build_relation_dict(relations, words)
        compound_dict = self.build_compound_dict(relations, words)
        subj = self.get_subj(relations)
        subj_norm = self.normalize(subj,compound_dict)
        obj = self.get_obj(relations)
        obj_norm = self.normalize(obj,compound_dict)
        print 'SUBJECT', subj_norm
        print 'OBJECT', obj_norm
        for entity in relation_dict:
            if not self.is_NNP(entity) or entity in STOP_ENTITIES:
                continue
            if subj and subj != entity:
                pairs.add((self.normalize(entity,compound_dict),subj_norm))
                pairs.add((subj_norm,self.normalize(entity,compound_dict)))
            if obj and obj != entity:
                pairs.add((self.normalize(entity,compound_dict),obj_norm))
                pairs.add((obj_norm,self.normalize(entity,compound_dict)))
            for one_deg_sep in relation_dict[entity]:
                if self.is_NNP(one_deg_sep):
                    if entity == one_deg_sep:
                        continue
                    pairs.add((self.normalize(entity,compound_dict),
                               self.normalize(one_deg_sep,compound_dict)))
                for two_deg_sep in relation_dict[one_deg_sep]:
                    if self.is_NNP(two_deg_sep):
                        if entity == two_deg_sep:
                            continue
                        pairs.add((self.normalize(entity,compound_dict),
                                   self.normalize(two_deg_sep,compound_dict)))
        return pairs

    def is_NNP(self, ent):
        return ent[1] in ['NNP','NNPS','NNS']

    def filter_for_NNP(self, relations):
        return [r for r in relations[0] if self.is_NNP(r[0]) or self.is_NNP(r[2])]

    def get_subj(self, relations):
        for r in relations:
            if 'subj' in r[1] or r[1] == 'agent':
                subj = r[2]
                if self.is_NNP(r[2]):
                    return r[2]
                for r in relations:
                    if r[0] == subj and self.is_NNP(r[2]):
                        return r[2]
    def get_obj(self, relations):
        for r in relations:
            if 'obj' in r[1]:
                obj = r[2]
                if self.is_NNP(r[2]):
                    return r[2]
                for r in relations:
                    if r[0] == obj and self.is_NNP(r[2]):
                        return r[2]
Example #42
# (truncated: a spaCy snippet showing each token's dependency type (token.dep_)
#  with its left and right children (token.lefts / token.rights) was cut off
#  in the original excerpt)

# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path
                                             
from nltk.parse.stanford import StanfordDependencyParser
sdp = StanfordDependencyParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')    
result = list(sdp.raw_parse(sentence))  

result[0]

[item for item in result[0].triples()]

dep_tree = [parse.tree() for parse in result][0]
print dep_tree
dep_tree.draw()

# generation of annotated dependency tree shown in Figure 3-4
from graphviz import Source
dep_tree_dot_repr = [parse for parse in result][0].to_dot()
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source.view()
Example #43
	def get_dependency_tree(self):

		sentence = if_then_parsing(self.text)
		self.logic_text = sentence
		#path_to_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser.jar'
		#path_to_models_jar = '/Users/jane_C/Documents/CMU/Courses/10701-MachineLearning/project/KnowledgeLearning/lib/stanford-parser/stanford-parser-3.5.2-models.jar'

		path_to_jar = '../lib/stanford-parser/stanford-parser.jar'
		path_to_models_jar = '../lib/stanford-parser/stanford-parser-3.5.2-models.jar'
		
		dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

		sentence_parse = dependency_parser.raw_parse(sentence)

		tokenList = []
		tokenInfo = {}
		tokenInfo["content"] = "ROOT"
		tokenInfo["pos"] = "ROOT"
		tokenInfo["head"] = -1
		tokenInfo["children"] = []
		tokenInfo["if_then"] = -1
		root = Token(0, tokenInfo)
		tokenList.append(root)

		left2right = True
		left2right_point = -1
		index = 0
		for sent in sentence_parse:
			sent_conll = sent.to_conll(10)
			tokens = sent_conll.split("\n")
			index = 0
			for term in tokens:
				index += 1
				tokenInfo = {}
				parse = term.strip().split("\t")
				if term == "" or len(parse) < 10:
					continue
				if parse[1] == ">" or parse[1] == "<":
					if parse[1] == "<":
						left2right = False
					left2right_point = index
					#continue
				tokenInfo["content"] = parse[1]
				tokenInfo["pos"] = parse[4]
				tokenInfo["head"] = int(parse[6])
				tokenInfo["children"] = []
				tokenInfo["if_then"] = 0
				t = Token(index, tokenInfo)
				tokenList.append(t)

		if left2right:
			for i in range(left2right_point, len(tokenList)):
				tokenList[i].if_then = 1
		else:
			for i in range(1, left2right_point):
				tokenList[i].if_then = 1
		tokenList[left2right_point].if_then = -1
		for i in range(1, len(tokenList)):
			token = tokenList[i]
			tokenList[token.head].children.append(i)

		self.tokens = tokenList
Example #44
def write_dependency_rule_by_line(file_name):
	from nltk.parse.stanford import StanfordDependencyParser
	jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser.jar'
	models_jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
	dependency_parser = StanfordDependencyParser(path_to_jar = jar, path_to_models_jar = models_jar, java_options='-mx3000m')

	all_relations = read_data_utf8(file_name)

	print( 'len of all relations: %d' % (len(all_relations)) )
	sentences = []
	lineno = 0
	line_interval = []
	for idx, relation in enumerate(all_relations):
		_from = lineno

		lines = []
		sent = []
		if '.' in relation['Arg1']['Lemma']:
			for word in relation['Arg1']['Lemma']:
				if word == '.':
					lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
					sent = []
				else:
					sent.append(word)
			lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
		else:
			lines.append(' '.join(relation['Arg1']['Lemma']).encode('utf8').replace('\xc2\xa0', ''))
		
		_to = _from + len(lines)

		sentences += lines
		lines = []
		sent = []
		if '.' in relation['Arg2']['Lemma']:
			for word in relation['Arg2']['Lemma']:
				if word == '.':
					lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
					sent = []
				else:
					sent.append(word)
			lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
		else:
			lines.append(' '.join(relation['Arg2']['Lemma']).encode('utf8').replace('\xc2\xa0', ''))

		_to += len(lines)
		sentences += lines
		lineno = _to
		line_interval.append( (_from, _to ) )
	pass
	for idx, pair in enumerate(line_interval):
		print( '(%d:%d)' % (pair[0],pair[1]) )
		for i in range(pair[0],pair[1]):
			print( '%d:%s' % (i,sentences[i]) )
	
	print( 'len of sentences: %d' % ( len(sentences) ) )

	line_interval_idx = 0
	count = 0
	'''
		each result corresponds to a sentence
		a line_interval [from, to)
	'''
	relation_length = len(all_relations)
	all_part = 5
	for part in range(all_part+1):
		_from = part * (relation_length / all_part) # inclusive
		if _from >= relation_length:
			break
		_to = min( (part+1) * (relation_length / all_part) -1, relation_length - 1 ) # inclusive
		print('part %d' % part)
		print('relation %d' % (_to - _from+1))

		to_parse_sentences = sentences[ line_interval[_from][0] : line_interval[_to][1] ]
		print('line of sentences %d' % ( len(to_parse_sentences) ) )

		start = time.time()
		parse_result = dependency_parser.raw_parse_sents(to_parse_sentences)
		end = time.time()
		print( 'cost %f' % (end - start) )

		dep_rule_list = []
		dep_rule_for_one_relation = []
		acutal_result_no = 0
		for result in parse_result:
			acutal_result_no += 1
			for t in result:
				for node in range(len(t.nodes)):
					if t.nodes[node]['word'] == None or t.nodes[node]['deps'].items() == []:
						continue
					else:
						dep_rule_for_one_relation.append( '%s<-%s' % \
							(t.nodes[node]['word'],	' '.join( [ key for key, val in t.nodes[node]['deps'].items() ] )))	
			if count == line_interval[line_interval_idx][1] - 1:
				print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1])
				line_interval_idx += 1
				dep_rule_list.append(dep_rule_for_one_relation)
				dep_rule_for_one_relation = []
			
			count += 1
		print 'actual parse result no : %d' % acutal_result_no
		# last relation
		#print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1])
		#line_interval_idx += 1
		#dep_rule_list.append(dep_rule_for_one_relation)

		write_data = []
		for dep_rules in dep_rule_list:
			write_data.append( '||'.join([rule for rule in dep_rules] ) )

		print('length of  write_data %d' % len(write_data))
		with codecs.open('tmp/dep_rule_%s_part%d.txt'% (file_name, part), 'w', encoding = 'utf-8') as file:
			file.write( u'\n'.join(write_data) )
	pass#for part in range(all_part) end
Example #45
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
Example #46
File: Evaluate.py  Project: mwalker9/Alvin
class Evaluator(object):
	
	def __init__(self):
		self.data = None
		self.rules = []
		self.tree = None
		self.nodeList = []
		self.landmarks = []
		self.s = None
		self.t = None
		self.dependencies = []
		self.rebuiltDependencies = []
		self.minPath = []
		self.metaPath = []
		self.minPathLength = 999
		self.path = '.\InspirationSet\Paths.txt'
		self.ruleList = []
		self.rulePath = '.\InspirationSet\Rules.txt'
		self.learnedPaths = self.parsePaths(self.path)		
		self.pathCountsPath = '.\InspirationSet\PathCounts.txt'
		f = open(self.pathCountsPath,'r')
		self.trainingPathCounts = cPickle.load(f)
		self.pathCounts = np.zeros(len(self.learnedPaths))
		
		# load in rules
		f = open(self.rulePath, 'r')
		self.knownRules = cPickle.load(f)
		f.close()
		
		# dependency parsers to build parse tree
		#os.environ['JAVA_HOME'] = 'C:/Program Files (x86)/Java/jre1.8.0_65/bin/java.exe'
		self.path_to_jar = 'stanford-parser-full-2015-12-09/stanford-parser.jar'
		self.path_to_models_jar = 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
		self.dependencyParser = StanfordDependencyParser(path_to_jar=self.path_to_jar, path_to_models_jar=self.path_to_models_jar)
		
	# evaluates the line
	def evaluateLine(self, line):
		
		# clear previous data
		self.ruleList = []
		
		self.processLine(line)
		
		#for i in self.dependencies:
		#	print i
		
		# reset the  path count numbers
		self.pathCounts = np.zeros(len(self.learnedPaths))
		
		for path in self.learnedPaths:
			#print path
			self.parseRules(path)
			
		score = (self.pathCounts * self.trainingPathCounts).sum()
			
		# upload known rules
		# observe that we do not need to upload these rules. They were never stored to memory
		f = open(self.rulePath, 'r')
		knownRules = cPickle.load(f)
		f.close()
		
		for i in self.ruleList:
			if i in self.knownRules:
				#print i
				score += 100
	
		return score
		
	# builds and modifies the dependencies
	def processLine(self, line):
		# first derive the tree
		result = self.dependencyParser.raw_parse(line)
		dependencies = result.next()
		self.dependencies = list(dependencies.triples())
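		# each triple has the form ((governor_word, governor_tag), relation, (dependent_word, dependent_tag))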
				
		# build the tree
		self.buildTrees(self.dependencies)
		
		# now combine compounds
		self.combineCompounds()
		self.prependAdjectiveWrapper()
		try:
			self.unificationWrapper()
		except Exception:
			print 'unification crashed!'

		# creates the new list of dependencies
		self.treeToDependencies()
		
		#for i in self.dependencies:
		#	print i
			
	# creates the list of dependencies from the tree
	def treeToDependencies(self):
	
		self.rebuiltDependencies = []
		
		# start at root and move down
		self.nodeToTuple(self.tree.root)
		
		self.dependencies = self.rebuiltDependencies
		
	# appends a (governor, relation, dependent) tuple for each child of the node
	def nodeToTuple(self, Node):
	
		if len(Node.children) == 0:
			# we are done with this node
			return
			
		# create governor values
		g = (Node.value, Node.type)
	
		# one triple per child, then recurse into the child
		for child in Node.children:
			
			r = child.edge.relationship
			d = (child.value, child.type)
			self.rebuiltDependencies.append((g, r, d))
			self.nodeToTuple(child)
		
	def parsePaths(self, rulesPath):
	
		paths = []
		
		f = open(rulesPath, 'r')
		
		eof = False
		
		while not eof:
			
			try:
				path = cPickle.load(f)
				if path not in paths:
					paths.append(path)
			except EOFError:
				eof = True
				
		f.close()
		
		return paths
		
	# loads raw text from a file and splits it into cleaned sentences
	def parseData(self, path):
		f = open(path, 'r')
		text = f.read()
		f.close()
		
		# strip out hyperlinks and bracketed references
		procText = ''
		ignore = False
		punctuation = ['.', ',', ';', '-', "'"]
		for i in text:
			if (i.isalnum() or i.isspace() or i in punctuation) and not ignore:
				procText += i
			# need to ignore references
			if i == '[' or i =='(':
				ignore = True
			elif i == ']' or i == ')':
				ignore = False

		text = procText.split('. ')
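		# note: splitting on '. ' is a naive sentence boundary and can over-split on abbreviations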
		
		data = []
		for line in text:
			# a double newline marks a break between sentences
			line = line.split('\n\n')
			for sent in line:
				sent = sent.replace('\n', '')
				if sent != '':
					data.append(sent)
		
		return data
		
	def createTree(self, dependencies):
		
		# find the root first
		idx, root = self.findRoot(dependencies)

		# build the tree	
		self.tree = Tree.Tree(root, dependencies, idx)
		self.tree.buildTree()
		
	def findRoot(self, dependencies):
		# finds the root by locating a governor that never appears as a dependent
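		# e.g. in "the fox jumps", 'jumps' governs the other words but never appears as a dependent, so it is the root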
		for i, (g1, r1, d1) in enumerate(dependencies):
			isDependent = False
			for (g2, r2, d2) in dependencies:
				if g1[0] == d2[0]:
					isDependent = True
					
			if not isDependent:
				return i, g1[0]
				
	def textToRules(self, rawText):
		valuations = []
		# 4-step process
		#	1. Convert raw text to a dependency graph
		#	2. Convert the dependency graph to a CFG
		#	3. Extract valuations
		#	4. Convert valuations to first-order logic
		
		# 1. Convert raw text to dependency graph
		# http://stackoverflow.com/questions/7443330/how-do-i-do-dependency-parsing-in-nltk/33808164#33808164
		#	First parse text into atomic dependencies		
		result = self.dependencyParser.raw_parse(rawText)
		# take the first parse, a dependency graph
		dependencies = result.next()
		self.dependencies = list(dependencies.triples())
		
		#return valuations, dependencyList
		
		#print dependencyList
		self.buildTrees(self.dependencies)
		
		self.combineCompounds()
		self.prependAdjectiveWrapper()
		
		# creates the new list of dependencies
		self.treeToDependencies()
		
		# a series of operations that join common regions of the graph;
		# these common combinations could be learned from training data
		self.parseRules(self.dependencies)
	
		#self.rootParse(dependencyList)
		
		# Extract valuations
		#valuations = self.extractVerbs(dependencyList)
		
	# combines all compounds	
	def combineCompounds(self):
		
		# the final compound will take the POS tag of the parent 
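		# e.g. compound children 'Stony' and 'Brook' under 'University' collapse into the single node 'Stony_Brook_University'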
		self.addCompound(self.tree.root)
		
	# the node absorbs the values of children linked by 'compound' relations
	def addCompound(self, Node):
		
		if len(Node.children) == 0:
			# nothing to do here
			return
			
		popL = []
		s = ''
		for i,child in enumerate(Node.children):
			
			# check to see if it is a compound
			if child.edge.relationship == 'compound':
				s += child.value + '_'
				popL.append(i)
				
			else:
				self.addCompound(child)
				
		popL.reverse()
		
		# remove compound children
		for i in popL:
			Node.children.pop(i)
			
		# give the node its full name
		Node.value = s + Node.value
		
	# prepends adjectives
	def prependAdjectiveWrapper(self):
		
		self.prependAdjective(self.tree.root)
	
	# prepends each adjective (JJ) child's value to its parent node, then removes the child
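	# e.g. an adjective child 'lazy' under 'dog' folds into the single node 'lazy_dog'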
	def prependAdjective(self, Node):
		if len(Node.children) == 0:
			# nothing to do here
			return
			
		popL = []
		s = ''
		for i,child in enumerate(Node.children):
			
			# check to see if it is an adjective
			if child.type == 'JJ':
				s += child.value + '_'
				
				popL.append(i)
				
			else:
				self.prependAdjective(child)
				
		popL.reverse()
		
		# remove adjective children
		for i in popL:
			Node.children.pop(i)
			
		# give the node its full name
		Node.value = s + Node.value
		
	# resolves WH-words (WP) and pronouns (PRP) to a suitable noun ancestor
	def unificationWrapper(self):		
		
		self.unificationPronoun(self.tree.root)
		self.unificationW(self.tree.root)
	
	def unificationPronoun(self, Node):
		pass
		
	def unificationW(self, Node):
	
		if Node.type == 'WP':
			# take the value/type from the ancestor that attaches to its parent via acl:relcl
			value, type = self.findRelationship(Node, 'acl:relcl')
			Node.value = value; Node.type = type
		elif len(Node.children) == 0:
			pass
		else:
			for child in Node.children:
				self.unificationW(child)
		
	# walks up the tree and returns the value and type of the parent of the nearest node
	# (starting from Node) whose edge matches the specified relationship
	def findRelationship(self, Node, relationship):
		if Node.edge.relationship == relationship:
			return Node.parent.value, Node.parent.type
		else:
			return self.findRelationship(Node.parent, relationship)

	def concatenateCompounds(self, dependencies, governor, parent):
		# we want to return the last compound
		window = False
		compound = False
		for i,(g, r, d) in enumerate(dependencies):
		
			if window == False and g[0] == parent and d[0] == governor:
				# we can start to consider compounds
				window = True
			
			elif window == True and g[0] != parent and d[0] == governor:
				# we have come across a different node with the same value
				window = False
				# we are done
				break
			
			# numeric modifier, compound, or adjective
			elif window == True and g[0] == governor and r in ('nummod', 'compound', 'amod'):
				compound = d[0]
		
		return compound		
		
	# builds both the main tree and the substructures	
	def buildTrees(self, dependencies):		
	
		# find the root
		self.createTree(dependencies)
		
		# build substructures for xcomp
		#self.parseXComp(dependencies)		
	
	def rootParse(self, dependencies):

		# write rules to a document
		f = open(r'C:\Users\jkjohnson\Documents\CS 673\Alvin-master\Star Wars Data\Rules.txt', 'ab')
		
		# loop through and find triangles
		for i, (g, r, d) in enumerate(dependencies):
			if g[1][0] == 'V':
				
				# verb nodes
				vNodes = set([])
				# noun nodes
				nNodes = set([])
				
				self.tree.findNodeWrapper(g[0], g[1], '', '', 'buildtree')
				n = self.tree.foundNode
				
					
				# this is the case where the node has already been evaluated
				if n == None:
					continue
				# look for rules with children
				for child in n.children:
					#print 'looking for children of', g[0]
					if child.type[:2] == 'NN' or child.type == 'PRP' or child.type == 'WP':
						# we can never use this node for another purpose
						#child.checked = True
						nNodes.add(child)
					elif child.type[:1] == 'V':
						# these are very interesting
						vNodes.add(child)
						
				print g[0], len(nNodes), len(vNodes)
				
				# pull data from nodes
				nNL, vNL, tNL, rNL = self.organizeNodes(nNodes, dependencies)
				nVL, vVL, tVL, rVL = self.organizeNodes(vNodes, dependencies)				
						
				if len(nNL) == 1:
					# extract the node
					#n = nodes.pop()
					pass
				
					#print g[0] + "(" + n.value + ")", n.edge.relationship				
						
				# we can look for certain combinations of nouns and relationships
				elif len(nNL) >= 2:
				
					# classic structure of a subject and direct object
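					# e.g. "Vader trains Luke" would yield the rule trains(Vader, Luke)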
					if 'nsubj' in rNL and 'dobj' in rNL:
						rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('dobj')] + ")"
						f.write(rule + '\n')
						print rule
						
					elif 'nsubj' in rNL and 'xcomp' in rNL:
						rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('xcomp')] + ")"
						f.write(rule + '\n')
						print rule	

					elif 'nsubj' in rNL and 'nmod' in rNL:
						rule = g[0] + "(" + vNL[rNL.index('nsubj')] + ", " + vNL[rNL.index('nmod')] + ")"
						f.write(rule + '\n')
						print rule	
						
					elif 'nsubjpass' in rNL and 'nmod' in rNL:
						
						'''
						if 'auxpass' in rVL:
							rule = vVL[rVL.index('auxpass')] + '_' + g[0] + "(" + vNL[rNL.index('nsubjpass')] + ", " + vNL[rNL.index('nmod')] + ")"
							f.write(rule + '\n')
							print rule	
						'''
						
						rule = g[0] + "(" + vNL[rNL.index('nmod')] + ", " + vNL[rNL.index('nsubjpass')] + ")"
						f.write(rule + '\n')
						print rule	

				if len(nVL) > 0:
					# right now, we are just looking for conjunctions
					
					# conjunction					
					
					if 'conj' not in rVL:
						# skip anything else for now; other relations may need handling later
						continue
						
					# there may be multiple conjunctions
					
					for verbNode in nVL:
					
						if verbNode.edge.relationship == 'xcomp':
							
							if 'nsubj' in rNL:
								rule = g[0] + "_" + self.tree.xcompD[verbNode.value]['verbConj'] + \
								"(" + vNL[rNL.index('nsubj')] + ", " + self.tree.xcompD[verbNode.value]['dobjConj'] + ")"
					
						elif verbNode.edge.relationship == 'conj':
							#print 'right here', verbNode.value
							#print rNL
						
							value = ''; adverb = ''
							for child in verbNode.children:
								if child.edge.relationship == 'dobj' or child.edge.relationship == 'xcomp':
									value = child.value
									compound = self.concatenateCompounds(dependencies, value, child.parent)
									if compound != False:
										value = compound + ' ' + value
										
								elif child.edge.relationship == 'advmod':
									adverb = child.value
							
							# go back and use the parent nmod
						
							if value == '':
								if 'nmod' in rNL:
									value = vNL[rNL.index('nmod')]
								elif 'xcomp' in rNL:
									value = vNL[rNL.index('xcomp')]
								
						
							if 'nsubj' in rNL:
								#		verb joined to head				subject of head verb			
								rule = verbNode.value + "(" + vNL[rNL.index('nsubj')] + ", " + value + ")"
								f.write(rule + '\n')
								print rule
								
							elif 'nsubjpass' in rNL:
								#		verb joined to head				subject of head verb			
								rule = verbNode.value + "(" + value + ", " + vNL[rNL.index('nsubjpass')] + ")"
								f.write(rule + '\n')
								print rule				
					
						
					
			# very simple rule for adjectives
			'''
			elif d[1] == 'JJ':
				# find any compounds
				newValue = ''
				comp = self.concatenateCompounds(dependencies, g[0])
				if comp == False:
					newValue = g[0]
				else:
					newValue = comp + " " + g[0]		
				
				rule = d[0] + "(" + newValue + ")"
				f.write(rule + '\n')
				print rule	
			'''
		f.close()
				
	# pops the nodes out of the set and also creates lists of their data
	def organizeNodes(self, nodeSet, dependencies):
		
		# structures to hold node data
		nodeL = []; valueL = []; typeL = []; relationL = []
		
		while len(nodeSet) > 0:
			n = nodeSet.pop()
			
			# find any compounds
			comp = self.concatenateCompounds(dependencies, n.value, n.parent)
			if comp == False:
				pass
			else:
				n.value = comp + " " + n.value	

			# switch out proper nouns
			# !!!
			
			valueL.append(n.value)
			typeL.append(n.type)
			relationL.append(n.edge.relationship)
			nodeL.append(n)
			
		return nodeL, valueL, typeL, relationL
		
							
	def findParent(self, dependencies, (gV, gT), i):
	
		for j, (g, r, d) in enumerate(dependencies[:i]):
			
			# it can only be the parent
			if d[0] == gV and d[1] == gT:
				return g[0], g[1], r
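		# note: implicitly returns None when no governing parent is found among dependencies[:i]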