Example #1
class Parser():
    def __init__(self):
        #corenlp_dir = "/export/data/ghpaetzold/simpatico/server_simplifiers/core_nlp/stanford-corenlp-full-2016-10-31/"
        corenlp_dir = "/export/data/cscarton/simpatico/stanford-corenlp-full-2016-10-31/"
        self.corenlp = StanfordCoreNLP(corenlp_dir, memory="4g", properties='galician.myproperties.properties')
    
    def process(self, sentence):
        #sentences = open(self.doc, "r").read().strip().split("\n")
        #sentences = [l.strip().split(' ') for l in f_read]
        #dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        return self.corenlp.raw_parse(sentence)['sentences'][0]

    def transform(self, parsed):
        dict_dep = {}
        for rel, _, head, word, n in parsed['dependencies']:
            
            n = int(n)
            head = int(head)

            if head not in dict_dep.keys():
                dict_dep[head] = {}
            if rel not in dict_dep[head].keys():
                dict_dep[head][rel] = []

            dict_dep[head][rel].append(n)
                


        return dict_dep
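A minimal usage sketch for Example 1's Parser (hypothetical: the sentence is a placeholder, the CoreNLP directory configured in __init__ must exist, and the exact dependency-tuple layout depends on the installed corenlp-python wrapper version):

# Hypothetical usage of the Parser above.
parser = Parser()
parsed = parser.process("The cat sat on the mat.")
deps = parser.transform(parsed)
# deps maps a head (governor) index to {relation: [dependent indices]},
# e.g. {0: {'root': [2]}, 2: {'nsubj': [1]}}
print(deps)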
Example #2
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and dependency parsing.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        for s in result['sentences']:
            word, pos, dependency = [], [], []
            for dep in s['dependencies']:
                dependency.append({
                    'type': dep[0],
                    'dep': int(dep[2]) - 1,
                    'gov': int(dep[4]) - 1
                })
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])
            tuples.append((word, pos, dependency))
        return tuples
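A hedged usage sketch for this NLPParser (the CoreNLP directory is a placeholder; the (word, pos, dependency) layout follows the parse() docstring above):

# Hypothetical usage of Example 2's NLPParser.
nlp = NLPParser("/path/to/stanford-corenlp-full")  # placeholder directory
for words, pos_tags, deps in nlp.parse("Stanford CoreNLP tags and parses text."):
    for token, tag in zip(words, pos_tags):
        print(token, tag)
    for dep in deps:
        # each dep is {'type': relation, 'gov': governor index, 'dep': dependent index},
        # with zero-based indices as computed in parse()
        print(dep['type'], dep['gov'], dep['dep'])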
Example #3
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and dependency parsing.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        for s in result['sentences']:
            word, pos, dependency = [], [], []
            for dep in s['dependencies']:
                dependency.append({'type': dep[0], 'dep': int(dep[2])-1, 'gov': int(dep[4])-1})
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])
            tuples.append((word, pos, dependency))
        return tuples
Example #4
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')
    def parse(self, sent):
        """
        Part-Of-Speech tagging
        :param sent: string
        :return: a list of tuple (tokens, pos)
        """
        """
        tokens = []
        pos = []
        result = self.parser.tag(sent.split())
        for entry in result:
            tokens.append(entry[0])
            pos.append(entry[1])
        tuples = [tokens, pos]
        return tuples
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        word, pos = [], []
        for s in result['sentences']:
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])

            pattern = re.compile(r'\[Text=')
            tokenpattern = re.compile(r'\[Text=[^\s]+\s')
            pospattern = re.compile(r'PartOfSpeech=[^\s]+\s')
            startIdxed = []
            for t in re.finditer(pattern, s['parsetree']):
                startIdxed.append(t.start())
            for i in range(len(startIdxed)):
                start = startIdxed[i]
                if i < len(startIdxed) - 1:
                    end = startIdxed[i+1]
                else:
                    end = -1
                token = s['parsetree'][start:end]
                text = re.findall(tokenpattern, token)
                partOfSpeech = re.findall(pospattern, token)
                word.append(text[0][6:-1])
                pos.append(partOfSpeech[0][13:-1])
        tuples.append((word, pos))
        #print tuples
        return tuples
def sentToParse(Res, num_sents):
    # load corenlp
    sys.path.insert(0, osp.join(ROOT_DIR, 'pyutils', 'corenlp'))
    from corenlp import StanfordCoreNLP
    parser_path = osp.join(ROOT_DIR, 'pyutils', 'corenlp',
                           'stanford-corenlp-full-2015-01-30')
    stanfordParser = StanfordCoreNLP(parser_path)
    num_sents = len(Res) if num_sents < 0 else num_sents
    print 'stanford parser loaded.'
    # start parsing
    for i in range(num_sents):
        ref_id, sent = Res[i]['ref_id'], Res[i]['sent']
        parse = stanfordParser.raw_parse(sent)['sentences'][0]
        Res[i]['parse'] = parse
        print '%s/%s sent is parsed.' % (i + 1, num_sents)
Example #6
class StringProcessor(object):
    """Tokenize or parse a string.
    """

    def __init__(self, project):
        """Instantiate and ready the parser. Note that readying the parser takes
        some time.
        """
        self.parser = StanfordCoreNLP(app.config["CORE_NLP_DIR"])
        self.project = project

        logger = logging.getLogger(__name__)
        global project_logger
        project_logger = ProjectLogger(logger, project)

    def tokenize(self, txt):
        """Turn a string of one or more ``Sentence``\s into a list of
        ``Sentence`` objects. This method will also tokenize each word in txt,
        find its PoS, lemma, and space_before.

        :param str txt: One or more sentences, in a string format.
        :return list: A list of document.Sentence objects.
        """
        sentences = []

        for sentence_text in split_sentences(txt):
            sentence = self.parse_with_error_handling(sentence_text)
            sentences.extend(tokenize_from_raw(sentence, sentence_text,
                self.project))

        return sentences

    def parse(self, sentence, relationships=None, dependencies=None,
            max_length=30):
        """Parse a ``Sentence`` and extract dependencies, parse trees, etc.

        Note that for max_length, a "word" is defined as something with a space
        on at least one side. This is not the typical definition of "word".
        This is done so that length can be checked before resources are
        committed to processing a very long sentence.

        :param Sentence sentence: The ``Sentence`` object.
        :param int max_length: The maximum number of words to process.
        """

        parsed = self.parse_with_error_handling(sentence.text)

        # If the parse was unsuccessful, exit
        if parsed == None:
            return

        parsed_sentence = parsed["sentences"][0]

        if len(parsed["sentences"]) > 1:
            project_logger.warning("More than one sentence passed in to"
                " StringProcessor.parse().")
            parsed_sentence["text"] += parsed["sentences"][1]["text"]

        for dependency in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            if int(dependency[2]) > 0 and int(dependency[4]) > 0:
                governor = dependency[1]
                dependent = dependency[3]
                governor_index = int(dependency[2]) - 1
                dependent_index = int(dependency[4]) - 1
                governor_pos = parsed_sentence["words"][governor_index][1]\
                    ["PartOfSpeech"]
                governor_lemma = parsed_sentence["words"][governor_index][1]\
                    ["Lemma"]
                dependent_pos = parsed_sentence["words"][dependent_index][1]\
                    ["PartOfSpeech"]
                dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                    ["Lemma"]
                grammatical_relationship = dependency[0]

                # If dictionaries are present, run with duplication handling
                if relationships != None and dependencies != None:
                    key = grammatical_relationship

                    if key in relationships.keys():
                        relationship = relationships[key]
                    else:

                        try:
                            relationship = GrammaticalRelationship.query.\
                                filter_by(name = grammatical_relationship).\
                                one()
                        except(MultipleResultsFound):
                            project_logger.error("duplicate records found "
                                "for: %s", str(key))
                        except(NoResultFound):
                            relationship = GrammaticalRelationship(
                                name = grammatical_relationship)

                        relationships[key] = relationship

                    # Read the data for the governor, and find the
                    # corresponding word
                    governor = Word.query.filter_by(
                        word = governor,
                        lemma = governor_lemma,
                        part_of_speech = governor_pos
                    ).first()

                    # Same as above for the dependent in the relationship
                    dependent = Word.query.filter_by(
                        word = dependent,
                        lemma = dependent_lemma,
                        part_of_speech = dependent_pos
                    ).first()

                    try:
                        governor.id
                        dependent.id
                    except:
                        project_logger.error("Governor or dependent not "
                            "found; giving up on parse. This likely indicates"
                            " an error in the preprocessing; rerunning the "
                            "preprocessor is recommended.")
                        project_logger.info(sentence)
                        return sentence

                    key = (relationship.name, governor.id, dependent.id)

                    if key in dependencies.keys():
                        dependency = dependencies[key]
                    else:

                        try:
                            dependency = Dependency.query.filter_by(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            ).one()
                        except(MultipleResultsFound):
                            project_logger.error("duplicate records found "
                                "for: %s", str(key))
                        except(NoResultFound):
                            dependency = Dependency(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            )

                        dependencies[key] = dependency

                    # Add the dependency to the sentence
                    sentence.add_dependency(
                        dependency = dependency,
                        governor_index = governor_index,
                        dependent_index = dependent_index,
                        project = self.project,
                        force = False
                    )

                    dependency.save(False)

                else:
                    # TODO: fill
                    pass

        return sentence

    def parse_with_error_handling(self, text):
        """Run the parser and handle errors properly.

        Also checks the sentence text for irregularities that may break the
        parser and handles it before proceeding.

        Any failure will cause this method to return None

        :param str text: The text of the sentence to check
        """

        # Check for non-string
        if not isinstance(text, str) and not isinstance(text, unicode):
            project_logger.warning("Parser got a non-string argument: %s",
                text)
            return None

        # Check for non-unicode
        if not isinstance(text, unicode):

            # Try to convert the string to unicode if possible
            # Unit test: should fail with this example:
            # http://stackoverflow.com/questions/6257647/convert-string-to-unicode

            try:
                text = unicode(text)
            except(UnicodeDecodeError):
                project_logger.warning("The following sentence text is "
                    "not unicode; conversion failed.")
                project_logger.info(text)

                # Skip sentence if flag is True
                if app.config["SKIP_SENTENCE_ON_ERROR"]:
                    return None
                else:
                    # Try to parse the sentence anyway
                    project_logger.warning("Attempting to parse "
                        "non-unicode sentence.")

        # Check for empty or nonexistent text
        if text == "" or text == None:
            return None

        # Check for irregular characters
        # TODO: what are considered irregular characters?

        # Try to parse, catch errors
        parsed_text = None
        try:
            parsed_text = self.parser.raw_parse(text)
        # TODO: handle all errors properly
        # ProcessError, TimeoutError, OutOfMemoryError
        except TimeoutError as e:
            project_logger.error("Got a TimeoutError: %s", str(e))
            return None
        except ProcessError as e:
            project_logger.error("Got a ProcessError: %s", str(e))
            return None
        except:
            project_logger.error("Unknown error")
            return None

        # Parse successful, return parsed text
        return parsed_text
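A rough sketch of how StringProcessor might be driven (hypothetical: the project object, Flask app config, split_sentences/tokenize_from_raw helpers, and the database models all come from the surrounding application and are only assumed here):

# Hypothetical driver for Example 6's StringProcessor.
processor = StringProcessor(project)
sentences = processor.tokenize(u"This is one sentence. This is another.")
relationships, dependencies = {}, {}
for sentence in sentences:
    processor.parse(sentence, relationships, dependencies)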
corenlp_dir = "/NLP_TOOLS/tool_sets/stanford-corenlp/stanford-corenlp-full-2015-04-20/"
parser = StanfordCoreNLP(corenlp_dir)

print("Stanford loaded")

tree_re = re.compile(r"\(ROOT.*")

cachedAligned = []

for aligned in pickleFile:
    if aligned is None:
        continue
    text = unicode(str(aligned), errors='replace').encode('ascii', 'ignore')
    #
    try:
        results = parser.raw_parse(text)

        aligned.tree = []
        aligned.dependencies = []

        for s in results['sentences']:
            aligned.tree.append(tree_re.search(s['parsetree']).group(0))
            aligned.dependencies += s['dependencies']
     
     
    except:
        print(text)
        print( "Unexpected error:", sys.exc_info()[0])

    cachedAligned.append(aligned)
    if len(cachedAligned) % 10 == 0:
Example #8
class MyExtract(object):
    '''
    classdocs
    '''
    def __init__(self):
        '''
        constructor
        '''

        self.rawcorpus = None
        self.corpus = []
        self.pars = []
        self.wordspace = None
        self.docspace = None
        self.stop = set(stopwords.words('english'))
        self.parser = None
        self.prelations = []
        self.nrelations = []

    def buildRawCorpus(self, myfile):
        '''
        extract text from xml files
        '''

        corpus = ""
        for txtfile in glob.glob(devdata + myfile):

            print "reading " + txtfile

            xmldoc = minidom.parse(txtfile)
            itemlist = xmldoc.getElementsByTagName('text')
            for s in itemlist:
                text = s.firstChild.data
                if "." in text:
                    corpus = corpus + " " + text
        self.rawcorpus = corpus.encode("utf-8")

    def buildCorpus(self):
        '''
        preprocess raw text (tokenize, remove stopwords)
        '''

        sents = self.rawcorpus.split(".")
        for sent in sents:
            toks = [
                w.lower() for w in nltk.word_tokenize(sent.decode('utf-8'))
                if w.lower() not in self.stop
            ]
            self.corpus.append(toks)

    def tokenizeAbs(self, parag):
        '''
        preprocess raw text (tokenize, remove stopwords)
        '''

        toks = [
            w.lower() for w in nltk.word_tokenize(parag)
            if w.lower() not in self.stop
        ]
        return toks

    def buildRawSents(self, myfile):

        for txtfile in glob.glob(devdata + myfile):
            xmldoc = minidom.parse(txtfile)
            itemlist0 = xmldoc.getElementsByTagName('document')
            count = 0
            for it0 in itemlist0:
                parag = ""
                itemlist = it0.getElementsByTagName('text')
                for item in itemlist:
                    if '.' in item.firstChild.data:
                        parag = parag + " " + item.firstChild.data
                toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8'))
                lab = [txtfile + '_' + str(count)]
                self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab))
                count = count + 1

    def exploreCDRCorpus(self, myfile, maxsize):
        '''
        extract entities + relations from xml
        '''

        diseases = {}
        chemicals = {}
        relations = []
        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0
        for it0 in itemlist0:
            print "\t- processing abstract " + str(count)

            parsed = self.docspace.docvecs[myfile + "_" + str(count)]

            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + str(len(itemlist1)) + " entities"

            for it1 in itemlist1:

                itemlist2 = it1.getElementsByTagName('infon')
                typ = itemlist2[0].firstChild.data
                mesh = itemlist2[len(itemlist2) - 1].firstChild.data
                text = it1.getElementsByTagName(
                    'text')[0].firstChild.data.lower()
                codes = mesh.split('|')

                for code in codes:
                    ent = MyEntity(text, code, typ)
                    if (typ == 'Chemical'):
                        chemicals[code] = ent
                    if (typ == 'Disease'):
                        diseases[code] = ent

            itemlist3 = it0.getElementsByTagName('relation')

            print "\t\t+ " + str(2 * len(itemlist3)) + " positive and negative relations"
            print "\t\t\t* extracting features for positive relations"
            print "\t\t\t* extracting features for negative relations"

            for it3 in itemlist3:

                itemlist4 = it3.getElementsByTagName('infon')
                key1 = itemlist4[1].firstChild.data
                key2 = itemlist4[2].firstChild.data
                e1 = chemicals[key1]
                e2 = diseases[key2]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)
                rel = MyRelation(e1, e2, '1')
                rel.abs = parsed
                self.prelations.append(rel)
                relations.append(key1 + "_" + key2)
                num = 0

            for key1 in chemicals.keys():
                for key2 in diseases.keys():
                    if key1 + "_" + key2 not in relations:
                        if num < len(itemlist3):
                            e1 = chemicals[key1]
                            e2 = diseases[key2]
                            e1.bow = self.avgBOW(e1.text)
                            e2.bow = self.avgBOW(e2.text)
                            rel = MyRelation(e1, e2, '-1')
                            rel.abs = parsed
                            self.nrelations.append(rel)
                            num = num + 1

            count = count + 1
            if (count == maxsize):
                break

    def exploreDDICorpus(self, myfile, maxsize, ftyp):
        '''
        extract entities + relations from xml
        '''

        #print(myfile)

        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0

        for it0 in itemlist0:

            # abstract with annotations
            print "\t- processing abstract " + str(count)
            drugs = {}

            # entities
            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + str(len(itemlist1)) + " entities"
            for it1 in itemlist1:

                itemlist2a = it1.getElementsByTagName('infon')
                typ = itemlist2a[0].firstChild.data
                print typ

                itemlist2b = it1.getElementsByTagName('text')
                text = itemlist2b[0].firstChild.data.lower()
                print text

                ent = MyEntity(text, "", typ)
                ent.bow = self.avgBOW(ent.text)
                drugs[text] = ent

            # abstract
            itemlist3 = it0.getElementsByTagName('text')
            abstract = ""
            for it3 in itemlist3:
                if (len(it3.firstChild.data.split()) > 3):
                    abstract = abstract + it3.firstChild.data

            # parse abstract
            parsed = self.parseSentence(abstract)  #stanford
            docvec = self.docspace.docvecs[myfile + "_" + str(count)]  #doc2vec

            #print len(drugs.keys())

            if (len(drugs.keys()) > 1):

                e1 = drugs[drugs.keys()[0]]
                e2 = drugs[drugs.keys()[1]]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)

                #print(ftyp)

                if (ftyp == "positive"):

                    #print(parsed)

                    rel = MyRelation(e1, e2, '1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.prelations.append(rel)

                if (ftyp == "negative"):

                    #print(docvec)

                    rel = MyRelation(e1, e2, '-1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.nrelations.append(rel)

            # increment counter
            count = count + 1
            if (count == maxsize):
                break

    def avgBOW(self, entity):
        bow = []
        ents = entity.split(" ")
        i = 0
        while i < self.wordspace.layer1_size:
            v = 0
            for ent in ents:
                if ent in self.wordspace.vocab:
                    v = v + self.wordspace[ent][i]
            bow.append(v / len(ents))
            i = i + 1
        return np.array(bow)

    def buildWordSpace(self, modelfile):
        '''
        compute distributional model
        '''

        model = Word2Vec(self.corpus,
                         min_count=1,
                         size=20,
                         iter=100,
                         workers=4)
        model.save(modelfile)
        self.wordspace = model

    def buildDocSpace(self, modelfile):
        '''
        compute distributional model
        '''

        model = doc2vec.Doc2Vec(self.pars,
                                min_count=5,
                                size=20,
                                iter=100,
                                workers=4)
        model.save(modelfile)
        self.docspace = model

    def loadWordSpace(self, modelfile):
        '''
        compute distributional model
        '''

        self.wordspace = Word2Vec.load(devdata + modelfile)

    def loadDocSpace(self, modelfile):
        '''
        compute distributional model
        '''

        self.docspace = doc2vec.Doc2Vec.load(devdata + modelfile)

    def loadParser(self):

        corenlp_dir = os.environ['STANFORD']
        self.parser = StanfordCoreNLP(corenlp_dir +
                                      "/")  # wait a few minutes...

    def parseSentence(self, sentence):

        parsed = self.parser.raw_parse(sentence)['sentences'][0]['parsetree']
        return parsed
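A hedged end-to-end sketch for MyExtract (hypothetical: file globs, model file names, and the devdata path are placeholders, and loadParser() requires the STANFORD environment variable to point at a CoreNLP directory):

# Hypothetical pipeline using Example 8's MyExtract.
extractor = MyExtract()
extractor.buildRawCorpus("*.xml")        # read <text> nodes from XML files under devdata
extractor.buildCorpus()                  # tokenize and drop stopwords
extractor.buildWordSpace("words.w2v")    # train and save the word2vec model
extractor.buildRawSents("*.xml")
extractor.buildDocSpace("docs.d2v")      # train and save the doc2vec model
extractor.loadParser()
extractor.exploreCDRCorpus(devdata + "cdr.xml", 10)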
Example #9
	paragraph = ""
	for d in searcher.documents(episode=e):
		outfile.writelines(d['line'].encode('utf-8')+' ')
		# outfile.writelines((d['speaker']+': '+d['line']).encode('utf-8')+' ')
	# 	paragraph += d['speaker']+': '+d['line']+' '
	# 	# paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
	# paragraph = paragraph.replace('\n','').replace('                           ',' ')
	# outfile.writelines(paragraph.encode('utf-8'))
	outfile.close()

parsed = []
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
for e in episodeNum:
	for d in searcher.documents(episode=e):
		parsed.append(corenlp.raw_parse(d))

# sentClient = StanfordNLPSentimentClient('http://localhost:8080')
# sentiment = []
# for t in text:
# 	sentiment.append(sentClient.classify(t))

# mask = imread("friends.gif")
wc = WordCloud(max_words=30,stopwords=STOPWORDS|{'s','t','m','re','oh','right','don','know','well','hey','gonna','okay','yeah','go','really','think','hi','uh','look','god','mean','one','ye','guy','y','got','come','now'},font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
for c in mainChars:
	wc.generate(lines[uniqueSpeakers.index(c)])
	wc.to_file(c+".png")

# wc = WordCloud(background_color="white",max_words=50,mask=mask,stopwords=STOPWORDS|{'s','t','m','re','oh','right','don','know','well','hey','gonna','okay','yeah','go','really','think','hi','uh','look','god','mean','one','ye','guy','y','got','come','now'},font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
# for c in mainChars:
# 	wc.generate(lines[uniqueSpeakers.index(c)])
Example #10
def stanfordParse(text, corenlpDir='corenlp/stanford-corenlp-full-2014-01-04'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
class Nlp_persistence(object):
    """Persistence layer for having fast access to information produced by the StanfordCoreNLP tool."""
    def __init__(self, fallback=False):
        self.FILE = "nlp_infos.p"
        self.data = None
        self.data_length = None
        self.corenlp_dir = "helper/stanfordnlp/corenlp-python/stanford-corenlp-full-2013-11-12/"
        if fallback:
            try:
                self.corenlp = StanfordCoreNLP(self.corenlp_dir)
            except TIMEOUT:
                print "Stanford CoreNLP Timeout"

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        # When exiting, update pickle file with new sentences and kill StanfordCoreNLP before so we definitely have enough memory for that
        try:
            del(self.corenlp)
        except AttributeError:
            # There was a timeout
            pass

        # Write only if we added something to self.data
        if self.data_length < len(self.data):
            self._write()

    def create_persistence(self, relations):
        try:
            # Trying to load data
            data = pickle.load(open(self.FILE, "rb"))
        except (IOError, EOFError):
            # No data so far
            print "Could not open cache. Create new."
            logging.info("Could not find %s. Create new data.", self.FILE)
            data = {}

        # Create nlp information for all relevant sentences
        for relation in relations:
            if not relation.source.sentence in data:
                self._update_data(relation.source, data)
            else:
                print "Sentence is already in data"

            if not relation.target.sentence in data:
                self._update_data(relation.target, data)
            else:
                print "Sentence is already in data"
        print "Done!"
        logging.info("Successfully loaded all nlp information to persistence file.")

        # Save data to a file
        pickle.dump(data, open(self.FILE, "wb"), protocol=-1)

    def _update_data(self, entity, data):
        sentence_obj = entity.sentence
        try:
            tree = self._get_tree(sentence_obj)
        except RPCInternalError:
            logging.error("Could not process the following sentence from text %s: %s", sentence_obj.filename, sentence_obj.text)
            # Return without updating data
            return

        print "--- " + sentence_obj.filename
        print sentence_obj.text

        data.update({sentence_obj: tree})

    def load(self):
        data = {}

        if self.data is None:
            try:
                data = pickle.load(open(self.FILE, "rb"))
            except (IOError, EOFError):
                logging.warning("No cached nlp data.")
            finally:
                self.data = data
                self.data_length = len(data)
        else:
            # Data is already there - there is nothing to do
            pass

    def get_info_for_sentence(self, sentence):
        if type(self.data) is dict:
            try:
                return self.data[sentence]
            except KeyError:
                logging.error("Nlp_persistence: This sentence is not a key/Is not available in the Nlp persistence layer.")
                logging.info("Nlp_persistence fallback to CoreNLP server")
                # Fallback: Try to get tree from CoreNLP server
                tree = self._get_tree(sentence)

                # Drive by caching
                self.data.update({sentence: tree})

                return tree
        else:
            logging.error("You have to use Nlp_persistence.load() before you can get the information of a sentence")
            return None

    def get_collapsed_dependencies(self, sentence):
        info = self.get_info_for_sentence(sentence)

        return info['sentences'][0]['dependencies']

    def get_parse_tree(self, sentence):
        info = self.get_info_for_sentence(sentence)

        return info['sentences'][0]['parsetree']

    def _write(self):
        # Save data to a file
        pickle.dump(self.data, open(self.FILE, "wb"))

    def _get_tree(self, sentence):
        tree = self.corenlp.raw_parse(sentence.text)
        return tree

    def get_pos_tag_for_word(self, sentence, word):
        """Returns the POS tag for a word in a sentence. If the word is not in the sentence raise WordNotInSentence error."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']

        for w in words:
            if w[0] in word:
                return w[1]["PartOfSpeech"]
        else:
            raise PosTagNotFound(sentence, word)

    def get_lemma_for_word(self, sentence, word):
        """Returns the lemma for a word in sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']

        for w in words:
            if w[0] in word:
                return w[1]["Lemma"]
        else:
            raise LemmaNotFound(sentence, word)

    def is_main_verb(self, sentence, word):
        """Returns true if word is a main verb of sentence and not an aux."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == word:
                return False
        else:
            return True

    def get_all_aux_for_verb(self, sentence, verb):
        """Returns all distinct aux for verb as strings in order of the sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        aux = []
        for dependency in dependencies:
            if (dependency[0] == "aux" or dependency[0] == "auxpass") and dependency[1] == verb:
                aux.append(dependency[2])

        return aux

    def get_verb_for_aux(self, sentence, aux):
        """Returns the governing verb for the aux as string."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == aux:
                return dependency[1]
        else:
            raise AuxNotFound(aux)

    def find_all_verb_pos_tags(self, sentence, verb):
        """Returns all pos tags for all verbs based on the dependencies relation of the sentence."""

        if self.is_main_verb(sentence, verb):
            # verb is not an aux
            main_verb = verb
        else:
            # verb is aux (this should normally not happen due to the data)
            main_verb = self.get_verb_for_aux(sentence, verb)

        auxes = self.get_all_aux_for_verb(sentence, main_verb)

        verb_pos = self.get_pos_tag_for_word(sentence, main_verb)

        aux_pos = map(lambda aux: self.get_pos_tag_for_word(sentence, aux), auxes)

        return aux_pos + [verb_pos]

    def get_governing_verb(self, event):
        sentence = event.sentence

        # info = [verb, aux, pos verb, pos aux, index_of_verb]
        info = self.get_info_on_governing_verb(event.text, event.index, sentence)

        if info is None:
            raise CouldNotFindGoverningVerb
        else:
            if info[0] is None:
                raise CouldNotFindGoverningVerb
            else:
                return (info[0], info[4])

    def is_root(self, event):
        sentence = event.sentence
        info_sentence = self.get_info_for_sentence(sentence)

        collapsed_dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in collapsed_dependencies:
            dependency_type = dependency[0]
            dependent = dependency[2]

            if dependency_type == "root" and dependent == event.text:
                return True
        else:
            return False

    def get_info_on_governing_verb(self, non_verb, index, sentence):
        """This method returns information about the governing verb of a non-verb.

        It returns an array with the following format: [verb, aux, POS of verb, POS of aux, index_of_verb]
        """
        info = self.get_info_for_sentence(sentence)

        if info:
            # Search for non_verb
            governing_verb, index = self._get_governing_verb(non_verb, index, info)

            info_on_governing_verb = [governing_verb, None, None, None, index]

            # Set POS of main verb
            pos_verb = self._get_pos_of_verb(governing_verb, info)
            info_on_governing_verb[2] = pos_verb

            # Searching for an Aux for the governing verb
            aux = self._get_aux_of_verb(governing_verb, info)
            info_on_governing_verb[1] = aux

            # If there is an aux, get its POS
            if aux:
                pos_aux = self._get_pos_of_verb(aux, info)
                info_on_governing_verb[3] = pos_aux

            return info_on_governing_verb

        else:
            return None

    def _get_aux_of_verb(self, verb, info):
        dependencies = info['sentences'][0]['dependencies']

        sources = [x[1] for x in dependencies]

        # Find index of verb in targets
        index = None
        for i, source in enumerate(sources):
            if source == verb and dependencies[i][0] == "aux":
                index = i

        # Get aux
        if index is None:
            # Not every verb has an aux
            return None
        else:
            aux = dependencies[index][2]

            return aux

    def _get_pos_of_verb(self, verb, info):
        info_on_words = info['sentences'][0]['words']

        for word in info_on_words:
            if word[0] == verb:
                return word[1]['PartOfSpeech']

    def _find_governing_word(self, word, dependencies):
        for dependency in dependencies:
            if dependency[2] == word:
                return dependency[1]
        else:
            return None

    def _find_governing_word_index(self, word, index, index_dependencies):
        word = word + "-" + str(index)

        for dependency in index_dependencies:
            if dependency[2] == word:
                # Remove governor with index appended
                return dependency[1]
        else:
            return None

    def _remove_index_from_token(self, token):
        if token:
            token = token.split("-")[:-1]
            return "-".join(token)
        else:
            return None

    def _get_index_from_token(self, token):
        if token:
            index = token.split("-")[-1]
            return index
        else:
            return None

    def _get_governing_verb(self, non_verb, index, info):
        index_dependencies = info['sentences'][0]['indexeddependencies']

        # Try to find a governor for non_verb
        governor = self._find_governing_word_index(non_verb, index, index_dependencies)

        # Search through tree as long we find a verb and until we can go further up
        while not self._is_verb(self._remove_index_from_token(governor), info) and governor is not None:
            old_governor = governor
            governor = self._find_governing_word_index(self._remove_index_from_token(governor), self._get_index_from_token(governor), index_dependencies)

            if governor == old_governor:
                # Detected a cycle (does not happen often, but it does happen; not sure why)
                governor = None
                break

        if governor:
            # Remove index from governor string
            return (self._remove_index_from_token(governor), int(self._get_index_from_token(governor)))
        else:
            # Examples when this is allowed to happen:
            # Example for when it happens: "And in Hong Kong, a three percent drop." <- no verb
            # Other example: "One exception was the swine flu pandemic of 2009-2010, when 348 children died." and "pandemic". "pandemic" is the root of the sentence and is not governed by anything
            # Other corner case: "And the dominant flu strain early in the season was one that tends to cause more severe illness." for "season"
            raise CouldNotFindGoverningVerb(non_verb, index)

    def _is_verb(self, text, info):
        """Checks if text has the POS tag of a verb."""
        if not text: return False

        words = info['sentences'][0]['words']

        for word in words:
            if word[0] == text:
                if word[1]['PartOfSpeech'] in ['VBG', 'VBD', 'VB', 'VBN', 'VBP', 'VBZ']:
                    return True

        return False
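A hedged sketch of how Nlp_persistence is meant to be used (hypothetical: relations and sentence are objects from the surrounding project, with .source/.target and .text attributes as assumed by the methods above):

# Hypothetical usage of the Nlp_persistence cache.
with Nlp_persistence(fallback=True) as nlp:
    nlp.create_persistence(relations)   # parse and cache every relevant sentence
    nlp.load()                          # load the pickle cache into memory
    tree = nlp.get_info_for_sentence(sentence)
    pos = nlp.get_pos_tag_for_word(sentence, "running")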
class StanforExtractor(object):
    def __init__(self):
        corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/"
        self.corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...
        print("corenlp object initiated")

    def tag_text(self, text):
        """
        :param text:
        :return:
        """
        assert type(text) == str
        sents = self.corenlp.raw_parse(text)
        return sents

    def expand_rels_double(self, rel_words, sent):
        """
        :param rel_words: [wrd1,wrd2]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return:
        """
        assert type(rel_words) == list
        assert type(sent) == list
        assert len(rel_words) == 2
        rel_tmp = [rel_words[0], rel_words[1]]
        for rel_1 in sent:
            if rel_1[1] == rel_words[0] and rel_1[2] == rel_words[1]:
                continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                middle_word = rel_1[0][rel_1[0].find("_") + 1 :]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
                # rel_1 = [rel_1[1],rel_1[2]]
            if rel_words[0] in rel_1:
                append_start = 1
                rel_1.remove(rel_words[0])
            elif rel_words[1] in rel_1:
                append_start = -1
                rel_1.remove(rel_words[1])
            else:
                continue
            # append_start = append_start*appos_tag
            # if neg_tag == 1:
            #
            if append_start == 1:
                rel_tmp = [" ".join(rel_1)] + rel_tmp
            else:
                rel_tmp = rel_tmp + [" ".join(rel_1)]
        return rel_tmp

    def expand_rels_wordlist(self, rel_words, sent):
        """
        :param rel_words: [wrd1,wrd2,..]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return:
        """
        assert type(rel_words) == list
        assert type(sent) == list
        rel_tmp = []
        for rel_1 in sent:  # for each word in sentence, rel_1 is the relation mapper from stanford tagger dependencies
            # if rel_1[1] in rel_words and rel_1[2] in rel_words:
            #     continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                middle_word = rel_1[0][rel_1[0].find("_") + 1 :]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":  # what to do here?
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
            wrd_present = False
            for wrd in rel_1:
                if wrd in rel_words:
                    rel_1.remove(wrd)
                    wrd_present = True
            if wrd_present:
                # pdb.set_trace()
                if len(rel_1) > 0:
                    rel_tmp.append(" ".join(rel_1))
        return " ".join(rel_tmp)

    def expand_rels(self, tmp_rels, sent):
        """
        add relevant sents to start or end of tmp_rels
        :param tmp_rels:
        :param sent:
        :return:
        """
        # pdb.set_trace()
        print("sent", sent)
        final_rels = []
        for rel_full in tmp_rels:
            rel_words = [rel_full[1], rel_full[2]]
            rel_tmp = self.expand_rels_double(rel_words, sent)
            final_rels.append(rel_tmp)
        # print('final_res:',final_rels)
        return final_rels

    def identify_rels(self, tagged_text):
        """
        :param tagged_text:
        :return:
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        all_rels = []
        for sent in tagged_text["sentences"]:
            tmp_rels = []
            for rel in sent["dependencies"]:
                if rel[0] in [u"nn", u"dobj"]:
                    tmp_rels.append(rel)
            if len(tmp_rels) > 0:
                final_rels = self.expand_rels(tmp_rels, sent["dependencies"])
                all_rels.append(final_rels)
        return all_rels

    def identify_word_rels(self, all_words, tagged_text):
        """
        :param all_words: list of words/phrases
        :param tagged_text:
        :return:
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        words_rels = {}
        # pdb.set_trace()
        for wrd in all_words:
            wrd_rels = []
            for sent in tagged_text["sentences"]:
                rel_frm_sent = self.expand_rels_wordlist(wrd.split(), sent["dependencies"])
                if len(rel_frm_sent) > 0:
                    wrd_rels.append(rel_frm_sent)
            words_rels[wrd] = ",".join(wrd_rels)
        return words_rels

    def identify_time(self, text):
        """
        :param text:
        :return:
        """
        time_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"DATE", u"TIME"]:
                        if not prev_wrd_tag:
                            time_strs.append(wrd[0])
                        else:
                            prev_wrd = time_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            time_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        time_final = []
        for wrd in time_strs:
            if wrd not in time_final:
                time_final.append(wrd)
        return time_final

    def ret_time_rels(self, text):
        """
        :param text:
        :return:
        """
        tagged_text = self.tag_text(text)
        all_times = self.identify_time(text)
        time_rels = self.identify_word_rels(all_times, tagged_text)
        return time_rels

    def return_rels(self, text):
        """
        :param text:
        :return:
        """
        text_tag = self.tag_text(text)
        rels_all = self.identify_rels(text_tag)
        return rels_all

    def identify_name(self, text):
        """
        :param text:
        :return:
        """
        name_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"PERSON"]:
                        if not prev_wrd_tag:
                            name_strs.append(wrd[0])
                        else:
                            prev_wrd = name_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            name_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        names_final = []
        for wrd in name_strs:
            if wrd not in names_final:
                names_final.append(wrd)
        return names_final
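A hedged usage sketch for the StanforExtractor class above (the CoreNLP path hard-coded in __init__ must exist locally; the sentence and the commented outputs are illustrative assumptions):

# Hypothetical usage of StanforExtractor.
extractor = StanforExtractor()
text = "Barack Obama visited Berlin on 19 June 2013."
print(extractor.identify_name(text))   # names joined from PERSON NER tags, e.g. ['Barack Obama']
print(extractor.identify_time(text))   # DATE/TIME spans, e.g. ['19 June 2013']
print(extractor.return_rels(text))     # nn/dobj relations expanded over the dependency graph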
Example #13
from corenlp import StanfordCoreNLP
import book_utils
corenlp_dir = "../tools/corenlp-python/corenlp/stanford-corenlp-full-2014-01-04"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.raw_parse("This is a sample sentence.")  # raw_parse requires the text to parse; placeholder sentence
#raw_text_directory = "../dataset/books_txt/small_sample"
#parsed = batch_parse(raw_text_directory, corenlp_dir,raw_output=True)
#for books in parsed:
#    print books
from corenlp import StanfordCoreNLP

corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...

result = corenlp.raw_parse("What is birth date of the wife of the first black president of the United States?")

print(result['sentences'][0]['dependencies'])
Example #15
#!/usr/bin/env python

import sys, bz2
sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python')
import nltk
from nltk.tree import Tree
from corenlp import StanfordCoreNLP
from remove_random_word import remove_random_word

print("Booting StanfordCoreNLP")
nlp = StanfordCoreNLP()

print("Initializing train file")
train = bz2.BZ2File('../data/train_v2.txt.bz2')
for line in train:
    rline = remove_random_word(line)
    lparse = nlp.raw_parse(line)
    ltree = Tree.fromstring(lparse['sentences'][0]['parsetree'])
    rparse = nlp.raw_parse(rline)
    rtree = Tree.fromstring(rparse['sentences'][0]['parsetree'])
    print(ltree)
    print(rtree)
Example #16
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape to be used in the filepath for the
    output. String.

    database : name of the MongoDB database that contains the collections.
    String? pymongo connection object?
    """
    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)

    log = open('log_file.txt', 'a')
    results = pattern.web.Newsfeed().search(address, count=100, cached=False)
    log1 = 'There are %d results from %s \n' % (len(results), website)
    log.write(log1)
    for result in results:
        if website == 'nyt':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'bbc':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'reuters':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'ap':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'upi':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'xinhua':
            page_url = result.url.encode('ascii')
            page_url = page_url.replace('"', '')
            text = pages_scrape.scrape(page_url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
        if website == 'google':
            text = pages_scrape.scrape(result.url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
    interrupt = '+' * 70
    log3 = '%s\nFinished scraping %s at %s\n%s\n' % (interrupt, website,
                                                     datetime.datetime.now(),
                                                     interrupt)
    log.write(log3)
    log.close()
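# The per-site branches above repeat the same scrape -> tokenize -> parse ->
# store sequence and differ only in how the URL is cleaned up first. A hedged
# sketch of one consolidated helper, assuming the same pages_scrape,
# corenlp_parse and mongo_connection interfaces used above; the helper name
# and its argument list are illustrative, not from the original code.
def scrape_and_store(result, website, collection, log, sent_detector):
    page_url = result.url
    if website == 'xinhua':
        # Xinhua URLs arrive quoted, so strip the quotes before scraping.
        page_url = result.url.encode('ascii').replace('"', '')
    text = pages_scrape.scrape(page_url, result.title)
    head_sentences = sent_detector.tokenize(text.strip())[:4]
    joined_sentences = ' '.join(head_sentences)
    parsed = corenlp_parse.raw_parse(joined_sentences)
    entry_id = mongo_connection.add_entry(collection, text, parsed,
                                          result.title, result.url,
                                          result.date, website)
    if entry_id:
        log.write('Added entry from %s with id %s \n' % (result.url,
                                                         str(entry_id)))
    else:
        log.write('Result from %s already in database \n' % (result.url))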
Exemplo n.º 17
0
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i % 50 == 0:
            print " Entered sentence " + str(i) + " of " + str(len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t " + sentence + " \n "
        pass


def write_parse_products(parse):
    # Writes out the parse products (tokens and their properties) of a single
    # parsed sentence. The loop above calls this helper, so in a runnable
    # script the definition needs to come first.
    words = parse['words']

    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
Exemplo n.º 18
0
class BasicStanfordCoreNLP(UtteranceProcessor):
    '''
    This basic version does nothing with the coreference, constituency and
    dependency parses produced by the analysis.

    For now, words from all sentences found in the utterance are put at the top
    level of the utterance -- the sentences themselves are thrown away, but could
    be used later for e.g. paragraph-level utterances.

    If merge_clitics is set, merge clitics with their hosts, e.g. I 'll -> the
    single word I'll (a standalone sketch of this merge follows the class).

    Add spaces back in where there is no punctuation, as points at which silence
    can be inserted during alignment.

    Add a reduced POS as well as the Stanford POS.
    '''
    def load(self):
    
        self.target_nodes = self.config.get('target_nodes', '//utt')    
        self.input_attribute = self.config.get('input_attribute', 'norm_text')
        
        self.merge_clitics = self.config.get('merge_clitics', 'True') ## string, not bool
    
        ## check tools exist:
        corenlp_location = os.path.join(self.voice_resources.path[c.BIN], '..', \
                                                            'corenlp-python', 'corenlp')
        assert os.path.isdir(corenlp_location)
        sys.path.append(corenlp_location)
        from corenlp import StanfordCoreNLP
        corenlp_dir = os.path.join(corenlp_location, '..', 'stanford-corenlp-full-2014-06-16')
        
        ## Each document is to be treated as one sentence, no sentence splitting at all. 
        ## Write config for this if necessary:
        corenlp_conf_name = 'no_sentence_split.properties'
        corenlp_conf_file = os.path.join(corenlp_location, corenlp_conf_name)
        if not os.path.isfile(corenlp_conf_file):
            data = ['annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref', \
                    'ssplit.isOneSentence = true']
            writelist(data, corenlp_conf_file)

        print 'Loading stanford corenlp modules from %s ...'%(corenlp_dir)
        print 'Takes a while (~20-30 seconds)...'
        self.models = StanfordCoreNLP(corenlp_dir, properties=corenlp_conf_name)     
                                           

                                                            
    def process_utterance(self, utt):

        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)

        for node in utt.xpath(self.target_nodes):
            
            assert node.has_attribute(self.input_attribute)
            input = node.get(self.input_attribute)
            analysis = self.models.raw_parse(input)
            
            ## analysis looks like this:
            
            #     {'coref': ...
            #      'sentences': [{'parsetree':  ... } 
            #                     'text': 
            #                     'dependencies': 
            #                     'indexeddependencies': 
            #                     'words': [('and', {'NamedEntityTag': 'O', \
            #                         'CharacterOffsetEnd': '3', 'Lemma': 'and', \
            #                         'PartOfSpeech': 'CC', 'CharacterOffsetBegin': '0'}), ... ]
            #                       }
            #                   ]
            #     }
            
            ## preprocess the analysis: add spaces back between words where there is no
            ## punc (to use as potential silence insertion points for alignment), and
            ## possibly merge clitics (he 's -> he's, i ll' -> i'll)
            

            ## MERGE SUCCESSIVE PUNCTUATION TOKENS 
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                #new_sentence = copy.deepcopy(sentence)
                #new_sentence['words'] = []
                new_words = []
                for word in sentence['words']:
                    # is there a previous word?
                    if len(new_words) > 0:
                        # if both space / punct:
                        if self.all_space_or_punc(new_words[-1][0]) and self.all_space_or_punc(word[0]):
                            prev_word = new_words.pop(-1)
                            combined = self.merge_words(prev_word, word)
                            new_words.append(combined)
                        else:
                            new_words.append(word)
                    else:
                        new_words.append(word)
                sentence['words'] = new_words
                new_analysis['sentences'].append(sentence)
            analysis = new_analysis     


            ## MERGE CLITICS 
            ## This also merges e.g. . ''  -->  .''  (given by norm scripts from   ."  ) at sentence ends.
            if self.merge_clitics == 'True': ## string not bool
                new_analysis = {}
                new_analysis['sentences'] = []
                for sentence in analysis['sentences']:
                    #print sentence
                    new_sentence = copy.deepcopy(sentence)
                    new_sentence['words'] = []
                    i = 0
                    while i < (len(sentence['words'])-1):
                        this_word = sentence['words'][i]
                        next_word = sentence['words'][i+1]
                        if next_word[0].startswith("'") or next_word[0] == "n't":
                            merged = self.merge_words(this_word, next_word)
                            new_sentence['words'].append(merged)
                            i += 2
                        else:
                            new_sentence['words'].append(this_word)
                            i += 1
                    last_word = sentence['words'][-1]
                    if not(last_word[0].startswith("'") or last_word[0] == "n't"):
                        new_sentence['words'].append(last_word)
                    new_analysis['sentences'].append(new_sentence)
                analysis = new_analysis                    
                 
            
            ## ADD SPACES:
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                new_sentence = copy.deepcopy(sentence)
                new_sentence['words'] = []
                ## For now, ignore parsetree, dependencies, indexeddependencies (sentence level)
                previous_lemma = '_NONE_'
                for word in sentence['words']:
                
                    (text, word_attributes) = word
                    this_lemma = word_attributes['Lemma']
                    
                    ## Add whitespace back in between tokens to use for silence insertion in alignment later.
                    ## Don't add it where either neighbour is punctuation, or at the start of the
                    ## utt (where previous_lemma is '_NONE_'):
                    if not (self.all_space_or_punc(previous_lemma) or \
                                                self.all_space_or_punc(this_lemma)):   
                        if previous_lemma != '_NONE_':                     
                            new_sentence['words'].append((' ', {'NamedEntityTag': ' ', \
                                                        'PartOfSpeech': ' ', 'Lemma': ' '}))
                    previous_lemma = this_lemma
                    new_sentence['words'].append(word)
                new_analysis['sentences'].append(new_sentence)
            analysis = new_analysis
            
            
            ## combine all sentences to one for now:
            all_words = []
            for sentence in analysis['sentences']:
                all_words.extend(sentence['words'])
                
            
            ## Add stuff into the target node (probably utt):
            for word in all_words:
            
                (text, word_attributes) = word
                word_node = Element('token') ## also includes punctuation etc.
                word_node.set(self.input_attribute, text) ## see above at sentence level about 'text'
                
                ## For now, ignore CharacterOffsetBegin, CharacterOffsetEnd (word level)
                word_node.set('ne', word_attributes['NamedEntityTag']) 
                word_node.set('pos', word_attributes['PartOfSpeech']) 
                word_node = self.add_reduced_POS(word_node)
                
                word_node.set('lemma', word_attributes['Lemma']) 
                
                utt.append(word_node)
                
        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)    

    def add_reduced_POS(self, node):
        full_POS = node.attrib['pos']
        if '|' in full_POS:
            full_POS = full_POS.split('|')[0]
    
        ## add coarse POS (content/function) and reduced (adj,noun,adv,etc.)
        map = dict([('IN', 'function'), ('TO', 'function'), ('DT', 'function'), \
                ('PDT', 'function'), ('MD', 'function'), ('CC', 'function'), \
                ('WP', 'function'), ('PP$', 'function'), ('EX', 'function'), \
                ('POS', 'function'), ('PP', 'function'), ('WDT', 'function'), \
                ('PRP', 'function'), ('PRP$', 'function'), ('RP', 'function'), \
                ('WP$', 'function'), ('WRB', 'function'), ('LS', 'function'),\
                ('NN', 'noun'), ('NNS', 'noun'), \
                ('NP', 'noun'), ('NNP', 'noun'), ('NPS', 'noun'), ('NNPS', 'noun'), ('FW', 'noun'), \
                 ('VBG', 'verb'), ('VBN', 'verb'), \
                ('VB', 'verb'), ('VBD', 'verb'), ('VBP', 'verb'), ('VBZ', 'verb'), \
                ('JJ', 'adj'), ('JJR', 'adj'), ('JJS', 'adj'), ('CD', 'adj'), \
                ('RB', 'adv'), ('RBR', 'adv'), ('RBS', 'adv'), ('UH', 'interj')])

        ## NOTE:
        ## FW -- foreign word -> noun
        ## LS -- list item -> function

        if full_POS not in map:
            if full_POS == ' ':
                red_pos = 'space'
            elif self.all_space_or_punc(full_POS):
                red_pos = 'punc'
            else:
                print 'MISSING POS: %s'%(full_POS)
                red_pos = 'other'
        else:
            red_pos = map[full_POS]
        node.set('coarse_pos', red_pos)
        return node


    

    def all_space_or_punc(self, token):
        '''Use regex to match Unicode properties to see if the token consists entirely
        of space and/or punctuation. This duplicates later work by e.g. the token classifier.'''
        ## Z = separators, C = control/format, P = punctuation, S = symbols.
        space_or_punc = '[\p{Z}\p{C}\p{P}\p{S}]'
        return regex.match('\A' + space_or_punc + '+\Z', token)
        
        
    def merge_words(self, word1, word2):
        merged_form = word1[0] + word2[0]
        merged_POS = word1[1]['PartOfSpeech'] + '|' + word2[1]['PartOfSpeech']
        merged_lemma = word1[1]['Lemma']   ## first word's lemma
        merged_NER = word1[1]['NamedEntityTag']  ## first words NE tag
        merged = (merged_form, \
                    {'PartOfSpeech': merged_POS, \
                    'Lemma': merged_lemma, \
                    'NamedEntityTag': merged_NER})
        return merged
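# A minimal standalone sketch of the clitic merge described in the class
# docstring above: successive tokens such as ("I", "'ll") or ("do", "n't")
# are collapsed into a single token whose POS tags are joined with '|'.
# The function and the toy word list below are illustrative only; the real
# (word, attributes) tuples come from StanfordCoreNLP's raw_parse output.
def merge_clitics_sketch(words):
    merged = []
    i = 0
    while i < len(words):
        word = words[i]
        nxt = words[i + 1] if i + 1 < len(words) else None
        if nxt is not None and (nxt[0].startswith("'") or nxt[0] == "n't"):
            # Merge the clitic into the preceding word, keeping its lemma and NE tag.
            attrs = {'PartOfSpeech': word[1]['PartOfSpeech'] + '|' + nxt[1]['PartOfSpeech'],
                     'Lemma': word[1]['Lemma'],
                     'NamedEntityTag': word[1]['NamedEntityTag']}
            merged.append((word[0] + nxt[0], attrs))
            i += 2
        else:
            merged.append(word)
            i += 1
    return merged

words = [('I', {'PartOfSpeech': 'PRP', 'Lemma': 'I', 'NamedEntityTag': 'O'}),
         ("'ll", {'PartOfSpeech': 'MD', 'Lemma': 'will', 'NamedEntityTag': 'O'}),
         ('go', {'PartOfSpeech': 'VB', 'Lemma': 'go', 'NamedEntityTag': 'O'})]
print merge_clitics_sketch(words)  # [("I'll", {...}), ('go', {...})]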
Exemplo n.º 19
0
from corenlp import StanfordCoreNLP
import simplejson as json

corenlp_dir = "/home/clai/lubbock/repos-3rd/stanford-corenlp-python/stanford-corenlp-full-2015-04-20/"

print "loading..."
corenlp = StanfordCoreNLP(corenlp_dir)

results = corenlp.raw_parse("Hello world. It's a wonderful day.")
print results

print json.dumps(results, indent=4)
Exemplo n.º 20
0
Arquivo: nlpio.py Projeto: yk/nlpfs14
from corenlp import StanfordCoreNLP

stanford = None  # module-level cache so the CoreNLP parser is only started once

def stanfordParse(text, corenlpDir='stanford-corenlp-full-2013-11-12/'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
def compress(sentence):
    global parser
    if not parser:
        parser = StanfordCoreNLP(corenlp_dir)
    text = sentence.simple
    words = word_tokenize(text)
    w_features = [dict() for w in words]
    stemmed = [None for w in words]

    labels = list()


    # add basic features

    # first/last words
    for i in range(1,6):
        if i < len(words):
            for x in range(i):
                w_features[x]["infirst"+str(i)] = True
                w_features[-1-x]["inlast"+str(i)] = True

    #pos = [ x[1] for x in nltk.pos_tag(a.o_words) ]

    for i in range(len(words)):
        w = words[i]
        features = w_features[i]


        #capitalization
        if w.isupper():
            features["isupper"] = True
        elif w[0].isupper():
            features["firstupper"] = True

        w = w.lower()

        #word class
        if w in negation:
            features["negation"] = True
        elif w in punct:
            features["punct"] = True
        elif w in stopWords:
            features["stopWords"] = True

        #pos
        #a.posfeatures[i]["pos_"+pos[i]] = True

        # compute the basic term frequencies of all words in paragraphs
        # for use in building the corpus-wide query term frequency
        if w not in model.idf.stopWords:
            termFreq[w] += 1

        stem = stemmer.stem(w)
        suffix = ""
        if len(stem) < len(w) and w.startswith(stem):
            suffix = w[len(stem):]
        stemmed[i] = (stem, suffix)

        features["stem_"+stemmed[i][0]] = True
        features["affix_"+stemmed[i][1]] = True


    #Stanford tree features
    text = text.encode('ascii', 'ignore')

    
    tree = None
    dependencies = None

    try:
        results = parser.raw_parse(text)
        tree = []
        dependencies = []

        for s in results['sentences']:
            tree.append(tree_re.search(s['parsetree']).group(0))
            dependencies += s['dependencies']


    except:
        print(text)
        print( "Unexpected error:", sys.exc_info()[0])


    #print(a.tree)
    if tree:
        tree = Tree.fromstring(tree[0].encode('ascii', 'ignore'))
        #print(str(tree))
        paths = list(getPathsToLeaves(tree))
        #print(paths)
        for i in range(min(len(paths), len(words))):
            #print(paths[i][1])
            w_features[i]["tree_depth_"+str(len(paths[i][1]))] = True
            for x in range(0,2):
                w_features[i][str(x)+"_up_"+paths[i][1][-1-x]] = True
            for n in paths[i][1]:
                w_features[i]["tree_"+n] = True
            w_features[i][str(paths[i][2])+"_from_left"] = True
        #print(a.treefeatures[0])
    if dependencies:
        #make a tree out of it
        d_tree = defaultdict(list)
        mother_relations = defaultdict(list)
        daughter_relations = defaultdict(list)
        for dep in dependencies:
            d_tree[dep[1]].append((dep[0], dep[2]))
            mother_relations[dep[1]].append(dep[0])
            daughter_relations[dep[2]].append(dep[0])

        #now we can check depth and such
        #print(d_tree)
        depths = getDepths(d_tree, u'ROOT', dict(), 0)
        #print(depths)

        for i in range(len(words)):
            w = words[i]
            treefeatures = w_features[i]
            if w in depths:
                w_depth = depths[w]
                treefeatures["dep_depth_"+str(w_depth)] = True
                if w_depth > 3:
                    treefeatures["dep_depth_over_3"] = True
                if w_depth > 5:
                    treefeatures["dep_depth_over_5"] = True
            if w in mother_relations:
                for rel in mother_relations[w]:
                    treefeatures["dep_mother_"+rel] = True
            if w in daughter_relations:
                for rel in daughter_relations[w]:
                    treefeatures["dep_daughter_"+rel] = True

    # get max tfidf for scaling
    maxtfidf = max( tf*idf.idf[w] for w, tf in termFreq.items() )

    partitions = 5

    # now add tfidf threshold features
    for i in range(len(words)):
        w = words[i].lower()
        if w not in stopWords and w not in punct:
            features = w_features[i]

            tfidf = termFreq[w] * idf.idf[w]
            # Scale tfidf into [0, partitions] so the thresholds below correspond
            # to fractions (20/40/60/80 percent) of the maximum tfidf (maxtfidf).
            scaled = tfidf / maxtfidf * partitions
            for x in range(1, partitions):
                if scaled > x:
                    features[str(x*100/partitions)+"percenttfidf"] = True

    #for f in w_features:
    #    print(f)


    # add previous features and classify
    for i in range(len(words)):

        f = w_features[i].copy()

        for prev in range(2):
            if i > prev:
                prevstring = "prev"+str(prev)+"_"
                f[prevstring+labels[-1-prev]] = True

                prevfeatures = w_features[i-1-prev]
                for k,v in prevfeatures.items():
                    if not k.startswith("in"):
                        f[prevstring+k] = v

        #print("with prev:")
        #print(f)

        # classify
        vector = vec.transform(f)
        vector = selector.transform(vector)
        result = classifier.predict(vector)
        l = result[0]
        #print(l)

        labels.append(l)

    # use labels to clear out
    print(labels)

    retained_words = list()
    for i in range(len(labels)):
        if labels[i] != 'O':
            retained_words.append(words[i])

    newsentence = ""
    for i in range(len(retained_words)):
        if i != 0 and retained_words[i] not in punct and retained_words[i-1] not in ["``"]:
            newsentence += " "
        newsentence += retained_words[i]
        
    sentence.simple = newsentence

    return sentence
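# compress() above relies on pre-trained, module-level vec / selector /
# classifier objects that are created elsewhere in the project. A hedged
# sketch of how such a pipeline could be assembled with scikit-learn; the
# estimator choices, the function name and the training-data arguments are
# assumptions, not taken from the original code.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

def build_compression_pipeline(train_feature_dicts, train_labels, k=10000):
    """Fit the dict-vectorizer, feature selector and classifier that compress() expects."""
    vec = DictVectorizer()
    X = vec.fit_transform(train_feature_dicts)           # list of feature dicts -> sparse matrix
    selector = SelectKBest(chi2, k=min(k, X.shape[1]))   # keep the k most informative features
    X = selector.fit_transform(X, train_labels)
    classifier = LogisticRegression()
    classifier.fit(X, train_labels)                      # labels are per-word tags such as 'O'
    return vec, selector, classifier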
Exemplo n.º 22
0
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
	try:
		parse = PARSER.raw_parse(sentence)
		if i%50 == 0:
			print " Entered sentence " + str(i) + " of " + str(len(sentences))
		write_parse_products(parse['sentences'][0])
	except Exception:
		print "Error on sentence:\n\t " + sentence + " \n "
		pass

def write_parse_products(parse):
	# Writes out the parse products (tokens and their properties) of a single
	# parsed sentence; the loop above calls this helper, so in a runnable
	# script the definition needs to come first.
	words = parse['words']

	word_objects = []
	text = ""
	for i, word_info in enumerate(words):
		properties = word_info[1]
		token = word_info[0].lower().strip()
		surface = word_info[0].strip()
Exemplo n.º 23
0
from corenlp import StanfordCoreNLP

corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...

result = corenlp.raw_parse(
    "What is birth date of the wife of the first black president of the United States?"
)

print((result['sentences'][0]['dependencies']))
#!/usr/bin/env python

import sys, bz2
sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python')
import nltk
from nltk.tree import Tree
from corenlp import StanfordCoreNLP
from remove_random_word import remove_random_word

print "Booting StanfordCoreNLP"
nlp = StanfordCoreNLP()

print "Initializing train file"
train = bz2.BZ2File('../data/train_v2.txt.bz2')
for line in train:
    rline = remove_random_word(line)
    lparse = nlp.raw_parse(line)
    ltree = Tree.fromstring(lparse['sentences'][0]['parsetree'])
    rparse = nlp.raw_parse(rline)
    rtree = Tree.fromstring(rparse['sentences'][0]['parsetree'])
    print ltree
    print rtree