Example #1
import nltk
from pycorenlp import StanfordCoreNLP


def anaphora(text):
    # sentenceRange(), fromTo() and shorten() are helper functions defined
    # elsewhere in the same source file.
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
        'outputFormat': 'text'})
    sents = nltk.sent_tokenize(text)
    a = []
    for sent in sents:
        a.append(sent.split())
    output = str(output.replace('\r', '').replace('\t', ''))
    #output = output.split('Coreference set:', 1)[1]
    output = output.split('Coreference set:')
    #output = str(output.replace('\r','').replace('\t',''))
    #output = output.split('\n');
    for out in output[1:]:
        #print out
        out = str(out.replace('\r', '').replace('\t', ''))
        out = out.split('\n')
        for i in out[1:-1]:
            i = i.split(', that is:')
            toFrom = i[0].split('->')
            fromSent, fromStart, fromEnd = sentenceRange(toFrom[0])
            toSent, toStart, toEnd = sentenceRange(toFrom[1])
            fromText, toText = fromTo(i[1])

            if len(toText.split()) > 1:
                toText = shorten(toText)
                toText = [toText]
                #a[fromSent - 1][fromStart - 1:fromEnd - 1] = a[toSent - 1][toStart - 1:toEnd - 1]
                a[fromSent - 1][fromStart - 1:fromEnd - 1] = toText
    return a
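A minimal sketch of the same idea with 'outputFormat': 'json', which sidesteps parsing the plain-text coreference report; it borrows the annotator list and the 'corefs' structure from the resolve() example further down this page and assumes the same server address.

def coref_chains_json(text, url='http://192.168.54.210:9000/'):
    nlp = StanfordCoreNLP(url)
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,mention,dcoref',
        'outputFormat': 'json'})
    chains = []
    for _, mentions in output['corefs'].items():
        # each mention carries sentNum, startIndex, endIndex and text
        chains.append([(m['sentNum'], m['startIndex'], m['endIndex'], m['text'])
                       for m in mentions])
    return chains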
Example #2
File: ontology.py Project: xR86/ml-stuff
def corenlp_tokenize(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(output['sentences'][0]['parse'])

    return output
def stanford_parsing_result():
    text =""" I shot an elephant. The dog chased the cat. School go to boy. """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(res['sentences'][0]['parse'])
    print(res['sentences'][2]['parse'])
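If an nltk Tree object is wanted instead of the printed bracketing, the 'parse' string in the JSON output can be converted directly (the Parser class in Example #5 below does the same); a small sketch assuming the local server:

from nltk import Tree
from pycorenlp import StanfordCoreNLP

def parse_trees(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,parse',
        'outputFormat': 'json'})
    # one constituency tree per sentence
    return [Tree.fromstring(s['parse']) for s in res['sentences']]

# parse_trees("I shot an elephant.")[0].draw()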
Example #4
def NERGetter(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,ner',
        'outputFormat': 'text'})
    output = str(output.replace('\r', '').replace('\t', ''))
    output = output.split('[', 1)[1]
    output = output.split('\n')
    for i in output[0:-1]:
        i = i.replace(']', '')
        i = i.split('NamedEntityTag=')
    # note: only the tag of the last token survives the loop
    return i[1]
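The same information is easier to read off the JSON output, where every token carries a 'ner' field (the approach the Brat converter in Example #19 takes); a sketch assuming the same server address:

def ner_tags(text, url='http://192.168.54.210:9000/'):
    nlp = StanfordCoreNLP(url)
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,ner',
        'outputFormat': 'json'})
    # one (token, tag) pair per token instead of only the last tag
    return [(t['originalText'], t['ner'])
            for s in output['sentences'] for t in s['tokens']]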
Example #5
class Parser:
    def __init__(self, coreNLPServer='http://localhost:9000'):
        # use the URL that was passed in instead of a hard-coded one
        self.nlp = StanfordCoreNLP(coreNLPServer)

    def word_list(self, text):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit',
            'outputFormat': 'json'
        })
        word_array = []
        for sentence in nlp_output['sentences']:
            for w in sentence['tokens']:
                word_array.append(w['word'].lower())
        return word_array


    def parse_tree(self, text, binary=False, preprocessed=False):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,parse',
            'outputFormat': 'json',
            'parse.binaryTrees': 'true'
        })
        if type(nlp_output) == str:
            nlp_output = json.loads(nlp_output, strict=False)

        if len(nlp_output['sentences']) > 1:
            #merge trees from sentences
            tree_string = "(Top "
            for s in nlp_output['sentences']:
                p_tree = Tree.fromstring(s['parse'])
                tree_string += str(p_tree[0])
            tree_string += ")"
            merged_tree = Tree.fromstring(tree_string)
        else:
            #no merging required
            merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse'])
            #remove root
            merged_tree = merged_tree[0]

        if binary:
            nltk.treetransforms.chomsky_normal_form(merged_tree)

        if preprocessed:
            merged_tree = preprocess_parse_tree(merged_tree)

        return merged_tree

    def draw_parse_tree(self, parse_tree):
        nltk.draw.tree.draw_trees(parse_tree)
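A usage sketch for the Parser class above, assuming the file's original imports (json, nltk, Tree, preprocess_parse_tree) and a CoreNLP server already running on localhost:9000:

parser = Parser()
print(parser.word_list("The dog chased the cat."))
tree = parser.parse_tree("The dog chased the cat.")
print(tree)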
Example #6
    def __init__(self, argv):
        self.input = ""
        self.output_folder = ""       # output has to be a folder
        self.input_type = ""

        # Connect to the Stanford CoreNLP server
        self.nlp = StanfordCoreNLP('http://localhost:9000')

        # Read User Command Line
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
        for opt, arg in opts:
            if opt == '-h':
                print("Type 'python3.5 text_preprocessing/preprocess.py -i <inputfile> -o <outputfile>' "
                      "in run_source_code.sh file")
                sys.exit()
            elif opt in ("-i", "--ifile"):
                self.input = arg
                if not os.path.exists(arg):
                    print("Input doesn't exist")
                    sys.exit()
                if os.path.isdir(arg):
                    self.input_type = "dir"
                elif os.path.isfile(arg):
                    self.input_type = "file"
            elif opt in ("-o", "--ofile"):
                self.output_folder = arg

        print("Input: " + self.input + ", " + self.input_type)
        print("Output: " + self.output_folder)
Example #7
 def __init__(self, annotators='tokenize,ssplit,pos,parse,lemma,ner', cacheDir='./corenlp'):  # depparse
     self.annotators = annotators
     self.nlp = StanfordCoreNLP('http://localhost:9000')
     if not os.path.exists(cacheDir):
         os.makedirs(cacheDir)
     self.cache = os.listdir(cacheDir)
     self.cacheDir = cacheDir
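The excerpt above only prepares cacheDir and lists its contents; a hedged sketch of how annotate() calls might then be cached on disk. The hashing scheme and file layout here are assumptions, not the original project's code:

import hashlib
import json
import os

def annotate_cached(self, text):
    # hypothetical companion method: cache JSON annotations keyed by a hash of the text
    key = hashlib.md5(text.encode('utf-8')).hexdigest() + '.json'
    path = os.path.join(self.cacheDir, key)
    if key in self.cache and os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    output = self.nlp.annotate(text, properties={
        'annotators': self.annotators,
        'outputFormat': 'json'})
    with open(path, 'w') as f:
        json.dump(output, f)
    self.cache.append(key)
    return output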
Example #8
class StanfordServerParser(Parser, GenericStanfordParser):
    """Follow the readme to setup the Stanford CoreNLP server"""
    def __init__(self, host='localhost', port=9000, properties={}):
        url = 'http://{0}:{1}'.format(host, port)
        self.nlp = StanfordCoreNLP(url)

        if not properties:
            self.properties = {
                'annotators': 'parse',
                'outputFormat': 'json',
            }
        else:
            self.properties = properties

    def _make_tree(self, result):
        return Tree.fromstring(result)

    def parse(self, sent):
        output = self.nlp.annotate(sent, properties=self.properties)

        # Got random html, return empty tree
        if isinstance(output, str):
            return Tree('', [])

        parse_output = output['sentences'][0]['parse'] + '\n\n'
        tree = next(next(self._parse_trees_output(parse_output)))[0]
        return tree
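Usage sketch for StanfordServerParser, assuming the server is up and the project's own Parser/GenericStanfordParser base classes and Tree import are available:

parser = StanfordServerParser()
tree = parser.parse("The quick brown fox jumped over the lazy dog.")
tree.pretty_print()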
Example #9
class CoreNLP:
    """Used to initialize the Stanford Core NLP in servlet mode and then connect to it using a socket"""
    mongo = MongoClient()
    mongo_db = mongo.get_database('dependencies')

    def __init__(self, timeout=15000, port=9000, buffer_size=4096):
        """Used to initialize the StanfordAPI object with the host, port and buffer"""
        # self.host = socket.gethostname()
        self.port = str(port)
        # self.timeout = str(timeout)
        # self.buffer = str(buffer_size)
        # self.process = Popen(
        #     args=['java', '-mx4g', '-cp', 'commons/corenlp/*', 'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
        #           '-port', self.port, '-timeout', self.timeout])
        # time.sleep(5)
        self.nlp = StanfordCoreNLP('http://localhost:' + self.port)

    def parse(self, text):
        dobj = self.mongo_db.get_collection('dependency').find_one({'text': text})
        if not dobj or dobj['deps'] == 'CoreNLP request timed out. Your document may be too long.':
            output = self.nlp.annotate(text, properties={
                'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
                'coref.algorithm': 'neural',
                'outputFormat': 'json',
            })
            dep = {'text': text, 'deps': output}
            self.mongo_db.get_collection('dependency').insert_one(dep)
            return output
        else:
            return dobj['deps']
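Usage sketch for the CoreNLP wrapper above; it assumes both a local MongoDB instance and the CoreNLP server are running, so a second call with the same text is answered from the 'dependencies' database instead of re-annotating:

corenlp = CoreNLP()
first = corenlp.parse("Barack Obama was born in Hawaii.")   # annotated by the server, then stored
second = corenlp.parse("Barack Obama was born in Hawaii.")  # served from MongoDB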
Example #10
    def __init__(self, corenlp_url='http://localhost:9000'):
        '''
        Create Converter for converting NER annotations to Brat annotations
        classifier training data.

        To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)
    def resolve(self, text):

        sentences_all = sent_tokenize(text, 'english')

        for i in range(2, len(sentences_all)):
            text2 = sentences_all[i-2]+' '+sentences_all[i-1]+' '+sentences_all[i]
            print(text2)
            sentences = sent_tokenize(text2, 'english')
            print(sentences)
            nlp = StanfordCoreNLP('http://localhost:9000')
            output = nlp.annotate(text2, properties={
                'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,mention,dcoref',
                'outputFormat': 'json'
            })

            # target.write(output)
            # target.close()
            corefs = output['corefs']
            cnt = 1

            for key, chains in corefs.items():

                substitute = ''
                print("\nchain number "+str(cnt))
                cnt += 1
                for chain in chains:

                    # print(chain['isRepresentativeMention']+'\n')
                    print(chain['type'] + ' ' + chain['text'])
                    if (chain['isRepresentativeMention'] is True) and (chain['type'] != 'PRONOMINAL'):
                        substitute = str(chain['text'])
                        print(substitute+'\n')

                    if (chain['type'] == 'PRONOMINAL') and (substitute != ''):
                        sentence_num = chain['sentNum']
                        words = word_tokenize(sentences[sentence_num - 1], 'english')
                        words[chain['startIndex'] - 1] = substitute
                        new_sentence = ' '.join(words)
                        sentences[sentence_num - 1] = new_sentence

            sentences_all[i-2] = sentences[0]
            sentences_all[i-1] = sentences[1]
            sentences_all[i] = sentences[2]

        return sentences_all
def standford_sentiment_answer(text_str):
    asw_sentiment = make_default_sentiment()
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text_str,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 20000,
                       })
    try:
        total_value = 0.0
        for s in res["sentences"]:
            total_value += float(s["sentimentValue"])
            asw_sentiment[s["sentiment"]] += 1
        asw_sentiment['score'] = total_value
        return asw_sentiment
    except Exception:
        return asw_sentiment
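Usage sketch for standford_sentiment_answer; make_default_sentiment is not shown in the excerpt, but presumably returns a dict with one counter per CoreNLP sentiment label plus a 'score' entry:

asw = standford_sentiment_answer("The packaging is great, but delivery was slow.")
print(asw)   # per-label counts plus the summed sentimentValue in 'score'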
Example #13
    def __init__(self, host='localhost', port=9000, properties={}):
        url = 'http://{0}:{1}'.format(host, port)
        self.nlp = StanfordCoreNLP(url)

        if not properties:
            self.properties = {
                'annotators': 'parse',
                'outputFormat': 'json',
            }
        else:
            self.properties = properties
Example #14
 def __init__(self, timeout=15000, port=9000, buffer_size=4096):
     """Used to initialize the StanfordAPI object with the host, port and buffer"""
     # self.host = socket.gethostname()
     self.port = str(port)
     # self.timeout = str(timeout)
     # self.buffer = str(buffer_size)
     # self.process = Popen(
     #     args=['java', '-mx4g', '-cp', 'commons/corenlp/*', 'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
     #           '-port', self.port, '-timeout', self.timeout])
     # time.sleep(5)
     self.nlp = StanfordCoreNLP('http://localhost:' + self.port)
 def __init__(self, **kwargs):
     super(CoreNLPParser, self).__init__(**kwargs)
     self.corenlp = StanfordCoreNLP(kwargs['corenlp_url'])
     self.props = {
         'annotators': 'tokenize,ssplit,lemma,pos,ner',
         'outputFormat': 'json',
         'ner.useSUTime': False,  # don't want the SUTime model
         'ner.applyNumericClassifiers': False,  # don't want the numeric classifiers
     }
     if kwargs.get('ner_model'): # set NER model from CLI
         if not os.path.exists(kwargs.get('ner_model')):
             print('Error: Could not find NER model %s.' % 
                   kwargs.get('ner_model'))
             sys.exit(1)
         self.props['ner.model'] = kwargs['ner_model']
     print("CoreNLP Properties : ", self.props)
Example #16
File: test2.py Project: IdeologyPin/pygate
class StanfordAnnotator(PR):
    def __init__(self, annotators='tokenize,ssplit,pos,parse'):#depparse
        self.annotators=annotators
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    def process(self, doc):
        output=self.nlp.annotate(doc.getText(), properties={
              'annotators': self.annotators,
              'outputFormat': 'json',
              'timeout': '600000'

        })
        sents=[]
        tokens=[]
#         print "output", json.dumps(output)
        tStart=0
        tEnd=0
        for s in output['sentences']:
            sentText=[]
            sentTokens=[]
            for t in s['tokens']:
#                 print t
                sentText.append(t['before'])
                sentText.append(t['originalText'])

                token=Annotation(t['originalText'],tEnd,tEnd,t['characterOffsetBegin'], t['characterOffsetEnd'], 'Token', doc)
                token.setFeature('pos', t['pos'])
                token.setFeature('index', t['index'])
                tokens.append(token)
                sentTokens.append(token)
                tEnd+=1

            cStart=s['tokens'][0]['characterOffsetBegin']
            cEnd=s['tokens'][-1]['characterOffsetEnd']
            sentText="".join(sentText)
            print(sentText)
            sent=Annotation(sentText, tStart, tEnd, cStart, cEnd, 'Sentence', doc)
            tStart=tEnd

            sent.setFeature('constituency-parse', s['parse'])
            sent.setFeature('dep-parse', 'not implemented!')
            sent.setFeature('index', s['index'])
#           sent.setRelation('tokens',sentTokens)
            sents.append(sent)
#         pr-
        doc.setSents(sents)
        doc.setTokens(tokens)
Example #17
File: nlp.py Project: zhtmike/wsc-project
class NLPFactory:
    def __init__(self):
        self.url = os.environ.get("CORENLP_URL", "http://localhost:9000")
        self.nlp = StanfordCoreNLP(self.url)

    def annotate(self, text):
        """
        annotate by dependence parser
        Args:
            text (str): input data

        Returns:
            json
        """
        # CoreNLP treats each full-stop-terminated sentence independently,
        # so replace the terminators with commas to keep the text together
        text = text.replace('.', ',').replace('!', ',')
        return self.nlp.annotate(text, properties={"annotators": "pos,lemma,depparse,sentiment", "outputFormat": "json"})
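Usage sketch for NLPFactory; it assumes CORENLP_URL (or the localhost default) points at a running server, and simply reads the per-sentence sentiment from the JSON result:

factory = NLPFactory()
ann = factory.annotate("I like this movie and I would watch it again!")
for sent in ann['sentences']:
    print(sent['index'], sent['sentiment'])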
Example #18
 def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"):
     self.corenlp_client = StanfordCoreNLP('http://localhost:9000')
     # sup.relation.model=
     tokenkeys = set()
     sentencekeys = set()
     for d in self.corpus.documents:
         for s in self.corpus.documents[d].sentences:
             corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={
                     'ssplit.eolonly': True,
                     'openie.triple.all_nominals': True,
                     'openie.triple.strict': False,
                     'openie.max_entailments_per_clause': 500,
                     'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie',
                     #'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie',
                     'outputFormat': 'json',
                     # 'sup.relation.model': self.modelname
                 })
             for o in corenlpres["sentences"][0]["openie"]:
                 if "mir" in o["object"] or "mir" in o["subject"]:
                     print "{}={}>{}".format(o["subject"], o["relation"], o["object"])
Example #19
class NerToBratConverter(object):
    def __init__(self, corenlp_url='http://localhost:9000'):
        '''
        Create Converter for converting NER annotations to Brat annotations
        classifier training data.

        To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)

    def convertToBrat(self, text_file, ann_file):
        print("Processing %s" % text_file)
        with open(text_file) as f:
            text = f.read()

        props = { 'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json'}
        output = self.corenlp.annotate(text, properties=props)
        # flatten sentences and tokens
        tokenlists = [s['tokens'] for s in output['sentences']]
        tokens = itertools.chain.from_iterable(tokenlists)

        count = 1
        with open(ann_file, 'w', 1) as out:
            for token in tokens:
                if token['ner'] != 'O':
                    rec = "T%d\t%s %d %d\t%s" % (count,
                            token['ner'],
                            token['characterOffsetBegin'],
                            token['characterOffsetEnd'],
                            token['originalText'])
                    # print(rec)
                    out.write(rec)
                    out.write("\n")
                    count += 1
        print("Wrote %s" % ann_file)

    def convert_all(self, input_paths):
        with open(input_paths) as paths:
            for d in map(lambda x: x.split(','), map(lambda x: x.strip(), paths)):
                self.convertToBrat(d[0], d[1])
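Usage sketch for NerToBratConverter; the file names are hypothetical. convertToBrat reads one plain-text file and writes the corresponding Brat .ann file, while convert_all expects a file of "text_path,ann_path" lines:

converter = NerToBratConverter()
converter.convertToBrat('doc1.txt', 'doc1.ann')
converter.convert_all('paths.txt')   # each line: path/to/doc.txt,path/to/doc.ann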
Example #20
class StanfordNERApi:
    '''
        Make use of the StanfordCoreNLP server.
        Extract keywords through named entity recognition.
    '''
    def __init__(self):
        self.nlp = StanfordCoreNLP(NLP_SERVER)
        
    def ner_groupby_ner(self, text):
        response = self.nlp.annotate(text, properties={
            'annotators': 'ner,lemma',
            'outputFormat': 'json'
        })
        return self.__process_ner_groupby_ner(response)        
        
    def __process_ner_groupby_ner(self, response):
        output_dict = dict()
        '''The response is generally organized as {sentences:[{tokens:[]},{}]}'''
        if type(response) == dict and 'sentences' in response:
            for sentence in response['sentences']:
                for item in sentence['tokens']:
                    # we only care about ner in set TARGET_NER
                    if item.get('ner') in TARGET_NER:
                        if item['ner'] not in output_dict:
                            output_dict[item['ner']] = set()
                        output_dict[item['ner']].add(item['originalText']) 
            
            # convert from set to list for further json dumps
            for key in output_dict:
                output_dict[key] = list(output_dict[key])
            # convert dict to string by json dumps
            if len(output_dict) > 0:
                return json.dumps(output_dict)
            else:
                return None
        else:
            logger.warning('sentences part is not in the response from NLP server.')
            return None
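Usage sketch for StanfordNERApi; NLP_SERVER, TARGET_NER, logger and the json import live elsewhere in the module, so the values below are assumptions:

import json
import logging

NLP_SERVER = 'http://localhost:9000'                  # assumed constant
TARGET_NER = {'PERSON', 'LOCATION', 'ORGANIZATION'}   # assumed constant
logger = logging.getLogger(__name__)

api = StanfordNERApi()
print(api.ner_groupby_ner("Tim Cook announced new products in Cupertino."))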
 def __init__(self, files=None):
     self.sources = files
     self.triples = []
     self.news = ""
     self.nlp = StanfordCoreNLP('http://localhost:9000')
Example #22
File: nlp.py Project: zhtmike/wsc-project
 def __init__(self):
     self.url = os.environ.get("CORENLP_URL", "http://localhost:9000")
     self.nlp = StanfordCoreNLP(self.url)
import os
import sys
from unidecode import unidecode
import string
printable = set(string.printable)
# from nltk.tag import StanfordNERTagger

# st=StanfordNERTagger("./classifiers/english.all.3class.distsim.crf.ser.gz", path_to_jar="./stanford-ner.jar")

from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')

path = sys.argv[1]

spath = path + '/../names/'

d = os.path.dirname(spath)

if not os.path.exists(d):
    os.makedirs(d)

def remove_non_ascii(text):
    return unidecode(unicode(text, encoding = "utf-8"))

for file in os.listdir(path):

    current=os.path.join(path,file)

    if os.path.isfile(current):
        data=open(current,'rb')
Example #24
    else:
        return x

# The tagger keeps choking on numbers in parentheses; this function removes the parentheses.
def removeParenth(x):
    findParenth = re.search(r'\([0-9]+\)', x)
    if findParenth:
        x = re.sub(r'\(', '', x)
        x = re.sub(r'\)', '', x)
        return x
    else:
        return x


if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')

    for line in orig_file:
        if not line.startswith("PMID"):
            info = line.split('\t')
            pmid = info[0]
            ta = info[1]
            sentence = info[2]
            sentence = sentence.rstrip('\n')
            cleanSentence = removeBracket(sentence)
            extraClean = removeParenth(cleanSentence)
            output =  nlp.annotate(extraClean,properties={
            'annotators':'tokenize,ssplit,pos,depparse,parse',
            'outputFormat' : 'json'})
            try:
                result = output['sentences'][0]['parse']
from pycorenlp import StanfordCoreNLP
from pprint import pprint
import json

FILE = "data/test200"

nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))


def get_stanford_annotations(
        text,
        port=9000,
        annotators='tokenize,ssplit,pos,lemma,depparse,parse'):
    output = nlp.annotate(text,
                          properties={
                              "timeout": "10000",
                              "ssplit.isOneSentence": "true",
                              'annotators': annotators,
                          })
    return output


with open(FILE + '.txt',
          encoding='utf-8') as in_file, open(FILE + '.NRE',
                                             'w',
                                             encoding='utf-8') as out_file:
    for line in in_file:
        ls = line.strip().split('\t')
        sent_id = ls[0].strip()
        document = ' '.join(ls[1].strip().split())
        token1 = ls[2]
Example #26
def rawfileprocess(rawfile, outputFile, aspect_file):
    f = open(rawfile)
    fout = open(outputFile, 'a')
    faspectfile = open(aspect_file, 'a')
    result = []
    aspectset = set()
    nlp = StanfordCoreNLP('http://localhost:9000')

    for line in f:
        line = line.strip()
        seperatorIndex = line.find('##')
        if seperatorIndex <= 0:
            continue

        # aspect preprocess
        aspectString = line[:seperatorIndex].strip()
        if aspectString.find('[') < 0:
            continue
        aspectsTmp = aspectString.split(',')
        aspects = []
        for aspectScore in aspectsTmp:
            aspectScore = aspectScore.strip(' ')
            if aspectScore.find('[u]') >= 0 or aspectScore.find('[p]') >= 0:
                continue

            endIndex = aspectScore.find('[')
            if endIndex < 0:
                continue
            aspects.append(aspectScore[:endIndex].split())
            for aspectitem in aspectScore[:endIndex].split():
                if aspectitem != ' ':
                    aspectset.add(aspectitem)

        if len(aspects) == 0:
            continue

        #sentence tokenizer and word tokenizer and dep pos
        rawReview_1 = line[seperatorIndex + 2:].strip()
        output3 = nlp.annotate(rawReview_1,
                               properties={
                                   'annotators': 'tokenize,pos,depparse',
                                   'outputFormat': 'json'
                               })

        for index_sentence in range(0, len(output3['sentences'])):
            subsentence = output3['sentences'][index_sentence]['tokens']
            subsentencedep = output3['sentences'][index_sentence][
                'enhancedPlusPlusDependencies']
            tmpword = []
            tmppos = []
            tmpdeps = []
            for index in range(0, len(subsentence)):
                tmpword.append(subsentence[index]['word'])
                tmppos.append(subsentence[index]['pos'])
                tmpdep = ''
                for deps in subsentencedep:
                    if deps['dependent'] == index + 1 or deps[
                            'governor'] == index + 1:
                        dependent_index = deps['dependent'] - 1
                        gov_index = deps['governor'] - 1
                        if deps['governorGloss'] == 'ROOT':
                            govpos = '#'
                        else:
                            govpos = subsentence[gov_index]['pos']

                        if deps['dependentGloss'] == 'ROOT':
                            deppos = '#'
                        else:
                            deppos = subsentence[dependent_index]['pos']

                        tmpdep += '(' + deps['dep'] + ' ' + deps[
                            'governorGloss'] + ' ' + govpos + ' ' + deps[
                                'dependentGloss'] + ' ' + deppos + ')\t'
                tmpdep = tmpdep.strip('\t')
                tmpdeps.append(tmpdep)

            lables = get_lable_2(aspects, tmpword)
            if 'B-TERM' not in lables:
                continue
            '''
            for i in range(0,len(lables)):
                result.append(tmpword[i]+'\t'+tmppos[i] +'\t'+lables[i]+'\n') #+'\t'+tmpdeps[i]
                #result.append('\n')
            result.append('\n')
            '''
            for i in range(0, len(lables)):
                result.append(tmpword[i] + '\t' + tmppos[i] + '\t' +
                              lables[i] + '\t' + tmpdeps[i] + '\n')
            result.append('\n')

    try:
        for aspect in aspectset:
            faspectfile.write(aspect + '\n')
        for word in result:
            fout.write(word)
    except IOError:
        print " IOError exception"
        exit(0)
    f.close()
    fout.close()
    faspectfile.close()
Example #27
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')
from functools import reduce
import pandas as pd

def keywordsInSen(sen):
    # collect NER-tagged tokens and merge runs of adjacent tokens with the same tag
    words = [(t['lemma'], t['index'], t['ner']) for t in sen['tokens']
             if t['ner'] != 'O']
    reducedWords = []
    if len(words) == 0:
        return []
    parWord = words[0]
    if len(words) > 1:
        for w in words[1:]:
            if w[1] == parWord[1] + 1 and w[2] == parWord[2]:
                parWord = (parWord[0] + ' ' + w[0], parWord[1] + 1, parWord[2])
            else:
                reducedWords.append((parWord[0], parWord[2]))
                parWord = w
    # flush the last (or only) keyword so single-token entities are not dropped
    reducedWords.append((parWord[0], parWord[2]))
    return reducedWords

def keywordsInTxt(txt):
    an = nlp.annotate(txt, properties={
      'annotators': 'ner',
      'outputFormat': 'json'
      })
    wordLists = [keywordsInSen(s) for s in an['sentences']]
    return set(reduce(lambda x, y: x + y, wordLists))

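Usage sketch for keywordsInTxt, assuming the local server is running; it returns a set of (phrase, tag) pairs with adjacent tokens of the same entity type merged:

keywords = keywordsInTxt("Barack Obama visited Paris on Monday.")
# e.g. {('Barack Obama', 'PERSON'), ('Paris', 'LOCATION'), ('Monday', 'DATE')}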
def extractFromHeadlines(headlineTable):
Example #28
import re
import json
import pickle
import os

from tqdm import tqdm
from pycorenlp import StanfordCoreNLP

nlp_server = StanfordCoreNLP('http://ink-molly.usc.edu:9000')

version = "1.0"

# File name
file_path = "data/01.src.txt"

# Display options
IF_DISP_PREFIX = False
IF_DISP_TQDM = False
IF_DISP_VB_UNMATCH = False
IF_DISP_IF_UNMATCH = False
IF_DISP_BAN = False
IF_DISP_ALL_SEN = False
IF_VERB_ONLY = True

# Character filter
character_patterns = [
    '^Craig:.*',
    '^Cestero:.*',
]

Example #29
File: diag.py Project: zwep/MID
"""

from bs4 import BeautifulSoup as bS
import collections
import networkx as nx
import os
from pycorenlp import StanfordCoreNLP
import re
import requests
import unicodedata

# starting coreNLP server via the following command.
# for the latest version, see: https://stanfordnlp.github.io/CoreNLP/
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
nlp = StanfordCoreNLP('http://localhost:9000')

# <editor-fold desc='Define the functions that we will need for this script..'>


def edge2graph(input_label, input_edges, input_diag_name, input_bio_dict):
    """
    Combines all the info in one graph object... This is a preprocessing step to store the info as a json.

    :param input_label: The input category we are going to prepocess
    :param input_edges: List of all edges that are found while analyzing the text
    :param input_diag_name: List of all diag people names
    :param input_bio_dict: Dict of the bio text of all the diag people
    :return:
    """
    ppl_label_edge_list = [x[:-1] for x in input_edges if input_label in x]
from pycorenlp import StanfordCoreNLP
import pymysql

db = pymysql.connect(host="localhost",
                     user="******",
                     password="******",
                     db="ArticleNYT")
print(db)
nlp = StanfordCoreNLP('http://localhost:9000')
print(nlp)
cur = db.cursor()
cur.execute("""SELECT COUNT(NID) FROM Tech_2018;""")
temp = cur.fetchone()
counter = temp[0] - 1
print(counter)
while (counter >= 0):
    try:
        cur.execute("""SELECT Abstract from Tech_2018 WHERE NID = %s;""",
                    (counter))
        tmpabs = cur.fetchone()
        finabs = str(tmpabs[0])
        res = nlp.annotate(finabs,
                           properties={
                               'annotators': 'sentiment',
                               'outputFormat': 'json',
                               'timeout': 1000000000,
                               "ssplit.eolonly": "true"
                           })
        for s in res["sentences"]:
            print("%s" % (s["sentimentValue"]))
        score = s['sentimentValue']
Example #31
    return sum/(len(sentiments))

#tales=['FundeVogel','Rapunzel','TheGooseGirl','Golden Bird','HansInGoodLuck','JorindaAndJorindel','TravelingMusicians','OldSultan','TheStraw','BriarRose','DogAndSparrow','TwelveDancingPrincesses','FishermanAndWife','TheWillowRen','FrogPrince','CatAndMouse']
taleSentiments=[]
for taleName in tales:
    #f = open("./Corefs/"+taleName,'r',encoding="utf8")
    p(taleName)
    if sys.argv[1] == '1':
        f=open("./Stories/"+taleName,'r',encoding="utf8")
    else:
        f=open("./Corefs/"+taleName,'r',encoding="utf8")
    tale= f.read()
    tale = tale.replace('\n', ' ')
    tale = tale.replace('\r', ' ')
    #pprint.pprint(tale)
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    #doc = "Ronaldo has moved from Real Madrid to Juventus. While Messi still plays for Barcelona"
    doc=tale
    #pprint.pprint(doc)
    annot_doc = nlp_wrapper.annotate(doc,
        properties={
            'annotators': 'ner, pos,depparse',
            'outputFormat': 'json',
            'timeout': 100000,
        })

    nsubjs=[]
    #pprint.pprint(annot_doc)
    for sentence in annot_doc['sentences']:
        for element in sentence['basicDependencies']:
            if(element['dep']=='nsubj'):
Example #32
# -*- coding:utf-8 -*-
from pycorenlp import StanfordCoreNLP
import re
from nltk import RegexpParser

#nlp = StanfordCoreNLP('http://localhost:9000/')
nlp = StanfordCoreNLP("http://corenlp.run/")

grammar = """
    V: {<VB.*><PR>?<IN|TO>?}
    W: {<NN*|JJ|RB.*|PRP.*|DT>}
    P: {<IN|TO|PR>}
    VP2: {<V><P>}
    VP3: {<V><W>+<P>}
    VP1: {<V>}
"""

vp_parser = RegexpParser(grammar)


def clean(word):
    if "(" in word:
        word = word[:word.find("(")]
    return word


def analyze(sentence):
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,parse,depparse,coref',
        'tokenize.whitespace': True,
        'outputFormat': 'json'
#import json
#import nltk
#import math
#import re
import string
from pycorenlp import StanfordCoreNLP
from textblob import TextBlob
from nltk.corpus import wordnet
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import test_main_rules as rules

analyser = SentimentIntensityAnalyzer()
nlp = StanfordCoreNLP('http://localhost:9000')
#f = open("sample_sentences.txt","r")
line = '''
I have received redmi note 4 black matte 64gb version today. Packaging is so good.
About phone: its fabulous phone.Amazing battery back up, good camera, great memory, beautiful colour of phone with classy primium look of black matte makes it different from other phone. I am loving every feature of this phone.
'''

asp_sent = {}
asp_rating = {}


def corefResolver(line):
    ind_sent = []
    complete_coref_output = nlp.annotate(line,
                                         properties={
                                             'annotators': 'dcoref',
                                             'outputFormat': 'json'
                                         })
    coref_output = complete_coref_output['corefs']
Example #34
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY,
                                     'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')

if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# looping through .ann files in the data directory
ann_data_files = [
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann'
]

for file in ann_data_files:
    entities = []
    relations = []

    # process .ann file - place entities and relations into two separate lists of tuples
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
Example #35
def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.
    '''
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise ValueError("tokenizer should be either 'spacy' or 'stanford'.")
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type),
          end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'latin-1')
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                           base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'latin-1').close()

        text, entities = get_entities_from_brat(text_filepath,
                                                annotation_filepath)
        entities = sorted(entities, key=lambda entity: entity["start"])

        if tokenizer == 'spacy':
            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
        elif tokenizer == 'stanford':
            sentences = get_sentences_and_tokens_from_stanford(text, core_nlp)

        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:

                        token['label'] = entity['type'].replace(
                            '-', '_'
                        )  # because the .ann format doesn't support tags with '-' in them

                        break
                    elif token['end'] < entity['start']:
                        break

                if len(entities) == 0:
                    entity = {'end': 0}
                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']
                if verbose:
                    print('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], gold_label))
                output_file.write('{0} {1} {2} {3} {4}\n'.format(
                    token['text'], base_filename, token['start'], token['end'],
                    gold_label))
            if verbose: print('\n')
            output_file.write('\n')

    output_file.close()
    print('Done.')
    if tokenizer == 'spacy':
        del spacy_nlp
    elif tokenizer == 'stanford':
        del core_nlp
Example #36
import sys
import json

from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')

# file_name = './python_code/test.txt'
# input = open(file_name).read().splitlines()

file_name = json.loads(sys.stdin.readlines()[0])
input = file_name.splitlines()

i = 0
ans_index = 1
res_sentence_arr_disp = []
pos_tags = ['NN', 'NNP', 'NNS', 'NNPS', 'CD', 'JJ']
res_sentence_disp = ''
while (i < len(input)):
    input[i] = input[i].lower()
    res = nlp.annotate(input[i],
                       properties={
                           'annotators': 'pos',
                           'outputFormat': 'json',
                           'timeout': 1000000,
                       })

    for k in range(0, len(res["sentences"])):
        tokens = res["sentences"][k]['tokens']
        b_flag = False
        for token in tokens:
Example #37
#-----Lemmatizing Words------------
lemmas = []
wordnet_lemmatizer = WordNetLemmatizer()
for word in unic_nouns:
    lemma = wordnet_lemmatizer.lemmatize(word)
    lemmas.append(lemma)
#-----------------------------------
#--------Word Cloud----------------- #Print to generate Word Clouds
""""
for i in range(len(lemmas)):
    print(str(nouns_frequency[i])+ " " + lemmas[i])
"""
#-------------------------------------Item 4-----------------------------------------------------------
sentence = "The last love letter I wrote was probably about 10 years ago."
#tokenized = nltk.word_tokenize(sentence)

parse = StanfordCoreNLP('http://localhost:9000')

output = parse.annotate(sentence,
                        properties={
                            'annotators': 'parse',
                            'outputFormat': 'json'
                        })

tree1 = output['sentences'][0]['parse']
treeFinal = Tree.fromstring(tree1)
treeFinal.draw()
#t = Tree.
#t.draw()
Example #38
 def __init__(self):
     self.corenlp = StanfordCoreNLP('http://localhost:9000')
Example #39
import json
import os
import re
import requests
import sys
import traceback

from json import JSONDecodeError
from requests.exceptions import RequestException
from nltk.tokenize import sent_tokenize
from pycorenlp import StanfordCoreNLP

nlpserver = StanfordCoreNLP("http://localhost:9000")


def clean_depparse(dep):
    """
    Given a dependency dictionary, return a formatted string representation.
    """
    return str(dep['dep'] + "(" + dep['governorGloss'].lower() + "-" +
               str(dep['governor']) + ", " + dep['dependentGloss'] + "-" +
               str(dep['dependent']) + ")")

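A quick sanity check for clean_depparse, using a hand-built dependency dict with the keys the function reads:

dep = {'dep': 'nsubj', 'governorGloss': 'Chased', 'governor': 2,
       'dependentGloss': 'dog', 'dependent': 1}
print(clean_depparse(dep))   # nsubj(chased-2, dog-1)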

def clean_treeparse(tree):
    cleaned_tree = re.sub(r' {2,}', ' ', tree)
    cleaned_tree = re.sub(r'\n', '', cleaned_tree)
    cleaned_tree = re.sub(r'\([^\s]*\s', '', cleaned_tree)
    cleaned_tree = re.sub(r'\)', '', cleaned_tree)
    cleaned_tree = re.sub(r'-LRB-', '(', cleaned_tree)
    cleaned_tree = re.sub(r'-RRB-', ')', cleaned_tree)
Example #40
class GoldenSupervision():
    def __init__(self):
        self.load_data()
        self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path)

    def load_data(self):
        # loading webcomplexquestions
        with open(config.complexwebquestions_dir + 'ComplexWebQuestions_' +
                  config.EVALUATION_SET + '.json') as f:
            questions = json.load(f)
        print(len(questions))
        print(pd.DataFrame(questions)['compositionality_type'].value_counts())

        # aliases version
        compWebQ = pd.DataFrame(
            [{'ID': question['ID'], 'question': question['question'], 'webqsp_question': question['webqsp_question'], \
              'machine_question': question['machine_question'], 'comp': question['compositionality_type'], \
              } for question in questions])
        print(compWebQ['comp'].value_counts())

        self.compWebQ = compWebQ.to_dict(orient="rows")

    def calc_split_point(self, question):
        question['question'] = question['question'].replace('?', '').replace(
            '.', '')
        question['machine_question'] = question['machine_question'].replace(
            '?', '').replace('.', '')
        machine_annotations = self.annotat(question['machine_question'],
                                           annotators='tokenize')
        webqsp_annotations = self.annotat(question['webqsp_question'],
                                          annotators='tokenize')
        question['machine_tokens'] = machine_annotations
        question['webqsp_tokens'] = webqsp_annotations

        # calculating original split point
        org_q_vec = question['webqsp_tokens']
        machine_q_vec = question['machine_tokens']
        org_q_offset = 0

        for word in machine_q_vec:
            if org_q_offset < len(
                    org_q_vec) and org_q_vec[org_q_offset] == word:
                org_q_offset += 1
            else:
                break

        # adding split_point2 for composition
        if question['comp'] == 'composition':
            org_q_offset2 = len(machine_q_vec) - 1
            for word in org_q_vec[::-1]:
                if org_q_offset2 > 0 and machine_q_vec[org_q_offset2] == word:
                    org_q_offset2 -= 1
                else:
                    break
            if org_q_offset2 != len(machine_q_vec) - 1:
                question['split_point2'] = org_q_offset2
            else:
                question['split_point2'] = org_q_offset2

            question['machine_comp_internal'] = ' '.join(
                question['machine_tokens']
                [org_q_offset:question['split_point2'] + 1])

        question['split_point'] = org_q_offset
        if question['split_point'] == 0:
            question['split_point'] = 1

        org_q_offset = 0
        new_part = []
        for word in question['machine_tokens']:
            if org_q_offset < len(question['webqsp_tokens']) and question[
                    'webqsp_tokens'][org_q_offset] == word:
                org_q_offset += 1
            else:
                new_part.append(word)

        question['split_point'] = org_q_offset
        question['new_part'] = ' '.join(new_part)
        return question

    # Generating golden supervision
    def gen_golden_supervision(self):
        qind = 0
        num_q_to_proc = len(self.compWebQ)
        for question in self.compWebQ[0:num_q_to_proc]:

            # print question
            qind += 1
            if qind % 100 == 0:
                print(qind)

            if question['comp'] is None or question['comp'] in [
                    'comparative', 'superlative'
            ]:
                continue

            question = self.calc_split_point(question)
            mg_question = question['machine_question'].split()

            if question['split_point'] == 0:
                question['split_point'] = 1

            question['flip_rephrase'] = 0
            if question['comp'] == 'conjunction':
                tokens_anno = self.annotat(' '.join(mg_question))
                question['machine_comp_internal'] = ''
                s = question['split_point']
                question['split_part1'] = ' '.join(mg_question[:s])
                question['split_part2'] = mg_question[s:]
                if question['split_part2'][
                        0] == 'and':  # delete conjunction word
                    question['split_part2'] = question['split_part2'][1:]
                # add wh- and nouns of first part
                head_part = []
                for i in range(len(tokens_anno)):
                    # if we meet a verb, or a that(WDT) in the middle, we break
                    if 'V' in tokens_anno[i]['pos'] or (
                            'WDT' in tokens_anno[i]['pos'] and i != 0):
                        break
                    else:
                        head_part.append(mg_question[i])
                question['split_part2'] = ' '.join(head_part +
                                                   question['split_part2'])
            else:
                if question['split_point2'] <= question['split_point']:
                    print('found error in split point 2')
                    question['split_point2'] = question['split_point'] = 1
                s1, s2 = question['split_point'], question['split_point2']
                question['split_part1'] = question['machine_comp_internal']
                question['split_part2'] = ' '.join(mg_question[:s1] + [
                    '%composition',
                ] + mg_question[s2 + 1:])
            # print('{}[{}]\n[{}]\n[{}]\n{}'.format(question['comp'], ' '.join(mg_question),
            #                                     question['split_part1'], question['split_part2'], '-' * 100))

        out = pd.DataFrame(self.compWebQ[0:num_q_to_proc])[[
            'ID', 'comp', 'flip_rephrase', 'split_part1',
            'machine_comp_internal', 'split_part2', 'question',
            'machine_question'
        ]]

        with open(
                config.golden_supervision_dir + config.EVALUATION_SET +
                '.json', 'w') as outfile:
            json.dump(out.to_dict(orient="rows"),
                      outfile,
                      sort_keys=True,
                      indent=4)

    def annotat(self, text, annotators='pos'):
        question = text.replace('?', '')

        text = unicodedata.normalize('NFKD', question).encode(
            'ascii', 'ignore').decode(encoding='UTF-8')

        output = self.nlp.annotate(text,
                                   properties={
                                       'annotators': annotators,
                                       'outputFormat': 'json'
                                   })
        try:
            tokens_anno = output['sentences'][0]['tokens']
        except KeyError:
            tokens_anno = [k['word'] for k in output['tokens']]
        return tokens_anno
Example #41
class Text2Vec(object):
    def __init__(self, wordvec_path, preload=False):
        self.wp = wordvec_path
        self.wv = {}
        self.__read_wv__(preload=preload)
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    def __read_wv__(self, sep=" ", preload=False):
        if not preload:
            with open(self.wp, 'r') as f:
                for line in f:
                    tmp = line.split(sep)
                    word = tmp[0]
                    vec = np.array([float(each) for each in tmp[1:]])
                    self.wv[word] = vec
            print("Number of tokens: ", len(self.wv))
            # pprint(self.wv.keys())
            # dump wordvector
            with open('gloveWordVector.bin', 'wb') as f2:
                pickle.dump(self.wv, f2)
        else:
            with open(self.wp, 'rb') as f:
                self.wv = pickle.load(f)

    def convert2vec(self, in_path, out_path, has_keywords=False):
        vectors = []
        labels = []
        with open(in_path) as f:
            for i, line in enumerate(f):
                data = json.loads(line)
                section = data['section']
                if has_keywords:
                    keywords = data['keywords']
                else:
                    keywords = None
                headline = data['headline']
                lead_paragraph = data['lead_paragraph']
                tokens = []
                try:
                    out = self.nlp.annotate(lead_paragraph,
                                            properties={
                                                'annotators':
                                                'tokenize, ssplit, pos',
                                                'outputFormat': 'json'
                                            })
                    # take sentence 1
                    if isinstance(out, dict) and out['sentences']:
                        sentence = out['sentences'][0]
                        for each in sentence['tokens']:
                            word = each['word'].lower()
                            word = word.strip('.')
                            word = word.strip(',')
                            word = word.strip(')')
                            word = word.strip('(')
                            pos = each['pos']
                            if "JJ" in pos or "NN" in pos or "VB" in pos:
                                tokens.append(word)
                except AssertionError:
                    pass
                # add keywords
                if keywords:
                    for each in keywords:
                        tmp = each['value']
                        tmp = tmp.split(" ")
                        tmp = [each.strip(',').lower() for each in tmp]
                        tmp = [each.strip('.').lower() for each in tmp]
                        tmp = [each.strip(')').lower() for each in tmp]
                        tmp = [each.strip('(').lower() for each in tmp]
                        tokens += tmp

                # add headline
                if headline:
                    tmp = headline.split(' ')
                    tmp = [each.strip(',').lower() for each in tmp]
                    tmp = [each.strip('.').lower() for each in tmp]
                    tmp = [each.strip(')').lower() for each in tmp]
                    tmp = [each.strip('(').lower() for each in tmp]
                    tokens += tmp

                wv = None
                fail2find = []
                count = 0
                for t in tokens:
                    if t in self.wv:
                        if wv is None:
                            if float('inf') not in self.wv[t] and -float(
                                    'inf') not in self.wv[t] and all(
                                        self.wv[t] < 1e5):
                                wv = self.wv[t]
                                count += 1
                        else:
                            if float('inf') not in self.wv[t] and -float(
                                    'inf') not in self.wv[t] and all(
                                        self.wv[t] < 1e5):
                                wv += self.wv[t]
                                count += 1
                    else:
                        fail2find.append(t)
                print(
                    "article %s -- Tokens not in word vector dictionary: %s" %
                    (i, fail2find))
                if wv is not None:
                    vectors.append(wv / count)
                    labels.append(section)

        vectors = np.array(vectors)
        print(vectors.shape)
        print(vectors)
        unique_labels = set(labels)
        label_mapping = {}
        for i, each in enumerate(unique_labels):
            label_mapping[each] = i
        new_labels = []
        for each in labels:
            new_labels.append(label_mapping[each])
        new_labels = np.array(new_labels).reshape(-1, 1)
        print(new_labels.shape)
        complete_data = np.concatenate((vectors, new_labels), axis=1)
        print(complete_data)
        np.savetxt('NewYorkTime.csv', complete_data, delimiter=',')

    @staticmethod
    def plot_data(data):
        num_sample = 5000
        label = data[:, -1]
        feature = data[:, :-1]

        assignment = {}

        for i in range(len(feature)):
            if label[i] not in assignment:
                assignment[label[i]] = []

            assignment[label[i]].append(i)

        # down sample
        old_assignment = assignment
        assignment = {}

        indicies = []
        for label in old_assignment:
            last_length = len(indicies)
            indicies += np.random.choice(
                old_assignment[label],
                size=min(int(num_sample / len(old_assignment)),
                         len(old_assignment[label])),
                replace=False).tolist()
            assignment[label] = np.arange(last_length, len(indicies))

        feature = feature[indicies]
        print(feature.shape)
        print(len(indicies))
        print(len(np.unique(indicies)))

        tsne = TSNE()
        x = tsne.fit_transform(feature)

        fig, ax = plt.subplots()

        # ax.plot(x[:, 0], x[:, 1], '*')
        r = RandomColor()
        colors = r.generate(count=len(assignment))
        for i, label in enumerate(assignment):
            ax.plot(x[assignment[label]][:, 0],
                    x[assignment[label]][:, 1],
                    '*',
                    color=colors[i],
                    label=label)
        plt.legend()
        plt.show()
Example #42
class StanfordRE(ReModel):
    def __init__(self, corpus, relationtype, modelname="stanfordre_classifier.ser"):
        super(StanfordRE, self).__init__()
        self.modelname = modelname
        self.pairs = {}
        self.corenlp_client = None
        self.relationtype = relationtype
        self.corpus = corpus

    def generate_data(self, corpus, modelname, pairtypes):
        if os.path.isfile(self.temp_dir + modelname + ".txt"):
            print "removed old data"
            os.remove(self.temp_dir + modelname + ".txt")
        trainlines = []
        # get all entities of this document
        # doc_entities = []
        pcount = 0
        truepcount = 0
        ns = 0
        for sentence in corpus.get_sentences("goldstandard"):
            logging.info("{}".format(sentence.sid))
            nt_to_entity = {}
            for e in sentence.entities.elist['goldstandard']:
                # TODO: merge tokens of entity
                nt = str(e.tokens[0].order)
                nt_to_entity[nt] = e
            # print nt_to_entity
            # ns = sentence.sid.split("s")[-1]
            for t in sentence.tokens:
                nt = str(t.order)
                # print nt, nt in nt_to_entity
                if nt in nt_to_entity:
                    # print nt, nt_to_entity[nt], nt_to_entity[nt].type
                    #l = [str(ns), nt_to_entity[nt].type, nt, "O", t.pos, t.text, "O", "O", "O"]
                    # TODO: change other to entitiy name
                    l = [str(ns), "Other", nt, "O", t.pos, t.text, "O", "O", "O"]
                else:
                    # print nt, nt_to_entity
                    l = [str(ns), "O", nt, "O", t.pos, t.text, "O", "O", "O"]
                trainlines.append(l)
            trainlines.append([""])
            sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]]
            # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
            for pair in itertools.combinations(sentence_entities, 2):
                if pair[0].type == pairtypes[0] and pair[1].type == pairtypes[1] or pair[1].type == pairtypes[0] and pair[0].type == pairtypes[1]:
                    # logging.debug(pair)
                    if pair[0].type == pairtypes[0]:
                        e1id = pair[0].eid
                        e2id = pair[1].eid
                    else:
                        e1id = pair[1].eid
                        e2id = pair[0].eid
                        pair = (pair[1], pair[0])
                    pid = sentence.did + ".p" + str(pcount)
                    # self.pairs[pid] = (e1id, e2id)
                    self.pairs[pid] = pair
                    if e2id in pair[0].targets:
                        truepcount += 1
                        nt1 = str(pair[0].tokens[0].order)
                        nt2 = str(pair[1].tokens[0].order)
                        trainlines.append([nt1, nt2, "+".join(pairtypes)])
                pcount += 1
                trainlines.append([""])
            # advance the sentence counter once per sentence, not once per candidate pair
            ns += 1



        logging.info("Writing {} lines...".format(len(trainlines)))
        with codecs.open(self.temp_dir + modelname + ".corp", 'w', "utf-8") as trainfile:
            for l in trainlines:
                # print l
                trainfile.write("\t".join(l) + "\n")
        logging.info("True/total relations:{}/{} ({})".format(truepcount, pcount, str(1.0*truepcount/pcount)))

    def write_props(self):
        with open(config.corenlp_dir + "roth.properties", 'r') as propfile:
            lines = propfile.readlines()

        print lines
        with open(config.corenlp_dir + "roth.properties", 'w') as propfile:
            for l in lines:
                if l.startswith("serializedRelationExtractorPath"):
                    propfile.write("serializedRelationExtractorPath = {}\n".format(config.corenlp_dir + self.modelname))
                elif l.startswith("trainPath"):
                    propfile.write("trainPath = {}\n".format(self.temp_dir + self.modelname + ".corp"))
                else:
                    propfile.write(l)

    def train(self):
        self.generate_data(self.corpus, self.modelname, pairtypes=self.relationtype)
        # java -cp classpath edu.stanford.nlp.ie.machinereading.MachineReading --arguments roth.properties
        if os.path.isfile(config.corenlp_dir + self.modelname):
            print "removed old model"
            os.remove(config.corenlp_dir + self.modelname)
        if not os.path.isfile(self.temp_dir + self.modelname  + ".corp"):
            print "could not find training file " + config.corenlp_dir + self.modelname + ".corp"
            sys.exit()
        self.write_props()
        classpath = config.corenlp_dir + "*"
        srecall = ['java', '-mx3g', '-classpath', classpath, "edu.stanford.nlp.ie.machinereading.MachineReading",
                          "--arguments",  config.corenlp_dir + "roth.properties"]
        print " ".join(srecall)
        # sys.exit()
        srecall = Popen(srecall) #, stdout=PIPE, stderr=PIPE)
        res  = srecall.communicate()
        if not os.path.isfile(config.corenlp_dir + self.modelname):
            print "error with StanfordRE! model file was not created"
            print res[1]
            sys.exit()
        else:
            statinfo = os.stat(config.corenlp_dir + self.modelname)
            if statinfo.st_size == 0:
                print "error with StanfordRE! model has 0 bytes"
                print res[0]
                print res[1]
                sys.exit()
        # logging.debug(res)

    def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"):
        self.corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # sup.relation.model=
        tokenkeys = set()
        sentencekeys = set()
        for d in self.corpus.documents:
            for s in self.corpus.documents[d].sentences:
                corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={
                        'ssplit.eolonly': True,
                        'openie.triple.all_nominals': True,
                        'openie.triple.strict': False,
                        'openie.max_entailments_per_clause': 500,
                        'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie',
                        #'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie',
                        'outputFormat': 'json',
                        # 'sup.relation.model': self.modelname
                    })
                for o in corenlpres["sentences"][0]["openie"]:
                    if "mir" in o["object"] or "mir" in o["subject"]:
                        print "{}={}>{}".format(o["subject"], o["relation"], o["object"])


    def test(self, outputfile="jsre_results.txt"):
        pass

    def get_predictions(self, corpus, examplesfile="slk_classifier.model.txt", resultfile="jsre_results.txt"):
        pass
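A minimal usage sketch for the StanfordRE class above. It assumes a corpus object carrying "goldstandard" entity annotations (as generate_data expects) and a config module whose corenlp_dir points at a CoreNLP installation containing roth.properties; the relation type pair is a placeholder, not taken from the snippet.

# Hedged sketch: `corpus`, `config.corenlp_dir` and the ("mirna", "protein") pair are assumptions.
sre = StanfordRE(corpus, relationtype=("mirna", "protein"),
                 modelname="stanfordre_classifier.ser")
sre.train()            # writes the .corp training file and runs MachineReading
sre.load_classifier()  # runs OpenIE over the corpus and prints triples mentioning "mir"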
Example #43
0
"""
author: Giancarlo D. Salton
"""

from pycorenlp import StanfordCoreNLP
import codecs
import json
import utils

properties = {
    "annotators": "tokenize,ssplit,pos,depparse,lemma",
    "depparse.extradependencies": "NONE",
    "outputFormat": "json"
}

nlp = StanfordCoreNLP('http://localhost:9000')

input_file = "sample.en"
output_file = "{:s}.json".format(input_file)

keep_all_dependencies = False
sent_count = 0
encoding = "utf-8"

with codecs.open(output_file, "a", "utf-8") as outfile:
    outfile.write("{\"corpus\":[\n")

    for line in codecs.open(input_file, "r", encoding):

        # if encoding.lower != "utf-8":
        #     line = line.encode("utf-8")
Example #44
0
class StanfordTFIDFApi():
    '''
        Make use of StanfordCoreNLP Server
        Extract keyword through tf-idf algorithm
    '''    
    def __init__(self):
        self.nlp = StanfordCoreNLP(NLP_SERVER)

    def __tf_by_pos(self, text, pos='N'):
        response = self.nlp.annotate(text, properties={
            'annotators': 'ner,lemma',
            'outputFormat': 'json'
        })
        logger.debug(json.dumps(response))
        '''The response is generally organized as {sentences:[{tokens:[]},{}]}'''
        
        result = list()
        if type(response) == dict and 'sentences' in response:
            for sentence in response['sentences']:
                for item in sentence['tokens']:
                    if item['pos'].startswith(pos):
                        # only accept English words that are not in STOPWORDS
                        if acceptable_word(item['lemma'].lower()):
                            result.append((item['lemma'].lower()))
            toks_count = Counter(result)
            return toks_count
        else:
            logger.warning('sentences part is not in the response from NLP server.')
            return Counter()

    def tf_idf_groupby_pos(self, text, df_cache):
        
        output = dict()
        output['NOUN'] = self.__tf_by_pos(text, 'N')
        output['VERB'] = self.__tf_by_pos(text, 'V')
        
        for pos in output:
            logger.debug('Computed tf for %s:' % pos + json.dumps(output[pos]))
            for word in output[pos]:
                '''Formula is: tf*log(N/df)'''
                if word in df_cache:
                    output[pos][word] = output[pos][word]*math.log(df_cache['total_document']
                                              /df_cache[word])
                else:
                    output[pos][word] = output[pos][word]*math.log(df_cache['total_document'])
                    
            # return the top 10 words
            output[pos] = [word for word, count in output[pos].most_common(10)]
            logger.debug('Computed tf-idf for %s:' % pos + json.dumps(output[pos]))
        
        return json.dumps(output)
        
    def compute_df(self, document_list):
        '''Compute document frequency based on input document list'''  
        df_cache = dict()
        df_output = dict()
        
        d_index = 0
        for document in document_list:
            d_index += 1
            # tokenize each document
            reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
            for item in reg_toks:
                # change each word to lower case and lemmatize
                item = normalise(item)
                if item not in df_cache:
                    df_cache[item] = set([d_index])
                else:
                    df_cache[item].add(d_index)
        
        for item in df_cache:
            if acceptable_word(item):
                df_output[item] = len(df_cache[item])
        
        df_output['total_document'] = len(document_list)
        
        return df_output
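A brief usage sketch for StanfordTFIDFApi, assuming NLP_SERVER, acceptable_word, normalise, SENTENCE_RE and logger are defined in the same module as the class (they are not shown in the snippet); the documents are invented.

# Hedged sketch: the documents below are made up for illustration.
docs = ["Stanford CoreNLP extracts tokens and lemmas from text.",
        "TF-IDF weights a word by how rare it is across documents."]
api = StanfordTFIDFApi()
df_cache = api.compute_df(docs)              # document frequencies plus 'total_document'
keywords = api.tf_idf_groupby_pos(docs[0], df_cache)
print(keywords)                              # JSON with the top NOUN and VERB keywords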
Example #45
0
def process_request(conn, addr):
	print("connected client:", addr)
	lst = b''
	data_com = conn.recv(4096)
	data_com = data_com.decode("utf8")
	data_com = data_com.split(' ')
	length = int(data_com[1])
	received = 0
	while received < length:
		data = conn.recv(1024)
		if not data:
			break
		lst += data
		received += len(data)  # count bytes actually received, not the buffer size
	# print(data_com)
	lst2 = pickle.loads(lst)

	if data_com[0].upper() == 'STAT':
		if len(lst2) < 10:
			error = 'Not enough data'
			conn.sendall(error.encode("utf8"))
		else:
			tweet_top = tweet_top10(lst2)
			retweet_top = (list(retweet_top10(lst2)))[:10]
			retweet_top10_necessary = []
			for i in range(len(retweet_top)):
				retweet_top10_necessary.append([])
				retweet_top10_necessary[i].append(retweet_top[i][6])
				retweet_top10_necessary[i].append(retweet_top[i][3])
				retweet_top10_necessary[i].append(retweet_top[i][8])
			author_top = author_top10(lst2)
			country_tweet, country_retweet = country(lst2)
			# print(tweet_top)
			# print(retweet_top10_necessary)
			# print(author_top)
			data_for_client = [['Popular words', 'Number of words']]
			data_for_client.extend(tweet_top)
			data_for_client.extend([])
			data_for_client.extend([['Tweet content', 'author', 'RT']])
			data_for_client.extend(retweet_top10_necessary)
			data_for_client.extend([['author', 'followers']])
			data_for_client.extend(author_top)
			data_for_client.extend([['country_tweet'], country_tweet])
			data_for_client.extend([['country_retweet'], country_retweet])
			# print(data_for_client)
			message = pickle.dumps(data_for_client)
			size = len(message)
			conn.sendall((str(size)).encode("utf8"))
			time.sleep(1)
			conn.sendall(message)

	if data_com[0].upper() == 'ENTI':
		nlp = StanfordCoreNLP('http://localhost:9000')
		pos = []
		for i in lst2:
			text = i[6].replace('\n',' ')
			# print(i[6])
			result = nlp.annotate( text, properties = {'annotators': 'ner', 'outputFormat': 'json', 'timeout': 100000, })
			# print(result["sentences"][0])
			for word in result["sentences"][0]["tokens"]:
				pos.append('{} ({})'.format(word["word"], word["ner"]))
				# print(pos)
			# print('')
			# print(text)
		string = " ".join(pos)
		# print(pos)
		message = pickle.dumps(string)
		size = len(message)
		conn.sendall((str(size)).encode("utf8"))
		time.sleep(1)
		conn.sendall(message)

	conn.close()
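A hedged client-side sketch of the small length-prefixed pickle protocol that process_request expects: the client sends the command plus the pickled payload size, then the payload, and reads the reply the same way. The host, port and tweet rows are invented; the real row layout depends on helper functions (tweet_top10, retweet_top10, ...) not shown here.

# Hedged sketch: host, port and row contents are placeholders.
import pickle
import socket
import time

rows = [["id", "date", "user", "text", 0, 0, "tweet text", 0, 0, "country"]] * 12
payload = pickle.dumps(rows)

sock = socket.create_connection(("localhost", 10001))
sock.sendall("STAT {}".format(len(payload)).encode("utf8"))  # command + payload length
time.sleep(0.5)                          # let the server read the command on its own
sock.sendall(payload)

size = int(sock.recv(64).decode("utf8"))        # server replies with the reply size first
reply = b""
while len(reply) < size:
    chunk = sock.recv(4096)
    if not chunk:
        break
    reply += chunk
print(pickle.loads(reply))
sock.close()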
Example #46
0
from pycorenlp import StanfordCoreNLP
import json
from nltk.tree import Tree
from SentimentModelFunctions import *

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')

    #14, 37, 58, 97, 99
    text = ["Hai"]
    for t in text:
        print("Text: {}".format(t))
        output = nlp.annotate(
            t,
            properties={
                'annotators':
                'tokenize,ssplit, parse',
                'outputFormat':
                'json',
                'parse.model':
                'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
            })
        print(type(output))
        print(
            type("CoreNLP request timed out. Your document may be too long."))
        print(json.dumps(output, indent=4))
        for i in range(len(output['sentences'])):
            tokenized_sent = [
                token_json['word']
                for token_json in output['sentences'][i]['tokens']
            ]
Example #47
0
class SentimentAnnotator:
    def __init__(self):
        self.nlp_wrapper = None
        self.settings = {
            'annotators': 'sentiment',
            'outputFormat': 'json',
            'timeout': 1000000,
        }

        # ---------------------------------------------------------------------
        # Start CoreNLP server before using sentiment annotator
        # cd stanford-corenlp-full-2018-10-05/
        # java -mx1g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
        # note: adjust the -mx1g heap size (e.g. 2g) depending on available memory
        # ---------------------------------------------------------------------

        # os.chdir("./stanford-corenlp-full-2018-10-05/")
        # os.system('java -mx1g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer')

    def sentence_level(self, sentence_tokens: list) -> list:
        """
        Given a sentence as a list of tokens, return the breakdown of sentiment values in the sentence
        :param sentence_tokens: A list of tokens
        :return: list of sentiment counts e.g. [0.1, 0.3, 0.2, 0.3, 0.1]
        counts[0:4] --> frequency of tokens with sentiment values 0 - 4
        """
        counts = [0] * 5

        for token in sentence_tokens:
            sentiment_val = int(
                self.nlp_wrapper.annotate(token, properties=self.settings)
                ["sentences"][0]["sentimentValue"])
            counts[sentiment_val] += 1
        return [count / len(sentence_tokens) for count in counts]

    def transform(self, string: str) -> list:
        """
        Given a string, decompose it into sentences and annotate the sentiments of each sentence
        :param string: string of data
        :return: list of sentiment count averages across all sentences
        """
        if self.nlp_wrapper is None:
            self.nlp_wrapper = StanfordCoreNLP('http://localhost:9000')

        all_sentences = self.nlp_wrapper.annotate(
            string, properties=self.settings)["sentences"]
        sentiment_values = []

        for sentence in all_sentences:
            token_list = [
                token['originalText'] for token in sentence['tokens']
            ]
            sentiment_values.append(
                self.sentence_level(token_list) +
                [int(sentence["sentimentValue"])])
            # print(self.sentence_level(token_list) + [int(sentence["sentimentValue"])])

        return list(np.mean(sentiment_values, axis=0))

    def fit_transform(self, data: pd.Series):
        return data.apply(lambda x: self.transform(x))
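A short usage sketch for SentimentAnnotator. It assumes a CoreNLP server with the sentiment annotator is already running on localhost:9000 (see the comments in __init__) and that numpy and pandas are imported in the class's module; the review texts are invented.

# Hedged sketch: requires a running CoreNLP server on localhost:9000.
import pandas as pd

annotator = SentimentAnnotator()
print(annotator.transform("The movie was wonderful. The ending felt rushed."))
# -> averaged per-token sentiment frequencies plus the sentence-level sentimentValue

reviews = pd.Series(["Great service and friendly staff.", "Terrible, would not recommend."])
features = annotator.fit_transform(reviews)   # one feature vector per review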
Example #48
0
class Preprocess():
    def __init__(self, argv):
        self.input = ""
        self.output_folder = ""       # output has to be a folder
        self.input_type = ""

        # Start Stanford CoreNLP Server
        self.nlp = StanfordCoreNLP('http://localhost:9000')

        # Read User Command Line
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
        for opt, arg in opts:
          if opt == '-h':
             print("Type 'python3.5 text_preprocessing/preprocess.py  -i <inputfile> -o <outputfile>' \
                   in run_source_code.sh file")
             sys.exit()
          elif opt in ("-i", "--ifile"):
             self.input = arg
             if os.path.exists(arg) == False:
                 print("Input doesn't exist")
                 sys.exit()
             if os.path.isdir(arg) == True: self.input_type = "dir"
             elif os.path.isfile(arg) == True: self.input_type = "file"
          elif opt in ("-o", "--ofile"):
             self.output_folder = arg

        print("Input: " + self.input +", " + self.input_type)
        print("Output: " + self.output_folder)


    def sentence_parsing(self, row_string):
        parsed_json = self.nlp.annotate(row_string, properties={
                       'annotators': 'tokenize,ssplit,pos',
                       'outputFormat': 'json'
                   })
        return parsed_json


    def output_preprocessed_data(self, json_input, file_name):
        rows = []
        for sent in json_input['sentences']:
            parsed_sent = " ".join([t['originalText'] + "/" + t['pos'] for t in sent['tokens']])
            rows.append(parsed_sent)
        output_file_path = self.output_folder + file_name
        with open(output_file_path, 'a') as preprocessed_out:
            for r in rows:
                preprocessed_out.write(r + "\n")


    def pos_tagging(self):
        if self.input_type == "file":
            input_path_elems = self.input.split("/")
            file_name = ""
            if input_path_elems[-1] != "/":
                file_name = input_path_elems[-1]
            else:
                file_name = input_path_elems[-2]
            text_string = ""
            with open(self.input, 'rb') as file_input:
                for r in file_input:
                    text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')])
            print(self.input)
            parsed_json = self.sentence_parsing(text_string)
            self.output_preprocessed_data(parsed_json, file_name)
        elif self.input_type == "dir":
            for file_name in os.listdir(self.input):
                input_file_path = self.input + file_name
                text_string = ""
                with open(input_file_path, 'rb') as file_input:
                    for r in file_input:
                        text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')])
                parsed_json = self.sentence_parsing(text_string)
                print(input_file_path)
                self.output_preprocessed_data(parsed_json, file_name)

Example #49
0
if __name__ == '__main__':
    args = parse_args()

    args.output_dir = "../../data/copa/"
    args.train_file = "../../data/copa/train.jsonl"
    args.predict_file = "../../data/copa/val.jsonl"
    args.test_file = "../../data/copa/test.jsonl"

    # make output directory if not exist
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # register corenlp server
    nlp = StanfordCoreNLP('http://localhost:9753')

    # load train and dev datasets
    trainset = read_json_lines(args.train_file)
    devset = read_json_lines(args.predict_file)
    testset = read_json_lines(args.test_file)

    for dataset, path, name in zip(
        (trainset, devset, testset),
        (args.train_file, args.predict_file, args.test_file),
        ('train', 'dev', 'test')):
        output_path = os.path.join(
            args.output_dir,
            "{}.tagged.jsonl".format(os.path.basename(path)[:-6]))
        tagging(dataset, nlp, output_path)
        # output_path = os.path.join(args.output_dir, "{}.tagged.jsonl".format(os.path.basename(path)[:-6]))
Example #50
0
class DataProcessor:
    def __init__(self, files=None):
        self.sources = files
        self.triples = []
        self.news = ""
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    def add_source(self, files):
        self.sources = files

    def generate_triples(self):
        for source in self.sources:
            if os.path.exists(source):
                with open(source) as file:
                    lines = file.readlines()
                    for line in lines:
                        [h, r, t] = line.split()
                        triple = self._create_triple(h, t, r)
                        self.triples.append(triple)
        return self.triples

    def analyse_input(self):
        # nlp = StanfordCoreNLP('http://localhost:9000')
        # text = "'Tom be 42 years old, Tom be a teacher'"
        output = self.nlp.annotate(
            self.news,
            properties={
                'annotators': 'tokenize, ssplit, pos, depparse, parse, openie',
                'outputFormat': 'json'
            })

        self.triples = []
        try:
            for item in output['sentences'][0]['openie']:
                tmp = item['subject'].replace(" ", "_") + "\t" \
                      + self._format_relation(item['relation']) + "\t" \
                      + item["object"].replace(" ", "_")
                [h, r, t] = tmp.split()
                triple = self._create_triple(h, t, r)
                self.triples.append(triple)
                # triple.append(tmp)
        except:
            # traceback.print_exc()
            pass
        # print(triple)

    # def solve_sentence(self, sentence):
    #     sentence = "Trump_campaign_spokeswoman	willPutFirst	America"
    #     nhead = "Trump_campaign_spokeswoman"
    #     nrelation = "willPutAt"
    #     ntail = "America"
    #     return self._create_triple(nhead, ntail, nrelation)

    def _create_triple(self, nhead="", ntail="", nrelation=""):
        head = Entity(nhead.replace("\'", "`"))
        relation = Relation(nrelation.replace("\'", "`"),
                            re.sub('[^a-zA-Z \n_]', '', nrelation))
        tail = Entity(ntail.replace("\'", "`"))
        return Triple(head, relation, tail)

    def _format_relation(self, str):
        arr = [pos for pos, char in enumerate(str) if char == " "]

        result = ""
        for index, item in enumerate(str):
            if (index - 1) in arr:
                result += item.upper()
            else:
                result += item

        result = result.replace(" ", "")

        return result
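A small usage sketch for DataProcessor, assuming the Entity, Relation and Triple classes it builds on are importable from the surrounding project and that a CoreNLP server with the openie annotator is running on localhost:9000; the news sentence is invented.

# Hedged sketch: Entity/Relation/Triple come from the surrounding project.
dp = DataProcessor()
dp.news = "Tom is 42 years old and Tom is a teacher."
dp.analyse_input()            # runs OpenIE over dp.news and fills dp.triples
for triple in dp.triples:
    print(triple)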
Example #51
0
with open('tweet_data_01.csv', 'r', encoding='mac_roman') as csvfile:

    f_reader = csv.reader(csvfile, delimiter=',')
    for row in f_reader:
        tweet_tuple = (row[0], row[1], row[2], row[3]
                       )  #date,text,retweet,favorite
        raw_tweets.append(tweet_tuple)
csvfile.close()
#print(len(raw_tweets))
#text = 'Learning a winning stock trading strategy is EASY with Tim Sykes http://smq.tc/1BFjMXK  $MGPI $CORN $CAMP'
# text = '14  Top impressive #ETF $DOD $RWX $IFGL $BSJG $ERO $SCHC $VEGI $HGI $UYG $URA $GRU $JJG $CORN https://twitter.com/search?f=tweets&vertical=default&q=%24DOD%20OR%20%24RWX%20OR%20%24IFGL%20OR%20%24BSJG%20OR%20%24ERO%20OR%20%24SCHC%20OR%20%24VEGI%20OR%20%24HGI%20OR%20%24UYG%20OR%20%24URA%20OR%20%24GRU%20OR%20%24JJG%20OR%20%24CORN&src=typd …'
# text = re.sub('http\S+\s+','',text)
# print(text)

nlp = StanfordCoreNLP('http://localhost:9000')
cnt = 0
for tweet in raw_tweets:
    cnt += 1
    if cnt == 1:
        continue

    text = tweet[1].strip()  #get raw tweet text
    text = re.sub('http\S+\s+', '', text)  #get rid of urls in the tweet
    res = nlp.annotate(text,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 10000,
                       })
    if len(res) > 1 or len(res) < 0:
Example #52
0
import os

os.chdir("/home/gowtham/Documents/stanford-corenlp-full-2017-06-09/")
#call(["java","-mx4g","-cp",'"*"'," edu.stanford.nlp.pipeline.StanfordCoreNLPServer","-port","9009","-timeout","15000"])
# start the server in the background ('&') so the script below can continue
os.system(
    'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9011 -timeout 15000 &'
)
import json

from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9011')

l = []

text = ('Pusheen and Smitha walked along the beach. ')

l = text.split(".")

print(l)
output = nlp.annotate(l[0],
                      properties={
                          'annotators': 'sentiment',
                          'outputFormat': 'json'
                      })

print(output['sentences'][0]['sentiment'])
with open("output.json", "w+") as f:
    f.write(json.dumps(output, indent=4, sort_keys=True, ensure_ascii=False))
Example #53
0
 def __init__(self):
     self.nlp = StanfordCoreNLP(NLP_SERVER)
Example #54
0
fopPOSNLTK = fopInputAfterTranslation + 'pos_nltk/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fpSortedSource = fopSortedBySimScore + 'source.txt'
fopSortedPOSStanford = fopSortedBySimScore + 'pos_stanford/'
fopSortedPOSNLTK = fopSortedBySimScore + 'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)

strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"
nltk.download('bllip_wsj_no_aux')
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

strServerPort = '9000'
nlpObj = StanfordCoreNLP('http://localhost:' + strServerPort)

fpAppendNLTKText = fopSortedBySimScore + 'appendPOS.nltk.text.txt'
fpAppendNLTKPOS = fopSortedBySimScore + 'appendPOS.nltk.pos.txt'
fpAppendStanfordText = fopSortedBySimScore + 'appendPOS.stanford.text.txt'
fpAppendStanfordPOS = fopSortedBySimScore + 'appendPOS.stanford.pos.txt'

fpV2AppendNLTKText = fopSortedBySimScore + 'v2.appendPOS.nltk.text.txt'
fpV2AppendNLTKPOS = fopSortedBySimScore + 'v2.appendPOS.nltk.pos.txt'
fpV2AppendStanfordText = fopSortedBySimScore + 'v2.appendPOS.stanford.text.txt'
fpV2AppendStanfordPOS = fopSortedBySimScore + 'v2.appendPOS.stanford.pos.txt'

f1 = open(fpAppendStanfordText, 'r')
arrStanfordText = f1.read().strip().split('\n')
f1.close()
f1 = open(fpAppendStanfordPOS, 'r')
Example #55
0
File: test2.py  Project: IdeologyPin/pygate
 def __init__(self, annotators='tokenize,ssplit,pos,parse'):#depparse
     self.annotators=annotators
     self.nlp = StanfordCoreNLP('http://localhost:9000')
Example #56
0
class AMRInputPreprocessor(object):
    def __init__(self, url=core_nlp_url):
        self.nlp = StanfordCoreNLP(url)
        self.joints_map = self.readJoints()
        self.number_texts = {
            "hundred", "thousand", "million", "billion", "trillion",
            "hundreds", "thousands", "millions", "billions", "trillions"
        }
        self.slashedNumber = re.compile(r'-*\d+-\d+')

    def readJoints(self):
        joints_map = {}
        with open("data/joints.txt", 'r') as f:
            line = f.readline()
            while line.strip() != '':
                line = f.readline()
                compounds = line.split()
                past = ""
                for w in compounds:
                    joints_map.setdefault(past[:-1], []).append(w)
                    past = past + w + "-"
        return joints_map

    def combine_number(self, data):
        #combine phrase e.g. :  make up
        def combinable_number(n1, n2):
            return n2 in self.number_texts and n1 != "-"

        def combinable(i, m):
            return len(lemma) > 0 and m == "CD"\
                    and pos[-1] =="CD" and combinable_number(lemma[-1], data["lem"][i])

        lemma = []
        ner = []
        tok = []
        pos = []

        for i, m in enumerate(data["pos"]):
            if combinable(i, m):
                lemma[-1] = lemma[-1] + "," + data["lem"][i]
                tok[-1] = tok[-1] + "," + data["tok"][i]
                pos[-1] = "CD"
        #        ner[-1] = ner[-1]
            else:
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])
                ner.append(data["ner"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def tag_url_and_split_number(self, data):
        lemma = []
        ner = []
        tok = []
        pos = []

        for i, le in enumerate(data["lem"]):
            if "http" in le or "www." in le:
                ner.append("URL")
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])

            elif re.match(self.slashedNumber, le) and data["ner"][i] == "DATE":
                les = le.replace("-", " - ").split()
                toks = data["tok"][i].replace("-", " - ").split()
                assert len(les) == len(toks), data
                for l in les:
                    if l != "-":
                        pos.append(data["pos"][i])
                        ner.append(data["ner"][i])
                    else:
                        pos.append(":")
                        ner.append("0")
                lemma = lemma + les
                tok = tok + toks
            else:
                ner.append(data["ner"][i])
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def combine_phrase(self, data):
        #combine phrase e.g. :  make up
        lemma = []
        ner = []
        tok = []
        pos = []
        skip = False
        for i, le in enumerate(data["lem"]):
            if skip:
                skip = False
            elif len(lemma) > 0 and le in self.joints_map.get(lemma[-1], []):
                lemma[-1] = lemma[-1] + "-" + le
                tok[-1] = tok[-1] + "-" + data["tok"][i]
                pos[-1] = "COMP"
                ner[-1] = "0"
            elif len(lemma) > 0 and le == "-" and i < len(data["lem"])-1 \
                and data["lem"][i+1] in self.joints_map.get( lemma[-1] ,[]):
                lemma[-1] = lemma[-1] + "-" + data["lem"][i + 1]
                tok[-1] = tok[-1] + "-" + data["tok"][i + 1]
                pos[-1] = "COMP"
                ner[-1] = "0"
                skip = True
            else:
                lemma.append(le)
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])
                ner.append(data["ner"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def featureExtract(self, src_text, whiteSpace=False):
        data = {}
        output = self.nlp.annotate(
            src_text.strip(),
            properties={
                'annotators': "tokenize,ssplit,pos,lemma,ner",
                "tokenize.options":
                "splitHyphenated=true,normalizeParentheses=false",
                "tokenize.whitespace": whiteSpace,
                'ssplit.isOneSentence': True,
                'outputFormat': 'json'
            })
        snt = output['sentences'][0]["tokens"]
        data["ner"] = []
        data["tok"] = []
        data["lem"] = []
        data["pos"] = []
        for snt_tok in snt:
            data["ner"].append(snt_tok['ner'])
            data["tok"].append(snt_tok['word'])
            data["lem"].append(snt_tok['lemma'])
            data["pos"].append(snt_tok['pos'])

    #   if whiteSpace is False:
    #       return self.featureExtract(" ".join(data["tok"]),True)
        asserting_equal_length(data)
        return data

    def preprocess(self, src_text):
        data = self.featureExtract(src_text)
        data = self.combine_phrase(data)  #phrase from fixed joints.txt file
        data = self.combine_number(data)
        data = self.tag_url_and_split_number(data)
        asserting_equal_length(data)
        return data
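A minimal sketch of how AMRInputPreprocessor might be driven, assuming core_nlp_url points at a running CoreNLP server and that data/joints.txt exists as the class expects; the input sentence is invented.

# Hedged sketch: core_nlp_url and data/joints.txt are assumed to be available.
prep = AMRInputPreprocessor()
data = prep.preprocess("The company made up thousands of numbers in 2008-2009.")
print(data["tok"])   # tokens, with joined phrases and combined number words
print(data["lem"])   # lemmas
print(data["pos"])   # POS tags ("COMP" marks joined phrases)
print(data["ner"])   # NER tags ("URL" for web addresses, "0" for joined phrases)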
Example #57
0
#!/usr/bin/python

import cgi, cgitb 
import json
cgitb.enable()  # for troubleshooting

#the cgi library gets vars from html
data = cgi.FieldStorage()

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

text = data['text'].value
annotators = data['annotators'].value

output = nlp.annotate(text, properties={'annotators': annotators, 'outputFormat': 'json'})

#this is the actual output
print "Content-Type: text/html\n"
print json.dumps(output)
Example #58
0
def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.
    '''
    use_pos = False
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    elif tokenizer == 'pos':
        use_pos = True
    else:
        raise ValueError("tokenizer should be either 'spacy', 'stanford' or 'pos'.")
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type),
          end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'utf-8')
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                           base_filename + '.ann')

        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'UTF-8').close()

        if use_pos:
            annotation_filepath2 = os.path.join(os.path.dirname(text_filepath),
                                                base_filename + '.ann2')
            # create annotation file if it does not exist
            if not os.path.exists(annotation_filepath2):
                codecs.open(annotation_filepath2, 'w', 'UTF-8').close()

        text, entities = get_entities_from_brat(text_filepath,
                                                annotation_filepath)
        entities = sorted(entities, key=lambda entity: entity["start"])

        if use_pos:
            pos_tags = get_pos_tags_from_brat(text, annotation_filepath2)
            #sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
            sentences = get_sentences_and_tokens_from_pos_tagger(pos_tags)
            #sentences = get_sentences_and_tokens_from_PlanTL(text,med_tagger)
        else:
            if tokenizer == 'spacy':
                sentences = get_sentences_and_tokens_from_spacy(
                    text, spacy_nlp)
            elif tokenizer == 'stanford':
                sentences = get_sentences_and_tokens_from_stanford(
                    text, core_nlp)

        if use_pos:
            token_counter = 0
            rep_pos_max = 0
            rep_pos_counter = 0
            rep_pos = False
        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                '''
                if use_pos and token['text'] in ['\n', '\t', ' ', '']:
                    print('EMPTY TOKEN')
                    exit()
                '''
                #token_counter += 1
                #continue
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:

                        token['label'] = entity['type'].replace(
                            '-', '_'
                        )  # Because the ANN doesn't support tag with '-' in it

                        break
                    elif token['end'] < entity['start']:
                        break

                if len(entities) == 0:
                    entity = {'end': 0}
                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']
                if use_pos:
                    pos_tag = pos_tags[token_counter]['type']
                    if not rep_pos and len(
                            pos_tags[token_counter]['text'].split()) > 1:
                        rep_pos = True
                        rep_pos_max = len(
                            pos_tags[token_counter]['text'].split())
                        rep_pos_counter = 0
                    elif rep_pos:
                        rep_pos_counter += 1
                        if rep_pos_counter >= rep_pos_max:
                            rep_pos = False
                            rep_pos_counter = 0
                    else:
                        token_counter += 1
                    if len('{0} {1} {2} {3} {4} {5}\n'.format(
                            token['text'], base_filename, token['start'],
                            token['end'], pos_tag, gold_label).split()) != 6:
                        continue
                        input('{0} {1} {2} {3} {4} {5}\n'.format(
                            token['text'], base_filename, token['start'],
                            token['end'], pos_tag, gold_label))

                    if verbose:
                        print('{0} {1} {2} {3} {4} {5}\n'.format(
                            token['text'].split()[0], base_filename,
                            token['start'], token['end'], pos_tag, gold_label))
                    output_file.write('{0} {1} {2} {3} {4} {5}\n'.format(
                        token['text'].split()[0], base_filename,
                        token['start'], token['end'] -
                        (len(token['text']) - len(token['text'].split()[0])),
                        pos_tag, gold_label))
                else:
                    if verbose:
                        print('{0} {1} {2} {3} {4}\n'.format(
                            token['text'], base_filename, token['start'],
                            token['end'], gold_label))
                    output_file.write('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], gold_label))
            if verbose: print('\n')
            output_file.write('\n')

    output_file.close()
    print('Done.')
    if not use_pos:
        if tokenizer == 'spacy':
            del spacy_nlp
        elif tokenizer == 'stanford':
            del core_nlp
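A hedged invocation sketch for brat_to_conll, assuming a BRAT-style folder of paired .txt/.ann files and, for the 'stanford' tokenizer, a CoreNLP server on localhost:9000; the paths are placeholders.

# Hedged sketch: the input folder and output path are placeholders.
brat_to_conll(input_folder="data/brat/train",
              output_filepath="data/conll/train.txt",
              tokenizer="stanford",        # or "spacy" / "pos"
              language="en")               # spaCy model name when tokenizer == "spacy"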
Example #59
0
 def __init__(self):
     self.load_data()
     self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path)
'''

#from corenlp import *
import sys
import csv
from SBAR_parser import *
import codecs
from HelpingFunctions_2 import *
from lists import *
import ast
import json
#import simplejson as json

#corenlp = StanfordCoreNLP()
from pycorenlp import StanfordCoreNLP
corenlp = StanfordCoreNLP('http://localhost:9000')

reload(sys)
sys.setdefaultencoding('utf-8')

###
### Core function
###
	  
def FeatureExtractor (data_nlp, id): 
	features = {}
	# Feature 1: if a MODAL_VERBS is in the sent
	# Feature 2: if a MODAL_VERBS is tagged as a MD (modal auxiliary) or VB in the sent	
	# Feature 3: if a MODAL_VERBS is in aux relationship with "be"/"feel"
	# Feature 4: if a MODAL_VERBS is followed by "have" + VBN