Example #1
    def run_nlp(self, language):
        # Make sure the server is running properly (as explained in https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK); it may need root:
        # english: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment -status_port 9000 -port 9000 -timeout 15000
        # The German model cannot do sentiment analysis, so its predictions bear no relevance; keeping the code as is just makes it easier to add some sentiment analysis of the parsed German text in the future.
        # If the service times out, increasing the timeout helps; this usually happens when a sentence is too long to be handled within the given period.
        self.__check_language(language)
        util.time_log("starting NLP...")
        annotator_dict = {"annotators": "sentiment"}
        classifier = CoreNLPParser("http://localhost:9000")

        ret_list = []

        for k_iter in range(self.k):
            prediction = []
            for review in self.test_data_text(language, k_iter):
                response_dict = classifier.api_call(review,
                                                    properties=annotator_dict,
                                                    timeout=500)
                count = 0
                sentiment = 0.0
                # CoreNLP's per-sentence sentimentValue is on a 0 (very negative)
                # to 4 (very positive) scale; average it over the whole review.
                for sentence in response_dict["sentences"]:
                    count += 1
                    sentiment += float(sentence["sentimentValue"])

                avg_sentiment = sentiment / count
                # Thresholding at >= 2 (neutral or better counts as positive) gave a lot better results.
                prediction.append(1 if avg_sentiment >= 2 else 0)
            ret_list.append(prediction)
        return ret_list
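The same per-review averaging can be sketched outside the k-fold loop; this is a minimal example assuming the English server from the comment above is listening on localhost:9000 (the helper name average_sentiment and the sample text are illustrative, not part of the original code):

from nltk.parse.corenlp import CoreNLPParser


def average_sentiment(text, url="http://localhost:9000"):
    parser = CoreNLPParser(url)
    response = parser.api_call(text,
                               properties={"annotators": "sentiment"},
                               timeout=500)
    # CoreNLP assigns each sentence a sentimentValue from 0 (very negative)
    # to 4 (very positive); average it over all sentences of the review.
    values = [float(s["sentimentValue"]) for s in response["sentences"]]
    return sum(values) / len(values) if values else 0.0


# Same >= 2 threshold as above: 1 = positive, 0 = negative.
label = 1 if average_sentiment("The movie was surprisingly good.") >= 2 else 0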
Example #2
from nltk.parse.corenlp import CoreNLPParser


def annotate(sentence, lower=True):
    # Tokenize and sentence-split via the CoreNLP server on localhost:9000.
    nlp = CoreNLPParser('http://localhost:9000')
    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})

    words, gloss, after = [], [], []
    for t in res['sentences']:
        for token in t['tokens']:
            words.append(token['word'])
            gloss.append(token['originalText'])
            after.append(token['after'])
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
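A quick usage check for annotate (the sample sentence is illustrative; it assumes the same tokenize/ssplit server on localhost:9000 is running):

tokens = annotate("Dr. Smith didn't go to N.Y.C., he stayed home.")
print(tokens['words'])  # lower-cased token strings
print(tokens['gloss'])  # original surface forms
print(tokens['after'])  # trailing whitespace per token, handy for detokenization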
Example #3
from nltk.parse.corenlp import CoreNLPParser

# `additional_properties` is defined elsewhere in the original module; since POS
# tags and lemmas are read below, something like this is assumed:
additional_properties = {'annotators': 'tokenize,ssplit,pos,lemma'}


def tag_file(inputfile, lemma=True):
    stanford_parser = CoreNLPParser()
    with open(inputfile) as fin:
        content = []
        for line in fin:
            linepos = []
            line = line.strip()
            json_result = stanford_parser.api_call(
                line, properties=additional_properties)
            for sentence in json_result['sentences']:
                for dpos in sentence['tokens']:
                    # Keep either the lemma or the surface form, plus its POS tag.
                    word = dpos['lemma'] if lemma else dpos['word']
                    pos = dpos['pos']
                    linepos.append((word, pos))
            if linepos:
                content.append(linepos[:])
    print(content)
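A hypothetical call, assuming a CoreNLP server on the default localhost:9000 and a plain-text file with one segment per line (the file name is illustrative):

tag_file('reviews.txt')               # prints e.g. [[('the', 'DT'), ('movie', 'NN'), ...], ...]
tag_file('reviews.txt', lemma=False)  # keep surface forms instead of lemmas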
Example #4
import sys
import time

import requests
from nltk.parse.corenlp import CoreNLPParser

# Both names below are defined elsewhere in the original script; plausible
# values are assumed here so the snippet is self-contained.
end_of_document_symbol = '</doc>'  # assumed marker line that ends a document
additional_properties = {'annotators': 'tokenize,ssplit,pos'}  # assumed


def main():
    tokenizer = CoreNLPParser(url='http://localhost:42636')
    vocab = set()
    for line in open(sys.argv[1]):
        word = line.rstrip()
        vocab.add(word)

    document_buffer = ""
    token_buffer = []

    with open(sys.argv[2]) as fin, open(sys.argv[3], "w") as fout:
        start = time.time()

        for e, line in enumerate(fin):
            if line.strip() == "":
                continue
            elif line.strip().lower() != end_of_document_symbol:
                document_buffer += line.strip() + " <br> "
                # Flush the buffer early if it grows too large for one request.
                if len(document_buffer) > 90000:
                    while True:
                        try:
                            json_result = tokenizer.api_call(
                                document_buffer,
                                properties=additional_properties)
                            break
                        except requests.exceptions.HTTPError:
                            pass  # retry until the server accepts the request
                    for sentence in json_result['sentences']:
                        token_buffer += [(x["originalText"], x["pos"])
                                         for x in sentence['tokens']]
                    document_buffer = ""
            else:
                # End-of-document marker: tokenize whatever remains in the buffer.
                while True:
                    try:
                        json_result = tokenizer.api_call(
                            document_buffer, properties=additional_properties)
                        break
                    except requests.exceptions.HTTPError:
                        pass  # retry on transient HTTP errors
                for sentence in json_result['sentences']:
                    token_buffer += [(x["originalText"], x["pos"])
                                     for x in sentence['tokens']]

                document = " ".join([
                    x.lower() + "__" +
                    pos if x != "." and x != "<br>" else "<br>"
                    for x, pos in token_buffer
                    if x.lower() in vocab or x in ["<br>", "."]
                ])
                sentences = [
                    x.strip() for x in document.split("<br>") if x.strip()
                ]
                fout.write("<doc>\n" + "\n".join(sentences) + "\n</doc>\n")

                document_buffer = ""
                token_buffer = []

            # Rough ETA assuming a hard-coded total of 30,749,930 input lines.
            eta = 30749930 / (e + 1) * (time.time() - start) - (time.time() -
                                                                start)
            if (e + 1) % 500 == 0:
                sys.stdout.write("\rsent: %i/%i\tETA: %f" %
                                 (e + 1, 30749930, eta))
                sys.stdout.flush()
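The two retry loops above busy-wait on HTTPError; a small shared helper with a short back-off could replace both (the name api_call_with_retry is illustrative, not part of the original script):

import time

import requests


def api_call_with_retry(tokenizer, text, properties, delay=1.0):
    # Retry until the CoreNLP server accepts the request, sleeping briefly
    # between attempts instead of hammering it in a tight loop.
    while True:
        try:
            return tokenizer.api_call(text, properties=properties)
        except requests.exceptions.HTTPError:
            time.sleep(delay)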
Example #5
# Likely imports for this snippet (the original file's imports are not shown).
import logging

from nltk import internals
from nltk.parse.corenlp import CoreNLPParser
from whoosh.analysis import Composable, Token
from whoosh.compat import text_type


class StanTokenizer(Composable):
    def __init__(self):
        # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options':
            'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # '-xmx4G' raises the maximum allowable Java heap to 4GB instead of the default 512MB.
        internals.config_java(options='-xmx4G')

    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t

        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                # Log and swallow any server/parsing error so indexing can continue.
                logging.critical(str(e))
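One possible way to plug the tokenizer into a Whoosh analyzer chain (the field name and filters are illustrative; it assumes a CoreNLP server is already running on the default port):

from whoosh.analysis import LowercaseFilter, StopFilter
from whoosh.fields import Schema, TEXT

analyzer = StanTokenizer() | LowercaseFilter() | StopFilter()
schema = Schema(body=TEXT(analyzer=analyzer, stored=True))

# Tokens can also be inspected directly; without positions=True, tok.pos
# still holds the POS tag assigned above.
for tok in StanTokenizer()(u"The quick brown foxes jumped over the fence."):
    print(tok.text, tok.lemma, tok.pos)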