Example #1
def get_corenlp_client(corenlp_path="", corenlp_port=0, annotators=None):
    """

    :param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
    :type corenlp_path: str (default = "")
    :param corenlp_port: corenlp port, e.g., 9000
    :type corenlp_port: int (default = 0)
    :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
    :type annotators: Union[List, None] (default = None)
    :return: the corenlp client and whether the client is external
    :rtype: Tuple[stanfordnlp.server.CoreNLPClient, bool]
    """

    if corenlp_port == 0:
        return None, True

    if not annotators:
        annotators = list(ANNOTATORS)

    if is_port_occupied(port=corenlp_port):
        try:
            os.environ["CORENLP_HOME"] = corenlp_path
            corenlp_client = CoreNLPClient(
                annotators=annotators,
                timeout=99999,
                memory='4G',
                endpoint="http://localhost:%d" % corenlp_port,
                start_server=False,
                be_quiet=False
            )
            # corenlp_client.annotate("hello world", annotators=list(annotators), output_format="json")
            return corenlp_client, True
        except BaseException as err:
            raise err
    elif corenlp_path != "":
        print("Starting corenlp client at port {}".format(corenlp_port))
        corenlp_client = CoreNLPClient(
            annotators=annotators,
            timeout=99999,
            memory='4G',
            endpoint="http://localhost:%d" % corenlp_port,
            start_server=True,
            be_quiet=False
        )
        corenlp_client.annotate("hello world", annotators=list(annotators), output_format="json")
        return corenlp_client, False
    else:
        return None, True
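
A minimal usage sketch for the helper above. The CoreNLP path and port are placeholders, ANNOTATORS is the module-level default annotator list referenced in the function, and the client is stopped only if this process started the server itself:

# Hypothetical path/port; adjust to the local CoreNLP installation.
corenlp_client, is_external = get_corenlp_client(
    corenlp_path="/path/to/stanford-corenlp-3.9.2", corenlp_port=9000)
if corenlp_client is not None:
    ann = corenlp_client.annotate("Hello world.",
                                  annotators=list(ANNOTATORS),
                                  output_format="json")
    if not is_external:
        corenlp_client.stop()  # only stop servers this process started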
Example #2
File: nlp.py  Project: Yifan-G/EvalCraft
class NLPclient:
  def __init__(self, core_nlp_version = '2018-10-05'):
    from stanza.server import CoreNLPClient
    # keep the annotator list around so step() can reuse it
    self.annotators = ['tokenize', 'ssplit', 'pos',
                       'lemma', 'ner', 'parse', 'coref']
    self.client = CoreNLPClient(annotators=self.annotators)

  def __enter__(self): return self
  def __exit__(self, exc_type, exc_val, exc_tb): pass
  def __del__(self): self.client.stop()

  def step(self, text):
      core_nlp_output = self.client.annotate(text=text,
                      annotators=self.annotators, output_format='json')
      for sentence in core_nlp_output['sentences']:
        lexs=tuple(lexs_of(sentence))
        deps=deps_of(sentence)
        ies=tuple(ies_of(sentence))
        yield lexs,deps,ies

  def extract(self, text):
    tail=clean_text(text)
    while tail:
      chunk=2**13
      head=tail[0:chunk]
      tail=tail[chunk:]
      #print('EXTRACTING FROM',len(head), 'chars.')
      yield from self.step(head)
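
A sketch of how this class might be used. It assumes the helper functions clean_text, lexs_of, deps_of and ies_of defined elsewhere in nlp.py are importable, and that a local CoreNLP installation is available to the stanza client:

# Hypothetical usage of NLPclient; extract() yields one
# (lexicals, dependencies, information-extraction triples) tuple per sentence.
with NLPclient() as nlp:
    text = "Barack Obama was born in Hawaii. He was elected president in 2008."
    for lexs, deps, ies in nlp.extract(text):
        print(lexs, deps, ies)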
Example #3
def process_q_batch(batch: List[str],
                    tagger_client: CoreNLPClient) -> List[List[str]]:
    n_questions = len(batch)
    assert n_questions > 0
    text = " ".join(batch)
    assert len(text) <= tagger_client.DEFAULT_MAX_CHAR_LENGTH
    ann = tagger_client.annotate(text)
    assert len(ann.sentence) == n_questions
    return [process_tagged(s) for s in ann.sentence]
Example #4
def _annotate_parse(client: CoreNLPClient, doc: str):
    """
    並列処理用のヘルパー関数

    @param client: corenlp clientインスタンス
    @param doc: 処理するdocument.できるだけ大きくするとよい
    @param index: プロセス番号(0,1,2,...,n_process)
    @return: 処理したドキュメント
    """

    obj_doc = client.annotate(doc)
    iter_sentences = parse_serialized_document(obj_doc)
    return iter_sentences
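
A hedged sketch of calling the helper with a client managed as a context manager; parse_serialized_document is an assumption inherited from the originating project and is not shown here:

from stanza.server import CoreNLPClient

# Sketch only: the annotator list and timeout are placeholders.
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'parse'],
                   timeout=60000, memory='4G') as client:
    for sentence in _annotate_parse(client, "One document. With several sentences."):
        print(sentence)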
Example #5
class CoreNLPProcessor(AbstractNLPProcessor):
    def grammar(self):
        ADP = '<RB|RBR|RP|TO|IN|PREP>'
        NP = '<JJ|ADJ>*<NN|VBG|RBS|FW|NNS|PRP|PRP$>+<POS>?<CD>?'
        return """
        NP: {{({NP})+({ADP}?<DT>?{NP})*}}
        VP: {{<VB*>+{ADP}?}}
        PNP: {{<NNP|NNPS>+}}        
        """.format(NP=NP, ADP=ADP)

    def __init__(self):
        super().__init__()
        os.environ["CORENLP_HOME"] = os.path.join(
            os.getcwd(), 'stanford-corenlp-full-2018-10-05')
        self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                    timeout=30000,
                                    memory='4G')

    def __del__(self):
        self.tagger.stop()

    def _extract_ner(self, token):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        return [(n.entityMentionText, n.entityType) for n in sentence.mentions]

    def extract_named_entities(self, token):
        entities = self._extract_ner(token)
        entities = list(set(map(lambda x: x[0], entities)))
        return entities

    def get_named_entity_types(self, token):
        return [entity[1] for entity in self._extract_ner(token)]

    def extract_phrase_by_type(self, token, type):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        tagged = [(token.word, token.pos) for token in sentence.token]
        return self._extract_phrase(tagged, type)
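
A short usage sketch, assuming the stanford-corenlp-full-2018-10-05 directory sits in the working directory as the constructor expects and that AbstractNLPProcessor imposes no extra constructor requirements:

processor = CoreNLPProcessor()
# Entity surface forms, deduplicated, e.g. ['Barack Obama', 'Paris']
print(processor.extract_named_entities("Barack Obama visited Paris."))
# Entity types in mention order, e.g. ['PERSON', 'CITY']
print(processor.get_named_entity_types("Barack Obama visited Paris."))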
Example #6
class CoreNLPBinaryParser:
    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port), annotators=['parse'],
                                     output_format='json', properties={'ssplit.eolonly': 'true'}, timeout=300000,
                                     memory='8G', threads=threads, server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(unicode_repr(child))
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        # collapse
        t = Tree.fromstring(tree)
        # chomsky normal form transformation
        Tree.collapse_unary(t, collapsePOS=True, collapseRoot=True)
        Tree.chomsky_normal_form(t)
        s = cls._format(t)
        return s

    def parse(self, text):
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        if self.run:
            self.corenlp.stop()
            self.run = False
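
A usage sketch; the parser starts its own CoreNLP server on the given port, so it should be stopped when no longer needed:

parser = CoreNLPBinaryParser()  # defaults to port 9003
try:
    # prints a binarized, bracketed parse of the sentence
    print(parser.parse("The cat sat on the mat."))
finally:
    parser.stop()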
Example #7
class StanfordParser(DependencyParserWrapper):
    def __init__(self):
        super().__init__()
        self.client = CoreNLPClient()

    def __call__(self, sent):
        return self.get_spans(sent)    

    def get_spans(self, sent):
        ann = self.client.annotate(sent, annotators='parse', output_format='json')
        dependencies_list = ann['sentences'][0]['basicDependencies']
        heads = get_heads(dependencies_list)
        tree = DependencyParserWrapper.head_to_tree(heads)
        non_singletons = DependencyParserWrapper.compute_spans(tree)
        singletons = [(n, n + 1) for n in range(len(dependencies_list))]
        return set(non_singletons) | set(singletons)
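
A sketch of calling the wrapper; DependencyParserWrapper, get_heads and the span helpers are assumptions provided by the surrounding project, and CoreNLPClient() with no arguments starts a server with its default settings:

parser = StanfordParser()
spans = parser("The quick brown fox jumps over the lazy dog")
print(sorted(spans))  # (start, end) token spans, singletons included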
Example #8
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
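
The snippet above relies on a module-level client that starts as None and is created lazily on the first call (the legacy Stanford Python client, whose import is not shown in the snippet, is assumed):

client = None  # module level; annotate() builds the CoreNLPClient on first use

tokens = annotate("What is the capital of France?")
print(tokens['words'])  # lower-cased tokens
print(tokens['gloss'])  # original surface forms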
Example #9
class StanzaClient():
    def __init__(self):
        self.client = CoreNLPClient(annotators=[
            'tokenize',
            'ssplit',
            'pos',
            'lemma',
            'parse',
        ],
                                    timeout=30000,
                                    properties="zh",
                                    output_format="json",
                                    memory='5g')

    def get_parse_tree(self, sent):
        ann = self.client.annotate(sent)
        return ann["sentences"][0]["parse"]
Example #10
def coreference_resolution(text):
    import os
    import stanza
    from stanza.server import CoreNLPClient
    import json

    os.environ["CORENLP_HOME"] = "/home/soheil/Downloads/corenlp"

    # set up the client
    # with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos','lemma','ner', 'parse','dcoref'], timeout=5000, memory='2G', output_format='json') as client:
    # with CoreNLPClient(annotators=['pos','lemma','ner', 'parse','coref'], timeout=5000, memory='2G') as client:

    # properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}

    client = CoreNLPClient(
        annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref'
        ],
        memory='2G',
        timeout=5000,
        output_format='json'
    )  # 'dcoref' runs the deterministic multi-pass sieve coreference system
    # for sieve
    # annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref

    # annotators needed for coreference resolution
    # annotators = pos, lemma, ner, parse

    # print(client)
    # Start the background server and wait for some time
    client.start()

    # Print background processes and look for java
    # ps -o pid,cmd | grep java

    text = "Albert Einstein was a German-born theoretical physicist. He developed the theory of relativity."
    ann = client.annotate(text)

    # Shut down the background CoreNLP server
    client.stop()

    # print(ann['corefs'])
    # with output_format='json', ann['corefs'] maps a chain id to its list of mentions
    for chain in ann['corefs'].values():
        for mention in chain:
            print(mention['text'])
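
Calling the function with a short text; note that CORENLP_HOME inside the function points to a machine-specific path and would need to be adjusted:

coreference_resolution(
    "Albert Einstein was a German-born theoretical physicist. "
    "He developed the theory of relativity.")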
Example #11
class QuestionModel:
    def __init__(self):
        self.client = CoreNLPClient(annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
            'coref'
        ],
                                    timeout=30000,
                                    memory='16G',
                                    threads=1)

    # Returns whether or not the provided sentence contains any questions
    def isQuestion(self, text):
        ann = self.client.annotate(text)
        sentence = ann.sentence[0]
        constituency_parse = sentence.parseTree
        for child in constituency_parse.child:
            if child.value == "SQ" or child.value == "SBARQ":
                return True
        return False
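
A small sketch exercising the classifier on a question and a statement (expected outputs are indicative, since they depend on the constituency parser):

model = QuestionModel()
print(model.isQuestion("Did you finish the report?"))  # expected True (SQ/SBARQ child)
print(model.isQuestion("I finished the report."))      # expected False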
Example #12
def annotate(sentence, lower=True):
    global client
    if client is None:
        # import pdb; pdb.set_trace()
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','),
                               be_quiet=True)
    words, gloss, after = [], [], []
    sent_annotated = client.annotate(sentence).sentence[0]
    for t in sent_annotated.token:
        words.append(t.word)
        gloss.append(t.originalText)
        after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #13
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(endpoint="http://localhost:9001",
                               annotators=['ssplit', 'tokenize'],
                               start_server=False)
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #14
class Preprocessor:
    log = None

    def __init__(self, host=None):
        """
        This preprocessor connects to a CoreNLP server to perform sentence splitting, tokenization, syntactic parsing,
        named entity recognition and coreference resolution on passed documents.

        :param host: the core-nlp host
        """

        self.log = logging.getLogger('GiveMe5W')

        # connect to CoreNLP server
        host = "http://localhost:9000" if host is None else host
        self.cnlp = CoreNLPClient(endpoint=host,
                                  start_server=StartServer.DONT_START)

        # define basic base_config and desired processing pipeline
        self.base_config = {
            'timeout': 500000,
            'annotators':
            'tokenize,ssplit,pos,lemma,parse,ner,depparse,mention,coref',
            'tokenize.language': 'English',
            # 'coref.algorithm' :'neural', see https://github.com/smilli/py-corenlp/issues/18
            # CoreNLPs charniak-wrapper has some problems ...
            # 'parse.type': 'charniak',
            # 'parse.executable': '/home/ubuntu/bllip-parser/',
            # 'parse.verbose': 'true',
            # 'parse.model': './parse-50best.sh',#'~/.local/share/bllipparser/WSJ+Gigaword-v2',
            'outputFormat': 'json'
        }

        self._token_index = None

    def _link_leaf_to_core_nlp(self, s):
        """
        this is where the magic happens add there additional information per candidate-part/token/leave
        char index information is in each nlpToken
        """
        if len(self._tokens) - 1 < self._token_index:
            # There seems to be a bug around numbers: digits split within the same token
            # are reported as if they had been split into different tokens.
            # This leads to a wrong index; everything in this sentence is lost
            # until the end of that sentence.
            self.log.error(
                'fix the doc around (reformat numbers, remove special characters): '
                + s)
            # print the last two tokens to make the problem easy to spot
            self.log.error(self._tokens[-1])
            self.log.error(self._tokens[-2])

            # Furthermore, we can't return None because this would break the extractors,
            # so we use this bugfix object instead.
            # TODO: consider rejecting such documents entirely, because the result isn't reliable
            # TODO: at least flag the document with some error flags
            result = {
                'nlpToken': {
                    'index': 7,
                    'word': 'BUGFIX',
                    'originalText': 'BUGFIX',
                    'lemma': 'BUGFIX',
                    'characterOffsetBegin': 0,
                    'characterOffsetEnd': 0,
                    'pos': 'BUGFIX',
                    'ner': 'BUGFIX',
                    'speaker': 'BUGFIX',
                    'before': ' ',
                    'after': ''
                }
            }

            if self._document:
                self._document.set_error_flag('core_nlp')

        else:
            result = {'nlpToken': self._tokens[self._token_index]}

        self._token_index = self._token_index + 1

        return result

    def _build_actual_config(self, document):
        """
        Creates the actual config, consisting of base_config and dynamic_config. If the same key exists in both,
        the value from dynamic_config is used, i.e., base_config is overwritten.
        :param document:
        :return:
        """
        dynamic_config = {'date': document.get_date()}
        actual_config = {**self.base_config, **dynamic_config}
        return actual_config

    def preprocess(self, document):
        """
        Send the document to CoreNLP server to execute the necessary preprocessing.

        :param document: Document object to process.
        :type document: Document

        :return Document: The processed Document object.
        """
        actual_config = self._build_actual_config(document)
        annotation = self.cnlp.annotate(text=document.get_full_text(),
                                        properties=actual_config)

        if type(annotation) is str:
            print(annotation)
        else:
            document.set_sentences(annotation['sentences'], [], [])
            self._document = document

            tree = []
            for sentence in annotation['sentences']:
                # this is a hack to attach the token results to every tree leaf
                self._token_index = 0
                self._tokens = sentence['tokens']
                sentence_tree = nltk.ParentedTree.fromstring(
                    sentence['parse'], read_leaf=self._link_leaf_to_core_nlp)

                # add a reference to the original data from parsing for this sentence
                sentence_tree.stanfordCoreNLPResult = sentence

                tree.append(sentence_tree)

            document.set_trees(tree)
            document.set_corefs(annotation['corefs'])

            tokens = []
            pos = []
            ner = []

            for sentence in annotation['sentences']:
                s_tokens = []
                s_pos = []
                s_ner = []
                for token in sentence['tokens']:
                    s_tokens.append(token)
                    s_pos.append((token['originalText'], token['pos']))
                    s_ner.append((token['originalText'], token['ner']))

                tokens.append(s_tokens)
                pos.append(s_pos)
                ner.append(s_ner)

            document.set_tokens(tokens)
            document.set_pos(pos)
            document.set_ner(ner)
            document.set_enhancement('coreNLP', annotation)
            document.is_preprocessed(True)
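
A hedged sketch of wiring the preprocessor to a document; `document` stands in for a GiveMe5W Document built elsewhere, and a CoreNLP server is assumed to be running on port 9000:

preprocessor = Preprocessor('http://localhost:9000')
preprocessor.preprocess(document)  # document: a GiveMe5W Document instance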
Example #15
class TextParser(object):
    """TextParser uses stanford core nlp tool to parse text.

    Args:
        port(int): the port to launch a stanford core nlp client
    """
    def __init__(self, port=9000):
        resources.get_corenlp()
        while is_port_in_use(port):
            port += 1
        self._core_nlp_client = CoreNLPClient(annotators=['parse'],
                                              timeout=600000,
                                              memory='16G',
                                              be_quiet=True,
                                              endpoint="http://localhost:%d" %
                                              port)

    def _get_parse_tree(self, sentence):
        """Generate a parsing tree from a sentence."""
        annotation = self._core_nlp_client.annotate(sentence)
        if len(annotation.sentence) != 1:
            logger.warning(
                "_get_parse_tree should take one sentence. but %s is given." %
                sentence)
        return annotation.sentence[0].parseTree

    def split_paragraph_to_sentences(self, paragraph):
        """Split a paragraph into a list of sentences.

        Args:
            paragraph (str): a paragraph of English text.
        Returns:
            ([str]): a list of sentences.
        """
        paragraph = preprocess(paragraph)
        annotation = self._core_nlp_client.annotate(paragraph)
        return [
            " ".join([token.word for token in sentence.token])
            for sentence in annotation.sentence
        ]

    def get_phrases(self, sentence, phrase_level=2):
        """Split a sentence into phrases.

        Args:
            sentence (str): a sentence in English.
            phrase_level (int): larger value gives shorter phrases.
        Returns:
            ([str]): a list of phrases.
        """
        root = self._get_parse_tree(sentence)

        phrases = []
        generate_phrases_by_dfs(root, 0, phrase_level, phrases)
        return phrases

    def phrase_level_shuffle(self, paragraph, n):
        """For each sentence, randomly shuffle phrases in that sentence.

        Args:
            paragraph (str): a paragraph of text.
            n (int): number of randomly shuffled paragraph to generate.
        Returns:
            ([str]): a list of ``n`` shuffled paragraphs.
        """
        paragraph = preprocess(paragraph)
        sentences = self.split_paragraph_to_sentences(paragraph)
        bins = []
        for sentence in sentences:
            bins.append(self.get_phrases(sentence))

        ret = []
        for i in range(n):
            sent = ""
            for bin in bins:
                bin_tmp = bin[:]
                random.shuffle(bin_tmp)
                sent += " " + " ".join(bin_tmp)

            ret.append(sent.strip())
        return ret
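
A usage sketch; resources.get_corenlp(), preprocess() and the phrase helpers are assumed to be provided by the surrounding package:

parser = TextParser(port=9000)
sentences = parser.split_paragraph_to_sentences(
    "The weather was fine. We went for a long walk in the park.")
print(sentences)
print(parser.get_phrases(sentences[0], phrase_level=2))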
Example #16
def annotate_document_file(client: CoreNLPClient, input: str, filename: str):
    with open(filename, "wb") as file:
        ann = client.annotate(input)
        file.write(ann.SerializeToString())
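
A minimal usage sketch; the annotator list and the output filename are arbitrary choices:

from stanza.server import CoreNLPClient

with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos'],
                   memory='4G', be_quiet=True) as client:
    annotate_document_file(client, "Stanford is in California.", "example.ann")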