Example #1
File: nlp.py Project: Yifan-G/EvalCraft
class NLPclient:
  def __init__(self, core_nlp_version='2018-10-05'):
    from stanza.server import CoreNLPClient
    # Keep the annotator list on the instance so step() can reuse it.
    self.annotators = ['tokenize', 'ssplit', 'pos',
                       'lemma', 'ner', 'parse', 'coref']
    self.client = CoreNLPClient(annotators=self.annotators)

  def __enter__(self): return self
  def __exit__(self, exc_type, exc_val, exc_tb): pass
  def __del__(self): self.client.stop()

  def step(self, text):
      core_nlp_output = self.client.annotate(text=text,
                      annotators=self.annotators, output_format='json')
      # lexs_of, deps_of and ies_of are helpers defined elsewhere in nlp.py.
      for sentence in core_nlp_output['sentences']:
        lexs = tuple(lexs_of(sentence))
        deps = deps_of(sentence)
        ies = tuple(ies_of(sentence))
        yield lexs, deps, ies

  def extract(self, text):
    tail = clean_text(text)  # clean_text is defined elsewhere in nlp.py
    while tail:
      chunk = 2**13  # process the document in 8192-character chunks
      head = tail[0:chunk]
      tail = tail[chunk:]
      yield from self.step(head)
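
A minimal usage sketch (the input sentence is hypothetical; it assumes a local CoreNLP installation visible to stanza.server.CoreNLPClient, plus the nlp.py helpers clean_text, lexs_of, deps_of and ies_of):

# Hypothetical usage of NLPclient via its context-manager protocol.
with NLPclient() as nlp:
    for lexs, deps, ies in nlp.extract("Alice met Bob. She smiled."):
        print(lexs, deps, ies)  # one (lexs, deps, ies) triple per sentence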
Example #2
File: parser.py Project: TCBpenta8/ifcc-1
import random

from nltk.tree import Tree
from six import string_types
from stanza.server import CoreNLPClient


class CoreNLPBinaryParser:
    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port), annotators=['parse'],
                                     output_format='json', properties={'ssplit.eolonly': 'true'}, timeout=300000,
                                     memory='8G', threads=threads, server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(repr(child))  # repr replaces Py2-era unicode_repr
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        t = Tree.fromstring(tree)
        # Chomsky normal form transformation: collapse unary chains first,
        # then binarize the n-ary productions.
        t.collapse_unary(collapsePOS=True, collapseRoot=True)
        t.chomsky_normal_form()
        return cls._format(t)

    def parse(self, text):
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        if self.run:
            self.corenlp.stop()
            self.run = False
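
A hypothetical usage sketch (assumes CORENLP_HOME is exported and the default port 9003 is free):

# Hypothetical usage: parse one sentence and print its binarized bracketing.
parser = CoreNLPBinaryParser()
try:
    print(parser.parse("The cat sat on the mat."))
finally:
    parser.stop()  # always release the background Java server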
Example #3
def coreference_resolution(text):
    import os
    from stanza.server import CoreNLPClient

    os.environ["CORENLP_HOME"] = "/home/soheil/Downloads/corenlp"

    # Set up the client. 'dcoref' runs the deterministic multi-pass sieve;
    # passing 'coref' with properties={'coref.algorithm': 'statistical'}
    # would select the statistical system instead. The sieve needs the
    # tokenize, ssplit, pos, lemma, ner and parse annotators before it.
    client = CoreNLPClient(
        annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref'
        ],
        memory='2G',
        timeout=5000,
        output_format='json'
    )

    # Start the background server and wait for some time.
    # (Look for the java process with: ps -o pid,cmd | grep java)
    client.start()

    ann = client.annotate(text)

    # Shut down the background CoreNLP server
    client.stop()

    # ann['corefs'] maps a chain id to the list of mentions in that chain.
    for chain in ann['corefs'].values():
        for mention in chain:
            print(mention['text'])
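
A hypothetical call, using the example sentence from the original snippet; a with-block around CoreNLPClient, as the commented-out variants above suggest, would start and stop the server automatically:

# Hypothetical usage of coreference_resolution.
coreference_resolution(
    "Albert Einstein was a German-born theoretical physicist. "
    "He developed the theory of relativity.")
# Prints the mention texts of each coreference chain,
# e.g. "Albert Einstein" and "He".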
Example #4
import os

from stanza.server import CoreNLPClient


# AbstractNLPProcessor (which supplies _extract_phrase) is defined
# elsewhere in the project.
class CoreNLPProcessor(AbstractNLPProcessor):
    def grammar(self):
        ADP = '<RB|RBR|RP|TO|IN|PREP>'
        NP = '<JJ|ADJ>*<NN|VBG|RBS|FW|NNS|PRP|PRP$>+<POS>?<CD>?'
        return """
        NP: {{({NP})+({ADP}?<DT>?{NP})*}}
        VP: {{<VB*>+{ADP}?}}
        PNP: {{<NNP|NNPS>+}}        
        """.format(NP=NP, ADP=ADP)

    def __init__(self):
        super().__init__()
        os.environ["CORENLP_HOME"] = os.path.join(
            os.getcwd(), 'stanford-corenlp-full-2018-10-05')
        self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                    timeout=30000,
                                    memory='4G')

    def __del__(self):
        self.tagger.stop()

    def _extract_ner(self, token):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        return [(n.entityMentionText, n.entityType) for n in sentence.mentions]

    def extract_named_entities(self, token):
        entities = self._extract_ner(token)
        entities = list(set(map(lambda x: x[0], entities)))
        return entities

    def get_named_entity_types(self, token):
        return [entity[1] for entity in self._extract_ner(token)]

    def extract_phrase_by_type(self, token, type):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        # annotate() returns a protobuf Document, so sentence.token is a
        # list of Token messages; use 'tok' to avoid shadowing the argument.
        tagged = [(tok.word, tok.pos) for tok in sentence.token]
        return self._extract_phrase(tagged, type)
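
A hypothetical usage sketch (assumes the stanford-corenlp-full-2018-10-05 directory sits in the current working directory, as __init__ expects):

# Hypothetical usage: collect the unique named entities in a sentence.
proc = CoreNLPProcessor()
print(proc.extract_named_entities("Barack Obama visited Paris."))
# e.g. ['Barack Obama', 'Paris'] (order not guaranteed: set() deduplicates)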
Example #5
import json
import os
from os import environ

import stanza
import yaml
from stanza.server import CoreNLPClient

# settings, Document, Sentence and the custom exception classes are
# defined elsewhere in the project.


class DocumentProcessor(object):
    """This class represents the Document Processor class that processes
    the whole input document."""

    def __init__(self, config_path, lang):
        # safe_load avoids arbitrary object construction; the context
        # manager closes the config file promptly.
        with open(config_path, "r") as f:
            self.config = yaml.safe_load(f)
        self.client = None
        self.lang = lang

    def __enter__(self):
        if environ.get("CORENLP_HOME") is None:
            raise EnvPathException(
                "The CORENLP_HOME path was not found. Please export it pointing to the directory that contains the CoreNLP resources"
            )
        my_path = os.path.abspath(os.path.dirname(__file__))
        settings.init()
        settings.LANGUAGE = self.lang
        stanza.download(self.lang, dir=self.config["stanza"]["dir"])
        self.nlp = stanza.Pipeline(**self.config["stanza"], lang=self.lang)
        language_properties_fp = os.path.join(my_path, "language_resources",
                                              self.lang + "_properties.txt")
        self.client = CoreNLPClient(properties=language_properties_fp,
                                    **self.config["corenlp"])
        self.client.start()
        return self

    def break_json_into_chunks(self, doc_json):
        """Convert an input json to a list of sentences.

        Args:
            doc_json (dict): The input json representing the input document

        Returns:
            list: The list of sentences with raw text
            list: The list of sentences as jsons
        """
        raw_sentences = []
        sentence_jsons = []
        try:
            for sent_json in doc_json:
                sentence_jsons.append(sent_json)
                sent_text = " ".join(
                    [word["word"] for word in sent_json["words"]])
                raw_sentences.append(sent_text)
        except Exception as e:
            raise InavlidJSONFileException(
                "The input JSON file you provided could not be analysed. "
                "Please check the example format provided") from e
        return raw_sentences, sentence_jsons

    def break_text_into_sentences(self, text):
        """Break the input raw text string into sentences using Stanza.

        Args:
            text (str): The raw text of the input document

        Returns:
            list: The list of sentences as raw text
        """
        sentences = []
        stanza_doc = self.nlp(text)
        for sentence in stanza_doc.sentences:
            sentences.append(sentence.text)
        return sentences

    def analyze(self, doc, input_format):
        """Analyze the input as either a json or a string and return a
        Document object.

        Args:
            doc (json / str): The input that needs to be analyzed using Stanza
            input_format (str): Either "string" or "json"

        Returns:
            Document: The Document object
        """
        if input_format.lower() not in ["string", "json"]:
            raise InavlidFormatException(
                "Please provide the format as either 'string' or 'json'")

        settings.INPUT_FORMAT = input_format.lower()
        doc_obj = Document(self.lang, self.nlp, self.client)
        if settings.INPUT_FORMAT == "json":  # the input format here is json
            doc = json.loads(doc)
            raw_sentences, sentence_jsons = self.break_json_into_chunks(doc)
            for raw_sent, sent_json in zip(raw_sentences, sentence_jsons):
                sentence = Sentence(self.lang, self.nlp, self.client, raw_sent,
                                    sent_json)
                sentence.json = sent_json
                doc_obj.sentence_objs.append(sentence)
        else:  # the input format here is string
            raw_sentences = self.break_text_into_sentences(doc)
            for raw_sent in raw_sentences:
                sentence = Sentence(self.lang, self.nlp, self.client, raw_sent)
                doc_obj.sentence_objs.append(sentence)
        return doc_obj

    def __exit__(self, exc_type, exc_value, tb):
        """Stop the CoreNLP client."""
        if self.client is not None:
            self.client.stop()
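
A hypothetical usage sketch (the config path, language code, and input text are made up; the with-block drives __enter__/__exit__, so the CoreNLP server is started and stopped automatically):

# Hypothetical usage: analyze a raw string end to end.
with DocumentProcessor("config.yml", "en") as processor:
    doc_obj = processor.analyze("Albert Einstein was a physicist.", "string")
    for sentence in doc_obj.sentence_objs:
        print(sentence)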