Example #1
    def __init__(self, host=None):
        """
        This preprocessor connects to a CoreNLP server to perform sentence splitting, tokenization, syntactic parsing,
        named entity recognition and coreference resolution on passed documents.

        :param host: the CoreNLP server endpoint
        """

        self.log = logging.getLogger('GiveMe5W')

        # connect to CoreNLP server
        host = "http://localhost:9000" if host is None else host
        self.cnlp = CoreNLPClient(endpoint=host,
                                  start_server=StartServer.DONT_START)

        # define basic base_config and desired processing pipeline
        self.base_config = {
            'timeout': 500000,
            'annotators':
            'tokenize,ssplit,pos,lemma,parse,ner,depparse,mention,coref',
            'tokenize.language': 'English',
            # 'coref.algorithm' :'neural', see https://github.com/smilli/py-corenlp/issues/18
            # CoreNLPs charniak-wrapper has some problems ...
            # 'parse.type': 'charniak',
            # 'parse.executable': '/home/ubuntu/bllip-parser/',
            # 'parse.verbose': 'true',
            # 'parse.model': './parse-50best.sh',#'~/.local/share/bllipparser/WSJ+Gigaword-v2',
            'outputFormat': 'json'
        }

        self._token_index = None
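As a hypothetical usage sketch (not part of the original class), the stored base_config could be passed straight to the client when annotating a document's text; the method name `preprocess` is an assumption:

    def preprocess(self, text):
        # Hypothetical sketch: send the configured pipeline to the running
        # CoreNLP server and return the JSON annotation result.
        return self.cnlp.annotate(text, properties=self.base_config,
                                  output_format='json')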
Example #2
 def __init__(self):
     super().__init__()
     os.environ["CORENLP_HOME"] = os.path.join(
         os.getcwd(), 'stanford-corenlp-full-2018-10-05')
     self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                 timeout=30000,
                                 memory='4G')
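A minimal, hypothetical `tag` method (not in the original class) showing how the protobuf document returned by annotate() could be read:

 def tag(self, text):
     # Hypothetical sketch: annotate() returns a protobuf Document by default;
     # each sentence exposes tokens with word, pos and ner attributes.
     ann = self.tagger.annotate(text)
     return [[(tok.word, tok.pos, tok.ner) for tok in sent.token]
             for sent in ann.sentence]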
Example #3
File: nlp.py  Project: Yifan-G/EvalCraft
class NLPclient:
  def __init__(self, core_nlp_version = '2018-10-05'):
    from stanza.server import CoreNLPClient
    self.client = CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos',
                                            'lemma', 'ner', 'parse', 'coref'])

  def __enter__(self): return self
  def __exit__(self, exc_type, exc_val, exc_tb): pass
  def __del__(self): self.client.stop()

  def step(self, text):
      core_nlp_output = self.client.annotate(text=text, output_format='json')
      for sentence in core_nlp_output['sentences']:
        lexs=tuple(lexs_of(sentence))
        deps=deps_of(sentence)
        ies=tuple(ies_of(sentence))
        yield lexs,deps,ies

  def extract(self, text):
    tail=clean_text(text)
    while tail:
      chunk=2**13
      head=tail[0:chunk]
      tail=tail[chunk:]
      #print('EXTRACTING FROM',len(head), 'chars.')
      yield from self.step(head)
Example #4
 def __init__(self, port=9000):
     utils.get_corenlp()
     while is_port_in_use(port):
         port += 1
     self._core_nlp_client = CoreNLPClient(
         annotators=['parse'], timeout=600000, memory='16G', be_quiet=True,
         endpoint="http://localhost:%d" % port)
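The example above relies on an `is_port_in_use` helper that is not shown; a minimal sketch using only the standard library (an assumption, not the project's actual implementation) could be:

import socket

def is_port_in_use(port):
    # Try to connect to localhost:port; a successful connection means
    # something is already listening on that port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0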
Example #5
 def __init__(self, port=9001):
     self.nlp = stanza.Pipeline('en')  # initialize English neural pipeline
     self.client = CoreNLPClient(
         annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
         timeout=60000,
         memory='4G',
         endpoint=f'http://localhost:{port}')
Example #6
 def __init__(self):
     self.client = CoreNLPClient(annotators=[
         'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
         'coref'
     ],
                                 timeout=30000,
                                 memory='16G',
                                 threads=1)
Example #7
 def __init__(self, threads=1, port=None):
     sid = random.randint(0, 65535)
     if port is None:
         port = self.DEFAULT_PORT
     self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port), annotators=['parse'],
                                  output_format='json', properties={'ssplit.eolonly': 'true'}, timeout=300000,
                                  memory='8G', threads=threads, server_id='clinicgen{0}'.format(sid))
     self.corenlp.start()
     self.run = True
Example #8
 def __init__(self):
     self.client = CoreNLPClient(annotators=[
         'tokenize',
         'ssplit',
         'pos',
         'lemma',
         'parse',
     ],
                                 timeout=30000,
                                 properties="zh",
                                 output_format="json",
                                 memory='5g')
Example #9
    def __init__(self,
                 dir_corenlp: str,
                 annotators: str = "tokenize,ssplit,pos,lemma,jmwe",
                 jmwe_detector_type: str = "Consecutive",
                 output_format: str = "serialized",
                 threads: int = 1,
                 kwargs_properties: Optional[Dict[str, str]] = None,
                 kwargs_corenlp_client: Optional[Dict[str, str]] = None):
        """
        Stanford Core NLP Client Wrapper class.

        @param dir_corenlp:
        @param annotators:
        @param jmwe_detector_type:
        @param output_format:
        @param kwargs_properties:
        @param kwargs_corenlp_client:
        """
        _props = copy.deepcopy(self._default_corenlp_properties)
        _props["annotators"] = annotators
        _props["customAnnotatorClass.jmwe.detector"] = jmwe_detector_type
        if isinstance(kwargs_properties, dict):
            _props.update(kwargs_properties)

        # instantiate the CoreNLP client
        _args = {
            "properties": _props,
            "output_format": output_format,
            "classpath":
            ":".join(glob2.glob(os.path.join(dir_corenlp, "*.jar"))),
            "start_server": StartServer.TRY_START
        }
        if isinstance(kwargs_corenlp_client, dict):
            _args.update(kwargs_corenlp_client)

        self._corenlp = {}
        if threads == 1:
            self._corenlp[0] = CoreNLPClient(**_args)
        else:
            self._pool = multiprocessing.Pool(threads)
            for index in range(threads):
                _args_i = copy.deepcopy(_args)
                _args_i["start_server"] = StartServer.TRY_START
                _args_i["endpoint"] = f"http://localhost:{9000+index}"
                _args_i["output_format"] = "serialized"
                _args_i["threads"] = 1
                self._corenlp[index] = CoreNLPClient(**_args_i)

        self._corenlp_properties = _props
        self._corenlp_client_args = _args
        self._threads = threads
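As a hypothetical illustration (not part of the original wrapper), documents could be fanned out across the per-endpoint clients in a simple round-robin loop; the real class presumably dispatches work through its multiprocessing pool instead:

    def annotate_round_robin(self, docs):
        # Hypothetical sketch: send each document to one of the pre-started
        # clients in turn; with threads == 1 everything goes to client 0.
        results = []
        for i, doc in enumerate(docs):
            results.append(self._corenlp[i % self._threads].annotate(doc))
        return results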
Example #10
class CoreNLPBinaryParser:
    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port), annotators=['parse'],
                                     output_format='json', properties={'ssplit.eolonly': 'true'}, timeout=300000,
                                     memory='8G', threads=threads, server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(unicode_repr(child))
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        # collapse
        t = Tree.fromstring(tree)
        # chomsky normal form transformation
        Tree.collapse_unary(t, collapsePOS=True, collapseRoot=True)
        Tree.chomsky_normal_form(t)
        s = cls._format(t)
        return s

    def parse(self, text):
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        if self.run:
            self.corenlp.stop()
            self.run = False
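A hypothetical usage sketch for the class above, assuming a CoreNLP installation is available for the client to start:

# Hypothetical usage: parse one sentence and print its binarized bracketing.
parser = CoreNLPBinaryParser(port=9003)
try:
    print(parser.parse("The cat sat on the mat."))
finally:
    parser.stop()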
Example #11
 def __enter__(self):
     if environ.get("CORENLP_HOME") is None:
         raise EnvPathException(
             "The CORENLP_HOME path was not found. Please export it pointing to the directory that contains the CoreNLP resources"
         )
     my_path = os.path.abspath(os.path.dirname(__file__))
     settings.init()
     settings.LANGUAGE = self.lang
     stanza.download(self.lang, dir=self.config["stanza"]["dir"])
     self.nlp = stanza.Pipeline(**self.config["stanza"], lang=self.lang)
     language_properties_fp = os.path.join(my_path, "language_resources",
                                           self.lang + "_properties.txt")
     self.client = CoreNLPClient(properties=language_properties_fp,
                                 **self.config["corenlp"])
     return self
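The matching `__exit__` is not shown here; a minimal sketch of what it presumably does (stopping the client when the context exits) might be:

 def __exit__(self, exc_type, exc_val, exc_tb):
     # Hypothetical counterpart to __enter__: shut the CoreNLP client down.
     self.client.stop()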
Example #12
def run_conversion(qas, corenlp_home):
    os.environ['CORENLP_HOME'] = corenlp_home
    ret = list()
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse'],
            timeout=30000,
            memory='16G',
            properties={
                'ssplit.eolonly': True,
                'ssplit.newlineIsSentenceBreak': 'always',
                'outputFormat': 'json'
            },
            endpoint='http://localhost:9001') as client:
        for question, answer in tqdm(qas):
            parse = client.annotate(question)['sentences'][0]
            tokens = parse['tokens']
            const_parse = read_const_parse(parse['parse'])
            for rule in CONVERSION_RULES:
                sent = rule.convert(question, answer, tokens, const_parse)
                if sent:
                    ret.append([question, answer, sent])
                    break
            else:
                ret.append([question, answer, None])
    return ret
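A hypothetical invocation of the function above; `qas` is a list of (question, answer) pairs and the directory name mirrors the CoreNLP distribution used elsewhere in these examples:

# Hypothetical call (assumes a running or startable CoreNLP distribution).
qas = [("What color is the sky?", "blue")]
converted = run_conversion(qas, corenlp_home="stanford-corenlp-full-2018-10-05")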
Example #13
 def process_one_headline(self, headline):
     with self.lock:
         with CoreNLPClient(annotators=self.annotators,
                            timeout=self.timeout,
                            memory=self.memory,
                            classpath=self.core_nlp_folder) as client:
             return self.collect_data(client.annotate(headline))
Example #14
def main(data_dir, to_annotate, affinity_cap, output_name, graph,
         graph_out_loc):

    properties = {'openie.affinity_probability_cap': affinity_cap}

    dygiepp_jsonl = []
    with CoreNLPClient(annotators=["openie"], output_format="json") as client:
        for doc in to_annotate:

            # Get the doc_key
            doc_key = splitext(basename(doc))[0]

            # Read in the text
            with open(doc) as f:
                text = " ".join(f.read().split('\n'))

                # Perform OpenIE
                ann = client.annotate(text)

            # Convert output to dygiepp format
            dygiepp_jsonl.append(openie_to_dygiepp(ann, doc_key))

            # Graph annotations if requested
            if graph:
                graph_annotations(text, properties, doc_key, graph_out_loc)

    # Write out dygiepp-formatted output
    with jsonlines.open(output_name, 'w') as writer:
        writer.write_all(dygiepp_jsonl)
Example #15
def POSTag(text, sent_split=True, tolist=True):
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if sent_split:
            annotators = ['tokenize', 'ssplit', 'pos']
        else:
            annotators = ['tokenize', 'pos']
        ##########
        if (lang == "zh-cn") or (lang == "en"):
            if (lang == "zh-cn"):
                with CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
            elif (lang == "en"):
                with CoreNLPClient(annotators=annotators,
                                   timeout=15000) as client:
                    ann = client.annotate(text)
            #########
            if sent_split:
                words = [[(token.word, token.pos) for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [
                    ' '.join(['#'.join(posted) for posted in wordlist])
                    for wordlist in words
                ]
                segmented = '\n'.join(segmented_list)
            else:
                words = [(token.word, token.pos)
                         for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  #list
    else:
        return segmented  #string
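A hypothetical usage sketch for POSTag; the returned tags are illustrative and depend on the loaded models:

# Hypothetical calls; tag values shown in the comments are illustrative only.
english_tags = POSTag("The cat sat on the mat.")   # [[('The', 'DT'), ('cat', 'NN'), ...]]
chinese_line = POSTag("我爱北京。", tolist=False)    # word#POS pairs joined by spaces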
Example #16
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #17
def process_q_batch(batch: List[str],
                    tagger_client: CoreNLPClient) -> List[List[str]]:
    n_questions = len(batch)
    assert n_questions > 0
    text = " ".join(batch)
    assert len(text) <= tagger_client.DEFAULT_MAX_CHAR_LENGTH
    ann = tagger_client.annotate(text)
    assert len(ann.sentence) == n_questions
    return [process_tagged(s) for s in ann.sentence]
Example #18
def Parse(text, lang='zh-cn', annotators=None):
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    if annotators is None:
        annotators = [
            'tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse',
            'regexner', 'coref'
        ]
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
    if lang == 'zh-cn':
        with CoreNLPClient(annotators=annotators,
                           properties=StanfordCoreNLP_chinese_properties,
                           timeout=15000) as client:
            ann = client.annotate(text)
    elif lang == 'en':
        with CoreNLPClient(annotators=annotators, timeout=15000) as client:
            ann = client.annotate(text)
    return ann
Example #19
def start_server() -> CoreNLPClient:
    """Starts a CoreNLP server through Stanza and returns it."""
    stanza.install_corenlp(dir="./stanza_corenlp")
    return CoreNLPClient(annotators=[
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
        'coref', 'kbp', 'natlog', 'openie'
    ],
                         timeout=30000,
                         memory='16G')
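A hypothetical usage sketch of the returned client, which works as a context manager and yields a protobuf Document by default:

# Hypothetical usage of start_server().
with start_server() as client:
    ann = client.annotate("Barack Obama was born in Hawaii.")
    print(len(ann.sentence))  # number of sentences in the document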
Example #20
def annotate_do(txt):
    if not txt: return {}
    from stanza.server import CoreNLPClient, StartServer
    with CoreNLPClient(start_server=StartServer.DONT_START,
                       output_format='json') as client:
        try:
            return client.annotate(txt)
        except:
            return
Example #21
    def __init__(self, url, compound_map_file):

        self.nlp_properties = {
            'annotators': "tokenize,ssplit,pos,lemma,ner",
            "tokenize.options": "splitHyphenated=true,normalizeParentheses=false",
            "tokenize.whitespace": False,
            'ssplit.isOneSentence': True,
            'outputFormat': 'json'
        }

        os.environ["CORENLP_HOME"] = "/content/stanford-corenlp-full-2018-10-05"
        client = CoreNLPClient(annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner'], properties=self.nlp_properties, memory='10G', endpoint=url)
        client.start()
        print(client)
        time.sleep(10)
        self.nlp = client
        
        self.compound_map = self.load_compound_map(compound_map_file)
Example #22
def main():
    assert config['config_target'] == 'conll16_discourse'

    # start CoreNLP server manually
    # java -Xmx16G -cp "/homes/lee2226/scratch2/stanford-corenlp-full-2020-04-20/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 60000 -threads 5 -maxCharLength 100000 -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp -outputFormat json

    # use the tokenization from the given parse file and re-parse it with our CoreNLP
    split_dir = config['{}_dir'.format(args.split)]
    parse_fpath = os.path.join(split_dir, 'parses.json')
    logger.info('loading {}...'.format(parse_fpath))
    old_parses = json.load(open(parse_fpath, 'r'))

    fw = open(args.output_file, 'w')
    properties = {
        'tokenize.whitespace': True,
        'tokenize.keepeol': True,
        'ssplit.eolonly': True,
        'ner.useSUTime': False
    }
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
        'coref'
    ]
    with CoreNLPClient(annotators=annotators,
                       properties=properties,
                       timeout=300000,
                       endpoint='http://localhost:{}'.format(
                           args.nlp_server_port),
                       start_server=False) as client:
        for doc_id, parse in tqdm(old_parses.items()):
            sents = []
            for i_sent, sent in enumerate(parse['sentences']):
                words = [w[0] for w in sent['words']]
                sent_text = ' '.join(words)
                sents.append(sent_text)
            all_text = '\n'.join(sents)

            try:
                ann = client.annotate(all_text,
                                      annotators=annotators,
                                      properties=properties,
                                      output_format='json')
                ann['doc_id'] = doc_id

                # verify lengths
                assert len(ann['sentences']) == len(
                    parse['sentences']), 'ssplit mismatch'
                for i_sent in range(len(parse['sentences'])):
                    n_words = len(parse['sentences'][i_sent]['words'])
                    assert len(ann['sentences'][i_sent]['tokens']) == n_words

                out = json.dumps(ann)
                fw.write(out + '\n')
            except:
                logger.warning('failed parsing {}'.format(doc_id))
    fw.close()
Example #23
def run_parsing(gen, prefix):
    logger.info('start parsing {}'.format(prefix))
    fpath = os.path.join(args.output_dir, '{}_parses.json'.format(prefix))
    fw = open(fpath, 'w')
    # java -Xmx16G -cp "/homes/lee2226/scratch2/stanford-corenlp-full-2020-04-20/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 300000 -threads 5 -maxCharLength 100000 -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -outputFormat json

    cnt = 0
    failed_sids = []
    properties = {
        # 'tokenize.whitespace': True,
        'tokenize.keepeol': True,
        'ssplit.eolonly': True,
        # 'coref.algorithm': 'statistical',
        'ner.useSUTime': False
    }
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
        'coref'
    ]
    with CoreNLPClient(annotators=annotators,
                       properties=properties,
                       timeout=1200000,
                       endpoint='http://localhost:{}'.format(
                           args.nlp_server_port),
                       start_server=False) as client:

        t1 = time.time()
        for sid, doc in tqdm(gen()):
            logger.info('last processing time: {} s'.format(time.time() - t1))
            t1 = time.time()

            text = [doc['lines'][str(i)]['text'] for i in range(1, 6)]
            text = '\n'.join(text)

            # parsing
            try:
                ann = client.annotate(text,
                                      annotators=annotators,
                                      properties=properties,
                                      output_format='json')
            except:
                logger.warning('failed parsing {}'.format(sid))
                failed_sids.append(sid)
                continue

            if len(ann['sentences']) != 5:
                logger.warning('failed sentence length {}'.format(sid))
                failed_sids.append(sid)
                continue
            ann['sid'] = sid
            line = json.dumps(ann)
            fw.write(line + '\n')
            cnt += 1
    fw.close()
    logger.info('failed sids={}'.format(failed_sids))
    logger.info('done: {} files,  {} s'.format(cnt, time.time() - t1))
Example #24
 def process_multiple_headlines(self, headlines):
     data = []
     with self.lock:
         with CoreNLPClient(annotators=self.annotators,
                            timeout=self.timeout,
                            memory=self.memory,
                            classpath=self.core_nlp_folder) as client:
             for headline in headlines:
                 data.append(self.collect_data(client.annotate(headline)))
         return data
Example #25
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(endpoint="http://localhost:9001",
                               annotators=['ssplit', 'tokenize'],
                               start_server=False)
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #26
def annotate(sentence, lower=True):
    global client
    if client is None:
        # import pdb; pdb.set_trace()
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','),
                               be_quiet=True)
    words, gloss, after = [], [], []
    sent_annotated = client.annotate(sentence).sentence[0]
    for t in sent_annotated.token:
        words.append(t.word)
        gloss.append(t.originalText)
        after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #27
def Segment_Chinese_only(text, sent_split=True, tolist=True):
    # Takes a Chinese string and returns a list of words nested in a list of sentences
    # sent_split=True if we want to split the text into sentences, and then parse each sentence individually.
    # tolist=True if we want to receive a list of words, False if we want a sentence split by spaces
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if (lang == "zh-cn"):  #If text is Chinese, segment it, else leave it
            #########
            if sent_split:
                annotators = ['tokenize', 'ssplit']
                with CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[token.word for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [' '.join(wordlist) for wordlist in words]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize']
                with CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [token.word for token in ann.sentencelessToken]
                segmented = ' '.join(words)
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  #list
    else:
        return segmented  #string
Example #28
def main(coref_path, out_dir, gum_file_lists=None):
    train_list = []
    dev_list = []
    test_list = []

    for filename in os.listdir(gum_file_lists):
        file_path = gum_file_lists + os.sep + filename
        if "train" in filename:
            train_list = find_list(file_path)
        elif "dev" in filename:
            dev_list = find_list(file_path)
        else:
            test_list = find_list(file_path)

    genres = [
        "academic", "bio", "fiction", "interview", "news", "voyage", "whow",
        "reddit", "conversation", "speech", "textbook", "vlog"
    ]

    for genre in genres:
        for filename in os.listdir(coref_path + os.sep + "conll"):
            if genre in filename:
                tsv_file = coref_path + os.sep + "tsv" + os.sep + filename.split(
                    ".")[0] + ".tsv"

                text = build_text(tsv_file)
                with CoreNLPClient(properties={
                        'annotators':
                        'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
                        'ssplit.eolonly': True,
                        'tokenize.whitespace': True,
                },
                                   output_format='xml',
                                   timeout=60000,
                                   memory='8G') as client:
                    xml_out = client.annotate(text)

                if filename.split(".")[0] in train_list:
                    write_file(
                        out_dir + os.sep + 'train' + os.sep +
                        filename.split(".")[0] + '.xml', xml_out)
                elif filename.split(".")[0] in dev_list:
                    write_file(
                        out_dir + os.sep + 'dev' + os.sep +
                        filename.split(".")[0] + '.xml', xml_out)
                elif filename.split(".")[0] in test_list:
                    write_file(
                        out_dir + os.sep + 'test' + os.sep +
                        filename.split(".")[0] + '.xml', xml_out)
                else:
                    sys.stderr.write(f"ERROR: file {filename} not in list.\n")

    print("Done!")
Example #29
    def corenlp_coref_resolution(self, memory, timeout, properties):
        """
        Perform coreference resolution on given text using Stanford CoreNLP
        :param
            - memory: str
            - timeout: int
            - properties: dict
        :return:
            - texts: list,
                List of sentences resolved and unresolved by coreference resolution operation.
        """

        # Start CoreNLP Server with required properties
        with CoreNLPClient(pipeline='StanfordCoreNLP',
                           timeout=timeout,
                           memory=memory,
                           properties=properties) as client:
            texts = self.input_data()
            index = 0
            time.sleep(10)
            for text in texts:
                doc = self.nlp(text)
                modified_text = [
                    sentence.string.strip() for sentence in doc.sents
                ]
                # submit the request to the server
                ann = client.annotate(text)
                # In each chain, replace the anaphora with the correct representative
                for coref in ann.corefChain:
                    mts = [mention for mention in coref.mention]
                    representative = coref.representative
                    phrase_rep = self.create_phrase(mts[coref.representative],
                                                    ann)
                    antecedent = ' '.join(word for word in phrase_rep)
                    check_rep = 0
                    for mention in coref.mention:
                        if check_rep == representative:
                            check_rep += 1
                            continue
                        phrase = self.create_phrase(mts[check_rep], ann)
                        anaphor = ' '.join(word for word in phrase)
                        anaphor = anaphor + ' '
                        antecedent = antecedent + ' '
                        modified_text[mention.sentenceIndex] = modified_text[
                            mention.sentenceIndex].replace(
                                anaphor, antecedent)
                        check_rep += 1
                modified_text = ' '.join(modified_text)
                texts[index] = modified_text
                index += 1
        if self.coref_output is True:
            self.coref_output_file(texts)
        return texts
Example #30
def _annotate_parse(client: CoreNLPClient, doc: str):
    """
    Helper function for parallel processing.

    @param client: CoreNLP client instance
    @param doc: the document to process; should be as large as possible
    @param index: process number (0, 1, 2, ..., n_process)
    @return: the processed document
    """

    obj_doc = client.annotate(doc)
    iter_sentences = parse_serialized_document(obj_doc)
    return iter_sentences
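A hypothetical single-client usage of the helper above, assuming its serialized-output dependencies are importable; the wrapper in Example #9 presumably calls it once per pooled client:

# Hypothetical usage with a single client and the serialized output format.
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos'],
                   output_format='serialized') as client:
    for sentence in _annotate_parse(client, "A short test document."):
        print(sentence)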