def get_corenlp_client(corenlp_path="", corenlp_port=0, annotators=None):
    """
    :param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
    :type corenlp_path: str (default = "")
    :param corenlp_port: corenlp port, e.g., 9000
    :type corenlp_port: int (default = 0)
    :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
    :type annotators: Union[List, None] (default = None)
    :return: the corenlp client and whether the client is external
    :rtype: Tuple[stanfordnlp.server.CoreNLPClient, bool]
    """
    if corenlp_port == 0:
        return None, True
    if not annotators:
        annotators = list(ANNOTATORS)

    if is_port_occupied(port=corenlp_port):
        try:
            os.environ["CORENLP_HOME"] = corenlp_path
            corenlp_client = CoreNLPClient(
                annotators=annotators,
                timeout=99999,
                memory='4G',
                endpoint="http://localhost:%d" % corenlp_port,
                start_server=False,
                be_quiet=False)
            # corenlp_client.annotate("hello world", annotators=list(annotators), output_format="json")
            return corenlp_client, True
        except BaseException as err:
            raise err
    elif corenlp_path != "":
        print("Starting corenlp client at port {}".format(corenlp_port))
        corenlp_client = CoreNLPClient(
            annotators=annotators,
            timeout=99999,
            memory='4G',
            endpoint="http://localhost:%d" % corenlp_port,
            start_server=True,
            be_quiet=False)
        corenlp_client.annotate("hello world", annotators=list(annotators), output_format="json")
        return corenlp_client, False
    else:
        return None, True
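
# A minimal usage sketch for get_corenlp_client (not part of the original module):
# the corenlp_path below is a hypothetical install location, and the helper assumes
# ANNOTATORS and is_port_occupied are defined in the surrounding module. The second
# return value indicates whether the server was already running externally, so we
# only stop servers this sketch started itself.
def example_get_corenlp_client():
    client, is_external = get_corenlp_client(
        corenlp_path="/path/to/stanford-corenlp-3.9.2",  # hypothetical path
        corenlp_port=9000,
        annotators=["tokenize", "ssplit", "pos"])
    if client is None:
        return
    ann = client.annotate("Barack Obama was born in Hawaii.", output_format="json")
    print(len(ann["sentences"]))
    if not is_external:
        client.stop()  # only stop a server this process started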
class NLPclient:
    def __init__(self, core_nlp_version='2018-10-05'):
        from stanza.server import CoreNLPClient
        self.annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref']
        self.client = CoreNLPClient(annotators=self.annotators)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()

    def step(self, text):
        core_nlp_output = self.client.annotate(text=text, annotators=self.annotators,
                                               output_format='json')
        for sentence in core_nlp_output['sentences']:
            lexs = tuple(lexs_of(sentence))
            deps = deps_of(sentence)
            ies = tuple(ies_of(sentence))
            yield lexs, deps, ies

    def extract(self, text):
        tail = clean_text(text)
        while tail:
            chunk = 2 ** 13
            head = tail[0:chunk]
            tail = tail[chunk:]
            # print('EXTRACTING FROM', len(head), 'chars.')
            yield from self.step(head)
def process_q_batch(batch: List[str], tagger_client: CoreNLPClient) -> List[List[str]]:
    n_questions = len(batch)
    assert n_questions > 0
    text = " ".join(batch)
    assert len(text) <= tagger_client.DEFAULT_MAX_CHAR_LENGTH
    ann = tagger_client.annotate(text)
    assert len(ann.sentence) == n_questions
    return [process_tagged(s) for s in ann.sentence]
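
# Illustrative-only sketch of calling process_q_batch with a small question batch,
# assuming the CoreNLPClient is configured so that ssplit yields exactly one sentence
# per question (e.g. each question ends with a question mark) and that process_tagged
# is defined elsewhere in the module.
def example_process_q_batch(tagger_client: CoreNLPClient) -> None:
    batch = ["Who wrote Hamlet?", "Where was Einstein born?"]
    tagged = process_q_batch(batch, tagger_client)
    assert len(tagged) == len(batch)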
def _annotate_parse(client: CoreNLPClient, doc: str):
    """
    Helper function for parallel processing.

    @param client: CoreNLP client instance
    @param doc: document to process; it is best to make it as large as possible
    @return: the processed document (an iterator over its parsed sentences)
    """
    obj_doc = client.annotate(doc)
    iter_sentences = parse_serialized_document(obj_doc)
    return iter_sentences
class CoreNLPProcessor(AbstractNLPProcessor):
    def grammar(self):
        ADP = '<RB|RBR|RP|TO|IN|PREP>'
        NP = '<JJ|ADJ>*<NN|VBG|RBS|FW|NNS|PRP|PRP$>+<POS>?<CD>?'
        return """
            NP: {{({NP})+({ADP}?<DT>?{NP})*}}
            VP: {{<VB*>+{ADP}?}}
            PNP: {{<NNP|NNPS>+}}
        """.format(NP=NP, ADP=ADP)

    def __init__(self):
        super().__init__()
        os.environ["CORENLP_HOME"] = os.path.join(
            os.getcwd(), 'stanford-corenlp-full-2018-10-05')
        self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                    timeout=30000, memory='4G')

    def __del__(self):
        self.tagger.stop()

    def _extract_ner(self, token):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        return [(n.entityMentionText, n.entityType) for n in sentence.mentions]

    def extract_named_entities(self, token):
        entities = self._extract_ner(token)
        entities = list(set(map(lambda x: x[0], entities)))
        return entities

    def get_named_entity_types(self, token):
        return [entity[1] for entity in self._extract_ner(token)]

    def extract_phrase_by_type(self, token, type):
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        tagged = [(token.word, token.pos) for token in sentence.token]
        return self._extract_phrase(tagged, type)
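
# A small, hedged usage sketch for CoreNLPProcessor, assuming AbstractNLPProcessor and
# the stanford-corenlp-full-2018-10-05 directory referenced in __init__ are available
# in the working directory. The printed entity types depend on the NER models used.
def example_corenlp_processor():
    processor = CoreNLPProcessor()
    text = "Angela Merkel visited Paris in 2019."
    print(processor.extract_named_entities(text))  # unique entity mention strings
    print(processor.get_named_entity_types(text))  # e.g. PERSON, LOCATION, DATE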
class CoreNLPBinaryParser:
    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port),
                                     annotators=['parse'], output_format='json',
                                     properties={'ssplit.eolonly': 'true'},
                                     timeout=300000, memory='8G', threads=threads,
                                     server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(unicode_repr(child))
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        # collapse
        t = Tree.fromstring(tree)
        # chomsky normal form transformation
        Tree.collapse_unary(t, collapsePOS=True, collapseRoot=True)
        Tree.chomsky_normal_form(t)
        s = cls._format(t)
        return s

    def parse(self, text):
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        if self.run:
            self.corenlp.stop()
            self.run = False
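
# Usage sketch for CoreNLPBinaryParser (assumes CORENLP_HOME points at a local CoreNLP
# distribution so the client can start its own server). parse() returns a binarized,
# nested bracketing over the tokens as a string rather than an nltk.Tree.
def example_binary_parser():
    parser = CoreNLPBinaryParser(threads=1)
    try:
        print(parser.parse("The cat sat on the mat."))
    finally:
        parser.stop()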
class StanfordParser(DependencyParserWrapper):
    def __init__(self):
        super().__init__()
        self.client = CoreNLPClient()

    def __call__(self, sent):
        return self.get_spans(sent)

    def get_spans(self, sent):
        ann = self.client.annotate(sent, annotators='parse', output_format='json')
        dependencies_list = ann['sentences'][0]['basicDependencies']
        heads = get_heads(dependencies_list)
        tree = DependencyParserWrapper.head_to_tree(heads)
        non_singletons = DependencyParserWrapper.compute_spans(tree)
        singletons = [(n, n + 1) for n in range(len(dependencies_list))]
        return set(non_singletons) | set(singletons)
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
class StanzaClient():
    def __init__(self):
        self.client = CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
                                    timeout=30000, properties="zh",
                                    output_format="json", memory='5g')

    def get_parse_tree(self, sent):
        ann = self.client.annotate(sent)
        return ann["sentences"][0]["parse"]
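
# A brief usage sketch for StanzaClient: properties="zh" means the client expects
# Chinese text, so CORENLP_HOME must point at a CoreNLP distribution with the Chinese
# models installed for this to work.
def example_stanza_client():
    client = StanzaClient()
    print(client.get_parse_tree("今天天气很好。"))  # prints the constituency parse string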
def coreference_resolution(text):
    import os
    from stanza.server import CoreNLPClient

    os.environ["CORENLP_HOME"] = "/home/soheil/Downloads/corenlp"

    # Set up the client. 'dcoref' runs the deterministic multipass-sieve coreference system;
    # it needs the annotators tokenize, ssplit, pos, lemma, ner, parse, dcoref.
    # The statistical/neural systems instead need pos, lemma, ner, parse, coref, e.g.:
    # with CoreNLPClient(annotators=['pos', 'lemma', 'ner', 'parse', 'coref'],
    #                    timeout=5000, memory='2G') as client:
    #     properties={'annotators': 'coref', 'coref.algorithm': 'statistical'}
    # A different endpoint can be chosen via endpoint='http://localhost:9001'.
    client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref'],
        memory='2G',
        timeout=5000,
        output_format='json'
    )

    # Start the background server and wait for it to come up.
    # Background Java processes can be inspected with: ps -o pid,cmd | grep java
    client.start()

    # Example input:
    # text = "Albert Einstein was a German-born theoretical physicist. He developed the theory of relativity."
    ann = client.annotate(text)

    # Shut down the background CoreNLP server
    client.stop()

    # With output_format='json', ann['corefs'] maps coreference chain ids to lists of mentions.
    for chain in ann['corefs'].values():
        for mention in chain:
            print(mention['text'])
class QuestionModel:
    def __init__(self):
        self.client = CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner',
                                                'parse', 'depparse', 'coref'],
                                    timeout=30000, memory='16G', threads=1)

    # Returns whether or not the provided sentence contains any questions
    def isQuestion(self, text):
        ann = self.client.annotate(text)
        sentence = ann.sentence[0]
        constituency_parse = sentence.parseTree
        for child in constituency_parse.child:
            if child.value == "SQ" or child.value == "SBARQ":
                return True
        return False
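
# Usage sketch for QuestionModel.isQuestion. The check relies on an SQ or SBARQ node
# appearing directly under the root of the constituency parse, which covers most direct
# questions but may miss declarative questions.
def example_question_model():
    model = QuestionModel()
    print(model.isQuestion("Where is the library?"))   # expected: True
    print(model.isQuestion("The library is closed."))  # expected: False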
def annotate(sentence, lower=True):
    global client
    if client is None:
        # import pdb; pdb.set_trace()
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','), be_quiet=True)
    words, gloss, after = [], [], []
    sent_annotated = client.annotate(sentence).sentence[0]
    for t in sent_annotated.token:
        words.append(t.word)
        gloss.append(t.originalText)
        after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(endpoint="http://localhost:9001",
                               annotators=['ssplit', 'tokenize'], start_server=False)
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
class Preprocessor:
    log = None

    def __init__(self, host=None):
        """
        This preprocessor connects to a CoreNLP server to perform sentence splitting,
        tokenization, syntactic parsing, named entity recognition and coreference
        resolution on the passed documents.

        :param host: the CoreNLP host
        """
        self.log = logging.getLogger('GiveMe5W')

        # connect to the CoreNLP server
        host = "http://localhost:9000" if host is None else host
        self.cnlp = CoreNLPClient(endpoint=host, start_server=StartServer.DONT_START)

        # define the basic base_config and the desired processing pipeline
        self.base_config = {
            'timeout': 500000,
            'annotators': 'tokenize,ssplit,pos,lemma,parse,ner,depparse,mention,coref',
            'tokenize.language': 'English',
            # 'coref.algorithm': 'neural', see https://github.com/smilli/py-corenlp/issues/18
            # CoreNLP's charniak-wrapper has some problems ...
            # 'parse.type': 'charniak',
            # 'parse.executable': '/home/ubuntu/bllip-parser/',
            # 'parse.verbose': 'true',
            # 'parse.model': './parse-50best.sh',  # '~/.local/share/bllipparser/WSJ+Gigaword-v2',
            'outputFormat': 'json'
        }
        self._token_index = None

    def _link_leaf_to_core_nlp(self, s):
        """
        This is where the magic happens: attach additional information to each
        candidate part/token/leaf. Character index information is available in
        each nlpToken.
        """
        if len(self._tokens) - 1 < self._token_index:
            # There seems to be a bug around numbers: numbers split within the same token
            # are reported as if they had been split into different tokens. This leads to
            # a wrong index, and everything up to the end of this sentence is lost.
            self.log.error(
                'fix the doc around (reformat number, remove special characters): ' + s)
            # print the last two tokens to make the problem easy to spot
            self.log.error(self._tokens[-1])
            self.log.error(self._tokens[-2])

            # We cannot return None here because that would break the extractors,
            # so we use this bugfix object instead.
            # TODO: consider rejecting such documents entirely, because the result is not reliable
            # TODO: at least flag the document with some error flags
            result = {
                'nlpToken': {
                    'index': 7,
                    'word': 'BUGFIX',
                    'originalText': 'BUGFIX',
                    'lemma': 'BUGFIX',
                    'characterOffsetBegin': 0,
                    'characterOffsetEnd': 0,
                    'pos': 'BUGFIX',
                    'ner': 'BUGFIX',
                    'speaker': 'BUGFIX',
                    'before': ' ',
                    'after': ''
                }
            }
            if self._document:
                self._document.set_error_flag('core_nlp')
        else:
            result = {'nlpToken': self._tokens[self._token_index]}

        self._token_index = self._token_index + 1
        return result

    def _build_actual_config(self, document):
        """
        Creates the actual config, consisting of the base_config and dynamic_params.
        If the same key exists in both base_config and dynamic_params, the value from
        dynamic_params is used, i.e., base_config is overwritten.

        :param document:
        :return:
        """
        dynamic_config = {'date': document.get_date()}
        actual_config = {**self.base_config, **dynamic_config}
        return actual_config

    def preprocess(self, document):
        """
        Send the document to the CoreNLP server to execute the necessary preprocessing.

        :param document: Document object to process.
        :type document: Document

        :return Document: The processed Document object.
""" actual_config = self._build_actual_config(document) annotation = self.cnlp.annotate(text=document.get_full_text(), properties=actual_config) if type(annotation) is str: print(annotation) else: document.set_sentences(annotation['sentences'], [], []) self._document = document tree = [] for sentence in annotation['sentences']: # that's a hack to add to every tree leave a the tokens result self._token_index = 0 self._tokens = sentence['tokens'] sentence_tree = nltk.ParentedTree.fromstring( sentence['parse'], read_leaf=self._link_leaf_to_core_nlp) # add a reference to the original data from parsing for this sentence sentence_tree.stanfordCoreNLPResult = sentence tree.append(sentence_tree) document.set_trees(tree) document.set_corefs(annotation['corefs']) tokens = [] pos = [] ner = [] for sentence in annotation['sentences']: s_tokens = [] s_pos = [] s_ner = [] for token in sentence['tokens']: s_tokens.append(token) s_pos.append((token['originalText'], token['pos'])) s_ner.append((token['originalText'], token['ner'])) tokens.append(s_tokens) pos.append(s_pos) ner.append(s_ner) document.set_tokens(tokens) document.set_pos(pos) document.set_ner(ner) document.set_enhancement('coreNLP', annotation) document.is_preprocessed(True)
class TextParser(object):
    """TextParser uses the Stanford CoreNLP tool to parse text.

    Args:
        port (int): the port to launch a Stanford CoreNLP client.
    """

    def __init__(self, port=9000):
        resources.get_corenlp()
        while is_port_in_use(port):
            port += 1
        self._core_nlp_client = CoreNLPClient(
            annotators=['parse'], timeout=600000, memory='16G', be_quiet=True,
            endpoint="http://localhost:%d" % port)

    def _get_parse_tree(self, sentence):
        """Generate a parsing tree from a sentence."""
        annotation = self._core_nlp_client.annotate(sentence)
        if len(annotation.sentence) != 1:
            logger.warning(
                "_get_parse_tree should take one sentence, but %s is given." % sentence)
        return annotation.sentence[0].parseTree

    def split_paragraph_to_sentences(self, paragraph):
        """Split a paragraph into a list of sentences.

        Args:
            paragraph (str): a paragraph of English text.

        Returns:
            ([str]): a list of sentences.
        """
        paragraph = preprocess(paragraph)
        annotation = self._core_nlp_client.annotate(paragraph)
        return [
            " ".join([token.word for token in sentence.token])
            for sentence in annotation.sentence
        ]

    def get_phrases(self, sentence, phrase_level=2):
        """Split a sentence into phrases.

        Args:
            sentence (str): a sentence in English.
            phrase_level (int): larger value gives shorter phrases.

        Returns:
            ([str]): a list of phrases.
        """
        root = self._get_parse_tree(sentence)
        phrases = []
        generate_phrases_by_dfs(root, 0, phrase_level, phrases)
        return phrases

    def phrase_level_shuffle(self, paragraph, n):
        """For each sentence, randomly shuffle the phrases in that sentence.

        Args:
            paragraph (str): a paragraph of text.
            n (int): number of randomly shuffled paragraphs to generate.

        Returns:
            ([str]): a list of ``n`` shuffled paragraphs.
        """
        paragraph = preprocess(paragraph)
        sentences = self.split_paragraph_to_sentences(paragraph)
        bins = []
        for sentence in sentences:
            bins.append(self.get_phrases(sentence))
        ret = []
        for i in range(n):
            sent = ""
            for bin in bins:
                bin_tmp = bin[:]
                random.shuffle(bin_tmp)
                sent += " " + " ".join(bin_tmp)
            ret.append(sent.strip())
        return ret
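
# Short usage sketch for TextParser: resources.get_corenlp() is assumed to fetch the
# CoreNLP distribution, and the client starts its own server on the first free port at
# or above the requested one.
def example_text_parser():
    parser = TextParser(port=9000)
    paragraph = "The weather is nice today. We went for a walk in the park."
    print(parser.split_paragraph_to_sentences(paragraph))
    print(parser.get_phrases("We went for a walk in the park.", phrase_level=2))
    print(parser.phrase_level_shuffle(paragraph, n=2))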
def annotate_document_file(client: CoreNLPClient, input: str, filename: str):
    with open(filename, "wb") as file:
        ann = client.annotate(input)
        file.write(ann.SerializeToString())