def __init__(self, core_nlp_version: str = '4.1.0'):
    self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
    self.install_dir.mkdir(exist_ok=True)
    if len([d for d in self.install_dir.glob('*') if d.is_dir()]) == 0:
        # No CoreNLP directories. Let's check for ZIP archives as well.
        zip_files = [d for d in self.install_dir.glob('*') if d.suffix == '.zip']
        if len(zip_files) == 0:
            # No dir and no ZIP. Let's download it with the desired core_nlp_version.
            remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-{}.zip'.format(core_nlp_version)
            print('Downloading from %s.' % remote_url)
            output_filename = wget.download(remote_url, out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
        else:
            output_filename = zip_files[0]
            print('Unzip %s.' % output_filename)
        zf = ZipFile(output_filename)
        zf.extractall(path=self.install_dir)
        zf.close()
    target_dir = [d for d in self.install_dir.glob('*') if d.is_dir()][0]
    os.environ['CORENLP_HOME'] = str(self.install_dir / target_dir)
    from stanfordnlp.server import CoreNLPClient
    self.client = CoreNLPClient(annotators=['openie'], memory='8G')
class Tokenizer:
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model,
                                            do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences
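# The Tokenizer above relies on an external Token container holding a word and its
# character offsets, plus a config.bert_model setting; neither is defined in this
# snippet. A minimal sketch of such a container (an assumption, not the original code):
from typing import NamedTuple

class Token(NamedTuple):
    word: str
    begin: int  # character offset of the first character in the original document
    end: int    # character offset one past the last character

# Hypothetical usage, assuming a local CoreNLP installation so the client can start:
# tokenizer = Tokenizer()
# sentences = tokenizer.tokenize("Stanford University is in California. It was founded in 1885.")
# for sent in sentences:
#     print([(t.word, t.begin, t.end) for t in sent])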
def get_corenlp_client(corenlp_path, corenlp_port):
    from stanfordnlp.server import CoreNLPClient

    annotators = ["tokenize", "ssplit"]
    os.environ["CORENLP_HOME"] = corenlp_path
    if is_port_occupied(port=corenlp_port):
        try:
            corenlp_client = CoreNLPClient(annotators=annotators,
                                           timeout=99999,
                                           memory='4G',
                                           endpoint="http://localhost:%d" % corenlp_port,
                                           start_server=False,
                                           be_quiet=False)
            return corenlp_client
        except Exception as err:
            raise err
    else:
        print("Starting corenlp client at port {}".format(corenlp_port))
        corenlp_client = CoreNLPClient(annotators=annotators,
                                       timeout=99999,
                                       memory='4G',
                                       endpoint="http://localhost:%d" % corenlp_port,
                                       start_server=True,
                                       be_quiet=False)
        return corenlp_client
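# get_corenlp_client (and the variant further below) relies on an is_port_occupied
# helper that is not shown in these snippets. A minimal sketch using the standard
# socket module; this is an assumption, not the original implementation:
import socket

def is_port_occupied(port, ip="127.0.0.1"):
    """Return True if something is already listening on ip:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex((ip, port)) == 0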
def __init__(self, core_nlp_version: str = '2018-10-05', threads: int = 5, close_after_finish: bool = True):
    self.remote_url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(core_nlp_version)
    self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser()
    self.install_dir.mkdir(exist_ok=True)
    if not (self.install_dir / Path('stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
        print('Downloading to %s.' % self.install_dir)
        output_filename = wget.download(self.remote_url, out=str(self.install_dir))
        print('\nExtracting to %s.' % self.install_dir)
        zf = ZipFile(output_filename)
        zf.extractall(path=self.install_dir)
        zf.close()
    # Point CORENLP_HOME at the requested version rather than a hard-coded one.
    os.environ['CORENLP_HOME'] = str(self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
    from stanfordnlp.server import CoreNLPClient
    self.close_after_finish = close_after_finish
    self.client = CoreNLPClient(annotators=['openie'], memory='8G', threads=threads)
def make_corenlp_client(self, annotators=["tokenize", "ssplit"], endpoint="http://localhost:9000",
                        properties_name="french", properties_dict=None, quiet=True):
    LEGACY_PROPERTIES = {}
    FRENCH_PROPERTIES = {
        "tokenize.language": "French",
        "tokenize.options": "ptb3Dashes=true"
    }
    PROPERTIES = {"legacy": LEGACY_PROPERTIES, "french": FRENCH_PROPERTIES}
    if properties_dict is not None:
        properties = properties_dict
    else:
        if properties_name in PROPERTIES:
            properties = PROPERTIES[properties_name]
        else:
            raise ValueError("Unknown properties '%s'" % properties_name)
    # /dev/null must be opened for writing so the server's output can be discarded.
    devnull = open(os.devnull, 'w')
    stdout = devnull if quiet else sys.stdout
    stderr = devnull if quiet else sys.stderr
    self.corenlp_client = CoreNLPClient(annotators=annotators,
                                        endpoint=endpoint,
                                        stdout=stdout,
                                        stderr=stderr,
                                        memory="8G",
                                        heapsize="8G",
                                        threads=8,
                                        timeout=15000,
                                        properties=properties)
def __init__(self) -> None:
    os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
    self.client = CoreNLPClient()
    self.client.ensure_alive()
    self.do_lower_case = '-cased' not in config.bert_model
    self.basic_tokenizer: BasicTokenizer \
        = BertTokenizer.from_pretrained(config.bert_model,
                                        do_lower_case=self.do_lower_case).basic_tokenizer
def __init__(self, tagger='spacy', language='french'):
    self.tagger = tagger
    self.tagmodule = None
    self.tagset = UTagSet  # tag set by default
    self.language = language
    spacy_module = {
        'french': 'fr_core_news_sm',
        'english': 'en_core_web_sm'
    }
    if tagger == 'spacy':
        self.tagger = self.spacy_pos_tag
        self.tagset = UDTagSet
        try:
            self.tagmodule = spacy.load(spacy_module[language])
        except:
            logger.warning('Module for language [{:s}] not installed for Spacy - using french by default'.format(language))
            self.tagmodule = spacy.load(spacy_module['french'])
    elif tagger == 'stanford':
        self.tagger = self.stanford_pos_tag
        self.tagset = FTTagSet
        JAVAHOME = "C:/Program Files (x86)/Java/jre1.8.0_241/bin/java.exe"
        # Set a JAVAHOME environment variable if not present.
        if 'JAVAHOME' not in os.environ:
            os.environ['JAVAHOME'] = JAVAHOME
        root_path = "./stanford-postagger/"  # location of Stanford POS Tagger components
        # Launch the Stanford POS Tagger (implemented in Java).
        self.tagmodule = StanfordPOSTagger(root_path + "models/" + language + ".tagger",
                                           root_path + "stanford-postagger.jar",
                                           encoding='utf8')
    elif tagger == 'core_nlp':
        self.tagger = self.corenlp_pos_tag
        os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
        try:
            self.tagmodule = CoreNLPClient(properties=language,
                                           annotators=['pos'],
                                           timeout=30000,
                                           memory='1G')
        except:
            logger.warning('Could not launch Stanford Core NLP for [{:s}]'.format(language))
    elif tagger == 'nltk':
        self.tagger = self.nltk_pos_tag
        self.tagset = NLTKTagSet
        if language != 'english':
            logger.warning('nltk does not support [{:s}] language'.format(language))
    else:
        logger.warning('POS tagger [{:s}] unknown'.format(tagger))
def __init__(self, corenlp_home, endpoint='http://localhost:9000', timeout=15000, memory='2G'):
    print('Set up Stanford CoreNLP Server.')
    if os.path.exists(corenlp_home):
        os.environ['CORENLP_HOME'] = corenlp_home
    else:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), corenlp_home)
    self.client = CoreNLPClient(annotators=['depparse'],
                                endpoint=endpoint,
                                timeout=timeout,
                                memory=memory)
    # Warm-up request so the first real annotation does not pay the server start-up cost.
    self.client.annotate('Prepare.')
def __init__(self, start_server=True, endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
    self.__client = CoreNLPClient(start_server=start_server,
                                  endpoint=endpoint,
                                  annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'],
                                  output_format='json')
    self.__client.start()
def serve_stanfordnlp_client():
    return CoreNLPClient(endpoint='http://localhost:9000',
                         timeout=30000,
                         threads=4,
                         annotators='sentiment',
                         memory='8G')
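# A minimal usage sketch for serve_stanfordnlp_client. Using the returned client as a
# context manager starts and stops the bundled server automatically; the sample
# sentence and the json output properties below are assumptions for illustration only.
# with serve_stanfordnlp_client() as client:
#     ann = client.annotate("The movie was surprisingly good.",
#                           properties={'annotators': 'sentiment', 'outputFormat': 'json'})
#     for sentence in ann['sentences']:
#         print(sentence['sentiment'], sentence['sentimentValue'])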
def syntactic_parse_texts(texts: List[str], tokenize=False, sentence_split=False, verbose=False):
    if verbose:
        print(f"Parsing {len(texts)} texts...")
    corenlp_annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse']
    annotators_properties = {
        'tokenize.whitespace': not tokenize,
        'ssplit.eolonly': not sentence_split,
        'depparse.model': "edu/stanford/nlp/models/parser/nndep/english_SD.gz",
        'outputFormat': 'json'
    }
    if not STANFORD_CORENLP_DIR.exists():
        download_stanford_corenlp()
    os.environ['CORENLP_HOME'] = str(STANFORD_CORENLP_DIR)

    parse_results = []
    with CoreNLPClient(annotators=corenlp_annotators) as client:
        for text in tqdm(texts, disable=(not verbose)):
            if isinstance(text, List):
                text = ' '.join(text)
            raw_parse_result = client.annotate(text, properties=annotators_properties)
            parse_result = format_parser_output(raw_parse_result['sentences'])

            if len(parse_result['sentences']) > 1 and not sentence_split:
                parse_result = join_parse_result(parse_result)
            elif sentence_split:
                parse_result = split_parse_result(parse_result['sentences'])

            parse_results.append(parse_result)

    return parse_results
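# syntactic_parse_texts (and its variant further below) assumes a STANFORD_CORENLP_DIR
# constant and a download_stanford_corenlp() helper that are not shown. A rough sketch
# of what they might look like; the URL, version, and target directory are assumptions:
from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile

STANFORD_CORENLP_DIR = Path.home() / "stanford-corenlp-full-2018-10-05"

def download_stanford_corenlp():
    """Download and unpack CoreNLP so that STANFORD_CORENLP_DIR exists."""
    url = "https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip"
    archive_path, _ = urlretrieve(url, str(STANFORD_CORENLP_DIR.parent / "corenlp.zip"))
    with ZipFile(archive_path) as zf:
        zf.extractall(path=str(STANFORD_CORENLP_DIR.parent))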
def process_book(header_annot_dir, lemma_dir, tree_dir, book_id):
    if os.path.exists(os.path.join(tree_dir, book_id + '.xml')) and \
            os.path.exists(os.path.join(lemma_dir, book_id + '.pkl')):
        return book_id, 'Exists'
    # Expand '~' explicitly; the value of CORENLP_HOME is consumed by the Java launcher,
    # which does not perform tilde expansion on its own.
    os.environ["CORENLP_HOME"] = os.path.expanduser("~/stanford_corenlp/stanford-corenlp-full-2018-10-05")
    try:
        with CoreNLPClient(annotators=['tokenize', 'lemma'],
                           timeout=30000,
                           max_char_length=100000000,
                           be_quiet=True,
                           start_server=False) as client:
            tree, para_end_sentences, lemma_dict = sentencize(header_annot_dir, client, book_id)
            tree2 = paragraphize(tree, para_end_sentences)
            filename = os.path.join(tree_dir, book_id + '.xml')
            tree2.write(filename, pretty_print=True)
            with open(os.path.join(lemma_dir, book_id + '.pkl'), 'wb') as f:
                pickle.dump(lemma_dict, f)
    except Exception as e:
        print(book_id, e)
        return book_id, e
    print(book_id, 'Success!')
    return book_id, 'Success'
def tokenize(data, src_keys=['title', 'body'], tgt_key='text'):
    """Use Stanford CoreNLP tokenizer to tokenize all the documents."""
    REMAP = {
        "-LRB-": "(", "-RRB-": ")", "-LCB-": "{", "-RCB-": "}",
        "-LSB-": "[", "-RSB-": "]", "``": '"', "''": '"'
    }
    with CoreNLPClient(annotators=['tokenize', 'ssplit'], threads=CPU_CNT) as client:
        for did, d in tqdm(data.items()):
            text = ''
            for k in src_keys:
                text += d[k] + ' '
            ann = client.annotate(text.strip())
            tokens = []  # list of tokenized sentences
            for sent in ann.sentence:
                tokens.append([
                    REMAP[t.word] if t.word in REMAP else t.word.lower()
                    for t in sent.token
                ])
            d[tgt_key] = tokens
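# A small, hypothetical usage sketch for tokenize. The document keys and the CPU_CNT
# value are assumptions, and a local CoreNLP installation is required for the client
# to start.
# CPU_CNT = 4
# data = {
#     'doc1': {'title': 'Stanford CoreNLP', 'body': 'It tokenizes text (and more).'},
# }
# tokenize(data)
# print(data['doc1']['text'])
# # e.g. [['stanford', 'corenlp', 'it', 'tokenizes', 'text', '(', 'and', 'more', ')', '.']]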
def process_document(doc, doc_id=None):
    """Main method: annotate a document using the CoreNLP client.

    Arguments:
        doc {str} -- raw string of a document
        doc_id {str} -- raw string of a document ID

    Returns:
        sentences_processed {[str]} -- a list of processed sentences with NER tagged
            and MWEs concatenated
        doc_ids {[str]} -- a list of processed sentence IDs [docID1_1, docID1_2...]

    Example:
        Input: "When I was a child in Ohio, I always wanted to go to Stanford University
            with respect to higher education. But I had to go along with my parents."
        Output:
            'when I be a child in [NER:LOCATION]Ohio , I always want to go to
             [NER:ORGANIZATION]Stanford_University with_respect_to higher education . '
            'but I have to go_along with my parent . '
            doc1_1 doc1_2

    Note: when the doc is empty, both doc_id and sentences_processed will be too.
        (@TODO: fix for consistency)
    """
    with CoreNLPClient(endpoint="http://localhost:9002", start_server=False) as client:
        doc_ann = client.annotate(doc)
        sentences_processed = []
        doc_sent_ids = []
        for i, sentence in enumerate(doc_ann.sentence):
            sentences_processed.append(process_sentence(sentence))
            doc_sent_ids.append(str(doc_id) + "_" + str(i))
    return "\n".join(sentences_processed), "\n".join(doc_sent_ids)
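# Hypothetical call to process_document, assuming a CoreNLP server with the relevant
# annotators is already listening on port 9002 (start_server=False above means this
# function will not launch one itself):
# sentences, ids = process_document(
#     "When I was a child in Ohio, I always wanted to go to Stanford University.",
#     doc_id="doc1")
# print(sentences)
# print(ids)  # e.g. "doc1_0" (sentence indices come from enumerate, starting at 0)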
def _preprocess_and_save(self, data, cached_examples_dir):
    index = dict()
    index['guids'] = []
    index['feafile_name'] = []
    index['offset'] = []
    index['label_ids'] = []
    feafile_name = "fea"

    output = dict()
    for key in self.feature_keys:
        output[key] = list()

    with CoreNLPClient(annotators=['natlog'], timeout=60000, memory='16G') as client:
        for ex in tqdm(data):
            # # preprocess
            # if self.drop_unk_samples and ex["gold_label_id"] not in self.id2label_dict.keys():
            #     continue
            this_output = self._data2feature(ex, client)
            if len(this_output) == 0:
                continue
            for key in self.feature_keys:
                output[key].append(this_output[key])

    output = self._fea2tensor(output)
    index["guids"] = output["guids"]
    index['feafile_name'] = [feafile_name] * len(index["guids"])
    index['offset'] = list(range(len(index["guids"])))
    index["label_ids"] = output["label_ids"].numpy()

    torch.save(output, os.path.join(cached_examples_dir, feafile_name))
    self._save_index(cached_examples_dir, index)
    return
def __init__(self, tagged_dataset_path, database_path, corenlp_path):
    self.target_values_map = {}
    for filename in os.listdir(tagged_dataset_path):
        filename = os.path.join(tagged_dataset_path, filename)
        print('Reading dataset from', filename, file=sys.stderr)
        with open(filename, 'r', encoding='utf8') as fin:
            header = fin.readline().rstrip('\n').split('\t')
            for line in fin:
                stuff = dict(zip(header, line.rstrip('\n').split('\t')))
                ex_id = stuff['id']
                original_strings = tsv_unescape_list(stuff['targetValue'])
                canon_strings = tsv_unescape_list(stuff['targetCanon'])
                self.target_values_map[ex_id] = to_value_list(original_strings, canon_strings)
    os.environ['CORENLP_HOME'] = corenlp_path
    self.client = CoreNLPClient(annotators=['ner'])
    self.db_path = database_path
def get_corenlp_client(corenlp_path, corenlp_port):
    os.environ["CORENLP_HOME"] = corenlp_path
    assert not is_port_occupied(corenlp_port), \
        "Port {} is occupied by other process".format(corenlp_port)
    corenlp_client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        timeout=60000,
        memory='5G',
        endpoint="http://localhost:%d" % corenlp_port,
        start_server=True,
        be_quiet=False)
    # Warm-up request so the server is fully started before the caller uses it.
    corenlp_client.annotate(
        "hello world",
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        output_format="json")
    return corenlp_client
def preprocess_data(args):
    os.environ['CORENLP_HOME'] = args.corenlp_dir
    texts = []
    if "Treebank" in args.parse_type:
        print('Join the separated edus in *.edus file into *.text file with a single line...')
        texts = [(join_edus(fedu), fname)
                 for fedu, fname in [(os.path.join(args.data_dir, fname), fname)
                                     for fname in os.listdir(args.data_dir)
                                     if fname.endswith('.edus')]]
    elif args.parse_type == "Wiki":
        data = pd.read_excel(os.path.join(args.data_dir, "Wikipedia_afd_persuasive.xlsx"))
        texts = [(text, '') for text in data['rationale'].values]

    file_list = []
    corenlp_list = []
    save_path = os.path.join(args.output_dir, args.parse_type)
    if "Treebank" in args.parse_type:
        save_path = os.path.join(args.output_dir, args.parse_type, args.data_dir.split("/")[2])

    if not os.path.exists(os.path.join(args.output_dir, args.parse_type, "corenlp_data.p")) \
            or not os.path.getsize(os.path.join(args.output_dir, args.parse_type, "corenlp_data.p")):
        with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse'],
                           timeout=30000, memory='16G', output_format='xml') as client:
            for text, fname in texts:
                print(text)
                if text and not pd.isna(text):
                    if args.parse_type == "Wiki":
                        regular = re.compile(r"\[http.*]")
                        stop = ["Keep ", "*<sKeep or </s", "Delete - ", "Weak Keep - ", "<s>Delete</s -",
                                "Keep - ", "Delete ", "Keep - <s>", "Keep, ", "Strong Keep - ",
                                "Keep both - ", "Keep per [[WP:NEXIST]]. ", "Keep per [[WP:SUSTAINED]]. ",
                                "Keep,", "*<sWeak Keep.", "*<sDelete", "Keep<br>", "<sKeep", "Keep—",
                                "Delete, ", "*<sDelete: ", "delete ", "*<sKeep. ", "**<delDelete. ",
                                "::<sKeep ", "Keep--", "Keep - <s>"]
                        for s in stop:
                            text = text.replace(s, "")
                        re_list = re.findall(regular, text)
                        for r in re_list:
                            text = text.replace(r, "link")
                    ann = client.annotate(text)
                else:
                    ann = ''
                corenlp_list.append((ann, fname))
        with open(os.path.join(save_path, "corenlp_data.p"), 'wb') as file:
            pickle.dump(corenlp_list, file)

    with open(os.path.join(save_path, "corenlp_data.p"), 'rb') as file:
        corenlp_list = pickle.load(file)
    for ann, fname in corenlp_list:
        # print(ann)
        if "Treebank" in args.parse_type:
            lines = merge_treebank(ann, os.path.join(args.data_dir, fname))
        elif args.parse_type == "Wiki":
            if not ann:
                lines = []
            else:
                lines = merge(ann)
        file_list.append(lines)
    with open(os.path.join(save_path, "processed_data.p"), 'wb') as file:
        pickle.dump(file_list, file)
class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self, start_server=True, endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        self.__client = CoreNLPClient(start_server=start_server,
                                      endpoint=endpoint,
                                      annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'],
                                      output_format='json')
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)
        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
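# Hypothetical usage of StanfordCoreferenceResolver, assuming an Entity type with
# start_offset/end_offset character attributes (not defined in this snippet). Note
# that resolve_coreferences reads protobuf-style attributes (annotations.corefChain,
# annotations.sentence), which suggests the client's default protobuf output rather
# than the 'json' output_format passed in __init__.
# resolver = StanfordCoreferenceResolver()
# groups = resolver.resolve_coreferences(
#     "Alice met Bob. She thanked him.",
#     entities=[Entity("Alice", start_offset=0, end_offset=5),
#               Entity("Bob", start_offset=10, end_offset=13)])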
def start_corenlp_client():
    corenlp_client = CoreNLPClient(
        start_server=True,
        endpoint='http://localhost:9000',
        memory=MEMORY_CORENLP,
        threads=50,
        timeout=10000000,
        annotators=['openie'],
        output_format="json",
        properties={
            'annotators': 'openie',
            'inputFormat': 'text',
            'outputFormat': 'json',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
            'openie.affinity_probability_cap': '1.0',
            "openie.max_entailments_per_clause": "500"
        })
    corenlp_client.TIMEOUT = 100
    return corenlp_client
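# A short, hypothetical sketch of consuming Open IE triples from the client above.
# MEMORY_CORENLP is assumed to be defined elsewhere (e.g. '8G'), and the field names
# follow CoreNLP's json Open IE output.
# client = start_corenlp_client()
# ann = client.annotate("Barack Obama was born in Hawaii.")
# for sentence in ann['sentences']:
#     for triple in sentence['openie']:
#         print(triple['subject'], triple['relation'], triple['object'])
# client.stop()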
def __init__(self, corpus, target=None, **kwargs):
    """
    The corpus is the `HTMLCorpusReader` to preprocess and pickle.
    The target is the directory on disk to output the pickled corpus to.
    """
    self.corpus = corpus
    self.target = target
    self.tagger = pos_tagger('spacy')
    # Modification for dibutade: the tagging model is read from kwargs (it was previously
    # referenced without being defined).
    model = kwargs.get('model', 'spacy')
    if model == 'stanford':
        os.environ['CORENLP_HOME'] = 'C:/Users/alain/OneDrive/Ateliers Dibutade/NLP/stanford-corenlp-full-2018-10-05'
        self.pos_tagger = CoreNLPClient(properties='french',
                                        annotators=['pos'],
                                        timeout=30000,
                                        memory='1G')
    elif model == 'spacy':
        self.nlp = spacy.load('fr_core_news_sm')
def runClient(text):
    print('---')
    print('starting up Java Stanford CoreNLP Server...')
    # set up the client
    # with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'natlog', 'openie'],
    #                    properties={"outputFormat": "json", "openie.triple.strict": "true",
    #                                "splitter.disable": "true", "openie.max_entailments_per_clause": "1"},
    #                    be_quiet=False, timeout=30000, memory='16G') as client:
    with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'natlog', 'openie'],
                       be_quiet=True, timeout=30000, memory='16G') as client:
        # Submit the request to the server, then iterate over all tokens in all sentences
        # and print out the word, lemma, pos and ner tags.
        text = "Trump is the President of America"
        document = client.annotate(text)
        output = client.annotate(text,
                                 properties={
                                     "outputFormat": "json",
                                     "openie.triple.strict": "true",
                                     "splitter.disable": "true",
                                     "openie.max_entailments_per_clause": "1"
                                 })
        # print(output)
        # Collect the Open IE triples of each sentence from the json response.
        result = [sentence["openie"] for sentence in output["sentences"]]
        print(result)
        print(result[0][0]["subject"])
        # [[{'subject': 'John', 'subjectSpan': [0, 1], 'relation': 'jumps over', 'relationSpan': [1, 3],
        #    'object': 'fox', 'objectSpan': [4, 5]}]]
        for i in result:
            for rel in i:
                relationSent = rel['relation'], rel['subject'], rel['object']
                print(relationSent)

        # print("{:12s}\t{:12s}\t{:6s}\t{}".format("Word", "Lemma", "POS", "NER"))
        tags = getTags()
        # print(document.sentence)
        props = []
        for key, value in enumerate(document.sentence):
            for t in value.token:
                props.append({
                    "word": t.word,
                    "lemma": t.lemma,
                    "pos": t.pos,
                    "pos_full": tags[t.pos],
                    "ner": t.ner
                })
                # print("Word: {}, Lemma: {}, POS: {}, NER: {}".format(t.word, t.lemma, tags[t.pos], t.ner))

        replaceWord = getAnalysis(props)
        if replaceWord["replace"] and replaceWord["type"] == "PERSON":
            text = text.replace(replaceWord["wordToReplace"], "who") + " ?"
            print(text)
        if replaceWord["replace"] and replaceWord["type"] == "LOCATION":
            # text = ''.join("Where is ", relationSent["subject"], " ?")
            print("Where is {} ?".format(replaceWord["wordToReplace"]))
def _parse_segmenttokenize_en(document, usage='experiments'):
    """
    Given a document, in str format, containing one or more sentences, returns a set of
    segmented and tokenized strings, with indexing information.

    This format is the basis for:
    (i) the format for storing information on sentences and tokens in the CoNLL 2015 and
        2016 Shared Task;
    (ii) the .Words attribute in Parse-class objects.

    This function uses the stanford-corenlp package and requires the CoreNLP Java package
    to be downloaded and built (with Ant or Maven) and saved to the 04_utils folder.
    """
    cwd = getcwd()
    if usage in ('production', 'experiments'):
        version = 'stanford-corenlp-full-2018-10-05/'
        corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version
        environ["CORENLP_HOME"] = corenlp_path
        with CoreNLPClient(
                annotators="tokenize ssplit pos".split(),
                memory='1G',
                be_quiet=True,
                max_char_length=100000,
        ) as client:
            annotated = client.annotate(document, output_format='json')
            client.stop()
    elif usage == 'experiments_352':
        version = 'stanford-corenlp-full-2018-10-05/'  # -2015-04-20/'
        corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version
        chdir(corenlp_path)
        args = [
            "*", '-Xmx500m', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize,ssplit,pos',
            '-tokenize.whitespace', '-ssplit.eolonly',
            '-outputFormat', 'json', '-maxLength', '10000'
        ]
        # Necessary to set -maxLength (default is only 200); necessary to specify
        # -tokenize.whitespace, since our sentence is joined from already-tokenized text.
        process = subprocess.Popen(['java', '-cp'] + args,
                                   stdout=subprocess.PIPE,
                                   stdin=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        annotated, error = process.communicate(input=document.encode('utf-8'))
        # Set the current working directory back to the original, else it causes errors downstream.
        chdir(cwd)
        # convert bytes to utf-8
        annotated = annotated.decode('utf-8')
    # extract the parse sections
    return annotated
def execute_stanford_analysis():
    try:
        os.environ["CORENLP_HOME"] = "C:\\Users\\wenga\\OneDrive - Berner Fachhochschule\\CAS PML\\90_Projektarbeit\\devNG\\_data\\stanford-corenlp-full-2018-10-05"
        print("Downloading english dictionary")
        # stanfordnlp.download('en', force=True)
        print('---')
        print('starting up Java Stanford CoreNLP Server...')
        # en_nlp = stanfordnlp.Pipeline(lang='en')  # Processing English text
        # en_doc = en_nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")
        text = "your rich bitch. Yep. I mean it. RIACH BIATCH. F**k you you f*****g idiot piece of shit. I hate you. Die m**********r."

        # set up the client
        # with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'],
        #                    timeout=60000, memory='16G') as client:
        with CoreNLPClient(annotators=['sentiment'], timeout=10000, memory='20G') as client:
            tweets = load_twitter_data(5000000)
            x = 1
            result = []
            for singleTweet in tweets:
                # ann = client.annotate(singleTweet[1], properties={'annotators': 'sentiment', 'outputFormat': 'json'})
                ann = client.annotate(singleTweet[1],
                                      properties={
                                          'annotators': 'sentiment',
                                          'outputFormat': 'json'
                                      })
                neg = 0
                neu = 0
                pos = 0
                for sentence in ann['sentences']:
                    if sentence['sentimentValue'] == "1":
                        neg += 1
                    elif sentence['sentimentValue'] == "2":
                        neu += 1
                    else:
                        pos += 1
                result.append((singleTweet[0],
                               neg / len(ann['sentences']),
                               neu / len(ann['sentences']),
                               pos / len(ann['sentences'])))
                if x % 1000 == 0:
                    print("[{}] Sentimented {}/{} tweets".format(datetime.now(), x, len(tweets)))
                x += 1
    except Error as e:
        print("SQL error: {}".format(e))
    except:
        print("Unexpected error: {}".format(sys.exc_info()[0]))
def __init__(self, core_nlp_version: str = '2018-10-05', annotators=None):
    if annotators is None or not isinstance(annotators, list):
        annotators = ['openie', 'dcoref']
    self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(core_nlp_version)
    self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
    self.install_dir.mkdir(exist_ok=True)
    if not (self.install_dir / Path('stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
        print('Downloading from %s.' % self.remote_url)
        output_filename = wget.download(self.remote_url, out=str(self.install_dir))
        print('\nExtracting to %s.' % self.install_dir)
        zf = ZipFile(output_filename)
        zf.extractall(path=self.install_dir)
        zf.close()
    # Point CORENLP_HOME at the requested version rather than a hard-coded one.
    os.environ['CORENLP_HOME'] = str(self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
    from stanfordnlp.server import CoreNLPClient
    self.client = CoreNLPClient(annotators=annotators, memory='8G')
    self.parser = CoreNLPParser()
def demo_test(replace_like=False):
    # ['tokenize','ssplit','pos','lemma','ner','parse','depparse','coref']
    # text = "A cat in a cup is like a dog in a bucket."
    # text = "Rumor of a big battle spread like a grassfire up the valley."
    # This one doesn't parse correctly:
    # text = "When the sun came out, Stevie strode proudly into Orange Square," \
    #        "smiling like a landlord on industrious tenants. A cat is like a dog."
    text = ''
    text_big = '''and yet like a child among adults .
I don't mean a few aesthetes who play about with sensations , like a young prince in a miniature dabbling his hand in a pool .
Oh , he was being queer and careful , pawing about in the drawer and holding the bottle like a snake at the length of his arm .
`` I went to the city And there I did Weep , Men a-crowing like asses , And living like sheep .
Rumor of a big battle spread like a grassfire up the valley .
When the sun came out , Stevie strode proudly into Orange Square , smiling like a landlord on industrious tenants .
They gave the room a strange note of incongruity , like a mole on a beautiful face .
It always came on , faithfully , just like a radio or juke box , whenever he started to worry too much about something , when the bad things tried to push their way into him .
The design of a mechanical interlocking frame is much like a mechanical puzzle , but once understood , the principles can be applied to any track and signal arrangement .
The sticks fell like a shower around her and she felt them sting her flesh and send tiny points of pain along her thighs .
I saw the pony fall like a stone and the young warrior flew over its head , bouncing like a rubber ball .
'''
    text += '''This dog is analogous to an atom.'''
    with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'],
                       timeout=60000, memory='4G', be_quiet=True) as client:
        print("##########-----About to annotate...-----")
        # If we want to replace "like", that needs to happen here. Let's not do that, however.
        ann = client.annotate(text)
        sen = ann.sentence[0]
        token = sen.token[0]
        print("*(((((")
        print(token.word)
        # sentence is a Sentence. Where and how is this defined?
        for sentence in ann.sentence:
            if replace_like:
                replace_with_like(sentence, signals, "like")
            for token in sentence.token:
                print(token.word, end=' ')
            print()
            constituency_parse = sentence.parseTree
            my_parse = CoreNLPNode(constituency_parse)
            my_parse.create_tree()
            my_parse.thematic_search()
            print("BASE: ", my_parse.roles["base"],
                  "TARGET: ", my_parse.roles["target"],
                  "ACTION: ", my_parse.roles["action"])
def __init__(self):
    with Timer() as self.model_load_time:
        os.environ["CORENLP_HOME"] = CORENLP_HOME
        from stanfordnlp.server import CoreNLPClient
        client = CoreNLPClient(
            annotators=["tokenize", "ssplit"],
            timeout=30000,
            memory="2G",
            properties={
                "tokenize.language": "de",
                "outputFormat": "text"
            },
        )
        self.processor = client.annotate
def syntactic_parse_texts(
    texts: List[str],
    tokenize=False,
    sentence_split=False,
    verbose=False,
    with_constituency_parse=False,
):
    corenlp_annotators = [
        "tokenize",
        "ssplit",
        "pos",
        "lemma",
        "ner",
        "depparse",
    ]
    if with_constituency_parse:
        corenlp_annotators.append("parse")
    annotators_properties = {
        "tokenize.whitespace": not tokenize,
        "ssplit.eolonly": not sentence_split,
        "depparse.model": "edu/stanford/nlp/models/parser/nndep/english_SD.gz",
        "outputFormat": "json",
    }
    if not STANFORD_CORENLP_DIR.exists():
        download_stanford_corenlp()
    os.environ["CORENLP_HOME"] = str(STANFORD_CORENLP_DIR)

    parse_results = []
    with CoreNLPClient(
        annotators=corenlp_annotators,
        properties=annotators_properties,
        threads=40,
    ) as client:
        for text in tqdm(texts, disable=(not verbose)):
            if isinstance(text, List):
                text = " ".join(text)
            raw_parse_result = client.annotate(text)
            parse_result = format_parser_output(raw_parse_result["sentences"])

            if len(parse_result["sentences"]) > 1 and not sentence_split:
                parse_result = join_parse_result(parse_result)
            elif sentence_split:
                parse_result = split_parse_result(parse_result["sentences"])

            parse_results.append(parse_result)

    return parse_results
def noun_adjective_pairer(reviews_per_business):
    pair_list = []
    with CoreNLPClient(
        annotators=["tokenize", "ssplit", "pos", "depparse", "lemma"],
        timeout=120000,
        memory="5G",
    ) as client:
        for review in reviews_per_business:
            ann = client.annotate(review)
            for sentence in ann.sentence:
                dependency_parse = sentence.basicDependencies
                tokens = sentence.token
                predicted_heads_and_dependencies = {}
                predicted_pos = []
                predicted_lemm = []
                for i in range(len(tokens)):
                    predicted_pos.append(tokens[i].pos)
                    predicted_lemm.append(tokens[i].lemma)
                for i in range(len(dependency_parse.edge)):
                    source = dependency_parse.edge[i].source
                    target = dependency_parse.edge[i].target
                    dep = dependency_parse.edge[i].dep
                    head_pos = predicted_pos[source - 1]
                    if target - 1 in predicted_heads_and_dependencies:
                        predicted_heads_and_dependencies[target - 1].append((source - 1, dep, head_pos))
                    else:
                        predicted_heads_and_dependencies[target - 1] = [(source - 1, dep, head_pos)]
                noun_pairs = get_noun_pairs_index(predicted_heads_and_dependencies)
                adjective_pairs = get_adjective_pairs_index(predicted_heads_and_dependencies)
                noun_adjective_pairs = get_noun_adjective_pairs(
                    predicted_heads_and_dependencies,
                    predicted_pos,
                    predicted_lemm,
                    noun_pairs,
                    adjective_pairs,
                )
                pair_list.extend(noun_adjective_pairs)
    return pair_list
def pipeline1(text, r, t):
    extractedRelations = []
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=450000,
            memory='4G',
            endpoint="http://localhost:9000",
            threads=7) as pipeline1:
        print("\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ...")
        ann = pipeline1.annotate(text)
        sentenceNumber = len(ann.sentence)
        namedEntity = patterns[toRelation[r]]
        print("\tExtracted %d sentences. Processing each sentence one by one to check for presence "
              "of right pair of named entity types; if so, will run the second pipeline ..."
              % sentenceNumber)
        # If a sentence contains the two targeted named entity types, add it to the list
        # of sentences from which kbp annotations will be extracted.
        processedSentence = []
        for i, sentence in enumerate(ann.sentence):
            # Check whether both named entity types required by the query appear in the sentence.
            firstEntity = False
            secondEntity = False
            for token in sentence.token:
                if toRelation[r] == relation[2]:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner in namedEntity[1]:
                        secondEntity = True
                else:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner == namedEntity[1]:
                        secondEntity = True
            # If both targeted named entities appear, add the sentence to the list.
            if firstEntity and secondEntity:
                processedSentence.append([i, to_text(sentence)])
        # Extract the relations from the selected sentences with pipeline2.
        extractedRelations += pipeline2(processedSentence, t)
        print("Extracted kbp annotations for %d out of total %d sentences"
              % (len(processedSentence), sentenceNumber))
    return extractedRelations