def __init__(self, timeout=30000, memory='6G'):
    """
    Create the detokenizer and the CoreNLP client used by this object.

    Parameters
    ----------
    timeout : int
        Forwarded to ``CoreNLPClient`` as ``timeout``. Defaults to 30000.
    memory : str
        Forwarded to ``CoreNLPClient`` as ``memory``. Defaults to '6G'.
    """
    self.detok = TreebankWordDetokenizer()

    # Annotators requested from CoreNLP for every document.
    coref_annotators = ['tokenize', 'ssplit', 'dcoref']
    self.client = CoreNLPClient(
        annotators=coref_annotators,
        output_format='json',
        timeout=timeout,
        memory=memory,
    )
def __init__(self, annotators=None, properties=None):
    """
    Load the bundled regex definitions and build the CoreNLP client.

    ``annotators`` and ``properties`` fall back to the module-level
    ``CORENLP_ANNOTATORS`` / ``CORENLP_PROPERTIES`` when they are falsy.
    """
    # The regex definitions live next to this module, under assets/.
    regex_path = os.path.join(
        os.path.dirname(__file__), "assets", "regexes.json")
    with open(regex_path) as regex_file:
        self.regexer = RegexFeaturizer(json.load(regex_file))

    if annotators:
        self.annotators = annotators
    else:
        self.annotators = CORENLP_ANNOTATORS

    if properties:
        self.properties = properties
    else:
        self.properties = CORENLP_PROPERTIES

    self.client = CoreNLPClient(
        self.annotators,
        properties=self.properties,
        endpoint="http://localhost:9012",
    )
class StanfordParser(object):
    """Small convenience wrapper around a ``CoreNLPClient`` instance."""

    def __init__(self, nlp=None, annots=None, props=None):
        """
        Store (or create) the client and send one request through it.

        nlp    -- an existing client to reuse; a new ``CoreNLPClient`` with
                  JSON output is created when this is None.
        annots -- annotators for a newly created client; defaults to
                  "tokenize pos lemma depparse".
        props  -- optional mapping merged into the client's
                  ``default_properties``.
        """
        if annots is None:
            annots = "tokenize pos lemma depparse"

        if nlp is not None:
            self.nlp_client = nlp
        else:
            self.nlp_client = CoreNLPClient(annotators=annots,
                                            output_format='json')

        if props is not None:
            self.nlp_client.default_properties.update(props)

        # One throw-away request; the result is discarded
        # (presumably a warm-up of the server -- confirm with the author).
        self.nlp_client.annotate("Let's get this party started!")

    def get_parse(self, sentence):
        """Return whatever the client's ``annotate`` returns for `sentence`."""
        return self.nlp_client.annotate(sentence)

    def get_deps(self, sentence, deptype='basicDependencies', ret='asis'):
        """
        Return the dependency edges of a sentence.

        sentence -- either raw text (parsed first; only the first sentence
                    of the result is used) or an already parsed sentence
                    mapping.
        deptype  -- key of the dependency list inside the sentence mapping.
        ret      -- 'asis' returns the edge list untouched; any other value
                    returns a dict with the keys 'deps', 'heads',
                    'governors', 'dependents' and 'text'.
        """
        if isinstance(sentence, str):
            sentence = self.get_parse(sentence)['sentences'][0]
        edges = sentence[deptype]

        if ret == 'asis':
            return edges

        relation_to_index = {}
        word_to_head_word = {}
        index_to_head_word = {}
        index_to_word = {}
        rendered = []
        for edge in edges:
            relation_to_index[edge['dep']] = edge['dependent']
            word_to_head_word[edge['dependentGloss']] = edge['governorGloss']
            index_to_head_word[edge['dependent']] = edge['governorGloss']
            index_to_word[edge['dependent']] = edge['dependentGloss']
            rendered.append("{}({}-{}, {}-{})".format(
                edge['dep'], edge['governorGloss'], edge['governor'],
                edge['dependentGloss'], edge['dependent']))

        summary = {}
        summary['deps'] = relation_to_index
        summary['heads'] = word_to_head_word
        summary['governors'] = index_to_head_word
        summary['dependents'] = index_to_word
        summary['text'] = rendered
        return summary
def __init__(self, nlp=None, annots=None, props=None):
    """
    Store (or create) the CoreNLP client and send one request through it.

    nlp    -- an existing client to reuse; when None a new ``CoreNLPClient``
              with JSON output is created.
    annots -- annotators for a newly created client; defaults to
              "tokenize pos lemma depparse".
    props  -- optional mapping merged into the client's
              ``default_properties``.
    """
    if annots is None:
        annots = "tokenize pos lemma depparse"

    client = nlp
    if client is None:
        client = CoreNLPClient(annotators=annots, output_format='json')
    self.nlp_client = client

    if props is not None:
        self.nlp_client.default_properties.update(props)

    # Fire a single request and drop the answer
    # (presumably to warm the server up -- confirm with the author).
    self.nlp_client.annotate("Let's get this party started!")
class StanfordService:
    """Tokenizes text through a ``CoreNLPClient`` (tokenize + ssplit)."""

    def __init__(self, parser_path: str):
        """Export `parser_path` as ``JAVANLP_HOME`` and build the client."""
        os.environ['JAVANLP_HOME'] = parser_path
        print(
            'starting CoreNLP server with JAVANLP_HOME {}'.format(parser_path))
        self.nlp = CoreNLPClient(annotators="tokenize ssplit".split(),
                                 timeout=1000000)

    def tokenize(self, text: str) -> List[Token]:
        """
        Annotate `text` and return its tokens as a flat list.

        The annotation is attempted up to 10 times, sleeping 10 seconds
        after each failure. If every attempt fails the method returns
        ``None`` (best-effort behaviour kept for existing callers).
        """
        for _ in range(10):
            try:
                annotated_result = self.nlp.annotate(text)
                stanford_document = StanfordDocument.from_proto(
                    annotated_result)
                return StanfordService.idiomatic_tokens(stanford_document)
            except Exception:
                # `Exception` rather than a bare `except:` so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop the retries.
                print('exception while annotating result')
                sleep(10)
        # All attempts failed.
        return None

    @staticmethod
    def idiomatic_tokens(doc: StanfordDocument):
        """Flatten the sentences of `doc` into one list of ``Token``s.

        Token indices run over the whole document, not per sentence.
        """
        stanford_tokens = [
            token for sentence in doc.sentences for token in sentence.tokens
        ]
        return [
            StanfordService.idiomatic_token(token, index)
            for index, token in enumerate(stanford_tokens)
        ]

    @staticmethod
    def idiomatic_token(token: StanfordToken, token_index: int) -> Token:
        """Build a ``Token`` from the original text, index and begin offset."""
        return Token(token.originalText, token_index,
                     token.characterOffsetBegin)
def make_summaries(args):
    """
    Run the model over every article and write hypothesis summaries.

    For article ``i`` the function reads ``<data_dir>/articles/NNNNNN_article.txt``,
    writes the selected sentences to ``<hyp_dir>/NNNNNN_hypothesis.txt`` and
    rewrites ``<ref_dir>/NNNNNN_reference.txt`` in the same tokenized format.
    Afterwards it calls ``compute_rouge(args)`` and logs the share of selected
    sentence positions that fall in the first three positions.

    Reads ``args.model_file``, ``args.data_dir``, ``args.hyp_dir`` and
    ``args.ref_dir``. Requires CUDA (``.cuda()`` is called on the model and
    on every input).
    """
    # Tokenizer
    corenlp_tokenizer = CoreNLPClient(annotators=['ssplit', 'tokenize'],
                                      stdout=sys.stderr,
                                      timeout=10000,
                                      max_char_length=1020000)
    corenlp_tokenizer.start()

    # For counting positions
    pos_counts = defaultdict(int)

    # Everything after start() sits inside try/finally so the tokenizer is
    # stopped even when loading the model fails.
    try:
        # Load model
        logging.info("Loading model {}".format(args.model_file))
        rewards = {
            "train": None,
            "train_single": None,
            "dev": None,
            "dev_single": None
        }
        model = SimpleRNN(args, rewards)
        model.cuda()
        checkpoint = torch.load(args.model_file)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()

        # Folder details
        article_dir = os.path.join(args.data_dir, 'articles')

        logging.info("Starting evaluation.")
        with torch.no_grad():
            for i in tqdm(range(len(os.listdir(article_dir)))):
                prefix = str(i).rjust(6, '0')
                article_name = prefix + "_article.txt"
                ref_name = prefix + "_reference.txt"
                hyp_file = prefix + '_hypothesis.txt'
                ref_path = os.path.join(args.ref_dir, ref_name)

                with open(os.path.join(article_dir, article_name),
                          'r') as art_file:
                    article_text = art_file.read()
                doc_sents_words = process_text(corenlp_tokenizer,
                                               article_text)
                doc_ids = convert_tokens_to_ids(doc_sents_words, args)

                # Write model hypothesis to file
                summary_idx = model(doc_ids.cuda())
                with open(os.path.join(args.hyp_dir, hyp_file), 'w') as f:
                    hyp_sents_words = [
                        doc_sents_words[j] for j in summary_idx
                    ]
                    hyp_sents = [
                        " ".join(hyp_sent) for hyp_sent in hyp_sents_words
                    ]
                    f.write(".\n".join(hyp_sents))
                    f.write(" .")

                # Ensure reference file is similarly formatted for a fair
                # comparison
                with open(ref_path, 'r') as ref_file:
                    ref_text = ref_file.read()
                ref_sents_words = process_text(corenlp_tokenizer, ref_text)
                with open(ref_path, 'w') as f:
                    ref_sents = [
                        " ".join(ref_sent) for ref_sent in ref_sents_words
                    ]
                    f.write(".\n".join(ref_sents))
                    f.write(" .")

                # Count index selected. int() is a no-op for plain ints; if
                # the model yields index objects that hash by identity
                # (e.g. 0-d tensors -- TODO confirm), it makes the integer
                # lookups below find them.
                for pos in summary_idx:
                    pos_counts[int(pos)] += 1
    finally:
        corenlp_tokenizer.stop()

    # Compute evaluation metrics
    compute_rouge(args)

    # Position counts
    total_count = sum(pos_counts.values())
    if total_count == 0:
        # No article processed / no sentence selected: the ratio is undefined.
        logging.info("Overlap with Lead: n/a (no sentences were selected)")
        return
    lead_count = pos_counts[0] + pos_counts[1] + pos_counts[2]
    logging.info("Overlap with Lead: {}".format(lead_count / total_count))
class StanfordCoreferenceResolution:
    """
    Stanford CoreNLP co-reference.

    Parameters
    ----------
    timeout : int
        The timeout for the parser
        Defaults to 30000
    memory : str
        The memory allocation.
        Defaults to '6G'
    """

    def __init__(self, timeout=30000, memory='6G'):
        self.detok = TreebankWordDetokenizer()
        self.client = CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'dcoref'],
            output_format='json',
            timeout=timeout,
            memory=memory)

    def resolve(self, doc, raise_errors=True):
        """
        Resolve the co-references for a single document.

        Parameters
        ----------
        doc : str
            A document whose co-references will be resolved.
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolve_doc : str or None
            A document whose co-references have been resolved.
            If there was a problem and `raise_errors=False`, then
            `None` will be returned.
        """
        try:
            parsed = self.client.annotate(doc)
        except Exception:
            if raise_errors:
                raise
            return None
        return self.replace_coreferences(parsed)

    def resolve_all(self, docs, raise_errors=True):
        """
        Resolve co-references for all the documents.

        Parameters
        ----------
        docs : list of str
            A list of documents
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolved_docs : list of str
            A list of documents, with co-references resolved. An entry is
            `None` when a document failed and `raise_errors=False`.
        """
        return [self.resolve(doc, raise_errors) for doc in tqdm(docs)]

    @staticmethod
    def restructure_coreference_dict(corefs_dict):
        """
        Given a dictionary of co-references, restructure it into
        a new dictionary where the keys are sentence numbers and
        the values are lists of references that need to be resolved.

        Parameters
        ----------
        corefs_dict : dict
            A co-reference dictionary, output from Stanford.

        Returns
        -------
        by_sentence : defaultdict of list
            Maps `sentNum` to a list of dicts with the keys 'represent',
            'text', 'startIndex', 'endIndex' and 'sentNum', one per
            non-representative mention.
        """
        by_sentence = defaultdict(list)
        for chain in corefs_dict.values():
            # chains with a single mention have nothing to resolve
            if len(chain) <= 1:
                continue

            # get the first representative mention from the list;
            # if there are no representative mentions, continue
            represent = next(
                (co['text'] for co in chain if co['isRepresentativeMention']),
                None)
            if represent is None:
                continue

            # loop through the (non-representative) mentions,
            # add to the dictionary list for that sentence
            for co in chain:
                if co['isRepresentativeMention']:
                    continue
                by_sentence[co['sentNum']].append({
                    'represent': represent,
                    'text': co['text'],
                    'startIndex': co['startIndex'],
                    'endIndex': co['endIndex'],
                    'sentNum': co['sentNum']
                })
        return by_sentence

    def replace_coreferences(self, parsed):
        """
        We want to replace all the references with their
        representative mention.

        Mentions that are nested in, or overlap with, a mention that has
        already been replaced in the same sentence are skipped; when two
        mentions start at the same token the longer one wins.

        Parameters
        ----------
        parsed : dict
            The full output from Stanford, with co-references and sentences.

        Returns
        -------
        resolved : str
            The detokenized sentences joined by single spaces.
        """
        corefs_dict = self.restructure_coreference_dict(parsed['corefs'])
        sents = [[tok['word'] for tok in sent['tokens']]
                 for sent in parsed['sentences']]
        sents_new = []

        # we do this on a sentence-by-sentence basis;
        # sentence numbers from Stanford are 1-based
        for sent_i, sent in enumerate(sents, start=1):
            # sentences without references are kept untouched
            if sent_i not in corefs_dict:
                sents_new.append(sent)
                continue

            sent_new = []
            last_end = 0
            # left-to-right; for equal starts the longest mention first
            ordered = sorted(
                corefs_dict[sent_i],
                key=lambda co: (co['startIndex'], -co['endIndex']))
            for co in ordered:
                # token indices from Stanford are 1-based
                start = co['startIndex'] - 1
                end = co['endIndex'] - 1

                # a mention starting inside an already replaced span would
                # move `last_end` backwards and emit tokens twice
                if start < last_end:
                    continue

                # text between the previous replacement and this mention
                sent_new.extend(sent[last_end:start])

                represent = co['represent']
                if start == 0:
                    # sentence-initial replacement: capitalize first letter
                    # (slicing keeps this safe for an empty string)
                    represent = represent[:1].upper() + represent[1:]
                sent_new.append(represent)
                last_end = end

            # whatever follows the last replaced mention
            sent_new.extend(sent[last_end:])
            sents_new.append(sent_new)

        # we need to detokenize the sentence; basically this handles
        # putting punctuation and weird symbols for parentheses back together
        return ' '.join(
            self.detok.detokenize(sent, convert_parentheses=True)
            for sent in sents_new)
import re import string import pickle # import xmltodict # from collections import Counter from corenlp import CoreNLPClient from datetime import datetime from nltk import pos_tag, word_tokenize, sent_tokenize from nltk.stem import WordNetLemmatizer from src.rdf_graph.rdf_parse import StanfordParser, stanford_parse from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import fcluster nlp = CoreNLPClient() parser = StanfordParser(nlp) class RDFGraph: def __init__(self, top_tfidf=20000, top_n_rel=None, top_n_ent=None, clust_dist_thres=0.2, coref_opt=False, openke_output_folder=os.curdir): '''Inputs: a) top_tfidf = number of top TF-IDF triples to use. To extract novel knowledge statements, we sort tuples by their mean TF-IDF scores and only extract the top TF-IDF tuples. This parameter controls how many
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    """
    Tokenize one JSON split and hand the result to ``save``.

    The input is read from `in_path`, or from
    ``<args.source_dir>/<data_type>-<args.suffix>v1.1.json``, and is expected
    to have the layout ``data -> paragraphs -> (context, qas -> (question,
    id, answers -> (text, answer_start)))``. Only the articles between
    `start_ratio` and `stop_ratio` (fractions of the article list) are used.

    `args.tokenizer` selects the tokenizer: "PTB" uses NLTK, 'Stanford' uses
    a ``CoreNLPClient``; any other value raises ``Exception``.

    Returns None; the collected ``data`` and ``shared`` dicts are passed to
    ``save(args, data, shared, out_name)``.
    """
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]

        def tokenize_context(text):
            # one token list per sentence
            return [word_tokenize(sent) for sent in nltk.sent_tokenize(text)]

        def tokenize_question(text):
            return word_tokenize(text)

    elif args.tokenizer == 'Stanford':
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())

        def annotate_with_retry(text):
            # Retry until the server answers, pausing briefly in between.
            # NOTE(review): unbounded, as in the original -- a permanent
            # server failure loops forever; consider a retry limit.
            while True:
                try:
                    return interface.annotate(text)
                except Exception:
                    time.sleep(0.2)

        def split_nbsp(tokens):
            # Manual workaround for "\xa0", which CoreNLP does not split on:
            # break every token that contains it into its pieces.
            return [
                piece for token in tokens for piece in token.split("\xa0")
            ]

        def tokenize_context(text):
            annotation = annotate_with_retry(text)
            return [
                split_nbsp([word.originalText for word in sent.token])
                for sent in annotation.sentence
            ]

        def tokenize_question(text):
            # Only the first sentence of the question is used.
            # NOTE(review): multi-sentence questions lose their tail --
            # confirm this is intended.
            # Indexing happens outside the retry loop so that an annotation
            # without sentences raises instead of retrying forever.
            first = annotate_with_retry(text).sentence[0]
            return [t_s.originalText for t_s in first.token]

    else:
        raise Exception("unsupported tokenizer: {!r}".format(args.tokenizer))

    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    with open(source_path, 'r') as source_file:
        source_data = json.load(source_file)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = tokenize_context(context)
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            # every context token is counted once per question on the paragraph
            num_questions = len(para['qas'])
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += num_questions
                    lower_word_counter[xijk.lower()] += num_questions
                    for xijkl in xijk:
                        char_counter[xijkl] += num_questions

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = tokenize_question(qa['question'])
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    # character offsets of the answer inside its first and
                    # last word
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    # questions without answers get a placeholder span
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            # NOTE(review): nesting level reconstructed from collapsed source;
            # in debug mode only the first paragraph of each article is kept.
            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
class Featurizer():
    """Attaches CoreNLP-derived and regex-derived features to token objects.

    Usable as a context manager; entering and leaving are delegated to the
    underlying ``CoreNLPClient``.
    """

    def __init__(self, annotators=None, properties=None):
        """Load assets/regexes.json and build the client.

        Falsy `annotators` / `properties` fall back to
        ``CORENLP_ANNOTATORS`` / ``CORENLP_PROPERTIES``.
        """
        regex_path = os.path.join(
            os.path.dirname(__file__), "assets", "regexes.json")
        with open(regex_path) as regex_file:
            self.regexer = RegexFeaturizer(json.load(regex_file))

        self.annotators = annotators if annotators else CORENLP_ANNOTATORS
        self.properties = properties if properties else CORENLP_PROPERTIES
        self.client = CoreNLPClient(
            self.annotators,
            properties=self.properties,
            endpoint="http://localhost:9012",
        )

    def __enter__(self):
        self.client.__enter__()
        return self

    def __exit__(self, *args):
        self.client.__exit__(*args)

    def _apply_features(self, obj, ann=None):
        """
        Adds features to a graph.

        `obj` must provide "tokens"; a "features" dict is created when
        missing and filled in place. With a truthy `ann` (exactly one
        sentence, same token count as `obj`) lemma/pos/ner/dependency
        features are added; regex features are added whenever
        ``self.regexer`` is truthy.
        """
        if "features" not in obj:
            obj["features"] = {}

        if ann:
            assert len(ann.sentence) == 1
            sentence = ann.sentence[0]
            assert len(sentence.token) == len(obj["tokens"])

            token_features = obj["features"]
            token_features["lemma"] = [tok.lemma for tok in sentence.token]
            token_features["pos"] = [tok.pos for tok in sentence.token]
            token_features["ner"] = [tok.ner for tok in sentence.token]
            token_features["depparse"] = _dep_to_list(
                sentence.enhancedPlusPlusDependencies)

            # every token must occur exactly once in the middle slot of the
            # dependency triples
            distinct_tails = set()
            for _first, tail, _third in obj["features"]["depparse"]:
                distinct_tails.add(tail)
            assert len(distinct_tails) == len(sentence.token)

            (to_head, to_child, lengths, traceback, dist_next,
             dist_prev) = compute_dependency_paths(obj)
            obj["features"]["dep_child_to_head"] = to_head
            obj["features"]["dep_head_to_child"] = to_child
            obj["features"]["dep_path_lengths"] = lengths
            obj["features"]["dep_traceback"] = traceback
            obj["features"]["dep_dist_to_next"] = dist_next
            obj["features"]["dep_dist_from_prev"] = dist_prev

        if self.regexer:
            obj["features"]["regexes"] = self.regexer.featurize(
                obj["tokens"])
            obj["features"]["typed_values"] = (
                self.regexer.featurize_unit_spans(obj["tokens"]))

    def featurize_graph(self, obj):
        """Annotate the space-joined tokens of `obj` and add features to it."""
        joined = " ".join(obj["tokens"])
        annotation = self.client.annotate(joined)
        self._apply_features(obj, annotation)
        return obj

    def featurize_text(self, text):
        """Annotate `text` (must be one sentence) and return a featurized obj."""
        annotation = self.client.annotate(text)
        assert len(annotation.sentence) == 1
        only_sentence = annotation.sentence[0]
        words = [tok.word for tok in only_sentence.token]
        obj = {"tokens": words}
        self._apply_features(obj, annotation)
        return obj
from corenlp import CoreNLPClient from logger import logger parser_client = CoreNLPClient( annotators="tokenize ssplit pos lemma depparse".split()) # natlog # past, 3psgpresent, past part, present, base, gerund/present participle verbposes = ["VBD", "VBZ", "VBN", "VBP", "VB", "VBG"] tag = "JiK" def semanticdependencyparse(string, loglevel=False): depgraph = parser_client.annotate(string) utterances = [] for ss in depgraph.sentence: utterances.append(processdependencies(ss, loglevel)) return utterances def processdependencies(ss, loglevel=False): string = [] deps = [] negation = False adverbial = [] mainverb = False verbchain = [] tense = False subject = False mode = False
def __init__(self, parser_path: str):
    """Export `parser_path` as ``JAVANLP_HOME`` and build the CoreNLP client.

    The client is created with the ``tokenize`` and ``ssplit`` annotators
    and a timeout of 1000000.
    """
    os.environ['JAVANLP_HOME'] = parser_path

    startup_message = 'starting CoreNLP server with JAVANLP_HOME {}'.format(
        parser_path)
    print(startup_message)

    tokenizer_annotators = "tokenize ssplit".split()
    self.nlp = CoreNLPClient(annotators=tokenizer_annotators,
                             timeout=1000000)
tokens = re.split(r'[^0-9a-zA-Z-]+', lister) if len(tokens) >= 2 and len(tokens) < 8: for i, z in list(enumerate(tokens))[:-1]: if z.lower().split('-')[0] in firstnames: lister = "{} {}".format(tokens[i], tokens[i + 1]) break listedby.append(lister) oklistedby = set() for pair in Counter(listedby).iteritems(): if pair[1] == 1: if not re.search(re_suspect, pair[0], re.IGNORECASE): oklistedby.add(pair[0]) with CoreNLPClient(start_cmd="gradle -p {} server".format("../CoreNLP"), endpoint=args.corenlp_uri, timeout=15000) as client: response = s3_client.get_object(Bucket=bucket, Key="{}.pkl".format(args.vernum)) with open(join(args.odir, 'svc.pkl'), 'w') as fp: fp.write(response['Body'].read()) svc = joblib.load(join(args.odir, 'svc.pkl'), mmap_mode='r') # tried to do a hasattr thing ... # for tup in svc.named_steps['featureunion'].transformer_list: # pipeline = tup[1] # for obj in pipeline.named_steps.values(): # if type(obj) == pipeline = next(x[1] for x in svc.named_steps['featureunion'].transformer_list \ if x[0] == 'text') pipeline.named_steps['vectorizer'].analyzer._client = client pipeline = next(x[1] for x in svc.named_steps['featureunion'].transformer_list \