# third-party dependencies used in this section; project-specific helpers
# (gwords, glemmas, glemmas0, w2l, merge_dict, make_noun_set, isNoun, isVerb,
# isSubj, isObj, isSent, isWord, maybeWord, isStopWord, isCleanSent,
# adjust_rank, to_svo, wn_holo, wn_mero, wn_hyper, wn_hypo, showGraph,
# conll_create_universal_tagging, ShrinkConllU, read_conll) and the flags
# noun_defs, noun_self, all_recs are defined elsewhere in the project
import os
import subprocess
import requests
from pathlib import Path
import networkx as nx
from graphviz import Digraph
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser


class GraphMaker:
    def __init__(self, parserURL='http://localhost:9000'):
        self.dparser = CoreNLPDependencyParser(url=parserURL)
        self.clear()  # clear saved state

    def clear(self):
        self.maxcc = None
        self.gs = None
        self.nxgraph = None
        self.ranked = None
        #self.words=mdict() # not used ...
        self.words2lemmas = set()
        self.noun_set = dict()
        self.svo_edges_in_graph = []

    # digest a file
    def load(self, fname):
        self.clear()
        with open(fname, 'r') as f:
            text = f.read()
        self.digest(text)

    def parse(self, text):
        ts = self.dparser.parse_text(text)
        return list(ts)

    # digest a string using the dependency parser
    def digest(self, text):
        self.clear()
        chop = 2 ** 16
        gens = []
        # deals with files that are too large to be parsed at once
        while len(text) > chop:
            head = text[:chop]
            text = text[chop:]
            #print((head))
            if head:
                hs = list(self.parse(head))
                #print('PARSED')
                gens.append(hs)
        if gens:
            # also parse the leftover tail so it is not dropped, then flatten
            if text:
                gens.append(list(self.parse(text)))
            self.gs = [x for xs in gens for x in xs]
        else:
            self.gs = self.parse(text)
        #print('!!!',self.gs)

    # sentence as sequence of words generator
    def sentence(self):
        for g in self.gs:
            yield str.join(' ', list(gwords(g)))

    def wsentence(self):
        for g in self.gs:
            yield tuple(gwords(g))

    def nth_sent_words(self, n):
        ws = tuple(gwords(self.gs[n]))
        return ws

    # sentence as sequence of lemmas generator
    def lsentence(self):
        for g in self.gs:
            yield tuple(glemmas(g))

    # curates, reverses and adds some new edges;
    # yields an <edge, sentence in which it occurs> pair
    def edgesInSent(self):
        self.svo_edges_in_graph = []

        def noun_to_def(x, tx, k):
            if noun_defs:
                k_ = self.noun_set.get(x)
                if k == k_:
                    yield (x, tx, 'first_in', k, 'SENT')

        def edgeOf(k, g):
            d = w2l(g)
            merge_dict(self.words2lemmas, d)
            make_noun_set(g, self.noun_set, k)
            svo_edges_in_sent = []
            for ts in g.triples():
                #print('TS',ts)
                fr, rel, to = list(ts)
                lfrom, ftag = d[fr[0]]
                lto, ttag = d[to[0]]
                # vn is True if this is an s->v or o->v link
                so = isSubj(rel) or isObj(rel)
                vn = isVerb(ftag) and isNoun(ttag) and so
                if rel == 'punct' and ttag == '.':
                    # sentence points to predicate verb
                    yield (k, 'SENT', 'predicate', lfrom, ftag)
                elif vn:
                    # collects vs and vo links to merge them later into svo
                    svo_edges_in_sent.append((lfrom, ftag, rel, lto, ttag))
                    yield lfrom, ftag, rel, lto, ttag  # verb to noun
                    yield k, 'SENT', 'about', lto, ttag  # sent to noun
                    # all words recommend sentence
                    #yield lfrom,ftag,'recommends',k,'SENT' # verb to sent - in elif !
                    for e in noun_to_def(lto, ttag, k):
                        yield e  # noun to sent
                    if noun_self:
                        yield lto, ttag, 'self', lto, ttag
                elif isNoun(ttag):  # e.g. nmod relation
                    #print('x-->n',k,lfrom,ftag,rel,lto,ttag)
                    yield lfrom, ftag, rel, lto, ttag
                    for e in noun_to_def(lto, ttag, k):
                        yield e  # noun to sent
                    if noun_self:
                        yield lto, ttag, 'self', lto, ttag
                    #yield lfrom, ftag, 'recommends', k, 'SENT' # dependent of noun to sent
                else:
                    # yield link reversed, as is
                    yield lto, ttag, rel, lfrom, ftag
                # all words recommend sentence
                if all_recs:
                    yield lto, ttag, 'recommends', k, 'SENT'
                # merge compound terms, make their parts recommend them
                if isNoun(ftag) and isNoun(ttag) and rel == 'compound':
                    comp = lto + ' ' + lfrom
                    yield lfrom, ftag, 'fused', comp, ftag
                    yield lto, ttag, 'fused', comp, ttag
                    for e in noun_to_def(comp, ttag, k):
                        yield e
                    if noun_self:
                        yield comp, ttag, 'self', comp, ttag
            # collect svo relations
            self.svo_edges_in_graph.append(to_svo(k, svo_edges_in_sent))

        k = 0
        for g in self.gs:
            for e in edgeOf(k, g):
                # collects words at the two ends of e
                self.addWordsIn(e)
                yield e, k
            k += 1
    # yields each edge, possibly once per sentence where it is found
    def multi_edges(self):
        for e, k in self.edgesInSent():
            yield e

    def edges(self):
        for e in set(self.multi_edges()):
            yield e

    # collects unique words at the ends of an edge
    # NOTE: edgesInSent calls this without iterating the returned generator,
    # so the body below never actually runs (self.words is disabled in clear())
    def addWordsIn(self, e):
        f, tf, r, t, tt = e
        if maybeWord(f) and tf != 'SENT':
            self.words.add(f, tf)
        if maybeWord(t) and tt != 'SENT':
            self.words.add(t, tt)
        yield e

    # returns final networkx text graph
    def graph(self):
        if self.nxgraph:
            return self.nxgraph
        dg = nx.DiGraph()
        for e in self.edges():
            f, tf, r, t, tt = e
            dg.add_edge(f, t, rel=r)
        self.nxgraph = dg
        #print('DG:',dg,'END')
        #print('NOUN_SET',self.noun_set)
        return dg

    # ranks the text graph (unless already ranked and stored as such)
    def pagerank(self):
        if self.ranked:
            return self.ranked
        g = self.graph()
        pr = self.runPagerank(g)
        self.ranked = pr
        if not all_recs:
            return pr
        ccs = list(nx.strongly_connected_components(g))
        lc = len(ccs)
        #print('LENCOM', lc)
        if lc < 4:
            self.maxcc = max(ccs, key=len)
        return pr

    # extracts the best k nodes passing the filtering test
    def bestNodes(self, k, filter):
        g = self.graph()
        comps = list(nx.strongly_connected_components(g))
        pr = self.pagerank()
        i = 0
        ns = []  # not a set - that loses order !!!
        for x, r in pr:
            if i >= k:
                break
            #print('RANKED',x,r)
            if filter(x):
                #print('FILTERED',x,r,'MC')
                if not self.maxcc or x in self.maxcc:
                    if not x in ns:
                        ns.append(x)
                        i += 1
        return ns

    # specialization returning all best k nodes
    def bestAny(self, k):
        return self.bestNodes(k, lambda x: True)

    # specialization returning best k sentence nodes
    def bestSentencesByRank(self, k):
        best = self.bestNodes(100 + k, isSent)
        if not best:
            return
        #print('BEST SENTS:',best)
        c = 0
        for i in best:
            g = self.gs[i]
            lems = [w for w in glemmas0(g)]
            #print('LEMS',lems)
            if isCleanSent(lems):
                sent = list(gwords(g))
                #sent=str.join(' ',list(gwords(g)))
                yield (i, sent)
                c += 1
            #else : print('SENT UNCLEAN',lems)
            if c >= k:
                break

    def bestSentences(self, k):
        for i_s in sorted(self.bestSentencesByRank(k)):
            yield i_s

    # specialization returning best k word nodes
    def bestWords(self, k):
        #print('NOUNS',self.noun_set)
        c = 0
        best = self.bestNodes(100 + k, maybeWord)
        #print('BEST WORDS:',best)
        for w in best:
            if c >= k:
                break
            if not isStopWord(w) and self.hasNoun(w):
                yield w
                #print('BWORD',w)
                c += 1

    # true if a phrase has a noun in it
    def hasNoun(self, w):
        ws = w.split(' ')
        for v in ws:
            if v in self.noun_set:
                return True
        return False

    # runs PageRank on the text graph
    def runPagerank(self, g):
        d = nx.pagerank(g)
        #print("PR",d)
        # normalize sentence ranks by favoring those close to average length
        sents = list(self.wsentence())
        lens = list(map(len, sents))
        #print('LENS:', lens)
        avg = sum(lens) / len(lens)
        #print('AVG SENT LENGTH:', avg)
        # reranks long sentences
        i = 0
        for ws in sents:
            #print('WS:',ws)
            if i in d:
                l = len(ws)
                r = d[i]
                newr = adjust_rank(r, l, avg)
                d[i] = newr
                #if l<6 : print(r,'--->',newr,l,'ws=',ws)
            i += 1
        sd = sorted(d, key=d.get, reverse=True)
        return [(k, d[k]) for k in sd]

    # extracts the k highest ranked SVO triplets
    def bestSVOs(self, k):
        rank_list = self.pagerank()
        rank_dict = dict()
        for (w, rw) in rank_list:
            rank_dict[w] = rw
        #print('PRANK',rank_list)
        ranked = []  # should not be a set !
        for rs in self.svo_edges_in_graph:
            for r in rs:
                #print('SVO',r)
                (f, _), (rel, _), (t, _), sent_id = r
                srank = rank_dict[f]
                orank = rank_dict[t]
                if srank and orank:
                    sorank = (2 * srank + orank) / 3
                    ranked.append((sorank, (f, rel, t, sent_id)))
        ranked = sorted(ranked, reverse=True)
        i = 0
        exts = set()
        seen = set()
        for (_, e) in ranked:
            i += 1
            if i > k:
                break
            #print('SVO_EDGE',e)
            if e in seen:
                continue
            seen.add(e)
            yield e
            for xe in self.extend_with_wn_links(e, rank_dict):
                f, _, t, _ = xe
                if wn.morphy(f.lower()) != wn.morphy(t.lower()):
                    exts.add(xe)
        i = 0
        for xe in exts:
            i += 1
            if i > k:
                break
            #print('XE',xe)
            yield xe

    # adds wordnet-derived links for nodes already in dictionary d;
    # we tag them with is_a or part_of
    def extend_with_wn_links(self, e, d):
        s, v, o, sent_id = e
        m = 1  # how many of each are taken
        for x in wn_holo(m, s, 'n'):
            if x in d:
                yield (s, 'part_of', x, sent_id)
        for x in wn_mero(m, s, 'n'):
            if x in d:
                yield (x, 'part_of', s, sent_id)
        for x in wn_hyper(m, s, 'n'):
            if x in d:
                yield (s, 'is_a', x, sent_id)
        for x in wn_hypo(m, s, 'n'):
            if x in d:
                yield (x, 'is_a', s, sent_id)
        for x in wn_holo(m, o, 'n'):
            if x in d:
                yield (o, 'part_of', x, sent_id)
        for x in wn_mero(m, o, 'n'):
            if x in d:
                yield (x, 'part_of', o, sent_id)
        for x in wn_hyper(m, o, 'n'):
            if x in d:
                yield (o, 'is_a', x, sent_id)
        for x in wn_hypo(m, o, 'n'):
            if x in d:
                yield (x, 'is_a', o, sent_id)

    # visualize a filtered set of edges with graphviz
    def toDot(self, k, filter, svo=False, show=True, fname='textgraph.gv'):
        dot = Digraph()
        g = self.graph()
        best = self.bestNodes(k, filter)
        for f, t in g.edges():
            if f in best and t in best:
                dot.edge(str(f), str(t))
        if svo:
            svos = set()
            for (s, v, o, _) in self.bestSVOs(k):
                svos.add((s, v, o))
            for e in svos:
                s, v, o = e
                dot.edge(s, o, label=v)
        showGraph(dot, show=show, file_name=fname)

    # visualize the best SVO edges as a graphviz dot graph
    def svoToDot(self, k):
        dot = Digraph()
        for e in self.bestSVOs(3 * k):
            s, v, o, _ = e  # bestSVOs yields (subject, verb, object, sent_id)
            dot.edge(s, o, label=v)
        showGraph(dot)

    # specialize dot graph to words
    def wordsToDot(self, k):
        self.toDot(k, isWord)

    # specialize dot graph to sentences
    def sentsToDot(self, k):
        self.toDot(k, isSent)

    # visualize mixed sentence - word graph
    def allToDot(self, k):
        self.toDot(k, lambda x: True)
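
# Minimal usage sketch for GraphMaker, assuming a CoreNLP server is already
# running at http://localhost:9000 and the module-level helpers and flags
# referenced above are in scope; the function name, 'sample.txt' path and the
# summary sizes below are placeholders chosen for illustration only.
def summarize_file_demo(fname='sample.txt', sent_count=3, word_count=8):
    gm = GraphMaker()
    gm.load(fname)  # parse the file and build the internal text graph
    print('SUMMARY:')
    for i, sent in gm.bestSentences(sent_count):
        # bestSentences yields (sentence index, list of words)
        print(i, ':', ' '.join(sent))
    print('KEYPHRASES:')
    for w in gm.bestWords(word_count):
        print(w)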
def DetectAbbreviationFreeFormText(text, language='english'):
    # NOTE: temp_file and depparse_file are module-level globals (initially
    # None) that are reused across calls
    # path of the script
    current_dir_path = os.path.dirname(os.path.realpath(__file__))
    # directory in which the temporary files will be saved
    temp_dir = os.path.join(current_dir_path, 'TEMP')
    # create the temp dir if it does not exist
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    global temp_file
    global depparse_file
    if temp_file is None:
        temp_file = os.path.join(temp_dir, 'corpus')
        # the temp file name ends with an incrementing number so that no older
        # temp file will be overwritten
        k = 0
        while Path(temp_file + str(k)).exists():
            k += 1
        temp_file = temp_file + str(k)
        # tab-separated file with POS-tagged, dependency-parsed annotation
        depparse_file = temp_file + '.conllu'
    # define the Stanford CoreNLP and Stanford NER jars
    stanford_core_nlp_jar = os.path.join(
        os.path.join(current_dir_path, 'StanfordCoreNLP'),
        'stanford_core_nlp_custom_document_reader_and_whitespace_lexer.jar')
    stanford_ner_jar = os.path.join(
        os.path.join(current_dir_path, 'StanfordNER'), 'stanford_ner.jar')
    # define the CRF model for Stanford NER
    stanford_ner_model = os.path.join(
        os.path.join(current_dir_path, 'StanfordNER'),
        'ner-model-abbr-detection.ser.gz')
    # choose the property file for Stanford CoreNLP according to the given language param
    if language is None:
        language = 'english'
    if language.lower() == 'english':
        props_file = os.path.join(
            os.path.join(current_dir_path, 'StanfordCoreNLP'),
            'StanfordCoreNLP-english_CSV.properties')
    elif language.lower() == 'german':
        props_file = os.path.join(
            os.path.join(current_dir_path, 'StanfordCoreNLP'),
            'StanfordCoreNLP-german_CSV.properties')
    stanford_core_nlp_server_command = [
        "java", "-Xmx45g", "-cp", stanford_core_nlp_jar,
        "edu.stanford.nlp.pipeline.StanfordCoreNLPServer",
        "-serverProperties", props_file, "-port", "9000", "-timeout", "15000"
    ]
    # the command line arguments for running Stanford NER
    stanford_ner_command = [
        "java", "-jar", stanford_ner_jar, "-Xmx45g", "-cp", "*;lib/*",
        "-loadClassifier", stanford_ner_model, "-outputFormat", "tabbedEntities",
        "-testFile", depparse_file, ">", temp_file, "-encoding", "UTF-8"
    ]
    # global STANFORD_CORENLP_PROCESS
    # if STANFORD_CORENLP_PROCESS is None:
    #     STANFORD_CORENLP_PROCESS = subprocess.Popen(
    #         stanford_core_nlp_server_command,
    #         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    #     )
    #     time.sleep(5)
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    sents_ner = []
    try:
        sents_dep_parsed = dep_parser.parse_text(text)
        if sents_dep_parsed is None:
            return [], 0
        # sents_dep_parsed = list(sents_dep_parsed)
        # if len(sents_dep_parsed) == 0:
        #     return [], 0
        sent_dep_parsed = next(sents_dep_parsed, None)
    except requests.exceptions.Timeout:
        # FinishAbbreviationDetection()
        # if STANFORD_CORENLP_PROCESS is None:
        #     STANFORD_CORENLP_PROCESS = subprocess.Popen(
        #         stanford_core_nlp_server_command,
        #         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        #     )
        #     time.sleep(5)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
        sents_dep_parsed = dep_parser.parse_text(text)
        if sents_dep_parsed is None:
            return [], 0
        sent_dep_parsed = next(sents_dep_parsed, None)
    if sent_dep_parsed is None:
        return [], 0
    # for sent_dep_parsed in sents_dep_parsed:
    while sent_dep_parsed is not None:
        sent_dep_parsed = sent_dep_parsed.to_conll(4)
        with open(depparse_file, 'w', encoding='utf-8') as file:
            file.write(sent_dep_parsed)
        if language == 'english':
            # annotate with universal tags
            conll_create_universal_tagging(depparse_file,
                                           column_with_penn_tags=1,
                                           column_with_universal_tags=2)
        if language == 'german':
            # shrink the conll-u file and add fake gold NER tags
            ShrinkConllU(depparse_file, [0, 1, 3], True)
        else:
            # shrink the conll-u file and add fake gold NER tags
            ShrinkConllU(depparse_file, [0, 2, 3], True)
        # actual NER tagging
        subprocess.call(stanford_ner_command, shell=True)
        ShrinkConllU(temp_file, [0, 1, 2], False)
        # read all annotations back from the temp file; ABBR tags are counted below.
        # Fake POS tags, else we can't get the NER tags easily with ConllCorpusReader.
        # corp_reader = betterConllReader('TEMP', os.path.basename(temp_file),
        #                                 ['words', 'ignore', 'ne'], encoding='utf-8')
        # sents_ner.append(list(corp_reader.iob_sents()))
        sentences = read_conll(temp_file)
        sents_ner.append(sentences)
        sent_dep_parsed = next(sents_dep_parsed, None)
    result = 0
    for senti in sents_ner:
        for sent in senti:
            for word in sent:
                if word[1] == 'ABBR':
                    result += 1
    return sents_ner, result
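
# Minimal usage sketch, assuming the CoreNLP server configured above is already
# running on port 9000 and the Stanford NER jar and abbreviation model exist at
# the paths built above; the function name and the sample sentence are
# illustrative placeholders only.
def count_abbreviations_demo():
    sample = "The WHO and the U.N. published a joint report."
    sents_ner, abbr_count = DetectAbbreviationFreeFormText(sample, language='english')
    print('abbreviations found:', abbr_count)
    for sent_batch in sents_ner:
        # each batch holds the NER-tagged sentences read back from the temp file
        print(sent_batch)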