Example #1
class GraphMaker:
    def __init__(self, parserURL='http://localhost:9000'):
        self.dparser = CoreNLPDependencyParser(url=parserURL)
        self.clear()

    # clear saved state
    def clear(self):
        self.maxcc = None
        self.gs = None
        self.nxgraph = None
        self.ranked = None
        self.words = dict()  # word -> set of POS tags seen for it (filled by addWordsIn)
        self.words2lemmas = set()
        self.noun_set = dict()
        self.svo_edges_in_graph = []

    # digest a file
    def load(self, fname):
        self.clear()
        with open(fname, 'r') as f:
            text = f.read()
        self.digest(text)

    def parse(self, text):
        ts = self.dparser.parse_text(text)
        return list(ts)

    # digest a string using the dependency parser
    def digest(self, text):
        self.clear()
        chop = 2**16
        gens = []
        # split texts that are too large to be parsed at once
        while len(text) > chop:
            head = text[:chop]
            text = text[chop:]
            if head:
                gens.append(self.parse(head))
        if text:
            # parse the remaining tail (or the whole text, if it was short enough)
            gens.append(self.parse(text))
        self.gs = [g for gs in gens for g in gs]

    # sentence as sequence of words generator
    def sentence(self):
        for g in self.gs:
            yield ' '.join(gwords(g))

    def wsentence(self):
        for g in self.gs:
            yield tuple(gwords(g))

    def nth_sent_words(self, n):
        ws = tuple(gwords(self.gs[n]))
        return ws

    # sentence as sequence of lemmas generator
    def lsentence(self):
        for g in self.gs:
            yield tuple(glemmas(g))

    # curates, reverses and adds some new edges;
    # yields <edge, index of the sentence in which it occurs> pairs
    def edgesInSent(self):
        self.svo_edges_in_graph = []

        def noun_to_def(x, tx, k):
            if noun_defs:
                k_ = self.noun_set.get(x)
                if k == k_:
                    yield (x, tx, 'first_in', k, 'SENT')

        def edgeOf(k, g):
            d = w2l(g)
            merge_dict(self.words2lemmas, d)
            make_noun_set(g, self.noun_set, k)
            svo_edges_in_sent = []
            for ts in g.triples():
                #print('TS',ts)
                fr, rel, to = list(ts)
                lfrom, ftag = d[fr[0]]
                lto, ttag = d[to[0]]
                # vn is True if this is a verb -> noun link via a subject or object relation
                so = isSubj(rel) or isObj(rel)
                vn = isVerb(ftag) and isNoun(ttag) and so
                if rel == 'punct' and ttag == '.':
                    # sentence points to predicate verb
                    yield (k, 'SENT', 'predicate', lfrom, ftag)
                elif vn:
                    # collects vs and vo links to merge them later into svo
                    svo_edges_in_sent.append((lfrom, ftag, rel, lto, ttag))
                    yield lfrom, ftag, rel, lto, ttag  # verb to noun
                    yield k, 'SENT', 'about', lto, ttag  # sent to noun
                    # all words recommend sentence
                    #yield lfrom,ftag,'recommends',k,'SENT' # verb to sent - in elif !
                    for e in noun_to_def(
                            lto,
                            ttag,
                            k,
                    ):
                        yield e  # noun to sent
                    if noun_self: yield lto, ttag, 'self', lto, ttag
                elif isNoun(ttag):  # e.g. nmod relation
                    #print('x-->n',k,lfrom,ftag,rel,lto,ttag)
                    yield lfrom, ftag, rel, lto, ttag
                    for e in noun_to_def(
                            lto,
                            ttag,
                            k,
                    ):
                        yield e  # noun to sent
                    if noun_self: yield lto, ttag, 'self', lto, ttag
                    #yield lfrom, ftag, 'recommends', k, 'SENT' # dependent of noun to sent
                else:  # yield link as is
                    yield lto, ttag, rel, lfrom, ftag
                    # all words recommend sentence
                    if all_recs: yield lto, ttag, 'recommends', k, 'SENT'

                # merge compound terms, make their parts recommend them
                if isNoun(ftag) and isNoun(ttag) and rel == 'compound':
                    comp = lto + ' ' + lfrom
                    yield lfrom, ftag, 'fused', comp, ftag
                    yield lto, ttag, 'fused', comp, ttag
                    for e in noun_to_def(comp, ttag, k):
                        yield e
                    if noun_self: yield comp, ttag, 'self', comp, ttag
            # collect svo relations
            self.svo_edges_in_graph.append(to_svo(k, svo_edges_in_sent))

        k = 0
        for g in self.gs:
            for e in edgeOf(k, g):
                # collects words at the two ends of e
                self.addWordsIn(e)
                yield e, k
            k += 1

    # yields the edge, possibly once for each sentence in which it is found
    def multi_edges(self):
        for e, k in self.edgesInSent():
            yield e

    def edges(self):
        for e in set(self.multi_edges()):
            yield e

    # collects unique words (with their tags) at the two ends of an edge
    def addWordsIn(self, e):
        f, tf, r, t, tt = e
        if maybeWord(f) and tf != 'SENT':
            self.words.setdefault(f, set()).add(tf)
        if maybeWord(t) and tt != 'SENT':
            self.words.setdefault(t, set()).add(tt)

    # returns final networkx text graph
    def graph(self):
        if (self.nxgraph): return self.nxgraph
        dg = nx.DiGraph()

        for e in self.edges():
            f, tf, r, t, tt = e
            dg.add_edge(f, t, rel=r)

        self.nxgraph = dg
        #print('DG:',dg,'END')

        #print('NOUN_SET',self.noun_set)
        return dg

    # ranks (unless ranked and stored as such) the text graph
    def pagerank(self):
        if self.ranked: return self.ranked
        g = self.graph()
        pr = self.runPagerank(g)
        self.ranked = pr
        if not all_recs: return pr
        ccs = list(nx.strongly_connected_components(g))
        lc = len(ccs)
        #print('LENCOM', lc)
        if lc < 4:
            self.maxcc = max(ccs, key=len)
        return pr

    # extracts best k nodes passing filtering test
    def bestNodes(self, k, filter):
        g = self.graph()
        comps = list(nx.strongly_connected_components(g))

        pr = self.pagerank()
        i = 0
        ns = []  # not a set - that loses order!
        for x, r in pr:
            if i >= k: break
            #print('RANKED',x,r)
            if filter(x):
                #print('FILTERED',x,r,'MC')
                if not self.maxcc or x in self.maxcc:
                    if x not in ns:
                        ns.append(x)
                        i += 1
        return ns

    # specialization returning all best k nodes
    def bestAny(self, k):
        return self.bestNodes(k, lambda x: True)

    # specialization returning best k sentence nodes
    def bestSentencesByRank(self, k):
        best = self.bestNodes(100 + k, isSent)
        if not best: return
        #print('BEST SENTS:',best)
        c = 0
        for i in best:
            g = self.gs[i]
            lems = [w for w in glemmas0(g)]
            #print('LEMS',lems)
            if isCleanSent(lems):
                sent = list(gwords(g))
                #sent=str.join(' ',list(gwords(g)))
                yield (i, sent)
                c += 1
            #else : print('SENT UNCLEAN',lems)
            if c >= k: break

    def bestSentences(self, k):
        for i_s in sorted(self.bestSentencesByRank(k)):
            yield i_s

    # specialization returning best k word nodes
    def bestWords(self, k):
        #print('NOUNS',self.noun_set)
        c = 0
        best = self.bestNodes(100 + k, maybeWord)
        #print('BEST WORDS:',best)
        for w in best:
            if c >= k: break
            if not isStopWord(w) and self.hasNoun(w):
                yield (w)
                #print('BWORD',w)
            c += 1

    # true if a phrase has a noun in it
    def hasNoun(self, w):
        ws = w.split(' ')
        for v in ws:
            if v in self.noun_set: return True
        return False

    # runs PageRank on text graph
    def runPagerank(self, g):
        d = nx.pagerank(g)
        #print("PR",d)

        # normalize sentence ranks by favoring those close to the average sentence length
        sents = list(self.wsentence())
        lens = list(map(len, sents))
        avg = sum(lens) / len(lens)

        # rerank sentence nodes (keyed by their index in self.gs) toward the average length
        for i, ws in enumerate(sents):
            if i in d:
                d[i] = adjust_rank(d[i], len(ws), avg)

        sd = sorted(d, key=d.get, reverse=True)

        return [(k, d[k]) for k in sd]

    # extracts k highest ranked SVO triplets
    def bestSVOs(self, k):
        rank_list = self.pagerank()
        rank_dict = dict()
        for (w, rw) in rank_list:
            rank_dict[w] = rw
        #print('PRANK',rank_list)
        ranked = []  # should not be a set !
        for rs in self.svo_edges_in_graph:
            for r in rs:
                #print('SVO',r)
                (f, _), (rel, _), (t, _), sent_id = r
                # skip relations whose endpoints did not make it into the ranked graph
                srank = rank_dict.get(f, 0)
                orank = rank_dict.get(t, 0)
                if srank and orank:
                    sorank = (2 * srank + orank) / 3
                    ranked.append((sorank, (f, rel, t, sent_id)))
        ranked = sorted(ranked, reverse=True)
        i = 0
        exts = set()
        seen = set()
        for (_, e) in ranked:
            i += 1
            if i > k: break
            #print('SVO_EDGE',e)
            if e in seen: continue
            seen.add(e)
            yield e
            for xe in self.extend_with_wn_links(e, rank_dict):
                f, _, t, _ = xe
                if wn.morphy(f.lower()) != wn.morphy(t.lower()):
                    exts.add(xe)
        i = 0
        for xe in exts:
            i += 1
            if i > k: break
            #print('XE',xe)
            yield xe

    # adds wordnet-derived links for words already present in the rank dictionary d;
    # we tag them with is_a or part_of
    def extend_with_wn_links(self, e, d):
        s, v, o, sent_id = e
        m = 1  # how many of each relation are taken
        for w in (s, o):
            for x in wn_holo(m, w, 'n'):
                if x in d: yield (w, 'part_of', x, sent_id)
            for x in wn_mero(m, w, 'n'):
                if x in d: yield (x, 'part_of', w, sent_id)
            for x in wn_hyper(m, w, 'n'):
                if x in d: yield (w, 'is_a', x, sent_id)
            for x in wn_hypo(m, w, 'n'):
                if x in d: yield (x, 'is_a', w, sent_id)

    # visualize filtered set of edges with graphviz
    def toDot(self, k, filter, svo=False, show=True, fname='textgraph.gv'):
        dot = Digraph()
        g = self.graph()
        best = self.bestNodes(k, filter)
        for f, t in g.edges():
            if f in best and t in best:
                dot.edge(str(f), str(t))
        if svo:
            svos = set()
            for (s, v, o, _) in self.bestSVOs(k):
                svos.add((s, v, o))
            for e in svos:
                s, v, o = e
                dot.edge(s, o, label=v)
        showGraph(dot, show=show, file_name=fname)

    # visualize best SVO relations as a graphviz dot graph
    def svoToDot(self, k):
        dot = Digraph()
        for (s, v, o, _) in self.bestSVOs(3 * k):
            dot.edge(s, o, label=v)
        showGraph(dot)

    # specialize dot graph to words
    def wordsToDot(self, k):
        self.toDot(k, isWord)

    # specialize dot graph to sentences
    def sentsToDot(self, k):
        self.toDot(k, isSent)

    # visualize mixed sentence - word graph
    def allToDot(self, k):
        self.toDot(k, lambda x: True)
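
A minimal usage sketch for the class above, assuming a Stanford CoreNLP server is already running at http://localhost:9000 and that the helpers it relies on (gwords, isSent, maybeWord, showGraph, ...) are importable from the same module; the file name and the k values below are placeholders:

# hypothetical driver; 'doc.txt' and the counts below are illustrative only
gm = GraphMaker(parserURL='http://localhost:9000')
gm.load('doc.txt')                    # parse the file into dependency graphs

for i, sent in gm.bestSentences(3):   # extractive summary: top 3 sentences
    print(i, ' '.join(sent))

print(list(gm.bestWords(8)))          # top keyword / keyphrase nodes
print(list(gm.bestSVOs(5)))           # best (s, v, o, sent_id) relations

gm.wordsToDot(20)                     # render the word graph with graphviz
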
Example #2
def DetectAbbreviationFreeFormText(text, language='english'):
    # path of the script
    current_dir_path = os.path.dirname(os.path.realpath(__file__))

    # define the directory in which the temporary files will be saved
    temp_dir = os.path.join(current_dir_path, 'TEMP')

    # create the temp dir if it does not exist
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    global temp_file
    global depparse_file
    if temp_file is None:
        temp_file = os.path.join(temp_dir, 'corpus')
        # specifies the name ending of the temp file. The ending is an incrementing
        # number, so that no existing temp file is overwritten.
        k = 0
        while Path(temp_file + str(k)).exists():
            k += 1
        temp_file = temp_file + str(k)

        # Tab separated file with pos tagged dependency parsed annotation.
        depparse_file = temp_file + '.conllu'

    # define the Stanford CoreNLP and Stanford NER jars
    stanford_core_nlp_jar = os.path.join(
        current_dir_path, 'StanfordCoreNLP',
        'stanford_core_nlp_custom_document_reader_and_whitespace_lexer.jar')
    stanford_ner_jar = os.path.join(current_dir_path, 'StanfordNER',
                                    'stanford_ner.jar')
    # define the CRF model for Stanford NER.
    stanford_ner_model = os.path.join(current_dir_path, 'StanfordNER',
                                      'ner-model-abbr-detection.ser.gz')

    # choose the Stanford CoreNLP properties file according to the given language param.
    if language is None:
        language = 'english'
    language = language.lower()
    if language == 'english':
        props_file = os.path.join(current_dir_path, 'StanfordCoreNLP',
                                  'StanfordCoreNLP-english_CSV.properties')
    elif language == 'german':
        props_file = os.path.join(current_dir_path, 'StanfordCoreNLP',
                                  'StanfordCoreNLP-german_CSV.properties')
    else:
        # fall back to the English properties for unsupported languages
        props_file = os.path.join(current_dir_path, 'StanfordCoreNLP',
                                  'StanfordCoreNLP-english_CSV.properties')

    stanford_core_nlp_server_command = [
        "java", "-Xmx45g", "-cp", stanford_core_nlp_jar,
        "edu.stanford.nlp.pipeline.StanfordCoreNLPServer", "-serverProperties",
        props_file, "-port", "9000", "-timeout", "15000"
    ]

    # The command line arguments for running Stanford NER.
    stanford_ner_command = [
        "java", "-jar", stanford_ner_jar, "-Xmx45g", "-cp", "*;lib/*",
        "-loadClassifier", stanford_ner_model, "-outputFormat",
        "tabbedEntities", "-testFile", depparse_file, ">", temp_file,
        "-encoding", "UTF-8"
    ]

    #    global STANFORD_CORENLP_PROCESS
    #   if STANFORD_CORENLP_PROCESS is None:
    #      STANFORD_CORENLP_PROCESS = subprocess.Popen(
    #         stanford_core_nlp_server_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    #    )
    #   time.sleep(5)

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    sents_ner = []

    try:
        sents_dep_parsed = dep_parser.parse_text(text)
        if sents_dep_parsed is None:
            return [], 0
            #   sents_dep_parsed = list(sents_dep_parsed)
            #  if len(sents_dep_parsed) == 0:
            #     return [], 0
        sent_dep_parsed = next(sents_dep_parsed, None)
    except requests.exceptions.Timeout:
        #  FinishAbbreviationDetection()
        # if STANFORD_CORENLP_PROCESS is None:
        #    STANFORD_CORENLP_PROCESS = subprocess.Popen(
        #       stanford_core_nlp_server_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        #  )
        # time.sleep(5)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

        sents_dep_parsed = dep_parser.parse_text(text)
        if sents_dep_parsed is None:
            return [], 0

        sent_dep_parsed = next(sents_dep_parsed, None)
    if sent_dep_parsed is None:
        return [], 0

#   for sent_dep_parsed in sents_dep_parsed:
    while sent_dep_parsed is not None:
        sent_dep_parsed = sent_dep_parsed.to_conll(4)
        with open(depparse_file, 'w', encoding='utf-8') as file:
            file.write(sent_dep_parsed)

        if language == 'english':
            # annotate with universal Tags
            conll_create_universal_tagging(depparse_file,
                                           column_with_penn_tags=1,
                                           column_with_universal_tags=2)

        if language == 'german':
            # Shrink conll-u and add fake gold ner tags
            ShrinkConllU(depparse_file, [0, 1, 3], True)
        else:
            # Shrink conll-u and add fake gold ner tags
            ShrinkConllU(depparse_file, [0, 2, 3], True)

        # actual ner tagging
        subprocess.call(stanford_ner_command, shell=True)
        ShrinkConllU(temp_file, [0, 1, 2], False)

        # Read from the temp file all ABBR annotation and counts it.
        # Fake POS tags, else we can't get the NER Tags easily with ConllCorpusReader.
        #       corp_reader = betterConllReader('TEMP', os.path.basename(temp_file), ['words', 'ignore', 'ne'],encoding='utf-8')
        #      sents_ner.append(list(corp_reader.iob_sents()))
        sentences = read_conll(temp_file)
        sents_ner.append(sentences)
        sent_dep_parsed = next(sents_dep_parsed, None)

    result = 0
    for senti in sents_ner:
        for sent in senti:
            for word in sent:
                if word[1] == 'ABBR':
                    result += 1

    return sents_ner, result
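
A minimal invocation sketch, assuming the Stanford CoreNLP server is already listening on port 9000 with the custom properties above, the Stanford NER jar and CRF model exist at the resolved paths, and the module-level temp_file / depparse_file globals start out as None; the sample text is a placeholder:

# hypothetical call; the input text is illustrative only
sents_ner, abbr_count = DetectAbbreviationFreeFormText(
    'The WHO and the EU e.g. publish joint reports.', language='english')
print('abbreviations found:', abbr_count)
for chunk in sents_ner:          # one entry per dependency-parsed sentence
    for sent in chunk:           # sentences as returned by read_conll
        for token in sent:       # token rows; index 1 holds the NER tag
            print(token)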