예제 #1
0
파일: clf.py 프로젝트: philip-py/mdb_nlp
def clf_sentiment(res):
    """Print sentiment classifications for every span and reset ``SPAN_SENT``.

    For each document in ``res.spans`` (paired positionally with ``res.viz``)
    the span text is printed, classified sentence by sentence and once as a
    whole with the module-level ``clf`` callable, and the classifier output is
    printed. The classifier output is only printed, never stored: every
    collected hit simply gets ``SPAN_SENT = 0``.

    Args:
        res: Object exposing ``spans`` (mapping document label -> iterable of
            ``(start, end)`` spans) and ``viz`` (one list of hit dicts per
            document, each carrying a ``'span_start'`` key).

    Returns:
        A list with one entry per document: the hit dicts whose
        ``'span_start'`` matches one of the document's spans.
    """
    collected = []
    for doc, doc_vizs in zip(res.spans, res.viz):
        hits = []
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            print(text)
            hits += [hit for hit in doc_vizs if hit['span_start'] == span[0]]
            # Hits accumulate across the document's spans, so earlier hits
            # are reset again on every later span.
            for hit in hits:
                hit['SPAN_SENT'] = 0
            # Naive sentence split on '.'; empty fragments are skipped.
            for sentence in filter(None, text.strip().split('.')):
                sentence_result = clf(sentence.lower())
                print(sentence)
                print(sentence_result)
            print('-----------')
            print(clf(text))
            print()
        collected.append(hits)
    return collected
예제 #2
0
def content_analysis(directory,
                     party="all",
                     sample=None,
                     window_size=25,
                     debug=False):
    """Run the content-analysis spaCy pipeline and pickle its results.

    Args:
        directory: Name of the run; output goes to ``res_ca/<directory>/``.
            Any name other than ``'test'`` must not exist yet, so earlier
            results are never overwritten. ``'test'`` may be reused.
        party: Passed to ``load_data`` to select the documents.
        sample: Controls which documents are processed:
            * ``int``  - random sample of that many loaded document labels,
            * ``str``  - the string itself is analysed under the label
              ``'test'`` instead of a stored document,
            * ``list`` - used directly as the list of document labels,
            * anything else - all labels returned by ``load_data``.
        window_size: Forwarded to ``ContentAnalysis``.
        debug: When true and ``sample`` is a string, print per-token
            attributes of the analysed text.

    Returns:
        ``ca.results`` of the ``ContentAnalysis`` pipeline component.

    Raises:
        FileExistsError: If ``directory`` is not ``'test'`` and
            ``res_ca/<directory>/`` already exists.
    """
    out_dir = Path(f"res_ca/{directory}/")
    # Create missing parents (a fresh checkout has no res_ca/ yet). Only the
    # scratch 'test' run is allowed to reuse an existing directory.
    out_dir.mkdir(parents=True, exist_ok=(directory == 'test'))

    text = None
    if isinstance(sample, str):
        doc_labels = ['test']
        text = sample
    elif isinstance(sample, list):
        doc_labels = sample
    else:
        # Labels are only loaded when they are actually used.
        doc_labels = load_data(party)
        # bool is an int subclass; it keeps its old meaning of "no sampling".
        if isinstance(sample, int) and not isinstance(sample, bool):
            doc_labels = random.sample(doc_labels, sample)

    print("Number of documents: {}".format(len(doc_labels)))
    print(
        f"Beginning Content Analysis with parameters: \n party: {party} | samplesize: {sample} | windowsize: {window_size}"
    )

    nlp = spacy.load("de_core_news_lg")
    ca = ContentAnalysis(nlp, window_size=window_size)
    entity_recognizer = EntityRecognizer(nlp)
    sentiment_recognizer = SentimentRecognizer(nlp)
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    # NOTE(review): passing component objects to add_pipe looks like the
    # spaCy v2 API - confirm against the installed version.
    # Components run in the order they are added; ca is appended last.
    nlp.add_pipe(custom_extensions, last=True)
    nlp.add_pipe(sentiment_recognizer, last=True)
    nlp.add_pipe(sentiws, last=True)
    nlp.add_pipe(entity_recognizer, last=True)
    nlp.add_pipe(ca, last=True)
    nlp.remove_pipe("ner")

    labels = []
    for label in tqdm(doc_labels):
        labels.append(label)
        if text:
            doc = nlp(text)
            if debug:
                for token in doc:
                    print(token.text, token.ent_type_, token._.is_elite_neg,
                          token._.is_attr, token._.is_negated, 'lemma',
                          token._.lemma)
        else:
            # The Doc itself is discarded; the call is made for the pipeline
            # run (ca.results is what gets pickled below).
            nlp(gendocs(label))
        ca.results.labels.append(label)

    with open(out_dir / 'labels.pkl', 'wb') as f:
        pickle.dump(labels, f)
    with open(out_dir / 'results_all.pkl', 'wb') as f:
        pickle.dump(ca.results, f)
    print(f"Content Analysis complete. \nResults saved in res_ca/{directory}/...")

    return ca.results
예제 #3
0
def visualize(self, label, span=None):
    """Render one document, or one span of it, with displaCy's manual entity view.

    Args:
        label: Document label; looked up in ``self.df['doc']`` and
            ``self.labels``, and passed to ``gendocs`` to get the text.
        span: Optional ``(start, end)`` pair. When given, only the text slice
            ``text[start:end]`` and the hits whose ``'span_start'`` equals
            ``start`` are shown.

    Raises:
        ValueError: If ``label`` is not in ``self.labels``.
        IndexError: If ``self.df`` has no row for ``label``.
    """
    row = self.df.loc[self.df['doc'] == label]
    text = gendocs(label)
    # Copy every hit dict: the relabelling and offset shift below must not
    # leak into self.viz, otherwise a second call would append the score to
    # the label again and shift 'end' a second time.
    hits = [dict(hit) for hit in self.viz[self.labels.index(label)]]

    if span:
        shown_text = text[span[0]:span[1]]
        ents = []
        for hit in hits:
            if hit['span_start'] != span[0]:
                continue
            print(hit)
            # NOTE(review): only 'end' is made relative to the slice; 'start'
            # is left untouched exactly as before - confirm which offsets the
            # hits actually carry.
            hit['end'] -= span[0]
            hit['label'] = f"{hit['label']} | {hit['score']:.2f}"
            ents.append(hit)
    else:
        shown_text = text
        for hit in hits:
            hit['label'] = f"{hit['label']} | {hit['score']:.2f}"
        ents = hits

    # Positional access: the filtered row keeps its original index label, so
    # ``row['doc'][0]`` only worked when that label happened to be 0.
    title = (f"{row['doc'].iloc[0]} | {row['name_res'].iloc[0]} "
             f"({row['party'].iloc[0]}) | "
             f"{row['date'].iloc[0].strftime('%d/%m/%Y')}")
    # Built once, outside the hit loops, so a document without hits still
    # renders instead of failing on an unbound name.
    ex = [{"text": shown_text, "ents": ents, "title": title}]

    # find unique labels for coloring options
    all_ents = {hit["label"] for hit in ents}
    palette = (("E", "coral"), ("V", "lightgrey"), ("P", "yellow"))
    colors = {}
    for ent in all_ents:
        for prefix, color in palette:
            if ent.startswith(prefix):
                colors[ent] = color
                break
    options = {"ents": all_ents, "colors": colors}

    displacy.render(ex,
                    style="ent",
                    manual=True,
                    jupyter=True,
                    options=options)
예제 #4
0
파일: clf.py 프로젝트: philip-py/mdb_nlp
def coding(res):
    """Derive the final ``TOK_IS_POP`` / ``SPAN_IS_POP`` flags for every hit.

    Reads the ``RLY_GER`` / ``RLY_V`` / ``RLY_E`` flags already present on the
    hit dicts and mutates the dicts in place.

    * A hit is ``TOK_IS_POP`` when ``RLY_GER`` holds and ``RLY_V`` or
      ``RLY_E`` is set.
    * A qualifying 'V' hit, or a qualifying 'E' / noun-'EA' hit, passes its
      flag on to every hit in the same span whose ``'attr_of'`` equals its
      ``'start'``.
    * A span is ``SPAN_IS_POP`` when it has at least one qualifying 'V' hit
      and at least one qualifying 'E' hit.

    Args:
        res: Object exposing ``spans`` (mapping document label -> iterable of
            spans) and ``viz`` (one list of hit dicts per document, in the
            same order as ``spans``).

    Returns:
        One list of (mutated) hit dicts per document.
    """
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            # The span text is not needed for the coding step, so the
            # document is no longer loaded and sliced for every span.
            viz = [hit for hit in doc_vizs if hit['span_start'] == span[0]]

            # final coding
            pop_hits_v = 0
            pop_hits_e = 0
            for v in viz:
                v['TOK_IS_POP'] = False
                v['SPAN_IS_POP'] = False

                if v['RLY_GER'] and (v['RLY_V'] or v['RLY_E']):
                    v['TOK_IS_POP'] = True
                if not v['TOK_IS_POP']:
                    continue

                if v['coding'] == 'V':
                    pop_hits_v += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_V'] = True
                            attr['TOK_IS_POP'] = True
                if v['coding'] == 'E' or (v['coding'] == 'EA'
                                          and v['pos'] == 'NOUN'):
                    pop_hits_e += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_E'] = True
                            attr['TOK_IS_POP'] = True

            if pop_hits_v > 0 and pop_hits_e > 0:
                for v in viz:
                    v['SPAN_IS_POP'] = True
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
예제 #5
0
def viz_id(df, id):
    """Hand one document's text and its result rows to ``ContentAnalysis.viz``.

    Args:
        df: DataFrame with a ``'doc'`` column holding document labels.
        id: Label of the document; used both to load the text via ``gendocs``
            and to select the matching rows of ``df``.
    """
    document_text = gendocs(id)
    document_rows = df.loc[df['doc'] == id]
    ContentAnalysis.viz(document_text, document_rows)
예제 #6
0
파일: clf.py 프로젝트: philip-py/mdb_nlp
def clf_pop(res):
    """Validate dictionary hits per span with zero-shot classification.

    Uses the module-level ``clf`` callable (called as
    ``clf(text, labels, template, multi_class=...)`` and expected to return a
    dict with parallel ``'labels'`` / ``'scores'`` lists). For every span:

    1. ``RLY_GER`` is cleared when the top topic is 'Ausland' with a score
       of at least 0.9.
    2. Each 'V' lemma is tested against '{} hat Nachteile'; lemmas scoring
       at least 0.75 set ``RLY_V``.
    3. For every confirmed people lemma, each 'E' lemma is tested against
       three "elite harms people" templates; lemmas scoring at least 0.75
       set ``RLY_E`` and record the template index in ``RLY_REASON``.
    4. ``TOK_IS_POP`` / ``SPAN_IS_POP`` are derived from these flags.

    Hit dicts are mutated in place.

    Args:
        res: Object exposing ``spans`` (mapping document label -> iterable of
            ``(start, end)`` spans) and ``viz`` (one list of hit dicts per
            document, in the same order as ``spans``).

    Returns:
        One list of (mutated) hit dicts per document.
    """
    elite_templates = ('{} benachteiligt ', '{} entmachtet ', '{} betrügt ')

    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            viz = [hit for hit in doc_vizs if hit['span_start'] == span[0]]
            for v in viz:
                v['RLY_GER'] = True
                v['RLY_V'] = False
                v['RLY_E'] = False
                v['RLY_REASON'] = set()

            # 1. check if text is ger
            s = clf(text, ['Deutschland', 'Europa', 'Ausland'],
                    'Der Text handelt von {}', multi_class=False)
            if s['labels'][0] == 'Ausland' and s['scores'][0] >= 0.9:
                for v in viz:
                    v['RLY_GER'] = False

            # 2. check if volk is benachteiligt
            people_labels = [v['lemma'] for v in viz if v['coding'] == 'V']
            candidates_people = []
            # Only evaluate a result this step produced. Previously a span
            # without 'V' hits re-read the topic result from step 1, so
            # 'Deutschland'/'Europa'/'Ausland' could end up as "people".
            if people_labels:
                s = clf(text, people_labels, '{} hat Nachteile',
                        multi_class=True)
                for label, score in zip(s['labels'], s['scores']):
                    if score >= 0.75:
                        candidates_people.append(label)
                        for v in viz:
                            if v['lemma'] == label:
                                v['RLY_V'] = True

            # 3. check if elite benachteiligt volk
            # The elite lemmas do not depend on the people lemma; collect once.
            elite_labels = [v['lemma'] for v in viz if v['coding'] == 'E']
            if elite_labels:
                for volk in candidates_people:
                    for h, stem in enumerate(elite_templates):
                        s = clf(text, elite_labels, stem + volk,
                                multi_class=True)
                        for label, score in zip(s['labels'], s['scores']):
                            if score >= 0.75:
                                for v in viz:
                                    if v['lemma'] == label:
                                        v['RLY_E'] = True
                                        v['RLY_REASON'].add(h)

            # final coding
            pop_hits_v = 0
            pop_hits_e = 0
            for v in viz:
                v['TOK_IS_POP'] = False
                v['SPAN_IS_POP'] = False

                if v['RLY_GER'] and (v['RLY_V'] or v['RLY_E']):
                    v['TOK_IS_POP'] = True
                if not v['TOK_IS_POP']:
                    continue

                if v['coding'] == 'V':
                    pop_hits_v += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_V'] = True
                            attr['TOK_IS_POP'] = True
                if v['coding'] == 'E' or (v['coding'] == 'EA'
                                          and v['pos'] == 'NOUN'):
                    pop_hits_e += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_E'] = True
                            attr['TOK_IS_POP'] = True

            if pop_hits_v > 0 and pop_hits_e > 0:
                for v in viz:
                    v['SPAN_IS_POP'] = True
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
예제 #7
0
파일: clf.py 프로젝트: philip-py/mdb_nlp
import pickle
# NOTE: unpickling can execute arbitrary code; only load result files that
# this project wrote itself.
with open("res_ca/test/results_all.pkl", "rb") as f:
    res = pickle.load(f)
res.set_entities()
res.compute_score()
res.create_df()
res.add_meta_plenar()
# display(res.df.groupby('party').mean())
res.compute_score_spans()
# res.visualize('plenar_029688', span=(3788, 4288))

#%%
# Attach the classifier's label and score to every hit of each span.
for doc, doc_vizs in zip(res.spans, res.viz):
    for span in res.spans[doc]:
        # Load and slice the span text once; it is both classified and printed.
        span_text = gendocs(doc)[span[0]:span[1]]
        # decisions of transformers
        s = clf(span_text)[0]
        print(span_text)
        print(s['label'], s['score'])
        for v in doc_vizs:
            if v['span_start'] == span[0]:
                v['span_sent'] = s['label']
                v['span_sent_score'] = s['score']

# %%
class ContentAnalysis:
    """Holds a spaCy language model and a zero-shot classification pipeline."""

    def __init__(self, model):
        """Load both models.

        Args:
            model: Name or path handed to ``spacy.load``.
        """
        self.nlp = spacy.load(model)
        # NOTE(review): device=-1 presumably selects the CPU - confirm
        # against the transformers pipeline documentation.
        self.clf = pipeline(
            "zero-shot-classification",
            model='joeddav/xlm-roberta-large-xnli',
            device=-1,
        )

예제 #8
0
def clf_demo(clf, res, debug=False):
    """Flag hits in spans that claim a democratic value is missing in Germany.

    For every span that mentions one of the keywords 'Demokratie',
    'Gewaltenteilung', 'Gerechtigkeit' or 'Meinungsfreiheit':

    * the span is first checked once for describing 'Geschichte' or
      'Nationalsozialismus'; a score above 0.75 skips the span;
    * otherwise each mentioned keyword is tested against
      'In Deutschland herrscht keine {}'. A score above 0.75 sets ``RLY_E``
      on hits whose coding starts with 'E' and ``RLY_V`` on hits whose
      coding starts with 'V', and adds ``'S'`` to their ``RLY_REASON``.

    Only hits with a truthy ``RLY_GER`` whose ``span_start`` / ``span_end``
    match the span are considered. Hit dicts are mutated in place and must
    already carry a ``RLY_REASON`` set.

    Args:
        clf: Zero-shot classifier called as
            ``clf(text, labels, template, multi_class=True)``; expected to
            return a dict with a ``'scores'`` list.
        res: Object exposing ``spans`` (mapping document label -> iterable of
            ``(start, end)`` spans) and ``viz`` (one list of hit dicts per
            document, in the same order as ``spans``).
        debug: Print classifier inputs and outputs.

    Returns:
        One list of (mutated) hit dicts per document.
    """
    demo = [
        'Demokratie', 'Gewaltenteilung', 'Gerechtigkeit',
        'Meinungsfreiheit'
    ]
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            viz = [
                hit for hit in doc_vizs if hit['span_start'] == span[0]
                and hit['span_end'] == span[1] and hit['RLY_GER']
            ]
            checked_history = False
            is_present = True
            for w in demo:
                if w not in text:
                    continue

                if not checked_history:
                    s = clf(text,
                            ['Geschichte', 'Nationalsozialismus'],
                            'Der Text beschreibt {}',
                            multi_class=True)
                    if debug:
                        print(s)
                    is_present = not any(
                        score > 0.75 for score in s['scores'])
                    # The check depends only on the span text, so one run
                    # per span is enough whatever its outcome. Previously
                    # the flag was only set for historical spans and the
                    # same query was repeated for every matching keyword.
                    checked_history = True

                if not is_present:
                    # Historical span: no keyword can be confirmed.
                    break

                # REASON IS S
                hypothesis_template = 'In Deutschland herrscht keine {}'
                s = clf(text,
                        [w],
                        hypothesis_template,
                        multi_class=True)
                if s['scores'][0] > 0.75:
                    for v in viz:
                        if v['coding'].startswith('E'):
                            v['RLY_E'] = True
                            v['RLY_REASON'].add('S')
                        elif v['coding'].startswith('V'):
                            v['RLY_V'] = True
                            v['RLY_REASON'].add('S')

                if debug:
                    pprint(hypothesis_template)
                    pprint(s)

            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
예제 #9
0
def clf_pop(clf, res, debug=False):
    """Confirm people / elite hits per span with zero-shot classification.

    Only hits with a truthy ``RLY_GER`` whose ``span_start`` / ``span_end``
    match the span are considered; their ``RLY_V`` / ``RLY_E`` /
    ``RLY_REASON`` fields are reset and then filled:

    * each distinct 'V' lemma is tested against '{} hat Nachteile'; a score
      of at least 0.75 sets ``RLY_V`` and records the template index;
    * for every confirmed people lemma, each distinct elite lemma (coding
      'E', or 'EA' with pos 'NOUN') is tested against three "elite harms
      people" templates; a score of at least 0.75 sets ``RLY_E`` and records
      the template index.

    Hit dicts are mutated in place.

    Args:
        clf: Zero-shot classifier called as
            ``clf(text, labels, template, multi_class=True)``; expected to
            return a dict with parallel ``'labels'`` / ``'scores'`` lists.
        res: Object exposing ``spans`` (mapping document label -> iterable of
            ``(start, end)`` spans) and ``viz`` (one list of hit dicts per
            document, in the same order as ``spans``).
        debug: Print each hypothesis template and classifier result.

    Returns:
        One list of (mutated) hit dicts per document.
    """
    people_templates = ('{} hat Nachteile',)
    elite_templates = ('{} benachteiligt ', '{} entmachtet ', '{} betrügt ')

    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            viz = [
                hit for hit in doc_vizs if hit['span_start'] == span[0]
                and hit['span_end'] == span[1] and hit['RLY_GER']
            ]
            for v in viz:
                v['RLY_V'] = False
                v['RLY_E'] = False
                v['RLY_REASON'] = set()

            # 2. check if volk is benachteiligt
            people_labels = list(
                {v['lemma'] for v in viz if v['coding'] == 'V'})
            candidates_people = []
            # Results are only read when this step actually classified.
            # Previously a span without 'V' hits read an unbound ``s`` (first
            # span) or the stale result of an earlier span.
            if people_labels:
                for h, hypothesis_template in enumerate(people_templates):
                    s = clf(text,
                            people_labels,
                            hypothesis_template,
                            multi_class=True)
                    for label, score in zip(s['labels'], s['scores']):
                        if score >= 0.75:
                            candidates_people.append(label)
                            for v in viz:
                                if v['lemma'] == label:
                                    v['RLY_V'] = True
                                    v['RLY_REASON'].add(h)
                    # Printed once per classification, not once per label.
                    if debug:
                        pprint(hypothesis_template)
                        pprint(s)

            # 3. check if elite benachteiligt volk
            # The elite lemmas do not depend on the people lemma; collect once.
            elite_labels = list({
                v['lemma']
                for v in viz
                if v['coding'] == 'E' or (v['coding'] == 'EA'
                                          and v['pos'] == 'NOUN')
            })
            if elite_labels:
                for volk in candidates_people:
                    for h, stem in enumerate(elite_templates):
                        hypothesis_template = stem + volk
                        s = clf(text,
                                elite_labels,
                                hypothesis_template,
                                multi_class=True)
                        for label, score in zip(s['labels'], s['scores']):
                            if score >= 0.75:
                                for v in viz:
                                    if v['lemma'] == label:
                                        v['RLY_E'] = True
                                        # NOTE(review): shares the index
                                        # space of the people templates
                                        # (0 is used by both steps).
                                        v['RLY_REASON'].add(h)
                        if debug:
                            pprint(hypothesis_template)
                            pprint(s)

            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
예제 #10
0
def clf_ger(clf, res, debug=False):
    """Set ``RLY_GER`` on every hit, clearing it for spans about foreign topics.

    Each span text is classified against 'Deutschland', 'Europa' and
    'Ausland'. ``RLY_GER`` becomes ``False`` for the span's hits when either

    * 'Deutschland' is the last returned label and 'Ausland' scores above
      0.5, or
    * 'Ausland' is the first returned label and scores more than twice as
      high as 'Deutschland'.

    Hits are matched on ``span_start`` / ``span_end``. A span seen twice
    within a document is classified again but contributes no hits the second
    time. Hit dicts are mutated in place.

    Args:
        clf: Zero-shot classifier called as
            ``clf(text, labels, template, multi_class=True)``; expected to
            return a dict with parallel ``'labels'`` / ``'scores'`` lists.
        res: Object exposing ``spans`` (mapping document label -> iterable of
            ``(start, end)`` spans) and ``viz`` (one list of hit dicts per
            document, in the same order as ``spans``).
        debug: Print the span id, the template and the classifier result.

    Returns:
        One list of (mutated) hit dicts per document.
    """
    collected = []
    for index, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if not index % 500:
            print(index, f'/{len(res.spans)}')
        doc_hits = []
        handled = set()
        for span in res.spans[doc]:
            span_id = (span[0], span[1])
            text = gendocs(doc)[span[0]:span[1]]

            if span_id in handled:
                hits = []
            else:
                hits = [
                    hit for hit in doc_vizs
                    if hit['span_start'] == span[0]
                    and hit['span_end'] == span[1]
                ]
                handled.add(span_id)

            for hit in hits:
                hit['RLY_GER'] = True

            # 1. check if text is ger
            hypothesis_template = 'Der Text handelt von {}'
            s = clf(text,
                    ['Deutschland', 'Europa', 'Ausland'],
                    hypothesis_template,
                    multi_class=True)
            pos_foreign = s['labels'].index('Ausland')
            pos_german = s['labels'].index('Deutschland')
            if (s['labels'][-1] == 'Deutschland'
                    and s['scores'][pos_foreign] > 0.5) or (
                        s['labels'][0] == 'Ausland'
                        and s['scores'][pos_foreign] /
                        s['scores'][pos_german] > 2):
                for hit in hits:
                    hit['RLY_GER'] = False

            if debug:
                pprint(span_id)
                pprint(hypothesis_template)
                pprint(s)

            doc_hits.extend(hits)
        collected.append(doc_hits)
    return collected