Example #1
def preprocessing(df, featurize):
    """Optionally featurize the text column, then return texts and labels as lists."""
    nlp_utils = nlpUtils()
    if featurize:
        # Column 0 is assumed to be the "text" column.
        for i in range(len(df)):
            df.iloc[i, 0] = nlp_utils.featurize(df.iloc[i, 0])

    return df["text"].tolist(), df["label"].tolist()
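
A quick usage sketch, not from the original project: nlpUtils is stubbed out here, and the real featurize() in information_extraction.nlpUtils may return a different representation.

import pandas as pd

class nlpUtils:  # illustration-only stub standing in for the project's class
    def featurize(self, text):
        return text.lower()  # placeholder transformation

df = pd.DataFrame({"text": ["The Beatles influenced Oasis."], "label": [1]})
texts, labels = preprocessing(df, featurize=True)
print(texts, labels)  # ['the beatles influenced oasis.'] [1]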
Example #2
def positives_2():
    """Collect positive training sentences: sentences that mention a known influencer."""
    nlp_utils = nlpUtils()
    positives = []
    for artist in bufferize_articles():
        # "inflooenz_by" is the relation label stored in the graph DB.
        inflooenzers = [el[1] for el in graph_DAO.linked_to(artist["_id"], "inflooenz_by", False)]
        print(inflooenzers)
        if len(inflooenzers) > 0:
            positives.extend(nlp_utils.sentences_with_connections(artist["text"], inflooenzers, None))
    print(len(positives))
    return positives
Example #3
def negatives_2():
    """Collect negative training sentences: sentences that mention connections
    who are NOT known influencers."""
    nlp_utils = nlpUtils()
    negatives = []
    for artist in bufferize_articles():
        connections = {el[1] for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False)}
        inflooenzers = {el[1] for el in graph_DAO.linked_to(artist["_id"], "inflooenz_by", False)}
        not_inflooenzers = connections - inflooenzers

        if len(inflooenzers) > 0:
            negatives.extend(nlp_utils.sentences_with_connections(artist["text"], not_inflooenzers, None))
    print(len(negatives))
    return negatives
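
positives_2() and negatives_2() implement distant-supervision labeling: sentences mentioning a known influencer become positives, while sentences mentioning other connections become negatives. A toy illustration of the underlying set logic (the names are hypothetical):

connections = {"Miles Davis", "John Coltrane", "Herbie Hancock"}
influencers = {"Miles Davis"}
not_influencers = connections - influencers  # candidates for negative sentences
print(not_influencers)  # {'John Coltrane', 'Herbie Hancock'}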
Example #4
def positives_1():
    """Collect positive sentences from the sections matching "influenc"."""
    nlp_utils = nlpUtils()
    positives = []
    for artist in bufferize_articles():
        style_sections = select_section(artist["text"], ["influenc"], False)

        if len(style_sections) > 0:
            connections = [el[1] for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False)]
            for section in style_sections:
                positives.extend(nlp_utils.sentences_with_connections(section, connections, None))
    print(len(positives))
    return positives
Example #5
def negatives_1():
    """Collect negative sentences from sections that do NOT match "influenc",
    for artists that do have an influence-related section."""
    nlp_utils = nlpUtils()
    negatives = []
    for artist in bufferize_articles():
        if len(select_section(artist["text"], ["influenc"], False)) > 0:
            not_style_sections = select_section(artist["text"], ["influenc"], True)
            if len(not_style_sections) > 0:
                connections = [el[1] for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False)]
                for section in not_style_sections:
                    for sentence in nlp_utils.sentences_with_connections(section, connections, None):
                        if len(sentence) > 70:  # skip very short sentences
                            negatives.append(sentence)
                            print(" #  " + sentence)
    print(len(negatives))
    return negatives
Example #6
def link_artists_distant_supervision():
    nlp_utils = nlpUtils()
    relex = Relation_Extractor()
    predictor = Predictor()
    duplicates = art_DAO.duplicate_labels()
    buffer = []
    waiting4linking = []
    buffer_dim = 100
    try:
        for artist in art_DAO.find_by_linked_ML(False):
            print("----_" + artist["label_ext"])
            connections = graph_DAO.related_to(artist["_id"], False)
            connections_only_label = [el[1] for el in connections]

            influencers = set()
            # Not limited to "inspired"/"influenced" sentences: the ML model is
            # expected to handle any phrasing of the "influenced by" relation.
            sent_list = nlp_utils.sentences_with_connections(
                artist["text"], connections_only_label, None)
            for sent in sent_list:
                sent_featurized = nlp_utils.featurize(sent)
                prediction = predictor.predict_sentence(sent_featurized)[0]
                if prediction == 1:
                    sent_influencers = relex.get_artist_from_sentence(
                        sent, connections_only_label)
                    if sent_influencers is not None:
                        influencers.update(sent_influencers)
            for influencer in influencers:
                influencer_URI = get_URI_disambiguated(influencer, connections,
                                                       duplicates)
                print(influencer_URI)
                buffer.append((artist["_id"], influencer_URI))

            waiting4linking.append(artist)
            if len(waiting4linking) > buffer_dim:
                print("------------------- loading relations about %d artists on neo4j"
                      % buffer_dim)
                process_buffer(buffer, waiting4linking, "inf_ML_by")
        print("--------------------- loading last relations on neo4j")
        process_buffer(buffer, waiting4linking, "inf_ML_by")
    except pymongo.errors.CursorNotFound:
        # The Mongo cursor timed out; restart from the remaining unlinked artists.
        print('------------------- CursorNotFound Error: ' + artist['_id'])
        link_artists_distant_supervision()
Example #7
def link_artists_patterns():
    relex = Relation_Extractor()
    nlp_utils = nlpUtils()
    duplicates = art_DAO.duplicate_labels()
    buffer = []
    waiting4linking = []
    buffer_dim = 100
    try:
        for artist in art_DAO.find_by_linked_patterns(False):
            print("----_" + artist["label_ext"])
            connections = graph_DAO.related_to(artist["_id"], False)
            connections_only_label = [el[1] for el in connections]

            influencers = set()
            # Pattern-based extraction only considers sentences containing
            # an "inspire"/"influenc" keyword.
            sent_list = nlp_utils.sentences_with_connections(
                artist["text"], connections_only_label,
                ['inspire', 'influenc'])
            for sent in sent_list:
                sent_influencers = relex.extract_influencers(
                    sent, connections_only_label)
                if sent_influencers is not None:
                    influencers.update(sent_influencers)

            for influencer in influencers:
                influencer_URI = get_URI_disambiguated(influencer, connections,
                                                       duplicates)
                print(influencer_URI)
                buffer.append((artist["_id"], influencer_URI))

            waiting4linking.append(artist)
            if len(waiting4linking) > buffer_dim:
                print("------------------- loading relations about %d artists on neo4j"
                      % buffer_dim)
                process_buffer(buffer, waiting4linking, "inf_patterns_by")
        print("--------------------- loading last relations on neo4j")
        process_buffer(buffer, waiting4linking, "inf_patterns_by")
    except pymongo.errors.CursorNotFound:
        # The Mongo cursor timed out; restart from the remaining unlinked artists.
        print('------------------- CursorNotFound Error: ' + artist['_id'])
        link_artists_patterns()
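
Both linkers above accumulate results and flush them in batches of buffer_dim. A minimal generic sketch of that buffered-flush pattern; flush_in_batches and flush are hypothetical names, not part of this project:

def flush_in_batches(items, batch_size=100, flush=print):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            flush(batch)  # e.g. bulk-load a batch of relations into neo4j
            batch.clear()
    if batch:
        flush(batch)  # flush whatever remains at the end

flush_in_batches(range(10), batch_size=4)  # prints [0..3], then [4..7], then [8, 9]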
Example #8
from information_extraction.Relation_Extractor import Relation_Extractor
from db_data import WikiArtistDAO as art_DAO
from db_data import GraphDAO as graph_DAO
from information_extraction.nlpUtils import nlpUtils

relex = Relation_Extractor()
nlp_utils = nlpUtils()


# Manual inspection loop: for each already-linked artist, run both extractors
# and pause between artists.
for artist in art_DAO.find_by_linked(True):
    print("----------------------------------------")
    print(artist["_id"])
    first_sentence = nlp_utils.first_sentence(artist["text"])
    relex.extract_naitionalityAndType(first_sentence)

    res = graph_DAO.connections_from(artist["_id"], "KNOWS")
    connections = [record["b"]["label"] for record in res]

    # An artist can appear among its own connections; drop the self-reference.
    if artist["label"] in connections:
        connections.remove(artist["label"])

    sent_list = nlp_utils.sentences_with_connections(artist["text"], connections, ['inspire', 'influenc'])
    for sent in sent_list:
        relex.extract_influencers(sent, connections)

    input("Press Enter to continue...")

Example #9
    def __init__(self):
        # Assumes module-level imports: os, pandas as pd, spacy, and
        # `from spacy.matcher import Matcher` (not shown in this fragment).
        path = os.path.dirname(os.path.realpath(__file__))
        self.df = pd.read_csv(os.path.join(path, "../data/countries.csv"))
        self.utils = nlpUtils()
        self.nlp = spacy.load("en_core_web_sm")

        # Matches "is a/an <nationality> <profession>" constructions,
        # e.g. "is an American singer" or "is a French painter and sculptor".
        self.nationality_matcher = Matcher(self.nlp.vocab)
        nat_pattern = [
            [
                {"LEMMA": "be"},
                {"POS": "DET"},
                {"ENT_TYPE": {"IN": ["GPE", "NORP", "LANGUAGE"]}, "OP": "*"},
                {"POS": {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]}, "OP": "*"},
                {"POS": {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
            ],
            [
                {"LEMMA": "be"},
                {"POS": "DET"},
                {"ENT_TYPE": {"IN": ["GPE", "NORP", "LANGUAGE"]}, "OP": "*"},
                {"DEP": {"IN": ["punct", "compound", "amod", "nmod"]}, "OP": "*"},
                {"POS": "NOUN"},
                {"POS": {"IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]}, "OP": "*"},
                {"ORTH": "and"},
                {"POS": {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]}, "OP": "*"},
                {"POS": {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
            ],
        ]
        self.nationality_matcher.add("nationality", nat_pattern)

        # Each "influenceN" entry matches one common phrasing of the
        # "influenced by" relation in Wikipedia prose.
        self.influence_matcher = Matcher(self.nlp.vocab)

        influence1 = [[
            {"LEMMA": {"IN": ["inspire", "influence"]}, "POS": "VERB"},
            {"ORTH": "by"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence1", influence1)

        influence2 = [
            [
                {"LEMMA": {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": "VERB"},
                {"OP": "*"},
                {"LEMMA": {"IN": ["as", "among"]}},
                {"OP": "*"},
                {"LEMMA": "influence", "POS": "NOUN"},
                {"OP": "*"},
            ],
            [
                {"LEMMA": {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": "VERB"},
                {"OP": "*"},
                {"LEMMA": "be"},
                {"OP": "*"},
                {"LEMMA": "influence", "POS": "NOUN"},
            ],
        ]
        self.influence_matcher.add("influence2", influence2)

        influence3 = [[
            {"LEMMA": "influence", "POS": "NOUN"},
            {"ORTH": "include", "POS": "VERB"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence3", influence3)

        influence4 = [[
            {"ORTH": "influences", "POS": "NOUN"},
            {"ORTH": "cited"},
            {"ORTH": "by"},
            {"OP": "*"},
            {"ORTH": "include", "POS": "VERB"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence4", influence4)

        influence5 = [[
            {"LEMMA": "cite", "POS": "VERB"},
            {"ORTH": ","},
            {"ORTH": "as"},
            {"OP": "*"},
            {"ORTH": "influences", "POS": "NOUN"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence5", influence5)

        influence6 = [[
            {"LEMMA": "state", "POS": "VERB"},
            {"OP": "*"},
            {"LEMMA": "influence", "POS": "NOUN"},
            {"LEMMA": "be"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence6", influence6)

        influence7 = [[
            {"ORTH": "influences", "POS": "NOUN"},
            {"ORTH": "?"},  # matches a literal "?" token (possibly intended as {"OP": "?"})
            {"ORTH": "such"},
            {"ORTH": "as"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence7", influence7)

        influence8 = [[
            {"LEMMA": {"IN": ["cite", "name"]}, "POS": "VERB"},
            {"OP": "*"},
            {"ORTH": "as"},
            {"ORTH": "one"},
            {"ORTH": "of"},
            {"OP": "*"},
            {"ORTH": "'s"},
            {"LEMMA": "influence", "POS": "NOUN"},
        ]]
        self.influence_matcher.add("influence8", influence8)

        influence9 = [[
            {"LEMMA": "influence", "POS": "NOUN"},
            {"ORTH": "including"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence9", influence9)

        influence10 = [[
            {"LEMMA": "influence", "POS": "NOUN"},
            {"OP": "*"},
            {"ORTH": "from"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence10", influence10)

        influence11 = [[
            {"ORTH": "citing", "POS": "VERB"},
            {"ORTH": "as"},
            {"LEMMA": "influence", "POS": "NOUN"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence11", influence11)

        influence12 = [[
            {"LEMMA": "influence", "POS": "NOUN"},
            {"LEMMA": "be"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence12", influence12)

        influence13 = [[
            {"LEMMA": "influence", "POS": "NOUN"},
            {"ORTH": "of"},
            {"OP": "*"},
        ]]
        self.influence_matcher.add("influence13", influence13)

        influence14 = [
            [
                {"LEMMA": "inspiration", "POS": "NOUN"},
                {"ORTH": {"IN": ["from", "include"]}},
                {"OP": "*"},
            ],
            [
                {"LEMMA": "cite", "POS": "VERB"},
                {"OP": "*"},
                {"ORTH": "as"},
                {"LEMMA": "inspiration", "POS": "NOUN"},
            ],
        ]
        self.influence_matcher.add("influence14", influence14)

        # Map matcher hash IDs back to the readable pattern names.
        self.mappa = {
            self.nlp.vocab.strings[f"influence{i}"]: f"influence{i}"
            for i in range(1, 15)
        }
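
A minimal sketch of how these matchers would be applied, assuming extractor is an instance of the class this __init__ belongs to (the variable name is hypothetical):

doc = extractor.nlp("He was influenced by Miles Davis and John Coltrane.")
for match_id, start, end in extractor.influence_matcher(doc):
    # mappa maps the match hash back to a pattern name like "influence1"
    print(extractor.mappa[match_id], "->", doc[start:end].text)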