def preprocessing(df, featurize):
    """Return the texts and labels of `df` as lists, optionally featurizing each text."""
    nlp_utils = nlpUtils()
    if featurize:
        # The first column is assumed to be the "text" column.
        for i in range(len(df)):
            df.iloc[i, 0] = nlp_utils.featurize(df.iloc[i, 0])
    return df["text"].tolist(), df["label"].tolist()
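
# Usage sketch (illustrative only, not part of the original pipeline):
# `preprocessing` assumes a DataFrame whose first column is "text" and which
# also carries a "label" column; the sample data below is made up.
#
#   import pandas as pd
#   df = pd.DataFrame({"text": ["He was influenced by Picasso."], "label": [1]})
#   texts, labels = preprocessing(df, featurize=True)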
def positives_2():
    """Positive examples: sentences mentioning artists already linked by an "inflooenz_by" relation."""
    nlp_utils = nlpUtils()
    positives = list()
    for artist in bufferize_articles():
        inflooenzers = list()
        for el in graph_DAO.linked_to(artist["_id"], "inflooenz_by", False):
            inflooenzers.append(el[1])
        print(inflooenzers)
        if len(inflooenzers) > 0:
            positives.extend(
                nlp_utils.sentences_with_connections(artist["text"], inflooenzers, None))
    print(len(positives))
    return positives
def negatives_2():
    """Negative examples: sentences mentioning connections that are not known influencers."""
    nlp_utils = nlpUtils()
    negatives = list()
    for artist in bufferize_articles():
        connections = set()
        inflooenzers = set()
        for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False):
            connections.add(el[1])
        for el in graph_DAO.linked_to(artist["_id"], "inflooenz_by", False):
            inflooenzers.add(el[1])
        not_inflooenzers = connections - inflooenzers
        # Only artists with at least one known influencer are used, so sentences
        # about their remaining connections are unlikely to be influence statements.
        if len(inflooenzers) > 0:
            negatives.extend(
                nlp_utils.sentences_with_connections(artist["text"], not_inflooenzers, None))
    print(len(negatives))
    return negatives
def positives_1():
    """Positive examples: sentences with connections taken from the "influence" sections."""
    nlp_utils = nlpUtils()
    positives = list()
    for artist in bufferize_articles():
        style_sections = select_section(artist["text"], ["influenc"], False)
        if len(style_sections) > 0:
            # print(artist["_id"])
            connections = list()
            for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False):
                connections.append(el[1])
            # print(connections)
            for section in style_sections:
                positives.extend(
                    nlp_utils.sentences_with_connections(section, connections, None))
    print(len(positives))
    return positives
def negatives_1():
    """Negative examples: long sentences with connections taken from outside the "influence" sections."""
    nlp_utils = nlpUtils()
    negatives = list()
    for artist in bufferize_articles():
        # Only use artists that actually have an "influence" section.
        if len(select_section(artist["text"], ["influenc"], False)) > 0:
            not_style_sections = select_section(artist["text"], ["influenc"], True)
            if len(not_style_sections) > 0:
                connections = list()
                for el in graph_DAO.linked_to(artist["_id"], "KNOWS", False):
                    connections.append(el[1])
                for section in not_style_sections:
                    for sentence in nlp_utils.sentences_with_connections(section, connections, None):
                        if len(sentence) > 70:
                            negatives.append(sentence)
                            print(" # " + sentence)
    print(len(negatives))
    return negatives
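
# Wiring sketch (assumed, not part of the original pipeline): the four builders
# above can feed a labeled training set for the sentence classifier; the
# DataFrame layout mirrors what `preprocessing` expects.
#
#   import pandas as pd
#   pos = positives_1() + positives_2()
#   neg = negatives_1() + negatives_2()
#   df = pd.DataFrame({"text": pos + neg,
#                      "label": [1] * len(pos) + [0] * len(neg)})
#   texts, labels = preprocessing(df, featurize=True)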
def link_artists_distant_supervision():
    """Link artists to their influencers with the ML classifier and load the relations on neo4j."""
    nlp_utils = nlpUtils()
    relex = Relation_Extractor()
    predictor = Predictor()
    duplicates = art_DAO.duplicate_labels()
    buffer = list()
    waiting4linking = list()
    buffer_dim = 100
    try:
        for artist in art_DAO.find_by_linked_ML(False):
            print("----_" + artist["label_ext"])
            connections = graph_DAO.related_to(artist["_id"], False)
            connections_only_label = list()
            for el in connections:
                connections_only_label.append(el[1])
            influencers = set()
            # Not restricted to "inspired"/"influenced" sentences: the ML model is
            # supposed to work with every surface form of the influenced_by relation.
            sent_list = nlp_utils.sentences_with_connections(
                artist["text"], connections_only_label, None)
            for sent in sent_list:
                sent_featurized = nlp_utils.featurize(sent)
                prediction = predictor.predict_sentence(sent_featurized)[0]
                if prediction == 1:
                    sent_influencers = relex.get_artist_from_sentence(
                        sent, connections_only_label)
                    if sent_influencers is not None:
                        influencers.update(sent_influencers)
            for influencer in influencers:
                influencer_URI = get_URI_disambiguated(influencer, connections, duplicates)
                print(influencer_URI)
                buffer.append((artist["_id"], influencer_URI))
            waiting4linking.append(artist)
            if len(waiting4linking) > buffer_dim:
                print("------------------- loading relations about %d artists on neo4j" % buffer_dim)
                process_buffer(buffer, waiting4linking, "inf_ML_by")
        print("--------------------- loading last relations on neo4j")
        process_buffer(buffer, waiting4linking, "inf_ML_by")
    except pymongo.errors.CursorNotFound:
        # The Mongo cursor can expire on long scans: restart from the artists
        # that are still marked as not linked.
        print('------------------- CursorNotFound Error: ' + artist['_id'])
        link_artists_distant_supervision()
def link_artists_patterns():
    """Link artists to their influencers with the spaCy patterns and load the relations on neo4j."""
    relex = Relation_Extractor()
    nlp_utils = nlpUtils()
    duplicates = art_DAO.duplicate_labels()
    buffer = list()
    waiting4linking = list()
    buffer_dim = 100
    try:
        for artist in art_DAO.find_by_linked_patterns(False):
            print("----_" + artist["label_ext"])
            connections = graph_DAO.related_to(artist["_id"], False)
            connections_only_label = list()
            for el in connections:
                connections_only_label.append(el[1])
            influencers = set()
            sent_list = nlp_utils.sentences_with_connections(
                artist["text"], connections_only_label, ['inspire', 'influenc'])
            for sent in sent_list:
                sent_influencers = relex.extract_influencers(sent, connections_only_label)
                if sent_influencers is not None:
                    influencers.update(sent_influencers)
            for influencer in influencers:
                influencer_URI = get_URI_disambiguated(influencer, connections, duplicates)
                print(influencer_URI)
                buffer.append((artist["_id"], influencer_URI))
            waiting4linking.append(artist)
            if len(waiting4linking) > buffer_dim:
                print("------------------- loading relations about %d artists on neo4j" % buffer_dim)
                process_buffer(buffer, waiting4linking, "inf_patterns_by")
        print("--------------------- loading last relations on neo4j")
        process_buffer(buffer, waiting4linking, "inf_patterns_by")
    except pymongo.errors.CursorNotFound:
        print('------------------- CursorNotFound Error: ' + artist['_id'])
        link_artists_patterns()
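
# Driver sketch (assumed entry point): both linkers resume from artists not yet
# marked as linked, so they can simply be run back to back.
#
#   if __name__ == "__main__":
#       link_artists_patterns()
#       link_artists_distant_supervision()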
from information_extraction.Relation_Extractor import Relation_Extractor
from db_data import WikiArtistDAO as art_DAO
from db_data import GraphDAO as graph_DAO
from information_extraction.nlpUtils import nlpUtils

relex = Relation_Extractor()
nlp_utils = nlpUtils()

# Interactive check of the extractors, one linked artist at a time.
for artist in art_DAO.find_by_linked(True):
    print("----------------------------------------")
    print(artist["_id"])
    first_sentence = nlp_utils.first_sentence(artist["text"])
    relex.extract_naitionalityAndType(first_sentence)
    res = graph_DAO.connections_from(artist["_id"], "KNOWS")
    connections = list()
    for record in res:
        connections.append(record["b"]["label"])
    # An artist can appear among its own connections: drop the self-reference.
    if artist["label"] in connections:
        connections.remove(artist["label"])
    sent_list = nlp_utils.sentences_with_connections(
        artist["text"], connections, ['inspire', 'influenc'])
    for sent in sent_list:
        relex.extract_influencers(sent, connections)
    input("type something")

'''
frasi = list()
def __init__(self):
    path = os.path.dirname(os.path.realpath(__file__))
    self.df = pd.read_csv(os.path.join(path, "../data/countries.csv"))
    self.utils = nlpUtils()
    self.nlp = spacy.load("en_core_web_sm")

    # Matcher for nationality/type statements such as "is an American painter".
    self.nationality_matcher = Matcher(self.nlp.vocab)
    nat_pattern = list()
    nat_pattern.append([
        {'LEMMA': 'be'},
        {'POS': 'DET'},
        {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
        {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]}, "OP": "*"},
        {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
    ])
    # Variant with a coordination, e.g. "is an American singer and songwriter".
    nat_pattern.append([
        {'LEMMA': 'be'},
        {'POS': 'DET'},
        {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
        {"DEP": {"IN": ["punct", "compound", "amod", "nmod"]}, "OP": "*"},
        {'POS': 'NOUN'},
        {"POS": {"IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]}, "OP": "*"},
        {'ORTH': 'and'},
        {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]}, "OP": "*"},
        {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
    ])
    self.nationality_matcher.add("nationality", nat_pattern)

    # Matcher for influence statements; each influenceN covers one surface form.
    self.influence_matcher = Matcher(self.nlp.vocab)

    # influence1: "inspired/influenced by ..."
    influence1 = [[
        {'LEMMA': {"IN": ["inspire", "influence"]}, "POS": 'VERB'},
        {'ORTH': 'by'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence1", influence1)

    # influence2: "cited/referred/... X as/among ... influences",
    # and "cited/... X ... to be ... an influence".
    influence2 = list()
    influence2.append([
        {'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
        {"OP": "*"},
        {'LEMMA': {"IN": ["as", "among"]}},
        {"OP": "*"},
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {"OP": "*"},
    ])
    influence2.append([
        {'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
        {"OP": "*"},
        {'LEMMA': 'be'},
        {"OP": "*"},
        {'LEMMA': 'influence', "POS": 'NOUN'},
    ])
    self.influence_matcher.add("influence2", influence2)

    # influence3: "influences include ..."
    influence3 = [[
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {'ORTH': 'include', "POS": 'VERB'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence3", influence3)

    # influence4: "influences cited by ... include ..."
    influence4 = [[
        {'ORTH': 'influences', "POS": 'NOUN'},
        {'ORTH': 'cited'},
        {'ORTH': 'by'},
        {"OP": "*"},
        {'ORTH': 'include', "POS": 'VERB'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence4", influence4)

    # influence5: "cited, as ... influences ..."
    influence5 = [[
        {'LEMMA': 'cite', "POS": 'VERB'},
        {'ORTH': ','},
        {"ORTH": "as"},
        {"OP": "*"},
        {'ORTH': 'influences', "POS": 'NOUN'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence5", influence5)

    # influence6: "stated ... influence was ..."
    influence6 = [[
        {'LEMMA': 'state', "POS": 'VERB'},
        {"OP": "*"},
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {'LEMMA': 'be'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence6", influence6)

    # influence7: "influences such as ...", allowing one optional token in between.
    influence7 = [[
        {'ORTH': 'influences', "POS": 'NOUN'},
        {"OP": "?"},
        {"ORTH": "such"},
        {"ORTH": "as"},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence7", influence7)

    # influence8: "cited/named ... as one of ...'s influences"
    influence8 = [[
        {'LEMMA': {"IN": ["cite", "name"]}, "POS": "VERB"},
        {"OP": "*"},
        {"ORTH": "as"},
        {"ORTH": "one"},
        {"ORTH": "of"},
        {"OP": "*"},
        {"ORTH": "'s"},
        {'LEMMA': 'influence', "POS": 'NOUN'},
    ]]
    self.influence_matcher.add("influence8", influence8)

    # influence9: "influences including ..."
    influence9 = [[
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {"ORTH": "including"},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence9", influence9)

    # influence10: "influence ... from ..."
    influence10 = [[
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {"OP": "*"},
        {"ORTH": "from"},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence10", influence10)

    # influence11: "citing as influences ..."
    influence11 = [[
        {'ORTH': 'citing', "POS": 'VERB'},
        {"ORTH": "as"},
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence11", influence11)

    # influence12: "influences are/were ..."
    influence12 = [[
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {'LEMMA': 'be'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence12", influence12)

    # influence13: "influence of ..."
    influence13 = [[
        {'LEMMA': 'influence', "POS": 'NOUN'},
        {'ORTH': 'of'},
        {"OP": "*"},
    ]]
    self.influence_matcher.add("influence13", influence13)

    # influence14: "inspiration from/include ..." and "cited ... as inspiration".
    influence14 = list()
    influence14.append([
        {'LEMMA': 'inspiration', "POS": 'NOUN'},
        {'ORTH': {"IN": ["from", "include"]}},
        {"OP": "*"},
    ])
    influence14.append([
        {'LEMMA': 'cite', "POS": 'VERB'},
        {"OP": "*"},
        {"ORTH": "as"},
        {'LEMMA': 'inspiration', "POS": 'NOUN'},
    ])
    self.influence_matcher.add("influence14", influence14)

    # Map the hash of each pattern name back to its readable name.
    self.mappa = dict()
    for name in ["influence%d" % i for i in range(1, 15)]:
        self.mappa[self.nlp.vocab.strings[name]] = name
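
# Usage sketch (assumed): the matchers built above can be exercised directly on
# a parsed sentence; the example text is made up.
#
#   relex = Relation_Extractor()
#   doc = relex.nlp("She cited Nina Simone and Billie Holiday as influences.")
#   for match_id, start, end in relex.influence_matcher(doc):
#       print(relex.mappa[match_id], doc[start:end].text)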