def get_solr_artwork_data(self, query, multiple=False):
    # Numeric arguments are treated as document ids; anything else is passed
    # through to Solr as a raw query string.
    query_solr_final = Solrindex('http://localhost:8983/solr/ND_final')
    if functions.represents_int(query):
        query = 'id:' + str(query)
    result = query_solr_final.search(query)
    if len(result.docs) > 0:
        if multiple:
            return result.docs
        return result.docs[0]
    return False
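# `functions.represents_int` is defined elsewhere in this project; the code in
# this file only relies on it answering "does this value parse as a base-10
# integer?". A minimal sketch of that assumed contract (not the project's
# actual implementation; the name is hypothetical):
def represents_int_sketch(value):
    """Return True if int(value) succeeds, False otherwise."""
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False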
def update_characters_data(self):
    # Refresh description_wp for every processed character that has a
    # Wikipedia and/or Museo del Prado URL attached.
    query = "SELECT processed_characters.id, processed_characters.label, processed_characters.id_entity, " \
            "dbp_urls1.id AS id_wp, dbp_urls1.url AS url_wp, dbp_urls2.id AS id_mp, dbp_urls2.url AS url_mp " \
            "FROM processed_characters " \
            "LEFT JOIN dbp_urls dbp_urls1 ON dbp_urls1.id = processed_characters.id_dbp_urls_wikipedia " \
            "LEFT JOIN dbp_urls dbp_urls2 ON dbp_urls2.id = processed_characters.id_dbp_urls_museodelprado " \
            "WHERE (dbp_urls1.id IS NOT NULL OR dbp_urls2.url IS NOT NULL) AND (processed_characters.id > 1579)"
    results = MysqlND.execute_query(query, ())
    for res in results:
        id, label, id_entity, id_wp, url_wp, id_mp, url_mp = res[0], res[1], res[2], res[3], res[4], res[5], res[6]
        data = {
            'id': id,
            'label': label,
            'fecha_nacimiento': '',
            'fecha_fallecimiento': '',
            'lugar_nacimiento': '',
            'lugar_fallecimiento': '',
            'url_s': '',
            'image': '',
            'description_mp': '',
            'description_wp': '',
        }
        es_wp = functions.represents_int(id_wp)
        es_mp = functions.represents_int(id_mp)  # only feeds the commented-out Prado block below
        if es_wp:
            query = 'id:' + str(id_wp)
            self.update_image_and_dates_from_wp_character(id_entity)
            query_solr_final = Solrindex('http://localhost:8983/solr/ND_preprocessing')
            result = query_solr_final.search(query)
            if len(result.docs) > 0:
                doc = result.docs[0]
                texto = ''
                for key, value in doc.items():  # was iteritems() (Python 2)
                    if '_txt_es' in key:
                        titulo = key.replace("_txt_es", "").replace("_", " ").replace("string", "").capitalize()
                        texto += '<h3 class="wp_title">' + titulo + '</h3><p class="wp_paragraph">' \
                                 + functions.cleanhtml(value) + '</p>'
                data['description_wp'] = texto
        elif id_entity != -1:
            preprocess_information = PreprocessInformation()
            values = preprocess_information.search_label_in_wikidata_and_wikipedia(label)
            if values:
                # NB: values[7] is also unpacked into wikidata_label, shadowing values[3].
                id_wikipedia_dbp_urls, id_url_wikipedia_author, type, wikidata_label, \
                    wikidata_instancia_de_id_entity, wikidata_instancia_de_label, \
                    wikidata_id_entity, wikidata_label = values[0], values[1], values[2], values[3], \
                                                         values[4], values[5], values[6], values[7]
                data['description'] = ''
                if id_wikipedia_dbp_urls != -1:
                    query = 'id:' + str(id_wikipedia_dbp_urls)
                    query_solr_final = Solrindex('http://localhost:8983/solr/ND_preprocessing')
                    result = query_solr_final.search(query)
                    if len(result.docs) > 0:
                        doc = result.docs[0]
                        texto = ''
                        for key, value in doc.items():
                            if '_string_txt_es' in key:
                                titulo = key.replace("_string_txt_es", "").replace("description", "descripción") \
                                            .replace("_", " ").capitalize()
                                texto += '<h3 class="wp_title">' + titulo + '</h3><p class="wp_paragraph">' \
                                         + functions.cleanhtml(value) + '</p>'
                        data['description_wp'] = texto
        # if es_mp:
        #     query = 'id:' + str(id_mp)
        #     query_solr_final = Solrindex('http://localhost:8983/solr/ND_preprocessing_pruebas')
        #     result = query_solr_final.search(query)
        #     if len(result.docs) > 0:
        #         doc = result.docs[0]
        #         if 'p96_E67_p4_gave_birth_date_d' in doc:
        #             data['fecha_nacimiento'] = doc['p96_E67_p4_gave_birth_date_d']
        #         if 'p100i_E69_p4_death_date_d' in doc:
        #             data['fecha_fallecimiento'] = doc['p100i_E69_p4_death_date_d']
        #         if 'p96_E67_p7_gave_birth_place_s' in doc:
        #             data['lugar_nacimiento'] = doc['p96_E67_p7_gave_birth_place_s']
        #         if 'p100i_E69_p7_death_place_s' in doc:
        #             data['lugar_fallecimiento'] = doc['p100i_E69_p7_death_place_s']
        #         if 'p3_has_note_s' in doc:
        #             data['description_mp'] = doc['p3_has_note_s']
        #         if 'url_s' in doc:
        #             data['url_s'] = doc['url_s']
        #             hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
        #                                  '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        #                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        #                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        #                    'Accept-Encoding': 'none',
        #                    'Accept-Language': 'en-US,en;q=0.8',
        #                    'Connection': 'keep-alive'}
        #             req = urllib2.Request(doc['url_s'], headers=hdr)
        #             try:
        #                 page = urllib2.urlopen(req)
        #             except urllib2.HTTPError, e:
        #                 print e.fp.read()
        #             content = page.read()
        #             soup = BeautifulSoup(content, 'html.parser')
        #             image_s = soup.find('meta', attrs={"name": "twitter:image"})
        #             if image_s:
        #                 data['image'] = image_s['content']
        #             else:
        #                 print("No image found for " + doc['url_s'])
        # update = "UPDATE processed_characters SET fecha_nacimiento=%s, fecha_fallecimiento=%s, " \
        #          "lugar_nacimiento=%s, lugar_fallecimiento=%s, description_mp=%s, " \
        #          "description_wp=%s, image=%s WHERE id=%s"
        # pprint(update)
        # MysqlND.execute_query(update, (data['fecha_nacimiento'], data['fecha_fallecimiento'],
        #                                data['lugar_nacimiento'], data['lugar_fallecimiento'],
        #                                data['description_mp'], data['description_wp'],
        #                                data['image'], id))
        # Parameterize the id as well instead of concatenating it into the SQL string.
        update = "UPDATE processed_characters SET description_wp=%s WHERE id=%s"
        pprint(update)
        MysqlND.execute_query(update, (data['description_wp'], id))
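# The block commented out above is Python 2 (urllib2, `except ..., e`, bare
# print). If the Prado image scrape is ever revived, a Python 3 sketch of the
# same twitter:image lookup could look like this; the helper name and the
# reduced header set are assumptions, and the page structure is assumed
# unchanged:
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

def fetch_twitter_image_sketch(url):
    """Return the twitter:image meta content of `url`, or '' when absent."""
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
           'Accept-Language': 'en-US,en;q=0.8'}
    req = urllib.request.Request(url, headers=hdr)
    try:
        content = urllib.request.urlopen(req).read()
    except urllib.error.HTTPError as e:
        print(e.read())
        return ''
    soup = BeautifulSoup(content, 'html.parser')
    meta = soup.find('meta', attrs={"name": "twitter:image"})
    return meta['content'] if meta else ''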
def process_mp_description(self, description, id_museodelprado=-1):
    array_elements_processed = []
    # 1. Get all tagged keywords: every <em> in the description is resolved to
    # a known reference and rewritten as an <a> carrying the entity id.
    soup = BeautifulSoup(description, 'html.parser')
    paragraphs = soup.find("p")
    if paragraphs is None:
        # No <p> found: fetch the Museo del Prado description by id instead.
        soup = BeautifulSoup(self.get_mp_description_from_id(id_museodelprado), 'html.parser')
        description = str(soup)
    for tag in soup.find_all('em'):
        tag_name = tag.getText()
        id, type = self.preprocess.create_reference(
            tag_name, is_string=True, createIfNotExistsWikidataEntity=False)
        tag.name = 'a'
        tag.attrs = {}
        if id != -1:
            array_elements_processed.append(tag_name)
            if type == 2:
                tag["id_artwork"] = str(id)
            elif type == 3 or type == 6:
                tag["id_character"] = str(id)
            elif type == 8:
                tag["id_event"] = str(id)
            else:
                tag["id_reference"] = str(id)
        else:
            tag.name = 'em'  # unresolved: revert to the original tag
    description = str(soup)
    # description_str = soup.getText()
    description_str = description
    # 2. Find concepts from the list of narrative elements and link them in
    # place with a case-insensitive, whole-word substitution.
    narrative_elements = self.get_list_of_narrative_elements()
    array_ne = [
        x for x in narrative_elements
        if x[0] in description_str and len(x[0]) > 3 and not represents_int(x[0])
    ]
    # pprint(array_ne)
    for label, id, id_entity, table in array_ne:
        if label not in array_elements_processed:
            # print(label)
            if table == 'processed_characters':
                insensitive_hippo = re.compile(re.escape(" " + label + " "), re.IGNORECASE)
                description_str = insensitive_hippo.sub(
                    ' <a id_character="' + str(id) + '">' + str(label) + '</a> ', description_str)
            elif table in ('wikidata_entidades', 'wikidata_instancias_de'):
                # Both Wikidata tables resolve through the same lookup.
                data = self.get_id_and_table_from_id_entity(id, id_entity, label)
                if data:
                    id, label_, type_table = data[0], data[1], data[2]
                    insensitive_hippo = re.compile(re.escape(" " + label + " "), re.IGNORECASE)
                    description_str = insensitive_hippo.sub(
                        ' <a ' + type_table + '="' + str(id) + '">' + str(label) + '</a> ', description_str)
            elif table == 'processed_references':
                insensitive_hippo = re.compile(re.escape(" " + label + " "), re.IGNORECASE)
                description_str = insensitive_hippo.sub(
                    ' <a id_reference="' + str(id) + '">' + str(label) + '</a> ', description_str)
            array_elements_processed.append(label)
    # 3. Stanford NER pass over the remaining text.
    nlp = StanfordCoreNLP(r'C:\stanford-corenlp-full-2018-02-27', lang='es')
    ner = nlp.ner(description_str)
    nlp.close()
    # Merge consecutive tokens that share the same NER tag into one phrase.
    last_word = ''
    array_processed = []
    i = 0
    for c in ner:
        word = c[0]
        type = c[1]
        if len(ner) == i + 1:
            next_type = ''
        else:
            next_type = ner[i + 1][1]
        last_word = last_word + " " + word
        if next_type != type:
            array_processed.append([last_word, type])
            last_word = ''
        i = i + 1
    array_ner = [x for x in array_processed if x[1] != 'O']
    array_ner = [x for x in array_ner if x[0] not in array_elements_processed]
    # pprint(array_ner)
    people = [x for x in array_ner if x[1] == 'PERSON']
    numbers = [x for x in array_ner if x[1] == 'NUMBER']
    dates = [x for x in array_ner if x[1] == 'DATE']
    references = [x for x in array_ner if x[1] not in ('DATE', 'NUMBER', 'PERSON')]
    people = self.remove_redundant_ner_elements(people)
    numbers = self.remove_redundant_ner_elements(numbers)
    dates = self.remove_redundant_ner_elements(dates)
    references = self.remove_redundant_ner_elements(references)
    for person in people:
        person_name = person[0].strip()
        data = self.preprocess.exists_already_label(person_name)
        if data:
            id, type = data[0], data[1]
        else:
            data = self.preprocess.search_label_in_wikipedia(person_name)
            if data:
                label, type, url, id_entity, id_instancia_de, id_ = \
                    data[0], data[1], data[2], data[3], data[4], data[5]
                if id_instancia_de == '812':
                    id = self.preprocess.add_character(type, label, -1, url, id_entity, id_instancia_de)
                else:
                    id = -1
        if data and id != -1:
            if type == 3 or type == 6:
                description_str = description_str.replace(
                    " " + person_name + " ",
                    ' <a id_character="' + str(id) + '">' + str(person_name) + '</a> ')
    for w in numbers:
        name = w[0].strip()
        if len(name) == 4 and represents_int(name):  # four-digit numbers are treated as years
            data = self.preprocess.create_reference(
                name, is_string=True, createIfNotExistsWikidataEntity=False)
            if data:
                id, type = data[0], data[1]
                if type == 8 and id != -1:
                    description_str = description_str.replace(
                        " " + name + " ",
                        ' <a id_event="' + str(id) + '">' + name + '</a> ')
    for w in dates:
        name = w[0].strip()
        if len(name) == 4 and represents_int(name):
            data = self.preprocess.create_reference(
                name, is_string=True, createIfNotExistsWikidataEntity=False)
            if data:
                id, type = data[0], data[1]
                if type == 8 and id != -1:
                    # str(id): the original concatenated the raw int, a TypeError.
                    description_str = description_str.replace(
                        " " + name + " ",
                        ' <a id_event="' + str(id) + '">' + name + '</a> ')
    for w in references:
        name = w[0].strip()
        if len(name) > 3:
            data = self.preprocess.exists_already_label(name)
            if data:
                id, type = data[0], data[1]
            else:
                data = self.preprocess.search_label_in_wikipedia(name)
                if data:
                    label, type, url, id_entity, id_instancia_de, id_ = \
                        data[0], data[1], data[2], data[3], data[4], data[5]
                    id = self.preprocess.add_reference(type, label, url, id_entity, id_instancia_de)
            if data:
                if type == 7:
                    description_str = description_str.replace(
                        " " + name + " ",
                        ' <a id_reference="' + str(id) + '">' + str(name) + '</a> ')
    return description_str
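# The index-juggling loop in step 3 merges consecutive CoreNLP tokens that
# share an NER tag into a single phrase. The same grouping can be expressed
# with itertools.groupby; a standalone sketch on plain (token, tag) pairs,
# offered as an equivalent formulation rather than a drop-in refactor:
from itertools import groupby

def merge_ner_tokens(ner):
    """[('Diego', 'PERSON'), ('Velázquez', 'PERSON'), ('pintó', 'O')]
    -> [[' Diego Velázquez', 'PERSON'], [' pintó', 'O']]"""
    merged = []
    for tag, group in groupby(ner, key=lambda pair: pair[1]):
        # The leading space mirrors the accumulator in the loop above.
        phrase = ' ' + ' '.join(token for token, _ in group)
        merged.append([phrase, tag])
    return merged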