    def get_solr_artwork_data(self, query, multiple=False):
        query_solr_final = Solrindex('http://localhost:8983/solr/ND_final')
        if functions.represents_int(query):
            query = 'id:' + str(query)
        result = query_solr_final.search(query)
        if len(result.docs) > 0:
            if multiple:
                return result.docs
            else:
                return result.docs[0]
        return False
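    # Usage sketch (hypothetical caller, not part of this example): a numeric
    # argument is treated as a document id, anything else as a raw Solr query
    # ('author_s' below is an assumed field name).
    #   doc = self.get_solr_artwork_data(1024)  # single doc dict, or False
    #   docs = self.get_solr_artwork_data('author_s:Goya', multiple=True)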
    def update_characters_data(self):
        query = "SELECT processed_characters.id, processed_characters.label, processed_characters.id_entity, dbp_urls1.id AS id_wp, dbp_urls1.url AS url_wp, dbp_urls2.id AS id_mp, dbp_urls2.url AS url_mp FROM processed_characters " \
                "LEFT JOIN dbp_urls dbp_urls1 ON dbp_urls1.id = processed_characters.id_dbp_urls_wikipedia " \
                "LEFT JOIN dbp_urls dbp_urls2 ON dbp_urls2.id = processed_characters.id_dbp_urls_museodelprado " \
                "WHERE (dbp_urls1.id IS NOT NULL OR dbp_urls2.url IS NOT NULL) and  (processed_characters.id> 1579)"
        results = MysqlND.execute_query(query, ())
        for res in results:
            data = {}
            id, label, id_entity, id_wp, url_wp, id_mp, url_mp = res

            data['id'] = id
            data['label'] = label
            data['fecha_nacimiento'] = ''
            data['fecha_fallecimiento'] = ''
            data['lugar_nacimiento'] = ''
            data['lugar_fallecimiento'] = ''
            data['url_s'] = ''
            data['image'] = ''
            data['description_mp'] = ''
            data['description_wp'] = ''
            es_wp = bool(functions.represents_int(id_wp))
            es_mp = bool(functions.represents_int(id_mp))

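            # Wikipedia branch: refresh the character's image and dates, then
            # build an HTML description from its Spanish-text Solr fields.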
            if es_wp:
                query = 'id:' + str(id_wp)
                self.update_image_and_dates_from_wp_character(id_entity)
                query_solr_final = Solrindex(
                    'http://localhost:8983/solr/ND_preprocessing')
                result = query_solr_final.search(query)
                if len(result.docs) > 0:
                    doc = result.docs[0]
                    texto = ''
                    for key, value in doc.iteritems():
                        if '_txt_es' in key:
                            title = key.replace("_txt_es", "").replace(
                                "_", " ").replace("string", "").capitalize()
                            texto += ('<h3 class="wp_title">' + title +
                                      '</h3><p class="wp_paragraph">' +
                                      functions.cleanhtml(value) + '</p>')
                    data['description_wp'] = texto
                else:
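                    # No preprocessing doc for the stored Wikipedia id:
                    # re-resolve the label through Wikidata/Wikipedia.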
                    if id_entity != -1:
                        preprocess_information = PreprocessInformation()
                        values = preprocess_information.search_label_in_wikidata_and_wikipedia(
                            label)
                        if values:
                            (id_wikipedia_dbp_urls, id_url_wikipedia_author,
                             type, wikidata_label,
                             wikidata_instancia_de_id_entity,
                             wikidata_instancia_de_label, wikidata_id_entity,
                             wikidata_label) = values[:8]
                            # NOTE: wikidata_label is bound twice, so
                            # values[7] silently overwrites values[3].

                            data['description'] = ''  # unused; only description_wp is written back
                            if id_wikipedia_dbp_urls != -1:
                                query = 'id:' + str(id_wikipedia_dbp_urls)
                                query_solr_final = Solrindex(
                                    'http://localhost:8983/solr/ND_preprocessing'
                                )
                                result = query_solr_final.search(query)
                                if len(result.docs) > 0:
                                    doc = result.docs[0]
                                    texto = ''
                                    for key, value in doc.iteritems():
                                        if '_string_txt_es' in key:
                                            title = key.replace(
                                                "_string_txt_es", ""
                                            ).replace(
                                                "description", "descripción"
                                            ).replace("_", " ").capitalize()
                                            texto += (
                                                '<h3 class="wp_title">' +
                                                title +
                                                '</h3><p class="wp_paragraph">' +
                                                functions.cleanhtml(value) +
                                                '</p>')
                                    data['description_wp'] = texto

            # if es_mp:
            #     query = 'id:' + str(id_mp)
            #     query_solr_final = Solrindex('http://localhost:8983/solr/ND_preprocessing_pruebas')
            #     result = query_solr_final.search(query)
            #     if len(result.docs) > 0:
            #         doc = result.docs[0]
            #         if 'p96_E67_p4_gave_birth_date_d' in doc:
            #             data['fecha_nacimiento'] = doc['p96_E67_p4_gave_birth_date_d']
            #         if 'p100i_E69_p4_death_date_d' in doc:
            #             data['fecha_fallecimiento'] = doc['p100i_E69_p4_death_date_d']
            #         if 'p96_E67_p7_gave_birth_place_s' in doc:
            #             data['lugar_nacimiento'] = doc['p96_E67_p7_gave_birth_place_s']
            #         if 'p100i_E69_p7_death_place_s' in doc:
            #             data['lugar_fallecimiento'] = doc['p100i_E69_p7_death_place_s']
            #         if 'p3_has_note_s' in doc:
            #             data['description_mp'] = doc['p3_has_note_s']
            #         if 'url_s' in doc:
            #             data['url_s'] = doc['url_s']
            #             hdr = {
            #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            #                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            #                 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            #                 'Accept-Encoding': 'none',
            #                 'Accept-Language': 'en-US,en;q=0.8',
            #                 'Connection': 'keep-alive'}
            #             req = urllib2.Request(doc['url_s'], headers=hdr)
            #             try:
            #                 page = urllib2.urlopen(req)
            #             except urllib2.HTTPError, e:
            #                 print e.fp.read()
            #             content = page.read()
            #             soup = BeautifulSoup(content, 'html.parser')
            #             if soup.find('meta', attrs={"name": "twitter:image"}):
            #                 image_s = soup.find('meta', attrs={"name": "twitter:image"})
            #                 image_s = image_s['content']
            #                 data['image'] = image_s
            #             else:
            #                 print("NO ENCUENTRO imagen para " + doc['url_s'])
            # update = "UPDATE processed_characters SET fecha_nacimiento=%s,fecha_fallecimiento=%s,lugar_nacimiento=%s,lugar_fallecimiento=%s,description_mp=%s,description_wp=%s,image=%s WHERE id=" + str(id)
            # pprint(update)
            # MysqlND.execute_query(update, (data['fecha_nacimiento'], data['fecha_fallecimiento'], data['lugar_nacimiento'], data['lugar_fallecimiento'], data['description_mp'], data['description_wp'], data['image'],))
            update = "UPDATE processed_characters SET description_wp=%s WHERE id=" + str(
                id)
            pprint(update)
            MysqlND.execute_query(update, (data['description_wp'], ))
Example #3
    def process_mp_description(self, description, id_museodelprado=-1):
        array_elements_processed = []
        # 1. get all tagged keywords

        soup = BeautifulSoup(description, 'html.parser')
        paragraphs = soup.find("p")
        if paragraphs is None:
            # No <p> markup: fall back to the description stored for this
            # Museo del Prado id.
            soup = BeautifulSoup(
                self.get_mp_description_from_id(id_museodelprado),
                'html.parser')
            description = str(soup)

        for tag in soup.find_all('em'):
            tag_name = tag.getText()
            id, type = self.preprocess.create_reference(
                tag_name,
                is_string=True,
                createIfNotExistsWikidataEntity=False)
            tag.name = 'a'
            tag.attrs = {}
            if id != -1:
                array_elements_processed.append(tag_name)
                if type == 2:
                    tag["id_artwork"] = str(id)
                elif type == 3 or type == 6:
                    tag["id_character"] = str(id)
                elif type == 8:
                    tag["id_event"] = str(id)
                else:
                    tag["id_reference"] = str(id)
            else:
                tag.name = 'em'
        description = str(soup)

        # description_str = soup.getText()
        description_str = description
        # 2. find concepts by list of narrative elements
        narrative_elements = self.get_list_of_narrative_elements()
        array_ne = [
            x for x in narrative_elements if x[0] in description_str
            and len(x[0]) > 3 and not represents_int(x[0])
        ]
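        # array_ne now holds only labels that occur verbatim in the text, are
        # longer than 3 characters and are not bare numbers.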
        # pprint(array_ne)

        for label, id, id_entity, table in array_ne:
            if label not in array_elements_processed:
                # print(label)
                if table == 'processed_characters':
                    insensitive_hippo = re.compile(
                        re.escape(" " + label + " "), re.IGNORECASE)
                    description_str = insensitive_hippo.sub(
                        ' <a id_character="' + str(id) + '">' + str(label) +
                        '</a> ', description_str)
                elif table in ('wikidata_entidades', 'wikidata_instancias_de'):
                    data = self.get_id_and_table_from_id_entity(
                        id, id_entity, label)
                    if data:
                        id, label_, type_table = data[0], data[1], data[2]
                        insensitive_hippo = re.compile(
                            re.escape(" " + label + " "), re.IGNORECASE)
                        description_str = insensitive_hippo.sub(
                            ' <a ' + type_table + '="' + str(id) + '">' +
                            str(label) + '</a> ', description_str)
                elif table == 'processed_references':
                    insensitive_hippo = re.compile(
                        re.escape(" " + label + " "), re.IGNORECASE)
                    description_str = insensitive_hippo.sub(
                        ' <a id_reference="' + str(id) + '">' + str(label) +
                        '</a> ', description_str)
            array_elements_processed.append(label)

        # 3. Stanford POS tagger

        nlp = StanfordCoreNLP(r'C:\stanford-corenlp-full-2018-02-27',
                              lang='es')
        ner = nlp.ner(description_str)
        nlp.close()
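        # Collapse consecutive tokens that share the same NER tag into
        # multi-word mentions.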
        last_word = ''
        array_processed = []
        for i, (word, type) in enumerate(ner):
            next_type = ner[i + 1][1] if i + 1 < len(ner) else ''
            last_word = last_word + " " + word
            if next_type != type:
                array_processed.append([last_word, type])
                last_word = ''

        array_ner = [x for x in array_processed if x[1] != 'O']
        array_ner = [
            x for x in array_ner if x[0] not in array_elements_processed
        ]
        # pprint(array_ner)
        people = [x for x in array_ner if x[1] == 'PERSON']
        numbers = [x for x in array_ner if x[1] == 'NUMBER']
        dates = [x for x in array_ner if x[1] == 'DATE']
        references = [
            x for x in array_ner
            if x[1] != 'DATE' and x[1] != 'NUMBER' and x[1] != 'PERSON'
        ]

        people = self.remove_redundant_ner_elements(people)
        numbers = self.remove_redundant_ner_elements(numbers)
        dates = self.remove_redundant_ner_elements(dates)
        references = self.remove_redundant_ner_elements(references)

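        # Link each detected person: prefer a label that already exists
        # locally, otherwise look it up in Wikipedia and register it as a
        # character when its "instancia de" id is 812 (presumably the person
        # class).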
        for person in people:
            person_name = person[0].strip()
            data = self.preprocess.exists_already_label(person_name)
            if data:
                id, type = data[0], data[1]
            else:
                data = self.preprocess.search_label_in_wikipedia(person_name)
                if data:
                    label, type, url, id_entity, id_instancia_de, id_ = data[:6]
                    if id_instancia_de == '812':
                        id = self.preprocess.add_character(
                            type, label, -1, url, id_entity, id_instancia_de)
                    else:
                        id = -1
            if data and id != -1:
                if type == 3 or type == 6:
                    description_str = description_str.replace(
                        " " + person_name + " ", ' <a id_character="' +
                        str(id) + '">' + str(person_name) + '</a> ')

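        # NUMBER and DATE mentions are handled identically: four-digit strings
        # are treated as years and linked as events (type 8).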
        for w in numbers + dates:
            name = w[0].strip()
            if len(name) == 4 and represents_int(name):
                data = self.preprocess.create_reference(
                    name,
                    is_string=True,
                    createIfNotExistsWikidataEntity=False)
                if data:
                    id, type = data[0], data[1]
                    if type == 8 and id != -1:
                        description_str = description_str.replace(
                            " " + name + " ",
                            ' <a id_event="' + str(id) + '">' + name + '</a> ')

        for w in references:
            name = w[0].strip()
            if len(name) > 3:
                data = self.preprocess.exists_already_label(name)
                if data:
                    id, type = data[0], data[1]
                else:
                    data = self.preprocess.search_label_in_wikipedia(name)
                    if data:
                        label, type, url, id_entity, id_instancia_de, id_ = data[:6]
                        id = self.preprocess.add_reference(
                            type, label, url, id_entity, id_instancia_de)
                if data:
                    if type == 7:
                        description_str = description_str.replace(
                            " " + name + " ", ' <a id_reference="' + str(id) +
                            '">' + str(name) + '</a> ')

        return description_str
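        # Usage sketch (hypothetical input, not from the original source):
        #   html = self.process_mp_description(
        #       '<p>Retrato pintado por <em>Velázquez</em> en 1656.</p>')
        # The result is the same HTML with <a id_character=...>,
        # <a id_event=...> and <a id_reference=...> anchors injected.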