Example #1

import logging

import numpy as np

from sellibrary.wiki.wikipedia_datasets import WikipediaDataset

class SELLightFeatureExtractor:

    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    def __init__(self, features_to_zero=None):

        # instance variables
        self.ds = WikipediaDataset()
        # avoid a mutable default argument; an empty list means "zero nothing"
        self.features_to_zero = features_to_zero if features_to_zero is not None else []

    def get_entity_set(self, entity_list):
        entity_set = set()
        name_by_entity_id = {}
        for e in entity_list:
            entity_set.add(e.entity_id)
            name = e.text
            name_by_entity_id[e.entity_id] = name
        return entity_set, name_by_entity_id

    # 1 Position ___________________________________________________________
    def calc_pos_for_entity(self, text, entity_list, entity_id):
        # positions of each mention of the entity, normalised by document length
        positions = []
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                positions.append(e2.start_char / len(text))
        return np.min(positions), np.max(positions), np.mean(positions), np.std(positions)

    def calc_positions(self, text, entity_list, entity_id_set):
        pos = {}
        for entity_id in entity_id_set:
            positions = self.calc_pos_for_entity(text, entity_list, entity_id)
            pos[entity_id] = positions
        return pos

    # 2 First field position ______________________________________________________

    @staticmethod
    def find_nth(haystack, needle, n):
        start = haystack.find(needle)
        while start >= 0 and n > 1:
            start = haystack.find(needle, start + len(needle))
            n -= 1
        return start

    @staticmethod
    def get_normalised_first_pos(entity_list, entity_id, lower_bound, upper_bound):
        # normalised offset of the entity's earliest mention within [lower_bound, upper_bound),
        # plus the anchor text of its last mention anywhere in the list
        first_location = None
        normed = None
        anchor = None
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                anchor = e2.text
                if lower_bound <= e2.start_char < upper_bound:
                    if first_location is None or e2.start_char < first_location:
                        first_location = e2.start_char
                        normed = (first_location - lower_bound) / (upper_bound - lower_bound)
        return normed, anchor

    # return the bounds of the first 3 sentences, the middle sentences, and the last 3 sentences
    def get_body_positions(self, body):
        end_of_third_sentence = self.find_nth(body, '.', 3) + 1
        if end_of_third_sentence == 0:
            end_of_third_sentence = len(body)
        num_sentences = body.count(".") + 1
        start_of_last_three_sentences = self.find_nth(body, '.', num_sentences - 3) + 1
        if start_of_last_three_sentences == 0:
            start_of_last_three_sentences = len(body)
        if start_of_last_three_sentences < end_of_third_sentence:
            start_of_last_three_sentences = end_of_third_sentence

        return [
            0, end_of_third_sentence,  # first 3 sentences
            end_of_third_sentence + 1, start_of_last_three_sentences,  # middle
            start_of_last_three_sentences + 1, len(body)  # last three sentences
        ]

    def calc_first_field_positions_for_entity(self, body, title, entity_list, entity_id, title_entity_list,
                                              title_entity_id):

        (first_section_start, first_section_end,
         mid_section_start, mid_section_end,
         last_section_start, last_section_end) = self.get_body_positions(body)

        norm_first = self.get_normalised_first_pos(entity_list, entity_id, first_section_start, first_section_end)
        norm_middle = self.get_normalised_first_pos(entity_list, entity_id, mid_section_start, mid_section_end)
        norm_end = self.get_normalised_first_pos(entity_list, entity_id, last_section_start, last_section_end)
        title_first_location = self.get_normalised_first_pos(title_entity_list, title_entity_id, 0, len(title))

        return norm_first[0], norm_middle[0], norm_end[0], title_first_location[0]

    def calc_first_field_positions(self, body, title, entity_list, entity_id_set, title_entity_list):
        first_field_positions_by_ent_id = {}
        for entity_id in entity_id_set:
            first_field_positions_by_ent_id[entity_id] = self.calc_first_field_positions_for_entity(body, title,
                                                                                                    entity_list,
                                                                                                    entity_id,
                                                                                                    title_entity_list,
                                                                                                    entity_id)
        return first_field_positions_by_ent_id

    # 3 Sentence Position ______________________________________________________

    @staticmethod
    def get_average_normalised_pos(entity_list, entity_id, lower_bound, upper_bound):
        normed_positions = []
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                if lower_bound <= e2.start_char < upper_bound:
                    normed = (e2.start_char - lower_bound) / (upper_bound - lower_bound)
                    normed_positions.append(normed)
        return normed_positions

    def calc_sentence_positions_for_entity(self, body, entity_list, entity_id):
        num_sentences = body.count(".") + 1
        start_index = 0
        normed_positions = []
        for sentence_num in range(1, num_sentences):
            end_index = self.find_nth(body, '.', sentence_num)
            normed_positions.extend(self.get_average_normalised_pos(entity_list, entity_id, start_index, end_index))
            start_index = end_index + 1  # the next sentence starts just past this full stop

        self.logger.debug('normed positions = %s', normed_positions)
        if not normed_positions:
            return 0.0  # avoid np.mean([]), which returns NaN and emits a warning
        return np.mean(normed_positions)

    def calc_sentence_positions(self, body, entity_list, entity_id_set):
        sentence_positions_by_ent_id = {}
        for entity_id in entity_id_set:
            sentence_positions_by_ent_id[entity_id] = self.calc_sentence_positions_for_entity(body, entity_list,
                                                                                              entity_id)
        return sentence_positions_by_ent_id

    # 4 field frequency  ___________________________________________________________


    def calc_field_frequency(self, body, entity_list, title_entity_list):
        (first_section_start, first_section_end,
         mid_section_start, mid_section_end,
         last_section_start, last_section_end) = self.get_body_positions(body)

        field_frequency_by_ent_id = {}

        for e2 in entity_list:
            if e2.entity_id not in field_frequency_by_ent_id:
                field_frequency_by_ent_id[e2.entity_id] = [0, 0, 0, 0]

            if first_section_start <= e2.start_char <= first_section_end:
                field_frequency_by_ent_id[e2.entity_id][0] = field_frequency_by_ent_id[e2.entity_id][0] + 1

            if mid_section_start <= e2.start_char <= mid_section_end:
                field_frequency_by_ent_id[e2.entity_id][1] = field_frequency_by_ent_id[e2.entity_id][1] + 1

            if last_section_start <= e2.start_char <= last_section_end:
                field_frequency_by_ent_id[e2.entity_id][2] = field_frequency_by_ent_id[e2.entity_id][2] + 1

        for e2 in title_entity_list:
            if e2.entity_id not in field_frequency_by_ent_id:
                field_frequency_by_ent_id[e2.entity_id] = [0, 0, 0, 0]
            field_frequency_by_ent_id[e2.entity_id][3] = field_frequency_by_ent_id[e2.entity_id][3] + 1

        return field_frequency_by_ent_id

    # 5 capitalization ___________________________________________________________

    def calc_capitalization_for_entity(self, text, entity_list, entity_id):
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                message = e2.text.strip()
                u = sum(1 for c in message if c.isupper())
                c = len(message)
                if c > 0 and c == u:  # guard against empty mentions counting as uppercase
                    return True
        return False

    # return True iff at least one mention of cj is written entirely in uppercase
    def calc_capitalization(self, text, entity_list, entity_id_set):
        capitalization_by_ent_id = {}
        for entity_id in entity_id_set:
            capitalization_by_ent_id[entity_id] = self.calc_capitalization_for_entity(text, entity_list, entity_id)
        return capitalization_by_ent_id

    # 6 Uppercase ratio ___________________________________________________________

    def calc_uppercase_ratio_for_entity(self, text, entity_list, entity_id):
        count = 0
        upper_count = 0
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                message = e2.text
                u = sum(1 for c in message if c.isupper())
                c = len(message)
                count += c
                upper_count += u
        if count == 0:
            return 0
        else:
            return upper_count / count

    # fraction of uppercase letters across all spots referring to cj
    def calc_uppercase_ratio(self, text, entity_list, entity_id_set):
        uppercase_ratio_by_ent_id = {}
        for entity_id in entity_id_set:
            uppercase_ratio_by_ent_id[entity_id] = self.calc_uppercase_ratio_for_entity(text, entity_list, entity_id)
        return uppercase_ratio_by_ent_id

    # 7 highlighting  ___________________________________________________________
    #
    # Not yet implemented, as we do not have this information

    # 8.1 Average Lengths in words ___________________________________________________________

    @staticmethod
    def calc_average_term_length_in_words_for_entity(text, entity_list, entity_id):
        length_list = []
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                message = e2.text.strip()
                word_count = message.count(' ') + 1  # words = spaces + 1
                length_list.append(word_count)

        return np.mean(length_list)

    # length in words
    def calc_average_term_length_in_words(self, text, entity_list, entity_id_set):
        average_term_length_by_ent_id = {}
        for entity_id in entity_id_set:
            average_term_length_by_ent_id[entity_id] = self.calc_average_term_length_in_words_for_entity(text,
                                                                                                         entity_list,
                                                                                                         entity_id)
        return average_term_length_by_ent_id

    # 8.2 Average Lengths in characters___________________________________________________________

    def calc_average_term_length_in_characters_for_entity(self, entity_list, entity_id):
        length_list = []
        for e2 in entity_list:
            if e2.entity_id == entity_id:
                message = e2.text.strip()
                char_length = len(message)
                length_list.append(char_length)
        return np.mean(length_list)

    # length in characters
    def calc_average_term_length_in_characters(self, entity_list, entity_id_set):
        average_term_length_by_ent_id = {}
        for entity_id in entity_id_set:
            average_term_length_by_ent_id[entity_id] = self.calc_average_term_length_in_characters_for_entity(
                entity_list, entity_id)
        return average_term_length_by_ent_id

    # 11 Is In Title ___________________________________________________________

    def calc_is_in_title(self, entity_list, title_entity_list):
        is_in_title_by_ent_id = {}
        for e2 in entity_list:  # ensure we have a full dictionary
            is_in_title_by_ent_id[e2.entity_id] = False
        for e2 in title_entity_list:
            is_in_title_by_ent_id[e2.entity_id] = True
        return is_in_title_by_ent_id

    # 12 link probabilities ___________________________________________________________

    # The link probability for a spot $s_i \in S_D$ is defined as the number of occurrences of $s_i$
    # being a link to an entity in the KB, divided by its total number of occurrences in the KB,
    # i.e. how often this anchor text is actually a link to an entity.
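
    # A minimal sketch of the link-probability formula above. The two counts are
    # assumed to be supplied by the caller; this class does not currently expose
    # any KB anchor statistics, so this is illustrative rather than wired in.
    @staticmethod
    def calc_link_probability(times_spot_is_link_in_kb, times_spot_occurs_in_kb):
        # link probability = |occurrences of the spot as a link| / |all occurrences in the KB|
        if times_spot_occurs_in_kb == 0:
            return 0.0
        return times_spot_is_link_in_kb / times_spot_occurs_in_kb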

    # 13 is person - requires another download  _____________________________________________________

    # from https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/yago/downloads/
    # yagoSimpleTypes (SIMPLETAX): a simplified rdf:type system. This theme contains all instances and links them
    # with rdf:type facts to the leaf level of WordNet (use with yagoSimpleTaxonomy). TSV version.
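
    # A minimal sketch of how the yagoSimpleTypes TSV might feed this feature.
    # The four-column layout (id, subject, predicate, object) and the WordNet
    # person class identifier are assumptions about the YAGO TSV download, not
    # verified by this codebase.
    @staticmethod
    def load_person_entities(yago_simple_types_tsv_path):
        person_entities = set()
        with open(yago_simple_types_tsv_path, encoding='utf-8') as f:
            for line in f:
                fields = line.rstrip('\n').split('\t')
                if len(fields) >= 4 and fields[3] == '<wordnet_person_100007846>':
                    person_entities.add(fields[1])  # the subject, e.g. '<Albert_Einstein>'
        return person_entities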


    # 14 Entity frequency ___________________________________________________________

    def calc_entity_frequency_for_entity(self, text, entity_id, name_by_entity_id):
        # raw count of the entity's surface form within the text
        return text.count(name_by_entity_id[entity_id])

    def calc_entity_frequency(self, text, entity_id_set, name_by_entity_id):
        entity_frequency_by_ent_id = {}
        for entity_id in entity_id_set:
            entity_frequency_by_ent_id[entity_id] = self.calc_entity_frequency_for_entity(text, entity_id,
                                                                                          name_by_entity_id)
        return entity_frequency_by_ent_id

    # 15 distinct mentions  ___________________________________________________________

    # how does this differ from the raw number of mentions? One reading: the number of
    # distinct surface forms used to mention the entity; see the sketch below.
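
    # A sketch under the reading above: the number of distinct (case-folded)
    # surface forms among the spots referring to the entity. This interpretation
    # is an assumption, not something the original code confirms.
    @staticmethod
    def calc_distinct_mentions_for_entity(entity_list, entity_id):
        surface_forms = {e2.text.strip().lower() for e2 in entity_list if e2.entity_id == entity_id}
        return len(surface_forms)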

    # 16 no ambiguity ___________________________________________________________


    # 17 ambiguity ___________________________________________________________

    # calculated as: 1 - 1/(number of candidate entities for the spot)
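
    # A minimal sketch of that formula. The per-spot candidate count would have
    # to come from the spotter, which this class does not expose; the argument
    # is therefore assumed to be supplied by the caller.
    @staticmethod
    def calc_ambiguity(num_candidate_entities):
        # one candidate -> unambiguous (0.0); many candidates -> approaches 1.0
        if num_candidate_entities <= 0:
            return 0.0
        return 1.0 - 1.0 / num_candidate_entities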

    # 18 commonness ___________________________________________________________

    # commonness - when a spot points to many candidate entities: the ratio of the number of
    # links that point to entity A to the number that point to any entity.
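
    # A minimal sketch of commonness as described above, with both link counts
    # assumed to be supplied by the caller (they are not available in this class).
    @staticmethod
    def calc_commonness(links_from_spot_to_entity, links_from_spot_to_any_entity):
        # of all links whose anchor text is this spot, the fraction pointing to entity A
        if links_from_spot_to_any_entity == 0:
            return 0.0
        return links_from_spot_to_entity / links_from_spot_to_any_entity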


    # 19 max commonness x max link probability ___________________________________

    # 20 entity degree ___________________________________
    # In-degree, out-degree and (undirected) degree of cj in the Wikipedia citation graph

    def calc_degrees(self, entity_id_set):
        degrees_by_ent_id = {}
        for entity_id in entity_id_set:
            in_degree = self.ds.get_entity_in_degree(entity_id)
            out_degree = self.ds.get_entity_out_degree(entity_id)
            degree = in_degree + out_degree  # self.ds.get_entity_degree(entity_id)
            result = [in_degree, out_degree, degree]
            degrees_by_ent_id[entity_id] = result
            self.logger.info('entity_id %d in out and total degrees: %s', entity_id, result)
        return degrees_by_ent_id

    # 21 entity degree x max commonness ___________________________________


    # 22 document_length___________________________________________________________

    def calc_document_length(self, body, entity_id_set):
        # the document length is the same for every entity; it is replicated per
        # entity so it can be appended to each feature vector uniformly
        document_length_by_ent_id = {}
        for entity_id in entity_id_set:
            document_length_by_ent_id[entity_id] = len(body)
        return document_length_by_ent_id

    # Combine all light features, on a per entity basis ___________________________________________________________

    def calc_light_features(self, body, title, entity_list, entity_id_set, name_by_entity_id, title_entity_list,
                            very_light=False):

        self.logger.info('calc_light_features 1')
        position_features_by_ent_id = self.calc_positions(body, entity_list, entity_id_set)  # 1
        self.logger.info('calc_light_features 2')
        field_positions_by_ent_id = self.calc_first_field_positions(body, title, entity_list, entity_id_set,
                                                                    title_entity_list)  # 2
        self.logger.info('calc_light_features 3')
        sentence_positions_by_ent_id = self.calc_sentence_positions(body, entity_list, entity_id_set)  # 3
        self.logger.info('calc_light_features 4')
        frequency_by_ent_id = self.calc_field_frequency(body, entity_list, title_entity_list)  # 4
        self.logger.info('calc_light_features 5')
        capitalization_by_ent_id = self.calc_capitalization(body, entity_list, entity_id_set)  # 5
        self.logger.info('calc_light_features 6')
        uppercase_ratio_by_ent_id = self.calc_uppercase_ratio(body, entity_list, entity_id_set)  # 6
        self.logger.info('calc_light_features 8.1')

        term_length_w_by_ent_id = self.calc_average_term_length_in_words(body, entity_list, entity_id_set)  # 8.1
        self.logger.info('calc_light_features 8.2')
        term_length_c_by_ent_id = self.calc_average_term_length_in_characters(entity_list, entity_id_set)  # 8.2
        self.logger.info('calc_light_features 11')

        title_by_ent_id = self.calc_is_in_title(entity_list, title_entity_list)  # 11
        self.logger.info('calc_light_features 14')
        entity_frequency_by_ent_id = self.calc_entity_frequency(body, entity_id_set, name_by_entity_id)  # 14
        self.logger.info('calc_light_features 20')

        if very_light:
            # skip the Wikipedia graph lookups and use zeroed degree features
            degrees_by_ent_id = {}
            for entity_id in entity_id_set:
                degrees_by_ent_id[entity_id] = [0, 0, 0]
        else:
            degrees_by_ent_id = self.calc_degrees(entity_id_set)  # 20
        self.logger.info('calc_light_features 22')
        doc_length_by_ent_id = self.calc_document_length(body, entity_id_set)  # 22

        self.logger.info('Reshaping results for document')

        results = {}
        for entity_id in entity_id_set:
            feature_list = []
            feature_list.extend(position_features_by_ent_id[entity_id])  # 1: 4 position features
            feature_list.extend(field_positions_by_ent_id[entity_id])  # 2
            feature_list.append(sentence_positions_by_ent_id[entity_id])  # 3
            feature_list.extend(frequency_by_ent_id[entity_id])  # 4
            feature_list.append(capitalization_by_ent_id[entity_id])  # 5
            feature_list.append(uppercase_ratio_by_ent_id[entity_id])  # 6 : 1 uppercase feature

            feature_list.append(term_length_w_by_ent_id[entity_id])  # 8.1 :
            feature_list.append(term_length_c_by_ent_id[entity_id])  # 8.2 :

            feature_list.append(title_by_ent_id[entity_id])  # 11 :

            feature_list.append(entity_frequency_by_ent_id[entity_id])  # 14 : 1 entity frequency feature

            feature_list.extend(degrees_by_ent_id[entity_id])  # 20 :
            feature_list.append(doc_length_by_ent_id[entity_id])  # 22 :

            # zero selected features for sensitivity checking
            for index in self.features_to_zero:
                if 0 <= index < len(feature_list):
                    feature_list[index] = 0

            results[entity_id] = feature_list
        return results

    # ___________________________________________________________


    def get_entity_saliency_list(self, body, title, spotter, very_light=False, spotter_confidence=0.5):
        entity_list = spotter.get_entity_candidates(body, spotter_confidence)
        entity_id_set, name_by_entity_id = self.get_entity_set(entity_list)
        title_entity_list = spotter.get_entity_candidates(title, spotter_confidence)
        features_by_ent_id = self.calc_light_features(body, title, entity_list, entity_id_set, name_by_entity_id,
                                                      title_entity_list, very_light)
        title_entity_id_set, title_name_by_entity_id = self.get_entity_set(title_entity_list)
        return entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set

    def get_feature_list_by_ent(self, body, title, spotter, very_light=False, spotter_confidence=0.5):
        entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
            self.get_entity_saliency_list(body, title, spotter, very_light, spotter_confidence=spotter_confidence)
        return features_by_ent_id, name_by_entity_id
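
# A hypothetical usage sketch for the extractor above. The spotter protocol
# (an object whose get_entity_candidates(text, confidence) returns mentions
# with entity_id, text and start_char attributes) is inferred from the calls
# in get_entity_saliency_list; it is left commented out because a real
# spotter and the WikipediaDataset files are needed to run it:
#
# extractor = SELLightFeatureExtractor()
# features_by_ent_id, name_by_entity_id = extractor.get_feature_list_by_ent(
#     body='Some document text. A second sentence. A third one.',
#     title='Document title',
#     spotter=my_spotter,   # any object implementing get_entity_candidates()
#     very_light=True)      # skip the Wikipedia in/out-degree lookups
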
Example #2
import logging

from sellibrary.wiki.wikipedia_datasets import WikipediaDataset

# dense to sparse

# set up logging
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    ds = WikipediaDataset()
    curid = 399877

    in_degree = ds.get_entity_in_degree(curid)
    out_degree = ds.get_entity_out_degree(curid)
    degree = ds.get_entity_degree(curid)
    logger.info('degree %d', degree)
    logger.info('in degree %d', in_degree)
    logger.info('out degree %d', out_degree)

    assert degree >= 54
    assert in_degree >= 9
    assert out_degree >= 45