예제 #1
0
    def _prop_by_rep(self, content: Content, node: object, exo_rep: str, exo_props: List[str], row: dict):
        """
        Private method that links the 'node' passed to the properties listed in 'exo_props', read from
        the exogenous representation 'exo_rep' of the 'content' loaded.

        For every requested property found in the representation, a property node is added for its
        value and a link from 'node' to that value is created, labelled with the property name. The
        value returned by 'get_preference' for that property and 'row' is passed along to the link.
        A missing representation or a missing property is only reported with a warning.

            EXAMPLE:
                exo_rep = 0
                exo_props = ['producer', 'director']

                will extract the 'producer' and 'director' property from the representation '0'
                in the 'content' parameter and creates a link from the 'node' passed to said
                properties

        Args:
            content (Content): content loaded
            node (object): node to add properties to
            exo_rep (str): representation from where to extract the 'exo_props'
            exo_props (list): the properties list to extract from 'content'
            row (dict): dict-like object containing eventual score for the properties
        """
        try:
            properties = content.get_exogenous_representation(exo_rep).value
        except KeyError:
            # lazy %-style arguments: the representation id may not be a string (see the example above),
            # and concatenating it with '+' would raise a TypeError right inside this handler
            logger.warning("Representation %s not found for content %s", exo_rep, content.content_id)
            return

        if properties is None:
            return

        for prop in exo_props:
            if prop not in properties:
                logger.warning("Property %s not found for content %s", prop, content.content_id)
                continue

            preference = self.get_preference(prop, row)
            self.add_property_node(properties[prop])
            self.add_link(node, properties[prop], preference, prop)
예제 #2
0
    def extract_features_item(self, item: Content):
        """
        Collects the values of the representations selected through the item_field parameter
        passed in the constructor, for the item loaded.

        Only the chosen representations of the chosen fields are read from the item
        EXAMPLE:

            with item_field = {'Plot': [0], 'Genre': ['tfidf', 1]}, the function will extract
            only the representation with '0' as internal id for the field 'Plot' and two representations
            for the field 'Genre': one with 'tfidf' as external id and the other with 1 as internal id

        Args:
            item (Content): item loaded of which we need to extract its feature

        Returns:
            A list containing the value of every representation extracted for the item, ordered by
            field and then by representation as they appear in item_field. The list is empty when
            the item is None
        """
        if item is None:
            return []

        return [
            item.get_field_representation(field_name, representation_id).value
            for field_name, representation_ids in self.item_field.items()
            for representation_id in representation_ids
        ]
예제 #3
0
    def create_content(self, raw_content: Dict):
        """
        Builds one content out of a raw record, processing every configured field.
        This method is iteratively invoked by the fit method.

        Besides creating the fields, the content id is registered as the "content_id" field of a new
        entry in the indexer (when one is set) and in every interface of the config, and those entries
        are serialized once all the fields have been processed.

        Args:
            raw_content (dict): Raw data from which the content will be created

        Returns:
            content (Content): an instance of content with his fields

        Raises:
            general Exception: when no config has been set
        """
        if self.__config is None:
            raise Exception("You must set a config with set_config()")

        id_field_key = "content_id"

        timestamp = self.__get_timestamp(raw_content)

        # the content id is the merge of the values of the fields that compound the id
        id_field_names = self.__config.get_id_field_name()
        content_id = id_merger(raw_content, id_field_names)
        content = Content(content_id)

        if self.__config.get_lod_properties_retrieval() is not None:
            retrieval_technique = self.__config.get_lod_properties_retrieval()
            content.set_lod_properties(retrieval_technique.get_properties(raw_content))

        # open a new entry, identified by the content id, in the indexer and in each interface
        if self.__indexer is not None:
            self.__indexer.new_content()
            self.__indexer.new_field(id_field_key, content_id)

        interfaces = self.__config.get_interfaces()
        for memory_interface in interfaces:
            memory_interface.new_content()
            memory_interface.new_field(id_field_key, content_id)

        # produce every configured field and attach it to the content
        for field_name in self.__config.get_field_name_list():
            logger.info("Processing field: %s", field_name)
            produced_field = self.__create_field(raw_content, field_name, content_id, timestamp)
            content.append(field_name, produced_field)

        # close the entries opened above
        if self.__indexer is not None:
            document_id = self.__indexer.serialize_content()
            content.set_index_document_id(document_id)

        for memory_interface in interfaces:
            memory_interface.serialize_content()

        return content
예제 #4
0
 def test_load_serialize(self):
     """
     Serializes a content, loads it back and checks that the loaded object equals the original one.

     The file is written inside a temporary directory so the test leaves nothing behind in the
     working directory.
     """
     import os
     import tempfile

     content_field_repr = FeaturesBagField("test")
     content_field_repr.append_feature("test_key", "test_value")
     content_field = ContentField("test_field", "0000")
     content_field.append(str(0), content_field_repr)
     content = Content("001")
     content.append("test_field", content_field)

     with tempfile.TemporaryDirectory() as output_directory:
         try:
             content.serialize(output_directory)
         except FileNotFoundError:
             self.fail("Could not create file!")

         # the serialized file is named after the content id
         serialized_path = os.path.join(output_directory, '001.xz')
         # pickle is only used on the file this test has just written
         with lzma.open(serialized_path, 'r') as file:
             self.assertEqual(content, pickle.load(file))
예제 #5
0
    def test_append_remove(self):
        """
        Checks that appending a field to a content and removing it again leaves the content with the
        same field list as a content that never had that field
        """
        content_field_repr = FeaturesBagField("test")
        content_field_repr.append_feature("test_key", "test_value")
        content_field = ContentField("test_field", "0000")
        content_field.append(str(0), content_field_repr)
        content1 = Content("001")
        content1.append("test_field", content_field)

        content2 = Content("002")
        content2.append("test_field", content_field)
        content_field_repr = FeaturesBagField("test")
        content_field_repr.append_feature("test_key", "test_value")
        content_field2 = ContentField("test_field2", "0000")
        content_field2.append(str(0), content_field_repr)
        content2.append("test_field2", content_field2)
        content2.remove("test_field2")

        # assertEqual, not assertTrue: assertTrue would take the second list as the failure message
        # and never compare the two lists
        self.assertEqual(content1.get_field_list(), content2.get_field_list())
예제 #6
0
    def test_append_remove_field(self):
        """
        Checks append, remove and get of the content's fields: a field appended and then removed
        must leave the content with the same fields as a content that never received it
        """
        features_bag = {"test_key": "test_value"}

        shared_repr = FeaturesBagField(features_bag)
        shared_field = RepresentationContainer()
        shared_field.append(shared_repr, "test_1")
        content1 = Content("001")
        content1.append_field("test_field", shared_field)

        content2 = Content("002")
        content2.append_field("test_field", shared_field)
        # this second field is removed right after being appended
        extra_repr = FeaturesBagField(features_bag)
        extra_field = RepresentationContainer()
        extra_field.append(extra_repr, "test_1")
        content2.append_field("test_field2", extra_field)
        content2.remove_field("test_field2")

        self.assertEqual(content1.field_dict, content2.field_dict)
        field_of_content1 = content1.get_field("test_field")
        field_of_content2 = content2.get_field("test_field")
        self.assertEqual(field_of_content1, field_of_content2)
예제 #7
0
    def test_append_remove_exo(self):
        """
        Checks append, remove and get of the content's exogenous representations: a representation
        appended and then removed must leave the content with the same exogenous container as a
        content that never received it
        """
        exo_features = {"test_key": "test_value"}

        shared_repr = PropertiesDict(exo_features)
        content1 = Content("001")
        content1.append_exogenous(shared_repr, "test_exo")

        content2 = Content("002")
        content2.append_exogenous(shared_repr, "test_exo")
        # this second representation is removed right after being appended
        extra_repr = PropertiesDict(exo_features)
        content2.append_exogenous(extra_repr, "test_exo2")
        content2.remove_exogenous("test_exo2")

        self.assertEqual(content1.exogenous_rep_container,
                         content2.exogenous_rep_container)
        exo_of_content1 = content1.get_exogenous("test_exo")
        exo_of_content2 = content2.get_exogenous("test_exo")
        self.assertEqual(exo_of_content1, exo_of_content2)
예제 #8
0
    def test_append_remove_field_repr(self):
        """
        Checks append, remove and get of the single representations stored in a content's field
        """
        features_bag = {"test_key": "test_value"}

        # a content that already holds one field
        first_repr = FeaturesBagField(features_bag)
        first_field = RepresentationContainer(first_repr, "test_1")
        content = Content("001")
        content.append_field("test_field", first_field)

        # a representation appended to a field the content does not hold yet
        second_repr = FeaturesBagField(features_bag)
        content.append_field_representation("test_field_2", second_repr, "test_2")
        retrieved_repr = content.get_field_representation("test_field_2", "test_2")
        self.assertEqual(retrieved_repr, second_repr)
        self.assertEqual(len(content.get_field("test_field_2")), 1)

        # once its only representation is removed, the field must be empty
        content.remove_field_representation("test_field_2", "test_2")
        self.assertEqual(len(content.get_field("test_field_2")), 0)
예제 #9
0
    def create_contents(self) -> List[Content]:
        """
        Creates the contents based on the information defined in the Content Analyzer's config.

        The method works in three passes:
            1. for every raw content of the config's source, the exogenous properties of each exogenous config
               are retrieved and a Content holding them is instantiated
            2. for every field name, the content technique of each field config is run over the whole source;
               the produced representations (or IndexField references, when the field config has a memory
               interface) are then appended to the corresponding contents
            3. the representations meant for an index are written through the memory interfaces collected
               during the second pass, and the contents producer's memory interfaces are cleared

        Returns:
            contents_list (List[Content]): list of contents created by the method

        Raises:
            Exception: if no config has been set
        """
        if self.__config is None:
            raise Exception("You must set a config with set_config()")

        # will store the contents and is the variable that will be returned by the method
        contents_list = []
        for i, raw_content in enumerate(self.__config.source):

            # two lists are instantiated, one for the configuration names (given by the user) and one for the exogenous
            # properties representations. These lists will maintain the data for the content creation. This is done
            # because otherwise it would be necessary to append directly to the content. But in the Content class
            # the representations are kept as dataframes and appending to dataframes is computationally heavy
            exo_config_names = []
            exo_properties = []

            for exo_config_number, ex_config in enumerate(
                    self.__config.exogenous_representation_list):
                # the numbers in the log message are 1-based
                logger.info("Processing exogenous config %d for content %d" %
                            (exo_config_number + 1, i + 1))
                lod_properties = ex_config.exogenous_technique.get_properties(
                    raw_content)
                exo_config_names.append(ex_config.id)
                exo_properties.append(lod_properties)

            # construct id from the list of the fields that compound id
            content_id = id_merger(raw_content, self.__config.id)
            contents_list.append(
                Content(content_id,
                        exogenous_rep_container=RepresentationContainer(
                            exo_properties, exo_config_names)))

        # this dictionary will store any representation list that will be kept in one of the indexes
        # the elements will be in the form:
        #   { memory_interface: {'Plot#0': [FieldRepr for content1, FieldRepr for content2, ...]}}
        # the 0 after the Plot field name is used to define the representation number associated with the Plot field
        # since it's possible to store multiple Plot fields in the index
        index_representations_dict = {}
        for field_name in self.__config.get_field_name_list():
            logger.info("Processing field: %s", field_name)
            # stores the field representation for the field name
            results = []
            # stores the field config ids for the field name
            field_config_ids = []
            for repr_number, field_config in enumerate(
                    self.__config.get_configs_list(field_name)):
                field_config_ids.append(field_config.id)

                # technique_result is a list of field representation produced by the content technique
                # each field repr in the list will refer to a content
                # technique_result[0] -> contents_list[0]
                technique_result = field_config.content_technique.produce_content(
                    field_name, field_config.preprocessing,
                    self.__config.source)

                if field_config.memory_interface is not None:
                    memory_interface = field_config.memory_interface
                    # if the index for the directory in the config hasn't been defined yet in the contents producer,
                    # the index associated to the field config that is being processed is added to the
                    # contents producer's memory interfaces list, and will be used for the future field configs with
                    # an assigned memory interface that has the same directory.
                    # This means that only the index defined in the first FieldConfig that has one will actually be used
                    # NOTE(review): the membership test relies on how memory interfaces compare to each other,
                    # presumably by directory - confirm against the memory interface class
                    if memory_interface not in self.__memory_interfaces.values(
                    ):
                        self.__memory_interfaces[
                            memory_interface.directory] = memory_interface
                        index_representations_dict[memory_interface] = {}
                    else:
                        memory_interface = self.__memory_interfaces[
                            memory_interface.directory]

                    # name of the field inside the index: field name and representation number, followed by the
                    # field config id when one has been set
                    if field_config.id is not None:
                        index_field_name = "{}#{}#{}".format(
                            field_name, str(repr_number), field_config.id)
                    else:
                        index_field_name = "{}#{}".format(
                            field_name, str(repr_number))

                    index_representations_dict[memory_interface][
                        index_field_name] = technique_result
                    result = []

                    # in order to refer to the representation that will be stored in the index, an IndexField repr will
                    # be added to each content (and it will contain all the necessary information to retrieve the data
                    # from the index)
                    for i in range(0, len(contents_list)):
                        result.append(
                            IndexField(index_field_name, i, memory_interface))
                else:
                    result = technique_result

                results.append(result)

            # each representation is added to the corresponding content
            for i, content in enumerate(contents_list):
                content_field_representations = []
                # retrieves the representations associated with the content
                # (every list in results is indexed by the content's position in contents_list)
                for representation in results:
                    content_field_representations.append(representation[i])
                content.append_field(
                    field_name,
                    RepresentationContainer(content_field_representations,
                                            field_config_ids))

        # after the contents creation process, the data to be indexed will be serialized inside of the memory interfaces
        # for each created content, a new entry in each index will be created
        # the entry will be in the following form: {"content_id": id, "Plot#0": "...", "Plot#1": "...", ...}
        if len(self.__memory_interfaces) != 0:
            for memory_interface in self.__memory_interfaces.values():
                memory_interface.init_writing(True)
                for i in range(0, len(contents_list)):
                    memory_interface.new_content()
                    memory_interface.new_field("content_id",
                                               contents_list[i].content_id)
                    for field_name in index_representations_dict[
                            memory_interface].keys():
                        # the value of the representation is stored in the index as a string
                        memory_interface.new_field(
                            field_name,
                            str(index_representations_dict[memory_interface]
                                [field_name][i].value))
                    memory_interface.serialize_content()
                memory_interface.stop_writing()
            # the memory interfaces are dropped once everything has been written
            self.__memory_interfaces.clear()

        return contents_list
예제 #10
0
    def test_append_remove_exo(self):
        """
        Tests for append, remove and get methods of the content's exogenous instances.

        Covers a single representation appended with an id, a list of representations appended with
        a list of ids, and a list of representations appended without ids (retrieved by position).
        """
        exo_features = {"test_key": "test_value"}

        content_exo_repr = PropertiesDict(exo_features)
        content_exo_repr2 = PropertiesDict({"test_key2": 'test_value2'})

        content1 = Content("001")
        content1.append_exogenous_representation(content_exo_repr, "test_exo")

        content2 = Content("002")
        content2.append_exogenous_representation(content_exo_repr, "test_exo")
        content_exo_repr = PropertiesDict(exo_features)
        content2.append_exogenous_representation(content_exo_repr, "test_exo2")
        content2.remove_exogenous_representation("test_exo2")
        self.assertEqual(content1.exogenous_rep_container,
                         content2.exogenous_rep_container)
        self.assertEqual(content1.get_exogenous_representation("test_exo"),
                         content2.get_exogenous_representation("test_exo"))

        # test append list of representations
        content3 = Content("003")

        content3.append_exogenous_representation(
            [content_exo_repr, content_exo_repr2], ["id1", "id2"])
        self.assertEqual(len(content3.exogenous_rep_container), 2)
        self.assertEqual(
            content3.get_exogenous_representation("id1").value,
            content_exo_repr.value)
        self.assertEqual(
            content3.get_exogenous_representation("id2").value,
            content_exo_repr2.value)

        # test append list of representations without id
        # the assertions must target content4: checking content3 again would leave this case untested
        content4 = Content("004")
        content4.append_exogenous_representation(
            [content_exo_repr, content_exo_repr2])
        self.assertEqual(len(content4.exogenous_rep_container), 2)
        self.assertEqual(
            content4.get_exogenous_representation(0).value,
            content_exo_repr.value)
        self.assertEqual(
            content4.get_exogenous_representation(1).value,
            content_exo_repr2.value)
예제 #11
0
    def test_append_remove_field_repr(self):
        """
        Checks append, remove and get of the single representations stored in a content's field
        """
        features_bag = {"test_key": "test_value"}

        # a content that already holds one field
        repr_1 = FeaturesBagField(features_bag)
        field_1 = RepresentationContainer(repr_1, "test_1")
        content = Content("001")
        content.append_field("test_field", field_1)

        # single representation appended to a field the content does not hold yet
        repr_2 = FeaturesBagField(features_bag)
        content.append_field_representation("test_field_2", repr_2, "test_2")
        retrieved = content.get_field_representation("test_field_2", "test_2")
        self.assertEqual(retrieved, repr_2)
        self.assertEqual(len(content.get_field("test_field_2")), 1)

        # list of representations appended together with their ids
        repr_3_first = FeaturesBagField(features_bag)
        repr_3_second = FeaturesBagField(features_bag)
        content.append_field_representation(
            "test_field_3",
            [repr_3_first, repr_3_second],
            ["test_3_first", "test_3_second"])
        retrieved = content.get_field_representation("test_field_3", "test_3_first")
        self.assertEqual(retrieved, repr_3_first)
        retrieved = content.get_field_representation("test_field_3", "test_3_second")
        self.assertEqual(retrieved, repr_3_second)
        self.assertEqual(len(content.get_field("test_field_3")), 2)

        # list of representations appended without ids: they are retrieved by position
        repr_4_first = FeaturesBagField(features_bag)
        repr_4_second = FeaturesBagField(features_bag)
        content.append_field_representation(
            "test_field_4",
            [repr_4_first, repr_4_second])
        retrieved = content.get_field_representation("test_field_4", 0)
        self.assertEqual(retrieved, repr_4_first)
        retrieved = content.get_field_representation("test_field_4", 1)
        self.assertEqual(retrieved, repr_4_second)
        self.assertEqual(len(content.get_field("test_field_4")), 2)

        # removing the only representation of a field leaves the field empty
        content.remove_field_representation("test_field_2", "test_2")
        self.assertEqual(len(content.get_field("test_field_2")), 0)