def _prop_by_rep(self, content: Content, node: object, exo_rep: str, exo_props: List[str], row: dict):
    """
    Private method that extracts the 'exo_props' from the exogenous representation 'exo_rep' of the
    'content' passed, then links the 'node' passed to every property value that was found.

    EXAMPLE:
        exo_rep = 0
        exo_props = ['producer', 'director']

        will extract the 'producer' and 'director' properties from the representation '0' of 'content'
        and create a link from 'node' to each of those property values

    If the representation is missing (KeyError) a warning is logged and nothing is added. If a single
    property is missing from the representation a warning is logged and the remaining ones are still
    processed.

    Args:
        content (Content): content loaded
        node (object): node to add properties to
        exo_rep (str): representation from where to extract the 'exo_props'
        exo_props (list): the properties list to extract from 'content'
        row (dict): dict-like object containing eventual score for the properties
    """
    try:
        properties = content.get_exogenous_representation(exo_rep).value
    except KeyError:
        # %-style arguments instead of '+' concatenation: the identifiers are not guaranteed to be
        # strings (e.g. an internal id such as 0) and concatenation would raise a TypeError
        logger.warning("Representation %s not found for content %s", exo_rep, content.content_id)
        return

    if properties is None:
        return

    for prop in exo_props:
        if prop in properties:
            preference = self.get_preference(prop, row)
            self.add_property_node(properties[prop])
            self.add_link(node, properties[prop], preference, prop)
        else:
            logger.warning("Property %s not found for content %s", prop, content.content_id)
def extract_features_item(self, item: Content):
    """
    Collects the values of the representations selected through 'self.item_field' for a loaded item.

    'self.item_field' maps each field name to the list of representation ids to extract for that field.

    EXAMPLE:
        with item_field = {'Plot': [0], 'Genre': ['tfidf', 1]} the result holds the value of
        representation 0 of the field 'Plot', followed by the values of the representations 'tfidf'
        and 1 of the field 'Genre'

    Args:
        item (Content): item loaded of which we need to extract its features

    Returns:
        A list containing the value of every representation extracted for the item, in the iteration
        order of 'self.item_field'. The list is empty when 'item' is None
    """
    if item is None:
        return []

    return [
        item.get_field_representation(field_name, repr_id).value
        for field_name in self.item_field
        for repr_id in self.item_field[field_name]
    ]
def create_content(self, raw_content: Dict):
    """
    Builds a single content out of one raw record, processing every configured field.

    The content id is obtained by merging the id fields named in the config. When an indexer is set,
    and for every interface returned by the config, a new index entry is opened with the content id,
    and it is serialized once all the fields have been processed.

    Args:
        raw_content (dict): Raw data from which the content will be created

    Returns:
        content (Content): the content created, with one entry per configured field

    Raises:
        Exception: if no config has been set
    """
    if self.__config is None:
        raise Exception("You must set a config with set_config()")

    id_field_key = "content_id"

    content_timestamp = self.__get_timestamp(raw_content)

    # the id is built from the list of fields that compose it
    content_id = id_merger(raw_content, self.__config.get_id_field_name())
    content = Content(content_id)

    if self.__config.get_lod_properties_retrieval() is not None:
        properties_retrieval = self.__config.get_lod_properties_retrieval()
        content.set_lod_properties(properties_retrieval.get_properties(raw_content))

    if self.__indexer is not None:
        self.__indexer.new_content()
        self.__indexer.new_field(id_field_key, content_id)

    memory_interfaces = self.__config.get_interfaces()
    for memory_interface in memory_interfaces:
        memory_interface.new_content()
        memory_interface.new_field(id_field_key, content_id)

    # produce every configured field and attach it to the content
    for field_name in self.__config.get_field_name_list():
        logger.info("Processing field: %s", field_name)
        produced_field = self.__create_field(raw_content, field_name, content_id, content_timestamp)
        content.append(field_name, produced_field)

    if self.__indexer is not None:
        content.set_index_document_id(self.__indexer.serialize_content())

    for memory_interface in memory_interfaces:
        memory_interface.serialize_content()

    return content
def test_load_serialize(self):
    """
    Serializes a content into the current directory and checks that the object unpickled from the
    produced '001.xz' file is equal to the original content.
    """
    # function-scope import: the module's import block is not visible from this method
    import os

    serialized_path = '001.xz'

    # the file is written in the working directory: remove it once the test ends (also on failure),
    # but only when it was created by this test, so that a pre-existing file is never deleted
    artifact_preexisting = os.path.exists(serialized_path)

    def _remove_artifact():
        if not artifact_preexisting and os.path.exists(serialized_path):
            os.remove(serialized_path)

    self.addCleanup(_remove_artifact)

    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field = ContentField("test_field", "0000")
    content_field.append(str(0), content_field_repr)
    content = Content("001")
    content.append("test_field", content_field)

    try:
        content.serialize(".")
    except FileNotFoundError:
        self.fail("Could not create file!")

    # the pickle read here was written by this very test, so it is trusted data
    with lzma.open(serialized_path, 'r') as file:
        self.assertEqual(content, pickle.load(file))
def test_append_remove(self):
    """
    Checks that appending a field to a content and then removing it leaves the content with the
    same field list as a content to which that field was never appended.
    """
    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field = ContentField("test_field", "0000")
    content_field.append(str(0), content_field_repr)

    content1 = Content("001")
    content1.append("test_field", content_field)

    content2 = Content("002")
    content2.append("test_field", content_field)

    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field2 = ContentField("test_field2", "0000")
    content_field2.append(str(0), content_field_repr)

    content2.append("test_field2", content_field2)
    content2.remove("test_field2")

    # assertEqual, not assertTrue: assertTrue(a, b) uses 'b' as the failure message and would
    # pass for any truthy 'a' without ever comparing the two field lists
    self.assertEqual(content1.get_field_list(), content2.get_field_list())
def test_append_remove_field(self):
    """
    Tests for append, remove and get methods of the content's field instances: a field appended
    and then removed must leave the content's fields equal to those of a content that never had it
    """
    features = {"test_key": "test_value"}

    shared_field = RepresentationContainer()
    shared_field.append(FeaturesBagField(features), "test_1")

    reference_content = Content("001")
    reference_content.append_field("test_field", shared_field)

    modified_content = Content("002")
    modified_content.append_field("test_field", shared_field)

    # extra field that is appended and immediately removed
    extra_field = RepresentationContainer()
    extra_field.append(FeaturesBagField(features), "test_1")
    modified_content.append_field("test_field2", extra_field)
    modified_content.remove_field("test_field2")

    self.assertEqual(reference_content.field_dict, modified_content.field_dict)
    self.assertEqual(reference_content.get_field("test_field"),
                     modified_content.get_field("test_field"))
def test_append_remove_exo(self):
    """
    Tests for append, remove and get methods of the content's exogenous instances: an exogenous
    representation appended and then removed must leave the container equal to the one of a
    content that never had it
    """
    exo_features = {"test_key": "test_value"}
    shared_exo_repr = PropertiesDict(exo_features)

    reference_content = Content("001")
    reference_content.append_exogenous(shared_exo_repr, "test_exo")

    modified_content = Content("002")
    modified_content.append_exogenous(shared_exo_repr, "test_exo")

    # extra representation that is appended and immediately removed
    extra_exo_repr = PropertiesDict(exo_features)
    modified_content.append_exogenous(extra_exo_repr, "test_exo2")
    modified_content.remove_exogenous("test_exo2")

    self.assertEqual(reference_content.exogenous_rep_container,
                     modified_content.exogenous_rep_container)
    self.assertEqual(reference_content.get_exogenous("test_exo"),
                     modified_content.get_exogenous("test_exo"))
def test_append_remove_field_repr(self):
    """
    Tests for append, remove and get methods of the content's fields' representation instances
    """
    features = {"test_key": "test_value"}

    # content that starts with one field holding a single representation
    first_repr = FeaturesBagField(features)
    initial_field = RepresentationContainer(first_repr, "test_1")
    content = Content("001")
    content.append_field("test_field", initial_field)

    # a representation is appended under a field name not appended before
    second_repr = FeaturesBagField(features)
    content.append_field_representation("test_field_2", second_repr, "test_2")

    retrieved_repr = content.get_field_representation("test_field_2", "test_2")
    self.assertEqual(retrieved_repr, second_repr)
    self.assertEqual(len(content.get_field("test_field_2")), 1)

    # after the removal the field is expected to hold no representation
    content.remove_field_representation("test_field_2", "test_2")
    self.assertEqual(len(content.get_field("test_field_2")), 0)
def create_contents(self) -> List[Content]:
    """
    Creates the contents based on the information defined in the Content Analyzer's config

    The method works in three steps:
        1. one content is created for each raw content of the source, together with its exogenous
           representations
        2. for every field, each field config produces one representation per content; when the field
           config has a memory interface, the contents receive an IndexField pointing to the data
           instead of the produced representation
        3. the representations to be indexed are written into the memory interfaces, one entry per
           content, and the memory interfaces kept by the producer are cleared

    Returns:
        contents_list (List[Content]): list of contents created by the method

    Raises:
        Exception: if no config has been set
    """
    if self.__config is None:
        raise Exception("You must set a config with set_config()")

    # contents created so far, returned at the end of the method
    contents_list = []

    for content_number, raw_content in enumerate(self.__config.source, start=1):
        # ids and exogenous representations are gathered in plain lists and handed to the container
        # in one go, instead of appending to the content one representation at a time
        exo_config_names = []
        exo_properties = []

        exo_configs = enumerate(self.__config.exogenous_representation_list, start=1)
        for exo_number, exo_config in exo_configs:
            logger.info("Processing exogenous config %d for content %d" % (exo_number, content_number))
            exo_properties.append(exo_config.exogenous_technique.get_properties(raw_content))
            exo_config_names.append(exo_config.id)

        # the id is built from the list of fields that compose it
        content_id = id_merger(raw_content, self.__config.id)
        exo_container = RepresentationContainer(exo_properties, exo_config_names)
        contents_list.append(Content(content_id, exogenous_rep_container=exo_container))

    # representations that must be stored in an index, in the form:
    # { memory_interface: {'Plot#0': [FieldRepr for content1, FieldRepr for content2, ...]}}
    # the number after the field name is the representation number for that field, since several
    # representations of the same field can be stored in the index
    index_representations_dict = {}

    for field_name in self.__config.get_field_name_list():
        logger.info("Processing field: %s", field_name)

        # one list of per-content representations for each field config of this field
        per_config_results = []
        # ids of the field configs, aligned with per_config_results
        field_config_ids = []

        for repr_number, field_config in enumerate(self.__config.get_configs_list(field_name)):
            field_config_ids.append(field_config.id)

            # list of representations produced by the technique, one per content:
            # technique_result[0] -> contents_list[0]
            technique_result = field_config.content_technique.produce_content(
                field_name, field_config.preprocessing, self.__config.source)

            configured_interface = field_config.memory_interface
            if configured_interface is None:
                # nothing to index: the contents receive the produced representations directly
                per_config_results.append(technique_result)
                continue

            # a memory interface not yet known to the producer is registered under its directory;
            # otherwise the one already registered for that directory is used instead
            if configured_interface in self.__memory_interfaces.values():
                memory_interface = self.__memory_interfaces[configured_interface.directory]
            else:
                self.__memory_interfaces[configured_interface.directory] = configured_interface
                index_representations_dict[configured_interface] = {}
                memory_interface = configured_interface

            index_field_name = (
                "{}#{}".format(field_name, str(repr_number))
                if field_config.id is None
                else "{}#{}#{}".format(field_name, str(repr_number), field_config.id)
            )

            index_representations_dict[memory_interface][index_field_name] = technique_result

            # each content gets an IndexField that carries the index field name, the content
            # position and the memory interface
            per_config_results.append([
                IndexField(index_field_name, position, memory_interface)
                for position in range(len(contents_list))
            ])

        # every content receives its own representation from each field config
        for position, content in enumerate(contents_list):
            content_field_representations = [
                representations[position] for representations in per_config_results
            ]
            content.append_field(
                field_name,
                RepresentationContainer(content_field_representations, field_config_ids))

    # once the contents are created, the data to index is written in the memory interfaces:
    # one entry per content, in the form {"content_id": id, "Plot#0": "...", "Plot#1": "...", ...}
    # (with no registered memory interface the loop simply does not run)
    for memory_interface in self.__memory_interfaces.values():
        memory_interface.init_writing(True)
        for position, content in enumerate(contents_list):
            memory_interface.new_content()
            memory_interface.new_field("content_id", content.content_id)
            for index_field_name, representations in index_representations_dict[memory_interface].items():
                memory_interface.new_field(index_field_name, str(representations[position].value))
            memory_interface.serialize_content()
        memory_interface.stop_writing()

    self.__memory_interfaces.clear()

    return contents_list
def test_append_remove_exo(self):
    """
    Tests for append, remove and get methods of the content's exogenous instances, including the
    append of a list of representations with and without explicit ids
    """
    exo_features = dict()
    exo_features["test_key"] = "test_value"
    content_exo_repr = PropertiesDict(exo_features)
    content_exo_repr2 = PropertiesDict({"test_key2": 'test_value2'})

    content1 = Content("001")
    content1.append_exogenous_representation(content_exo_repr, "test_exo")

    content2 = Content("002")
    content2.append_exogenous_representation(content_exo_repr, "test_exo")

    content_exo_repr = PropertiesDict(exo_features)
    content2.append_exogenous_representation(content_exo_repr, "test_exo2")
    content2.remove_exogenous_representation("test_exo2")

    self.assertEqual(content1.exogenous_rep_container, content2.exogenous_rep_container)
    self.assertEqual(content1.get_exogenous_representation("test_exo"),
                     content2.get_exogenous_representation("test_exo"))

    # test append list of representations
    content3 = Content("003")
    content3.append_exogenous_representation([content_exo_repr, content_exo_repr2], ["id1", "id2"])

    self.assertEqual(len(content3.exogenous_rep_container), 2)
    self.assertEqual(content3.get_exogenous_representation("id1").value, content_exo_repr.value)
    self.assertEqual(content3.get_exogenous_representation("id2").value, content_exo_repr2.value)

    # test append list of representations without id
    # the assertions target content4: checking content3 here would leave the id-less append unverified
    content4 = Content("004")
    content4.append_exogenous_representation([content_exo_repr, content_exo_repr2])

    self.assertEqual(len(content4.exogenous_rep_container), 2)
    self.assertEqual(content4.get_exogenous_representation(0).value, content_exo_repr.value)
    self.assertEqual(content4.get_exogenous_representation(1).value, content_exo_repr2.value)
def test_append_remove_field_repr(self):
    """
    Tests for append, remove and get methods of the content's fields' representation instances,
    covering single representations and lists of representations with and without explicit ids
    """
    features = {"test_key": "test_value"}

    # content that starts with one field holding a single representation
    initial_repr = FeaturesBagField(features)
    initial_field = RepresentationContainer(initial_repr, "test_1")
    content = Content("001")
    content.append_field("test_field", initial_field)

    # single representation appended under a field name not appended before
    single_repr = FeaturesBagField(features)
    content.append_field_representation("test_field_2", single_repr, "test_2")

    self.assertEqual(content.get_field_representation("test_field_2", "test_2"), single_repr)
    self.assertEqual(len(content.get_field("test_field_2")), 1)

    # list of representations appended with explicit ids
    named_first = FeaturesBagField(features)
    named_second = FeaturesBagField(features)
    content.append_field_representation(
        "test_field_3", [named_first, named_second], ["test_3_first", "test_3_second"])

    self.assertEqual(content.get_field_representation("test_field_3", "test_3_first"), named_first)
    self.assertEqual(content.get_field_representation("test_field_3", "test_3_second"), named_second)
    self.assertEqual(len(content.get_field("test_field_3")), 2)

    # list of representations appended without ids: they are retrieved by position
    unnamed_first = FeaturesBagField(features)
    unnamed_second = FeaturesBagField(features)
    content.append_field_representation("test_field_4", [unnamed_first, unnamed_second])

    self.assertEqual(content.get_field_representation("test_field_4", 0), unnamed_first)
    self.assertEqual(content.get_field_representation("test_field_4", 1), unnamed_second)
    self.assertEqual(len(content.get_field("test_field_4")), 2)

    # removal of the only representation of a field
    content.remove_field_representation("test_field_2", "test_2")
    self.assertEqual(len(content.get_field("test_field_2")), 0)