def execute(self, window_start=None):
        """Build and store word-embedding difference features for each target pair.

        The word-vector dictionary for ``self._table_name`` is loaded from the
        database and kept on ``self._word_vector_dict``. Then, for every
        ``(target1, target2)`` pair in ``self._pairs_targets``:

        1. the records of both targets are fetched,
        2. for every id in the first target's records the two word-difference
           sets are computed,
        3. for every aggregation function the differential vectors are
           computed and turned into per-dimension features (from the
           subtraction vector) plus distance features.

        All features collected for a pair are saved with one
        ``add_author_features`` call.

        Feature extraction is best-effort: a failure for one
        (id, aggregation function) combination is logged with its traceback
        and skipped, so the remaining combinations are still processed.

        :param window_start: not used by this method; the window bounds are
            read from ``self._window_start`` / ``self._window_end``.
        """
        self._word_vector_dict = self._db.get_word_vector_dictionary(self._table_name)
        # Loop-invariant prefix passed to both feature builders.
        prefix = self.__class__.__name__ + '_'

        for target1, target2 in self._pairs_targets:
            features = []
            target1_tuples = self._get_records_by_target_dict(target1)
            target2_tuples = self._get_records_by_target_dict(target2)
            for record_id in target1_tuples:
                dif_set1, dif_set2 = self.get_word_differences(record_id, target1_tuples, target2_tuples)

                for aggregation_function in self._aggregation_functions:
                    try:
                        dif1_word_embedding, dif2_word_embedding, subtraction_vec = self._get_differential_vectors(
                            aggregation_function, dif_set1, dif_set2)

                        feature_name = self._get_feature_names(target1, target2, aggregation_function)

                        # extend() appends in place instead of copying the
                        # whole accumulated list on every step.
                        features.extend(Vector_Operations.create_author_feature_for_each_dimention(
                            subtraction_vec, feature_name, record_id, self._window_start, self._window_end,
                            prefix))
                        features.extend(self.create_distance_features(record_id, aggregation_function,
                                                                      dif1_word_embedding,
                                                                      dif2_word_embedding, target1, target2,
                                                                      prefix))
                    except Exception:
                        # Keep going, but record the full traceback so the
                        # failure is visible and can be diagnosed.
                        logging.exception(
                            "Failed to build features for id %s (targets %s/%s, aggregation %s)",
                            record_id, target1, target2, aggregation_function)
            self._db.add_author_features(features)
Example #2
0
    def execute(self, window_start=None):
        """Train a doc2vec model per targeted-fields entry and store its vectors as features.

        For every entry of ``self._targeted_fields_for_embedding``:

        1. the target elements are collected per source id and turned into
           documents,
        2. a doc2vec model is trained on those documents,
        3. for every source id the model entry ``model[source_id]`` is split
           into one feature per dimension, and the features are saved.

        Features are buffered and flushed to the database (followed by a
        commit) every ``self._max_objects_without_saving`` source ids, and
        once more after the last source id. The buffer is emptied after each
        flush so that every feature is submitted exactly once.

        :param window_start: not used by this method; the window bounds are
            read from ``self._window_start`` / ``self._window_end``.
        """
        for targeted_fields_dict in self._targeted_fields_for_embedding:
            source_id_target_elements_dict = self._get_source_id_target_elements(
                targeted_fields_dict)
            source_id_document_tuples = self._create_documents(
                source_id_target_elements_dict)

            print("\rStarting training doc2vec", end='')
            model = self._train_doc2vec_model(source_id_document_tuples)
            print("\rFinishing training doc2vec", end='')

            targeted_table = targeted_fields_dict['source']['table_name']
            targeted_field_name = targeted_fields_dict['destination']['table_name'] + "-" \
                                  + targeted_fields_dict['destination']['target_field']

            model_type = "{0}_dimensions_{1}_window_size".format(
                self._num_of_dimensions, self._window_size_doc2vec)
            # The feature name does not depend on the source id, so build it once.
            feature_name = model_type + '_' + targeted_table + "_" + targeted_field_name

            source_ids = source_id_target_elements_dict.keys()
            total_source_ids = len(source_ids)
            authors_features = []
            for i, source_id in enumerate(source_ids):
                msg = "\rExtracting doc2vec features: {0}/{1}:{2}".format(
                    i, total_source_ids, source_id)
                print(msg, end="")

                # Periodic flush; i + 1 is the 1-based count of ids seen so far.
                if (i + 1) % self._max_objects_without_saving == 0:
                    self._db.add_author_features(authors_features)
                    self._db.session.commit()
                    # Start a fresh buffer: without this, everything already
                    # saved would be sent again on each later flush.
                    authors_features = []

                source_id_vector = model[source_id]
                authors_features.extend(
                    Vector_Operations.create_author_feature_for_each_dimention(
                        source_id_vector, feature_name, source_id,
                        self._window_start, self._window_end, self._prefix))

            # Save whatever is left in the buffer since the last flush.
            self._db.add_author_features(authors_features)
            self._db.session.commit()