import os

import numpy as np
from sklearn.metrics import classification_report, f1_score, precision_score

def calculate_weight(self, x_test, y_test):
    """Score each trained model on the test set and store the scores as
    voting weights for the ensemble."""
    self.load_models()
    y_test = np.argmax(y_test, axis=1)  # one-hot -> class indices
    for model in self.list_model:
        y_pred = np.argmax(model.predict(x_test), axis=1)
        p = 0  # reset per model so a stale weight never carries over
        if self.w_model == 'f1':
            p = f1_score(y_test, y_pred, average='weighted')
        elif self.w_model == 'precision':
            p = precision_score(y_test, y_pred, average='weighted')
        elif self.w_model == 'f1_min':
            # Per-class report; weight by the F1 of the minority class.
            s = classification_report(y_test,
                                      y_pred,
                                      target_names=self.label,
                                      output_dict=True)
            p = float(s[self.min_label]['f1-score'])
        elif self.w_model == 'precision_min':
            s = classification_report(y_test,
                                      y_pred,
                                      target_names=self.label,
                                      output_dict=True)
            p = float(s[self.min_label]['precision'])
        if self.nb_dataset == 1 or self.w_model is None:
            p = 1
        if p == 0:
            p = 1e-11  # avoid a zero weight, which would drop the model
        self.list_weight.append(p)
    save_object(os.path.join(self.model_folder, "list_weight.pickle"),
                self.list_weight)
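
Every snippet on this page persists state through save_object / load_object, which the page never shows. A minimal sketch of what they plausibly do, assuming plain pickle serialisation; note the argument order varies between the projects sampled here (Examples #3 and #7 pass the object first), so this signature is inferred, not taken from any original:

import pickle

def save_object(path, obj):
    # Assumed helper: pickle `obj` to `path`.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_object(path):
    # Assumed counterpart: unpickle and return the object stored at `path`.
    with open(path, 'rb') as f:
        return pickle.load(f)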
Example #2
def get_dataset(self, train_skip=1):
    '''
    Creates the dataset of the wanted task from the given graph_nx. Wraps the
    function 'loader.load_task' for caching.
    Args:
        train_skip: int - subsampling step for the train data. If the graph
                          yields N possible train samples, only about
                          N // train_skip of them are kept. This is highly
                          important in large graphs.
    Returns:
        X: dict - with keys 'train', 'test'. Each value is a np.array of the
                  dataset, where each entry is a sample with the embeddings.
        y: dict - with keys 'train', 'test'. Each value is a np.array of the
                  dataset, where y[key][i] is the label of X[key][i] for the
                  given task.
    '''
    # Load the task data. The cache is always built at full resolution
    # (train_skip=1) so the same file can serve any later subsampling step.
    task_data_path = join(self.dump_folder,
                          f'{self.task}_dataset_{self.pivot_time}.data')
    if os.path.exists(task_data_path):
        X, y = load_object(task_data_path)
    else:
        X, y = loader.load_task(self.graph_nx,
                                self.task,
                                train_skip=1,
                                pivot_time=self.pivot_time,
                                test_size=self.test_size)
        save_object((X, y), task_data_path)

    # Subsample the train split by keeping every train_skip-th sample.
    X = {'train': X['train'][::train_skip], 'test': X['test']}
    y = {'train': y['train'][::train_skip], 'test': y['test']}

    return X, y
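
A quick illustration of the [::train_skip] subsampling above; a step slice keeps ceil(N / train_skip) elements, i.e. roughly N // train_skip:

import numpy as np

train = np.arange(10)
print(train[::3])  # -> [0 3 6 9], i.e. ceil(10 / 3) = 4 samples kept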
Example #3
def fit(self, label):
    # Map each label to its position so labels can be one-hot encoded later.
    self.label_to_int = {lab: j for j, lab in enumerate(label)}
    save_object(
        os.path.join(self.feature_extraction_folder,
                     "label_to_int.pickle"), self.label_to_int)

def find_optimal_cutoff(self, x_test, y_test):
    self.load_models()
    # Weighted average of the per-model predicted probabilities.
    y_pred = [model.predict(x_test) for model in self.list_model]
    y_pred = sum(p * w for p, w in zip(y_pred, self.list_weight)) / sum(
        self.list_weight)
    y_test = np.argmax(y_test, axis=1)
    # F1 is assumed here to return the probability cutoff that maximises the
    # F1 score on the positive-class scores.
    self.cutoff = F1(y_test, y_pred[:, 1])
    save_object(os.path.join(self.model_folder, "cutoff.pickle"),
                self.cutoff)
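
The weighted soft vote in find_optimal_cutoff, reduced to a standalone runnable sketch (toy probabilities and weights, purely illustrative):

import numpy as np

preds = [np.array([[0.2, 0.8]]), np.array([[0.6, 0.4]])]  # two models
weights = [2.0, 1.0]                                      # from calculate_weight
avg = sum(p * w for p, w in zip(preds, weights)) / sum(weights)
print(avg)  # -> [[0.333... 0.666...]]; column 1 feeds the cutoff search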
Example #5
def fit_transform(self, templates):
    if self.analyser is None:
        self.fit(templates)
    x = self.transform(templates)
    # Persist maxlen so inference-time padding matches training time.
    save_object(
        os.path.join(self.feature_extraction_folder, "maxlen.pickle"),
        self.maxlen)
    if self.config_dict["encoding"] == "embedding":
        if self.embedding_matrix is None:
            self.embedding_matrix = self.instantiate_embedding_matrix(
                templates)
        return x, self.embedding_matrix
    return x
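
fit_transform persists self.maxlen because later inputs must be padded to the same length as at training time. A dependency-free sketch of that padding step (the pad helper is hypothetical, not from the original code):

def pad(seq, maxlen, value=0):
    # Right-pad (or truncate) a token sequence to exactly maxlen entries.
    return (seq + [value] * maxlen)[:maxlen]

print(pad([5, 7], 4))  # -> [5, 7, 0, 0]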
Example #6
def calculate_pivot_time(self):
    '''
    Calculate the pivot time needed to split the edges into train and test
    sets with the requested 'time_split_ratio'.
    Returns:
        time step representing the pivot time step
    '''
    # Memoise ratio -> pivot mappings on disk; computing a pivot is costly.
    ratio2pivot = {}
    ratio2pivot_path = join(self.dump_folder, 'ratio2pivot.dict')
    if os.path.exists(ratio2pivot_path):
        ratio2pivot = load_object(ratio2pivot_path)
        if self.test_size in ratio2pivot:
            return ratio2pivot[self.test_size]
    pivot_time = get_pivot_time(self.graph_nx, self.test_size)
    ratio2pivot[self.test_size] = pivot_time
    save_object(ratio2pivot, ratio2pivot_path)
    return pivot_time
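
The same method, stripped of the class, shows the disk-memoisation pattern in isolation (file name and helper names are illustrative only):

import os
import pickle

def cached_pivot(ratio, compute, path='ratio2pivot.dict'):
    # Load the memo if a previous run saved one.
    memo = {}
    if os.path.exists(path):
        with open(path, 'rb') as f:
            memo = pickle.load(f)
    if ratio not in memo:
        memo[ratio] = compute(ratio)  # expensive step, done at most once
        with open(path, 'wb') as f:
            pickle.dump(memo, f)
    return memo[ratio]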
Example #7
def fit(self, templates):
    if self.vocabulary and self.analyser:
        return  # already fitted
    vectorizer = TfidfVectorizer(
        ngram_range=(self.config_dict["min_n_gram"],
                     self.config_dict["max_n_gram"]),
        lowercase=True,
        stop_words=None,
        min_df=1)
    vectorizer.fit(templates)
    self.analyser = vectorizer.build_analyzer()
    self.vocabulary = vectorizer.vocabulary_
    save_object(
        os.path.join(self.feature_extraction_folder,
                     "analyzer.pickle"), self.analyser)
    save_object(
        os.path.join(self.feature_extraction_folder,
                     "vocabulary.pickle"), self.vocabulary)
    # Track the longest tokenised template for later padding.
    inputs = self.feature_engineering(templates)
    self.maxlen = max(max(len(x) for x in inputs), self.maxlen)
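
A standalone look at what the fitted analyser and vocabulary contain (toy corpus; ngram_range fixed at (1, 2) for the demo):

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, min_df=1)
vec.fit(["open file", "close file"])
analyser = vec.build_analyzer()
print(analyser("open file"))    # -> ['open', 'file', 'open file']
print(sorted(vec.vocabulary_))  # keys of the token/ngram -> column mapping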
def fit(self,
        x_train,
        y_train,
        x_test,
        y_test,
        trainable=True,
        custom=True,
        batch_size=32,
        epochs=5):
    self.create_name_fig(trainable, custom)
    self.new_model()

    # Identify the classes, their order of first appearance, and the
    # minority class (the label with the fewest training samples).
    label, indexes, counts_elements = np.unique(y_train,
                                                return_index=True,
                                                return_counts=True)
    self.min_label = label[np.argmin(counts_elements)]
    self.label = [y_train[index] for index in sorted(indexes)]
    save_object(os.path.join(self.model_folder, "label.pickle"),
                self.label)
    save_object(os.path.join(self.model_folder, "min_label.pickle"),
                self.min_label)

    # Fit the text preprocessing on train only when a custom embedding is
    # learned, otherwise on the full corpus.
    self.feature_extraction = PreprocessingTemplate(
        self.config_dict, self.feature_extraction_folder)
    corpus = np.hstack((x_train, x_test))
    if self.config_dict["encoding"] == "embedding" and custom:
        _, self.embedding_matrix = self.feature_extraction.fit_transform(
            x_train)
    else:
        _ = self.feature_extraction.fit_transform(corpus)
    x_train = self.feature_extraction.transform(x_train)
    x_test = self.feature_extraction.transform(x_test)

    self.encoder = OneHotEncoding(self.feature_extraction_folder)
    self.encoder.fit(self.label)
    y_test = self.encoder.transform(y_test)

    self.n_out = len(label)
    if self.type_model == 'LSTM':
        params = {
            'units_size1': 256,
            'units_sizes': [128],
            'dense_size1': 500,
            'dense_size2': [300],
            'dropout': 0.004
        }
    else:
        params = {
            'filter_sizes': [1, 2],
            'nb_filter': 1024,
            'dense_size1': 400,
            'dense_size2': [250],
            'dropout': 0.001
        }
    model = Model(self.type_model,
                  vocabulary_size=len(self.feature_extraction.vocabulary),
                  embedding_dim=self.config_dict['embedding_dim'],
                  embedding_matrix=self.embedding_matrix,
                  trainable=trainable,
                  params=params,
                  filename=self.model_folder)

    # Train one model per resampled dataset, then derive the voting weights
    # (and, optionally, the F1-optimal decision cutoff) from the test set.
    for i in range(self.nb_dataset):
        x_temp, y_temp = generate_data(x_train, y_train, self.methode,
                                       self.threshold)
        y_temp = self.encoder.transform(y_temp)
        model.fit(x_train=x_temp,
                  y_train=y_temp,
                  x_test=x_test,
                  y_test=y_test,
                  batch_size=batch_size,
                  epochs=epochs,
                  i=i)
    self.calculate_weight(x_test=x_test, y_test=y_test)
    if self.cutoff == 'F1':
        self.find_optimal_cutoff(x_test=x_test, y_test=y_test)
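
Finally, how the minority class and the first-appearance label order are derived at the top of this fit, shown in isolation:

import numpy as np

y = np.array(['b', 'a', 'b', 'b', 'c', 'a'])
label, indexes, counts = np.unique(y, return_index=True, return_counts=True)
print(label[np.argmin(counts)])         # -> 'c', the rarest class
print([y[i] for i in sorted(indexes)])  # -> ['b', 'a', 'c'], first-appearance order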