Example #1
    def test(self, model_path=''):
        print('Test model')

        try:
            print('*** Load pre-trained model ' + model_path + ' ***')
            self.model = load_checkpoint(self.model, model_path)
        except ValueError as e:
            print('Error while loading the model.', e)

        print('\nTest all')
        # acc = np.mean(self.accuracies)
        # acc = self.accuracies
        graphs = self.data[GRAPH]
        labels = self.labels
        self.run_test(graphs, labels)

        print('\nTest on train graphs')
        graphs = load_pickle(os.path.join(self.odir, 'train'))
        labels = load_pickle(os.path.join(self.odir, 'train_labels'))
        self.run_test(graphs, labels)

        print('\nTest on test graphs')
        graphs = load_pickle(os.path.join(self.odir, 'test'))
        labels = load_pickle(os.path.join(self.odir, 'test_labels'))
        self.run_test(graphs, labels)
Example #2
    def test(self, model_path=''):
        print('[app][test] Test model')

        try:
            print('*** [app][test] Load pre-trained model ' + model_path +
                  ' ***')
            self.model = load_checkpoint(self.model, model_path, self.is_cuda)
        except ValueError as e:
            print('[app][test] Error while loading the model.', e)

        self.save_traintest()

        # print('\n[app][test] Test all')
        # # acc = np.mean(self.accuracies)
        # # acc = self.accuracies
        # graphs = self.data[GRAPH]
        # labels = self.labels
        # self.run_test(graphs, labels)

        graphs = load_pickle(os.path.join(self.odir, 'train'))
        labels = load_pickle(os.path.join(self.odir, 'train_labels'))
        print('\n[app][test] Test on train graphs ({})'.format(len(labels)),
              os.path.join(self.odir, 'train'))
        self.run_test_fold(graphs, labels, fold=300)

        graphs = load_pickle(os.path.join(self.odir, 'test'))
        labels = load_pickle(os.path.join(self.odir, 'test_labels'))
        print('\n[app][test] Test on test graphs ({})'.format(len(labels)),
              os.path.join(self.odir, 'test'))
        self.run_test_fold(graphs, labels, fold=150)
Example #3
 def load_and_merge(self, train_df, test_df, start):
     train_feat = utils.load_pickle(self.train_path)
     test_feat = utils.load_pickle(self.test_path)
     # switch the concat method based on the number of columns (for speed)
     if train_df.shape[1] > 100:
         train_df = utils.fast_concat(train_df, train_feat)
         test_df = utils.fast_concat(test_df, test_feat)
     else:
         train_df = pd.concat([train_df, train_feat], axis=1)
         test_df = pd.concat([test_df, test_feat], axis=1)
     logging.info("load complete ... {:.1f}s".format(time.time() - start))
     return train_df, test_df
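
The helper `utils.fast_concat` used above is not shown in the snippet; a hedged sketch of a column-wise concat that skips pandas index alignment could look like the following (the body is an assumption, not the project's actual implementation):

import numpy as np
import pandas as pd


def fast_concat(left, right):
    # Hypothetical sketch: join two frames column-wise by stacking the
    # underlying numpy arrays, assuming both share the same row order and
    # length. This avoids the index-alignment overhead of pd.concat(axis=1).
    return pd.DataFrame(
        np.hstack([left.values, right.values]),
        columns=list(left.columns) + list(right.columns),
    )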
Example #4
def extract_feature_groups(corpus, narration_feature_dirs=None):
    group_indices = {
        'i3d': (0, 1024),
        'resnet': (1024, 3072),
        'audio': (3072, 3200),
    }
    n_instances = len(corpus)
    grouped = defaultdict(dict)
    last_task = None
    task_feats = None
    for idx in range(n_instances):
        instance = corpus._get_by_index(idx)
        video_name = instance['video_name']
        features = instance['features']
        for group, (start, end) in group_indices.items():
            grouped[group][video_name] = features[:, start:end]
        if narration_feature_dirs is not None:
            task = instance['task_name']
            if last_task != task:
                task_data = [
                    load_pickle(
                        os.path.join(dir,
                                     'crosstask_narr_{}.pkl'.format(task)))
                    for dir in narration_feature_dirs
                ]
                task_feats = {
                    datum['video']: datum['narration']
                    for data in task_data for datum in data
                }
            grouped['narration'][video_name] = task_feats[video_name]
            last_task = task
    return grouped
Example #5
def extract_pairs(input_mask, input_mynorm, output_path, threshold,
                  std_threshold, background) -> None:
    mask = load_pickle(input_mask)
    print_("Loaded mask.")

    data = DataProcessor(mask=mask)
    data.load_mynorm(mynorm_path=input_mynorm)
    print_("Loaded myNorm.")

    data.check_pairs()
    print_("Pairs checked.")

    data.set_statistics()
    print_("Stats calculated.")

    distance_frame = data.create_record()
    print_("Distance calculated.")

    print_(f"Background: {background}")

    if not background:
        distance_frame = DataProcessor.select(distance_frame=distance_frame,
                                              threshold=threshold)
        distance_frame = DataProcessor.var_filtering(
            distance_frame=distance_frame)

    export_to_csv(distance_frame, path=output_path)
    print_("Done")
Example #6
    def __init__(self, args):
        self.path = args.path + args.dataset
        self.method = args.method
        self.max_item_seq_length = args.max_item_seq_length
        self.load_embedding_flag = True if args.load_embedding_flag == 1 else False

        # load user_lists_dct, list_items_train, valid, test ========================
        self.user_lists_dct = utils.load_pickle(self.path +
                                                '.user_lists.ided.pkl')
        #(self.list_items_dct_no_valid,self.list_items_dct,self.validNegativesDict,
        # self.testNegativesDict)     = self.get_pickle_train_valid_test(self.path+'.list_items.train_valid_test.pkl')
        (self.list_items_dct, self.list_items_wv_dct, self.validNegativesDict,
         self.testNegativesDict) = self.get_pickle_train_valid_test(
             self.path + '.list_items.train_valid_test.pkl')

        ## change ===================
        ##self.validNegativesDict      = self.testNegativesDict ##to-be commented out
        self.list_items_dct_train = self.list_items_dct
        self.list_items_dct = self.list_items_wv_dct
        # ===========================

        self.list_user_dct = self.get_list_user_dct(
            self.user_lists_dct
        )  #user-embedding can be obtained using this(many-to-one)

        self.num_user, self.num_list, self.num_item = self.get_user_list_item_count(
            self.user_lists_dct, self.list_items_dct)
        self.list_user_vec = self.get_list_user_vec(
            self.list_user_dct, self.num_list
        )  #user-embedding can be obtained using this(many-to-one)

        # train valid test array-lists and sequence matrix ==========================
        self.trainArrTriplets = self.get_arraylist_from_train_dict(
            self.list_items_dct)
        self.validArrDubles = self.get_arraylist_from_valid_dict(
            self.validNegativesDict)
        self.testArrDubles = self.get_arraylist_from_valid_dict(
            self.testNegativesDict)
        #===

        self.train_matrix = self.get_train_matrix_sp(self.list_items_dct,
                                                     self.num_list,
                                                     self.num_item)
        self.train_matrix_item_seq = self.get_dct_mat_seq_remove_test(
            dct=self.list_items_dct,
            num_row=self.num_list,
            num_col=self.max_item_seq_length + 1,
            padding_value=0
        )  ##last_index :] for all, :-1] for remove test item, :-2] for removing test and valid
        self.train_matrix_item_seq_for_test = self.get_dct_mat_seq_for_test(
            dct=self.list_items_dct,
            num_row=self.num_list,
            num_col=self.max_item_seq_length,
            padding_value=0
        )  ##last_index :] for all, :-1] for remove test item, :-2] for removing test and valid
Example #7
    def try_to_recover(self):
        try:
            data = load_pickle(self.tmp_file_name)

            if data:
                print('Recovered')
                self.processed_sites = data
                for row in self.processed_sites:
                    print('Checked {}: {}'.format(
                        row['donor'],
                        ('success' if row['success'] else 'fail')))
                    next(self.acceptors)
                    self.processed_donors.append(row['donor'])

        except Exception:
            # No saved progress to recover; fall through and return False.
            pass

        return False
Example #8
    edge_df.to_pickle(f"../data/edge_angle/{molecule}.pkl")


if __name__ == "__main__":
    with utils.timer("make_feature_per_molecule"):
        for mode in ["train", "test"]:
            meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id")
            molecules = meta_df["molecule_name"].unique().tolist()
            st_df = pd.read_pickle("../pickle/structures.pkl")
            ## keep only the structures for the molecules in this split
            st_df = st_df[st_df.molecule_name.isin(molecules)]\
                    [["molecule_name","atom_index","atom","x","y","z"]]
            # process per molecule
            st_gr = st_df.groupby("molecule_name")
            st_dict = {}
            for molecule in tqdm(molecules):
                st_dict[molecule] = st_gr.get_group(molecule)
            all_file_num = len(molecules)
            with Pool(4) as p:
                res = p.map(make_per_molecule, molecules)
    with utils.timer("concatenate_molecules_feature"):
        for mode in ["train", "test"]:
            meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id")
            molecules = meta_df["molecule_name"].unique().tolist()
            df_list = []
            for molecule in tqdm(molecules):
                df_list.append(
                    utils.load_pickle(f"../data/edge_angle/{molecule}.pkl"))
            all_df = pd.concat(df_list).reset_index(drop=True)
            utils.save_pickle(all_df, f"../pickle/{mode}_edge_angle.pkl")
Example #9
                       force_save=force_save)
utils.save_pickle_safe(path_optimization + 'cv_results_time_series.pkl',
                       cv_results_time_series,
                       force_save=force_save)

utils.save_pickle_safe(path_optimization + 'model_infos.pkl',
                       model_infos,
                       force_save=force_save)

# Create manually the best_params
#dict[int]['best_params_']
#best_params = {'alpha': 0.2,'copy_X_train': True, 'kernel': None, 'n_restarts_optimizer': 0, 'normalize_y': True,
#               'optimizer': 'fmin_l_bfgs_b','random_state': None}
#best_params_time_series = {i : {'best_params':best_params} for i in np.arange(len(time_series))}

best_params_time_series = utils.load_pickle(path_optimization +
                                            'best_params_time_series.pkl')

# Loop fit and predict
preds = []
for idx, i in enumerate(
        tqdm(time_series,
             desc='Loop fit and predict over time series (using low memory)')):
    # Fit
    estimator = utils_regressor.fit_multioutput_regressor_uniseries_model(
        X_train,
        y_list_train[idx],
        estimator_choice,
        best_params_time_series[idx]['best_params_'],
        n_jobs=max(n_jobs, n_jobs_cv))
    # Predict
    X = utils_regressor.scaler_transform(X, scaler_X)
Example #10
def create_dataset(name='market1501', part='trainval', **kwargs):
    assert name in ['market1501', 'cuhk03', 'duke', 'combined'], \
        "Unsupported Dataset {}".format(name)

    assert part in ['trainval', 'train', 'val', 'test'], \
      "Unsupported Dataset Part {}".format(part)

    ########################################
    # Specify Directory and Partition File #
    ########################################

    if name == 'market1501':
        im_dir = ospeu('/scratch/group/atlas_prid/market1501/images')
        partition_file = ospeu(
            '/scratch/group/atlas_prid/market1501/partitions.pkl')

    ##################
    # Create Dataset #
    ##################

    # Use standard Market1501 CMC settings for all datasets here.
    cmc_kwargs = dict(separate_camera_set=False,
                      single_gallery_shot=False,
                      first_match_break=True)

    partitions = load_pickle(partition_file)
    im_names = partitions['{}_im_names'.format(part)]

    if part == 'trainval':
        ids2labels = partitions['trainval_ids2labels']

        ret_set = TrainSet(im_dir=im_dir,
                           im_names=im_names,
                           ids2labels=ids2labels,
                           **kwargs)
    elif part == 'val':
        marks = partitions['val_marks']
        kwargs.update(cmc_kwargs)

        ret_set = TestSet(im_dir=im_dir,
                          im_names=im_names,
                          marks=marks,
                          **kwargs)

    if part in ['trainval', 'train']:
        num_ids = len(ids2labels)
    elif part in ['val', 'test']:
        ids = [parse_im_name(n, 'id') for n in im_names]
        num_ids = len(list(set(ids)))
        num_query = np.sum(np.array(marks) == 0)
        num_gallery = np.sum(np.array(marks) == 1)
        num_multi_query = np.sum(np.array(marks) == 2)

    # Print dataset information
    print('-' * 40)
    print('{} {} set'.format(name, part))
    print('-' * 40)
    print('NO. Images: {}'.format(len(im_names)))
    print('NO. IDs: {}'.format(num_ids))

    try:
        print('NO. Query Images: {}'.format(num_query))
        print('NO. Gallery Images: {}'.format(num_gallery))
        print('NO. Multi-query Images: {}'.format(num_multi_query))
    except NameError:
        # num_query/num_gallery/num_multi_query exist only for val/test parts.
        pass

    print('-' * 40)

    return ret_set
Example #11
 def read_data(cls, input_file):
     return load_pickle(input_file)
Example #12
from utils.utils import load_pickle

if __name__ == '__main__':
    data = load_pickle('find_comments.pkl')
    processed_links = data['processed_links']
    links_to_process = data['links_to_process']
    links_with_comments_form = data['links_with_comments_form']
    processed_domains = data['processed_domains']

    with open('processed_links.txt', 'w') as file:
        for link in processed_links:
            file.write("{}\n".format(link))

    with open('links_to_process.txt', 'w') as file:
        for link in links_to_process:
            file.write("{}\n".format(link))

    with open('links_with_comments_form.txt', 'w') as file:
        for link in links_with_comments_form:
            file.write("{}\n".format(link))

    with open('processed_domains.txt', 'w') as file:
        for link in processed_domains:
            file.write("{}\n".format(link))
Example #13
def get_tokenizer():
    if not os.path.isfile(TOKENIZER_PATH):
        gen_tokenizer(TOKENIZER_PATH)
    tokenizer = load_pickle(TOKENIZER_PATH)
    return tokenizer
Example #14
 def get_pickle_train_valid_test(self, fname):
     return utils.load_pickle(fname)
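
Every example in this collection goes through a project-specific `load_pickle` (and sometimes `save_pickle`) utility; the exact signature varies by repository (Example #8 passes the object first, Example #16 the path first), but a minimal sketch of such helpers, assuming only the standard `pickle` module, is:

import pickle


def load_pickle(path):
    # Deserialize and return the object stored at `path`.
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_pickle(obj, path):
    # Serialize `obj` to `path`; the argument order here is an assumption.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)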
Example #15
def create_dataset(name='market1501', part='trainval', **kwargs):
    assert name in ['market1501', 'cuhk03', 'duke', 'combined'], \
      "Unsupported Dataset {}".format(name)

    assert part in ['trainval', 'train', 'val', 'test'], \
      "Unsupported Dataset Part {}".format(part)

    ########################################
    # Specify Directory and Partition File #
    ########################################

    if name == 'market1501':
        im_dir = ospeu('/home/mbaharan/shfs/TeCSAR/Datasets/market1501/images')
        partition_file = ospeu(
            '/home/mbaharan/shfs/TeCSAR/Datasets/market1501/partitions.pkl')

    elif name == 'cuhk03':
        im_type = ['detected', 'labeled'][0]
        im_dir = ospeu(
            ospj('/home/mbaharan/shfs/TeCSAR/Datasets/CUHK03', im_type,
                 'images'))
        partition_file = ospeu(
            ospj('/home/mbaharan/shfs/TeCSAR/Datasets/CUHK03', im_type,
                 'partitions.pkl'))

    elif name == 'duke':
        im_dir = ospeu(
            '/home/mbaharan/shfs/TeCSAR/Datasets/DukeMTMC-reID/images')
        partition_file = ospeu(
            '/home/mbaharan/shfs/TeCSAR/Datasets/DukeMTMC-reID/partitions.pkl')

    elif name == 'combined':
        assert part in ['trainval'], \
          "Only trainval part of the combined dataset is available now."
        im_dir = ospeu(
            '/home/mbaharan/shfs/TeCSAR/Datasets/MixDataset_Duke_Market_CUHK03/trainval_images'
        )
        partition_file = ospeu(
            '/home/mbaharan/shfs/TeCSAR/Datasets/MixDataset_Duke_Market_CUHK03/partitions.pkl'
        )

    ##################
    # Create Dataset #
    ##################

    # Use standard Market1501 CMC settings for all datasets here.
    cmc_kwargs = dict(separate_camera_set=False,
                      single_gallery_shot=False,
                      first_match_break=True)

    partitions = load_pickle(partition_file)
    im_names = partitions['{}_im_names'.format(part)]

    if part == 'trainval':
        ids2labels = partitions['trainval_ids2labels']

        ret_set = TrainSet(im_dir=im_dir,
                           im_names=im_names,
                           ids2labels=ids2labels,
                           **kwargs)

    elif part == 'train':
        ids2labels = partitions['train_ids2labels']

        ret_set = TrainSet(im_dir=im_dir,
                           im_names=im_names,
                           ids2labels=ids2labels,
                           **kwargs)

    elif part == 'val':
        marks = partitions['val_marks']
        kwargs.update(cmc_kwargs)

        ret_set = TestSet(im_dir=im_dir,
                          im_names=im_names,
                          marks=marks,
                          **kwargs)

    elif part == 'test':
        marks = partitions['test_marks']
        kwargs.update(cmc_kwargs)

        ret_set = TestSet(im_dir=im_dir,
                          im_names=im_names,
                          marks=marks,
                          **kwargs)

    if part in ['trainval', 'train']:
        num_ids = len(ids2labels)
    elif part in ['val', 'test']:
        ids = [parse_im_name(n, 'id') for n in im_names]
        num_ids = len(list(set(ids)))
        num_query = np.sum(np.array(marks) == 0)
        num_gallery = np.sum(np.array(marks) == 1)
        num_multi_query = np.sum(np.array(marks) == 2)

    # Print dataset information
    print('-' * 40)
    print('{} {} set'.format(name, part))
    print('-' * 40)
    print('NO. Images: {}'.format(len(im_names)))
    print('NO. IDs: {}'.format(num_ids))

    try:
        print('NO. Query Images: {}'.format(num_query))
        print('NO. Gallery Images: {}'.format(num_gallery))
        print('NO. Multi-query Images: {}'.format(num_multi_query))
    except NameError:
        # num_query/num_gallery/num_multi_query exist only for val/test parts.
        pass

    print('-' * 40)

    return ret_set
Example #16
        "Saved Data, links_with_comments_form: {}, processed_links: {}, links_to_process: {}, processed_domains: {}"
        .format(len(links_with_comments_form), len(processed_links),
                len(links_to_process), len(processed_domains)))
    save_pickle(tmp_file, stage)

    with open('domains.txt', 'w') as file:
        for link in processed_domains:
            file.write("{}\n".format(link))


if __name__ == '__main__':
    links_to_process += start_pages
    grequest_stack = []

    try:
        data = load_pickle(tmp_file)
        processed_links = data['processed_links']
        links_to_process = data['links_to_process']
        links_with_comments_form = data['links_with_comments_form']
        processed_domains = data['processed_domains']
    except Exception:
        pass

    while len(links_to_process):
        grequests_links()

        results = grequests.map(grequest_stack,
                                exception_handler=exception_handler,
                                size=20)

        save_data()
Example #17
from utils.utils import load_pickle
import torch

labels = torch.load('/media/tunguyen/Devs/Security/HAN_sec/data/adnew_iapi/pickle/labels')
labels_txt = load_pickle('/media/tunguyen/Devs/Security/HAN_sec/data/adnew_iapi/pickle/labels_txt')

labels_new = labels.clone()
for i, label in enumerate(labels):
    print(i, label)
    if label == 0:  # currently malware: change to 1
        labels_new[i] = 1
    if label == 1:  # currently benign: change to 0
        labels_new[i] = 0

torch.save(labels_new, '/media/tunguyen/Devs/Security/HAN_sec/data/adnew_iapi/pickle/labels_')
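
Since the labels here only take the values 0 and 1, the element-wise loop above can be collapsed into a single vectorized flip; a minimal sketch under that assumption:

import torch

labels = torch.load('/media/tunguyen/Devs/Security/HAN_sec/data/adnew_iapi/pickle/labels')
# Flip 0 <-> 1 in one operation; valid only while the label set is exactly {0, 1}.
labels_new = 1 - labels
torch.save(labels_new, '/media/tunguyen/Devs/Security/HAN_sec/data/adnew_iapi/pickle/labels_')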