Example #1
def parallel_computation(input_path, dataset, model):

    path = os.path.join(input_path, dataset, model)
    input_filenames = [f for f in listdir(path) if isfile(join(path, f))]

    number_of_files = len(input_filenames)
    n_threads = 2

    pbar_inner = tqdm(total=number_of_files)

    def pbar_update(result):
        pbar_inner.update()
        pbar_inner.set_postfix_str(result)

    # serial fallback for debugging:
    # for idx in range(number_of_files):
    #     sublevel_parallel_computation(input_path, dataset, model, idx)

    asyncResults = []
    with mp.Pool(n_threads) as innerPool:
        ColorPrint.print_green(
            f"Starting Pool with {n_threads} threads with {len(parallel_args)} tasks."
        )
        for idx in range(number_of_files):
            r = innerPool.apply_async(sublevel_parallel_computation,
                                      [input_path, dataset, model, idx],
                                      callback=pbar_update)
            asyncResults.append(r)
        for r in asyncResults:
            try:
                r.wait()
            except Exception:
                continue

    pbar_inner.close()
    return model, dataset
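
Example #1 assumes names defined elsewhere in the project (sublevel_parallel_computation, ColorPrint). Below is a minimal, self-contained sketch of the same pattern it uses (multiprocessing.Pool.apply_async with a tqdm progress-bar callback), where _square is a hypothetical stand-in for the real worker:

import multiprocessing as mp

from tqdm import tqdm


def _square(x):
    """Hypothetical stand-in for sublevel_parallel_computation."""
    return x * x


def run_pool(n_tasks: int, n_threads: int = 2):
    pbar = tqdm(total=n_tasks)

    def on_done(result):
        # runs in the parent process each time a task finishes
        pbar.update()
        pbar.set_postfix_str(str(result))

    with mp.Pool(n_threads) as pool:
        handles = [pool.apply_async(_square, (i,), callback=on_done)
                   for i in range(n_tasks)]
        results = [h.get() for h in handles]  # wait for and collect all results
    pbar.close()
    return results


if __name__ == '__main__':
    print(run_pool(10))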
Example #2
def main() -> None:
    args = parse_args()
    num_jobs, num_trials = int(args.cores[0]), int(args.trials[0])

    CP.print_green(
        f'Running infinity mirror on {num_jobs} cores for {num_trials} trials')
    # print(args)
    # exit(1)
    Parallel(n_jobs=num_jobs, backend='multiprocessing')(
        delayed(run_infinity_mirror)(trial=i + 1, args=args)
        for i in range(num_trials))

    return
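
Example #2 drives the trials with joblib's Parallel/delayed. A short usage sketch of that call pattern, with run_trial standing in (hypothetically) for run_infinity_mirror:

from joblib import Parallel, delayed


def run_trial(trial: int) -> int:
    """Hypothetical stand-in for run_infinity_mirror."""
    return trial * trial


# Run 8 trials on 4 worker processes; results come back in submission order.
results = Parallel(n_jobs=4, backend='multiprocessing')(
    delayed(run_trial)(trial=i + 1) for i in range(8))
print(results)  # [1, 4, 9, 16, 25, 36, 49, 64]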
Example #3
    def write_stats_jsons(self,
                          stats: Union[str, list],
                          overwrite: bool = False) -> None:
        """
        write the stats dictionary as a compressed json
        :return:
        """
        # standardize incoming type
        if isinstance(stats, str):
            stats = [stats]

        for statistic in stats:
            assert statistic in [
                method_name for method_name in dir(self)
                if callable(getattr(self, method_name))
                and not method_name.startswith('_')
            ]
            output_directory = get_imt_output_directory()

            file_output_directory = os.path.join(output_directory,
                                                 'graph_stats', self.dataset,
                                                 self.model, statistic)
            ensure_dir(file_output_directory, recursive=True)

            filename = os.path.join(
                output_directory, 'graph_stats', self.dataset, self.model,
                statistic, f'gs_{self.trial}_{self.iteration}.json.gz')

            # if the file already exists and the overwrite flag is not set, skip this statistic
            if not overwrite and verify_file(filename):
                CP.print_green(
                    f'Statistic: {statistic} output file for {self.model}-{self.dataset}-{self.trial} already exists. Skipping.'
                )
                continue

            data = None
            try:
                data = self[statistic]  # todo: maybe there's a better way?!
                save_zipped_json(data, filename)
                CP.print_blue(f'Stats json stored at {filename}')
            except Exception as e:
                CP.print_red(f'Exception occurred on {filename}!')
                CP.print_red(str(e))
                if statistic == 'netlsd' and data is not None:
                    # keep whatever was computed so the failure can be inspected later
                    save_zipped_json(data, filename + '.failed')
        return
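
Example #3 fetches each statistic with self[statistic], i.e. the object is indexable by method name. The class itself is not shown here; one plausible way such dispatch could be implemented (an assumption, not the project's actual code) is a __getitem__ that looks the method up and calls it:

class StatsContainer:
    """Hypothetical sketch: index the object with a statistic name to compute it."""

    def degree_dist(self):
        return {'degree': [1, 2, 3]}

    def __getitem__(self, name):
        attr = getattr(self, name, None)  # look the statistic up by method name
        if attr is None or not callable(attr) or name.startswith('_'):
            raise KeyError(name)
        return attr()  # compute and return the statistic


stats = StatsContainer()
print(stats['degree_dist'])  # {'degree': [1, 2, 3]}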
Example #4
                model = 'GCN_AE'
            if model == 'Linear':
                model = 'Linear_AE'
    #for dataset in datasets:
    #    for model in models:
    #        input_directory = f"/data/infinity-mirror/stats/pgd/"
    #        input_filenames = [input_directory + f for f in listdir(input_directory) if
    #                            isfile(join(input_directory, f)) and f'{dataset}_{model}_pgd_full' in f]
            graph_dists = defaultdict(defaultdict)
            #if len(input_filenames) != 1:
            #    ColorPrint.print_red(f'There is file inconsistency for {dataset} using {model} \n')
            #    exit()

            #filename = input_filenames[0]
            data = pd.read_csv(os.path.join(subdir, filename), sep="\t")
            ColorPrint.print_green(f'Loaded {filename}')
            data['trial'] = data['trial'].apply(
                lambda x: int(str(x).strip('.pkl.gz')))
            original_data = data.loc[(data.gen == 0) & (data.trial == 1)]
            original_data = original_data.drop(['model', 'gen', 'trial'],
                                               axis=1)
            original_data = original_data.to_numpy()[0]
            org_max = original_data.max()

            results = defaultdict(defaultdict)

            for chain_id in [x for x in data.trial.unique() if x != 1]:
                for gen_id in [
                        x
                        for x in data.loc[data.trial == chain_id].gen.unique()
                        if x != 0
Example #5
    def run(self, use_pickle: bool) -> None:
        """
        New runner - uses list of graphs
        :param use_pickle:
        :return:
        """
        pickle_ext = '.pkl.gz'
        self.graphs = []

        if use_pickle:
            if check_file_exists(self.graphs_pickle_path +
                                 pickle_ext):  # the whole pickle exists
                graphs = load_pickle(self.graphs_pickle_path + pickle_ext)
                #assert len(graphs) == 21, f'Expected 21 graphs, found {len(graphs)}'
                assert len(graphs) == self.num_generations + 1, \
                    f'Expected {self.num_generations + 1} graphs, found {len(graphs)}'
                CP.print_green(
                    f'Using completed pickle at {self.graphs_pickle_path + pickle_ext!r}. Loaded {len(graphs)} graphs'
                )
                return
            else:
                temp_file_pattern = re.compile(
                    rf'list_(\d+)_{self.trial}_temp_(\d+)\.pkl\.gz')
                dir_name = '/'.join(self.graphs_pickle_path.split('/')[:-1])

                input_files = [
                    f for f in os.listdir(dir_name)
                    if re.match(temp_file_pattern, f)
                ]
                if len(input_files) > 0:
                    assert len(input_files) == 1, \
                        f'More than one match found: {input_files}'

                    input_file = input_files[0]
                    total_generations, progress = map(
                        int,
                        temp_file_pattern.fullmatch(input_file).groups())
                    graphs = load_pickle(join(dir_name, input_file))
                    assert len(graphs) == progress + 1, \
                        f'Found {len(graphs)} graphs, expected {progress + 1}'
                    CP.print_blue(
                        f'Partial pickle found at {input_file!r} trial: {self.trial} progress: {progress}/{total_generations}'
                    )
                    self.graphs = graphs

        remaining_generations = self.num_generations - len(self.graphs)

        tqdm.write(
            f'Running Infinity Mirror on {self.initial_graph.name!r} {self.initial_graph.order(), self.initial_graph.size()} {self.model.model_name!r} {remaining_generations} generations'
        )
        pbar = tqdm(total=remaining_generations,
                    bar_format='{l_bar}{bar}|[{elapsed}<{remaining}]',
                    ncols=50)

        if len(self.graphs) == 0:
            self.initial_graph.level = 0
            self.graphs = [self.initial_graph]
            self.features = [None]

        completed_trial = False
        for i in range(len(self.graphs) - 1, self.num_generations):
            if i == len(self.graphs) - 1:
                curr_graph = self.graphs[-1]  # use the last graph

            level = i + 1
            try:
                fit_time_start = time.perf_counter()
                self.model.update(
                    new_input_graph=curr_graph)  # update the model
                fit_time = time.perf_counter() - fit_time_start
            except Exception as e:
                fit_time = np.nan
                print(f'Model fit failed {e}')
                break

            try:
                gen_time_start = time.perf_counter()
                generated_graphs = self.model.generate(
                    num_graphs=self.num_graphs,
                    gen_id=level)  # generate a new set of graphs
                gen_time = time.perf_counter() - gen_time_start
            except Exception as e:
                gen_time = np.nan
                print(f'Generation failed {e}')
                break

            if self.features:
                self.features.append(self.model.params)
            curr_graph = generated_graphs[
                0]  # we are only generating one graph
            curr_graph.name = f'{self.initial_graph.name}_{level}_{self.trial}'
            curr_graph.gen = level
            self.graphs.append(curr_graph)

            temp_pickle_path = self.graphs_pickle_path + f'_temp_{level}{pickle_ext}'
            prev_temp_pickle_path = self.graphs_pickle_path + f'_temp_{level-1}{pickle_ext}'

            temp_features_path = self.graphs_features_path + f'_temp_{level}{pickle_ext}'
            prev_temp_features_path = self.graphs_features_path + f'_temp_{level-1}{pickle_ext}'

            save_pickle(obj=self.graphs, path=temp_pickle_path)
            save_pickle(obj=self.features, path=temp_features_path)

            delete_files(prev_temp_pickle_path)
            delete_files(prev_temp_features_path)

            self.write_timing_csv(iter_=level,
                                  fit_time=fit_time,
                                  gen_time=gen_time)

            if level == self.num_generations:
                completed_trial = True
            pbar.update(1)
        pbar.close()

        if completed_trial:  # only delete the temp pickles if the trial finished successfully
            delete_files(temp_pickle_path)
            delete_files(temp_features_path)
            save_pickle(obj=self.graphs,
                        path=self.graphs_pickle_path + pickle_ext)
            save_pickle(obj=self.features,
                        path=self.graphs_features_path + pickle_ext)
            CP.print_green(
                f'List of {len(self.graphs)} Graphs is pickled at "{self.graphs_pickle_path + pickle_ext}"'
            )
        return
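
Example #5 relies on project helpers (save_pickle, load_pickle, delete_files) that are defined elsewhere. A minimal sketch of gzipped-pickle helpers with the same shape, assuming they are thin wrappers over pickle and gzip:

import gzip
import os
import pickle


def save_pickle(obj, path: str) -> None:
    """Write obj as a gzip-compressed pickle, creating parent directories if needed."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with gzip.open(path, 'wb') as f:
        pickle.dump(obj, f)


def load_pickle(path: str):
    """Read a gzip-compressed pickle back into memory."""
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)


def delete_files(*paths: str) -> None:
    """Remove the given files, ignoring ones that do not exist."""
    for p in paths:
        if os.path.exists(p):
            os.remove(p)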
Example #6
        'lambda_distance': 'laplacian_eigenvalues'
    }

    # datasets = ['clique-ring-500-4', 'eucore', 'flights', 'tree']

    models = ['Chung-Lu', 'CNRG', 'SBM', 'Erdos-Renyi', 'BUGGE', 'HRG']
    # models = ['BTER', 'BUGGE', 'Chung-Lu', 'CNRG', 'Erdos-Renyi', 'Kronecker', 'SBM', 'GCN_AE', 'Linear_AE']
    #stats = ['pagerank_js', 'degree_js', 'pgd_distance', 'netlsd_distance', 'lambda_distance', 'portrait_divergence']
    stats = ['degree_js', 'pagerank_js', 'lambda_distance']
    # datasets, models, trials, filenames = walker()
    datasets = ['cond-mat', 'enron']

    for dataset in datasets:
        for model in models:
            for stat in stats:
                ColorPrint.print_green(
                    f'computing {stat} distances for {dataset} {model}')
                trials = walker_texas_ranger(dataset,
                                             model,
                                             stat=implemented_metrics[stat],
                                             unique=True)
                args = [[dataset, model, trial, stat] for trial in trials]
                print(args[:5])
                # exit(-1)
                try:
                    results = parallel_async(distance_computation,
                                             args,
                                             num_workers=10)
                    df = pd.concat(results)
                except Exception as e:
                    ColorPrint.print_red(
                        f'Error, for {dataset!r} {model!r} {stat!r}')
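
Example #6 maps distance_computation over the argument lists with parallel_async(...), another project helper that is not shown. A plausible reconstruction (an assumption about its behavior, not the actual implementation) is a thin wrapper around multiprocessing.Pool:

import multiprocessing as mp


def parallel_async(func, args, num_workers: int = 10):
    """Hypothetical sketch: apply func to each argument list in a worker pool
    and return the collected results once every task has finished."""
    with mp.Pool(num_workers) as pool:
        handles = [pool.apply_async(func, arg) for arg in args]
        return [h.get() for h in handles]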