Example #1
def run_experiment_bio_orphanet(dataset_path: str,
                                raw_data=False,
                                figures=False):
    # sets the values of b5-b4 to consider (note that b4 is set to 0)
    values_b5 = [0.0, 0.25, 0.5, 0.75, 1, 2]
    kcfs = []
    # creation of the scoring schemes (the KCFs)
    for value_b5 in values_b5:
        kcfs.append(
            ScoringScheme([[0., 1., 1., 0., value_b5, 0.],
                           [1., 1., 0., value_b5, value_b5, 0]]))

    exp1 = ExperimentOrphanet(
        dataset_folder=dataset_path,
        # the kcfs to consider
        scoring_schemes=kcfs,
        # the top-k to consider
        top_k_to_test=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
        # algorithm to compute the consensus
        algo=get_algorithm(alg=Algorithm.ParCons,
                           parameters={
                               "bound_for_exact":
                               150,
                               "auxiliary_algorithm":
                               get_algorithm(alg=Algorithm.BioConsert)
                           }),
        # selects all the tuples of rankings with at least 100 elements and 3 rankings
        dataset_selector=DatasetSelector(nb_elem_min=100, nb_rankings_min=3),
    )

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final data; otherwise only the final data is displayed
    exp1.run(raw_data, figures=figures)
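
For illustration, a minimal sketch of how this experiment could be launched; the dataset path below is a placeholder, not taken from the original code:

# hypothetical invocation: dataset_path should point at the folder holding the ranking datasets
run_experiment_bio_orphanet("path/to/biological_dataset",
                            raw_data=True,
                            figures=True)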
Example #2
 def __init__(self,
              nb_years: int,
              nb_students_track1: int,
              nb_students_track2: int,
              nb_classes_total: int,
              nb_classes_track1: int,
              nb_classes_track2: int,
              mean_track1: float,
              variance_track1: float,
              mean_track2: float,
              variance_track2: float,
              topk: int,
              scoring_schemes: List[ScoringScheme],
              algo: MedianRanking = get_algorithm(Algorithm.ParCons, parameters={
                  "bound_for_exact": 150, "auxiliary_algorithm": get_algorithm(alg=Algorithm.BioConsert)}),
              ):
     super().__init__()
     self.__alg = algo
     self.__scoring_schemes = scoring_schemes
     self.__nb_years = nb_years
     self.__nb_students_track_1 = nb_students_track1
     self.__nb_students_track_2 = nb_students_track2
     self.__nb_classes_total = nb_classes_total
     self.__nb_classes_track_1 = nb_classes_track1
     self.__nb_classes_track_2 = nb_classes_track2
     self.__mean_track1 = mean_track1
     self.__variance_track1 = variance_track1
     self.__mean_track2 = mean_track2
     self.__variance_track2 = variance_track2
     self.__topk = topk
Example #3
 def __init__(
     self,
     dataset_folder: str,
     scoring_schemes_exp: List[ScoringScheme],
     changing_coeff: Tuple[int, int],
     intervals: List[Tuple[int, int]] = None,
     dataset_selector_exp: DatasetSelector = None,
 ):
     super().__init__(dataset_folder, dataset_selector_exp)
     self.__scoring_schemes = scoring_schemes_exp
     self.__alg = get_algorithm(alg=Algorithm.ParCons,
                                parameters={
                                    "bound_for_exact":
                                    0,
                                    "auxiliary_algorithm":
                                    get_algorithm(alg=Algorithm.AllTied)
                                })
     self.__changing_coeff = changing_coeff
     if intervals is not None:
         self.__intervals = intervals
     else:
         max_n = self.datasets[0].nb_elements
         min_n = self.datasets[0].nb_elements
         for dataset in self.datasets:
             if dataset.nb_elements > max_n:
                 max_n = dataset.nb_elements
             if dataset.nb_elements < min_n:
                 min_n = dataset.nb_elements
         self.__intervals = [(min_n, max_n)]
Example #4
def run_bench_time_alg_exacts_vldb(path_dataset: str,
                                   raw_data=False,
                                   figures=False):
    # get the scoring schemes (the KCFs)
    kcf1 = ScoringScheme.get_unifying_scoring_scheme_p(1.)
    kcf2 = ScoringScheme.get_extended_measure_scoring_scheme()
    kcf3 = ScoringScheme.get_induced_measure_scoring_scheme_p(1.)
    kcf4 = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)
    kcfs = [kcf1, kcf2, kcf3, kcf4]

    # optimize = optim1, preprocess = optim2
    ea = get_algorithm(alg=Algorithm.Exact,
                       parameters={
                           "optimize": False,
                           "preprocess": False
                       })
    ea_optim1 = get_algorithm(alg=Algorithm.Exact,
                              parameters={
                                  "optimize": True,
                                  "preprocess": False
                              })
    ea_optim1_optim2 = get_algorithm(alg=Algorithm.Exact,
                                     parameters={
                                         "optimize": True,
                                         "preprocess": True
                                     })
    algorithms_for_bench = [ea, ea_optim1, ea_optim1_optim2]
    # run experiment for each scoring scheme (KCF)
    for kcf in kcfs:
        bench = BenchTime(
            dataset_folder=path_dataset,
            # algorithms for the bench time
            algs=algorithms_for_bench,
            # the scoring scheme that is the kcf to consider
            scoring_scheme=kcf,
            # to select tuples of rankings with number of elements between 30 and 119 and at least 3 rankings
            dataset_selector_exp=DatasetSelector(nb_elem_min=30,
                                                 nb_elem_max=119,
                                                 nb_rankings_min=3),
            # range of size of datasets for the output
            steps=10,
            # re-compute the consensus until the total computation time exceeds 1 sec;
            # the average computation time is then returned
            repeat_time_computation_until=1.)

        # run the experiment and print the results. If raw_data is True, also print all the parameters of the
        # experiment (readme) and the raw data used to compute the final data; otherwise only the final data is displayed
        bench.run(raw_data, figures=figures)
Example #5
 def _run_raw_data(self) -> str:
     res = ""
     for dataset in self.datasets:
         frontiers = ParFront().compute_frontiers(dataset, self.__scoring_scheme)
         alg = get_algorithm(Algorithm.CopelandMethod)
         consensus = alg.compute_consensus_rankings(dataset, self.__scoring_scheme, True)
         res += str(frontiers.consistent_with(consensus)) + "\n"
     return res
Example #6
def run_experiment_students_vldb(raw_data=False, figures=False):
    # seed 1 is set for python and numpy
    random.seed(1)
    np.random.seed(1)
    # sets the values of b5-b4 to consider (note that b4 is set to 0)
    #values_b5 = [0., 0.25, 0.5, 0.75, 1., 2]
    values_b5 = [0.]
    kcfs = []
    # creation of the scoring schemes (the KCFs)
    for value_b5 in values_b5:
        kcfs.append(
            ScoringScheme([[0., 1., 1., 0., value_b5, 0.],
                           [1., 1., 0., value_b5, value_b5, 0]]))
    """"
    the parameters are all the ones detailled in the research paper. 100 student classes, each student class
    has 280 students from track 1 and 20 from track 2. In tract 1: choose uniformly 14 classes over 17 and in track
    2: choose uniformly 9 classes over the same 17. The marks obtained by students of track 1: N(10, 5*5) and by 
    students of track 2 : N(16, 4*4). Evaluation is made using top-20 of the consensuses
    """
    exp = MarksExperiment(
        # number of tuples of rankings to create
        nb_years=100,
        # number of students in track1
        nb_students_track1=280,
        # number of students in track2
        nb_students_track2=20,
        # number of classes the students can choose
        nb_classes_total=17,
        # number of classes the students of track1 choose (uniformly)
        nb_classes_track1=14,
        # number of classes the students of track2 choose (uniformly)
        nb_classes_track2=9,
        # mean marks for students in track1 for each class (normal distribution)
        mean_track1=10,
        # square of standard deviation of students marks in track1 for each class
        variance_track1=5,
        # mean marks for students in track2 for each class (normal distribution)
        mean_track2=16,
        # square of standard deviation of students marks in track2 for each class
        variance_track2=4,
        # top-k to consider for the experiment (comparison consensus and overall average)
        topk=20,
        # kcfs to consider
        scoring_schemes=kcfs,
        # algorithm to compute consensus
        algo=get_algorithm(Algorithm.CopelandMethod))

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final data; otherwise only the final data is displayed
    exp.run(raw_data, figures)
Example #7
 def __init__(self,
              dataset_folder: str,
              scoring_schemes: List[ScoringScheme],
              top_k_to_test: List[int],
              algo: MedianRanking = get_algorithm(Algorithm.ParCons, parameters={"bound_for_exact": 150}),
              dataset_selector: DatasetSelector = None,
              ):
     super().__init__(dataset_folder=dataset_folder, dataset_selector=dataset_selector)
     self.__orphanetParser = OrphanetParser.get_orpha_base_for_vldb(join_paths(get_parent_path(
                                                                             get_parent_path(dataset_folder)),
                                                                               "supplementary_data"))
     self.__algo = algo
     self.__remove_useless_datasets()
     self.__consensus = {}
     self.__scoring_schemes = scoring_schemes
     self.__top_k_to_test = top_k_to_test
Example #8
def run_bench_exact_optimized_scoring_scheme_vldb(path_dataset: str,
                                                  raw_data=False,
                                                  figures=False):
    # get the scoring schemes (the KCFs)
    kcf1 = ScoringScheme.get_unifying_scoring_scheme_p(1.)
    kcf2 = ScoringScheme.get_extended_measure_scoring_scheme()
    kcf3 = ScoringScheme.get_induced_measure_scoring_scheme_p(1.)
    kcf4 = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)
    kcfs = [kcf1, kcf2, kcf3, kcf4]

    # optimize = optim1, preprocess = optim2
    ea_optim1_optim2 = get_algorithm(alg=Algorithm.Exact,
                                     parameters={
                                         "optimize": True,
                                         "preprocess": True
                                     })

    # run experiment for each scoring scheme (KCF)
    bench = BenchScalabilityScoringScheme(
        dataset_folder=path_dataset,
        # the algorithm to consider
        alg=ea_optim1_optim2,
        # the kcfs to consider
        scoring_schemes=kcfs,
        # the dataset selector for selection according to the size
        dataset_selector_exp=DatasetSelector(nb_elem_min=130,
                                             nb_elem_max=300,
                                             nb_rankings_min=3),
        # range of size of datasets for the output
        steps=10,
        # maximum computation time allowed: for each kcf, the computation stops
        # as soon as the computation time for a tuple of rankings exceeds this value
        max_time=600,
        # re-compute the consensus until the total computation time exceeds this threshold;
        # here set to 0, so each consensus is computed only once
        repeat_time_computation_until=0)

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final data; otherwise only the final data is displayed
    bench.run(raw_data, figures)
Example #9
dataset = Dataset([[[1], [2, 3]], [[3, 1], [4]], [[1], [5], [3, 2]]])
# or d = Dataset.get_rankings_from_file(file_path), where file_path is the path to the file
# to import a list of datasets from the same folder: Dataset.get_rankings_from_folder(path_folder)

# print information about the dataset
print(dataset.description())
# choose your scoring scheme (or sc = ScoringScheme() for default scoring scheme)
sc = ScoringScheme([[0., 1., 1., 0., 1., 1.], [1., 1., 0., 1., 1., 0.]])

print("scoring scheme : " + str(sc))
# scoring scheme description
print(sc.description())

print("\n### Consensus computation ###\n")

algorithm = get_algorithm(alg=Algorithm.ParCons,
                          parameters={"bound_for_exact": 90})
# compute consensus ranking
consensus = algorithm.compute_consensus_rankings(
    dataset=dataset, scoring_scheme=sc, return_at_most_one_ranking=False)

print(consensus.description())

# if you want the consensus ranking only : print(consensus)
# to get the consensus rankings : consensus.consensus_rankings
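# for example, a minimal sketch iterating over the consensus rankings exposed above
for ranking in consensus.consensus_rankings:
    print(ranking)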

# list of rank aggregation algorithms to use, among BioConsert, ParCons, ExactAlgorithm, KwikSortRandom, RepeatChoice,
# PickAPerm, MedRank, BordaCount, BioCo, CopelandMethod

algorithms_to_execute = [
    get_algorithm(alg=Algorithm.KwikSortRandom),
    get_algorithm(alg=Algorithm.BioConsert,
Example #10
            for element in dataset.elements:
                h_gene_list_scores[element] = []
            shuffle(dataset.rankings)
            for i in to_test:
                dataset_new = Dataset(dataset.rankings[0:i])
                dataset_new.name = dataset.name
                consensus = self._algo.compute_consensus_rankings(dataset_new, self._scoring_scheme, True)
                copeland_scores = consensus.copeland_scores
                for element in dataset_new.elements:
                    cop_score_element = copeland_scores.get(element)
                    h_gene_list_scores[element].append(cop_score_element)
            for element in dataset.elements:
                res += dataset.name + ";" + str(element) + ";" + str(h_gene_list_scores[element]) + "\n"
        return res

algor = get_algorithm(Algorithm.CopelandMethod)
scoring_scheme_exp = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)

"""
rates_presence_min = [0.2]
ic_rates = [0.05]

for rate_presence_minimal in rates_presence_min:
    for ic_rate in ic_rates:
        print(ic_rate)
        print(rate_presence_minimal)
        b = BootstrapExperimentBiologicalIC(dataset_folder="/home/pierre/Bureau/vldb_data/datasets/biological_dataset",
                                          algo=algor,
                                          scoring_scheme=scoring_scheme_exp,
                                          nb_bootstrap=10000,
                                          dataset_selector=DatasetSelector(