def extract_hss(self, cols_count, lhs, rhs):
     """
     Build the list of lhs/rhs attribute combinations to be analysed.

     The result depends on which of the two command line parameters were supplied:

     - neither lhs nor rhs: every combination produced by ut.get_hs_combination(cols_count);
     - lhs without rhs: invalid, an error message is printed and the program exits with -1;
     - rhs without lhs: a single combination whose lhs is every column index except rhs[0];
       if rhs[0] is not a valid column index an error message is printed and the program
       exits with -1;
     - both lhs and rhs: a single combination made of the two lists exactly as given.

     Only rhs[0] is ever inspected: rhs is not expected to hold more than one index and its
     length is deliberately not checked here.

     :param cols_count: the number of columns of the dataset
     :type cols_count: int
     :param lhs: list of column indexes of the attributes positioned in the lhs
     :type lhs: list
     :param rhs: list holding the column index of the attribute positioned in the rhs
     :type rhs: list
     :return: list of dict, each with the key 'rhs' (the rhs index list) and the key 'lhs'
              (the lhs index list)
     :rtype: list
     """
     no_rhs = rhs == []
     no_lhs = lhs == []

     # Nothing specified: delegate to the helper that enumerates every combination.
     if no_rhs and no_lhs:
         return ut.get_hs_combination(cols_count)

     # An lhs without an rhs is not an accepted format.
     if no_rhs:
         print("You have to specify at least one RHS attribute")
         sys.exit(-1)

     # Both sides specified: use them as they are.
     if not no_lhs:
         return [{'rhs': rhs, 'lhs': lhs}]

     # Only the rhs specified: the lhs is made of all the other columns.
     all_indexes = list(range(cols_count))
     target = rhs[0]
     if target not in all_indexes:
         print("RHS index is out of bound. Specify a valid value")
         sys.exit(-1)
     other_indexes = all_indexes[:target] + all_indexes[target + 1:]
     return [{'rhs': rhs, 'lhs': other_indexes}]
예제 #2
0
 def test_something(self):
     """
     Run the algorithm defined in the class RFDDiscovery on each dataset found in the resources directory and on
     each combination of rhs and lhs returned by ut.get_hs_combination for that dataset. Every combination is
     executed ITERATION_TIME times. For each execution the method records:
         - the dataset's name;
         - the dataset rows' number;
         - number of column;
         - the dataset file's size;
         - the algorithm's elapsed time, in milliseconds;
         - the time spent building the difference matrix, in milliseconds (reported only on the first
           execution for a dataset, 0 afterwards);
         - the number of RFDs found;
         - the combination of rhs and lhs used for the iteration.
     A dataset whose separator/header cannot be detected is logged and skipped.
     When the test ends, all the information described above is saved in a CSV file named
     <date of test>-results-c.csv or <date of test>-results-p.csv, depending on the value returned by
     RFDDiscovery.is_compiled(), inside ../resources/test. During the test, some log information is printed.
     """
     logging.info("Starting test")
     result_df = pd.DataFrame(columns=cols)  # Data frame in which save results
     path = "../resources"  # path in which datasets are stored
     datasets = self.__load_all_files__(path)
     logging.info("All files loaded")
     # Default used for the output file name when no execution took place at all; without it the
     # file name formatting below would fail on an unbound name.
     compiled = False
     for ds in datasets:
         logging.info("Starting test for dataset {}".format(ds))
         current_ds = path + "/" + ds                                # path of the current dataset
         file_size = os.stat(current_ds).st_size                     # get file size
         logging.info("Checking separator and header for dataset {}".format(ds))
         try:
             c_sep, has_header = ut.check_sep_n_header(current_ds)
         except Exception:
             # logging.ERROR is the integer level constant and is not callable: log through
             # logging.exception (ERROR level plus traceback) and really skip this dataset,
             # since c_sep and has_header would otherwise be unbound or stale.
             logging.exception("Failed to load separator and header. Skipping test for {}".format(ds))
             continue
         logging.info("{} has separator '{}' and has {} header".format(ds, c_sep, "no" if has_header is None else ""))
         ds_shape = self.__get_ds_shape(current_ds, sep=c_sep, first_row_head=has_header)  # get df shape
         lhs_vs_rhs = ut.get_hs_combination(ds_shape['col'])     # combination for HS
         diff_matrix, elapsed_time_dist = self.__get_diff_mtx(c_sep, current_ds, has_header)
         for combination in lhs_vs_rhs:
             logging.info("Testing on combination: {}".format(str(combination)))
             dist_mtx = diff_matrix.split_sides(combination)
             for i in range(ITERATION_TIME):                         # repeat test X times
                 logging.info("Test no.{}".format(i))
                 start_time = time.time()                            # get t0
                 rfdd = RFDDiscovery(dist_mtx)
                 compiled = rfdd.is_compiled()
                 rfd_df = rfdd.get_rfds(rfdd.standard_algorithm, combination)
                 elapsed_time = time.time() - start_time             # get deltaT = now - t0
                 logging.info("RFDs discovery process finished")
                 rfd_count = rfd_df.shape[0]
                 logging.info("Discovered {} RFDs".format(rfd_count))
                 logging.info("Result added")
                 logging.info("Appending result to result's dataframe")
                 # append to result df (times are converted from seconds to milliseconds)
                 self.__append_result(ds, ds_shape['row'], ds_shape['col'], file_size, round(elapsed_time*1000,3),
                                      round(elapsed_time_dist*1000,3), rfd_count, str(combination), result_df)
                 # the difference matrix is built once per dataset: report its time only once
                 elapsed_time_dist = 0
         # drop the references to the matrices of this dataset to free unused memory
         diff_matrix = None
         dist_mtx = None
     logging.info("Saving file")
     abs_path = os.path.abspath("../resources/test/{}-results-{}.csv"
                                .format(time.strftime("%Y-%m-%d_%H-%M-%S"), "c" if compiled else "p"))
     result_df.to_csv(abs_path, sep=";", header=cols, decimal=',')
     logging.info("File saved")