def discover_rfds(self):
        """
        Discover the RFDs of the dataset for every rhs/lhs combination.

        The hand-side combinations are obtained from ``RFDExtractor.extract_hss`` using
        ``self.columns_count``; the distance matrix is built from ``self.path`` and
        ``self.separator``. The method stores its results on the instance:

        - ``self.distance_matrix`` / ``self.distance_df``: the distance matrix and its data frame;
        - ``self.rfd_data_frame_list``: one result of ``get_rfds`` per combination;
        - ``self.rfds``: the flattened list produced by ``Transformer.rfd_data_frame_to_rfd_list``.

        Finally the filtered RFDs are passed to ``self.__show_rfds``.
        """
        hand_sides_specifications = RFDExtractor.extract_hss(self.columns_count)

        matrix = DiffMatrix(path=self.path, sep=self.separator)
        self.distance_matrix = matrix
        self.distance_df: DataFrame = matrix.distance_df

        self.rfd_data_frame_list: list[DataFrame] = []
        frames = self.rfd_data_frame_list

        for hand_sides in hand_sides_specifications:
            # Each entry is a dictionary with the keys 'rhs' and 'lhs', whose values
            # are lists of attribute indexes. For example, with 4 attributes the
            # combinations are:
            #   {'rhs': [0], 'lhs': [1, 2, 3]}
            #   {'rhs': [1], 'lhs': [0, 2, 3]}
            #   {'rhs': [2], 'lhs': [0, 1, 3]}
            #   {'rhs': [3], 'lhs': [0, 1, 2]}
            side_matrix = self.distance_matrix.split_sides(hand_sides)
            finder = RFDDiscovery(side_matrix)
            frames.append(finder.get_rfds(finder.standard_algorithm, hand_sides))

        # Flatten the per-combination data frames into a single list of RFDs.
        collected = []
        self.rfds = collected
        for frame in frames:
            collected.extend(Transformer.rfd_data_frame_to_rfd_list(frame, self.header))

        self.__show_rfds(self.__filter_rfds())
Пример #2
0
def main(csv_file, post_metadata):
    """
    Run the standard RFD discovery algorithm on a CSV file with the parameters sent by the web gui.

    The returned dict contains:

    - ``mtxtime``: time spent building the distance matrix, formatted with two decimals;
    - ``result``: for each combination, the discovered RFDs as CSV text, keyed by the
      JSON-encoded named combination;
    - ``timing``: the formatted time spent on each combination;
    - ``total``: the formatted overall time.

    If building the distance matrix or processing a combination raises, the function returns
    immediately a dict with the single key ``error`` holding the ``__doc__`` of the exception.

    :param csv_file: valid path to a CSV file
    :type csv_file: str
    :param post_metadata: dict containing the user's parameters
    :type post_metadata: werkzeug.datastructures.ImmutableMultiDict
    :return: a dict containing the output of each combination or with an error message
    :rtype: dict
    """
    with Timer() as total:
        params = param_to_dict(post_metadata)
        matrix_options = dict(
            sep=params['separator'],
            semantic=params['semantic'],
            missing=params['missing'],
            datetime=params['datetime'],
        )
        # The header option is forwarded only when the user supplied it.
        if 'header' in params:
            matrix_options['first_col_header'] = params['header']

        with Timer() as mtxtime:
            try:
                matrix = DiffMatrix(csv_file, **matrix_options)
                labels = matrix.get_labels()
            except Exception as err:
                return {"error": str(err.__doc__)}

        cols_count = ut.get_cols_count(csv_file, params['separator'])
        hss = extract_hss(cols_count, params['lhs'], params['rhs'])

        response = dict()
        response['mtxtime'] = format(mtxtime.interval, ".2f")
        response['result'] = {}
        response['timing'] = []

        for combination in hss:
            with Timer() as comb_timer:
                try:
                    side_matrix = matrix.split_sides(combination)
                    discovery = RFDDiscovery(side_matrix)
                    found = discovery.get_rfds(discovery.standard_algorithm, combination)
                    # Move the column labelled 0 from the front to the end of the frame.
                    rhs_part = found[[0]]
                    lhs_part = found.drop([found.columns[0]], axis=1)
                    reordered = pnd.concat([lhs_part, rhs_part], axis=1)
                    csv_text = reordered.to_csv(sep=params['separator'])
                    key = json.dumps(name_combination(labels, combination))
                    response['result'][key] = csv_text
                except Exception as err:
                    return {"error": str(err.__doc__)}
            response['timing'].append(format(comb_timer.interval, ".2f"))

    response['total'] = format(total.interval, ".2f")
    return response
Пример #3
0
def main(args):
    """
    Start the rfd-discovery algorithm from the command line.

    The arguments are parsed by ``extract_args`` and validated by ``check_correctness``; when the
    validation raises ``getopt.GetoptError`` or ``AssertionError`` the usage message and the error
    are printed and the program exits with status 1. For every rhs/lhs combination the RFDs found
    by the standard algorithm are printed, in human readable form when requested.

    NOTE(review): ``hss`` is not assigned in this function; presumably it is a module-level
    variable filled in elsewhere (e.g. by ``extract_args``) -- confirm.

    :param args: list of parameters given as input
    :type args: list
    """
    c_sep, csv_file, has_header, semantic, has_dt, missing, index_col, human = extract_args(args)
    try:
        check_correctness(has_dt, hss, index_col)
    except (getopt.GetoptError, AssertionError) as err:
        usage()
        print(str(err))
        sys.exit(1)

    if hss is None:
        usage()
    if not isinstance(hss, list):
        return

    with ut.timeit_context("Whole time"):
        with ut.timeit_context("Distance time"):
            if not isinstance(has_header, int) or has_header:
                matrix = DiffMatrix(csv_file,
                                    sep=c_sep,
                                    first_col_header=has_header,
                                    semantic=semantic,
                                    index_col=index_col,
                                    missing=missing,
                                    datetime=has_dt)
            else:
                # has_header is an int equal to 0: do not pass first_col_header
                matrix = DiffMatrix(csv_file,
                                    sep=c_sep,
                                    index_col=index_col,
                                    semantic=semantic,
                                    missing=missing,
                                    datetime=has_dt)
        for hand_sides in hss:
            side_matrix = matrix.split_sides(hand_sides)
            with ut.timeit_context("RFD Discover time for {}".format(str(hand_sides))):
                discovery = RFDDiscovery(side_matrix)
                if human:
                    print(hand_sides)
                    emit = print_human
                else:
                    emit = print
                emit(discovery.get_rfds(discovery.standard_algorithm, hand_sides))
    def __init__(self, args, debug_mode=False) -> None:
        """
        Parse the arguments, load the CSV file and run the RFD discovery for every combination.

        The values returned by ``self.extract_args`` are stored on the instance, the CSV file is
        parsed with ``CSVParser`` and the arguments are validated with ``self.check_correctness``;
        when the validation raises ``getopt.GetoptError`` or ``AssertionError`` the usage message
        and the error are printed and the program exits with status 1. The result of ``get_rfds``
        for each combination is appended to ``self.rfd_data_frame_list``.

        :param args: list of parameters given as input
        :param debug_mode: when true, each combination and its result are printed
        """
        super().__init__()
        self.args = args
        self.debug_mode = debug_mode
        (self.separator_character, self.csv_file, self.has_header, self.semantic,
         self.has_date_time, self.missing, self.index_column, self.human_readable,
         self.half_sides_specifications) = self.extract_args(self.args)

        parser = CSVParser(self.csv_file)
        self.csv_parser = parser
        self.data_frame = parser.data_frame
        self.header = parser.header

        try:
            self.check_correctness(self.has_date_time, self.half_sides_specifications, self.index_column)
        except (getopt.GetoptError, AssertionError) as err:
            self.usage()
            print(str(err))
            sys.exit(1)

        if self.half_sides_specifications is None:
            self.usage()
            return
        if not isinstance(self.half_sides_specifications, list):
            return

        if not isinstance(self.has_header, int) or self.has_header:
            # has header
            self.distance_matrix = DiffMatrix(self.csv_file,
                                              sep=self.separator_character,
                                              first_col_header=self.has_header,
                                              semantic=self.semantic,
                                              index_col=self.index_column,
                                              missing=self.missing,
                                              datetime=self.has_date_time)
        else:
            # has_header is an int equal to 0: do not pass first_col_header
            self.distance_matrix = DiffMatrix(self.csv_file,
                                              sep=self.separator_character,
                                              index_col=self.index_column,
                                              semantic=self.semantic,
                                              missing=self.missing,
                                              datetime=self.has_date_time)

        self.rfd_data_frame_list = []
        for hand_sides in self.half_sides_specifications:
            # Each entry is a dictionary with the keys 'rhs' and 'lhs', whose values
            # are lists of attribute indexes. For example, with 4 attributes the
            # combinations are:
            #   {'rhs': [0], 'lhs': [1, 2, 3]}
            #   {'rhs': [1], 'lhs': [0, 2, 3]}
            #   {'rhs': [2], 'lhs': [0, 1, 3]}
            #   {'rhs': [3], 'lhs': [0, 1, 2]}
            side_matrix = self.distance_matrix.split_sides(hand_sides)
            finder = RFDDiscovery(side_matrix)
            latest = finder.get_rfds(finder.standard_algorithm, hand_sides)
            self.rfd_data_frame_list.append(latest)

            if not self.debug_mode:
                continue
            print("\nCombination:", hand_sides)
            if self.human_readable:
                self.print_human(latest)
            else:
                print(latest)
Пример #5
0
 def test_something(self):
     """
     Execute the algorithm defined in the class RFDDiscovery for each dataset in the directory resources and for
     each combination of rhs and lhs of them. For each execution of the algorithm, the method saves some information:
         - the dataset's name;
         - the dataset rows' number;
         - number of column;
         - the dataset file's size;
         - the algorithm's elapsed time (milliseconds);
         - the distance matrix's elapsed time (milliseconds), reported only on the first run of each dataset;
         - the number of RFDs found;
         - the combination of rhs and lhs used for the iteration.
     When the test ends, all the information described above is saved in a CSV file named
     <date of test>-results-c.csv or <date of test>-results-p.csv, depending on the value returned by
     RFDDiscovery.is_compiled(). During the test, some log information is printed.
     A dataset whose separator and header cannot be detected is logged as an error and skipped.
     """
     logging.info("Starting test")
     result_df = pd.DataFrame(columns=cols)  # Data frame in which save results
     path = "../resources"  # path in which datasets are stored
     datasets = self.__load_all_files__(path)
     logging.info("All files loaded")
     compiled = False  # keeps the output file name defined even if no dataset could be tested
     for ds in datasets:
         logging.info("Starting test for dataset {}".format(ds))
         current_ds = path + "/" + ds                                # path of the current dataset
         file_size = os.stat(current_ds).st_size                     # get file size
         logging.info("Checking separator and header for dataset {}".format(ds))
         try:
             c_sep, has_header = ut.check_sep_n_header(current_ds)
         except Exception:
             # logging.ERROR is the integer level constant; logging.error is the function that emits the record
             logging.error("Failed to load separator and header. Skipping test for {}".format(ds))
             # without separator and header the dataset cannot be processed: really skip it
             continue
         logging.info("{} has separator '{}' and has {} header".format(ds, c_sep, "no" if has_header is None else ""))
         ds_shape = self.__get_ds_shape(current_ds, sep=c_sep, first_row_head=has_header)  # get df shape
         lhs_vs_rhs = ut.get_hs_combination(ds_shape['col'])     # combination for HS
         diff_matrix, elapsed_time_dist = self.__get_diff_mtx(c_sep, current_ds, has_header)
         for combination in lhs_vs_rhs:
             logging.info("Testing on combination: {}".format(str(combination)))
             dist_mtx = diff_matrix.split_sides(combination)
             for i in range(ITERATION_TIME):                         # repeat test X times
                 logging.info("Test no.{}".format(i))
                 start_time = time.time()                            # get t0
                 rfdd = RFDDiscovery(dist_mtx)
                 compiled = rfdd.is_compiled()
                 rfd_df = rfdd.get_rfds(rfdd.standard_algorithm, combination)
                 elapsed_time = time.time() - start_time             # get deltaT = now - t0
                 logging.info("RFDs discovery process finished")
                 rfd_count = rfd_df.shape[0]
                 logging.info("Discovered {} RFDs".format(rfd_count))
                 logging.info("Result added")
                 logging.info("Appending result to result's dataframe")
                 # append to result df
                 self.__append_result(ds, ds_shape['row'], ds_shape['col'], file_size, round(elapsed_time*1000,3),
                                      round(elapsed_time_dist*1000,3), rfd_count, str(combination), result_df)
                 # the distance matrix is computed once per dataset: report its time on the first run only
                 elapsed_time_dist = 0
         diff_matrix = None  # drop the reference to the distance matrix to free unused memory
     logging.info("Saving file")
     abs_path = os.path.abspath("../resources/test/{}-results-{}.csv"
                                .format(time.strftime("%Y-%m-%d_%H-%M-%S"), "c" if compiled else "p"))
     result_df.to_csv(abs_path, sep=";", header=cols, decimal=',')
     logging.info("File saved")
Пример #6
0
def main(args):
    """
    Start the rfd-discovery algorithm from the command line, echoing the parsed arguments first.

    The arguments are parsed by ``extract_args``, printed, and validated by ``check_correctness``;
    when the validation raises ``getopt.GetoptError`` or ``AssertionError`` the usage message and
    the error are printed and the program exits with status 1. For every rhs/lhs combination the
    RFDs found by the standard algorithm are printed, in human readable form when requested.

    NOTE(review): ``half_sides_specifications`` is not assigned in this function; presumably it
    is a module-level variable filled in elsewhere (e.g. by ``extract_args``) -- confirm.

    :param args: list of parameters given as input
    :type args: list
    """
    (separator_character, csv_file, has_header, semantic,
     has_date_time, missing, index_column, human_readable) = extract_args(args)

    print("\nCommand-Line Arguments:")
    echoed = (
        ("-separator_character:", separator_character),
        ("-csv_file:", csv_file),
        ("-has_header:", has_header),
        ("-semantic:", semantic),
        ("-has_date_time:", has_date_time),
        ("-missing:", missing),
        ("-index_column:", index_column),
        ("-human_readable:", human_readable),
    )
    for label, value in echoed:
        print(label, value)
    print()

    try:
        check_correctness(has_date_time, half_sides_specifications, index_column)
    except (getopt.GetoptError, AssertionError) as err:
        usage()
        print(str(err))
        sys.exit(1)

    if half_sides_specifications is None:
        usage()
    if not isinstance(half_sides_specifications, list):
        return

    with ut.timeit_context("Whole time"):
        with ut.timeit_context("Distance time"):
            if not isinstance(has_header, int) or has_header:
                # has header
                matrix = DiffMatrix(csv_file,
                                    sep=separator_character,
                                    first_col_header=has_header,
                                    semantic=semantic,
                                    index_col=index_column,
                                    missing=missing,
                                    datetime=has_date_time)
            else:
                # has_header is an int equal to 0: do not pass first_col_header
                matrix = DiffMatrix(csv_file,
                                    sep=separator_character,
                                    index_col=index_column,
                                    semantic=semantic,
                                    missing=missing,
                                    datetime=has_date_time)
        for hand_sides in half_sides_specifications:
            side_matrix = matrix.split_sides(hand_sides)
            timer_label = "RFD Discover time for {}".format(str(hand_sides))
            with ut.timeit_context(timer_label):
                finder = RFDDiscovery(side_matrix)
                if human_readable:
                    print(hand_sides)
                    print_human(finder.get_rfds(finder.standard_algorithm, hand_sides))
                    continue
                print(finder.get_rfds(finder.standard_algorithm, hand_sides))