def create_item(indexes, variable_name, min_val, max_val, description,
                number_operations):
    """ Creates a class of type Item from the values of a NumericAttribute.

    Parameters
    ----------
    indexes : np.ndarray
        Array of indexes where the item is present in the training data.
    variable_name : str
        Name of the attribute/variable that this item is attached to.
    min_val : float
        Minimum value covered by this item. item > min_val.
    max_val : float
        Maximum value covered by this item. item < max_val.
    description : str
        Text describing the interval defined by the item, e.g. "item < max_val" or "min_val < item < max_val".
    number_operations : int
        Number of logical operators used to define the interval: 1 for a one-sided interval such as
        item < max_val, 2 for a two-sided interval such as min_val < item < max_val.
    Returns
    ----------
    Item : Item class object
        Item with the characteristics described by the arguments.
    """
    bit_array = indexes2bitset(indexes)
    activation_function = partial(activation_numeric,
                                  attribute_name=variable_name,
                                  minval=min_val,
                                  maxval=max_val)
    return Item(bit_array, variable_name, description, number_operations,
                activation_function)
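
# A minimal usage sketch (hypothetical values): build an Item covering
# 18 < age < 65. Assumes numpy is imported as np and that indexes2bitset,
# activation_numeric and Item are available in this module, as used above.
example_rows = np.array([0, 2, 5])  # row indexes where 18 < age < 65 holds
age_item = create_item(example_rows, "age", 18.0, 65.0,
                       "18 < age < 65", number_operations=2)
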
def run_FSSD_wrapper(dataset, attributes, class_attribute, types, depthmax):
    offset = 0
    nb_attributes = len(attributes)
    timebudget = 3600
    top_k = 1000
    wanted_label = dataset[0]["class"]
    attributes = attributes[offset:offset + nb_attributes]
    types = types[offset:offset + nb_attributes]
    timespent = time()
    pattern_setoutput, pattern_union_info, top_k_returned, header_returned = \
        find_top_k_subgroups_general_precall(dataset, attributes, types, class_attribute,
                                             wanted_label, top_k, 'fssd', False, timebudget, depthmax)
    timespent = time() - timespent

    # print (top_k_returned[-1])
    range_attributes = []
    for ia, a in enumerate(attributes):
        colvals = [row[a] for row in dataset]
        if types[ia] == "numeric":
            maxval = max(colvals)
            minval = min(colvals)
            range_attributes.append([minval, maxval])
        elif types[ia] == "nominal":
            range_attributes.append(list(set(colvals)))
        elif types[ia] == "simple":
            range_attributes.append(list(set(colvals)))

    c_values = list(set([row["class"] for row in dataset]))
    count_cl = [0 for c in c_values]
    for row in dataset:
        for ic, c in enumerate(c_values):
            if row["class"] == c:
                count_cl[ic] += 1

    subgroup_sets = []
    items = []
    rules_supp = []
    nitems = []
    for pat in pattern_setoutput:
        # items
        nitemsaux = 0
        for ia, a in enumerate(attributes):
            # print(pat[0][ia])
            print("pattern: " + str(set(pat[0][ia])) + "  range: " +
                  str(set(range_attributes[ia])))
            if not set(pat[0][ia]) >= set(range_attributes[ia]):
                nitemsaux += 1
        nitems.append(nitemsaux)
        subgroup_index = pat[1]["support_full"]
        aux_supp = [0 for c in c_values]
        for idx in subgroup_index:
            for ic, c in enumerate(c_values):
                if dataset[idx]["class"] == c:
                    aux_supp[ic] += 1
        rules_supp.append(aux_supp)
        subgroup_sets.append(indexes2bitset(subgroup_index))

    return nitems, subgroup_sets, timespent
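
# A minimal usage sketch (hypothetical toy data): run FSSD on a list-of-dicts
# dataset where each row carries a "class" label. Assumes
# find_top_k_subgroups_general_precall and its FSSD dependencies are importable.
# toy_dataset = [
#     {"age": 23, "height": 170, "class": "yes"},
#     {"age": 41, "height": 165, "class": "no"},
#     {"age": 35, "height": 180, "class": "yes"},
# ]
# nitems, subgroup_sets, timespent = run_FSSD_wrapper(
#     toy_dataset, ["age", "height"], "class", ["numeric", "numeric"], depthmax=3)
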
Example no. 3
    def create_items(self) -> Tuple[List[Item], Dict[int, int]]:
        """ Creates a list of items from the nominal atrribute.

        Makes a list of items using equality relationship with the categories. Example: x= blue_eyes could be the
        description of one of the items, for the NominalAttribute.name = "eye_colour".

        Returns
        ----------
        List[Item] : List of Items
            A list of all items based on the possible categories (only with equality relationships, not logical ORs).
        """
        self.cardinality_operator = {1: len(self.categories)}
        number_operators = 1
        for category in self.categories:
            vector_category = np.where(self.values == category)[0]
            bit_array = indexes2bitset(vector_category)
            description = self.name + " = " + str(category)
            activation_function = partial(activation_nominal, attribute_name=self.name, category=category)
            self.items.append(Item(bit_array, self.name, description, number_operators, activation_function))
        return self.items, self.cardinality_operator
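
# Usage sketch (hypothetical attribute): for a NominalAttribute named
# "eye_colour" with values ["blue", "brown", "blue"], create_items returns one
# Item per category, with descriptions such as "eye_colour = blue", plus the
# operator cardinality {1: 2} (two categories, one equality operator each).
# items, cardinality_operator = eye_colour_attribute.create_items()
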
Example no. 4
def findbitsets(patterns4prediction, X, Y):
    indices_subgroups = [[] for pattern in patterns4prediction]
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(Y, pd.DataFrame):
        Y = Y.values

    # find the indices of the instances covered by each pattern
    for ix, x in enumerate(X):
        for nr in range(len(patterns4prediction)):
            decision = decision_pattern(patterns4prediction[nr], x)
            if decision:
                indices_subgroups[nr].append(ix)
    # drop patterns that cover no instances
    indices_subgroups = [indices for indices in indices_subgroups if indices]

    # convert the index lists to bitsets
    bitsets_subgroups = [
        indexes2bitset(indices) for indices in indices_subgroups
    ]
    return bitsets_subgroups
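
# Usage sketch (hypothetical inputs): X_test / Y_test may be pandas DataFrames
# or plain arrays; decision_pattern (used above) is assumed to return True when
# a pattern covers instance x. Patterns that cover nothing are dropped.
# bitsets_subgroups = findbitsets(patterns4prediction, X_test, Y_test)
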
Example no. 5
    def init_bitarrays_class(
            self, target_values
    ) -> Tuple[Dict[Any, Dict], Dict[Any, Dict], Dict[Any, Dict]]:
        """ Initializes the bit array values for each category.

        Returns
        ----------
        Tuple[Dict, Dict, Dict] :
            The bitarray values (gmpy2.mpz), the counts, and the class
            probabilities, each keyed by column name and then by category.
        """
        for namecol, colvals in target_values.items():  # .iteritems() was removed in pandas 2.0
            self.bit_arrays_var_class[namecol] = dict()
            self.counts[namecol] = dict()
            self.prob_var_class[namecol] = dict()
            for icat, category in enumerate(self.categories[namecol]):
                category_indexes = np.where(colvals.values == category)[0]
                self.bit_arrays_var_class[namecol][category] = indexes2bitset(
                    category_indexes)
                self.counts[namecol][category] = len(category_indexes)
                self.prob_var_class[namecol][category] = self.counts[namecol][
                    category] / target_values.shape[0]
        return self.bit_arrays_var_class, self.counts, self.prob_var_class
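
# Usage sketch (hypothetical single-target data): for a target DataFrame with
# one column "class" over categories {"yes", "no"}, this fills, per column and
# category, the bitarray of covering rows, the count, and the empirical
# probability count / n_rows.
# bitarrays, counts, probs = target.init_bitarrays_class(
#     pd.DataFrame({"class": ["yes", "no", "yes"]}))
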
Example no. 6
def run_DSSD_wrapper(algorithmname, beam_width, number_rules_SSD, datasetname,
                     df, task, depthmax, attribute_names, number_targets):
    if algorithmname == "seq-cover":
        conf_file = read_csvfile(
            './otheralgorithms/DSSD/bin/tmp_sequential.conf')
    elif algorithmname == "DSSD":
        conf_file = read_csvfile(
            './otheralgorithms/DSSD/bin/tmp_dssd_diverse.conf')
        conf_file[12] = [
            'postSelect = ' +
            str(int(number_rules_SSD.loc[datasetname, "number_rules"]))
        ]
    elif algorithmname == "top-k":
        conf_file = read_csvfile('./otheralgorithms/DSSD/bin/tmp_topk.conf')
        conf_file[12] = [
            'postSelect = ' +
            str(int(number_rules_SSD.loc[datasetname, "number_rules"]))
        ]
        nrows = df.shape[0]
        if nrows < 2000 and task == "single-nominal":
            conf_file[14] = ['searchType = ' + "dfs"]
        else:
            conf_file[14] = ['searchType = ' + "beam"]
    else:
        raise Exception("Wrong aglorithm name")

    conf_file[19] = ['beamWidth = ' + str(int(beam_width))]
    conf_file[15] = ['maxDepth = ' + str(min(int(depthmax), 10))]

    if task == "multi-nominal" or task == "single-nominal":
        conf_file[23] = ['measure = WKL']
        # conf_file[24] = ['WRAccMode = 1vsAll']
    elif task == "multi-numeric" or task == "single-numeric":
        conf_file[23] = ['measure = meantest']
        conf_file[24] = ['WRAccMode = 1vsAll']
    else:
        raise Exception("Wrong task name")

    write_file_dssd(conf_file, './otheralgorithms/DSSD/bin/tmp.conf')

    # check if path exists
    if os.path.exists('./otheralgorithms/DSSD/xps/dssd'):
        shutil.rmtree('./otheralgorithms/DSSD/xps/dssd')
    os.makedirs('./otheralgorithms/DSSD/xps/dssd')

    # change target variable file - target variables are at the end!
    name_targets = attribute_names[-number_targets:]
    targets_file = pd.read_csv(
        './otheralgorithms/DSSD/data/datasets/tmp/emmModel.emm',
        delimiter="=",
        header=None)
    targets_file.iloc[1, 1] = ' ' + ','.join(name_targets)
    targets_file.to_csv('./otheralgorithms/DSSD/data/datasets/tmp/tmp.emm',
                        index=False,
                        sep="=",
                        header=False)

    # run DSSD
    timespent = time()
    os.chdir("./otheralgorithms/DSSD/bin")
    call(["emc64-mt-modified.exe"])
    # call(["dssd64.exe"])
    os.chdir("../../../")
    timespent = time() - timespent
    os.remove("./otheralgorithms/DSSD/data/datasets/tmp/tmp.arff")

    # read output files
    auxfiles = sorted(os.listdir('./otheralgorithms/DSSD/xps/dssd/'))
    generated_xp = './otheralgorithms/DSSD/xps/dssd/' + auxfiles[
        -1]  # sorted, so the newest run is picked deterministically
    timestamp = generated_xp.split('-')[1]
    # find transaction ids of subgroups
    generated_xp_subsets_path = generated_xp + '/subsets'
    all_generated_subgroups_files = [
        generated_xp_subsets_path + '/' + x
        for x in os.listdir(generated_xp_subsets_path)
    ]
    # find descriptions of subgroups
    if algorithmname == "top-k":
        description_files = generated_xp + '/' + "stats1-" + timestamp + ".csv"
    elif algorithmname == "seq-cover":
        description_files = generated_xp + '/' + "stats2-" + timestamp + ".csv"
    elif algorithmname == "DSSD":
        description_files = generated_xp + '/' + "stats3-" + timestamp + ".csv"

    # count number of items per subgroup
    descriptions = read_csvfile(description_files)
    #columnames, typevar, limits = info4prediction(df.iloc[:, :-number_targets], number_targets)
    #patterns4prediction = make_patterns4prediction(descriptions, columnames, typevar, limits)
    # Test dataset
    # nrows_test = Y_test.shape[0]
    # bitsets_subgroups = findbitsets(patterns4prediction,X_test,Y_test)

    nitems = []
    for row in descriptions[1:]:
        # count items
        nitems.append(1 + row[0].count("&&"))

    subgroup_sets_support = []
    subgroup_sets_support_bitset = []
    support_union = set()
    nb_subgroups = 0
    rules_supp = []
    for subgroup_file in all_generated_subgroups_files:
        aux_subgroup = read_csvfile(subgroup_file)[2:]
        subgroup_biset = [row[0] for row in aux_subgroup]
        subgroup_index = set(i for i, x in enumerate(subgroup_biset)
                             if x == '1')
        subgroup_sets_support.append(subgroup_index)
        subgroup_sets_support_bitset.append(indexes2bitset(subgroup_index))
        support = len(subgroup_index)
        rules_supp.append(support)
        nb_subgroups += 1

    return nitems, subgroup_sets_support_bitset, timespent
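
# Usage sketch (hypothetical arguments): the DSSD binary, its .conf templates
# and the tmp dataset files must already exist under ./otheralgorithms/DSSD/.
# number_rules_df is assumed to be a DataFrame indexed by dataset name with a
# "number_rules" column, as used above.
# nitems, bitsets, timespent = run_DSSD_wrapper(
#     "DSSD", beam_width=100, number_rules_SSD=number_rules_df,
#     datasetname="iris", df=df, task="single-nominal", depthmax=5,
#     attribute_names=list(df.columns), number_targets=1)
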
Example no. 7
def run_CN2SD_wrapper(dataset, attributes, types, class_attribute, beam_width,
                      depthmax, quality):
    wanted_label = dataset[0]["class"]
    # dataset,header=readCSVwithHeader(file,numberHeader=[a for a,t in zip(attributes,types) if t=='numeric'],delimiter=delimiter)
    new_dataset = deepcopy(dataset)
    new_dataset, positive_extent, negative_extent, alpha_ratio_class, _ = transform_dataset(
        dataset, attributes, class_attribute, wanted_label)
    new_dataset.insert(
        0, {
            a: 'c' if t == 'numeric' else 'd'
            for a, t in list(zip(attributes, types)) + [('class', 'class')]
        })
    new_dataset.insert(
        1,
        {a: '' if a != 'class' else 'class'
         for a in attributes + ['class']})
    writeCSVwithHeader(new_dataset,
                       './otheralgorithms/tmpForOrange.csv',
                       selectedHeader=attributes + ['class'],
                       delimiter='\t',
                       flagWriteHeader=True)
    data = Orange.data.Table('./otheralgorithms/tmpForOrange.csv')
    # print(data)
    timespent = time()
    # unordered variant; Orange.classification.rules.CN2SDLearner would give an ordered rule list
    learner = Orange.classification.rules.CN2SDUnorderedLearner()

    if quality == 'entropy':
        learner.rule_finder.quality_evaluator = Orange.classification.rules.EntropyEvaluator()
    elif quality == 'wracc':
        learner.rule_finder.quality_evaluator = Orange.classification.rules.WeightedRelativeAccuracyEvaluator()
    # learner = Orange.classification.rules.CN2SDLearner()
    learner.gamma = 0.
    # learner.evaluator = "Evaluator_Entropy"
    learner.rule_finder.search_algorithm.beam_width = beam_width

    # continuous value space is constrained to reduce computation time

    learner.rule_finder.search_strategy.constrain_continuous = True

    # found rules must cover at least 15 examples
    learner.rule_finder.general_validator.min_covered_examples = 15

    # learner.rule_finder.general_validator.min_covered_examples = max(int(float(len(positive_extent))/10),1.)

    # found rules may combine at most depthmax selectors (conditions)
    learner.rule_finder.general_validator.max_rule_length = depthmax

    classifier = learner(data)
    timespent = time() - timespent

    del classifier.rule_list[-1]  # drop the default rule appended at the end of the rule list
    top_quality = []
    # import inspect
    # inspect.getmembers(learner, lambda a:not(inspect.isroutine(a)))
    # inspect.getmembers(row, lambda a:not(inspect.isroutine(a)))
    subgroup_sets = []
    rules_supp = []
    nitems = []
    for row in classifier.rule_list:
        s = str(row)
        nitems.append(1 + s.count("AND"))
        subgroup_biset = row.covered_examples
        subgroup_index = set(i for i, x in enumerate(subgroup_biset) if x)
        subgroup_sets.append(indexes2bitset(subgroup_index))
        rules_supp.append(row.curr_class_dist.tolist())

    return nitems, subgroup_sets, timespent
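
# Usage sketch (hypothetical toy data): requires Orange3 plus the helper
# functions transform_dataset and writeCSVwithHeader used above. dataset is a
# list of dicts with a "class" key, as in run_FSSD_wrapper.
# nitems, subgroup_sets, timespent = run_CN2SD_wrapper(
#     dataset, ["age", "height"], ["numeric", "numeric"], "class",
#     beam_width=20, depthmax=3, quality="wracc")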