def check_feature_status(self):
    '''
        check that features are NA if the corresponding feature step is not OK
    '''
    feature_group_dict = self.metainfo.feature_group_dict
    for inst_ in self.instances.values():
        not_ok_steps = []
        for step, status in inst_._features_status.items():
            if status != "ok":
                not_ok_steps.append(step)

        unused_features = set()
        for u_step in not_ok_steps:
            not_processed_features = feature_group_dict[u_step]
            unused_features = unused_features.union(set(not_processed_features))
        not_ok_index_features = sorted(list(map(str, self.metainfo.features).index(un_feature)
                                            for un_feature in unused_features), reverse=True)
        ok_index_features = set(range(len(self.metainfo.features))).difference(not_ok_index_features)

        warned = False
        for indx in not_ok_index_features:
            if inst_._features[indx] is not None:
                if not warned:
                    Printer.print_w("Not all features of %s are NA although the corresponding feature step is not OK." % (inst_._name))
                    warned = True
                #inst_._features[indx] = None

        ok_values = [inst_._features[indx] for indx in ok_index_features]
        if None in ok_values:
            Printer.print_e("Missing features with status OK: %s." % (inst_._name))
def find_files(self):
    '''
        find all expected files in self.dir_
        fills self.found_files
    '''
    expected = ["description.txt", "algorithm_runs.arff", "feature_values.arff", "feature_runstatus.arff"]
    optional = ["ground_truth.arff", "feature_costs.arff", "citation.bib", "cv.arff"]
    for expected_file in expected:
        full_path = os.path.join(self.dir_, expected_file)
        if not os.path.isfile(full_path):
            Printer.print_e("Not found: %s (has to be added)" % (full_path))
        else:
            self.found_files.append(full_path)
    for expected_file in optional:
        full_path = os.path.join(self.dir_, expected_file)
        if not os.path.isfile(full_path):
            Printer.print_w("Not found: %s (maybe you want to add it)" % (full_path))
        else:
            self.found_files.append(full_path)
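# Illustrative sketch (not part of the checker): a standalone version of the
# required/optional file check above for an ASlib scenario directory. The file
# names mirror the lists in find_files; the helper name and return value are
# hypothetical.
import os

def sketch_find_files(scenario_dir):
    expected = ["description.txt", "algorithm_runs.arff", "feature_values.arff", "feature_runstatus.arff"]
    optional = ["ground_truth.arff", "feature_costs.arff", "citation.bib", "cv.arff"]
    missing_required = [f for f in expected if not os.path.isfile(os.path.join(scenario_dir, f))]
    missing_optional = [f for f in optional if not os.path.isfile(os.path.join(scenario_dir, f))]
    return missing_required, missing_optional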
def read_ground_truth(self, file_):
    '''
        read ground truths of all instances and save them in self.instances

        @RELATION GROUND_TRUTH_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE SATUNSAT {SAT,UNSAT}
        @ATTRIBUTE OPTIMAL_VALUE NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))

    # extract ground truth names
    for attr in arff_dict["attributes"][1:]:
        self.metainfo.ground_truths[attr[0]] = attr[1]

    insts = []
    for data in arff_dict["data"]:
        inst_name = data[0]
        truth = data[1:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has ground truths but was not found in performance file" % (inst_name))
            continue
        truth_dict = {}
        for truth_name, truth_value in zip(arff_dict["attributes"][1:], truth):
            if type(truth_name[1]) is list:  # nominal attribute -> store index of the value
                truth_dict[truth_name[0]] = self.metainfo.ground_truths[truth_name[0]].index(truth_value) if truth_value else -1
            else:
                truth_dict[truth_name[0]] = truth_value
        inst_._ground_truth = truth_dict
        if inst_name in insts:
            Printer.print_e("Instance \"%s\" is not unique in %s" % (inst_name, file_))
        else:
            insts.append(inst_name)
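# Illustrative sketch (hypothetical helper, not part of the checker): how a
# nominal ground-truth value is mapped to its index in the declared value list,
# with -1 for an empty value, mirroring the loop above.
def sketch_encode_truth(value, declared_values):
    # declared_values: list of admissible nominal values, e.g. ["SAT", "UNSAT"]
    return declared_values.index(value) if value else -1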
def read_feature_runstatus(self, file_):
    '''
        reads the run status of all pairs instance x feature step and saves them in self.instances

        Expected header:
        @RELATION FEATURE_RUNSTATUS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE preprocessing {ok, timeout, memout, presolved, crash, other}
        @ATTRIBUTE local_search_probing {ok, timeout, memout, presolved, crash, other}
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("repetition as second attribute is missing in %s" % (file_))
    for f_name in arff_dict["attributes"][2:]:
        f_name = f_name[0]
        if not f_name in self.metainfo.feature_group_dict.keys():
            Printer.print_e("Feature step \"%s\" was not defined in feature steps" % (f_name))
    if len(self.metainfo.feature_group_dict.keys()) != len(arff_dict["attributes"][2:]):
        Printer.print_e("Number of feature steps in description.txt (%d) and feature_runstatus.arff (%d) does not match." % (
            len(self.metainfo.feature_group_dict.keys()), len(arff_dict["attributes"][2:])))

    pairs_inst_rep = []
    for data in arff_dict["data"]:
        inst_name = data[0]
        repetition = data[1]
        stati = data[2:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has feature step status but was not found in performance file" % (inst_name))
            continue
        if (inst_name, repetition) in pairs_inst_rep:
            Printer.print_w("Pair (%s,%s) is not unique in %s" % (inst_name, repetition, file_))
        else:
            pairs_inst_rep.append((inst_name, repetition))
        #===================================================================
        # # if runstatus of feature vector is not always ok, remove feature vector
        # if reduce(lambda x,y: False if ((not x) and y == "ok") else True, stati, False):
        #     inst_._features = None
        #===================================================================
        for status, f_step in zip(stati, arff_dict["attributes"][2:]):
            inst_._features_status[f_step[0]] = status
def read_cv(self, file_):
    '''
        read cross validation <file_>

        @RELATION CV_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE fold NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    self.metainfo.cv_given = True
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("repetition as second attribute is missing in %s" % (file_))
    if arff_dict["attributes"][2][0] != "fold":
        Printer.print_e("fold as third attribute is missing in %s" % (file_))

    rep_fold_dict = {}
    for data in arff_dict["data"]:
        inst_name = data[0]
        rep = int(data[1])
        fold = int(data[2])
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has cv information but was not found in performance file" % (inst_name))
            continue
        inst_._fold[rep] = fold
        fold_distribution = rep_fold_dict.get(rep, {})
        rep_fold_dict[rep] = fold_distribution
        fold_distribution[fold] = fold_distribution.get(fold, 0)
        fold_distribution[fold] += 1

    for rep, fold_dist in rep_fold_dict.items():
        Printer.print_c("%d-th repetition: %s distribution" % (rep, ",".join(map(str, list(fold_dist.values())))))
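# Illustrative sketch (not part of the checker): the per-repetition fold
# counting above expressed with collections.Counter, e.g. as a quick sanity
# check that folds are roughly balanced. The helper name and input format are
# assumptions.
from collections import Counter, defaultdict

def sketch_fold_distribution(rows):
    # rows: iterable of (instance_id, repetition, fold) tuples
    per_rep = defaultdict(Counter)
    for _, rep, fold in rows:
        per_rep[int(rep)][int(fold)] += 1
    return {rep: dict(counts) for rep, counts in per_rep.items()}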
def read_feature_runstatus(self, file_):
    '''
        reads the run status of all pairs instance x feature step and saves them in self.instances

        Expected header:
        @RELATION FEATURE_RUNSTATUS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE preprocessing {ok, timeout, memout, presolved, crash, other}
        @ATTRIBUTE local_search_probing {ok, timeout, memout, presolved, crash, other}
    '''
    Printer.print_c("Read %s" % (file_))
    fp = open(file_, "rb")
    arff_dict = arff.load(fp)
    fp.close()

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("repetition as second attribute is missing in %s" % (file_))
    for f_name in arff_dict["attributes"][2:]:
        f_name = f_name[0]
        if not f_name in self.metainfo.feature_group_dict.keys():
            Printer.print_e("Feature step \"%s\" was not defined in feature steps" % (f_name))
    if len(self.metainfo.feature_group_dict.keys()) != len(arff_dict["attributes"][2:]):
        Printer.print_e("Number of feature steps in description.txt (%d) and feature_runstatus.arff (%d) does not match." % (
            len(self.metainfo.feature_group_dict.keys()), len(arff_dict["attributes"][2:])))

    pairs_inst_rep = []
    for data in arff_dict["data"]:
        inst_name = data[0]
        repetition = data[1]
        stati = data[2:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has feature step status but was not found in performance file" % (inst_name))
            continue
        if (inst_name, repetition) in pairs_inst_rep:
            Printer.print_w("Pair (%s,%s) is not unique in %s" % (inst_name, repetition, file_))
        else:
            pairs_inst_rep.append((inst_name, repetition))
        #===================================================================
        # # if runstatus of feature vector is not always ok, remove feature vector
        # if reduce(lambda x,y: False if ((not x) and y == "ok") else True, stati, False):
        #     inst_._features = None
        #===================================================================
        for status, f_step in zip(stati, arff_dict["attributes"][2:]):
            inst_._features_status[f_step[0]] = status
def read_feature_costs(self, file_):
    '''
        reads the feature cost file and saves the costs in self.instances

        Expected header:
        @RELATION FEATURE_COSTS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE preprocessing NUMERIC
        @ATTRIBUTE local_search_probing NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("\"instance_id\" as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("\"repetition\" as second attribute is missing in %s" % (file_))
    found_groups = map(str, sorted(map(lambda x: x[0], arff_dict["attributes"][2:])))
    for meta_group in self.metainfo.feature_group_dict.keys():
        if meta_group not in found_groups:
            Printer.print_e("\"%s\" as attribute is missing in %s" % (meta_group, file_))

    pairs_inst_rep = []
    for data in arff_dict["data"]:
        inst_name = str(data[0])
        repetition = data[1]
        feature_cost = data[2:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has feature cost but was not found in algorithm_runs.arff" % (inst_name))
            continue
        for cost, f_group in zip(feature_cost, arff_dict["attributes"][2:]):
            inst_._feature_group_cost_dict[str(f_group[0])] = cost
        if (inst_name, repetition) in pairs_inst_rep:
            Printer.print_w("Pair (%s,%s) is not unique in %s" % (inst_name, repetition, file_))
        else:
            pairs_inst_rep.append((inst_name, repetition))
def read_feature_costs(self, file_):
    '''
        reads the feature cost file and saves the costs in self.instances

        Expected header:
        @RELATION FEATURE_COSTS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE preprocessing NUMERIC
        @ATTRIBUTE local_search_probing NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    fp = open(file_, "rb")
    arff_dict = arff.load(fp)
    fp.close()

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("\"instance_id\" as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("\"repetition\" as second attribute is missing in %s" % (file_))
    found_groups = map(str, sorted(map(lambda x: x[0], arff_dict["attributes"][2:])))
    for meta_group in self.metainfo.feature_group_dict.keys():
        if meta_group not in found_groups:
            Printer.print_e("\"%s\" as attribute is missing in %s" % (meta_group, file_))

    pairs_inst_rep = []
    for data in arff_dict["data"]:
        inst_name = str(data[0])
        repetition = data[1]
        feature_cost = data[2:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has feature cost but was not found in algorithm_runs.arff" % (inst_name))
            continue
        for cost, f_group in zip(feature_cost, arff_dict["attributes"][2:]):
            inst_._feature_group_cost_dict[str(f_group[0])] = cost
        if (inst_name, repetition) in pairs_inst_rep:
            Printer.print_w("Pair (%s,%s) is not unique in %s" % (inst_name, repetition, file_))
        else:
            pairs_inst_rep.append((inst_name, repetition))
def read_ground_truth(self, file_):
    '''
        read ground truths of all instances and save them in self.instances

        @RELATION GROUND_TRUTH_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE SATUNSAT {SAT,UNSAT}
        @ATTRIBUTE OPTIMAL_VALUE NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    fp = open(file_, "rb")
    arff_dict = arff.load(fp)
    fp.close()

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))

    # extract ground truth names
    for attr in arff_dict["attributes"][1:]:
        self.metainfo.ground_truths[attr[0]] = attr[1]

    insts = []
    for data in arff_dict["data"]:
        inst_name = data[0]
        truth = data[1:]
        inst_ = self.instances.get(inst_name)
        if not inst_:
            Printer.print_w("Instance \"%s\" has ground truths but was not found in performance file" % (inst_name))
            continue
        truth_dict = dict((truth_name[0],
                           self.metainfo.ground_truths[truth_name[0]].index(truth_value) if truth_value else -1)
                          for truth_name, truth_value in zip(arff_dict["attributes"][1:], truth))
        inst_._ground_truth = truth_dict
        if inst_name in insts:
            Printer.print_e("Instance \"%s\" is not unique in %s" % (inst_name, file_))
        else:
            insts.append(inst_name)
def check_feature_status(self):
    '''
        check that features are NA if the corresponding feature step is not OK
    '''
    feature_group_dict = self.metainfo.feature_group_dict
    for inst_ in self.instances.values():
        not_ok_steps = []
        for step, status in inst_._features_status.items():
            if status.upper() != "OK":
                not_ok_steps.append(step)

        not_ok_features = []
        for u_step in not_ok_steps:
            not_ok_features.extend(feature_group_dict[u_step]["provides"])
        not_ok_index_features = map(lambda x: self.metainfo.features.index(x), not_ok_features)
        #===================================================================
        # unused_features = set()
        # for u_step in not_ok_steps:
        #     not_processed_features = feature_group_dict[u_step]
        #     unused_features = unused_features.union(set(not_processed_features))
        #===================================================================
        #not_ok_index_features = sorted(list(map(str,self.metainfo.features).index(un_feature) for un_feature in unused_features), reverse=True)
        ok_index_features = set(range(len(self.metainfo.features))).difference(not_ok_index_features)

        warned = False
        for indx in not_ok_index_features:
            if inst_._features[indx] is not None:
                if not warned:
                    Printer.print_w("Not all features of %s are NA although the corresponding feature step is not OK." % (inst_._name))
                    warned = True
                #inst_._features[indx] = None

        ok_values = [inst_._features[indx] for indx in ok_index_features]
        if None in ok_values:
            Printer.print_e("Missing features with status OK: %s." % (inst_._name))
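# Illustrative sketch (hypothetical helper, not part of the checker): the rule
# enforced above, stated over plain dicts -- features provided by a non-OK step
# must be missing (None), and features provided only by OK steps must be present.
def sketch_check_feature_status(feature_values, feature_status, step_provides):
    # feature_values: feature name -> value or None
    # feature_status: step name -> runstatus string
    # step_provides: step name -> list of feature names the step computes
    bad_steps = [s for s, st in feature_status.items() if st.upper() != "OK"]
    should_be_na = {f for s in bad_steps for f in step_provides[s]}
    not_na = [f for f in should_be_na if feature_values.get(f) is not None]
    missing_ok = [f for f in feature_values
                  if f not in should_be_na and feature_values[f] is None]
    return not_na, missing_ok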
def read_description(self, file_):
    '''
        reads description file and saves all meta information
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "r") as fp:
        for line in fp:
            line = line.replace("\n", "").strip(" ")
            if line.startswith("scenario_id"):
                self.metainfo.scenario = line.split(":")[1].strip(" ")
            elif line.startswith("performance_measures"):
                self.metainfo.performance_measure = map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(","))
            elif line.startswith("maximize"):
                try:
                    self.metainfo.maximize = line.split(":")[1].strip(" ").split(",")
                except ValueError:
                    Printer.print_w("Cannot read MAXIMIZE")
            elif line.startswith("performance_type"):
                self.metainfo.performance_type = map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(","))
            elif line.startswith("algorithm_cutoff_time"):
                try:
                    self.metainfo.algorithm_cutoff_time = float(line.split(":")[1])
                except ValueError:
                    Printer.print_w("Cannot read ALGORITHM_CUTOFF_TIME")
            elif line.startswith("algorithm_cutoff_memory"):
                try:
                    self.metainfo.algorithm_cutoff_memory = float(line.split(":")[1])
                except ValueError:
                    Printer.print_w("Cannot read ALGORITHM_CUTOFF_MEMORY")
            elif line.startswith("features_cutoff_time"):
                try:
                    self.metainfo.features_cutoff_time = float(line.split(":")[1])
                except ValueError:
                    Printer.print_w("Cannot read FEATURES_CUTOFF_TIME")
            elif line.startswith("features_cutoff_memory"):
                try:
                    self.metainfo.features_cutoff_memory = float(line.split(":")[1])
                except ValueError:
                    Printer.print_w("Cannot read FEATURES_CUTOFF_MEMORY")
            elif line.startswith("features_deterministic"):
                try:
                    self.metainfo.features_deterministic = map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(","))
                except ValueError:
                    Printer.print_w("Cannot read FEATURES_DETERMINISTIC")
            elif line.startswith("features_stochastic"):
                try:
                    self.metainfo.features_stochastic = map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(","))
                except ValueError:
                    Printer.print_w("Cannot read FEATURES_STOCHASTIC")
            elif line.startswith("algorithms_deterministic"):
                try:
                    self.metainfo.algortihms_deterministics = filter(lambda x: True if x else False,
                                                                     map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(",")))
                except ValueError:
                    Printer.print_w("Cannot read ALGORITHMS_DETERMINISTIC")
            elif line.startswith("algorithms_stochastic"):
                try:
                    self.metainfo.algorithms_stochastic = filter(lambda x: True if x else False,
                                                                 map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(",")))
                except ValueError:
                    Printer.print_w("Cannot read ALGORITHMS_STOCHASTIC")
            elif line.startswith("feature_step"):
                try:
                    group_name = line.split(":")[0][12:].strip(" ")
                    features = map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(","))
                    self.metainfo.feature_group_dict[group_name] = features
                except ValueError:
                    Printer.print_w("Cannot read FEATURE_STEP")
            elif line.startswith("default_step"):
                try:
                    self.metainfo.feature_steps = filter(lambda x: True if x else False,
                                                         map(lambda x: x.strip(" "), line.split(":")[1].strip(" ").split(",")))
                except ValueError:
                    Printer.print_w("Cannot read DEFAULT_STEPS")

    self.metainfo.algorithms = list(set(self.metainfo.algorithms_stochastic).union(self.metainfo.algortihms_deterministics))

    if not self.metainfo.scenario:
        Printer.print_w("Have not found SCENARIO_ID")
    if not self.metainfo.performance_measure:
        Printer.print_w("Have not found PERFORMANCE_MEASURE")
    if not self.metainfo.performance_type:
        Printer.print_w("Have not found PERFORMANCE_TYPE")
    if not self.metainfo.maximize:
        Printer.print_w("Have not found MAXIMIZE")
    if not self.metainfo.algorithm_cutoff_time:
        Printer.print_e("Have not found algorithm_cutoff_time")
    if not self.metainfo.algorithm_cutoff_memory:
        Printer.print_w("Have not found algorithm_cutoff_memory")
    if not self.metainfo.features_cutoff_time:
        Printer.print_w("Have not found features_cutoff_time")
        Printer.print_w("Assumption FEATURES_CUTOFF_TIME == ALGORITHM_CUTOFF_TIME")
        self.metainfo.features_cutoff_time = self.metainfo.algorithm_cutoff_time
    if not self.metainfo.features_cutoff_memory:
        Printer.print_w("Have not found features_cutoff_memory")
    if not self.metainfo.features_deterministic:
        Printer.print_w("Have not found features_deterministic")
    if not self.metainfo.features_stochastic:
        Printer.print_w("Have not found features_stochastic")
    if not self.metainfo.algortihms_deterministics:
        Printer.print_w("Have not found algorithms_deterministic")
    if not self.metainfo.algorithms_stochastic:
        Printer.print_w("Have not found algorithms_stochastic")
    if not self.metainfo.feature_group_dict:
        Printer.print_e("Have not found any feature step")
    if not self.metainfo.feature_steps:
        Printer.print_e("Have not found default feature step")

    feature_intersec = set(self.metainfo.features_deterministic).intersection(self.metainfo.features_stochastic)
    if feature_intersec:
        Printer.print_w("Intersection of deterministic and stochastic features is not empty: %s" % (str(feature_intersec)))
    algo_intersec = set(self.metainfo.algortihms_deterministics).intersection(self.metainfo.algorithms_stochastic)
    if algo_intersec:
        Printer.print_w("Intersection of deterministic and stochastic algorithms is not empty: %s" % (str(algo_intersec)))
def remove_features(self):
    '''
        inst_dict: instance name -> Instance()
        meta_info: parsed coseal meta information and command line arguments (meta_info.options)
    '''
    feature_steps = self.metainfo.options.feature_steps
    feature_group_dict = self.metainfo.feature_group_dict
    features = None  # self.metainfo.options.features

    if features:  # the user specified a subset of features (disables given feature_steps)
        empty_check = set(features).difference(self.metainfo.features)
        if empty_check:
            Printer.print_e("Features (--features [list]) are not defined in data: %s" % (",".join(empty_check)), -2)
        unused_features = set(self.metainfo.features).difference(features)
        # find the corresponding feature steps
        feature_steps = set()
        for f in features:
            for f_group, f_list in feature_group_dict.iteritems():
                if f in f_list["provides"]:
                    feature_steps.add(f_group)
        changed = True
        while changed:
            changed = False
            for step in feature_steps:
                missing_steps = set(feature_group_dict[step].get("requires", set())).difference(feature_steps)
                if missing_steps:
                    changed = True
                    feature_steps = feature_steps.union(missing_steps)
                    Printer.print_w("Adding missing feature step because of a pre-condition: %s" % (",".join(missing_steps)))
        unused_steps = set(feature_group_dict.keys()).difference(set(feature_steps))
        Printer.print_c("Used feature steps (%d): %s" % (len(feature_steps), ",".join(feature_steps)))
    else:
        if not feature_steps:
            feature_steps = list(self.metainfo.feature_steps)  # if no steps are specified, use default
        empty_check = set(feature_steps).difference(set(feature_group_dict.keys()))
        if empty_check:
            Printer.print_e("Feature steps (--feature-steps [list]) are not defined in data: %s" % (",".join(empty_check)), -2)
        # check preconditions of feature steps
        available_steps = set()
        used_features = set()
        for step in feature_steps:  # TODO: order of feature steps could be an issue
            req_steps = set(feature_group_dict[step].get("requires", set()))
            miss_steps = req_steps.difference(feature_steps)
            if miss_steps:
                Printer.print_w("Feature step %s does not meet its pre-conditions (%s). Adding feature step to set (but not to feature set!)." % (step, ",".join(miss_steps)))
            available_steps.add(step)
            for f in feature_group_dict[step]["provides"]:
                used_features.add(f)
            for ms in miss_steps:
                available_steps.add(ms)
                for f in feature_group_dict[ms]["provides"]:
                    used_features.add(f)
        feature_steps = available_steps
        Printer.print_c("Used Feature Steps (%d): %s" % (len(feature_steps), ",".join(feature_steps)))
        unused_features = set(self.metainfo.features).difference(set(used_features))
        unused_steps = set(feature_group_dict.keys()).difference(set(available_steps))

    Printer.print_nearly_verbose("Remove features: %s\n" % (",".join(unused_features)))
    used_features = set(self.metainfo.features).difference(unused_features)
    Printer.print_c("Used features (%d): %s\n" % (len(used_features), ",".join(used_features)))
    if not used_features:
        Printer.print_w("Empty feature set - fall back to default feature set.")
        return False

    unused_index_features = sorted(list(map(str, self.metainfo.features).index(un_feature)
                                        for un_feature in unused_features), reverse=True)
    # remove unused features
    for inst_ in self.instances.values():
        for un_feature_indx in unused_index_features:
            inst_._features.pop(un_feature_indx)

    # compute feature costs
    for inst_ in self.instances.values():
        total_cost = 0
        previous_presolved = False
        for f_step in feature_steps:
            if inst_._feature_group_cost_dict.get(f_step) and not previous_presolved:  # feature costs are maybe None
                total_cost += inst_._feature_group_cost_dict[f_step]
            if inst_._features_status[f_step] == "PRESOLVED":
                previous_presolved = True
        for un_step in unused_steps:  # remove step status if unused
            del inst_._features_status[un_step]
        inst_._feature_cost_total = total_cost
        inst_._pre_solved = "PRESOLVED" in map(lambda x: x.upper(), inst_._features_status.values())

    for un_feature_indx in unused_index_features:
        self.metainfo.features.pop(un_feature_indx)

    #=======================================================================
    # if self.metainfo.options.impute == "none":
    #     for inst_ in self.instances.values():
    #         if reduce(lambda x,y: False if ((not x) and y.upper() == "OK") else True, inst_._features_status.values(), False):
    #             inst_._features = None
    #=======================================================================
    return True
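# Illustrative sketch (hypothetical helper, not part of the checker): closing a
# set of selected feature steps under their "requires" pre-conditions, as the
# while-loop in remove_features does for user-selected features.
def sketch_close_requirements(selected_steps, step_requires):
    # selected_steps: iterable of step names
    # step_requires: step name -> list of step names it depends on
    closed = set(selected_steps)
    changed = True
    while changed:
        changed = False
        for step in list(closed):
            missing = set(step_requires.get(step, [])) - closed
            if missing:
                closed |= missing
                changed = True
    return closed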
def read_description(self, file_):
    '''
        reads description file and saves all meta information
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "r") as fh:
        description = yaml.load(fh)

    self.metainfo.scenario = description.get('scenario_id')
    self.metainfo.performance_measure = description.get('performance_measures')
    if not isinstance(description.get('performance_measures'), list):
        Printer.print_e("'performance_measures' has to be a list")
    self.metainfo.maximize = description.get('maximize')
    if not isinstance(description.get('maximize'), list):
        Printer.print_e("'maximize' has to be a list")
    self.metainfo.performance_type = description.get('performance_type')
    if not isinstance(description.get('performance_type'), list):
        Printer.print_e("'performance_type' has to be a list")
    self.metainfo.algorithm_cutoff_time = description.get('algorithm_cutoff_time')
    self.metainfo.algorithm_cutoff_memory = description.get('algorithm_cutoff_memory')
    self.metainfo.features_cutoff_time = description.get('features_cutoff_time')
    self.metainfo.features_cutoff_memory = description.get('features_cutoff_memory')
    self.metainfo.features_deterministic = description.get('features_deterministic')
    if self.metainfo.features_deterministic is None:
        self.metainfo.features_deterministic = set()
    self.metainfo.features_stochastic = description.get('features_stochastic')
    if self.metainfo.features_stochastic is None:
        self.metainfo.features_stochastic = set()
    self.metainfo.algortihms_deterministics = description.get('algorithms_deterministic')
    if self.metainfo.algortihms_deterministics is None:
        self.metainfo.algortihms_deterministics = set()
    self.metainfo.algorithms_stochastic = description.get('algorithms_stochastic')
    if self.metainfo.algorithms_stochastic is None:
        self.metainfo.algorithms_stochastic = set()
    self.metainfo.feature_group_dict = description.get('feature_steps')
    self.metainfo.feature_steps = description.get('default_steps')

    self.metainfo.algorithms = list(set(self.metainfo.algorithms_stochastic).union(self.metainfo.algortihms_deterministics))

    if not self.metainfo.scenario:
        Printer.print_w("Have not found SCENARIO_ID")
    if not self.metainfo.performance_measure:
        Printer.print_w("Have not found PERFORMANCE_MEASURE")
    if not self.metainfo.performance_type:
        Printer.print_w("Have not found PERFORMANCE_TYPE")
    if not self.metainfo.maximize:
        Printer.print_w("Have not found MAXIMIZE")
    if not self.metainfo.algorithm_cutoff_time:
        Printer.print_e("Have not found algorithm_cutoff_time")
    if not self.metainfo.algorithm_cutoff_memory:
        Printer.print_w("Have not found algorithm_cutoff_memory")
    if not self.metainfo.features_cutoff_time:
        Printer.print_w("Have not found features_cutoff_time")
        Printer.print_w("Assumption FEATURES_CUTOFF_TIME == ALGORITHM_CUTOFF_TIME")
        self.metainfo.features_cutoff_time = self.metainfo.algorithm_cutoff_time
    if not self.metainfo.features_cutoff_memory:
        Printer.print_w("Have not found features_cutoff_memory")
    if not self.metainfo.features_deterministic:
        Printer.print_w("Have not found features_deterministic")
    if not self.metainfo.features_stochastic:
        Printer.print_w("Have not found features_stochastic")
    if not self.metainfo.algortihms_deterministics:
        Printer.print_w("Have not found algorithms_deterministic")
    if not self.metainfo.algorithms_stochastic:
        Printer.print_w("Have not found algorithms_stochastic")
    if not self.metainfo.feature_group_dict:
        Printer.print_e("Have not found any feature step")
    if not self.metainfo.feature_steps:
        Printer.print_e("Have not found default feature step")

    for step, d in self.metainfo.feature_group_dict.items():
        if d.get("requires") and not isinstance(d["requires"], list):
            Printer.print_e("'requires' of a feature step (%s) has to be a list." % (step))
    if self.metainfo.feature_steps:
        f_groups = set(self.metainfo.feature_group_dict.keys())
        if set(self.metainfo.feature_steps).difference(f_groups):
            Printer.print_e("Default feature steps are not listed (%s)" % (set(self.metainfo.feature_steps).difference(f_groups)))

    feature_intersec = set(self.metainfo.features_deterministic).intersection(self.metainfo.features_stochastic)
    if feature_intersec:
        Printer.print_w("Intersection of deterministic and stochastic features is not empty: %s" % (str(feature_intersec)))
    algo_intersec = set(self.metainfo.algortihms_deterministics).intersection(self.metainfo.algorithms_stochastic)
    if algo_intersec:
        Printer.print_w("Intersection of deterministic and stochastic algorithms is not empty: %s" % (str(algo_intersec)))
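# Illustrative sketch (not part of the checker): a minimal description file in
# the YAML form read above. The keys follow what read_description queries; the
# concrete scenario name, algorithms, features, and cutoff values are made up.
import yaml

EXAMPLE_DESCRIPTION = """
scenario_id: TOY-SCENARIO
performance_measures: [runtime]
maximize: [false]
performance_type: [runtime]
algorithm_cutoff_time: 5000
algorithm_cutoff_memory: 6144
features_cutoff_time: 5000
features_cutoff_memory: 6144
features_deterministic: [number_of_variables, number_of_clauses]
features_stochastic: []
algorithms_deterministic: [solverA, solverB]
algorithms_stochastic: []
feature_steps:
  preprocessing:
    provides: [number_of_variables, number_of_clauses]
default_steps: [preprocessing]
"""

description = yaml.safe_load(EXAMPLE_DESCRIPTION)
assert isinstance(description["performance_measures"], list)
assert "preprocessing" in description["feature_steps"]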
def read_feature_values(self, file_):
    '''
        reads the feature file and saves the feature vectors in self.instances

        Expected header:
        @RELATION FEATURE_VALUES_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE number_of_variables NUMERIC
        @ATTRIBUTE number_of_clauses NUMERIC
        @ATTRIBUTE first_local_min_steps NUMERIC
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("repetition as second attribute is missing in %s" % (file_))

    feature_set = set(self.metainfo.features_deterministic).union(self.metainfo.features_stochastic)
    for f_name in arff_dict["attributes"][2:]:
        f_name = f_name[0]
        self.metainfo.features.append(f_name)
        if not f_name in feature_set:
            Printer.print_e("Feature \"%s\" was not defined as deterministic or stochastic" % (f_name))

    pairs_inst_rep = []
    encountered_features = []
    for data in arff_dict["data"]:
        inst_name = data[0]
        repetition = data[1]
        features = data[2:]
        if len(features) != len(self.metainfo.features):
            Printer.print_e("Number of features in attributes does not match number of found features; instance: %s" % (inst_name))
        if not self.instances.get(inst_name):
            Printer.print_w("Instance \"%s\" has features but was not found in performance file" % (inst_name))
            continue
        inst_ = self.instances[inst_name]
        inst_._features = features  # TODO: handle feature repetitions
        # not only Nones in feature vector and previously seen
        if reduce(lambda x, y: True if (x or y) else False, features, False) and features in encountered_features:
            Printer.print_w("Feature vector found twice: %s" % (",".join(map(str, features))))
        else:
            encountered_features.append(features)
        if (inst_name, repetition) in pairs_inst_rep:
            Printer.print_w("Pair (%s,%s) is not unique in %s" % (inst_name, repetition, file_))
        else:
            pairs_inst_rep.append((inst_name, repetition))
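# Illustrative sketch (not part of the checker): minimal use of the liac-arff
# package ("import arff") as in the readers above. The path, open mode, and
# helper name are assumptions; arff.load returns a dict with "attributes"
# (list of (name, type) pairs) and "data" (list of rows in attribute order).
import arff  # liac-arff

def sketch_load_arff(path):
    with open(path, "r") as fp:
        arff_dict = arff.load(fp)
    return arff_dict["attributes"], arff_dict["data"]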
def read_algorithm_runs(self, file_):
    '''
        reads the performance file and saves its information;
        adds Instance() objects to self.instances.
        Unsuccessful runs are replaced by algorithm_cutoff_time if performance_type is runtime.

        EXPECTED HEADER:
        @RELATION ALGORITHM_RUNS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE algorithm STRING
        @ATTRIBUTE PAR10 NUMERIC
        @ATTRIBUTE Number_of_satisfied_clauses NUMERIC
        @ATTRIBUTE runstatus {ok, timeout, memout, not_applicable, crash, other}
    '''
    Printer.print_c("Read %s" % (file_))
    with open(file_, "rb") as fp:
        try:
            arff_dict = arff.load(fp)
        except arff.BadNominalValue:
            Printer.print_e("Parsing of arff file failed (%s) - maybe conflict of header and data." % (file_))

    if arff_dict["attributes"][0][0] != "instance_id":
        Printer.print_e("instance_id as first attribute is missing in %s" % (file_))
    if arff_dict["attributes"][1][0] != "repetition":
        Printer.print_e("repetition as second attribute is missing in %s" % (file_))
    if arff_dict["attributes"][2][0] != "algorithm":
        Printer.print_e("algorithm as third attribute is missing in %s" % (file_))
    listed_metrics = map(lambda x: x[0], arff_dict["attributes"][3:3 + len(self.metainfo.performance_measure)])
    diff_set = set(self.metainfo.performance_measure).difference(listed_metrics)
    if diff_set:
        Printer.print_e("\"%s\" as attribute is missing in %s" % (diff_set, file_))
    if arff_dict["attributes"][-1][0] != "runstatus":
        Printer.print_e("runstatus as last attribute is missing in %s" % (file_))

    pairs_inst_rep_alg = []
    for data in arff_dict["data"]:
        inst_name = str(data[0])
        repetition = data[1]
        algorithm = str(data[2])
        perf_list = data[3:-1]
        status = data[-1]
        inst_ = self.instances.get(inst_name, Instance(inst_name))
        for p_measure, p_type, perf in zip(self.metainfo.performance_measure, self.metainfo.performance_type, perf_list):
            if perf is None:
                Printer.print_e("The following performance data has missing values. Please impute all missing values.\n"
                                + "%s" % (",".join(map(str, data))))
            if p_type == "runtime" and (perf is None or status != "ok"):  # if broken run, replace with cutoff time
                perf = self.metainfo.algorithm_cutoff_time
            inst_._cost[p_measure] = inst_._cost.get(p_measure, {})
            perf_measure_dict = inst_._cost[p_measure]
            perf_measure_dict[algorithm] = perf_measure_dict.get(algorithm, [])
            perf_measure_dict[algorithm].append(max(float(perf), 0.00001))
        inst_._status[algorithm] = status
        self.instances[inst_name] = inst_
        if (inst_name, repetition, algorithm) in pairs_inst_rep_alg:
            Printer.print_w("Pair (%s,%s,%s) is not unique in %s" % (inst_name, repetition, algorithm, file_))
        else:
            pairs_inst_rep_alg.append((inst_name, repetition, algorithm))
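# Illustrative sketch (hypothetical helper, not part of the checker): the rule
# applied above when performance_type is "runtime" -- a run that did not finish
# with status "ok" is charged the algorithm cutoff time, and the value is
# clamped away from zero.
def sketch_penalize_runtime(perf, status, cutoff_time):
    if status != "ok" or perf is None:
        perf = cutoff_time
    return max(float(perf), 0.00001)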
def run_extractor(self, args_dic, instance):
    '''
        run extractor and look for instance features

        Parameter:
            args_dic : dictionary with options
            instance : instance to solve
    '''
    # run claspre extractor
    ret = Claspre2.run_extractor(self, args_dic, instance)

    Printer.print_c("\nExtended Stats Feature Extraction:")
    self._set_args(args_dic, instance)
    cmd = self._cmd
    cmd[0] = cmd[0].replace("claspre", "exst")
    del cmd[1]
    del cmd[1]
    cmd.append("--outf=2")
    cmd.append("--stats=2")
    cmd.append("--time-limit=120")
    Printer.print_c(" ".join(cmd))

    signal.signal(signal.SIGINT, self.__clean_up_with_signal)
    signal.signal(signal.SIGHUP, self.__clean_up_with_signal)
    signal.signal(signal.SIGQUIT, self.__clean_up_with_signal)
    signal.signal(signal.SIGSEGV, self.__clean_up_with_signal)
    signal.signal(signal.SIGTERM, self.__clean_up_with_signal)
    signal.signal(signal.SIGXCPU, self.__clean_up_with_signal)
    signal.signal(signal.SIGXFSZ, self.__clean_up_with_signal)

    try:
        # start exst
        self._popen_ = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._instance.seek(0)
        if isinstance(self._instance, file) and self._instance.name.endswith(".gz"):
            zcat_popen = Popen(["zcat", self._instance.name], stdout=PIPE)
            input_ = zcat_popen.stdout.read()
        else:
            input_ = self._instance.read()
        # get exst output
        (out_, err_) = self._popen_.communicate(input=input_)
    except OSError:
        Printer.print_w("Feature extractor was unable to compute features (path correct?): %s" % (cmd))

    try:
        # extract features from output
        feature_dict = json.loads(out_)
    except:
        try:
            Printer.print_w("Could not parse features. %s!" % (self._instance.name))
        except AttributeError:
            Printer.print_w("Could not parse features from stdin!")
        return ret

    # get Extended Stats element from json
    preprocessing_feats = feature_dict["Stats"]["Extended Stats"]
    flat_feats = []
    flat_feats.extend([y for (x, y) in preprocessing_feats])

    # combine claspre and extended features
    ret += flat_feats
    return ret