def arffwriteToFile(self, X, Y, theFile):
    # TODO: Fix warning
    import arff
    arffFeatObj = {
        'description': 'infodens feats',
        'relation': 'translationese'
    }
    dims = X.get_shape()
    attrib = []  # list of attributes
    for i in range(dims[1]):
        attribTuple = (str(i), "REAL")
        attrib.append(attribTuple)

    arffClasses = list(map(str, set(Y)))
    attrib.append(("y", arffClasses))

    Y = sparse.coo_matrix(Y).transpose()
    data = sparse.hstack([X, Y], "lil")

    arffFeatObj['attributes'] = attrib
    arffFeatObj['data'] = data.tocoo()

    thefile = open(theFile, 'w')
    arff.dump(arffFeatObj, thefile)
    thefile.close()

def produce_features_weka(self):
    root, _ = os.path.splitext(self.args.config)
    arff_f = '.'.join([root, 'arff'])
    # relation name is the config file's basename
    relation = os.path.basename(root)
    names = ['site', 'code']
    names.extend(imap(str, self.indicators))
    arff.dump(arff_f, self._iter_rows(), relation=relation, names=names)

def main(instances):
    # data = [[1,2,'a'], [3, 4, 'john']]
    data = []
    attrs = 7
    for i in range(instances):
        attr = []
        for j in range(attrs):
            a = random.randint(0, 1)
            attr.append(a)
        prob = int(random.random() * 10)
        c = (attr[0] and attr[2]) or int((not attr[2] and not attr[3]) or (attr[6] and attr[4]))
        # if prob > 6:
        #     attr.append(int(not c))
        #     attr[3] = int(not(attr[3]))
        #     attr[6] = int(not(attr[6]))
        #     prob = 0.81
        #     attr[4] = int(not(attr[4]))
        #     attr[2] = int(not(attr[2]))
        #     attr[0] = int(not(attr[0]))
        attr.append(c)
        data.append(attr)
    names = []
    for i in range(attrs):
        names.append("a" + str(i))
    names.append("result")
    arff.dump("result.arff", data, relation="boolean", names=names)

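# Note: the two snippets above use the lightweight `arff` PyPI package, whose
# dump() takes a file name plus an iterable of rows (with relation/names as
# keywords), unlike liac-arff's dump(obj, fp) seen elsewhere in this
# collection. A minimal, hypothetical sketch of that row-based call; the file
# and column names here are illustrative assumptions.
import arff

rows = [
    [1, 0, 1, 'yes'],
    [0, 1, 0, 'no'],
]
arff.dump('demo.arff', rows, relation='demo',
          names=['a0', 'a1', 'a2', 'result'])
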
def map_columns_and_write_to_file(self):
    self.df = self.df[
        self.df['repo_name'].isin(self.top_ten_repos_by_count)
    ]
    self.df = self.df.replace({'repo_name': self.repo_id_to_name_map})
    self.df.drop(columns=['issue_id'], inplace=True, axis=1)
    self.grouped = self.df.groupby('repo_name')
    print("Beginning arff export")
    for name, group in self.grouped:
        current = group.reset_index()
        current.drop(columns=['index', 'repo_name'], inplace=True, axis=1)
        arff.dump(
            f'randomRepos/{name}.arff',
            current.values,
            relation=name,
            names=current.columns
        )
        print(f"{name}.arff completed!")
    # arff.dump(
    #     f'riivo.arff',
    #     self.df.values,
    #     relation='riivo',
    #     names=self.df.columns
    # )
    print("Finished arff export!")

def predict(self, articles):
    # modifies the provided articles dict
    data = {
        'attributes': [('title', 'STRING'), ('body', 'STRING'),
                       ('class', ['yes', 'no'])],
        'data': [],
        'description': u'',
        'relation': '0'
    }
    for urlid in sorted(articles.keys()):
        title = re.sub(r'\W', ' ', articles[urlid]['title'])
        body = re.sub(r'\W', ' ', articles[urlid]['summary'])
        data['data'].append([title, body, 'no'])

    # make the testing file 0.arff
    fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
    arff.dump(fnew, data)
    fnew.close()

    predictions = self.__predict_arff()

    for urlid in sorted(articles.keys()):
        articles[urlid]['categories'] = []
    tids = self.__get_tids()
    for tid in sorted(tids):
        for (i, urlid) in enumerate(sorted(articles.keys())):
            if predictions[tid][i][0]:
                articles[urlid]['categories'].append(str(tid))

def save_to_arff(self, file: Path, name=None) -> None:
    """Save all the events in all traces into an ARFF file for machine learning.

    Args:
        file: the name of the file to save into. Should end with '.arff'.
        name: optional relation name to identify this data inside the ARFF file.
            The default is the base name of 'file'.
    """
    if isinstance(file, str):
        print(f"WARNING: converting {file} to Path. Please learn to speak pathlib.")
        file = Path(file)
    if name is None:
        name = file.stem
    data = self.to_pandas()
    attributes = [(n, self.arff_type(t)) for (n, t) in zip(data.columns, data.dtypes)]
    try:
        import arff
    except ImportError:
        print("Please install ARFF support before using save_to_arff.")
        print("It is a pip only package: pip install liac-arff")
        return
    with file.open("w") as output:
        contents = {
            "relation": safe_name(name),
            "attributes": attributes,
            "data": data.values,  # [[tr] for tr in trace_summaries],
            "description": "Events from " + name
        }
        arff.dump(contents, output)

def save_data_arff(dataset, dataset_filename, arff_data=None):
    """Saves the dataset, arff-formatted, to dataset_filename.
    If arff_data is provided, dataset is used for arff_data['data']."""
    if arff_data is not None:
        arff_data['data'] = dataset.values.tolist()
    else:
        real_attrs = [(name, 'REAL') for name in
                      dataset.select_dtypes(include='floating').columns.values]
        integer_attrs = [(name, 'INTEGER') for name in
                         dataset.select_dtypes(include='integer').columns.values]
        nominal_attrs = [(name, list(dataset[name].unique())) for name in
                         dataset.select_dtypes(include='object').columns.values]
        # re-arrange columns like above
        dataset = dataset[[
            name for (name, type) in real_attrs + integer_attrs + nominal_attrs
        ]]
        dataset_name = os.path.splitext(os.path.basename(dataset_filename))[0]
        arff_data = {
            'relation': dataset_name,
            'attributes': real_attrs + integer_attrs + nominal_attrs,
            'data': dataset.values.tolist()
        }
    arff.dump(arff_data, open(dataset_filename, 'w'))
    print("Dataset saved as {}".format(dataset_filename))

def dump_to_arff(df, relation_name, description, output_file):
    attributes = []
    for col_n in df.columns:
        if df[col_n].dtypes == 'object':
            # sort nominal attributes
            nominal_att_list = df[col_n].unique().tolist()
            list.sort(nominal_att_list)
            # remove missing value mark from the attribute list
            if '?' in nominal_att_list:
                nominal_att_list.remove('?')
            attributes.append((col_n, nominal_att_list))
        else:
            attributes.append((col_n, 'NUMERIC'))

    arff_dic = {
        'attributes': attributes,
        'data': df.values,
        'relation': relation_name,
        'description': description
    }

    with open(output_file, 'w', encoding="utf8") as f:
        arff.dump(arff_dic, f)

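# A hypothetical call sketch for dump_to_arff above; the DataFrame, column
# names and output path are made up for illustration.
import pandas as pd

toy = pd.DataFrame({
    'height': [1.72, 1.65, 1.80],   # numeric  -> NUMERIC attribute
    'label':  ['a', 'b', '?'],      # object   -> nominal, '?' treated as missing
})
dump_to_arff(toy, relation_name='people', description='toy example',
             output_file='people.arff')
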
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    config_space = sklearnbot.config_spaces.get_config_space(
        args.classifier_name, args.random_seed)
    meta_data = openmldefaults.utils.get_dataset_metadata(args.metadata_file)
    if args.scoring not in meta_data['measure']:
        raise ValueError('Could not find measure: %s' % args.scoring)
    metadata_frame = openmldefaults.utils.metadata_file_to_frame(
        args.metadata_file, config_space, args.scoring)
    df_surrogate = openmldefaults.utils.generate_grid_dataset(
        metadata_frame, config_space, args.resized_grid_size, args.scoring,
        args.random_seed)
    # if df_surrogate.shape[1] < num_params + len(study.tasks) / 2:
    #     raise ValueError('surrogate frame has too few columns. Min: %d Got %d'
    #                      % (num_params + len(study.tasks) / 2, df_surrogate.shape[1]))
    os.makedirs(args.output_directory, exist_ok=True)
    df_surrogate.reset_index(inplace=True)
    arff_object = openmlcontrib.meta.dataframe_to_arff(
        df_surrogate, 'surrogate_%s' % args.classifier_name,
        json.dumps(meta_data))
    filename = os.path.join(
        args.output_directory,
        'surrogate__%s__%s__c%d.arff' % (args.classifier_name, args.scoring,
                                         args.resized_grid_size))
    with open(filename, 'w') as fp:
        arff.dump(arff_object, fp)
    logging.info('Saved to: %s' % filename)

def convertToWeka(self, fileToConvert, classNominalValues,
                  loanGradeNominalValues, numericAttributesNames,
                  nominalAttributesNames, wekaFile):
    """Generate Weka ARFF file from API downloader data"""
    converter = Converter()
    apiDataConverted = converter.convertDataFromFile(fileToConvert)
    data = self.prepareWekaData(apiDataConverted, numericAttributesNames,
                                nominalAttributesNames)

    dataset = {}
    dataset['attributes'] = []
    for name in numericAttributesNames:
        attribute = (name, 'REAL')
        dataset['attributes'].append(attribute)

    loanGradeAttribute = ('loanGrade', loanGradeNominalValues)
    dataset['attributes'].append(loanGradeAttribute)

    classAttribute = ('noteStatus', classNominalValues)
    dataset['attributes'].append(classAttribute)

    dataset['data'] = data
    dataset['description'] = u''
    dataset['relation'] = 'downloader data'

    arff.dump(open(wekaFile, 'w'), dataset)

def createSampleArff(self):
    """Sample Weka ARFF file generation"""
    data = {
        'attributes': [
            ('outlook', ['sunny', 'overcast', 'rainy']),
            ('temperature', 'REAL'),
            ('humidity', 'REAL'),
            ('windy', ['TRUE', 'FALSE']),
            ('play', ['yes', 'no'])],
        'data': [
            ['sunny', 85.0, 85.0, None, 'no'],
            ['sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['rainy', 68.0, 80.0, 'FALSE', 'yes'],
            ['rainy', 65.0, 70.0, 'TRUE', 'no'],
            ['overcast', 64.0, 65.0, 'TRUE', 'yes'],
            ['sunny', 72.0, 95.0, 'FALSE', 'no'],
            ['sunny', 69.0, 70.0, 'FALSE', 'yes'],
            ['rainy', 75.0, 80.0, 'FALSE', 'yes'],
            ['sunny', 75.0, 70.0, 'TRUE', 'yes'],
            ['overcast', 72.0, 90.0, 'TRUE', 'yes'],
            ['overcast', 81.0, 75.0, 'FALSE', 'yes'],
            ['rainy', 71.0, 91.0, 'TRUE', 'no']],
        'description': u'',
        'relation': 'weather'
    }
    wekaFile = "../data/test.arff"
    arff.dump(open(wekaFile, 'w'), data)

def replacingMissingValues():
    myArff = arff.load(open('competition-iaa-2018-2019/train.arff', 'r'))
    data = np.array(myArff['data'])

    positions1 = getPositionMissingOnes(0)
    data1 = getMissingDataAttr1()
    j = 0
    for i in positions1:
        data[i][8] = data1[j]
        j += 1

    positions2 = getPositionMissingOnes(1)
    data2 = getMissingDataAttr2()
    j = 0
    for i in positions2:
        data[i][9] = data2[j]
        j += 1

    positions3 = getPositionMissingOnes(2)
    data3 = getMissingDataAttr3()
    j = 0
    for i in positions3:
        data[i][10] = data3[j]
        j += 1

    myArff['data'] = data
    f = open('pruea.arff', 'w')
    arff.dump(myArff, f)

def arff_to_big_endian(cls, filename, dataset, n_labels):
    data = Dataset.load_arff(filename, n_labels, endian="little",
                             input_feature_type='float', encode_nominal=True)
    new_data = np.concatenate((data['Y'], data['X']), axis=1)

    arff_frame = arff.load(open(filename, 'r'),
                           encode_nominal=True, return_type=arff.DENSE)
    arff_frame['data'] = new_data.tolist()

    # make the labels nominal
    for i in range(data['Y'].shape[0]):
        for j in range(data['Y'].shape[1]):
            arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

    arff_frame['attributes'] = arff_frame['attributes'][-n_labels:] + \
        arff_frame['attributes'][:-n_labels]

    # nominal attributes to int format
    attributes = arff_frame['attributes']
    for j in range(data['Y'].shape[1], data['X'].shape[1] + data['Y'].shape[1]):
        if isinstance(attributes[j][1], list):
            for i in range(data['Y'].shape[0]):
                arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

    arff_frame['relation'] = dataset + "_mlcsn: -C " + str(n_labels)

    f = open(filename, "w")
    arff.dump(arff_frame, f)
    f.close()

def spit_datasets(filename=''):
    path1 = "../../data/UCI/" + filename + ".csv"
    df = pd.read_csv(path1, header=None)
    df[df.columns[-1]] = df[df.columns[-1]].apply(lambda x: True if x == 1 else False)
    count = 1
    for i in range(4):
        df = df.sample(frac=1).reset_index(drop=True)
        #dict, labels = df[df.columns[:-1]], df[df.columns[-1]]
        #skf = StratifiedKFold(n_splits=5, shuffle=False)
        arff.dump(data_path + "/CHIRP/Train/" + filename + str(count) + ".arff",
                  df.values, relation='name', names=df.columns)
        # for train_index, test_index in skf.split(dict, labels):
        #     X_train, X_test = dict[dict.index.isin(train_index.tolist())], dict[dict.index.isin(test_index.tolist())]
        #     y_train, y_test = labels[labels.index.isin(train_index.tolist())], labels[labels.index.isin(test_index.tolist())]
        #     X_train["class"] = y_train
        #     X_test["class"] = y_test
        #     df = pd.concat([X_train, X_test], ignore_index=True)
        #     arff.dump(data_path + "/CHIRP/Train/" + filename + str(count) + ".arff",
        #               df.values, relation='name', names=df.columns)
        #     arff.dump(data_path + "/CHIRP/Test/" + filename + str(count) + ".arff",
        #               X_test.values, relation='name', names=X_test.columns)
        #path2 = data_path + "/CHIRP/Train/" + filename + str(count) + ".csv"
        #path3 = data_path + "/CHIRP/Test/" + filename + str(count) + ".csv"
        #X_train.to_csv(path2, index=False)
        #X_test.to_csv(path3, index=False)
        count += 1

def save_features_to_arff(all_features, output_file):
    dataset = {}
    dataset['description'] = 'Android Apps Dataset'
    dataset['relation'] = 'Android Apps Features for IR detection'
    dataset['attributes'] = [
        ('Avg_Wordsize_Flds', 'REAL'),
        ('Avg_Distances_Flds', 'REAL'),
        ('Num_Flds_L1', 'REAL'),
        ('Num_Flds_L2', 'REAL'),
        ('Num_Flds_L3', 'REAL'),
        ('Avg_Wordsize_Mtds', 'REAL'),
        ('Avg_Distances_Mtds', 'REAL'),
        ('Num_Mtds_L1', 'REAL'),
        ('Num_Mtds_L2', 'REAL'),
        ('Num_Mtds_L3', 'REAL'),
        ('Avg_Wordsize_Cls', 'REAL'),
        ('Avg_Distances_Cls', 'REAL'),
        ('Num_Cls_L1', 'REAL'),
        ('Num_Cls_L2', 'REAL'),
        ('Num_Cls_L3', 'REAL'),
        ('class', 'REAL')]
    dataset['data'] = []
    if all_features != []:
        for item in all_features:
            dataset['data'].append(item)
    if dataset['data'] != []:
        arff.dump(dataset, output_file)

def _save_split_set(path, name, full_dataset=None, rows=None, cols=None):
    # X_split = X[indexes, :]
    # y_split = y.reshape(-1, 1)[indexes, :]
    log.debug("Saving %s split dataset to %s.", name, path)
    if rows is None:
        rows = slice(None)
    else:
        assert isinstance(rows, list)
        rows = np.array(rows)
    full_attributes = full_dataset['attributes']
    if cols is None:
        cols = slice(None)
        attributes = full_attributes
    else:
        assert isinstance(cols, list)
        cols = np.array(cols)
        attributes = [full_attributes[i] for i in cols]
    if len(attributes) != len(full_attributes):
        log.debug("Keeping only attributes %s", [a for a, _ in attributes])
    with open(path, 'w') as file:
        description = '\n'.join([
            "Split dataset file generated by automlbenchmark.",
            "",
            full_dataset['description']
        ])
        split_data = np.asarray(full_dataset['data'], dtype=object)[rows[:, None], cols]
        arff.dump(
            {
                'description': description,
                'relation': name,
                'attributes': attributes,
                'data': split_data
            },
            file)

def create_filtered_dataset(file_name, filtered_attacks):
    file = arff.load(open("Datasets/" + file_name + ".arff"))
    original_data = file['data']
    attributes = file['attributes']
    attack_types = get_attack_column(file_name)
    new_data = remove_attacks(
        original_data,
        get_filter_indices(filtered_attacks, "Datasets/" + file_name + ".txt"))
    new_attack_types = remove_attacks(
        attack_types,
        get_filter_indices(filtered_attacks, "Datasets/" + file_name + ".txt"))
    return_arff = {
        'relation': 'KDDFiltered',
        'description': '',
        'data': new_data,
        'attributes': attributes
    }
    arff.dump(return_arff, open("Datasets/" + file_name + "_filtered.arff", "w+"))
    file = open("Datasets/" + file_name + "_filtered_attacks", "w+")
    for attack in new_attack_types:
        file.write(attack + "\n")
    file.close()

def get_size_and_dexsize(path_to_predictions, path_to_arff):
    # read predictions
    f = open(path_to_predictions, 'r')
    content = f.readlines()
    f.close()
    error_index = []
    for line in content:
        if '+' in line:
            error_index.append(int(line.split()[0]))
    # generate error list
    f = open(path_to_arff, 'r')
    file = f.read()
    f.close()
    d = arff.loads(file)
    # collect the misclassified rows in a new ARFF object that reuses the
    # original relation and attributes (obj was referenced but never defined
    # in the original snippet)
    obj = {
        'relation': d['relation'],
        'description': d['description'],
        'attributes': d['attributes'],
        'data': []
    }
    error_list = []
    for index in error_index:
        obj['data'].append(d['data'][index])
        error_list.append({
            'size': d['data'][index][0],
            'dex_size': d['data'][index][1]
        })
    # write error vectors to arff
    f = open('incorrectly_classified.arff', 'w')
    arff.dump(obj, f)
    f.close()
    return error_list

def getArchivesIDListFromARFF():
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    f = codecs.open('article_merged_2.arff', 'r', encoding='utf8')
    span = 0
    ArticleList = []
    for index, l in enumerate(f):
        if l[0] == '@':
            span = span + 1
            continue
        elif index >= start + span and index < end + span:
            l = l.split(',')
            ID = l[0].replace('"', '')
            a = getArticle(ID)
            if a == False:
                continue
            ArticleList.append(a.toList())
        else:
            continue
    arff.dump('article_v2_' + str(start) + '_' + str(end) + '.arff',
              ArticleList,
              relation="article",
              names=['ArchivesID', 'Category', 'Department', 'ReadCount',
                     'Title', 'Content', 'Glossary'])

def write_arff(self, arff_file, relation='multi_target'):
    # get string representation of numeric labels (class index or regression target)
    def encode_labels(labels, label_type):
        if str(label_type).upper() == 'NUMERIC':
            return labels
        elif isinstance(label_type, list):  # nominal
            labels_numeric = np.copy(labels)  # copy
            labels_numeric[labels == None] = -1
            label_type_with_missing = label_type + ['?']
            # note: can't index by object type array
            return np.take(label_type_with_missing, labels_numeric.astype(np.int64))
        else:
            raise ValueError("label_type = '%s' not allowed" % label_type)

    fh = open(arff_file, "w")
    arff_data = {
        'data': np.hstack((
            np.expand_dims(self.instance_names, axis=-1),
            self.features,
            np.hstack([np.expand_dims(encode_labels(self.labels[idx],
                                                    self.target_types[idx]),
                                      axis=-1)
                       for idx in range(len(self.labels))])
        )),
        'attributes': [('instance_name', 'STRING')]
                      + list(zip(self.feature_names, self.feature_types))
                      + list(zip(self.target_names, self.target_types)),
        'relation': relation,
        'description': 'multi-target dataset generated by openXData 0.1'
    }
    arff.dump(arff_data, fh)
    fh.close()

def write_arff(self, filename):
    instrument_arff = copy.deepcopy(arff_format)
    note_arff = copy.deepcopy(arff_format)
    velocity_arff = copy.deepcopy(arff_format)
    duration_arff = copy.deepcopy(arff_format)
    time_delta_arff = copy.deepcopy(arff_format)

    with open(filename + "_instrument.arff", 'w') as instrument_file, \
         open(filename + "_note.arff", 'w') as note_file, \
         open(filename + "_velocity.arff", 'w') as velocity_file, \
         open(filename + "_duration.arff", 'w') as duration_file, \
         open(filename + "_time_delta.arff", 'w') as time_delta_file:

        for song in self.songs:
            array_instrument, array_note, array_velocity, array_duration, array_time_delta = song.get_arff_arrays()
            instrument_arff['data'] += array_instrument
            note_arff['data'] += array_note
            velocity_arff['data'] += array_velocity
            duration_arff['data'] += array_duration
            time_delta_arff['data'] += array_time_delta

        print "writing file " + instrument_file.name
        arff.dump(instrument_arff, instrument_file)
        print "writing file " + note_file.name
        arff.dump(note_arff, note_file)
        print "writing file " + velocity_file.name
        arff.dump(velocity_arff, velocity_file)
        print "writing file " + duration_file.name
        arff.dump(duration_arff, duration_file)
        print "writing file " + time_delta_file.name
        arff.dump(time_delta_arff, time_delta_file)

def kclustering(top=100, pca=0):
    training = pd.read_csv('documents\csv\drunk\drunk labeling 1300' + '.csv')
    test = pd.read_csv('documents\csv\drunk\drunkTEXT400U' + '.csv')
    main_domain = join(training, 'Clean tweet')
    top = topwords(test, 'Clean tweet', top)
    main_domain = join(training, 'Clean tweet')
    main_domain1 = join(test, 'Clean tweet')
    main_domain.joinall(top.top, 1)
    main_domain1.joinall(top.top, 1)
    training = main_domain.df
    test = main_domain1.df
    cols = ['Clean tweet']
    try:
        for x in cols:
            del training[x]
            del test[x]
    except:
        pass
    print training['L']
    training.L = training.L.replace(['y', 'n'], [True, False])
    test.L = test.L.replace(['y', 'n'], [True, False])
    if pca == 1:
        dftraining, dftest = pcaf(training, test)
        training = dftraining.join(training["L"])
        test = dftest.join(test["L"])
    try:
        training = training.replace(['True', 'False'], [True, False])
        test = test.replace(['True', 'False'], [True, False])
    except:
        pass
    headers_names = list(training.columns.values)
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    headers_names.remove('L')
    headers_names.append('L')
    pca = str(pca)
    test = test[headers_names]
    training = training[headers_names]
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print training.dtypes
    main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv', index=False)
    main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv', index=False)
    arff.dump(r'documents\Arff\unsupervised' + r'\training' + pca + '.arff',
              TRAINING, relation="whatever", names=headers_names)
    arff.dump(r'documents\Arff\unsupervised' + r'\test' + pca + '.arff',
              TEST, relation="whatever", names=headers_names)

def dump_data_arff(cls, original_filename, destination_filename, X, Y):
    # dump always in big endian
    new_data = np.concatenate((Y, X), axis=1)

    arff_frame = arff.load(open(original_filename, 'r'),
                           encode_nominal=True, return_type=arff.DENSE)
    arff_frame['data'] = new_data.astype(int).tolist()
    f = open(destination_filename, "w")
    arff.dump(arff_frame, f)
    f.close()

def dump2arffnc(data, fname):
    with open(fname, 'wt') as file_pointer:
        arff.dump(data, file_pointer)
    call(['./fixarfffiles.sh', fname])
    call(['./arff2nc_noorder', fname, '2',
          str(len(targetsToPredict)), fname + '.nc'])

def create_arff_from_dataframes(df_dic, path):
    for key, val in df_dic.items():
        if int(key) != int(cold_start_index):
            arff.dump(os.path.join(path, str(key) + '.arff'),
                      val.values,
                      relation='relation name',
                      names=val.columns)
    lines_to_save = None

def dump(self, path_or_filehandle):
    output = self._get_arff()
    if isinstance(path_or_filehandle, six.string_types):
        with open(path_or_filehandle, "w") as fh:
            arff.dump(output, fh)
    else:
        arff.dump(output, path_or_filehandle)

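# Hypothetical usage of the dump() wrapper above, showing both accepted
# argument types; the object name `meta` and the file name are assumptions.
meta.dump("dataset.arff")              # by path: opened and closed internally

with open("dataset.arff", "w") as fh:  # by handle: caller controls the stream
    meta.dump(fh)
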
def dump(df, fp):
    """
    dump DataFrame to file
    :param DataFrame df:
    :param file fp:
    """
    arff = __dump(df)
    liacarff.dump(arff, fp)

def dump(self, path_or_filehandle):
    output = self._get_arff()
    if isinstance(path_or_filehandle, types.StringTypes):
        with open(path_or_filehandle, "w") as fh:
            arff.dump(output, fh)
    else:
        arff.dump(output, path_or_filehandle)

def write_to_arff_file(pred_Y, predicted_Y_file_name):
    test_X_arff = arff.load(open(test_X_file_path, 'r'))
    arff_data = {
        'data': pred_Y,
        'relation': test_X_arff['relation'],
        'description': '',
        'attributes': [('class', ['True', 'False'])]
    }
    with open('./predicted_test_Y_dt.arff', 'w') as arff_file:
        arff.dump(arff_data, arff_file)

def libsvm2arff(input_file, out_file):
    X, y = load_svmlight_file(input_file)
    l, c = X.shape
    data = np.zeros((l, c + 1))
    data[:, :-1] = X.toarray()
    data[:, c] = y
    arff.dump(out_file, data)

def export_arff(data, attributes, filename, relation="Data", description=None):
    exported_arff = {
        'relation': relation,
        'description': description,
        'data': data,
        'attributes': attributes
    }
    arff.dump(exported_arff, open("Datasets/" + filename + ".arff", "w+"))

def write_feature_file():
    data = read_test_data()
    arff.dump('data.arff', data, relation='stress',
              names=['token_num'] + feature_name + ['is_stress'])
    svm_ofile = open("data.svm", "w")
    for ins in data:
        line = "+1 " if ins[-1] else "-1 "
        dict_line = [str(index + 1) + ":" + str(ins[index])
                     for index in range(0, len(ins) - 1) if ins[index] != 0]
        svm_ofile.write(line + " ".join(dict_line) + "\n")
    svm_ofile.close()

def __prepare_arff(self, tid):
    p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'], 'r')
    bag_title = pickle.load(p)
    p.close()
    p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'], 'r')
    bag_body = pickle.load(p)
    p.close()

    data = {
        'attributes': [],
        'data': [],
        'description': u'',
        'relation': tid
    }
    for word in bag_title:
        data['attributes'].append(("title-%s" % word, 'NUMERIC'))
    for word in bag_body:
        data['attributes'].append(("body-%s" % word, 'NUMERIC'))
    data['attributes'].append(('class', ['yes', 'no']))

    f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
    for record in f['data']:
        record_bag_title = self.txtpro.simpletextprocess(0, record[0])
        record_bag_body = self.txtpro.simpletextprocess(0, record[1])
        record_data = []
        # iterate through original bag, figure out freq in this record's bag
        for word in bag_title:
            if word in record_bag_title:
                record_data.append(record_bag_title[word])
            else:
                record_data.append(0)
        for word in bag_body:
            if word in record_bag_body:
                record_data.append(record_bag_body[word])
            else:
                record_data.append(0)
        record_data.append(record[2])
        data['data'].append(record_data)

    fnew = open("%s%d-wordvec-nonsparse.arff" %
                (paths['weka.training_arff_dir'], tid), 'w')
    arff.dump(fnew, data)
    fnew.close()

    # convert to sparse format
    Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
           "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") %
          (paths['weka.weka_jar'], paths['weka.training_arff_dir'], tid,
           paths['weka.training_arff_dir'], tid),
          shell=True).communicate()

    remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid))

def getArticleList():
    ArticleList = []
    for i in range(int(sys.argv[1]), int(sys.argv[2])):
        print('page ' + str(i))
        ArchivesIDList = getArchivesIDList(i)
        for ID in ArchivesIDList:
            a = getArticle(ID)
            ArticleList.append(a.toList())
        arff.dump('article_' + str(i) + '.arff', ArticleList,
                  relation="article",
                  names=['ArchivesID', 'Category', 'Department', 'ReadCount',
                         'Title', 'Content', 'Glossary'])

def write(file_name, data):
    """
    Writes ARFF data dictionary to file.

    :param file_name: File name
    :param data: Data dictionary
    :return:
    """
    f = open(file_name, 'w')
    arff.dump(data, f)
    f.close()

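# Read counterpart to the write() helper above, assuming the `arff` module in
# scope is liac-arff (whose load() returns the same dict structure that dump()
# consumes); the function name is illustrative.
def read(file_name):
    with open(file_name) as f:
        return arff.load(f)  # dict with 'relation', 'attributes', 'data', 'description'
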
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    column_header = [
        'batch_size', 'epochs', 'h_flip', 'learning_rate_init', 'lr_decay',
        'momentum', 'patience', 'resize_crop', 'shuffle', 'tolerance',
        'v_flip', 'weight_decay'
    ]

    all_results = None
    hyperparameters = None
    for dataset in config_spaces.DATASETS:
        results = pd.read_csv(os.path.join(args.input_dir, dataset,
                                           '%s-features.csv' % dataset),
                              header=None, names=column_header)
        accuracy = np.loadtxt(os.path.join(args.input_dir, dataset,
                                           '%s-responses-acc.csv' % dataset),
                              delimiter=',')
        runtime = np.loadtxt(os.path.join(args.input_dir, dataset,
                                          '%s-responses-time.csv' % dataset),
                             delimiter=',')
        assert results.shape[0] == accuracy.shape[0] == runtime.shape[0]
        results['predictive_accuracy'] = accuracy
        results['runtime'] = runtime
        results['dataset'] = dataset

        config_space = config_spaces.get_config_space(dataset, 0)
        if hyperparameters is None:
            hyperparameters = config_space.get_hyperparameter_names()
        else:
            assert hyperparameters == config_space.get_hyperparameter_names()

        # sanity checks on parameter values
        for hp in config_space.get_hyperparameters():
            if isinstance(hp, ConfigSpace.CategoricalHyperparameter):
                results[hp.name] = results[hp.name].apply(lambda val: hp.choices[val])
            elif isinstance(hp, ConfigSpace.hyperparameters.NumericalHyperparameter):
                for idx, value in enumerate(results[hp.name].values):
                    if not (hp.lower <= value <= hp.upper):
                        raise ValueError('Illegal value for %s at %d: %s'
                                         % (hp.name, idx, value))
            else:
                raise ValueError('Hyperparameter type not supported: %s' % hp.name)

        for idx, value in enumerate(results['predictive_accuracy'].values):
            assert 0.0 <= value < 100.0, \
                'Accuracy iteration %d for dataset %s: %f' % (idx, dataset, value)
        for idx, value in enumerate(results['runtime'].values):
            assert 0.0 < value

        if all_results is None:
            all_results = results
        else:
            all_results = all_results.append(results)

    os.makedirs(args.output_dir, exist_ok=True)
    json_meta = {
        'col_measures': ['predictive_accuracy', 'runtime'],
        'col_parameters': hyperparameters
    }
    arff_dict = openmlcontrib.meta.dataframe_to_arff(all_results, 'fanova-cnn',
                                                     json.dumps(json_meta))
    output_file = os.path.join(args.output_dir, 'fanova-cnn.arff')
    with open(output_file, 'w') as fp:
        arff.dump(arff_dict, fp)
    logging.info('saved to %s' % output_file)

def write_arff(header, data, fp, root_dir):
    print('Writing {0}'.format(fp))
    new_arff = {
        'attributes': header,
        'data': data,
        'relation': fp,
        'description': ''
    }
    with open('{0}/{1}.arff'.format(root_dir, fp), "w") as fh:
        arff.dump(new_arff, fh)

def save_arff(df: pd.DataFrame, file_path: Path):
    attributes = [(f"Attr{i}", 'NUMERIC') for i in range(FEATURE_COUNT)]
    attributes += [(f"Class{i}", ['0', '1']) for i in range(CLASS_COUNT)]
    instance_count = df.shape[0]
    arff_data = [df.iloc[i].tolist() for i in range(instance_count)]
    arff_dict = {
        'attributes': attributes,
        'data': arff_data,
        'relation': DATASET_NAME,
        'description': ''
    }
    with file_path.open(mode='wt') as file:
        arff_output.dump(obj=arff_dict, fp=file)

def test_save(df: pd.DataFrame):
    attributes = [
        (j, 'NUMERIC') if df[j].dtypes in ['int64', 'float64']
        else (j, df[j].unique().astype(str).tolist())
        for j in df
    ]
    arff_dic = {
        'attributes': attributes,
        'data': df.values,
        'relation': 'myRel',
        'description': ''
    }
    with open("myfile.arff", "w", encoding="utf8") as f:
        arff.dump(arff_dic, f)

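# A hypothetical invocation of test_save above with a small mixed-dtype frame;
# the column names and values are made up for illustration.
import pandas as pd

df = pd.DataFrame({
    'age':    [23, 41, 35],          # int64   -> NUMERIC
    'weight': [70.5, 82.0, 64.3],    # float64 -> NUMERIC
    'sex':    ['m', 'f', 'f'],       # object  -> nominal ['m', 'f']
})
test_save(df)                        # writes myfile.arff in the working directory
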
def generate_arff_file(self, file_path, file_name, arff_data):
    """
    Generates arff file
    :param file_name: file_name for arff data
    :param arff_data: dict, arff_data
    :return: string, generated file path
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    arff_file = codecs.open(file_path + file_name, 'w+', encoding='utf-8')
    arff.dump(arff_data, arff_file)
    arff_file.close()

def todo(document1, document2, target, target1, A=1, varydocument=0, joineig=0, undersamplingv=0):
    # varydocument=0 it varies the source and joineig=0 it adds the spectral features
    print 'size: ', 'A= ', A, 'eigenvectors=', joineig, 'with or without eigenvectors 1=without 0=with 2=withoutdi'
    spectral = espectralfeature(document1, document2)
    df, test = spectral.spectralcluster(A, varydocument, joineig, undersamplingv)
    print "PASO 1 COMPLETED"
    headers_names = list(df.columns.values)
    cols = ['Clean tweet', 'tweet', 'url']
    for x in cols:
        try:
            del df[x]
            del test[x]
        except:
            pass
    try:
        df = df.replace(['True', 'False'], [True, False])
    except:
        pass
    try:
        test = test.replace(['True', 'False'], [True, False])
    except:
        pass
    print headers_names
    headers_names = list(df.columns.values)
    headers_names.remove('L')
    headers_names.append('L')
    print headers_names
    print type(headers_names)
    test = test[headers_names]
    df = df[headers_names]
    A = str(A)
    joineig = str(joineig)
    varydocument = str(varydocument)
    undersamplingv = str(undersamplingv)
    df.to_csv(target + '\Training' + A + '.csv', index=False)
    test.to_csv(target + '\Test' + A + '.csv', index=False)
    print "COMPLETED 0", df.dtypes
    TRAINING = df.as_matrix(columns=None)
    print "COMPLETED 0.1"
    arff.dump(target1 + r'\training' + A + varydocument + joineig + undersamplingv + '.arff',
              TRAINING, relation="whatever", names=headers_names)
    TEST = test.as_matrix(columns=None)
    arff.dump(target1 + r'\test' + A + varydocument + joineig + undersamplingv + '.arff',
              TEST, relation="whatever", names=headers_names)
    print "COMPLETED"

def test_files(self):
    fname = os.path.join(SRC_DIR, 'example.arff')
    data = [
        ['blonde', 17.2, 1],
        ['blue', 27.2, 2],
        ['blue', 18.2, 3],
    ]
    arff.dump(fname, data, relation='diabetics_data',
              names=('hair_color', 'age', 'patno'))
    data = list(arff.load(os.path.join(SRC_DIR, fname)))
    arff_rows = arff.dumps(data)
    reparsed_data = list(arff.loads(arff_rows))

    data = [list(row) for row in data]
    reparsed_data = [list(row) for row in reparsed_data]
    self.assertEqual(data, reparsed_data)

def generateArffFile(self, datafeatures):
    print "data features length", len(datafeatures)
    try:
        self.features = self.features + self.const.LABEL_FEATURES_GOOD
        # OUTPUT_FILE_TRAIN
        output_file = self.const.OUTPUT_FILE_TRAIN
        if self.mode.lower() == 'test':
            output_file = self.const.OUTPUT_FILE_TEST
        print "generating arff file ", output_file, "this will take time. please wait. "
        features_underscore = []
        for gram in self.features:
            features_underscore.append(gram.replace(" ", "_"))
        arff.dump(output_file, datafeatures, relation="yelp", names=features_underscore)
        print "arff file generation done."
    except:
        print "Error: Generating Arff file. \n Reason: ", sys.exc_info()

def saveArff(path, filename, dim, X, y):
    data = X.tolist()
    for i, row in enumerate(data):
        row.append(str(y[i]))
    attributes = ['centroid_%d' % (i + 1) for i in range(X.shape[1])]
    attributes.append('class_name')
    outFilePath = os.path.join(path, filename)
    infile = open(outFilePath, 'wb')
    arff.dump(outFilePath, data, relation="whatever", names=attributes)
    print '.arff file saved in %s' % outFilePath

def main():
    verbose = False
    splits = 0
    arff_filename = ''
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'hvs:a:')
    except getopt.GetoptError as err:
        #print str(err)
        usage()
        sys.exit(2)
    #print optlist, args
    for opt, val in optlist:
        if opt == '-h':
            usage()
            sys.exit(0)
        elif opt == '-v':
            verbose = True
        elif opt == '-s':
            splits = atoi(val)
        elif opt == '-a':
            arff_filename = val
    #print splits, arff_filename
    if not arff_filename:
        usage()
        sys.exit(2)

    fd = open(arff_filename, 'r')
    data = arff.load(fd)
    fd.close()
    if verbose:
        print 'Record read from input file:', len(data['data'])

    if splits:
        arff_splits = arff.split(data, splits)
        for s in range(len(arff_splits)):
            filename_base = arff_filename.rsplit('.', 1)[0]
            split_filename = '%s_split%d.arff' % (filename_base, s)
            fdsplit = open(split_filename, 'w')
            arff.dump(fdsplit, arff_splits[s])
            fdsplit.close()

def generate_arquivo(fase):
    jogo_fase = Jogo.objects.all().filter(fase=fase)
    jsn = []
    colunas = ['aluno', 'frustrado', 'qtd_toques', 'tentativas', 'tempo', 'med_toques_segundo']
    colunas.extend(['qtd_toque_tipo_' + x.__str__() for x in range(13)])
    qtd_max_toq = 0
    for j in jogo_fase:
        toques = j.toques_set.all().order_by('t')
        qtd_toques = toques.count()
        if qtd_max_toq <= qtd_toques:
            qtd_max_toq = qtd_toques
        col_toq = []
        qtd_toque_tipo = [0] * 13
        for t in toques:
            col_toq.append(t.x)
            col_toq.append(t.y)
            col_toq.append(t.t)
            col_toq.append(t.acao)
            qtd_toque_tipo[t.acao] = qtd_toque_tipo[t.acao] + 1
        med_toques_segundo = 0
        if float(j.tempo/100.0) <> 0:
            med_toques_segundo = qtd_toques / float(j.tempo/100.0)
        jsn.append([j.aluno, j.frustrado, qtd_toques, j.tentativas, j.tempo,
                    med_toques_segundo] + qtd_toque_tipo + col_toq)

    ind = 0
    for j in jogo_fase:
        toques = j.toques_set.all().order_by('t')
        qtd_toques = toques.count()
        for i in range(qtd_toques, qtd_max_toq):
            if qtd_toques == qtd_max_toq:
                break
            jsn[ind] = jsn[ind] + [0L, 0L, 0L, 0L]
        ind = ind + 1

    for i in range(1, qtd_max_toq + 1):
        colunas.append('toque_' + i.__str__() + '_x')
        colunas.append('toque_' + i.__str__() + '_y')
        colunas.append('toque_' + i.__str__() + '_t')
        colunas.append('toque_' + i.__str__() + '_acao')

    arff.dump('results/result_fase_' + fase.__str__() + '.arff', jsn,
              relation="jogo_fase_" + fase.__str__(), names=colunas)

def _select_feature(raw_dict, labels, user_mat):
    # write to arff file
    obj = {}
    obj['relation'] = 'dictionary'
    obj['attributes'] = _generate_att_list(len(raw_dict))
    concat_user_mat = copy.deepcopy(user_mat)
    for ii in range(len(concat_user_mat)):
        concat_user_mat[ii].append(labels[ii])
    obj['data'] = concat_user_mat
    arff_file = open('.tmp.arff', 'w', encoding='utf-8')
    arff.dump(obj, arff_file)

    # use weka to select feature
    ll = os.popen('java -jar FeatureSelect/out/artifacts/FeatureSelect_jar/FeatureSelect.jar .tmp.arff').read()
    selected_index = ll.split()
    return list(fucking_map(lambda index: raw_dict[int(index)], selected_index))

def predict():
    fields = [
        'stats.totals.pts.value',
        'stats.totals.ast.value',
        'stats.totals.trb.value',
        'stats.per_game.pts_per_g.value',
        'stats.per_game.ast_per_g.value',
        'stats.per_game.trb_per_g.value',
        'stats.advanced.per.value',
        # 'stats.advanced.ws.value',
    ]
    query = {
        #'name': 'Jack McCloskey',
        'new_hof_probability': {'$exists': False},
        'stats.advanced.per': {'$exists': True},
        'stats.advanced.per.complete': True,
        # 'stats.advanced.ws': {'$exists': True},
        # 'stats.advanced.ws.complete': True,
        'stats.totals.pts': {'$exists': True},
        'stats.totals.pts.complete': True,
        'stats.totals.ast': {'$exists': True},
        'stats.totals.ast.complete': True,
        'stats.totals.trb': {'$exists': True},
        'stats.totals.trb.complete': True,
        'stats.per_game.trb_per_g': {'$exists': True},
        'stats.per_game.trb_per_g.complete': True,
        'stats.per_game.pts_per_g': {'$exists': True},
        'stats.per_game.pts_per_g.complete': True,
        'stats.per_game.ast_per_g': {'$exists': True},
        'stats.per_game.ast_per_g.complete': True,
    }
    for p in db_players.find(query):
        logger.info('Player {}'.format(p['name']))
        player = [nested_get(p, f) for f in fields]
        player.append(len(nested_get(p, 'honors.allstar_appearances', [])))
        player.append(len(nested_get(p, 'honors.championships', [])))
        player.append(nested_get(p, 'honors.mvpshares', 0))
        player.append(nested_get(p, 'hall_of_fame'))
        arff.dump('test.arff', [player], relation="nba",
                  names=fields + ['honors.allstar_appearances',
                                  'honors.championships',
                                  'honors.mvpshares',
                                  'hall_of_fame'])
        raw_output = subprocess.check_output('java -cp /Applications/weka-3-6-9/weka.jar weka.classifiers.functions.RBFNetwork -T test.arff -l new.model -p 0'.split())
        prob = parse_probability(raw_output)
        logger.info('Player {name}\'s HOF Probability is {prob}'.format(name=p['name'], prob=prob))
        db_players.update({'_id': p['_id']},
                          {"$set": {"new_hof_probability": prob}},
                          safe=True, upsert=True)

def evaluate_apk(permissions, perm_file, model_file):
    fd = open(perm_file, 'r')
    perm_list = simplejson.load(fd)
    fd.close()
    # permissions = get_permissions(filename)
    bitmap = perm_bitmap(perm_list, permissions) + [True]
    temp = tempfile.mkstemp(suffix='.arff')
    arff.dump(temp[1], [bitmap], names=perm_list + ['Class'])
    output = subprocess.check_output(['java', 'weka.classifiers.bayes.NaiveBayesUpdateable',
                                      '-p', '0', '-T', temp[1], '-l', model_file])
    #os.remove(temp[1])
    virus = output.split()[13] == '1:True'
    assurance = output.split()[14]
    if assurance == '+':
        assurance = output.split()[15]
    return (virus, str(assurance))

def makeArff(filename, handle, opts):
    readdata = False
    dataOut = []
    attributesOut = []
    relation = filename.split(".")[0]
    outfile = open(handle, 'w')
    with open(filename) as data_file:
        dataIn = json.load(data_file)
    for entry in dataIn["data"]:
        values = []
        attributes = []
        for value in entry:
            attributes.append(value)
            if isinstance(entry[value], unicode):
                entry[value] = entry[value].encode('ascii', 'ignore')
            values.append(entry[value])
        dataOut.append(values)
        attributesOut.append(attributes)
    arff.dump(outfile, dataOut, relation=relation, names=attributesOut[0])

def genArff(arff_file, features, options, relation='rotations'):
    #print features
    #print options
    data = []
    keys = []
    featureIndex = -1
    nextRotIndex = 0
    for key in options:
        #print "len features[",key,"]",len(features[key])
        sampleKey = key
    #print sampleKey, len(features[sampleKey])
    for i in range(len(features[sampleKey])):
        data.append([])
    for feature in options:
        #if not "next" in feature and not "prev" in feature and feature!="result":
        if feature != "next":
            keys.append(feature)
            for i, val in enumerate(features[feature]):
                data[i].append(val)
    # for feature in options:
    #     if "next" in feature
    # for feature in options:
    #     if feature == "prev":
    #     elif feature in ["prev"]
    #         for idx, val in enumerate(features[feature]):
    #             data[idx].append(val)
    keys.append('next')
    for i, val in enumerate(data):
        val.append(features['next'][i])
    if os.getlogin() == 'scottmcclanahan2002':
        return arff.dump(open(arff_file, 'w'), data, relation, keys)
    return arff.dump(arff_file, data, relation, keys)

with open(os.path.join(output_dir_, "description.txt"), "w") as fh:
    for line in description:
        fh.write(line)
        fh.write("\n")

# Copy feature values and add instance id
with open(os.path.join(metafeatures_dir, "feature_values.arff")) as fh:
    feature_values = arff.load(fh)
feature_values['relation'] = scenario_id + "_" + feature_values['relation']
with open(os.path.join(output_dir_, "feature_values.arff"), "w") as fh:
    arff.dump(feature_values, fh)

# Copy feature runstatus and add instance id
with open(os.path.join(metafeatures_dir, "feature_runstatus.arff")) as fh:
    feature_runstatus = arff.load(fh)
feature_runstatus['relation'] = scenario_id + "_" + feature_runstatus['relation']
with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") as fh:
    arff.dump(feature_runstatus, fh)

# Copy feature runstatus and add instance id
with open(

                             ('repetition', 'NUMERIC')] + \
    [('%s' % name, 'NUMERIC') for name in metafeature_values.columns]
arff_object['relation'] = "FEATURE_VALUES"
arff_object['description'] = ""

data = []
for idx in metafeature_values.index:
    line = [idx, 1]
    line += [value if np.isfinite(value) else None
             for value in metafeature_values.ix[idx, :].values]
    data.append(line)
arff_object['data'] = data

with open(os.path.join(args.output_directory, "feature_values.arff"), "w") as fh:
    arff.dump(arff_object, fh)

# Feature steps and runtimes according to the aslib1.0 format
feature_steps = defaultdict(list)
metafeature_names = list()
for metafeature_name in metafeatures.metafeatures.functions:
    dependency = metafeatures.metafeatures.get_dependency(metafeature_name)
    if dependency is not None:
        feature_steps[dependency].append(metafeature_name)
    feature_steps[metafeature_name].append(metafeature_name)
    metafeature_names.append(metafeature_name)

# Write the feature runstatus in the aslib1.0 format
arff_object = dict()
arff_object['attributes'] = [('instance_id', 'STRING'),