def writeArffForInclusiveSubset(filename, data, attributes, subset):
    """ Write an .arff file containing only the attributes whose indexes
        appear in <subset>, along with the matching columns of <data>.

        filename:   output .arff path
        data:       list of instances; each must have one value per attribute
        attributes: full list of attribute dicts (each with a 'name' key)
        subset:     collection of attribute indexes to keep (2 <= len <= num attrs)
    """
    num_attrs = len(attributes)
    assert(len(subset) <= num_attrs)
    assert(len(subset) >= 2)
    for d in data:
        assert(len(d) == num_attrs)
    # Membership test runs once per attribute index (and once per cell per
    # row below), so use a set for O(1) lookups instead of scanning a list.
    subset_set = set(subset)
    attrs_subset = [attributes[i] for i in range(num_attrs) if i in subset_set]
    data_subset = [[d[i] for i in range(num_attrs) if i in subset_set] for d in data]
    arff.writeArff(filename, None, 'find_best_attr', attrs_subset, data_subset)
# Exemplo n.º 2
# 0
def makeTrainingTestSplit(base_data, split_vector, prefix):
    """ Split <base_data> into training and test data sets. Rows whose entry
        in <split_vector> is truthy go into the TEST file and the remaining
        rows go into the TRAINING file.
        Writes training and test .arff files and returns their names.
        File names are prefixed with <prefix> """
    assert len(base_data) == len(split_vector)

    training_file_name = prefix + training_file_base
    test_file_name = prefix + test_file_base

    training_data = []
    test_data = []
    for i, x in enumerate(base_data):
        # NOTE(review): a truthy split_vector entry selects the TEST set;
        # the original docstring claimed the opposite. The code is kept
        # as-is and the docstring corrected to match it.
        if split_vector[i]:
            test_data.append(x)
        else:
            training_data.append(x)

    arff.writeArff(training_file_name, base_comments, base_relation, base_attrs, training_data)
    arff.writeArff(test_file_name, base_comments, base_relation, base_attrs, test_data)
    return (training_file_name, test_file_name)
		# Wrong argument count: show usage and bail out. (The enclosing
		# argv-length check is above this visible fragment.)
		print 'Usage: jython get_attribute_subset.py  <base-arff-file> <attrs-arff-file>'
		sys.exit()

	# Command-line arguments: the base .arff file and a second .arff file
	# whose attribute list defines the subset to keep.
	base_filename = sys.argv[1]
	attrs_filename = sys.argv[2]
	# Output name: insert '.attr_subset' before the base file's extension.
	out_filename = os.path.splitext(base_filename)[0] + '.attr_subset' + os.path.splitext(base_filename)[1] 
	
	print base_filename
	print attrs_filename
	print out_filename

	# NOTE(review): the next line's indentation mixes spaces and a tab while
	# the rest of the file uses tabs — Python may reject or misparse this;
	# confirm and normalize to tabs.
   	relation, comments, attributes, data = arff.readArff(base_filename)
	_, _, attributes_subset, _ = arff.readArff(attrs_filename)

	# Map attribute name -> column index in the base file.
	attribute_index_map = {}
	for i,a in enumerate(attributes):
		attribute_index_map[a['name']] = i

	names_subset = [a['name'] for a in attributes_subset]

	# Collect the base-file column indexes of the subset attributes.
	# NOTE(review): iterating attribute_index_map.keys() does not preserve
	# the base file's attribute order (dict key order is arbitrary here),
	# so the output columns may come out reordered — verify this is intended.
	indexes_subset = []
	for name in attribute_index_map.keys():
		if name in names_subset:
			indexes_subset.append(attribute_index_map[name])

	# Project both the attribute list and every data row onto the subset.
	out_attributes = [attributes[i] for i in indexes_subset]
	out_data = [[d[i] for i in indexes_subset] for d in data]

	arff.writeArff(out_filename, comments, relation, out_attributes, out_data) 

	# Partition the class attribute's values: those listed in
	# classes_to_combine (defined above this visible fragment) vs the rest.
	attrs_to_combine = []
	remaining_attrs = []
	for val in attributes[class_index]['vals']:
		if val in classes_to_combine:
			attrs_to_combine.append(val)
		else:
			remaining_attrs.append(val)

	# "combine" view: the selected class values are collapsed into the
	# single value group_name; all other values are kept.
	combine_attributes = copy.deepcopy(attributes)
	combine_attributes[class_index]['vals'] = [group_name] + remaining_attrs

	# "separate" view: only the selected class values, kept distinct.
	separate_attributes = copy.deepcopy(attributes)
	separate_attributes[class_index]['vals'] = attrs_to_combine

	# Split the instances to match the two attribute views. Rows whose class
	# is being combined go unmodified into separate_data, and a deep copy
	# with the class relabeled to group_name goes into data_to_combine.
	data_to_combine = []
	separate_data = []
	remaining_data = []
	for d in data:
		if d[class_index] in classes_to_combine:
			separate_data.append(d)
			d2 = copy.deepcopy(d)
			d2[class_index] = group_name
			data_to_combine.append(d2)
		else:
			remaining_data.append(d)
	# Combined file holds the relabeled rows plus all untouched rows.
	combine_data = data_to_combine + remaining_data

	arff.writeArff(combine_filename, comments, relation, combine_attributes, combine_data)
	arff.writeArff(separate_filename, comments, relation, separate_attributes, separate_data)