Пример #1
0
# The data directory must exist before any job can be written.
assert exists(path)
# Choose the pipeline script: 'base_id.groovy' when the properties contain
# an 'idAttribute' entry, 'base.groovy' otherwise.
if 'idAttribute' in p:
    pipeline = 'base_id.groovy'
else:
    pipeline = 'base.groovy'

###### Write the jobs
classifiers_fn = '%s/%s' % (path, p['classifiersFilename'])
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)
# Cross-validation values: either the ARFF header entry named by
# 'foldAttribute', or the indices 0..foldCount-1. One of the two properties
# has to be present.
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))
# Load classifiers from file, skipping lines that start with '#'. The `with`
# block closes the file handle deterministically instead of leaving it to the
# garbage collector.
with open(classifiers_fn) as classifiers_file:
    classifiers = [line.strip() for line in classifiers_file
                   if not line.startswith('#')]
# Jobs are generated relative to the directory that holds this script.
working_dir = dirname(abspath(argv[0]))
# One parameter tuple per (classifier, fold, bag) combination; the remaining
# factors are single-valued and therefore constant across all tuples.
all_parameters = list(product([working_dir], [path], classifiers, fold_values, bag_values, [pipeline], [args.seed]))
make_jobs(all_parameters)

# When the hpc option is set, announce the submission and start writing an
# LSF batch script named '<data>.lsf'.
if args.hpc:
    print 'submitting largeGOPred job to hpc...'
    ####### Write the lsf file
    # NOTE(review): `script` is not closed within the visible part of this
    # block -- confirm it is closed further down.
    script = open(data + '.lsf','w')
    # Script header: shebang followed by #BSUB directives; the placeholders are
    # filled, in order, from args.queue, data, args.time, args.memory, args.node.
    script.write('#!/bin/bash\n#BSUB -P acc_pandeg01a\n#BSUB -q %s\n#BSUB -J %s\n#BSUB -W %s\n#BSUB -R rusage[mem=%s]\n#BSUB -n %s\n#BSUB -sp 100\n' %(args.queue,data,args.time,args.memory,args.node))
Пример #2
0

# The project directory is the first command-line argument and must exist.
project_path = abspath(argv[1])
assert exists(project_path)

# Read the project properties and derive the classifier-list and input paths.
p = load_properties(project_path)
classifiers_fn = '%s/%s' % (project_path, p['classifiersFilename'])
input_fn = '%s/%s' % (project_path, p['inputFilename'])
assert exists(input_fn)

# Cross-validation values for leave-one-value-out ('foldAttribute') or
# k-fold ('foldCount'); at least one of the two properties is required.
assert 'foldAttribute' in p or 'foldCount' in p
if 'foldAttribute' not in p:
    fold_values = range(int(p['foldCount']))
else:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
# A bag count of one or less still yields a single bag with index 0.
if bag_count > 1:
    bag_values = range(bag_count)
else:
    bag_values = [0]

# Fails with KeyError when CLASSPATH is not set in the environment.
classpath = environ['CLASSPATH']

# Cluster execution is enabled only when 'useCluster' is exactly 'true'.
use_cluster = 'useCluster' in p and p['useCluster'] == 'true'
cluster_cmd = 'rc.py --cores 1 --walltime 06:00:00 --queue small --allocation acc_9'

# load classifiers from file, skip commented lines
Пример #3
0
        return DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})
    except:
        return None

# The project directory is the first command-line argument and must exist.
path = abspath(argv[1])
assert exists(path)
# Per-fold predictions are written under <path>/analysis; create it on demand.
analysis_dir = '%s/analysis' % path
if not exists(analysis_dir):
    mkdir(analysis_dir)
p = common.load_properties(path)
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)

# Cross-validation values for leave-one-value-out ('foldAttribute') or
# k-fold ('foldCount'); at least one of the two properties is required.
assert 'foldAttribute' in p or 'foldCount' in p
if 'foldAttribute' not in p:
    fold_values = range(int(p['foldCount']))
else:
    headers = common.load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]

stacker = LogisticRegression()

# Collect one single-row frame (data, fold, fmax) per fold that produced
# predictions.
perf_df = []
dataset_name = path.split('/')[-1]
for fold in fold_values:
    prediction_df = stacked_generalization(fold)
    if prediction_df is None:
        # No predictions were returned for this fold; skip it.
        continue
    prediction_df.to_csv('%s/analysis/%s-predictions.csv' % (path, fold))
    labels = prediction_df.label.tolist()
    predictions = prediction_df.prediction.tolist()
    fmax = common.fmax_score(labels, predictions)
    fold_row = DataFrame(data=[[dataset_name, fold, fmax]],
                         columns=['data', 'fold', 'fmax'],
                         index=[0])
    perf_df.append(fold_row)

# Get Fmax value for each fold, and dump to local disk