# ensure data path exists
assert exists(path)

# decide which pipeline to use
if 'idAttribute' in p:
    pipeline = 'base_id.groovy'
else:
    pipeline = 'base.groovy'

###### Write the jobs
classifiers_fn = '%s/%s' % (path, p['classifiersFilename'])
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)

# get cross-validation values
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))

# load classifiers from file, skip commented lines
classifiers = filter(lambda x: not x.startswith('#'), open(classifiers_fn).readlines())
classifiers = [_.strip() for _ in classifiers]

# one job per (classifier, fold, bag) combination
working_dir = dirname(abspath(argv[0]))
all_parameters = list(product([working_dir], [path], classifiers, fold_values, bag_values, [pipeline], [args.seed]))
make_jobs(all_parameters)

if args.hpc:
    print 'submitting largeGOPred job to hpc...'
    ###### Write the LSF file
    script = open(data + '.lsf', 'w')
    script.write('#!/bin/bash\n'
                 '#BSUB -P acc_pandeg01a\n'
                 '#BSUB -q %s\n'
                 '#BSUB -J %s\n'
                 '#BSUB -W %s\n'
                 '#BSUB -R rusage[mem=%s]\n'
                 '#BSUB -n %s\n'
                 '#BSUB -sp 100\n' % (args.queue, data, args.time, args.memory, args.node))
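
# For context, a minimal sketch of a make_jobs helper consistent with the
# call above. This is an illustrative assumption -- the real implementation
# is not part of this excerpt -- but it consumes the same 7-tuples built by
# product(...): (working_dir, path, classifier, fold, bag, pipeline, seed),
# writing one groovy pipeline invocation per combination to a job list.
from os import environ

def make_jobs_sketch(all_parameters, jobs_fn='jobs.txt'):
    # one line per parameter combination; a scheduler (or the LSF script
    # written above) can then execute these commands in parallel
    with open(jobs_fn, 'w') as jobs:
        for working_dir, path, classifier, fold, bag, pipeline, seed in all_parameters:
            jobs.write('groovy -cp %s %s/%s %s "%s" %s %s %s\n' % (
                environ['CLASSPATH'], working_dir, pipeline,
                path, classifier, fold, bag, seed))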
# ensure project directory exists
project_path = abspath(argv[1])
assert exists(project_path)

# load and parse project properties
p = load_properties(project_path)
classifiers_fn = '%s/%s' % (project_path, p['classifiersFilename'])
input_fn = '%s/%s' % (project_path, p['inputFilename'])
assert exists(input_fn)

# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
bag_values = range(bag_count) if bag_count > 1 else [0]

# ensure java's classpath is set
classpath = environ['CLASSPATH']

# command for cluster execution if enabled
use_cluster = False if 'useCluster' not in p else p['useCluster'] == 'true'
cluster_cmd = 'rc.py --cores 1 --walltime 06:00:00 --queue small --allocation acc_9'

# load classifiers from file, skip commented lines
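
# A minimal sketch of a load_properties helper consistent with how p is
# used above: a dict of string keys to string values parsed from a simple
# "key = value" file. The implementation and the weka.properties filename
# are assumptions for illustration; the real helper lives elsewhere in the
# project (e.g., common.py) and is not shown in this excerpt.
def load_properties_sketch(dirname):
    # parse "key = value" lines, skipping blanks and commented lines
    properties = {}
    for line in open('%s/weka.properties' % dirname):
        line = line.strip()
        if not line or line.startswith('#') or '=' not in line:
            continue
        key, value = line.split('=', 1)
        properties[key.strip()] = value.strip()
    return properties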
        return DataFrame({'fold': fold,
                          'id': test_df.index.get_level_values('id'),
                          'label': test_labels,
                          'prediction': test_predictions,
                          'diversity': common.diversity_score(test_df.values)})
    except:
        return None

path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = common.load_properties(path)
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)

# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = common.load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))

stacker = LogisticRegression()

# get the Fmax value for each fold, and dump predictions to local disk
perf_df = []
for fold in fold_values:
    prediction_df = stacked_generalization(fold)
    if prediction_df is not None:
        prediction_df.to_csv('%s/analysis/%s-predictions.csv' % (path, fold))
        fmax = common.fmax_score(prediction_df.label.tolist(), prediction_df.prediction.tolist())
        perf_df.append(DataFrame(data=[[path.split('/')[-1], fold, fmax]],
                                 columns=['data', 'fold', 'fmax'], index=[0]))
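
# A self-contained sketch of an fmax_score like the one called above: the
# maximum F-measure over all prediction thresholds. This is assumed to
# mirror common.fmax_score (not shown in this excerpt) and is expressed
# here via sklearn's precision_recall_curve.
from numpy import nanmax
from sklearn.metrics import precision_recall_curve

def fmax_score_sketch(labels, predictions):
    # precision/recall at every threshold; nanmax skips thresholds where
    # precision + recall == 0 (0/0 yields nan)
    precision, recall, _ = precision_recall_curve(labels, predictions)
    f1 = 2 * precision * recall / (precision + recall)
    return nanmax(f1)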