# Plot time series and best shapelet onto '<class>_shapelet.eps'
# 1) Open and read in lc from the source file
testing_dir = LC_DIR + '/' + TEST_DIR
# sample 3 for each file type
class_tests = {}
for fname in os.listdir(testing_dir):
    fclass = fname.split('_')[0]
    if fclass not in class_tests.keys():
        class_tests[fclass] = [fname]
    else:
        class_tests[fclass].append(fname)
for classname in class_tests.keys():
    for fname in random.sample(class_tests[classname], 3):
        test_lc = file_to_lc(LC_DIR + '/' + TEST_DIR + '/' + fname)
        test_time = test_lc.time
        test_flux = test_lc.flux
        test_class = fname.split('/')[-1].split('_')[0]
        new_time, new_flux = ([], [])
        for i in xrange(len(test_flux)):
            if test_flux[i] != '-':
                new_time.append(test_time[i])
                new_flux.append(test_flux[i])
        plt.plot(new_time, new_flux, 'xk')
        colors = ['r', 'b', 'g', 'c', 'm', 'y']
        styles = ['-', '-.']
        print "test class:", test_class
        legtext = 'Original TS (class {0})'.format(test_class)
        legends = [legtext]
        for sh_num, sh_class in enumerate(best_line.keys()):
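
# A minimal sketch of the '-' filtering above as a reusable helper, assuming
# the file_to_lc convention where a flux entry of '-' marks a missing
# observation. The helper name drop_missing is hypothetical, not part of the
# original code.
from itertools import izip

def drop_missing(time, flux):
    # keep only the samples whose flux is not the '-' placeholder
    new_time, new_flux = [], []
    for t, f in izip(time, flux):
        if f != '-':
            new_time.append(t)
            new_flux.append(f)
    return new_time, new_flux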
import lightcurve
import features
import utils

LC_PATH = 'lightcurves/norm_n1.5_a100_m0_s400/'

lc = lightcurve.file_to_lc(LC_PATH + 'SNe_wide_25.data')
print "SNe"
print ['{0}'.format(obj) for obj in features.time_flux(lc)]
#print [round(obj, 3) for obj in features.time_flux(lc)]
print "ESE"
lc = lightcurve.file_to_lc(LC_PATH + 'ESE_wide_25.data')
print [round(obj, 3) for obj in features.time_flux(lc)]
print "IDV"
lc = lightcurve.file_to_lc(LC_PATH + 'IDV_wide_25.data')
print [round(obj, 3) for obj in features.time_flux(lc)]
print "Novae"
lc = lightcurve.file_to_lc(LC_PATH + 'Novae_wide_25.data')
print [round(obj, 3) for obj in features.time_flux(lc)]
def expdir_to_arff(lc_files, dyncache, dyncache_keyset, exp_dir, arff_fname):
    # Load up the description of each feature (name and count) to use to write the arff
    featdesc_file = open(FEATDESC_FNAME)
    feat_names = []
    feat_counts = {}
    for line in featdesc_file:
        if line[0] == '#':
            continue
        line = line.strip().split('\t')
        feat_names.append(line[0])
        feat_counts[line[0]] = int(line[1])
    # and the classes
    class_file = open(CLASS_FNAME)
    classes = []
    for line in class_file:
        classes.append(line.strip())
    # produce the file and its header
    arff_file = open(arff_fname, 'w')
    arff_file.write("% Light curve classification features\n\n")
    arff_file.write("@RELATION {0}\n\n".format(exp_dir))
    for feat_name in feat_names:
        if feat_counts[feat_name] == 1:  # only 1 feature
            arff_file.write('@ATTRIBUTE {0} NUMERIC\n'.format(feat_name))
        else:
            for i in xrange(feat_counts[feat_name]):
                arff_file.write('@ATTRIBUTE {0}{1} NUMERIC\n'.format(feat_name, str(i)))
    arff_file.write('@ATTRIBUTE class {' + ', '.join(classes) + '}\n\n')
    arff_file.write('@DATA\n')
    # extract features if not in cache and append
    # TODO replace cache_file = open(CACHE_FNAME, 'a')
    to_process = len(lc_files)
    lc_file = None
    # try: # to stop corruption of the cache
    # guard against to_process < 10, which would make the progress modulus zero
    increment = max(1, int(round(to_process / 10.0)))
    done = 0
    conn = sqlite3.connect('feat_cache.db')
    c = conn.cursor()
    for lc_file in lc_files:
        #print lc_file
        if done % increment == 0 and done != 0:
            print "{0}/{1}".format(done, len(lc_files))
        done += 1
        # look for cache hit
        lc_class = lc_file.split('_')[0]
        features = None
        lc_path = '{0}/{1}/{2}'.format(LC_DIR, exp_dir, lc_file)
        #print "extracting features from:", lc_path
        # check to see if features are in the dynamic cache first
        if lc_path in dyncache_keyset:
            features = dyncache[lc_path]
        else:
            # do db lookup
            search_cursor = c.execute('''select * from featcache where key=?''', [lc_path])
            search_result = search_cursor.fetchall()
            if len(search_result) == 0:
                # cache miss, extract features
                print "db miss"
                lc = file_to_lc(lc_path)
                features = lc_to_features(lc)
                # parameterised insert: interpolating the tuple repr directly
                # into the SQL breaks on paths containing quotes
                placeholders = ','.join(['?'] * (1 + len(features)))
                c.execute('insert into featcache values ({0})'.format(placeholders),
                          [lc_path] + list(features))
            else:
                features = search_result[0][1:]  # drop the key, keep the features
            # whether extracted or fetched from the db, add to the dynamic cache
            dyncache[lc_path] = features
            dyncache_keyset.add(lc_path)
        # finally, write out the features
        arff_file.write(','.join([str(obj) for obj in features]) + ',' + lc_class + '\n')
    conn.commit()
    conn.close()
    arff_file.close()
    return (dyncache, dyncache_keyset)
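
# A sketch of the one-off setup expdir_to_arff assumes: a sqlite database
# 'feat_cache.db' with a featcache table keyed by lightcurve path and one REAL
# column per feature. The column layout and the n_features argument are
# assumptions; they must match the length of the vector lc_to_features returns.
import sqlite3

def create_feat_cache(n_features, db_fname='feat_cache.db'):
    conn = sqlite3.connect(db_fname)
    cols = ', '.join('f{0} REAL'.format(i) for i in xrange(n_features))
    conn.execute('create table if not exists featcache '
                 '(key TEXT PRIMARY KEY, {0})'.format(cols))
    conn.commit()
    conn.close()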
import features
import lightcurve
import sys

classtype = sys.argv[1]
num = sys.argv[2]
lc_path = 'lightcurves/norm_n1.5_a100_m0_s400/{0}_wide_{1}.data'.format(
    classtype, num)
hc = features.time_flux(lightcurve.file_to_lc(lc_path))[-22:]
print hc[:12]
print hc[12:]
hc = features.flux_only(lightcurve.file_to_lc(lc_path))[-22:]
print hc[:12]
print hc[12:]
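
# Example invocation of the script above (the file name print_feats.py is
# hypothetical): prints the last 22 time_flux features as rows of 12 and 10,
# then the same slice of the flux_only features, for SNe_wide_25.data.
#
#   python print_feats.py SNe 25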
def shapelet_features(apply_dir, args):
    # Get the parameters associated with the shapelet arguments given from the expt
    params = getshoutdir.getshfeatdir(args)
    print "params:", params
    # This is the directory containing the processed shapelets for the arguments
    shapelet_featureset = params[0]
    shapelet_feature_path = "shapelet_features/{0}".format(shapelet_featureset)
    print "extracting shapelet features using shapelets in:", shapelet_feature_path
    if not os.path.isdir('{0}/{1}'.format("raw_features", shapelet_featureset)):
        os.mkdir('{0}/{1}'.format("raw_features", shapelet_featureset))
    # This is the directory where the resulting features are going
    feature_out_dir = "{0}/{1}/{2}".format(
        RAW_FEAT_DIR, shapelet_featureset, apply_dir)  # zeroth element is directory name
    print "features extracted to:", feature_out_dir
    if os.path.isdir(feature_out_dir):
        print "directory already exists", feature_out_dir
        return  # do not extract
    else:
        print "creating directory:", feature_out_dir
        os.mkdir(feature_out_dir)
    use_dtw = params[1]
    use_md = params[2]
    best_amt = params[3]
    dist_func = None
    if use_md:
        dist_func = distances.mindist
    elif use_dtw:
        dist_func = distances.dtw
    else:
        print "error! no distance measure being used"
    # This is the lightcurve directory to which we apply the shapelets
    apply_dir = LC_DIR + "/" + apply_dir
    for cfnum in xrange(NUM_CROSSFOLDS):
        print "crossfold", cfnum
        # cf_best = utils.best_shapelets(crossfold + "/cf{0}".format(cfnum))
        test_list = "crossfold/cf{0}/test".format(cfnum)
        for fnum, fname in enumerate(open(test_list)):
            if fnum % 10 == 0:
                print "{0} files processed".format(fnum)
            fname = fname.strip()
            extract_file = apply_dir + "/" + fname
            # Open extraction file to lc
            extract_from = lightcurve.file_to_lc(extract_file)
            features = []
            # Load all the shapelets from shapelet_features/dir/cf(num) and find distances
            shapelet_source_dir = "{0}/cf{1}".format(shapelet_feature_path, cfnum)
            for shapelet_filename in os.listdir(shapelet_source_dir):
                # Get the shapelet contents and apply the distance measure
                shapelet_path = '{0}/cf{1}/{2}'.format(
                    shapelet_feature_path, cfnum, shapelet_filename)
                shapelet_as_lc = lightcurve.file_to_lc(shapelet_path)
                measure_with_flux = shapelet_as_lc.flux
                if len(measure_with_flux) == 0:
                    print "missing a shapelet file:", shapelet_path
                    continue
                distance = dist_func(extract_from.flux, measure_with_flux)[0]
                features.append(distance)
            # Finally, write out all the features
            feat_outfname = feature_out_dir + "/" + fname
            feat_outfile = open(feat_outfname, 'w')
            feat_outfile.write(','.join([str(o) for o in features]))
            feat_outfile.close()
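
# The loop above relies on distances.mindist and distances.dtw sharing an
# interface: both take two flux sequences and return a tuple whose first
# element is the scalar distance (hence dist_func(...)[0]). A stand-in
# illustrating that contract; the Euclidean body is an assumption, not the
# real mindist implementation:
def euclidean_stub(flux_a, flux_b):
    n = min(len(flux_a), len(flux_b))
    d = sum((flux_a[i] - flux_b[i]) ** 2 for i in xrange(n)) ** 0.5
    return (d, None)  # (distance, extra), so [0] picks out the distance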
# Extract features from every light curve in training directory
print "extracting features for", exp_feat_dir
for tf, train_test in enumerate([train, test]):  # just for convenience
    # Extract shapelets if necessary (external step to other feature extraction)
    if "-" in feat_id or 'shapelet' in feat_id:  # ugh
        print "extracting shapelet features for directory:", train_test
        #if tf == 0: # if we are computing the training set OH GOD SO HACKED
        #    print comp_features[feat_id]
        #    if comp_features[feat_id][1] != 'None': # if there is a forced train set
        #        print train_test
        #        train_test = comp_features[feat_id][1]
        #        print "using forced training set:", train_test
        # extract all shapelets for train_test with args
        shapelet_features(train_test, comp_features[feat_id][0])
        continue  # do not proceed (what would we do anyway?)
    outdir = "{0}/{1}".format(exp_feat_dir, train_test)
    if os.path.isdir(outdir):
        print "features already extracted to", outdir, "skipping"
        continue
    os.mkdir(outdir)
    lcs_to_extract = os.listdir("{0}/{1}".format(LC_DIR, train_test))
    for fname in lcs_to_extract:
        if fname == ".DS_Store":  # skip macOS metadata files
            continue
        # open the output file only after the metadata check, so no stray
        # .DS_Store file is created in outdir
        outfile = open("{0}/{1}".format(outdir, fname), 'w')
        data_fname = "{0}/{1}/{2}".format(LC_DIR, train_test, fname)
        lc = lightcurve.file_to_lc(data_fname)
        # feat_id names the feature-extraction function to apply
        features = [data_fname] + eval(feat_id)(lc)
        outfile.write(','.join([str(o) for o in features]) + '\n')
        outfile.close()
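
# feat_id above is eval'd into a feature-extraction callable (e.g. time_flux
# or flux_only from the features module, as used in the other scripts). A
# sketch of an eval-free dispatch under that assumption; the whitelist
# contents are illustrative, not exhaustive:
import features

FEAT_FUNCS = {
    'time_flux': features.time_flux,
    'flux_only': features.flux_only,
}

def lookup_feat_func(feat_id):
    try:
        return FEAT_FUNCS[feat_id]
    except KeyError:
        raise ValueError('unknown feature extractor: {0}'.format(feat_id))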
if update:
    best_line[sh_class] = line
    best[sh_class] = line[-2]
    best_SD[sh_class] = line[-1]

# Write these out to the appropriate directory
out_dir = "{0}/cf{1}".format(SHAPELET_FEATURE_DIR, cfnum)
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)
debug_dir = "{0}/cf{1}".format(SHAPELET_DEBUG_DIR, cfnum)
if not os.path.isdir(debug_dir):
    os.mkdir(debug_dir)
for sh_class in best_line.keys():
    print "class:", sh_class, "id:", best_line[sh_class][0]
    source_filename = best_line[sh_class][1].split('/')[-1]
    source = lightcurve.file_to_lc('{0}/{1}'.format(SHAPELET_SOURCE_DIR, source_filename))
    sh_start = int(best_line[sh_class][2])
    sh_end = int(best_line[sh_class][3]) + sh_start
    debug_index.write('{0},{1}\n'.format(sh_class, best_line[sh_class][0]))
    out_fname = "{0}/{1}_shapelet_{2}.data".format(out_dir, sh_class, cfnum)
    print "writing to file:", out_fname
    out_file = open(out_fname, 'w')
    for t, f in izip(source.time[sh_start:sh_end], source.flux[sh_start:sh_end]):
        out_file.write('{0}\t{1}\n'.format(t, f))
    out_file.close()
    plt.plot(source.time[:sh_start], source.flux[:sh_start], 'k',
             source.time[sh_end:], source.flux[sh_end:], 'k',
             source.time[sh_start:sh_end], source.flux[sh_start:sh_end], 'r')
    plt.xlabel('Time (days)')
    plt.ylabel('Flux (mJy, normalised)')
    # format="pdf" belongs to savefig; previously it sat inside the str.format
    # call, where it was silently ignored
    plt.savefig('{0}/{1}_shapelet{2}.pdf'.format(debug_dir, sh_class, 1), format="pdf")
    plt.close()
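
# A sketch of the debug plot above as a reusable helper: the chosen shapelet
# span [sh_start:sh_end] is drawn in red over the rest of the source curve in
# black. The helper name and signature are hypothetical.
import matplotlib.pyplot as plt

def plot_shapelet_span(source, sh_start, sh_end, out_fname):
    plt.plot(source.time[:sh_start], source.flux[:sh_start], 'k',
             source.time[sh_end:], source.flux[sh_end:], 'k',
             source.time[sh_start:sh_end], source.flux[sh_start:sh_end], 'r')
    plt.xlabel('Time (days)')
    plt.ylabel('Flux (mJy, normalised)')
    plt.savefig(out_fname, format='pdf')
    plt.close()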