def xanes_fitting_Line(im_stack, e_list, refs, method="NNLS", alphaForLM=0.05):
    """Linear combination fit of image data with reference standards."""
    en, im1, im2 = np.shape(im_stack)
    im_array = np.mean(im_stack, 2)
    coeffs_arr = []
    meanStats = {
        "R_Factor": 0,
        "R_Square": 0,
        "Chi_Square": 0,
        "Reduced Chi_Square": 0,
    }
    for i in range(im1):
        stats, coeffs = xanes_fitting_1D(im_array[:, i], e_list, refs,
                                         method=method, alphaForLM=alphaForLM)
        coeffs_arr.append(coeffs)
        for key in stats.keys():
            meanStats[key] += stats[key]

    for key, vals in meanStats.items():
        meanStats[key] = np.around(vals / im1, 5)

    return meanStats, np.mean(coeffs_arr, axis=0)
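# Note: the per-line fits above delegate to xanes_fitting_1D, which is not
# included in this snippet. For orientation only, a minimal sketch of what a
# non-negative linear-combination fit of a single spectrum could look like,
# assuming scipy.optimize.nnls and an R-factor defined as the normalized
# absolute misfit. The helper name and the exact statistics are illustrative
# assumptions, not the project's actual implementation.
import numpy as np
from scipy.optimize import nnls

def fit_spectrum_nnls(spec, refs):
    """Hypothetical sketch: fit one XANES spectrum as a non-negative
    combination of reference spectra (columns of `refs`)."""
    coeffs, _ = nnls(refs, spec)          # non-negative least squares
    fit = refs @ coeffs                   # reconstructed spectrum
    resid = spec - fit
    r_factor = np.sum(np.abs(resid)) / np.sum(np.abs(spec))
    chi_sq = np.sum(resid ** 2)
    return {"R_Factor": r_factor, "Chi_Square": chi_sq}, coeffs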
def reportStats(self, stats, name, sortby, n):
    # Format stats defined in the 'stats' dictionary into an output string.
    # 'sortby' specifies how the output should be ranked. 'n' is the number
    # of entries to print (specify an integer or 'all').
    sortbyDict = self.getStatsForList([1, 1])
    headerOrder = sortbyDict['returnOrder']
    headerString = '{}\t\t{}\t\t{}\t\t{}\t\t{}\t\t{}\t\t{}\n'.format(name, *headerOrder)
    if sortby not in sortbyDict.keys():
        return 'Unexpected ranking specified'
    list1, list2 = [], []
    for key in stats.keys():
        statsDic = stats[key]
        statsFmtd = []
        for key2 in headerOrder:
            if isinstance(statsDic[key2], float):
                statsFmtd.append('{0:.3f}'.format(statsDic[key2]))
            else:
                statsFmtd.append(str(statsDic[key2]))
        string = '\t\t'.join([str(key)] + statsFmtd)
        list1.append(string)
        list2.append(statsDic[sortby])
    # sort by the chosen 'sortby' parameter
    sortedList1 = [x for (y, x) in sorted(zip(list2, list1))]
    if n != 'all':
        stringOut = '\n'.join(sortedList1[:n])
    else:
        stringOut = '\n'.join(sortedList1)
    return headerString + stringOut
def grid_search_own_metrics_class_stratified(ix, iy, stratus_list, clf,
                                             get_grid_hyperparams_kargs,
                                             regressor_name="Regressor",
                                             show_progress_percentage=0.1,
                                             kfold=5, shuffle=True,
                                             sort_report_by='roc_auc_score'):
    clfks = get_grid_hyperparams(**get_grid_hyperparams_kargs)
    report_data = []
    hpks = list(clfks[0].keys())
    cols = len(clfks)
    progress_int = max(int(round(cols * show_progress_percentage, 0)), 1)
    print("Total number of evaluations:{}".format(cols))
    for col, clfk in enumerate(clfks):
        metrics = cv_metrics_stratified_class(ix, iy.flatten(),
                                              stratus_list=stratus_list,
                                              clf=clf, clfk=clfk,
                                              kfold=kfold, shuffle=shuffle)
        metrics_report = {'name': regressor_name}
        for m in metrics.keys():
            stats = metrics_stats(metrics[m], rn=3)
            for sk in stats.keys():
                metrics_report[m + "_" + sk] = stats[sk]
        metrics_report.update(clfk)
        report_data.append(metrics_report.copy())
        if col % progress_int == 0:
            progress = round(100 * col / cols, 0)
            print("{} %".format(progress))
    print("100.0 %")
    odf = pd.DataFrame(report_data[:])
    odf = odf.sort_values(by=[sort_report_by + "_mean"], ascending=False)
    mean_cols = list(filter(lambda x: "mean" in x, list(odf.columns)))
    ocols = ['name']
    ocols.extend(hpks)
    ocols.extend(mean_cols)
    ocols.extend(list(filter(lambda x: x not in ocols, odf.columns)))
    odf = odf[ocols].reset_index(drop=True)
    return odf.copy()
def cv_metrics_df_with_indexes(X, Y, train_indexes, test_indexes, iclf, iclfk={}, report_metrics=[ 'matthews_corr_coef', 'roc_auc_score', 'f1_score', 'sensitivity', 'specificity' ], norm=False, calc_stats=True, report_name='CLF', sort_metric='matthews_corr_coef_min'): output_objs = {} output_metrics = {} stats_df = [] report_name_sufix = '' total_features = X.shape[-1] for train_index, test_index in zip(train_indexes, test_indexes): X_train, X_test = X[train_index], X[test_index] y_train, y_test = Y[train_index], Y[test_index] if len(y_test.shape) > 1: y_test = y_test.argmax(1) if norm == True: X_train, X_test = norm_z_score(X_train, X_test) start_feature_number = 1 end_feature_number = total_features + 1 for feature_number in range(start_feature_number, end_feature_number): tmp_scores = output_metrics.get('F' + str(feature_number), {}) X_train_ = X_train[:, :feature_number] X_test_ = X_test[:, :feature_number] tmp_scores = fit_and_get_metrics(X_train_, X_test_, y_train, y_test, iclf, iclfk, report_metrics, tmp_scores) output_metrics['F' + str(feature_number)] = tmp_scores if calc_stats == True: number_of_features = 1 for fn in range(number_of_features, total_features + 1): fk = 'F' + str(fn) metrics = output_metrics[fk] metrics_report = { 'Name': report_name + report_name_sufix, 'Number of Variables': fn } for m in metrics.keys(): stats = metrics_stats(metrics[m], rn=3) for sk in stats.keys(): metrics_report[m + "_" + sk] = stats[sk] stats_df.append(metrics_report) stats_df = pd.DataFrame(stats_df).sort_values( by=[sort_metric, 'Number of Variables'], ascending=[False, True]).reset_index(drop=True) return output_metrics.copy(), stats_df.copy()
def loadAnalyserResults(yearList, suffix='noStopLoss_partial_360_noOutlier',
                        pricePerc=True):
    '''
    Takes a list of years as input and loads the corresponding result files.
    '''
    yearStats = {}
    # suffix = 'min_1_stopLoss_20bp_full_360'
    # suffix = 'noStopLoss_partial_360'
    for year in yearList:
        print('Currently at year:', year)
        with open('results/' + suffix + '/' + 'gapStatsDf_' + str(year) +
                  '_' + suffix + '.pickle', 'rb') as handle:
            yearStats[year] = pickle.load(handle)

    # Preprocessing to add desired stats
    for year in yearList:
        stockList = yearStats[year].keys()
        for stock in stockList:
            yearStats[year][stock]['prevLow'] = yearStats[year][stock]['currLow'].shift(1)
            yearStats[year][stock]['prevHigh'] = yearStats[year][stock]['currHigh'].shift(1)
            yearStats[year][stock]['prevOpen'] = yearStats[year][stock]['currOpen'].shift(1)
            yearStats[year][stock].dropna(axis=0, how='any', inplace=True)

    stats = yearStats[yearList[0]]
    stockList = stats.keys()
    for year in yearList[1:]:
        for stock in stockList:
            stats[stock] = stats[stock].append(yearStats[year][stock])

    grandStats = pd.DataFrame()
    for stock in stockList:
        tmpDf = pd.DataFrame()
        tmpDf = tmpDf.append(stats[stock])
        tmpDf['stockName'] = stock
        if pricePerc:
            tmpDf['pricePerc'] = (tmpDf['currOpen']).rolling(window=20).apply(
                lambda x: pd.Series(x).rank(pct=True).iloc[-1])
            tmpDf['pricePerc'] *= 100.0
            tmpDf.dropna(inplace=True)
        grandStats = grandStats.append(deepcopy(tmpDf))
    print('Loading results complete!')
    return grandStats
def readStatsDafsComputeStandardizationBins(statAndDafFileName, nBins=50, pMisPol=0.0):
    stats = {}
    dafs = []
    pMisPolInv = 1 - pMisPol
    misPolarizedSnps, totalSnps = 0, 0
    with open(statAndDafFileName) as statAndDafFile:
        first = True
        for line in statAndDafFile:
            line = line.strip().split()
            if first:
                first = False
                header = line
                assert header[0] == "daf"
                for i in range(1, len(header)):
                    stats[header[i]] = []
            else:
                totalSnps += 1
                if random.random() >= pMisPolInv:
                    dafs.append(1 - float(line[0]))
                    misPolarizedSnps += 1
                else:
                    dafs.append(float(line[0]))
                for i in range(1, len(line)):
                    stats[header[i]].append(float(line[i]))

    statInfo = {}
    for statName in stats.keys():
        stats[statName] = np.array(stats[statName])
        nonan = ~np.isnan(stats[statName])
        score_nonan = stats[statName][nonan]
        daf_nonan = np.array(dafs)[nonan]
        bins = allel.stats.selection.make_similar_sized_bins(daf_nonan, nBins)
        mean_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan,
                                                        statistic=np.mean, bins=bins)
        std_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan,
                                                       statistic=np.std, bins=bins)
        statInfo[statName] = (mean_score, std_score, bins)

    sys.stderr.write("mispolarized %d of %d (%f%%) "
                     "SNPs when standardizing scores in %s\n" % (
                         misPolarizedSnps,
                         totalSnps,
                         100 * misPolarizedSnps / float(totalSnps),
                         statAndDafFileName,
                     ))
    return statInfo
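# The per-bin means and standard deviations returned above are presumably used
# elsewhere to standardize raw scores by derived-allele frequency. A minimal,
# hedged sketch of how that lookup could work, assuming only the
# (mean_score, std_score, bins) layout built above; the helper name
# standardize_score is illustrative, not part of the original code.
import numpy as np

def standardize_score(score, daf, statInfo, statName):
    """Hypothetical sketch: z-standardize one score against the DAF bin it
    falls into, using the (mean, std, bin_edges) triple built above."""
    mean_score, std_score, bins = statInfo[statName]
    # np.digitize returns 1..len(bins)-1 for values inside the edges
    binIdx = np.digitize(daf, bins) - 1
    binIdx = min(max(binIdx, 0), len(mean_score) - 1)  # clamp edge cases
    return (score - mean_score[binIdx]) / std_score[binIdx]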
def _make_stats_tables(stats):
    name_col = [
        k for k in stats.keys() if k != 'intervals' and stats[k] is not None
    ]
    value_col = [
        round(float(stats[k]), 3) for k in name_col if stats[k] is not None
    ]
    interval_names = list(stats['intervals'].keys())
    interval_values = [
        round(float(i[1]), 1) for i in stats['intervals'].values()
    ]

    data = dict(
        names=name_col,
        values=value_col,
    )
    source = ColumnDataSource(data)
    columns = [
        TableColumn(field="names", title="Name"),
        TableColumn(field="values", title="Value"),
    ]
    data_table = DataTable(source=source, columns=columns, width=150,
                           fit_columns=True, index_position=None)

    int_data = dict(
        names=interval_names,
        values=interval_values,
    )
    int_source = ColumnDataSource(int_data)
    int_columns = [
        TableColumn(field="names", title="interval"),
        TableColumn(field="values", title="Max Value"),
    ]
    interval_table = DataTable(source=int_source, columns=int_columns, width=120,
                               fit_columns=True, index_position=None)
    return data_table, interval_table
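# From the comprehensions above, _make_stats_tables appears to expect a flat
# dict of scalar statistics plus an 'intervals' dict whose values are
# indexable pairs with the maximum in position 1. A hedged usage sketch under
# that assumption; the example keys and numbers are made up for illustration.
from bokeh.io import show
from bokeh.layouts import row

example_stats = {
    "mean": 4.21,
    "median": 3.98,
    "intervals": {"0-10 s": (0.0, 5.6), "10-20 s": (10.0, 7.2)},
}
data_table, interval_table = _make_stats_tables(example_stats)
show(row(data_table, interval_table))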
def xanes_fitting_Binned(im_stack, e_list, refs, method="NNLS", alphaForLM=0.05):
    """Linear combination fit of image data with reference standards."""
    im_stack = resize_stack(im_stack, scaling_factor=10)
    # use a simple filter to find threshold value
    val = filters.threshold_otsu(im_stack[-1])
    en, im1, im2 = np.shape(im_stack)
    im_array = im_stack.reshape(en, im1 * im2)
    coeffs_arr = []
    meanStats = {
        "R_Factor": 0,
        "R_Square": 0,
        "Chi_Square": 0,
        "Reduced Chi_Square": 0,
    }
    specs_fitted = 0
    total_spec = im1 * im2
    for i in range(total_spec):
        spec = im_array[:, i]
        # do not fit low intensity/background regions
        if spec[-1] > val:
            specs_fitted += 1
            stats, coeffs = xanes_fitting_1D(spec / spec[-1], e_list, refs,
                                             method=method, alphaForLM=alphaForLM)
            coeffs_arr.append(coeffs)
            for key in stats.keys():
                meanStats[key] += stats[key]
        else:
            pass

    for key, vals in meanStats.items():
        meanStats[key] = np.around(vals / specs_fitted, 6)
    # print(f"{specs_fitted}/{total_spec}")
    return meanStats, np.mean(coeffs_arr, axis=0)
def concatenate_feature_vectors(eeg, stats):
    if eeg.keys() != stats.keys():
        raise ValueError("Input dictionaries have different keys")
    output_structure = {}
    for subset_name in eeg.keys():
        if eeg[subset_name].keys() != stats[subset_name].keys():
            raise ValueError("Input dictionaries have different keys")
        output_structure.update({subset_name: {}})
        for class_name in eeg[subset_name].keys():
            if len(eeg[subset_name][class_name]) != len(stats[subset_name][class_name]):
                raise ValueError("Numbers of frames in dicts are different")
            output_structure[subset_name].update({class_name: []})
            for i in range(len(eeg[subset_name][class_name])):
                frame1 = eeg[subset_name][class_name][i]
                frame2 = stats[subset_name][class_name][i]
                if len(frame1.shape) != 1 or len(frame2.shape) != 1:
                    raise ValueError("Frame dimensionality not equal to 1")
                newframe = np.concatenate([frame1, frame2])
                output_structure[subset_name][class_name].append(newframe)
    return output_structure
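# A small usage example of the concatenation above, with made-up subset/class
# names and tiny 1-D arrays, only to show the expected nesting
# (subset -> class -> list of frames).
import numpy as np

eeg_example = {"train": {"rest": [np.array([1.0, 2.0]), np.array([3.0, 4.0])]}}
stats_example = {"train": {"rest": [np.array([0.5]), np.array([0.7])]}}

combined = concatenate_feature_vectors(eeg_example, stats_example)
print(combined["train"]["rest"][0])  # -> [1.  2.  0.5]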
def cv_metrics_stratified_class_report_with_indexes(
        X, Y, indexes, clf, clfk={}, kfold=5, shuffle=True,
        report_metrics=[
            'matthews_corr_coef', 'roc_auc_score', 'f1_score', 'sensitivity',
            'specificity'
        ],
        regressor_name='Regressor', sort_report_by='roc_auc_score', norm=False):
    report_data = []
    metrics = cv_metrics_stratified_class_with_indexes(
        X, Y, indexes=indexes, clf=clf, clfk=clfk,
        report_metrics=report_metrics, norm=norm)
    metrics_report = {'name': regressor_name}
    for m in metrics.keys():
        stats = metrics_stats(metrics[m], rn=3)
        for sk in stats.keys():
            metrics_report[m + "_" + sk] = stats[sk]
    metrics_report.update(clfk)
    report_data.append(metrics_report.copy())
    odf = pd.DataFrame(report_data[:])
    odf = odf.sort_values(by=[sort_report_by + "_mean"], ascending=False)
    mean_cols = list(filter(lambda x: "mean" in x, list(odf.columns)))
    ocols = ['name']
    ocols.extend(clfk)
    ocols.extend(mean_cols)
    ocols.extend(list(filter(lambda x: x not in ocols, odf.columns)))
    odf = odf[ocols].reset_index(drop=True)
    return odf.copy()
def write_stats(eff, stats, filename, titles, names, all_names, corp_names, process_time, write_time, db_update_obj): file = filename + "-stats.tsv" if not os.path.exists("Webapp/Files/results/%s/Stats" %(filename)): os.makedirs("Webapp/Files/results/%s/Stats" %(filename)) file_path = os.path.join("Webapp/Files/results/%s/Stats" %(filename), file) try: if os.path.isfile(file_path): os.unlink(file_path) except Exception as e: print(e) with open(file_path, "w+") as stat: stat.write(file + " was processed in " + str(process_time) + " write back to file in " + str(write_time) +"\n\n") stat.write(str(all_names) + " names were extracted from " +filename + "\n" + str(names) + " unique names " + " --- " + str(names - int(corp_names)) + " Personal names and " + str(corp_names) + " Corporate names" + "\n") stat.write(str(titles) + " titles were extracted from " + filename + "\n\n") stat.write("API searched" +"\t" + "hits" + "\t" + "hit_rate" +"\n") for i in stats.keys(): setattr(db_update_obj, i, str(stats[i])) db_update_obj.save() stat.write(i + "\t" + str(stats[i]) + "\t" + str((int(stats[i])/names)*100) + "\n") stat.write("\n" + "\n") if 'LC' in eff.keys(): stat.write('LC_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["LC"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'VIAF' in eff.keys(): stat.write('VIAF_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["VIAF"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'work_id' in eff.keys(): stat.write('work_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["work_id"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'oclcid' in eff.keys(): stat.write('oclc_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["oclcid"]: stat.write(str(i) + "\t")
def detect(imagefilename, threshold=0.5, prior=-1.0):
    paramsdir = '../../data/params/'
    mergepairs = True
    stats = pickle.load(open('%snormalbayes-detectionoutput.pkl' % (paramsdir), 'rb'))
    if prior > -1.0:
        stats['posnegratio'] = prior

    # load descriptors from feature file
    patchstep = 25
    patchsize = 30
    # featuretype = 'surf_%d_%d' % (patchstep, patchsize)
    data = {}
    features = []
    nfeatures = 0
    for feature in stats.keys():
        if feature != 'posnegratio':
            descriptorfile = '%s%s_%d_%d' % (imagefilename[:-3], feature, patchstep, patchsize)
            data[feature] = numpy.genfromtxt(descriptorfile)
            features.append(feature)

    # for each descriptor, determine whether there is a match or not
    matches = []
    for i in range(data[features[0]].shape[0]):
        xkey = data[features[0]][i, 0]
        ykey = data[features[0]][i, 1]
        descriptor = {}
        for feature in features:
            descriptor[feature] = data[feature][i, 3:]
        p = matchprobability(descriptor, stats, features)
        if p > threshold:
            if (not mergepairs) or (((xkey - patchstep, ykey) not in matches)
                                    and ((xkey, ykey - patchstep) not in matches)):
                matches.append((xkey, ykey))
    return matches
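# matchprobability is not shown in this snippet. For orientation only, a
# heavily hedged sketch of what a naive-Bayes style match probability over
# per-feature Gaussian statistics could look like. The assumed layout of
# stats (per-feature 'pos'/'neg' mean-variance pairs plus the 'posnegratio'
# prior) is a guess for illustration, not the actual pickled format.
import numpy as np

def matchprobability_sketch(descriptor, stats, features):
    """Hypothetical: combine per-feature Gaussian log-likelihood ratios with
    the 'posnegratio' prior into a posterior probability of a match."""
    log_odds = np.log(stats['posnegratio'])
    for feature in features:
        mu_pos, var_pos = stats[feature]['pos']   # assumed layout
        mu_neg, var_neg = stats[feature]['neg']   # assumed layout
        x = descriptor[feature]
        ll_pos = -0.5 * np.sum((x - mu_pos) ** 2 / var_pos + np.log(2 * np.pi * var_pos))
        ll_neg = -0.5 * np.sum((x - mu_neg) ** 2 / var_neg + np.log(2 * np.pi * var_neg))
        log_odds += ll_pos - ll_neg
    return 1.0 / (1.0 + np.exp(-log_odds))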
def write_stats(eff, stats, filename, titles, names, all_names, corp_names, process_time, write_time): file = filename + "-stats.tsv" if not os.path.exists("results/%s/Stats" %(filename)): os.makedirs("results/%s/Stats" %(filename)) file_path = os.path.join("results/%s/Stats" %(filename), file) try: if os.path.isfile(file_path): os.unlink(file_path) except Exception as e: print(e) with open(file_path, "w+") as stat: stat.write(file + " was processed in " + str(process_time) + " write back to file in " + str(write_time) +"\n\n") stat.write(str(all_names) + " names were extracted from " +filename + "\n" + str(names) + " unique names " + " --- " + str(names - int(corp_names)) + " Personal names and " + str(corp_names) + " Corporate names" + "\n") stat.write(str(titles) + " titles were extracted from " + filename + "\n\n") stat.write("API searched" +"\t" + "hits" + "\t" + "hit_rate" +"\n") for i in stats.keys(): stat.write(i + "\t" + str(stats[i]) + "\t" + str((int(stats[i])/names)*100) + "\n") stat.write("\n" + "\n") if 'LC' in eff.keys(): stat.write('LC_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["LC"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'VIAF' in eff.keys(): stat.write('VIAF_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["VIAF"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'work_id' in eff.keys(): stat.write('work_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["work_id"]: stat.write(str(i) + "\t") stat.write("\n" + "\n") if 'oclcid' in eff.keys(): stat.write('oclc_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n") for i in eff["oclcid"]: stat.write(str(i) + "\t")
def main(ref_path, pred_path, r_nodata, p_nodata, out_txt, pred_scale=1, ax_limit=None):
    sns.set_style('white')
    sns.set_context(context='paper', rc={'patch.linewidth': 0})

    t0 = time.time()
    r_nodata = int(r_nodata)
    p_nodata = int(p_nodata)
    out_dir, basename = os.path.split(out_txt)
    t1 = time.time()

    # Find and read reference raster
    if not os.path.exists(ref_path):
        raise RuntimeError('ref_path does not exist: %s' % ref_path)
    ds_r = gdal.Open(ref_path)
    ar_r = ds_r.ReadAsArray()
    tx_r = ds_r.GetGeoTransform()
    prj_r = ds_r.GetProjection()
    ds_r = None

    # Find and read pred raster
    if not os.path.exists(pred_path):
        raise RuntimeError('pred_path does not exist: %s' % pred_path)
    ds_p = gdal.Open(pred_path)
    ar_p = ds_p.ReadAsArray()
    tx_p = ds_p.GetGeoTransform()
    prj_p = ds_p.GetProjection()
    ds_p = None

    if not (tx_p == tx_r and prj_p == prj_r):
        raise ValueError('Geo transform and/or projection of reference and prediction rasters do not match.')

    mask = (ar_r != r_nodata) & (ar_p != p_nodata)
    # print(ar_p.min())
    ar_r = ar_r[mask].astype(np.int32)
    ar_p = ar_p[mask].astype(np.int32)
    if 'ltbiomass' in ref_path:
        ar_r = ar_r * float(pred_scale)
    else:
        ar_p = ar_p * float(pred_scale)
    # import pdb; pdb.set_trace()

    # Calc stats
    rmse, rmspe, r2, r, gmfr_a, gmfr_b = calc_stats(ar_r, ar_p)
    stats = {'n_pixels': ar_p.size,
             'rmse': rmse,
             'r2': r2,
             'pearsonr': r,
             'odr_intercept': gmfr_a,
             'odr_slope': gmfr_b
             }

    # Make 2D histograms
    xlabel = 'reference'
    ylabel = 'predicted'
    this_bn = '%s_%s_vs_%s.png' % (basename.replace('.txt', ''),
                                   os.path.basename(ref_path),
                                   os.path.basename(pred_path))
    title = this_bn.replace('_vs_', ' vs ').replace('.png', '')
    out_png = os.path.join(out_dir, this_bn)
    ax = histogram_2d(ar_r, ar_p, out_png, hexplot=False, cmap='plasma',
                      xlabel=xlabel, ylabel=ylabel, bins=50)
    xlim_max = max(ax.get_xlim())
    ax_limit = max(xlim_max, ax_limit) if ax_limit else xlim_max
    plt.sca(ax)

    # Plot GMFR (RMA) regression line
    max_val = max(ar_r.max(), ar_p.max())
    x = np.array([0, max_val + 100])
    y = x * gmfr_b + gmfr_a
    plt.plot(x, y, '-', lw=2, color='k')
    label_text = '$r^2$ = %.3f' % r2
    plt.suptitle(title)
    plt.title(label_text, fontsize=12)

    # set plotting limits
    if not ax_limit:
        ax_limit = max_val
    plt.ylim((0, ax_limit))
    plt.xlim((0, ax_limit))
    plt.savefig(out_png, dpi=300)
    plt.clf()

    df_xy = pd.DataFrame({'id': np.arange(ar_r.size),
                          'landtrendr': ar_p.astype(np.int16),
                          'lidar': ar_r.astype(np.int16)
                          })
    df_xy.set_index('id', inplace=True)
    df_xy.to_csv(out_png.replace('.png', '_xy.txt'))

    desc = '2D histograms made with the following parameters:\n'
    desc += '\tref_path: %s\n' % ref_path
    desc += '\tpred_path: %s\n' % pred_path
    desc += '\tr_nodata: %s\n' % r_nodata
    desc += '\tp_nodata: %s\n' % p_nodata
    desc += '\tout_txt: %s\n' % out_txt.replace('_stats.txt', '.txt')
    desc += '\tpred_scale: %s\n' % pred_scale
    desc += '\nStats for this comparison:'
    for k in sorted(stats.keys()):
        stat_str = '%s: %s' % (k, stats[k])
        desc += '\n\t' + stat_str
        print(stat_str)
    # desc += '\n\t'.join(['%s: %s' % (k, stats[k]) for k in sorted(stats.keys())])
    createMetadata(sys.argv, out_txt, description=desc)

    print('\nText file written to', out_txt)
    print('Total time: %.1f minutes' % ((time.time() - t0) / 60))
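# calc_stats is not included in this snippet. Since the plotted line is
# labeled as a GMFR (reduced major axis) regression, here is a hedged sketch
# of how that slope and intercept are conventionally computed; the function
# name and the assumption that calc_stats works this way are illustrative.
import numpy as np

def gmfr_sketch(x, y):
    """Geometric mean functional relationship (reduced major axis) fit:
    slope is the ratio of standard deviations, signed by the correlation."""
    r = np.corrcoef(x, y)[0, 1]
    slope = np.sign(r) * np.std(y) / np.std(x)
    intercept = np.mean(y) - slope * np.mean(x)
    return intercept, slope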
def marc_process(processing_files, apis): #proccess start time tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') db_update_obj = P_progress(pid=processing_files) db_update_obj.save() file = "Webapp/source/%s" % (processing_files.name) #convert .mrc to MARC/XML Marc_XML = MARC_XML(file) files = Marc_XML.convert_marc_xml(db_update_obj) tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') file = os.path.join('', files) filename = files.replace('.xml', '').replace('Webapp/Files/Processing/', '') print("processing " + filename) ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') log_file = str(filename) + "-error-logs" output = str(filename) + "-enhanced.xml" clearLogs(log_file, filename) query_type = "/authorities/names" # extracting names and titles from BIBFRAME db_update_obj.stage = "Extracting_names_and_titles" db_update_obj.save() mergecheck = True bib_object = Bibframe(file, log_file, mergecheck) transformed = bib_object.convert_bibframe() names = bib_object.extract_names(transformed)[0] titles = bib_object.extract_names(transformed)[1] #getting corp names (for stat report) all_names = bib_object.extract_names(transformed)[2] corp_names = bib_object.extract_names(transformed)[3] print(str(all_names) + " names were extrected from " + filename) print( str(len(names)) + " unique names were extracted from " + filename + " --- " + str(len(names) - corp_names) + " Personal names and " + str(corp_names) + " Corporate names") print(str(len(titles)) + " titles were extracted from " + filename) db_update_obj.all_names = str(len(names)) db_update_obj.all_titles = str(len(titles)) db_update_obj.p_names = str(len(names) - corp_names) db_update_obj.c_names = str(corp_names) db_update_obj.save() #dictionaries for storing URIs (names and titles) and stats enriched_names = {} enriched_titles = {} stats = {} print("enriching names") # iterate over the name dictionary db_update_obj.stage = "Enriching_names" db_update_obj.save() for index, item in enumerate(names.keys()): db_update_obj.name_index = index + 1 db_update_obj.save() name = item.split('-_-_-')[0] #print(index+1, name) enriched_names[item] = [] for api in apis: #check if the stat for the API already exists if api in stats.keys(): pass else: stats[api] = 0 # getting the API method name_result = APIFactory().get_API(name, query_type, api, log_file) # if the results are not empty, append to "enriched_names" dictionary the result using the api name as key if name_result: enriched_names[item].append(name_result) # add number of results to be used latter in stats report stats[api] = stats[api] + len(name_result) print("enriching titles") # iterate over the title dictionary db_update_obj.stage = "Enriching_titles" db_update_obj.save() for index, title in enumerate(titles.keys()): db_update_obj.title_index = index + 1 db_update_obj.save() #print(index+1, title) for authors in titles[title]['authors']: author = authors.split('-_-_-')[0] key = str(author) + "-_-_-" + str(title) enriched_titles[key] = [] title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file) if title_result: enriched_titles[key].append(title_result) # getting rid of unwanted things db_update_obj.stage = "Optimization" db_update_obj.save() name_results = clean_up(enriched_names) title_result = clean_up(enriched_titles) # get the best URI each API (highest score) and storing it in final_names and final_titles result_names_Object = Results(name_results, names, file, 'name', log_file) result_names_Object.maximizer() final_names = 
result_names_Object.mapping() result_title_Object = Results(title_result, titles, file, 'title', log_file) final_titles = result_title_Object.mapping() tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') #write back the URIs to the BIBFRAME file db_update_obj.stage = "Writing_to_BIBFRAME" db_update_obj.save() write(final_names, final_titles, file, output, log_file, filename) eff = get_stat(final_names, len(names), final_titles, len(titles), filename) stats['names-enriched'] = len(final_names) tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime( tff, '%H:%M:%S') file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime( tfs, '%H:%M:%S') write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names, file_process_time, write_time, db_update_obj) #removing temp-file.xml delete_temp() print(filename + " processed in: ", file_process_time, " --- writing process :", write_time) tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime( tps, '%H:%M:%S') db_update_obj.stage = "The process was completed in %s" % (process_time) db_update_obj.save() print("walltime:", process_time) add_to_archive(processing_files, db_update_obj, len(final_names), len(final_titles))
def write_fasta_by_location(peaks): seqs = {} stats = {} for index, row in peaks.iterrows(): seqs.setdefault(row['location'], '') seqs[row['location']] += '\n>{i}_{g}\n{s}'.format(i=index, g=row['gene_name'], s=row['seq']) stats.setdefault(row['gene_name'], {'locations': []}) stats[row['gene_name']]['locations'].append(row['location']) fname = { """3'UTR""": '3prime_utr.fa', """5'UTR""": '5prime_utr.fa', """CDS""": 'cds.fa', """ncRNA""": 'ncRNA.fa' } for label in seqs: with open('data/fasta/{f}'.format(f=fname[label]), 'w') as fh: fh.write(seqs[label]) with open('data/fasta/all_locations_combined_fbfs.fa', 'w') as fh: fh.write("".join(seqs.values())) five_and_three = set() cds_and_three = set() five_cds_and_three = set() five_and_cds = set() only_three = set() only_cds = set() only_five = set() has_secondary = set() has_five = set() has_three = set() has_cds = set() for gene in stats: if ("""3'UTR""" in stats[gene]['locations']) and ("""5'UTR""" in stats[gene]['locations']): five_and_three.add(gene) if ("""3'UTR""" in stats[gene]['locations']) and ("""CDS""" in stats[gene]['locations']): cds_and_three.add(gene) if ("""3'UTR""" in stats[gene]['locations']) and ( """5'UTR""" in stats[gene]['locations']) and ("""CDS""" in stats[gene]['locations']): five_cds_and_three.add(gene) if ("""5'UTR""" in stats[gene]['locations']) and ("""CDS""" in stats[gene]['locations']): five_and_cds.add(gene) if "5'UTR" in stats[gene]['locations']: has_five.add(gene) if "3'UTR" in stats[gene]['locations']: has_three.add(gene) if "CDS" in stats[gene]['locations']: has_cds.add(gene) if set(stats[gene]['locations']) == set(["3'UTR"]): only_three.add(gene) if set(stats[gene]['locations']) == set(["5'UTR"]): only_five.add(gene) if set(stats[gene]['locations']) == set(["CDS"]): only_cds.add(gene) if len(stats[gene]['locations']) > 1: has_secondary.add(gene) print """ Number of genes: {total} Number of genes with a 3' peak: {a_three} Number of genes with a CDS peak: {a_cds} Number of genes with a 5' peak: {a_five} Number of genes with only 3' peaks: {o_3} Number of genes with only CDS peaks: {o_cds} Number of genes with only 5' peaks: {o_5} Number of genes with 3' and 5' peaks: {a} Number of genes with 3' and CDS peaks: {b} Number of genes with 5' and CDS peaks: {five_and_cds} Number of genes with 5' and 3' and CDS peaks: {c} Number of genes with a secondary peak: {has_second} """.format(total=len(list(set(stats.keys()))), a_three=len(has_three), a_cds=len(has_cds), a_five=len(has_five), o_3=len(list(only_three)), o_cds=len(list(only_cds)), o_5=len(list(only_five)), a=len(list(five_and_three)), b=len(list(cds_and_three)), five_and_cds=len(five_and_cds), c=len(list(five_cds_and_three)), has_second=len(list(has_secondary))) labels = ['ncRNA', """3'UTR""", """5'UTR""", 'CDS']
def binaryStats(saveLocation='', ECC=True, PB=True, SEMI=True, hist=False, numBins=100, inf='', to_return=False): """ Plots statistics of binary black hoels orbits from the various snapshots as color coded CDFs 'saveLocation' is the folder in which to save the plots 'ECC' True to plot eccentricity 'PB' True to plot orbital period 'SEMI' True to plot orbital semi-major axis 'hist' True to plot an idividual histogram for each snapshot (WARNING: very slow) 'numBins' to set the number of bins if 'hist' is True 'inf' is the the suffix for the relevant shortfiles (usually the simulation name, e.g. 'N10K_r10_Z01_1') 'to_return' set to true if instead of ploting the data, return it instead """ if not saveLocation == '': if not os.path.exists(saveLocation): os.makedirs(saveLocation) columns = [0] indicies = [1, 1, 1] if ECC: columns = columns + [6] indicies[1] += 1 indicies[2] += 1 if PB: columns = columns + [7] indicies[2] += 1 if SEMI: columns = columns + [8] bevData, meta = ip.bh_data('bev.82', columns, meta_data={}, info=inf) stats = {} for val in bevData: if not val[0] in stats: stats[val[0]] = {'ECC': [], 'PB': [], 'SEMI': []} if ECC: stats[val[0]]['ECC'].append(val[indicies[0]]) if PB: stats[val[0]]['PB'].append(val[indicies[1]]) if SEMI: stats[val[0]]['SEMI'].append(val[indicies[2]]) if to_return: return (stats) all_keys = list(stats.keys()) all_keys.sort() stats_keys = [] for key in all_keys: non_zero = False for sub_key in stats[key]: if stats[key][sub_key] != []: non_zero = True if non_zero: stats_keys.append(key) plt.figure(1) colormap = plt.cm.coolwarm plt.gca().set_prop_cycle( cycler('color', [colormap(i) for i in np.linspace(0, 0.9, len(stats))])) plt.figure(2) plt.gca().set_prop_cycle( cycler('color', [colormap(i) for i in np.linspace(0, 0.9, len(stats))])) plt.figure(3) plt.hold(True) plt.gca().set_prop_cycle( cycler('color', [colormap(i) for i in np.linspace(0, 0.9, len(stats))])) for i, key in enumerate(stats_keys): temp_AU_list = [] for item in stats[key]['SEMI']: AU = (10.0**item) * 0.00465047 temp_AU_list.append(math.log10(AU)) stats[key]['SEMI'] = temp_AU_list stats[key]['ECC'].sort() stats[key]['PB'].sort() stats[key]['SEMI'].sort() ECC_tot = _buildFraction(stats[key]['ECC']) PB_tot = _buildFraction(stats[key]['PB']) SEMI_tot = _buildFraction(stats[key]['SEMI']) if ECC: plt.figure(1) ECC_plot, = plt.step(([0] + stats[key]['ECC']), ([0] + ECC_tot), where='post') if PB: plt.figure(2) PB_plot, = plt.step(([0] + stats[key]['PB']), ([0] + PB_tot), where='post') if SEMI: plt.figure(3) SEMI_plot, = plt.step(([0] + stats[key]['SEMI']), ([0] + SEMI_tot), where='post') if i == 0: if ECC: ECC_pi = ECC_plot if PB: PB_pi = PB_plot if SEMI: SEMI_pi = SEMI_plot if i == (len(stats_keys) - 1): if ECC: ECC_pf = ECC_plot if PB: PB_pf = PB_plot if SEMI: SEMI_pf = SEMI_plot min_key = stats_keys[0] max_key = stats_keys[-1] if ECC: plt.figure(5) Z = [[0, 0], [0, 0]] levels = range(int(min_key), int(max_key), 5) CS3 = plt.contourf(Z, levels, cmap=colormap) plt.clf() plt.figure(1) x1, x2, y1, y2 = plt.axis() plt.axis([x1, x2, 0, 1.1]) plt.colorbar(CS3) plt.title('Binary Black Hole Eccentricity CDF') plt.xlabel('Eccentricity') plt.ylabel('Eccentricity Fraction') plt.savefig((saveLocation + 'ECC.png')) if PB: plt.figure(5) Z = [[0, 0], [0, 0]] levels = range(int(min_key), int(max_key), 5) CS3 = plt.contourf(Z, levels, cmap=colormap) plt.clf() plt.figure(2) x1, x2, y1, y2 = plt.axis() plt.axis([x1, x2, 0, 1.1]) plt.colorbar(CS3) plt.title('Binary Black Hole Period CDF') plt.xlabel('Log_10( Period 
(days))') plt.ylabel('Period Fraction') plt.savefig((saveLocation + 'PB.png')) time = [] max = [] mean = [] median = [] mode = [] min = [] key_list = stats.keys() key_list.sort() for key in key_list: npList = np.asarray(stats[key]['PB']) modeList = scipy.stats.mode(npList).mode for i in range(len(modeList)): time.append(key) max.append(np.amax(npList)) mean.append(np.mean(npList)) median.append(np.median(npList)) mode.append(modeList[i]) min.append(np.amin(npList)) plt.figure(4) plt.plot(time, max, '-') plt.plot(time, mean, '-') plt.plot(time, median, '-') plt.plot(time, mode, '-') plt.plot(time, min, '-') plt.legend(['Max', 'Mean', 'Median', 'Mode', 'Min']) plt.title('Black Hole Binary Statistics Over Time') plt.xlabel('Physical Time (MY)') plt.ylabel('Log_10( Period (days))') plt.savefig((saveLocation + 'bBH_PBStats.png')) if SEMI: plt.figure(5) Z = [[0, 0], [0, 0]] levels = range(int(min_key), int(max_key), 5) CS3 = plt.contourf(Z, levels, cmap=colormap) plt.clf() plt.figure(3) x1, x2, y1, y2 = plt.axis() plt.axis([x1, x2, 0, 1.1]) plt.colorbar(CS3) plt.title('Binary Black Hole Semi-major Axis CDF') plt.xlabel('Log_10( Semi-major Axis (Au))') plt.ylabel('Semi-major Axis Fraction') plt.savefig((saveLocation + 'SEMI.png')) plt.close('all') if hist: if ECC: for time in stats: plt.figure() n, bins, patches = plt.hist(stats[time]['ECC'], numBins, normed=False, histtype='bar', rwidth=1) plt.title( 'Eccentricity Distribution of Black Hole Binaries: {0} MY'. format(time)) plt.xlabel('Eccentricity') plt.ylabel('N') plt.savefig((saveLocation + 'ECC.{0}MY.png'.format(time))) plt.close('all') if PB: for time in stats: plt.figure() n, bins, patches = plt.hist(stats[time]['PB'], numBins, normed=False, histtype='bar', rwidth=1) plt.title('Period Distribution of Black Hole Binaries: {0} MY'. format(time)) plt.xlabel('Log_10( Period (days))') plt.ylabel('N') plt.savefig((saveLocation + 'PB.{0}MY.png'.format(time))) plt.close('all') if SEMI: for time in stats: plt.figure() n, bins, patches = plt.hist(stats[time]['SEMI'], numBins, normed=False, histtype='bar', rwidth=1) plt.title( 'Semi-major Axis Distribution of Black Hole Binaries: {0} MY' .format(time)) plt.xlabel('Log_10( Semi-major Axis (Au))') plt.ylabel('N') plt.savefig((saveLocation + 'SEMI.{0}MY.png'.format(time))) plt.close('all') plt.close('all')
def main(): #proccess start time tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') # delete files in the processing folder clear_processing() #convert .mrc to MARC/XML Marc_XML = MARC_XML() Marc_XML.convert_marc_xml() BIBFRAME = BIB_builder() BIBFRAME.merger() folder = 'Processing' #iterate over BIBFRAME files for files in os.listdir(folder): tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') file = os.path.join(folder, files) filename = files.replace('.xml', '') print("processing " + filename) ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') log_file = str(filename) + "-error-logs" print(log_file) output = str(filename) + "-enhanced.xml" clearLogs(log_file, filename) # all the APIs that will be searched - for a new API, add a new method to SearchAPI class and call it with adding a staticmethod to APIFactory apis = [ 'search_api_LC', 'search_api_LCS', 'search_api_VF', 'search_api_VFP', 'search_api_VFC' ] #this is needed for LC APIs query_type = "/authorities/names" # extracting names and titles from BIBFRAME bib_object = Bibframe(file, log_file) transformed = bib_object.convert_bibframe() names = bib_object.extract_names(transformed)[0] titles = bib_object.extract_names(transformed)[1] #getting corp names (for stat report) all_names = bib_object.extract_names(transformed)[2] corp_names = bib_object.extract_names(transformed)[3] print(str(all_names) + " names were extrected from " + filename) print( str(len(names)) + " unique names were extracted from " + filename + " --- " + str(len(names) - corp_names) + " Personal names and " + str(corp_names) + " Corporate names") print(str(len(titles)) + " titles were extracted from " + filename) #dictionaries for storing URIs (names and titles) and stats enriched_names = {} enriched_titles = {} stats = {} print("enriching names") # iterate over the name dictionary for index, item in enumerate(names.keys()): name = item.split('-_-_-')[0] print(index + 1, name) enriched_names[item] = [] for api in apis: #check if the stat for the API already exists if api in stats.keys(): pass else: stats[api] = 0 # getting the API method name_result = APIFactory().get_API(name, query_type, api, log_file) # if the results are not empty, append to "enriched_names" dictionary the result using the api name as key if name_result: enriched_names[item].append(name_result) # add number of results to be used latter in stats report stats[api] = stats[api] + len(name_result) print("enriching titles") # iterate over the title dictionary for index, title in enumerate(titles.keys()): print(index + 1, title) for authors in titles[title]['authors']: author = authors.split('-_-_-')[0] key = str(author) + "-_-_-" + str(title) enriched_titles[key] = [] title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file) if title_result: enriched_titles[key].append(title_result) # getting rid of unwanted things name_results = clean_up(enriched_names) title_result = clean_up(enriched_titles) # get the best URI each API (highest score) and storing it in final_names and final_titles result_names_Object = Results(name_results, names, file, 'name', log_file) result_names_Object.maximizer() final_names = result_names_Object.mapping() result_title_Object = Results(title_result, titles, file, 'title', log_file) final_titles = result_title_Object.mapping() eff = get_stat(final_names, len(names), final_titles, len(titles), filename) stats['names-enriched'] = len(final_names) tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') #write back the 
URIs to the BIBFRAME file write(final_names, final_titles, file, output, log_file, filename) tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime( tff, '%H:%M:%S') file_process_time = datetime.strptime( tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S') write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names, file_process_time, write_time) #removing temp-file.xml delete_temp() print(filename + " processed in: ", file_process_time, " --- writing process :", write_time) tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime( tps, '%H:%M:%S') print("walltime:", process_time)
pass

stats.dropna(axis='columns', how='all', inplace=True)
# 'no_pop_frac', 'no_thresh_frac', 'pop_abs_mis_95width',
# 'pop_abs_mis_median', 'rms_test', 'thresh_rel_mis_95width',
# 'thresh_rel_mis_median', 'l2_norm_weighted'
# print(stats.max())
# print(stats.min())
# print(stats.mean())
# print(stats.abs().mean())
del stats['pop_abs_mis_95width']
del stats['thresh_rel_mis_95width']
stats['rms'] = stats.pop('rms')
stats['thresh'] = stats.pop('thresh_rel_mis_median').abs().apply(np.max)
stats['no_thresh_frac'] = stats.pop('no_thresh_frac').apply(np.max)
stats['pop'] = (14 - stats.pop('pop_abs_mis_median').abs()).apply(np.max)
if 'wobble_tot' in stats.keys():
    stats['wobble_tot'] = stats.pop('wobble_tot').apply(np.max)
    stats['wobble_unstab'] = stats.pop('wobble_unstab').apply(np.max)
stats['pop_frac'] = (1 - stats.pop('no_pop_frac')).apply(np.max)
# stats.dropna(inplace=True)
try:
    del stats['dual_thresh_mismatch_95width']
    stats['thresh_mismatch'] = stats.pop(
        'dual_thresh_mismatch_median').abs().apply(np.max)
except KeyError:
    pass
# (stats/stats.max()).nsmallest(10, 'rms').plot.bar()
fig = plt.figure()
gs = gridspec.GridSpec(2, 1,
def calculate_accuracy(): stats = {} stats['all'] = {'tp': 0, 'fp': 0, 'fn': 0, 'btp': 0, 'bfp': 0, 'bfn': 0} for c in classes: stats[c] = {'tp': 0, 'fp': 0, 'fn': 0, 'btp': 0, 'bfp': 0, 'bfn': 0} gt_boxes = [] predicted_boxes = [] gt_box_label = [] predicted_box_label = [] for obj_id in set(gt_obj_id): mask = gt_obj_id == obj_id inliers = point_orig_list[mask, :3] prediction = gt_cls_id[mask][0] gt_boxes.append(mask) gt_box_label.append(prediction) for obj_id in set(predicted_obj_id): mask = predicted_obj_id == obj_id if numpy.sum(mask) > 50: inliers = point_orig_list[mask, :3] prediction = scipy.stats.mode(predicted_cls_id[mask])[0][0] predicted_boxes.append(mask) predicted_box_label.append(prediction) predicted_boxes = numpy.array(predicted_boxes) gt_boxes = numpy.array(gt_boxes) matched = numpy.zeros(len(predicted_boxes), dtype=bool) print('%d/%d boxes' % (len(predicted_boxes), len(gt_boxes))) for i in range(len(gt_boxes)): same_cls = gt_box_label[i] == predicted_box_label if numpy.sum(same_cls) == 0: stats[classes[gt_box_label[i]]]['bfn'] += 1 stats['all']['bfn'] += 1 continue intersection = numpy.sum(numpy.logical_and(gt_boxes[i], predicted_boxes[same_cls]), axis=1) IOU = intersection / (1.0 * numpy.sum(gt_boxes[i]) + numpy.sum( predicted_boxes[same_cls], axis=1) - intersection) if IOU.max() > 0.5: matched[numpy.nonzero(same_cls)[0][numpy.argmax(IOU)]] = True stats[classes[gt_box_label[i]]]['btp'] += 1 stats['all']['btp'] += 1 else: stats[classes[gt_box_label[i]]]['bfn'] += 1 stats['all']['bfn'] += 1 for i in range(len(predicted_boxes)): if not matched[i]: stats[classes[predicted_box_label[i]]]['bfp'] += 1 stats['all']['bfp'] += 1 for g in range(len(predicted_cls_id)): if gt_cls_id[g] == predicted_cls_id[g]: stats[classes[int(gt_cls_id[g])]]['tp'] += 1 stats['all']['tp'] += 1 else: stats[classes[int(gt_cls_id[g])]]['fn'] += 1 stats['all']['fn'] += 1 stats[classes[predicted_cls_id[g]]]['fp'] += 1 stats['all']['fp'] += 1 prec_agg = [] recl_agg = [] bprec_agg = [] brecl_agg = [] iou_agg = [] print("%10s %6s %6s %6s %5s %5s %5s %3s %3s %3s %5s %5s" % ('CLASS', 'TP', 'FP', 'FN', 'PREC', 'RECL', 'IOU', 'BTP', 'BFP', 'BFN', 'PREC', 'RECL')) for c in sorted(stats.keys()): try: stats[c]['pr'] = 1.0 * stats[c]['tp'] / (stats[c]['tp'] + stats[c]['fp']) except ZeroDivisionError: stats[c]['pr'] = 0 try: stats[c]['rc'] = 1.0 * stats[c]['tp'] / (stats[c]['tp'] + stats[c]['fn']) except ZeroDivisionError: stats[c]['rc'] = 0 try: stats[c]['IOU'] = 1.0 * stats[c]['tp'] / ( stats[c]['tp'] + stats[c]['fp'] + stats[c]['fn']) except ZeroDivisionError: stats[c]['IOU'] = 0 try: stats[c]['bpr'] = 1.0 * stats[c]['btp'] / (stats[c]['btp'] + stats[c]['bfp']) except ZeroDivisionError: stats[c]['bpr'] = 0 try: stats[c]['brc'] = 1.0 * stats[c]['btp'] / (stats[c]['btp'] + stats[c]['bfn']) except ZeroDivisionError: stats[c]['brc'] = 0 if c not in ['all']: print( "%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" % (c, stats[c]['tp'], stats[c]['fp'], stats[c]['fn'], stats[c]['pr'], stats[c]['rc'], stats[c]['IOU'], stats[c]['btp'], stats[c]['bfp'], stats[c]['bfn'], stats[c]['bpr'], stats[c]['brc'])) prec_agg.append(stats[c]['pr']) recl_agg.append(stats[c]['rc']) iou_agg.append(stats[c]['IOU']) bprec_agg.append(stats[c]['bpr']) brecl_agg.append(stats[c]['brc']) c = 'all' print("%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" % ('all', stats[c]['tp'], stats[c]['fp'], stats[c]['fn'], stats[c]['pr'], stats[c]['rc'], stats[c]['IOU'], stats[c]['btp'], stats[c]['bfp'], stats[c]['bfn'], stats[c]['bpr'], 
stats[c]['brc'])) print("%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" % ('avg', stats[c]['tp'], stats[c]['fp'], stats[c]['fn'], numpy.mean(prec_agg), numpy.mean(recl_agg), numpy.mean(iou_agg), stats[c]['btp'], stats[c]['bfp'], stats[c]['bfn'], numpy.mean(bprec_agg), numpy.mean(brecl_agg)))
def plot_cdf(stats, out_pathname, legend_x): # color=iter(matplotlib.cm.Set1(np.linspace(0,0.5,len(stats.keys())))) colors = iter(['red', 'black', 'red', 'black']) print("len(stats)=%d" % (len(stats))) x_max = 0 matplotlib.rcParams['xtick.labelsize'] = 20 matplotlib.rcParams['ytick.labelsize'] = 20 if len(stats) == 2: folge = ['Vanilla', 'Gekuerzt'] #['Einfach', 'Botnetz'] linecycler = cycle(["-", "--"]) elif len(stats) == 4: folge = [ 'Vanilla_Einfach', 'Vanilla_Botnetz', 'Gekuerzt_Einfach', 'Gekuerzt_Botnetz' ] linecycler = cycle(["-", "-", "--", "--"]) else: print("error. len(stats)=%d" % (len(stats))) sys.exit(1) for line in folge: # print(line) try: data = stats[line] except: line_n = [s for s in stats.keys() if line in s][0] print("%s was found in stats as %s." % (line, line_n)) data = stats[line_n] # sorted_data=np.sort(data) # yvals=np.arange(1.0,float(len(sorted_data))+1.0)/float(len(sorted_data)) # print(data) #yvals=np.arange(len(sorted_data))/float(len(sorted_data)) # # new x values # xn_ax = np.linspace(sorted_data.min(), sorted_data.max(), 200) # print(len(xn_ax)) # # new y values # yn_ax = spline(sorted_data, yvals, xn_ax) # print(len(yn_ax)) data_max = max(data) data_shown = filter(lambda x: x < data_max, data) shown_percentile = float(len(data_shown)) / len(data) print("shown_percentile:", shown_percentile) x, y = getcdf(data, shown_percentile) # print(find_nearest(y, 0.50)) if shown_percentile > 0: q, i = find_nearest(y, 0.80) # print(out_pathname, legend_x) print("%s: data[%d]=%f equals %f."\ %(line,i,y[i],x[i])) q, i = find_nearest(y, 0.70) # print(out_pathname, legend_x) print("%s: data[%d]=%f equals %f."\ %(line,i,y[i],x[i])) q, i = find_nearest(y, 0.50) # print(out_pathname, legend_x) print("%s: data[%d]=%f equals %f."\ %(line,i,y[i],x[i])) q, i = find_nearest(y, 0.30) # print(out_pathname, legend_x) print("%s: data[%d]=%f equals %f."\ %(line,i,y[i],x[i])) q, i = find_nearest(y, 0.20) # print(out_pathname, legend_x) print("%s: data[%d]=%f equals %f."\ %(line,i,y[i],x[i])) # print(line) # print(y[i],i) # print(x[i]) # print(x) if len(x) != 0 and x_max < max(x): x_max = max(x) matplotlib.pyplot.plot(x, y, color=next(colors), label=line, linewidth=2, linestyle=next(linecycler)) # matplotlib.pyplot.style.use('ggplot') # matplotlib.pyplot.figure(figsize=(8,6), dpi=72, facecolor="white") matplotlib.pyplot.legend(loc='best', fontsize=20) #, fontsize = 'small') # if legend_x == 'Days from first stream': # matplotlib.pyplot.xlim(xmin=0.0, xmax=x_max) # else: # matplotlib.pyplot.xlim(xmin=0.0) # matplotlib.pyplot.xscale('symlog') matplotlib.pyplot.xlim(xmin=0.0, xmax=x_max) # x_max) #0.2) matplotlib.pyplot.ylim(ymin=0.0) # matplotlib.pyplot.yscale('log') # matplotlib.pyplot.yscale('close_to_one') matplotlib.pyplot.yticks(np.arange(0, 1.2, 0.2)) # matplotlib.pyplot.xticks(np.arange(0, x_max, 0.1)) matplotlib.pyplot.xlabel(legend_x, fontsize=20) matplotlib.pyplot.ylabel('eCDF', fontsize=20) # matplotlib.pyplot.title(title, fontsize=fontsize) # matplotlib.pyplot.grid() matplotlib.pyplot.tight_layout() matplotlib.pyplot.savefig(out_pathname) matplotlib.pyplot.close() ######################################################################### # Export Data ######################################################################### a = [] h = [] for line in folge: h.append(line) data = stats[line] y, x = getcdf(data, 1) a.append(x) a.append(y) h = ", ".join(h) np.savetxt(out_pathname + ".dat", np.transpose(a), header=h, fmt="%10s")
def cv_metrics_stratified_class_with_indexes_and_transform( X, Y, indexes, iclf, iclfk={}, transform=None, kfold=5, shuffle=True, report_metrics=[ 'matthews_corr_coef', 'roc_auc_score', 'f1_score', 'sensitivity', 'specificity' ], norm=False, calc_stats=True, report_name='CLF', sort_metric='roc_auc_score_min', transformations=[], features_top_ns=[], X_names=[], vectors=[], vector=None, allow_x_list_size=1): output_objs = {} output_metrics = {} stats_df = [] report_name_sufix = '' report_name_sufix_xs = '' conditions = [ type(X) == list, len(transformations) > allow_x_list_size, len(features_top_ns) == len(transformations), len(X_names) == len(transformations) ] # multiple_x = utils.validate_multiple_conditions(conditions) # for train_index, test_index in indexes: if multiple_x: X_train, X_test, y_train, y_test = transform_and_join( X, Y, train_index, test_index, transformations, features_top_ns, iclf, iclfk, joint_transformation=None, vectors=vectors) report_name_sufix_xs = [ xn + "_" + tr + "_" + str(feats) for xn, tr, feats in zip( X_names, transformations, features_top_ns) ] report_name_sufix_xs = " & ".join(report_name_sufix_xs) else: X_train, X_test = X[train_index], X[test_index] y_train, y_test = Y[train_index], Y[test_index] total_features = X_train.shape[-1] number_of_features = total_features if type(transform) == str: report_name_sufix = report_name_sufix_xs + "_" + transform number_of_features = 1 X_train, X_test, y_train, y_test = transform_x_train_test( X_train, X_test, y_train, y_test, transform=transform, iclf=iclf, iclfk=iclfk, vector=vector) if len(y_test.shape) > 1: y_test = y_test.argmax(1) if norm == True: X_train, X_test = norm_z_score(X_train, X_test) start_feature_number = number_of_features end_feature_number = total_features + 1 for feature_number in range(start_feature_number, end_feature_number): tmp_scores = output_metrics.get('F' + str(feature_number), {}) X_train_ = X_train[:, :feature_number] X_test_ = X_test[:, :feature_number] tmp_scores = fit_and_get_metrics(X_train_, X_test_, y_train, y_test, iclf, iclfk, report_metrics, tmp_scores) output_metrics['F' + str(feature_number)] = tmp_scores if calc_stats == True: for fn in range(number_of_features, total_features + 1): fk = 'F' + str(fn) metrics = output_metrics[fk] metrics_report = { 'Name': report_name + report_name_sufix, 'Number of Variables': fn } for m in metrics.keys(): stats = metrics_stats(metrics[m], rn=3) for sk in stats.keys(): metrics_report[m + "_" + sk] = stats[sk] stats_df.append(metrics_report) stats_df = pd.DataFrame(stats_df).sort_values( by=[sort_metric, 'Number of Variables'], ascending=[False, True]).reset_index(drop=True) return output_metrics.copy(), stats_df.copy()
def main(): #proccess start time tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') # delete files in the processing folder clear_processing() #convert .mrc to MARC/XML Marc_XML = MARC_XML() Marc_XML.convert_marc_xml() BIBFRAME = BIB_builder() BIBFRAME.merger() folder = 'Processing' #iterate over BIBFRAME files for files in os.listdir(folder): tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') file = os.path.join(folder, files) filename = files.replace('.xml', '') print ("processing " + filename) ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') log_file = str(filename) + "-error-logs" print (log_file) output = str(filename) + "-enhanced.xml" clearLogs(log_file, filename) # all the APIs that will be searched - for a new API, add a new method to SearchAPI class and call it with adding a staticmethod to APIFactory apis = ['search_api_LC', 'search_api_LCS', 'search_api_VF', 'search_api_VFP', 'search_api_VFC'] #this is needed for LC APIs query_type = "/authorities/names" # extracting names and titles from BIBFRAME bib_object = Bibframe(file, log_file) transformed = bib_object.convert_bibframe() names = bib_object.extract_names(transformed)[0] titles = bib_object.extract_names(transformed)[1] #getting corp names (for stat report) all_names = bib_object.extract_names(transformed)[2] corp_names = bib_object.extract_names(transformed)[3] print (str(all_names) + " names were extrected from " + filename) print (str(len(names)) + " unique names were extracted from " + filename + " --- " + str(len(names) - corp_names) + " Personal names and " + str(corp_names) + " Corporate names") print (str(len(titles)) + " titles were extracted from " + filename) #dictionaries for storing URIs (names and titles) and stats enriched_names = {} enriched_titles = {} stats = {} print ("enriching names") # iterate over the name dictionary for index, item in enumerate(names.keys()): name = item.split('-_-_-')[0] print(index+1, name) enriched_names[item] = [] for api in apis: #check if the stat for the API already exists if api in stats.keys(): pass else: stats[api] = 0 # getting the API method name_result = APIFactory().get_API(name, query_type, api, log_file) # if the results are not empty, append to "enriched_names" dictionary the result using the api name as key if name_result: enriched_names[item].append(name_result) # add number of results to be used latter in stats report stats[api] = stats[api] + len(name_result) print ("enriching titles") # iterate over the title dictionary for index, title in enumerate(titles.keys()): print(index+1, title) for authors in titles[title]['authors']: author = authors.split('-_-_-')[0] key = str(author) + "-_-_-" + str(title) enriched_titles[key] = [] title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file) if title_result: enriched_titles[key].append(title_result) # getting rid of unwanted things name_results = clean_up(enriched_names) title_result = clean_up(enriched_titles) # get the best URI each API (highest score) and storing it in final_names and final_titles result_names_Object = Results(name_results, names, file, 'name', log_file) result_names_Object.maximizer() final_names = result_names_Object.mapping() result_title_Object = Results(title_result, titles, file, 'title', log_file) final_titles = result_title_Object.mapping() eff = get_stat(final_names, len(names), final_titles, len(titles), filename) stats['names-enriched'] = len(final_names) tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') #write back the 
URIs to the BIBFRAME file write(final_names, final_titles, file, output, log_file, filename) tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tff, '%H:%M:%S') file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S') write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names, file_process_time, write_time) #removing temp-file.xml delete_temp() print(filename + " processed in: ", file_process_time, " --- writing process :", write_time) tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S') process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(tps, '%H:%M:%S') print("walltime:", process_time)
    return {'mean': mean, 'stddev': stddev}

stats = {}
correspondences = correspondences[::-1]
n_groups = len(correspondences)
table = PrettyTable()
table.field_names = [
    "Sensor", "Old mean", "New mean", "% change", "Old stddev", "New stddev"
]
for c in correspondences:
    if c not in stats.keys():
        stats[c] = {}
    stats[c]['new'] = getStats(*new[c].T)
    stats[c]['old'] = getStats(*old[c].T)
    stats[c]['combined'] = getStats(*np.vstack([new[c], old[c]]).T)

correspondences = sorted(
    correspondences,
    key=lambda c: 100 * (stats[c]['new']['mean'] - stats[c]['old']['mean'])
    / stats[c]['old']['mean'])

for c in correspondences:
    table.add_row([
        c,
        round(stats[c]['old']['mean'], 3),
        round(stats[c]['new']['mean'], 3),
def marc_process(processing_files, apis):
    # process start time
    tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    db_update_obj = P_progress(pid=processing_files)
    db_update_obj.save()
    file = "Webapp/source/%s" % (processing_files.name)

    # convert .mrc to MARC/XML
    Marc_XML = MARC_XML(file)
    files = Marc_XML.convert_marc_xml(db_update_obj)
    tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')

    file = os.path.join('', files)
    filename = files.replace('.xml', '').replace('Webapp/Files/Processing/', '')
    print("processing " + filename)
    ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    log_file = str(filename) + "-error-logs"
    output = str(filename) + "-enhanced.xml"
    clearLogs(log_file, filename)
    query_type = "/authorities/names"

    # extracting names and titles from BIBFRAME
    db_update_obj.stage = "Extracting_names_and_titles"
    db_update_obj.save()
    mergecheck = True
    bib_object = Bibframe(file, log_file, mergecheck)
    transformed = bib_object.convert_bibframe()
    # extract_names() returns the name and title dictionaries plus the total
    # and corporate name counts; unpack it once instead of calling it four times
    extracted = bib_object.extract_names(transformed)
    names = extracted[0]
    titles = extracted[1]
    # getting corp names (for stat report)
    all_names = extracted[2]
    corp_names = extracted[3]

    print(str(all_names) + " names were extracted from " + filename)
    print(str(len(names)) + " unique names were extracted from " + filename +
          " --- " + str(len(names) - corp_names) + " Personal names and " +
          str(corp_names) + " Corporate names")
    print(str(len(titles)) + " titles were extracted from " + filename)
    db_update_obj.all_names = str(len(names))
    db_update_obj.all_titles = str(len(titles))
    db_update_obj.p_names = str(len(names) - corp_names)
    db_update_obj.c_names = str(corp_names)
    db_update_obj.save()

    # dictionaries for storing URIs (names and titles) and stats
    enriched_names = {}
    enriched_titles = {}
    stats = {}

    print("enriching names")
    # iterate over the name dictionary
    db_update_obj.stage = "Enriching_names"
    db_update_obj.save()
    for index, item in enumerate(names.keys()):
        db_update_obj.name_index = index + 1
        db_update_obj.save()
        name = item.split('-_-_-')[0]
        #print(index+1, name)
        enriched_names[item] = []
        for api in apis:
            # check if the stat for the API already exists
            if api not in stats.keys():
                stats[api] = 0
            # getting the API method
            name_result = APIFactory().get_API(name, query_type, api, log_file)
            # if the results are not empty, append them to "enriched_names"
            # keyed by the API name
            if name_result:
                enriched_names[item].append(name_result)
                # add number of results to be used later in the stats report
                stats[api] = stats[api] + len(name_result)

    print("enriching titles")
    # iterate over the title dictionary
    db_update_obj.stage = "Enriching_titles"
    db_update_obj.save()
    for index, title in enumerate(titles.keys()):
        db_update_obj.title_index = index + 1
        db_update_obj.save()
        #print(index+1, title)
        for authors in titles[title]['authors']:
            author = authors.split('-_-_-')[0]
            key = str(author) + "-_-_-" + str(title)
            enriched_titles[key] = []
            title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file)
            if title_result:
                enriched_titles[key].append(title_result)

    # getting rid of unwanted things
    db_update_obj.stage = "Optimization"
    db_update_obj.save()
    name_results = clean_up(enriched_names)
    title_results = clean_up(enriched_titles)

    # get the best URI for each API (highest score) and store it in
    # final_names and final_titles
    result_names_Object = Results(name_results, names, file, 'name', log_file)
    result_names_Object.maximizer()
    final_names = result_names_Object.mapping()
    result_title_Object = Results(title_results, titles, file, 'title', log_file)
    final_titles = result_title_Object.mapping()
    tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')

    # write back the URIs to the BIBFRAME file
    db_update_obj.stage = "Writing_to_BIBFRAME"
    db_update_obj.save()
    write(final_names, final_titles, file, output, log_file, filename)
    eff = get_stat(final_names, len(names), final_titles, len(titles), filename)
    stats['names-enriched'] = len(final_names)
    tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tff, '%H:%M:%S')
    file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S')
    write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names,
                file_process_time, write_time, db_update_obj)

    # removing temp-file.xml
    delete_temp()
    print(filename + " processed in: ", file_process_time, " --- writing process :", write_time)
    tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(tps, '%H:%M:%S')
    db_update_obj.stage = "The process was completed in %s" % (process_time)
    db_update_obj.save()
    print("walltime:", process_time)
    add_to_archive(processing_files, db_update_obj, len(final_names), len(final_titles))
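# Aside (illustrative sketch, not part of marc_process): the timing above
# round-trips timestamps through '%H:%M:%S' strings and strptime, which drops
# the date and produces a negative delta if a stage crosses midnight.
# Subtracting datetime objects directly gives the same timedelta without that
# edge case.
from datetime import datetime

stage_start = datetime.now()
# ... do the work for one stage ...
stage_elapsed = datetime.now() - stage_start   # a datetime.timedelta
print("stage took:", stage_elapsed)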
def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05):
    """Test equality of variance.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, list or dict
        Iterable. Can be either a list / dictionary of iterables or a
        wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Statistical test. `'levene'` (default) performs the Levene test using
        :py:func:`scipy.stats.levene`, and `'bartlett'` performs the Bartlett
        test using :py:func:`scipy.stats.bartlett`. The former is more robust
        to departure from normality.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W/T'``: Test statistic ('W' for Levene, 'T' for Bartlett)
        * ``'pval'``: p-value
        * ``'equal_var'``: True if ``data`` has equal variance

    See Also
    --------
    normality : Univariate normality test.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The **Bartlett** :math:`T` statistic [1]_ is defined as:

    .. math::

        T = \\frac{(N-k) \\ln{s^{2}_{p}} - \\sum_{i=1}^{k}(N_{i} - 1)
            \\ln{s^{2}_{i}}}{1 + (1/(3(k-1)))((\\sum_{i=1}^{k}{1/(N_{i} - 1))}
            - 1/(N-k))}

    where :math:`s_i^2` is the variance of the :math:`i^{th}` group, :math:`N`
    is the total sample size, :math:`N_i` is the sample size of the
    :math:`i^{th}` group, :math:`k` is the number of groups, and
    :math:`s_p^2` is the pooled variance.

    The pooled variance is a weighted average of the group variances and is
    defined as:

    .. math:: s^{2}_{p} = \\sum_{i=1}^{k}(N_{i} - 1)s^{2}_{i}/(N-k)

    The p-value is then computed using a chi-square distribution:

    .. math:: T \\sim \\chi^2(k-1)

    The **Levene** :math:`W` statistic [2]_ is defined as:

    .. math::

        W = \\frac{(N-k)} {(k-1)}
            \\frac{\\sum_{i=1}^{k}N_{i}(\\overline{Z}_{i.}-\\overline{Z})^{2} }
            {\\sum_{i=1}^{k}\\sum_{j=1}^{N_i}(Z_{ij}-\\overline{Z}_{i.})^{2} }

    where :math:`Z_{ij} = |Y_{ij} - \\text{median}({Y}_{i.})|`,
    :math:`\\overline{Z}_{i.}` are the group means of :math:`Z_{ij}` and
    :math:`\\overline{Z}` is the grand mean of :math:`Z_{ij}`.

    The p-value is then computed using a F-distribution:

    .. math:: W \\sim F(k-1, N-k)

    .. warning:: Missing values are not supported for this function.
        Make sure to remove them before using the
        :py:meth:`pandas.DataFrame.dropna` or :py:func:`pingouin.remove_na`
        functions.

    References
    ----------
    .. [1] Bartlett, M. S. (1937). Properties of sufficiency and statistical
           tests. Proc. R. Soc. Lond. A, 160(901), 268-282.

    .. [2] Brown, M. B., & Forsythe, A. B. (1974). Robust tests for the
           equality of variances. Journal of the American Statistical
           Association, 69(346), 364-367.

    Examples
    --------
    1. Levene test on a wide-format dataframe

    >>> import numpy as np
    >>> import pingouin as pg
    >>> data = pg.read_dataset('mediation')
    >>> pg.homoscedasticity(data[['X', 'Y', 'M']])
                   W      pval  equal_var
    levene  0.434861  0.999997       True

    2. Bartlett test using a list of iterables

    >>> data = [[4, 8, 9, 20, 14], np.array([5, 8, 15, 45, 12])]
    >>> pg.homoscedasticity(data, method="bartlett", alpha=.05)
                     T      pval  equal_var
    bartlett  2.873569  0.090045       True

    3. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.homoscedasticity(data, dv='Performance', group='Time')
                   W      pval  equal_var
    levene  3.192197  0.079217       True
    """
    assert isinstance(data, (pd.DataFrame, list, dict))
    assert method.lower() in ['levene', 'bartlett']
    func = getattr(scipy.stats, method)
    if isinstance(data, pd.DataFrame):
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format: keep numeric columns only; each column is one sample
            numdata = data._get_numeric_data()
            assert numdata.shape[1] > 1, 'Data must have at least two columns.'
            statistic, p = func(*numdata.to_numpy().T)
        else:
            # Long-format: one sample per level of the grouping variable
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group)[dv]
            assert grp.ngroups > 1, 'Data must have at least two groups.'
            statistic, p = func(*grp.apply(list))
    elif isinstance(data, list):
        # Check that the list contains other lists or np.ndarrays
        assert all(isinstance(el, (list, np.ndarray)) for el in data)
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data)
    else:
        # Data is a dict
        assert all(isinstance(el, (list, np.ndarray)) for el in data.values())
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data.values())

    equal_var = True if p > alpha else False
    stat_name = 'W' if method.lower() == 'levene' else 'T'

    stats = {stat_name: statistic, 'pval': p, 'equal_var': equal_var}

    return pd.DataFrame(stats, columns=stats.keys(), index=[method])
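# Illustrative sketch (not part of the function above): the Bartlett T
# statistic from the docstring computed by hand with NumPy and cross-checked
# against scipy.stats.bartlett, using the same two groups as docstring
# example 2.
import numpy as np
import scipy.stats

groups = [np.array([4, 8, 9, 20, 14]), np.array([5, 8, 15, 45, 12])]
k = len(groups)
n_i = np.array([len(g) for g in groups])          # per-group sample sizes N_i
s2_i = np.array([g.var(ddof=1) for g in groups])  # per-group variances s_i^2
N = n_i.sum()

s2_p = np.sum((n_i - 1) * s2_i) / (N - k)         # pooled variance s_p^2
numer = (N - k) * np.log(s2_p) - np.sum((n_i - 1) * np.log(s2_i))
denom = 1 + (np.sum(1 / (n_i - 1)) - 1 / (N - k)) / (3 * (k - 1))
T = numer / denom
pval = scipy.stats.chi2.sf(T, k - 1)              # T ~ chi^2(k - 1)

# Both lines should agree with the docstring example
# (T ~= 2.873569, p ~= 0.090045).
print(T, pval)
print(scipy.stats.bartlett(*groups))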
def write_fasta_by_location(peaks):
    seqs = {}
    stats = {}
    for index, row in peaks.iterrows():
        seqs.setdefault(row['location'], '')
        seqs[row['location']] += '\n>{i}_{g}\n{s}'.format(
            i=index, g=row['gene_name'], s=row['seq'])
        stats.setdefault(row['gene_name'], {'locations': []})
        stats[row['gene_name']]['locations'].append(row['location'])

    # One FASTA file per peak location.
    fname = {
        "3'UTR": '3prime_utr.fa',
        "5'UTR": '5prime_utr.fa',
        "CDS": 'cds.fa',
        "ncRNA": 'ncRNA.fa',
    }
    for label in seqs:
        with open('data/fasta/{f}'.format(f=fname[label]), 'w') as fh:
            fh.write(seqs[label])

    # Count genes by the combination of locations their peaks fall in.
    five_and_three = set()
    cds_and_three = set()
    five_cds_and_three = set()
    only_three = set()
    only_cds = set()
    only_five = set()
    has_secondary = set()
    for gene in stats:
        locations = stats[gene]['locations']
        if ("3'UTR" in locations) and ("5'UTR" in locations):
            five_and_three.add(gene)
        if ("3'UTR" in locations) and ("CDS" in locations):
            cds_and_three.add(gene)
        if ("3'UTR" in locations) and ("5'UTR" in locations) and ("CDS" in locations):
            five_cds_and_three.add(gene)
        if set(locations) == set(["3'UTR"]):
            only_three.add(gene)
        if set(locations) == set(["5'UTR"]):
            only_five.add(gene)
        if set(locations) == set(["CDS"]):
            only_cds.add(gene)
        if len(locations) > 1:
            has_secondary.add(gene)

    print("""
Number of genes: {total}
Number of genes with only 3' peaks: {o_3}
Number of genes with only CDS peaks: {o_cds}
Number of genes with only 5' peaks: {o_5}
Number of genes with 3' and 5' peaks: {a}
Number of genes with 3' and CDS peaks: {b}
Number of genes with 5' and 3' and CDS peaks: {c}
Number of genes with a secondary peak: {has_second}
""".format(
        total=len(stats),
        o_3=len(only_three),
        o_cds=len(only_cds),
        o_5=len(only_five),
        a=len(five_and_three),
        b=len(cds_and_three),
        c=len(five_cds_and_three),
        has_second=len(has_secondary)))

    labels = ['ncRNA', "3'UTR", "5'UTR", 'CDS']
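# Minimal usage sketch (illustrative only): write_fasta_by_location expects a
# DataFrame with 'location', 'gene_name' and 'seq' columns and writes one FASTA
# file per location under data/fasta/. The rows below are made-up toy data.
import os
import pandas as pd

os.makedirs('data/fasta', exist_ok=True)
peaks = pd.DataFrame({
    'location': ["3'UTR", "CDS", "3'UTR"],
    'gene_name': ['geneA', 'geneA', 'geneB'],
    'seq': ['ACGTACGT', 'TTGACA', 'GGGCCC'],
})
write_fasta_by_location(peaks)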