def __init__(self, data, reference, quantile_normalize=True, min_variance=0,
             corr_method='pearson'):
    self.quantile_normalize = quantile_normalize
    # Keep only the rows (features) present in both frames
    shared = data.index.intersection(reference.index)
    self._data = data.loc[shared]
    self._reference = reference.loc[shared]
    # Keep only features whose variance exceeds min_variance in both frames
    data_index = self._data.var(1)[min_variance < self._data.var(1)].index
    reference_index = self._reference.var(1)[
        min_variance < self._reference.var(1)].index
    self._shared = data_index.intersection(reference_index)
    self._data = self._data.loc[self._shared].copy()
    self._reference = self._reference.loc[self._shared].copy()
    # Optionally quantile-normalize both frames against the reference
    if quantile_normalize:
        qfit = QNorm().fit(self._reference)
        self._data = qfit.transform(self._data)
        self._reference = qfit.transform(self._reference)
    # Correlate every data column against every reference column
    self._corr = pd_concat([self._data, self._reference], 1).corr(
        method=corr_method).iloc[self._data.shape[1]:].reset_index(drop=True)

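# Hypothetical sketch (not taken from the class above) of the shared-index and
# variance-filter steps performed by __init__, written with plain pandas. The
# toy frames and the min_variance value are illustrative assumptions.
import pandas as pd

data = pd.DataFrame({'s1': [1.0, 2.0, 3.0], 's2': [1.0, 5.0, 3.0]},
                    index=['geneA', 'geneB', 'geneC'])
reference = pd.DataFrame({'r1': [2.0, 4.0, 9.0], 'r2': [6.0, 4.0, 1.0]},
                         index=['geneB', 'geneC', 'geneD'])
min_variance = 0

shared = data.index.intersection(reference.index)        # geneB, geneC
data, reference = data.loc[shared], reference.loc[shared]
# keep rows whose variance exceeds min_variance in both frames
keep = data.var(axis=1).gt(min_variance) & reference.var(axis=1).gt(min_variance)
data, reference = data[keep], reference[keep]             # only geneB survives
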
def predict_TestData(Food_df, People_df):
    # Random 50/50 train/test split for each class
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP
    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]], axis=0)
    # Feature columns start at position 2 (.ix was removed in pandas 1.0)
    TrainX = TrainX_df.iloc[:, 2:].values
    TestX = TestX_df.iloc[:, 2:].values
    TrainY = concatenate(
        [ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
    TestY = concatenate(
        [ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])
    # modern scikit-learn requires min_samples_split >= 2 (the original used 1)
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                         min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(TestX)
    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)
    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")

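# The HOG/classifier snippets in this collection rely on names imported
# elsewhere. A hedged guess at the imports they appear to assume (the module
# paths are standard numpy/pandas/scikit-learn; the exact import style is an
# assumption):
from numpy import concatenate, ones, zeros
from numpy.random import rand
from pandas import DataFrame, read_csv, concat as pd_concat
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
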
def txts_to_tracking_csv(self, txt_dir='', csv_name='all-tracking.csv',
                         dataset_info_df=None):
    # Check the directory from which the txt-files will be loaded
    txt_dir = self._standard_check('', txt_dir)
    # List all txt-files
    txts_list = [x for x in os_listdir(txt_dir) if '.txt' in x]
    dfs_list = []
    # For every txt-file build a DataFrame and put it in a list
    for txt_file in txts_list:
        np_data = self.read_txt(txt_file, txt_dir)
        np_data = self.__add_objectless_frames(np_data, dataset_info_df)
        df = DataFrame(np_data, columns=self.original_format_column_names)
        dfs_list.append(df)
    # Unify all DataFrames of the list
    df = pd_concat(dfs_list)
    # Prepare the CSV's path
    csv_path = os_path_join(txt_dir, csv_name)
    # Save the unified DataFrame to a csv-file
    df.to_csv(csv_path)

def set_segmentation_on_annotation(annotation_df, seg_df):
    """
    Get the segment annotation dataframe by looking at the start and end
    timestamps of each segment, and add a "segment" column to the dataframe.

    Args:
        annotation_df: annotation dataset to be processed
        seg_df: input segment dataframe (output of do_segmentation_on_raw)

    Return:
        seg_annotation_df: annotation segment dataframe
    """
    import smdt.annotation as s_annotation
    print("=========setting segment annotation dataset==============")
    seg_annotation_arr = []
    for seg_index, one_seg_df in seg_df.groupby(s_info.segment_col):
        start_time = one_seg_df[s_info.raw_ts_name].iloc[0]
        end_time = one_seg_df[s_info.raw_ts_name].iloc[-1]
        one_annotation_df = s_annotation.select_annotation_by_ts(
            annotation_df, lbound=start_time, rbound=end_time)
        one_annotation_df[s_info.segment_col] = [seg_index] * len(one_annotation_df)
        seg_annotation_arr.append(one_annotation_df)
    seg_annotation_df = pd_concat(seg_annotation_arr)
    # Reset the index but keep the original index as a reference to the previous dataframe
    seg_annotation_df = seg_annotation_df.reset_index(drop=False)
    # Rename "index" to "reference index"
    seg_annotation_df = seg_annotation_df.rename(
        columns={"index": "reference index"})
    return seg_annotation_df

def _construct_segment_dataframe(raw_df, start_anchors, end_anchors):
    """Helper function to construct the segment dataframe structure.

    Args:
        raw_df: input wockets raw dataframe; only a single session and
            sensor are supported
        start_anchors: start indexes of each segment
        end_anchors: end indexes of each segment

    Return:
        seg_df: segment dataframe
    """
    print("======segmentation information===============")
    # Loop over anchors and append new segments.
    # NOTE: concatenating a list of dataframes in one call is much faster
    # than concatenating them one by one with append.
    seg_arr = []
    indexes = range(0, len(start_anchors))
    print("========connect segments=============")
    for start, end, i in zip(start_anchors, end_anchors, indexes):
        if end - start <= 1:
            continue
        new_seg_df = raw_df.iloc[start:end]
        new_seg_df[s_info.segment_col] = i
        seg_arr.append(new_seg_df)
    seg_df = pd_concat(seg_arr)
    print("total segmentations: " + str(seg_df[s_info.segment_col].max()))
    return seg_df

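# Sketch illustrating the NOTE above: collecting frames in a list and calling
# pd_concat once scales much better than growing a frame inside the loop,
# because repeated concatenation copies the accumulated data on every pass.
# The frame sizes here are arbitrary illustrative assumptions.
import pandas as pd
from pandas import concat as pd_concat

pieces = [pd.DataFrame({'x': range(100)}) for _ in range(50)]

# preferred: one concat over the collected list
fast = pd_concat(pieces, ignore_index=True)

# slower alternative: concatenate inside the loop
slow = pd.DataFrame()
for piece in pieces:
    slow = pd_concat([slow, piece], ignore_index=True)

assert fast.equals(slow)
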
def _data2df(data):
    tdf = pd_DataFrame(data=data.Data,
                       columns=pd_DatetimeIndex(data.Times),
                       index=["code", "p"]).T
    rst_df = pd_concat([df["p"] for _, df in tdf.groupby("code")], axis=1)
    rst_df.columns = [tkr for tkr, _ in tdf.groupby("code")]
    return rst_df.astype(float)

def _get_symbol_dataframe(df, symbol):
    try:
        # this produces an "IndexingError using Boolean Indexing" (on rare occasions)
        return df[(df['symbol'] == symbol) | (df['symbol_group'] == symbol)].copy()
    except Exception:
        df = pd_concat([df[df['symbol'] == symbol],
                        df[df['symbol_group'] == symbol]])
        df.loc[:, '_idx_'] = df.index
        return df.drop_duplicates(subset=['_idx_'], keep='last').drop('_idx_', axis=1)

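# Toy illustration (not from the snippet above) of why the fallback adds a
# temporary '_idx_' column: rows matched by both conditions appear twice after
# the concat, and drop_duplicates on the saved index removes the extra copies
# while leaving the original (possibly non-unique) index intact. The symbol
# values are invented.
import pandas as pd
from pandas import concat as pd_concat

df = pd.DataFrame({'symbol': ['ES', 'NQ'], 'symbol_group': ['ES', 'ES']})
both = pd_concat([df[df['symbol'] == 'ES'], df[df['symbol_group'] == 'ES']])
both.loc[:, '_idx_'] = both.index
deduped = both.drop_duplicates(subset=['_idx_'], keep='last').drop('_idx_', axis=1)
# deduped now holds each matching row exactly once
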
def doML_NTrials_Times(Food_df, People_df, NTrials):
    stats = CollectStats(NTrials)
    for n in range(0, NTrials):
        print("n= %d" % n)
        # Random 50/50 train/test split for each class
        cTrainF = rand(len(Food_df)) > .5
        cTestF = ~cTrainF
        cTrainP = rand(len(People_df)) > .5
        cTestP = ~cTrainP
        TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
        TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]], axis=0)
        # Feature columns start at position 2 (.ix was removed in pandas 1.0)
        TrainX = TrainX_df.iloc[:, 2:].values
        TestX = TestX_df.iloc[:, 2:].values
        TrainY = concatenate(
            [ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
        TestY = concatenate(
            [ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])
        stats.P[n] = len(People_df[cTestP])
        stats.N[n] = len(Food_df[cTestF])
        # modern scikit-learn requires min_samples_split >= 2 (the original used 1)
        forest2 = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                       min_samples_split=2, random_state=0)
        forest2.fit(TrainX, TrainY)
        forestOut2 = forest2.predict(TestX)
        # Positives occupy indices [0, P); negatives start at index P
        # (the original sliced from P + 1, which skipped the first negative)
        stats.ET.TP[n] = sum(forestOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        stats.ET.TN[n] = sum(forestOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.ET.FP[n] = stats.N[n] - stats.ET.TN[n]
        stats.ET.FN[n] = stats.P[n] - stats.ET.TP[n]
        clf2 = svm.LinearSVC()
        clf2.fit(TrainX, TrainY)
        clfOut2 = clf2.predict(TestX)
        stats.SVC.TP[n] = sum(clfOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        stats.SVC.TN[n] = sum(clfOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.SVC.FP[n] = stats.N[n] - stats.SVC.TN[n]
        stats.SVC.FN[n] = stats.P[n] - stats.SVC.TP[n]
    return stats

def _g_target(self):
    df_tpl = [i[1] for i in self._df.groupby(TKR_COL_NAME)]
    for tdf in tqdm(df_tpl):
        yield_ar = tdf[Y_COL_NAME]
        tdf["y"] = yield_ar.shift(self._predict_period)
    self._df = pd_concat(df_tpl).dropna()
    self._feature_lst = self._df.columns.tolist()
    self._feature_lst.remove(TKR_COL_NAME)
    self._feature_lst.remove("y")
    self._feature_lst.remove(DATE_COL_NAME)

def main(folder: str):
    dataset_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL'
    vh = Video_Handler(dataset_dir, video_subfolder_path=folder)
    videos_metadata = []
    for video in vh.videos_names:
        videos_metadata.append(vh.read_metadata(video))
    df = pd_concat(videos_metadata)
    df.to_csv(
        os_path_join(
            vh.video_folder,
            folder.replace('\\', '') + '-metadata.csv'
        )
    )

def write_to_record(
    dataset: DataFrame,
    output_dir: str,
    name: str,
    size: tuple = (416, 416),
    ignore_border: bool = True,
    n_jobs: int = 2,
):
    """
    A function to write the dataset to tf records

    :param ignore_border: a boolean flag indicating whether to ignore patches
        with the white border
    :param n_jobs: number of concurrent processes
    :param dataset: the dataset dataframe
    :param output_dir: the output directory
    :param name: the name of the tf record
    :param size: the image crop size
    :return: None
    """
    grouped_train = split(dataset, "tiff_file")
    all_data = DataFrame()
    Path(path.join(output_dir, name)).mkdir(exist_ok=True, parents=True)
    extract_island = partial(
        extract_island_crops,
        name=name,
        output_dir=output_dir,
        size=size,
        ignore_border=ignore_border,
    )
    total_patches = 0
    for island_number, island in tqdm(enumerate(grouped_train),
                                      total=len(grouped_train)):
        island_records, n_island_patches = extract_island(island)
        all_data = pd_concat([all_data, island_records])
        all_data.to_csv(path.join(output_dir, name, "records.csv"))
        total_patches += n_island_patches
        logger.info(
            f"Extracted {n_island_patches} patches. "
            f"Total patches for n={island_number} {total_patches}"
        )
    # for island_records, n_island_patches in tqdm(
    #     pool.imap_unordered(extract_island, grouped_train), total=len(grouped_train)
    # ):
    #     all_data = pd_concat([all_data, island_records])
    #     total_patches += n_island_patches
    #     logger.info(f"Extracted {n_island_patches} patches. Total {total_patches}")
    all_records_path = path.join(output_dir, f"{size[0]}_{name}_all_records.csv")
    logger.info(f"Writing all normalised records to {all_records_path=}")
    all_data.drop_duplicates(keep=False, inplace=True)
    # write the deduplicated records to the path named in the log message
    # (the original wrote to records.csv here despite logging all_records_path)
    all_data.to_csv(all_records_path)

def _get_sample_generator(self):
    df_lst = [i[1] for i in self._df.groupby(DATE_COL_NAME)]
    len_ = len(df_lst)
    for i in range(self._sample_lag, len_ - 1, 1):
        tdf = pd_concat(objs=df_lst[i - self._sample_lag:i])
        x_train = tdf.loc[:, self._feature_lst].values
        y_train = tdf.loc[:, "y"].values.reshape(-1, 1)
        test_df = df_lst[i + 1]
        x_test = test_df.loc[:, self._feature_lst].values
        y_test = test_df.loc[:, "y"].values.reshape(-1, 1)
        tkr_name = test_df[TKR_COL_NAME].values.reshape(-1, 1)
        yield x_train, y_train, x_test, y_test, tkr_name

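# Hedged illustration of the window indexing used by the generator above: with
# sample_lag = 3, step i trains on the per-date frames i-3..i-1 (concatenated)
# and tests on frame i+1, skipping frame i itself. The integer list below is a
# stand-in for the per-date frames.
sample_lag = 3
df_lst = list(range(10))          # stand-in for the per-date frames
for i in range(sample_lag, len(df_lst) - 1):
    train_frames = df_lst[i - sample_lag:i]
    test_frame = df_lst[i + 1]
    # e.g. i=3 -> train_frames == [0, 1, 2], test_frame == 4
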
def chunk_res_with_values(
    query: str,
    ids: Sequence[Any],
    con,
    size: int = 10000,
    params: Optional[Dict[str, Any]] = None,
) -> DataFrame:
    """Chunk query result values."""
    if params is None:
        params = {}
    result = []
    for chunk in chunks(ids, size):
        params.update({"ids": chunk})
        result.append(DataFrame(get_res_with_values(query, params, con)))
    del params["ids"]
    return pd_concat(result, ignore_index=True)

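# `chunks` is an external helper that is not defined in these snippets. A
# minimal sketch of what it is assumed to do: yield successive slices of at
# most `size` elements so each bound "ids" parameter stays below driver limits.
from typing import Any, Iterator, Sequence


def chunks(ids: Sequence[Any], size: int) -> Iterator[Sequence[Any]]:
    """Yield consecutive slices of `ids` with at most `size` elements each."""
    for start in range(0, len(ids), size):
        yield ids[start:start + size]
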
def CalcHog_FeaturesAndImage_ForOneImage(grey_img, image_name, rgb_img):
    feat = zeros((1, 900))  # People_All_9.csv Food_All_9.csv
    # get hog features (older scikit-image parameter spellings: visualise/normalise)
    blocks = feature.hog(grey_img, orientations=9, pixels_per_cell=(100, 100),
                         cells_per_block=(5, 5), visualise=False, normalise=True)
    # slightly different params for a better hog visualization
    junk_block, ImageOfHog = feature.hog(grey_img, pixels_per_cell=(10, 10),
                                         cells_per_block=(30, 30),
                                         visualise=True, normalise=True)
    if len(blocks) == 900:  # People_All_9.csv Food_All_9.csv
        feat[0] = blocks
    # wrap the scalar in a list so the name column actually gets a row
    name_df = DataFrame({"image_name": [image_name]})
    feat_df = DataFrame(feat)
    final_df = pd_concat([name_df, feat_df], axis=1)
    final_df.to_csv("tmp/HogFeatures.csv")
    save_ImageOfHog(grey_img, ImageOfHog, rgb_img)

def run_in_separate_threads(items, target, **args):
    """Wrapper to execute a method in a separate thread for each "item"
    and concat the results into a DataFrame."""
    _result = DataFrame()
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        future_result = {
            executor.submit(target, *list(args.values()) + [item]): e
            for e, item in enumerate(items)
        }
        for future in concurrent.futures.as_completed(future_result):
            try:
                data = future.result()
            except (ValueError, AttributeError) as exc:
                logging.warning(' %s generated an exception: %s',
                                target.__name__, exc)
                data = DataFrame()
            _result = pd_concat([_result, data])
    return _result

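# Hypothetical usage of the wrapper above. Note that `target` receives the
# keyword values positionally, with the current item appended last, so the
# example target takes (session, symbol). The names and data are invented.
from pandas import DataFrame


def fetch_symbol(session, symbol):
    return DataFrame({'symbol': [symbol], 'session': [session]})


quotes = run_in_separate_threads(['AAPL', 'MSFT'], fetch_symbol, session='demo')
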
def hog_features(ans_url_df, output_pickle_name):
    urls = ans_url_df.URL.values
    answers = ans_url_df.answer.values
    # First pass: keep only the urls that can actually be fetched
    urls_exist = []
    ans_exist = []
    cnt = -1
    for url, ans in zip(urls, answers):
        cnt += 1
        print("cnt= %d , checking urls" % cnt)
        try:
            read = urllib2.urlopen(url).read()
            urls_exist.append(url)
            ans_exist.append(ans)
        except urllib2.URLError:
            continue
    urls_exist = array(urls_exist)
    ans_exist = array(ans_exist)
    # Second pass: compute a 900-element HOG feature vector per image
    feat = zeros((len(urls_exist), 900))
    count = 0
    for url in urls_exist:
        print("count= %d -- calc features" % count)
        read = urllib2.urlopen(url).read()
        obj = Image.open(cStringIO.StringIO(read))
        img = array(obj.convert('L'))
        blocks = feature.hog(img, orientations=9, pixels_per_cell=(100, 100),
                             cells_per_block=(5, 5), visualise=False,
                             normalise=True)  # People_All_9.csv Food_All_9.csv
        if len(blocks) == 900:
            feat[count] = blocks
        count += 1
    urls_exist_df = DataFrame(urls_exist, columns=["URL"])
    ans_exist_df = DataFrame(ans_exist, columns=["answer"])
    feat_df = DataFrame(feat)
    final_df = pd_concat([urls_exist_df, ans_exist_df, feat_df], axis=1)
    # pickle in binary mode
    fout = open(output_pickle_name, 'wb')
    pickle.dump(final_df.dropna(), fout)
    fout.close()

def df_from_query_by_ids(
    con,  # TODO type annotation for con
    query: str,
    ids: Sequence[Any],
    parameters: Optional[Dict[str, Any]] = None,
    size: int = 10000,
) -> DataFrame:
    """Return DataFrame from query by ids."""
    if parameters is None:
        parameters = {}
    return pd_concat(
        [
            DataFrame([
                dict(each.items())
                for each in con.execute(query, {
                    "ids": chunk,
                    **parameters
                }).fetchall()
            ])
            for chunk in chunks(ids, size)
        ],
        ignore_index=True,
    )

def Hog_predict_UploadImage(grey_img, image_name, rgb_img):
    CalcHog_FeaturesAndImage_ForOneImage(grey_img, image_name, rgb_img)
    feat_df = read_csv("tmp/HogFeatures.csv")
    feat_vals = feat_df.iloc[:, 2:].values
    root = "machine_learn/HOG/csv_features/"
    Food_df = read_csv(root + "hog_features_9_NewTraining_Food_everyones.csv")
    People_df = read_csv(root + "hog_features_9_NewTraining_Faces_everyones.csv")
    # Random 50/50 split; only the training half is used here
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP
    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
    TrainX = TrainX_df.iloc[:, 2:].values
    TrainY = concatenate(
        [ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                         min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(feat_vals)
    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_prediction = LinSVC_classifier.predict(feat_vals)
    return ET_prediction, LinSVC_prediction


### testing
# file = "machine_learn/training_image_urls/NewTraining_Food_everyones.txt"
# urls = np.loadtxt(file, dtype="str")
# url = urls[11]
# read = urllib2.urlopen(url).read()
# obj = Image.open(cStringIO.StringIO(read))
# rgb_img = np.array(obj)
# grey_img = np.array(obj.convert('L'))
# (et, svc) = Hog_predict_UploadImage(grey_img, file, rgb_img)
# et_ans = interpret_int_predict(et[0].astype('int'))
# svc_ans = interpret_int_predict(svc[0].astype('int'))

def extract_island_crops(record: data, name: str, output_dir: str, size: tuple,
                         ignore_border: bool):
    island, output, _ = record
    output_path = path.join(output_dir, name,
                            f"{path.splitext(island)[0]}.tfrecord")
    writer = tf.compat.v1.python_io.TFRecordWriter(output_path)
    height = output["image_height"].iloc[0]
    # mirror the y coordinate relative to the image height
    transformed_out = output.copy()
    transformed_out["y_pixel"] = transformed_out["y_pixel"].apply(
        lambda y: height - y)
    converted = extrapolate_patches(island, transformed_out, size,
                                    ignore_extrema=name == "train")
    island_records = DataFrame()
    if converted is not None:
        island_records = pd_concat(
            [island_records, *[data.object for data in converted]])
        for converted_object in converted:
            tf_example = convert_to_tf_records(converted_object, size, name)
            writer.write(tf_example.SerializeToString())
    writer.flush()
    writer.close()
    return island_records, len(converted)

def merge_data(result):
    tmp = sorted(list(result.keys()), key=lambda x: (x[1], x[2], x[0]))
    combos = [['fanova', 'ablation'],
              ['fanova_cut', 'ablation'],
              ['fanova', 'local improvement analysis'],
              ['fanova_cut', 'local improvement analysis'],
              ['ablation', 'local improvement analysis']]
    # Drop the single-method keys; if any of them is missing, skip the joint table
    skip = False
    try:
        del tmp[tmp.index('fanova')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('ablation')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('fanova_cut')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('local improvement analysis')]
    except ValueError:
        skip = True
    df_dict = {}
    if not skip:
        multi = []
        all_df = None
        for combo in combos:
            tap = np.array(tmp)
            indices = list(
                set.intersection(
                    set(list(np.where(tap[:, 1] == combo[0]))[0]),
                    set(list(np.where(tap[:, 2] == combo[1]))[0])))
            tap = list(tap[indices])
            for benchmark in tap:
                # Column suffixes encode the pair of methods being compared
                cols = ['cap_', 'cup_']
                if 'fanova' == benchmark[1]:
                    cols[0], cols[1] = cols[0] + 'f', cols[1] + 'f'
                elif 'ab' in benchmark[1]:
                    cols[0], cols[1] = cols[0] + 'a', cols[1] + 'a'
                else:
                    cols[0], cols[1] = cols[0] + 'c', cols[1] + 'c'
                if 'ab' in benchmark[2]:
                    cols[0], cols[1] = cols[0] + 'a', cols[1] + 'a'
                else:
                    cols[0], cols[1] = cols[0] + 'l', cols[1] + 'l'
                benchmark = tuple(benchmark)
                if all_df is None:
                    all_df = DataFrame.from_dict(result[benchmark], orient='index')
                    all_df.columns = cols
                    idx = list(
                        map(lambda y: y[1],
                            sorted(enumerate(list(all_df.index)),
                                   key=lambda x: x[1])))
                    all_df = all_df.loc[idx]
                else:
                    d = DataFrame.from_dict(result[benchmark], orient='index')
                    d.columns = cols
                    all_df = pd_concat([all_df, d])
            multi.append(all_df.copy())
            all_df = None
        df_dict['joint'] = pd_concat(multi, axis=1)
    for key in ['ablation', 'fanova', 'fanova_cut', 'local improvement analysis']:
        try:
            df = DataFrame.from_dict(result[key], orient='index')
            df.columns = ['cap', 'cup']
            idx = list(df.index)
            idx_0 = sorted(enumerate(list(map(lambda x: x.split(' / '), idx))),
                           key=lambda x: (x[1][0], x[1][1]))
            idx_0 = list(map(lambda x: idx[x[0]], idx_0))
            df = df.loc[idx_0]
            df_dict[key] = df
        except KeyError:
            pass
    return df_dict

def build_gnomAD_FromTranscriptList(transcript_list_file, gmd_version):
    """ builds all gnomAD data from a given gene list file """
    con = dbcon()
    cur = con.cursor()
    table_name = "gnomadv2"
    if gmd_version == "2.1.1":
        gmd_name = "2"
    if gmd_version == "3.0":
        gmd_name = "3"
    """ this code is now obsolete
    query = "DROP TABLE IF EXISTS '" + table_name + "'"
    cur.execute(query)
    query = "CREATE TABLE '" + table_name + "' AS SELECT * FROM gnomad WHERE 0=1"
    cur.execute(query)
    con.commit()
    con.close()
    """
    transcript_dict = parse_transcript_list(transcript_list_file)
    print(transcript_dict)
    for chrom, transcript_list in transcript_dict.items():
        try:
            exomes_data = get_gnomad_data(chrom, version=gmd_version, exomes=True)
            exomes_df = process_gnomad_data(exomes_data, chrom, transcript_list,
                                            exomes=True)
            exomes_df_exists = True
            os_remove(exomes_data)
        except HTTPError as err:
            if err.code == 404:
                print("No exomes data found")
                exomes_df_exists = False
            else:
                raise
        try:
            genomes_data = get_gnomad_data(chrom, version=gmd_version, exomes=False)
            genomes_df = process_gnomad_data(genomes_data, chrom, transcript_list,
                                             exomes=False)
            genomes_df_exists = True
            os_remove(genomes_data)
        except HTTPError as err:
            if err.code == 404:
                print("No genomes data found")
                genomes_df_exists = False
            else:
                raise
        # combine the gnomad exomes data and the gnomad genomes data into one
        # pandas dataframe, and export it to the database
        if exomes_df_exists and genomes_df_exists:
            combined_df = pd_concat([exomes_df, genomes_df])
        elif exomes_df_exists:
            combined_df = exomes_df
        elif genomes_df_exists:
            combined_df = genomes_df
        else:
            raise ValueError("Unable to find requested gnomAD data")
        # mark variants found in both sources, then keep a single copy of each
        combined_df.loc[combined_df.duplicated(
            subset=['chromosome', 'position', 'allele_ref', 'allele_alt'],
            keep=False), 'source'] = 'both'
        combined_df = combined_df.drop_duplicates(
            ['chromosome', 'position', 'allele_ref', 'allele_alt']).reset_index(drop=True)
        combined_df = combined_df.sort_values(by=['position'])
        filename = 'chr' + chrom + '_processed.tsv'
        # combined_df.to_csv(filename, sep='\t', encoding='utf-8', index=False)
        con = dbcon()
        combined_df.to_sql(table_name, con=con, if_exists="append", index=False)
        con.commit()
        con.close()
    return None

from os import listdir
from os.path import join

from detection import Video_Handler
from pandas import concat as pd_concat

if __name__ == '__main__':
    dataset_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL'
    videos_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL\\inputs\\videos\\Complete\\visible\\'
    videos_list = listdir(videos_dir)
    vh = Video_Handler(dataset_dir)
    videos_metadata = []
    for video in videos_list:
        video_path = join(videos_dir, video)
        videos_metadata.append(vh.read_metadata('Complete/visible/' + video))
    df = pd_concat(videos_metadata)
    print(df)

def get_average_observables_wl(dcs: Union[WangLandauDataContainer,
                                          Dict[Any, WangLandauDataContainer]],
                               temperatures: List[float],
                               observables: List[str] = None,
                               boltzmann_constant: float = kB,
                               fill_factor_limit: float = None) -> DataFrame:
    """Returns the average and the standard deviation of the energy from a
    :ref:`Wang-Landau simulation <wang_landau_ensemble>` for the temperatures
    specified. If the ``observables`` keyword argument is specified the
    function will also return the mean and standard deviation of the
    specified observables.

    Parameters
    ----------
    dcs
        data container(s), from which to extract density of states
        as well as observables
    temperatures
        temperatures, at which to compute the averages
    observables
        observables, for which to compute averages; the observables
        must refer to fields in the data container
    boltzmann_constant
        Boltzmann constant :math:`k_B` in appropriate units, i.e. units that
        are consistent with the underlying cluster expansion and the
        temperature units [default: eV/K]
    fill_factor_limit
        use data recorded up to the point when the specified fill factor limit
        was reached when computing averages; otherwise use data for the last
        state

    Raises
    ------
    ValueError
        if the data container(s) do(es) not contain entropy data
        from Wang-Landau simulation
    ValueError
        if data container(s) do(es) not contain requested observable
    """

    def check_observables(dc: WangLandauDataContainer,
                          observables: Optional[List[str]]) -> None:
        """ Helper function that checks that observables are available in
        data frame. """
        if observables is None:
            return
        for obs in observables:
            if obs not in dc.data.columns:
                raise ValueError('Observable ({}) not in data container.\n'
                                 'Available observables: {}'.format(
                                     obs, dc.data.columns))

    # preparation of observables
    columns_to_keep = ['potential', 'density']
    if observables is not None:
        columns_to_keep.extend(observables)

    # check that observables are available in data container
    # and prepare comprehensive data frame with relevant information
    if isinstance(dcs, WangLandauDataContainer):
        check_observables(dcs, observables)
        df_combined = _extract_filter_data(dcs, columns_to_keep,
                                           fill_factor_limit)
        dcref = dcs
    elif isinstance(dcs, dict) and isinstance(dcs[next(iter(dcs))],
                                              WangLandauDataContainer):
        dfs = []
        for dc in dcs.values():
            check_observables(dc, observables)
            dfs.append(
                _extract_filter_data(dc, columns_to_keep, fill_factor_limit))
        df_combined = pd_concat([df for df in dfs], ignore_index=True)
        dcref = list(dcs.values())[0]
    else:
        raise TypeError('dcs ({}) must be a data container with entropy data'
                        ' or be a list of data containers'.format(type(dcs)))

    # fetch entropy and density of states from data container(s)
    df_density, _ = get_density_of_states_wl(dcs, fill_factor_limit)

    # compute density for each row in data container if observable averages
    # are to be computed
    if observables is not None:
        energy_spacing = dcref.ensemble_parameters['energy_spacing']
        # NOTE: we rely on the indices of the df_density DataFrame to
        # correspond to the energy scale! This is expected to be handled in
        # the get_density_of_states function.
        bins = list(np.array(np.round(df_combined.potential / energy_spacing),
                             dtype=int))
        data_density = [dens / bins.count(k)
                        for k, dens in df_density.density[bins].items()]

    enref = np.min(df_density.energy)
    averages = []
    for temperature in temperatures:

        # mean and standard deviation of energy
        boltz = np.exp(-(df_density.energy - enref) / temperature /
                       boltzmann_constant)
        sumint = np.sum(df_density.density * boltz)
        en_mean = np.sum(df_density.energy * df_density.density * boltz) / sumint
        en_std = np.sum(df_density.energy ** 2 * df_density.density * boltz) / sumint
        en_std = np.sqrt(en_std - en_mean ** 2)
        record = {'temperature': temperature,
                  'potential_mean': en_mean,
                  'potential_std': en_std}

        # mean and standard deviation of other observables
        if observables is not None:
            boltz = np.exp(-(df_combined.potential - enref) / temperature /
                           boltzmann_constant)
            sumint = np.sum(data_density * boltz)
            for obs in observables:
                obs_mean = np.sum(data_density * boltz * df_combined[obs]) / sumint
                obs_std = np.sum(data_density * boltz * df_combined[obs] ** 2) / sumint
                obs_std = np.sqrt(obs_std - obs_mean ** 2)
                record['{}_mean'.format(obs)] = obs_mean
                record['{}_std'.format(obs)] = obs_std

        averages.append(record)

    return DataFrame.from_dict(averages)

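# Standalone sketch of the reweighting performed above: given a density of
# states g(E) from Wang-Landau sampling, canonical averages at temperature T
# follow from Boltzmann weights, e.g.
#   <E> = sum_E E g(E) exp(-(E - E_ref)/(kB T)) / sum_E g(E) exp(-(E - E_ref)/(kB T))
# The toy energy grid and densities below are illustrative assumptions.
import numpy as np

energy = np.array([-1.0, -0.5, 0.0, 0.5])      # energy grid (eV)
density = np.array([1.0, 4.0, 6.0, 4.0])       # density of states g(E)
kB, T = 8.617e-5, 1000.0                       # eV/K, K
enref = energy.min()                           # shift for numerical stability

boltz = np.exp(-(energy - enref) / (kB * T))
sumint = np.sum(density * boltz)
en_mean = np.sum(energy * density * boltz) / sumint
en_std = np.sqrt(np.sum(energy ** 2 * density * boltz) / sumint - en_mean ** 2)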