Example #1
    def __init__(self,
                 data,
                 reference,
                 quantile_normalize=True,
                 min_variance=0,
                 corr_method='pearson'):
        self.quantile_normalize = quantile_normalize

        # Keep only the rows present in both frames
        shared = data.index.intersection(reference.index)
        self._data = data.loc[shared]
        self._reference = reference.loc[shared]

        # Drop rows whose variance does not exceed min_variance in either frame
        data_var = self._data.var(axis=1)
        reference_var = self._reference.var(axis=1)
        data_index = data_var[data_var > min_variance].index
        reference_index = reference_var[reference_var > min_variance].index
        self._shared = data_index.intersection(reference_index)
        self._data = self._data.loc[self._shared].copy()
        self._reference = self._reference.loc[self._shared].copy()
        if quantile_normalize:
            qfit = QNorm().fit(self._reference)
            self._data = qfit.transform(self._data)
            self._reference = qfit.transform(self._reference)
        # Correlate every data column against every reference column
        self._corr = pd_concat(
            [self._data, self._reference],
            axis=1).corr(method=corr_method).iloc[self._data.shape[1]:].reset_index(
                drop=True)
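A minimal standalone sketch (not part of the example above) of the cross-correlation trick used in the last statement: concatenate both frames column-wise, compute the full correlation matrix, then keep only the block relating reference columns to data columns. The column names here are made up for illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(100, 3)), columns=["s1", "s2", "s3"])
reference = pd.DataFrame(rng.normal(size=(100, 2)), columns=["r1", "r2"])

corr = pd.concat([data, reference], axis=1).corr(method="pearson")
cross_corr = corr.iloc[data.shape[1]:, :data.shape[1]]  # rows: reference, cols: data
print(cross_corr)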
Example #2
from numpy import concatenate, ones, zeros
from numpy.random import rand
from pandas import DataFrame, concat as pd_concat
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier


def predict_TestData(Food_df, People_df):
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP

    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]], axis=0)

    # .ix was removed from pandas; use .iloc for positional slicing
    TrainX = TrainX_df.iloc[:, 2:].values
    TestX = TestX_df.iloc[:, 2:].values
    TrainY = concatenate(
        [ones(len(People_df[cTrainP])),
         zeros(len(Food_df[cTrainF]))])
    TestY = concatenate(
        [ones(len(People_df[cTestP])),
         zeros(len(Food_df[cTestF]))])

    # min_samples_split must be >= 2 in current scikit-learn
    ET_classifier = ExtraTreesClassifier(n_estimators=50,
                                         max_depth=None,
                                         min_samples_split=2,
                                         random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(TestX)

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
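For comparison only (not part of the scraped example): the random boolean-mask split above could also be expressed with scikit-learn's train_test_split. This is a sketch that assumes, like the example, that the feature columns start at index 2.

from numpy import concatenate, ones, zeros
from pandas import concat as pd_concat
from sklearn.model_selection import train_test_split


def split_food_people(Food_df, People_df, test_size=0.5, seed=0):
    # People are labelled 1, food 0; stratify keeps the class balance
    X = pd_concat([People_df, Food_df], axis=0)
    y = concatenate([ones(len(People_df)), zeros(len(Food_df))])
    return train_test_split(X.iloc[:, 2:].values, y,
                            test_size=test_size, random_state=seed, stratify=y)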
Example #3
def predict_TestData(Food_df,People_df):
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP

    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]],axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]],axis=0)

    TrainX = TrainX_df.iloc[:, 2:].values
    TestX = TestX_df.iloc[:, 2:].values
    TrainY = concatenate([ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
    TestY = concatenate([ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])

    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX,TrainY)
    ET_prediction = ET_classifier.predict(TestX) 

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX,TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a=DataFrame()
    a["url"]=TestX_df.urls.values
    a["answer"]=TestY
    a["ET_predict"]=ET_prediction
    a["LinSVC_predict"]=LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
Example #4
    def txts_to_tracking_csv(self,
                             txt_dir='',
                             csv_name='all-tracking.csv',
                             dataset_info_df=None):
        # Check the directory from which the txt-files will be loaded from
        txt_dir = self._standard_check('', txt_dir)

        # List all txt-files
        txts_list = [x for x in os_listdir(txt_dir) if x.endswith('.txt')]

        dfs_list = []
        # For every txt-file receive a DataFrame and put it in a list
        for txt_file in txts_list:
            np_data = self.read_txt(txt_file, txt_dir)

            np_data = self.__add_objectless_frames(np_data, dataset_info_df)
            df = DataFrame(np_data, columns=self.original_format_column_names)

            dfs_list.append(df)
        # Unify all DataFrames of the list
        df = pd_concat(dfs_list)

        # Prepare the CSV's path
        csv_path = os_path_join(txt_dir, csv_name)
        # Save Unified DataFrame to csv-file
        df.to_csv(csv_path)
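A minimal standalone sketch of the same pattern (one DataFrame per file, collected in a list, concatenated once). The glob pattern and column names are illustrative only.

from glob import glob
from os import path

import numpy as np
from pandas import DataFrame, concat as pd_concat


def txts_to_csv(txt_dir, csv_name="all-tracking.csv", columns=("frame", "x", "y")):
    dfs = []
    for txt_path in sorted(glob(path.join(txt_dir, "*.txt"))):
        data = np.loadtxt(txt_path, delimiter=",", ndmin=2)
        dfs.append(DataFrame(data, columns=list(columns)))
    df = pd_concat(dfs, ignore_index=True)
    df.to_csv(path.join(txt_dir, csv_name), index=False)
    return df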
Example #5
def set_segmentation_on_annotation(annotation_df, seg_df):
    """ Get segment annotation dataframe by looking into the start and end timestamp
  of each segment, add a "segment" column to the dataframe

  Args:
    annotation_df: annotation dataset to be processed
    seg_df: input segment dataframe (output of function do_segmentation_on_raw)

  Return:
    seg_annotation_df: annotation segment dataframe
  """
    import smdt.annotation as s_annotation
    print "=========setting segment annotation dataset=============="
    # print annotation_df.head()
    seg_annotation_arr = []
    for seg_index, one_seg_df in seg_df.groupby(s_info.segment_col):
        start_time = one_seg_df[s_info.raw_ts_name].iloc[0]
        end_time = one_seg_df[s_info.raw_ts_name].iloc[-1]

        one_annotation_df = s_annotation.select_annotation_by_ts(
            annotation_df, lbound=start_time, rbound=end_time)
        one_annotation_df[s_info.segment_col] = seg_index
        seg_annotation_arr.append(one_annotation_df)
    seg_annotation_df = pd_concat(seg_annotation_arr)
    # reset index but keep the original index as a reference to previous dataframe
    seg_annotation_df = seg_annotation_df.reset_index(drop=False)
    # rename "index" to "reference index"
    seg_annotation_df = seg_annotation_df.rename(
        columns={"index": "reference index"})

    return seg_annotation_df
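The helper select_annotation_by_ts is not shown; this is a minimal sketch of what it is assumed to do (keep rows whose timestamp falls inside the window). The timestamp column name is hypothetical.

def select_annotation_by_ts(annotation_df, lbound, rbound, ts_col="START_TIME"):
    # Hypothetical column name; the real s_annotation helper may differ.
    mask = (annotation_df[ts_col] >= lbound) & (annotation_df[ts_col] <= rbound)
    return annotation_df[mask].copy()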
Example #6
def _construct_segment_dataframe(raw_df, start_anchors, end_anchors):
    """Helper function to construct segment dataframe structure

  Args:
    raw_df: input wockets raw dataframe, only support single session and sensor
    start_anchors: start index of each segment
    end_anchors: end index of each segment

  Return:
    seg_df: segment dataframe
  """
    print "======segmentation information==============="

    # loop over anchors and append new segments
    # NOTE: concatenating a list of DataFrames once is much faster than
    # appending them one by one
    seg_arr = []
    print("========connect segments=============")
    for i, (start, end) in enumerate(zip(start_anchors, end_anchors)):
        if end - start <= 1:
            continue
        new_seg_df = raw_df.iloc[start:end].copy()
        new_seg_df[s_info.segment_col] = i
        seg_arr.append(new_seg_df)
    seg_df = pd_concat(seg_arr)
    print("total segmentations: " + str(seg_df[s_info.segment_col].max()))
    # seg_df = raw_data.copy(deep=True) # only used for test
    return seg_df
Example #7
def _data2df(data):
    tdf = pd_DataFrame(data=data.Data,
                       columns=pd_DatetimeIndex(data.Times),
                       index=["code", "p"]).T
    rst_df = pd_concat([df["p"] for _, df in tdf.groupby("code")],
                       axis=1)
    rst_df.columns = [tkr for tkr, _ in tdf.groupby("code")]
    return rst_df.astype(float)
Example #8
def _get_symbol_dataframe(df, symbol):
    try:
        # boolean indexing produces an "IndexingError" on rare occasions
        return df[(df['symbol'] == symbol) | (df['symbol_group'] == symbol)].copy()
    except Exception:
        df = pd_concat([df[df['symbol'] == symbol], df[df['symbol_group'] == symbol]])
        df.loc[:, '_idx_'] = df.index
        return df.drop_duplicates(subset=['_idx_'], keep='last').drop('_idx_', axis=1)
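A hedged usage sketch for the function above, with a toy frame: rows matching either the symbol or its symbol group are returned, each at most once. The column values are invented for illustration.

import pandas as pd

df = pd.DataFrame({
    "symbol": ["ESU2022", "ESZ2022", "AAPL"],
    "symbol_group": ["ES", "ES", "AAPL"],
    "last": [4100.0, 4105.5, 150.25],
})
print(_get_symbol_dataframe(df, "ES"))     # both ES rows
print(_get_symbol_dataframe(df, "AAPL"))   # the single AAPL row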
Example #9
def doML_NTrials_Times(Food_df, People_df, NTrials):
    stats = CollectStats(NTrials)
    for n in range(0, NTrials):
        print "n= %d" % n
        cTrainF = rand(len(Food_df)) > .5
        cTestF = ~cTrainF
        cTrainP = rand(len(People_df)) > .5
        cTestP = ~cTrainP

        TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
        TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]], axis=0)

        TrainX = TrainX_df.iloc[:, 2:].values
        TestX = TestX_df.iloc[:, 2:].values
        TrainY = concatenate(
            [ones(len(People_df[cTrainP])),
             zeros(len(Food_df[cTrainF]))])
        TestY = concatenate(
            [ones(len(People_df[cTestP])),
             zeros(len(Food_df[cTestF]))])

        stats.P[n] = len(People_df[cTestP])
        stats.N[n] = len(Food_df[cTestF])

        forest2 = ExtraTreesClassifier(n_estimators=50,
                                       max_depth=None,
                                       min_samples_split=2,
                                       random_state=0)
        forest2.fit(TrainX, TrainY)
        forestOut2 = forest2.predict(TestX)
        stats.ET.TP[n] = sum(forestOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        # negatives start at index stats.P[n] (not P[n] + 1)
        stats.ET.TN[n] = sum(forestOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.ET.FP[n] = stats.N[n] - stats.ET.TN[n]
        stats.ET.FN[n] = stats.P[n] - stats.ET.TP[n]

        clf2 = svm.LinearSVC()
        clf2.fit(TrainX, TrainY)
        clfOut2 = clf2.predict(TestX)
        stats.SVC.TP[n] = sum(clfOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        stats.SVC.TN[n] = sum(clfOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.SVC.FP[n] = stats.N[n] - stats.SVC.TN[n]
        stats.SVC.FN[n] = stats.P[n] - stats.SVC.TP[n]
    return stats
Example #10
def _get_symbol_dataframe(df, symbol):
    try:
        # boolean indexing produces an "IndexingError" on rare occasions
        return df[(df['symbol'] == symbol) |
                  (df['symbol_group'] == symbol)].copy()
    except Exception:
        df = pd_concat(
            [df[df['symbol'] == symbol], df[df['symbol_group'] == symbol]])
        df.loc[:, '_idx_'] = df.index
        return df.drop_duplicates(subset=['_idx_'],
                                  keep='last').drop('_idx_', axis=1)
Example #11
    def _g_target(self):
        df_tpl = [i[1] for i in self._df.groupby(TKR_COL_NAME)]
        for tdf in tqdm(df_tpl):
            yield_ar = tdf[Y_COL_NAME]
            tdf["y"] = yield_ar.shift(self._predict_period)

        self._df = pd_concat(df_tpl).dropna()
        self._feature_lst = self._df.columns.tolist()
        self._feature_lst.remove(TKR_COL_NAME)
        self._feature_lst.remove("y")
        self._feature_lst.remove(DATE_COL_NAME)
Example #12
def main(folder: str):
    dataset_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL'
    vh = Video_Handler(dataset_dir, video_subfolder_path=folder)
    videos_metadata = []
    for video in vh.videos_names:
        videos_metadata.append(vh.read_metadata(video))
    df = pd_concat(videos_metadata)
    df.to_csv(
        os_path_join(
            vh.video_folder,
            folder.replace('\\', '') + '-metadata.csv'
        )
    )
Example #13
def write_to_record(
        dataset: DataFrame,
        output_dir: str,
        name: str,
        size: tuple = (416, 416),
        ignore_border: bool = True,
        n_jobs: int = 2,
):
    """
    A function to write the dataset to tf records
    :param ignore_border: a boolean flag indicating whether to ignore patches with the white border
    :param n_jobs: number of concurrent processes
    :param dataset: the dataset dataframe
    :param output_dir: the output directory
    :param name: the name of the tf record
    :param size: the image crop size
    :return: None
    """
    grouped_train = split(dataset, "tiff_file")
    all_data = DataFrame()
    Path(path.join(output_dir, name)).mkdir(exist_ok=True, parents=True)
    extract_island = partial(
        extract_island_crops,
        name=name,
        output_dir=output_dir,
        size=size,
        ignore_border=ignore_border,
    )
    total_patches = 0
    for island_number, island in tqdm(enumerate(grouped_train),
                                      total=len(grouped_train)):
        island_records, n_island_patches = extract_island(island)
        all_data = pd_concat([all_data, island_records])
        all_data.to_csv(path.join(output_dir, name, f"records.csv"))
        total_patches += n_island_patches
        logger.info(
            f"Extracted {n_island_patches} patches. Total patches for n={island_number} {total_patches}"
        )
        # for island_records, n_island_patches in tqdm(
        #     pool.imap_unordered(extract_island, grouped_train), total=len(grouped_train)
        # ):
        #     all_data = pd_concat([all_data, island_records])
        #     total_patches += n_island_patches
        #     logger.info(f"Extracted {n_island_patches} patches. Total {total_patches}")
    all_records_path = path.join(output_dir,
                                 f"{size[0]}_{name}_all_records.csv")
    logger.info(f"Writing all normalised records to {all_records_path=}")
    all_data.drop_duplicates(keep=False, inplace=True)
    all_data.to_csv(path.join(output_dir, name, f"records.csv"))
Example #14
    def _get_sample_generator(self):
        df_lst = [i[1] for i in self._df.groupby(DATE_COL_NAME)]

        len_ = len(df_lst)
        for i in range(self._sample_lag, len_ - 1, 1):
            tdf = pd_concat(objs=df_lst[i - self._sample_lag:i])
            x_train = tdf.loc[:, self._feature_lst].values
            y_train = tdf.loc[:, "y"].values.reshape(-1, 1)

            test_df = df_lst[i + 1]
            x_test = test_df.loc[:, self._feature_lst].values
            y_test = test_df.loc[:, "y"].values.reshape(-1, 1)
            tkr_name = test_df[TKR_COL_NAME].values.reshape(-1, 1)

            yield x_train, y_train, x_test, y_test, tkr_name
Example #15
def doML_NTrials_Times(Food_df,People_df,NTrials):
    stats= CollectStats(NTrials)
    for n in range(0,NTrials):
        print "n= %d" % n
        cTrainF = rand(len(Food_df)) > .5
        cTestF = ~cTrainF
        cTrainP = rand(len(People_df)) > .5
        cTestP = ~cTrainP

        TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]],axis=0)
        TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]],axis=0)

        TrainX = TrainX_df.iloc[:, 2:].values
        TestX = TestX_df.iloc[:, 2:].values
        TrainY = concatenate([ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
        TestY = concatenate([ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])

        stats.P[n] = len(People_df[cTestP])
        stats.N[n] = len(Food_df[cTestF])

        forest2 = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
        forest2.fit(TrainX,TrainY)
        forestOut2 = forest2.predict(TestX)                              
        stats.ET.TP[n] = sum(forestOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        # negatives start at index stats.P[n] (not P[n] + 1)
        stats.ET.TN[n] = sum(forestOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.ET.FP[n] = stats.N[n] - stats.ET.TN[n]
        stats.ET.FN[n] = stats.P[n] - stats.ET.TP[n]

        clf2 = svm.LinearSVC()
        clf2.fit(TrainX,TrainY)
        clfOut2 = clf2.predict(TestX)
        stats.SVC.TP[n] = sum(clfOut2[0:stats.P[n]] == TestY[0:stats.P[n]])
        stats.SVC.TN[n] = sum(clfOut2[stats.P[n]:] == TestY[stats.P[n]:])
        stats.SVC.FP[n] = stats.N[n] - stats.SVC.TN[n]
        stats.SVC.FN[n] = stats.P[n] - stats.SVC.TP[n]
    return stats
Example #16
def chunk_res_with_values(
    query: str,
    ids: Sequence[Any],
    con,
    size: int = 10000,
    params: Optional[Dict[str, Any]] = None,
) -> DataFrame:
    """Chunk query result values."""
    if params is None:
        params = {}
    result = []
    for chunk in chunks(ids, size):
        params.update({"ids": chunk})
        result.append(DataFrame(get_res_with_values(query, params, con)))
    del params["ids"]
    return pd_concat(result, ignore_index=True)
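The chunks helper used above (and in example #20 below) is not shown; this is a minimal sketch of what it is assumed to do, splitting a sequence into fixed-size slices.

from typing import Any, Iterator, Sequence


def chunks(seq: Sequence[Any], size: int) -> Iterator[Sequence[Any]]:
    """Yield successive slices of at most `size` elements."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]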
Example #17
def CalcHog_FeaturesAndImage_ForOneImage(grey_img, image_name, rgb_img):
    feat = zeros((1, 900))  #People_All_9.csv Food_All_9.csv
    # get hog features (visualise/normalise were renamed to
    # visualize/transform_sqrt in newer scikit-image releases)
    blocks = feature.hog(grey_img, orientations=9, pixels_per_cell=(100, 100),
                         cells_per_block=(5, 5), visualize=False,
                         transform_sqrt=True)  #People_All_9.csv Food_All_9.csv
    # slightly different params for a better hog visualization
    junk_block, ImageOfHog = feature.hog(grey_img, pixels_per_cell=(10, 10),
                                         cells_per_block=(30, 30),
                                         visualize=True, transform_sqrt=True)

    if len(blocks) == 900:  #People_All_9.csv Food_All_9.csv
        feat[0] = blocks

    name_df = DataFrame()
    # wrap the scalar in a list so the column actually gets one row
    name_df["image_name"] = [image_name]
    feat_df = DataFrame(feat)
    final_df = pd_concat([name_df, feat_df], axis=1)
    final_df.to_csv("tmp/HogFeatures.csv")

    save_ImageOfHog(grey_img, ImageOfHog, rgb_img)
Example #18
def run_in_separate_threads(items, target, **args):
    """Wrapper to execute method in separate tread
    for each "item" and concat them into DataFrame."""
    frames = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        future_result = {executor.submit(target, *list(args.values()) + [item]):
                         e for e, item in enumerate(items)}
        for future in concurrent.futures.as_completed(future_result):
            try:
                data = future.result()
            except (ValueError, AttributeError) as exc:
                logging.warning(' %s generated an exception: %s',
                                target.__name__, exc)
                data = DataFrame()

            frames.append(data)

    # concatenate once at the end; appending inside the loop is much slower
    return pd_concat(frames) if frames else DataFrame()
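A hedged usage sketch for the wrapper above; fetch_page is a hypothetical target, and the keyword arguments are passed positionally before each item.

from pandas import DataFrame


def fetch_page(base_url, page):
    # Hypothetical target returning a one-row DataFrame per item.
    return DataFrame({"page": [page], "url": ["%s?page=%d" % (base_url, page)]})


result = run_in_separate_threads(range(5), fetch_page,
                                 base_url="https://example.com/api")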
Example #19
from io import BytesIO
from pickle import dump
from urllib.error import URLError
from urllib.request import urlopen

from numpy import array, zeros
from pandas import DataFrame, concat as pd_concat
from PIL import Image
from skimage import feature


def hog_features(ans_url_df, output_pickle_name):
    urls = ans_url_df.URL.values
    answers = ans_url_df.answer.values

    # keep only the URLs that can actually be fetched
    urls_exist = []
    ans_exist = []
    cnt = -1
    for url, ans in zip(urls, answers):
        cnt += 1
        print("cnt= %d , checking urls" % cnt)
        try:
            urlopen(url).read()
            urls_exist.append(url)
            ans_exist.append(ans)
        except URLError:
            continue

    urls_exist = array(urls_exist)
    ans_exist = array(ans_exist)
    feat = zeros((len(urls_exist), 900))
    count = 0
    for url in urls_exist:
        print("count= %d -- calc features" % count)
        read = urlopen(url).read()
        obj = Image.open(BytesIO(read))
        img = array(obj.convert('L'))

        blocks = feature.hog(img, orientations=9, pixels_per_cell=(100, 100),
                             cells_per_block=(5, 5), visualize=False,
                             transform_sqrt=True)  #People_All_9.csv Food_All_9.csv
        if len(blocks) == 900:
            feat[count] = blocks
        count += 1

    urls_exist_df = DataFrame(urls_exist, columns=["URL"])
    ans_exist_df = DataFrame(ans_exist, columns=["answer"])
    feat_df = DataFrame(feat)
    final_df = pd_concat([urls_exist_df, ans_exist_df, feat_df], axis=1)
    # pickles must be written in binary mode under Python 3
    with open(output_pickle_name, 'wb') as fout:
        dump(final_df.dropna(), fout)
Example #20
def df_from_query_by_ids(
    con,  # TODO type annotation for con
    query: str,
    ids: Sequence[Any],
    parameters: Optional[Dict[str, Any]] = None,
    size: int = 10000,
) -> DataFrame:
    """Return DataFrame from query by ids."""
    if parameters is None:
        parameters = {}
    return pd_concat(
        [
            DataFrame([
                dict(each.items())
                for each in con.execute(query, {
                    "ids": chunk,
                    **parameters
                }).fetchall()
            ]) for chunk in chunks(ids, size)
        ],
        ignore_index=True,
    )
Example #21
def Hog_predict_UploadImage(grey_img,image_name,rgb_img):
    CalcHog_FeaturesAndImage_ForOneImage(grey_img,image_name,rgb_img)
    feat_df= read_csv("tmp/HogFeatures.csv")
    feat_vals= feat_df.ix[:,2:].values

    root= "machine_learn/HOG/csv_features/"
    Food_df = read_csv(root+"hog_features_9_NewTraining_Food_everyones.csv")
    People_df = read_csv(root+"hog_features_9_NewTraining_Faces_everyones.csv")
    
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP
    
    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
    TrainX = TrainX_df.iloc[:, 2:].values
    TrainY = concatenate([ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])

    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX,TrainY)
    ET_prediction = ET_classifier.predict(feat_vals) 

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX,TrainY)
    LinSVC_prediction = LinSVC_classifier.predict(feat_vals)
    return ET_prediction, LinSVC_prediction

### testing
# file="machine_learn/training_image_urls/NewTraining_Food_everyones.txt"
# urls=np.loadtxt(file,dtype="str")
# url=urls[11]
# read= urllib2.urlopen(url).read()
# obj = Image.open( cStringIO.StringIO(read) )
# rgb_img= np.array(obj)
# grey_img = np.array(obj.convert('L'))
# (et,svc)= Hog_predict_UploadImage(grey_img,file,rgb_img)
#et_ans= interpret_int_predict(et[0].astype('int'))
#svc_ans= interpret_int_predict(svc[0].astype('int'))
Example #22
def extract_island_crops(record: data, name: str, output_dir: str, size: tuple,
                         ignore_border: bool):
    island, output, _ = record
    output_path = path.join(output_dir, name,
                            f"{path.splitext(island)[0]}.tfrecord")
    writer = tf.compat.v1.python_io.TFRecordWriter(output_path)
    height = output["image_height"].iloc[0]
    transformed_out = output.copy()
    transformed_out["y_pixel"] = transformed_out["y_pixel"].apply(
        lambda y: height - y)
    converted = extrapolate_patches(island,
                                    transformed_out,
                                    size,
                                    ignore_extrema=name == "train")
    island_records = DataFrame()
    if converted is not None:
        island_records = pd_concat(
            [island_records, *[data.object for data in converted]])
        for converted_object in converted:
            tf_example = convert_to_tf_records(converted_object, size, name)
            writer.write(tf_example.SerializeToString())
    writer.flush()
    writer.close()
    return island_records, len(converted)
Example #23
def merge_data(result):
    tmp = sorted(list(result.keys()), key=lambda x: (x[1], x[2], x[0]))
    combos = [['fanova', 'ablation'], ['fanova_cut', 'ablation'],
              ['fanova', 'local improvement analysis'],
              ['fanova_cut', 'local improvement analysis'],
              ['ablation', 'local improvement analysis']]
    skip = False
    try:
        del tmp[tmp.index('fanova')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('ablation')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('fanova_cut')]
    except ValueError:
        skip = True
    try:
        del tmp[tmp.index('local improvement analysis')]
    except ValueError:
        skip = True
    df_dict = {}
    if not skip:
        multi = []
        all = None
        prev = tmp[0]
        for combo in combos:
            tap = np.array(tmp)
            indices = list(
                set.intersection(set(list(np.where(tap[:, 1] == combo[0]))[0]),
                                 set(list(np.where(tap[:,
                                                       2] == combo[1]))[0])))
            tap = list(tap[indices])
            for benchmark in tap:
                cols = ['cap_', 'cup_']
                if 'fanova' == benchmark[1]:
                    cols[0], cols[1] = cols[0] + 'f', cols[1] + 'f'
                elif 'ab' in benchmark[1]:
                    cols[0], cols[1] = cols[0] + 'a', cols[1] + 'a'
                else:
                    cols[0], cols[1] = cols[0] + 'c', cols[1] + 'c'
                if 'ab' in benchmark[2]:
                    cols[0], cols[1] = cols[0] + 'a', cols[1] + 'a'
                else:
                    cols[0], cols[1] = cols[0] + 'l', cols[1] + 'l'
                benchmark = tuple(benchmark)
                if all is None:
                    all = DataFrame.from_dict(result[benchmark],
                                              orient='index')
                    all.columns = cols
                    idx = list(
                        map(
                            lambda y: y[1],
                            sorted(enumerate(list(all.index)),
                                   key=lambda x: x[1])))
                    all = all.loc[idx]
                else:
                    d = DataFrame.from_dict(result[benchmark], orient='index')
                    d.columns = cols
                    all = pd_concat([all, d])
            multi.append(all.copy())
            all = None
        df_dict['joint'] = pd_concat(multi, axis=1)
    for key in [
            'ablation', 'fanova', 'fanova_cut', 'local improvement analysis'
    ]:
        try:
            df = DataFrame.from_dict(result[key], orient='index')
            df.columns = ['cap', 'cup']
            idx = list(df.index)
            idx_0 = sorted(enumerate(list(map(lambda x: x.split(' / '), idx))),
                           key=lambda x: (x[1][0], x[1][1]))
            idx_0 = list(map(lambda x: idx[x[0]], idx_0))
            df = df.loc[idx_0]
            df_dict[key] = df
        except KeyError:
            pass
    return df_dict
Example #24
def build_gnomAD_FromTranscriptList(transcript_list_file, gmd_version):
    """ builds all gnomAD data from a given gene list file """
    con = dbcon()
    cur = con.cursor()
    table_name = "gnomadv2"
    if gmd_version == "2.1.1":
        gmd_name = "2"
    if gmd_version == "3.0":
        gmd_name = "3"
    """
    this code is now obsolete
    query = "DROP TABLE IF EXISTS '" + table_name + "'"
    cur.execute(query)
    query = "CREATE TABLE '" + table_name + "' AS SELECT * FROM gnomad WHERE 0=1"
    cur.execute(query)
    con.commit()
    con.close()
    """
    transcript_dict = parse_transcript_list(transcript_list_file)
    print(transcript_dict)
    for chrom, transcript_list in transcript_dict.items():
        try:
            exomes_data = get_gnomad_data(chrom,
                                          version=gmd_version,
                                          exomes=True)
            exomes_df = process_gnomad_data(exomes_data,
                                            chrom,
                                            transcript_list,
                                            exomes=True)
            exomes_df_exists = True
            os_remove(exomes_data)
        except HTTPError as err:
            if err.code == 404:
                print("No exomes data found")
                exomes_df_exists = False
            else:
                raise
        try:
            genomes_data = get_gnomad_data(chrom,
                                           version=gmd_version,
                                           exomes=False)
            genomes_df = process_gnomad_data(genomes_data,
                                             chrom,
                                             transcript_list,
                                             exomes=False)
            genomes_df_exists = True
            os_remove(genomes_data)
        except HTTPError as err:
            if err.code == 404:
                print("No genomes data found")
                genomes_df_exists = False
            else:
                raise
        # combine the gnomad exomes data and the gnomad genomes data into one
        # pandas dataframe, and export it to the database
        if (exomes_df_exists and genomes_df_exists):
            combined_df = pd_concat([exomes_df, genomes_df])
        elif exomes_df_exists:
            combined_df = exomes_df
        elif genomes_df_exists:
            combined_df = genomes_df
        else:
            raise ValueError("Unable to find requested gnomAD data")
        combined_df.loc[combined_df.duplicated(
            subset=['chromosome', 'position', 'allele_ref', 'allele_alt'],
            keep=False), 'source'] = 'both'
        combined_df = combined_df.drop_duplicates(
            ['chromosome', 'position', 'allele_ref',
             'allele_alt']).reset_index(drop=True)
        combined_df = combined_df.sort_values(by=['position'])
        filename = 'chr' + chrom + '_processed.tsv'
        #combined_df.to_csv(filename, sep='\t', encoding = 'utf-8', index=False)
        con = dbcon()
        combined_df.to_sql(table_name,
                           con=con,
                           if_exists="append",
                           index=False)
        con.commit()
        con.close()
    return None
Example #25
from os import listdir
from os.path import join
from detection import Video_Handler
from pandas import concat as pd_concat
if __name__ == '__main__':
    dataset_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL'
    videos_dir = 'C:\\Users\\emmanouil.vasilopoul\\Documents\\i-SENSE\\Effector\\Datasets\\Detection\\SEAGULL\\inputs\\videos\\Complete\\visible\\'
    videos_list = listdir(videos_dir)
    vh = Video_Handler(dataset_dir)
    videos_metadata = []
    for video in videos_list:
        video_path = join(videos_dir, video)
        videos_metadata.append(vh.read_metadata('Complete/visible/' + video))
    df = pd_concat(videos_metadata)
    print(df)
Example #26
def get_average_observables_wl(dcs: Union[WangLandauDataContainer,
                                          Dict[Any, WangLandauDataContainer]],
                               temperatures: List[float],
                               observables: List[str] = None,
                               boltzmann_constant: float = kB,
                               fill_factor_limit: float = None) -> DataFrame:
    """Returns the average and the standard deviation of the energy from a
    :ref:`Wang-Landau simulation <wang_landau_ensemble>` for the temperatures
    specified. If the ``observables`` keyword argument is specified
    the function will also return the mean and standard deviation of the
    specified observables.

    Parameters
    ----------
    dcs
        data container(s), from which to extract density of states
        as well as observables
    temperatures
        temperatures, at which to compute the averages
    observables
        observables, for which to compute averages; the observables
        must refer to fields in the data container
    boltzmann_constant
        Boltzmann constant :math:`k_B` in appropriate
        units, i.e. units that are consistent
        with the underlying cluster expansion
        and the temperature units [default: eV/K]
    fill_factor_limit
        use data recorded up to the point when the specified fill factor limit
        was reached when computing averages; otherwise use data for the last
        state

    Raises
    ------
    ValueError
        if the data container(s) do(es) not contain entropy data
        from Wang-Landau simulation
    ValueError
        if data container(s) do(es) not contain requested observable
    """
    def check_observables(dc: WangLandauDataContainer,
                          observables: Optional[List[str]]) -> None:
        """ Helper function that checks that observables are available in data frame. """
        if observables is None:
            return
        for obs in observables:
            if obs not in dc.data.columns:
                raise ValueError('Observable ({}) not in data container.\n'
                                 'Available observables: {}'.format(
                                     obs, dc.data.columns))

    # preparation of observables
    columns_to_keep = ['potential', 'density']
    if observables is not None:
        columns_to_keep.extend(observables)

    # check that observables are available in data container
    # and prepare comprehensive data frame with relevant information
    if isinstance(dcs, WangLandauDataContainer):
        check_observables(dcs, observables)
        df_combined = _extract_filter_data(dcs, columns_to_keep,
                                           fill_factor_limit)
        dcref = dcs
    elif isinstance(dcs, dict) and isinstance(dcs[next(iter(dcs))],
                                              WangLandauDataContainer):
        dfs = []
        for dc in dcs.values():
            check_observables(dc, observables)
            dfs.append(
                _extract_filter_data(dc, columns_to_keep, fill_factor_limit))
        df_combined = pd_concat(dfs, ignore_index=True)
        dcref = list(dcs.values())[0]
    else:
        raise TypeError('dcs ({}) must be a data container with entropy data'
                        ' or be a list of data containers'.format(type(dcs)))

    # fetch entropy and density of states from data container(s)
    df_density, _ = get_density_of_states_wl(dcs, fill_factor_limit)

    # compute density for each row in data container if observable averages
    # are to be computed
    if observables is not None:
        energy_spacing = dcref.ensemble_parameters['energy_spacing']
        # NOTE: we rely on the indices of the df_density DataFrame to
        # correspond to the energy scale! This is expected to be handled in
        # the get_density_of_states function.
        bins = list(
            np.array(np.round(df_combined.potential / energy_spacing),
                     dtype=int))
        data_density = [
            dens / bins.count(k)
            for k, dens in df_density.density[bins].items()
        ]

    enref = np.min(df_density.energy)
    averages = []
    for temperature in temperatures:

        # mean and standard deviation of energy
        boltz = np.exp(-(df_density.energy - enref) / temperature /
                       boltzmann_constant)
        sumint = np.sum(df_density.density * boltz)
        en_mean = np.sum(
            df_density.energy * df_density.density * boltz) / sumint
        en_std = np.sum(
            df_density.energy**2 * df_density.density * boltz) / sumint
        en_std = np.sqrt(en_std - en_mean**2)
        record = {
            'temperature': temperature,
            'potential_mean': en_mean,
            'potential_std': en_std
        }

        # mean and standard deviation of other observables
        if observables is not None:
            boltz = np.exp(-(df_combined.potential - enref) / temperature /
                           boltzmann_constant)
            sumint = np.sum(data_density * boltz)
            for obs in observables:
                obs_mean = np.sum(
                    data_density * boltz * df_combined[obs]) / sumint
                obs_std = np.sum(
                    data_density * boltz * df_combined[obs]**2) / sumint
                obs_std = np.sqrt(obs_std - obs_mean**2)
                record['{}_mean'.format(obs)] = obs_mean
                record['{}_std'.format(obs)] = obs_std

        averages.append(record)

    return DataFrame.from_dict(averages)
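A minimal usage sketch, assuming dcs is a WangLandauDataContainer (or a dict of them) produced elsewhere; the observable name is illustrative only.

import numpy as np

temperatures = np.linspace(300, 1500, 50).tolist()
df_averages = get_average_observables_wl(dcs,
                                         temperatures=temperatures,
                                         observables=['sro_Ag_1'])
print(df_averages[['temperature', 'potential_mean', 'potential_std']])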