Example #1
def get_plots_for_thresholds(ds,
                             thresholds,
                             leaky_threshold,
                             n_scripts_range,
                             filename_suffix='dye_snippets',
                             y_range=(0, 1),
                             recall_color='black',
                             n_scripts_color='firebrick',
                             **extra_plot_opts):
    resultsdir = ds.config('DYESCORE_RESULTS_DIR')

    # Infile validation
    for threshold in thresholds:
        inpath = os.path.join(
            resultsdir,
            f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
        )
        ds.file_in_validation(inpath)

    plots = {}
    for threshold in thresholds:
        inpath = os.path.join(
            resultsdir,
            f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
        )
        if ds.s3:
            with ds.s3.open(inpath, 'r') as f:
                pr_df = pd_read_csv(f)
        else:
            pr_df = pd_read_csv(inpath)
        plots[threshold] = get_pr_plot(pr_df, f'{threshold}', n_scripts_range,
                                       y_range, recall_color, n_scripts_color,
                                       **extra_plot_opts)
    return plots
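Note: Examples #1, #3, #6, and #15 all repeat the same branch that reads a CSV either through an s3fs-style handle or from the local filesystem. A minimal helper sketch that collapses the pattern, assuming ds.s3 is an s3fs.S3FileSystem (or None for local paths); the name read_df_maybe_s3 is hypothetical:

from pandas import read_csv as pd_read_csv

def read_df_maybe_s3(s3, path):
    # s3: an s3fs-style filesystem object, or None/falsy for local paths
    if s3:
        with s3.open(path, 'r') as f:
            return pd_read_csv(f)
    return pd_read_csv(path)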
Example #2
def loading_query(
    test_data_file: str,
    test_score_file: str
):
    data_test = pd_read_csv(test_data_file)
    scores_test = pd_read_csv(test_score_file)
    print("Number of pathways:", len(scores_test))
    print("Total number of reactions:", len(data_test))
    return data_test, scores_test
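A minimal usage sketch for loading_query; the file names below are hypothetical and only need to point at CSVs readable by pandas:

data_test, scores_test = loading_query(
    'reactions.csv',   # hypothetical per-reaction data
    'pathways.csv'     # hypothetical per-pathway scores
)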
Example #3
def get_threshold_summary_plot(ds):
    resultsdir = ds.config('DYESCORE_RESULTS_DIR')
    inpath = os.path.join(resultsdir, 'recall_summary_plot_data.csv')
    ds.file_in_validation(inpath)
    if ds.s3:
        with ds.s3.open(inpath, 'r') as f:
            results_df = pd_read_csv(f)
    else:
        results_df = pd_read_csv(inpath)
    recall_thresholds = sorted(results_df.recall_threshold.unique())
    grouped_results_df = results_df.groupby('recall_threshold').agg(
        lambda x: list(x))
    # request one extra color: the top (yellow) inferno shade is often too light
    palette = inferno(len(recall_thresholds) + 1)
    source = ColumnDataSource(grouped_results_df)
    p = figure(
        title=(f'Scripts captured by distance threshold for '
               f'{len(recall_thresholds)} recall thresholds (colored)'),
        width=800,
        toolbar_location=None,
        tools='',
        y_range=Range1d(results_df.n_over_threshold.min(),
                        results_df.n_over_threshold.max()),
    )
    p.xaxis.axis_label = 'distance threshold'
    p.yaxis.axis_label = 'minimum n_scripts'
    p.yaxis.formatter = NumeralTickFormatter(format="0a")
    p.extra_y_ranges = {
        'percent': Range1d(results_df.percent.min(), results_df.percent.max())
    }
    p.add_layout(
        LinearAxis(y_range_name='percent',
                   axis_label='minimum n_scripts (percent of total)',
                   formatter=NumeralTickFormatter(format='0%')), 'right')
    for i, recall_threshold in enumerate(recall_thresholds):
        view = CDSView(source=source, filters=[IndexFilter([i])])
        opts = dict(source=source,
                    view=view,
                    legend=str(recall_threshold),
                    color=palette[i],
                    line_width=5,
                    line_alpha=0.6)
        p.multi_line(xs='distance_threshold', ys='n_over_threshold', **opts)
        p.multi_line(xs='distance_threshold',
                     ys='percent',
                     y_range_name='percent',
                     **opts)
    p.legend.click_policy = 'hide'
    return p
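The two-axis pattern above (a named range in extra_y_ranges plus a LinearAxis added on the right) is a common bokeh recipe for plotting counts and percentages on one figure. A minimal standalone sketch with made-up values, assuming only bokeh is installed:

from bokeh.models import LinearAxis, Range1d
from bokeh.plotting import figure

p = figure(width=400)
p.line([0, 1, 2], [100, 400, 250])                  # left axis, default range
p.extra_y_ranges = {'pct': Range1d(0, 1)}           # named secondary range
p.add_layout(LinearAxis(y_range_name='pct', axis_label='percent'), 'right')
p.line([0, 1, 2], [0.1, 0.9, 0.4], y_range_name='pct', color='firebrick')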
Example #4
def updateScore(csvfile, score):
    """ Add or update score column and reorder """
    import string
    head, rows = read_csv(csvfile)
    data = pd_read_csv(csvfile)
    data.index = data.index + 1
    cols = data.columns.tolist()
    sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index)
    if 'Score' not in cols:
        data['Score'] = sco
        cols = ['Score'] + cols
        data = data[cols]
    colk = list(string.ascii_uppercase)
    for sc in score:
        # each sc is assumed to be (column_letter, _, weight, checked)
        try:
            coln = colk.index(sc[0])
            val = sc[2]
            checked = sc[3]
            if checked:
                sco += val * data.iloc[:, coln]
        except (ValueError, IndexError):
            continue
    data['Score'] = sco
    data = data.sort_values('Score', ascending=False)
    updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']])
    data = data.reset_index(drop=True)
    data.index = data.index + 1
    data = data.rename_axis('Select', axis='columns')  # rename_axis returns a copy
    data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False)
    return data
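For illustration, a hedged call to updateScore: judging from the sc[0]/sc[2]/sc[3] accesses above, each entry in score carries a spreadsheet-style column letter at index 0, a weight at index 2, and a checked flag at index 3. The exact tuple layout and the file name are assumptions:

# hypothetical spec: add column 'B' into the score with weight 2.0
data = updateScore('results.csv', [('B', None, 2.0, True)])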
Example #5
def _predict_score(
      test_data_file: str,
      test_score_file: str,
      models_path: str,
      features_dset_train,
      no_of_rxns_thres: int
) -> list:
    # ttdf = open(test_data_file, 'r')
    # print('test_data_file')
    # print(ttdf.read())
    # ttsf = open(test_score_file, 'r')
    # print('test_score_file')
    # print(ttsf.read())

    data_test, scores_test = loading_query(
      test_data_file,
      test_score_file
    )

    with NamedTemporaryFile(delete=False) as out_f:
        encode_and_predict(
            data_test,
            scores_test,
            models_path,
            features_dset_train,
            no_of_rxns_thres,
            out_f.name
        )
        out_f.close()
        score_df = pd_read_csv(out_f.name)
        remove(out_f.name)

    return score_df['Prob1_mean'].tolist()
Example #6
    def _build_plot_data_for_score_df(self, s3, inpath, outpath, compare_list):
        if s3:
            with s3.open(inpath, 'r') as f:
                score_df = pd_read_csv(f)
        else:
            score_df = pd_read_csv(inpath)
        pr = pd_DataFrame({
            'dye_score_threshold': np.linspace(0, score_df.dye_score.max(), 1000)
        })
        pr['recall'] = pr.dye_score_threshold.apply(self._get_recall,
                                                    score_df=score_df,
                                                    compare_list=compare_list)
        pr['n_over_threshold'] = pr.dye_score_threshold.apply(
            lambda x: (score_df.dye_score > x).sum())
        if s3:
            with s3.open(outpath, 'w') as f:
                pr.to_csv(f, index=False)
        else:
            pr.to_csv(outpath, index=False)
        return outpath
Example #7
def read_from_csv(fileName):
    df_full = pd_read_csv(fileName)
    y = df_full.pop('complex')
    X = df_full

    neg_start_ind = y[y == 0].index[0]
    X_pos = X.iloc[0:neg_start_ind]
    y_pos = y[0:neg_start_ind]
    X_neg = X.iloc[neg_start_ind:]
    y_neg = y[neg_start_ind:]

    return y, X, X_pos, y_pos, X_neg, y_neg
Example #8
def semesters(ha_df, core_courses, conval_dict, population_IDs=[], program='Computer Science'):
    global se_df
    _h_program = hash(program)
    try:
        if se_df.empty:
            _se_df = pd_read_csv('./data/kuleuven/se_df_%i.csv' % (_h_program))
            se_df = _se_df
            return se_df
    except (NameError, FileNotFoundError):
        _se_df = semesters_features_calc(ha_df,
                                          core_courses,
                                          conval_dict,
                                          population_IDs)
        _se_df.to_csv('./data/kuleuven/se_df_%i.csv' % (_h_program))
        se_df = _se_df
    return se_df
Example #9
def read_csv(filepath, sep=',', header='infer', names=None, usecols=None, dtype=None, converters=None,
             skiprows=None, nrows=None):
    """Read CSV into DataFrame.

    Eager implementation using pandas, i.e. entire file is read at this point. Only common/relevant parameters
    available at the moment; for full list, could use pandas directly and then convert to baloo.

    Parameters
    ----------
    filepath : str
    sep : str, optional
        Separator used between values.
    header : 'infer' or None, optional
        Whether to infer the column names from the first row or not.
    names : list of str, optional
        List of column names to use. Overrides inferred header.
    usecols : list of (int or str), optional
        Which columns to parse.
    dtype : dict, optional
        Dict of column -> type to parse as.
    converters : dict, optional
        Dict of functions for converting values in certain columns.
    skiprows : int, optional
        Number of lines to skip at start of file.
    nrows : int, optional
        Number of rows to read.

    Returns
    -------
    DataFrame

    See Also
    --------
    pandas.read_csv : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

    """
    pd_df = pd_read_csv(filepath,
                        sep=sep,
                        header=header,
                        names=names,
                        usecols=usecols,
                        dtype=dtype,
                        converters=converters,
                        skiprows=skiprows,
                        nrows=nrows)

    return DataFrame.from_pandas(pd_df)
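A usage sketch for this wrapper, with a hypothetical input file and only parameters documented in the docstring above:

df = read_csv('data.csv',        # hypothetical input
              sep=',',
              usecols=['a', 'b'],
              nrows=1000)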
Example #10
def alpha_beta_skewness(ha_df, population_IDs=[], program='Computer Science', overwrite=False):
    global abs_df
    _h_program = hash(program)

    def calc_n_save():
        _abs_df = courses_features_calc(ha_df, population_IDs)
        _abs_df.to_csv('./data/kuleuven/abs_df_%i.csv' % (_h_program))
        return _abs_df

    if abs_df.empty:
        try:
            _abs_df = pd_read_csv('./data/kuleuven/abs_df_%i.csv' % (_h_program), index_col=0)
            abs_df = _abs_df
        except FileNotFoundError:
            abs_df = calc_n_save()
    elif overwrite:
        abs_df = calc_n_save()
    return abs_df
Example #11
def get_GPA_by_student(ha_df=kuleuven_loader.ha_df):
    global gpa_df
    try:
        if gpa_df.empty:
            # print('GPA load')
            _gpa_df = pd_read_csv('./data/kuleuven/gpa_df.csv',
                                  index_col=0,
                                  dtype={'GPA': float32,
                                         'ap_GPA': float32,
                                         'cod_estudiante': int32,
                                         'performance': float32})
            gpa_df = _gpa_df
        else:
            return gpa_df
    except (NameError, FileNotFoundError):
        # print('GPA load fails')
        _gpa_df = GPA_calc(ha_df)
        _gpa_df.to_csv('./data/kuleuven/gpa_df.csv')
        gpa_df = _gpa_df  # keep the module-level cache in sync
    return _gpa_df
Example #12
def read_coordinates_of_class(in_situ_crop_directory, directory_as_class):
    class_coordinates = []

    with open(
            os.path.join(in_situ_crop_directory, directory_as_class,
                         'coordinates.txt')) as file:
        df = pd_read_csv(file, sep='\t', header=None)
        for i in range(len(df)):
            row = df.loc[i]

            # columns: shelf, frame, x-left, y-upper, width, height
            cls = int(directory_as_class) - 1
            shelf = int(row[0])
            frame = int(row[1])
            xleft = int(row[2])
            yupper = int(row[3])
            xright = int(row[2] + row[4])
            ylower = int(row[3] + row[5])

            class_coordinates.append(
                (cls, shelf, frame, xleft, yupper, xright, ylower))

    return class_coordinates
Example #13
from time import time

from numpy import int32, float32
from pandas import read_csv as pd_read_csv
import pandas as pd

pd.options.mode.chained_assignment = None
pd.options.mode.use_inf_as_na = True

def side_strip(_str):
    # keep only the token before the first space, e.g. 'AB 12' -> 'AB'
    try:
        return _str[:_str.index(' ')]
    except (ValueError, AttributeError):
        return _str

'''
Students Academic History
'''
start = time()
ha_df = pd_read_csv('./data/espol/ha_df.csv', index_col=0)
ha_df['cod_materia_acad'] = ha_df['cod_materia_acad'].apply( side_strip )
try:
    ha_df['cod_estudiante'] = ha_df['cod_estudiante'].values.astype(int32)
    ha_df['promedio'] = ha_df['promedio'].values.astype(float32)
    ha_df['anio'] = ha_df['anio'].values.astype(int32)
    ha_df['paralelo'] = ha_df['paralelo'].values.astype(int32)
    ha_df['GPA'] = ha_df['GPA'].values.astype(float32)
    ha_df['ap_GPA'] = ha_df['ap_GPA'].values.astype(float32)
    ha_df['performance'] = ha_df['performance'].values.astype(float32)
    ha_df['promedio_GPA'] = ha_df['promedio_GPA'].values.astype(float32)
except (KeyError, ValueError):
    # leave original dtypes if a column is missing or non-numeric
    pass
end = time()
print('Exe time: %.2f'%(end - start))
print('loaded dataframe from CSV as DataFrame. records: %d'%len(ha_df))
Example #14
import os
import pickle

from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, concatenate, Dropout
from keras.models import Model
from pandas import read_csv as pd_read_csv

from deepgs.parser import load_plink_text
from deepgs.transform import transform
import deepgs.model as models

# if executed from within the test directory, go one level up the tree.
if os.path.split(os.getcwd())[1] == "tests":
    os.chdir("..")

file_path = os.path.join("data", "sample_100-10000")
if os.path.exists(file_path + ".pkl"):
    G = pickle.load(open(file_path + ".pkl", "rb"))
    map_table = pd_read_csv(file_path + ".map",
                            delim_whitespace=True,
                            header=None,
                            names=["chr", "snp", "cm", "bp"])
else:
    g_df, map_table = load_plink_text(file_path)
    G = transform(g_df)

model = models.create_architecture_small(map_table)
model = models.compile_model(model)

GG = models.format_as_model_input(G, map_table)
model.predict(GG)


def create_dummy_architecture(map_table, output_dim=1, output_activation=None):
    """
    A small dummy architecture to be tested on the small 15-by-10 dataset.
Example #15
    def get_recall_summary_plot_data(self,
                                     thresholds,
                                     recall_thresholds,
                                     leaky_threshold,
                                     filename_suffix='dye_snippets',
                                     override=True):
        resultsdir = self.config('DYESCORE_RESULTS_DIR')

        # Infile validation
        for threshold in thresholds:
            inpath = os.path.join(
                resultsdir,
                f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
            )
            self.file_in_validation(inpath)

        # Outfile validation
        outpath = os.path.join(resultsdir, 'recall_summary_plot_data.csv')
        self.file_out_validation(outpath, override)

        # Gather up relevant results
        results = []
        for threshold in thresholds:
            inpath = os.path.join(
                resultsdir,
                f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
            )
            if self.s3:
                with self.s3.open(inpath, 'r') as f:
                    pr_df = pd_read_csv(f)
            else:
                pr_df = pd_read_csv(inpath)

            for recall_threshold in recall_thresholds:
                # TODO Use idxmin
                result = {}
                n_over_threshold = pr_df[pr_df.recall > recall_threshold].sort_values(
                    by='recall').iloc[0]['n_over_threshold']
                result['distance_threshold'] = threshold
                result['n_over_threshold'] = n_over_threshold
                result['recall_threshold'] = recall_threshold
                results.append(result)

        # Make DF and save
        inpath = os.path.join(
            resultsdir,
            f'dye_score_from_{filename_suffix}_{thresholds[0]}_leak_{leaky_threshold}.csv'
        )
        if self.s3:
            with self.s3.open(inpath, 'r') as f:
                total_results = len(pd_read_csv(f))
        else:
            total_results = len(pd_read_csv(inpath))

        results_df = pd_DataFrame.from_records(results)
        results_df['percent'] = (results_df.n_over_threshold / total_results)
        if self.s3:
            with self.s3.open(outpath, 'w') as f:
                results_df.to_csv(f, index=False)
        else:
            results_df.to_csv(outpath, index=False)
        return outpath
Example #16
from sklearn.model_selection import train_test_split
from numpy import average as np_average
from numpy import array as np_array
from pandas import merge as pd_merge
from pandas import read_csv as pd_read_csv
from skfuzzy import cmeans, cmeans_predict
from data_loader import kuleuven_loader
from itertools import combinations

in_source = "kuleuven"
dispatcher = WSDispatcher(source=in_source)
se_df = dispatcher.academic_clusterer.semesters_features
sf_df = dispatcher.academic_clusterer.students_features
ss_df = pd_merge(se_df, sf_df, on="student")

# cd doc/calibration_test/

abs_df = pd_read_csv("../../data/kuleuven/abs_df_1716653621.csv", index_col=0)
abs_df = abs_df.fillna(-1000)
ha_df = pd_read_csv("../../data/kuleuven/students_courses.csv", index_col=0)
ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"])
sha_df = pd_merge(ha_df, sf_df, on="student")
sha_df = pd_merge(sha_df, abs_df, on="course")

OP = []
OP_append = OP.append


def plot_calibration_curve_from_data(X, y, est, name, fig_index):
    """Plot calibration curve for est w/o and with calibration. """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)

    # Calibrated with isotonic calibration
Example #17
from detection_and_tracking.datasets.seagull.seagull_txt_handler import Seagull_TXT_Handler
from detection_and_tracking.configuration.seagull import dataset_dir, SeagullPaths
from pandas import read_csv as pd_read_csv

seagull_paths = SeagullPaths()
# TODO - Fix maritime

if __name__ == '__main__':
    txt_handler = Seagull_TXT_Handler(dataset_dir)

    dataset_info_df = pd_read_csv(dataset_annotations_info, delimiter=';')

    txt_handler.txts_to_tracking_csv(complete_visible_txt_dir, csv_name='all-complete-visible-tracking.csv', dataset_info_df=dataset_info_df)
    txt_handler.txts_to_detection_csv(complete_visible_txt_dir, csv_name='all-complete-visible-detection_and_tracking.csv')

    txt_handler.txts_to_tracking_csv(complete_infrared_txt_dir, csv_name='all-complete-infrared-tracking.csv', dataset_info_df=dataset_info_df)
    txt_handler.txts_to_detection_csv(complete_infrared_txt_dir, csv_name='all-complete-infrared-detection_and_tracking.csv')

    txt_handler.txts_to_detection_csv(incomplete_visible_txt_dir, csv_name='all-incomplete-visible-detection_and_tracking.csv')
Example #18
"""
Students Academic History

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120080 entries, 0 to 120079
Data columns (total 7 columns):
student        120080 non-null int32
grade          120063 non-null float32
course         120080 non-null object
name           120080 non-null object
status         120080 non-null object
performance    120080 non-null float32
year           120080 non-null int32
dtypes: float32(2), int32(2), object(3)
"""
start = time()
ha_df = pd_read_csv("./data/kuleuven/students_courses.csv")
# ha_df['cod_materia_acad'] = ha_df['cod_materia_acad'].apply( side_strip )
try:
    ha_df["student"] = ha_df["student"].values.astype(int32)
    ha_df["grade"] = ha_df["grade"].values.astype(float32)
    ha_df["performance"] = ha_df["performance"].values.astype(float32)
    ha_df["year"] = ha_df["year"].values.astype(int32)
except (KeyError, ValueError):
    # leave original dtypes if a column is missing or non-numeric
    pass
ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"])
end = time()
print("Exe time: %.2f" % (end - start))
print("loaded dataframe from CSV as DataFrame. records: %d" % len(ha_df))
print("\n")

#'''