Example #1
    def load_clean_datasets(self, dataset_name, project_settings):

        assert dataset_name in ['train_val','train','val','test']

        clean_input_files = project_settings['clean_input_files']

        X_name, y_name = 'X_' + dataset_name, 'y_' + dataset_name

        data_dir = find_data_dir(project_settings)
        data = dict()

        take_nth_row = project_settings.get('take_nth_row', 1)
        X_abs_filepath = data_dir + '/' + clean_input_files[X_name]
        X = pd.read_csv(X_abs_filepath, sep=r"\s+", engine='python', header=None,
                        skiprows=lambda i: i % take_nth_row != 0)


        y_mat_file_path = data_dir + '/' + clean_input_files[y_name]
        y_mat = pd.read_csv(y_mat_file_path, sep=r"\s+", engine='python', header=None,
                            skiprows=lambda i: i % take_nth_row != 0)
        y = y_mat.iloc[:, 0].tolist()

        data[dataset_name] = (X,y)

        return data
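
A minimal usage sketch for this method. The settings keys and the `loader` object below are illustrative assumptions, not the project's actual configuration:

# Hypothetical call site; the relative paths are made up for illustration.
project_settings = {
    'clean_input_files': {'X_val': 'clean/X_val.txt',
                          'y_val': 'clean/y_val.txt'},
    'take_nth_row': 10,  # optional: keep every 10th row for quick experiments
}
data = loader.load_clean_datasets('val', project_settings)
X_val, y_val = data['val']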
Example #2
def main():

    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)

    raw_files = project_settings['raw_input_files']

    ##Remove duplicate columns or alter their names
    column_names_filepath = data_dir + '/' + raw_files['feature_names']
    feat_df = pd.read_csv(column_names_filepath, sep=r"\s+", engine='python',
                          names=['file_index', 'feature_name'])
    feat_df.sort_values('feature_name',inplace=True)
    feat_df['col_name_count'] = feat_df.groupby(['feature_name']).cumcount()+1
    feat_df['new_feature_name'] = feat_df['feature_name'].where(
        feat_df['col_name_count'] == 1,
        other=feat_df['feature_name'] + '..' + feat_df['col_name_count'].astype(str))


    feature_name_filepath = data_dir + '/' + project_settings['clean_input_files']['feature_names']
    feat_df.sort_values('file_index',inplace=True)
    feat_df[['new_feature_name']].to_csv(feature_name_filepath, index=False, header=False, sep="\t")


    ##Create Validation Set
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    clean_input_files = project_settings['clean_input_files']
    X_train_val_filepath = data_dir + '/' + clean_input_files['X_train_val']
    y_train_val_filepath = data_dir + '/' + clean_input_files['y_train_val']
    X_train_val = pd.read_csv(X_train_val_filepath, sep=r"\s+", engine='python', header=None)
    y_train_val = pd.read_csv(y_train_val_filepath, sep=r"\s+", engine='python', header=None)
    s2_dfs = [pd.DataFrame(x) for x in train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
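
The renaming idiom above is worth isolating: `groupby(...).cumcount()` numbers repeated feature names within each group, and `where` keeps the first occurrence untouched while suffixing the rest. A self-contained sketch on toy data:

import pandas as pd

feat_df = pd.DataFrame({'file_index': [1, 2, 3],
                        'feature_name': ['angle', 'angle', 'mass']})
feat_df['col_name_count'] = feat_df.groupby('feature_name').cumcount() + 1
feat_df['new_feature_name'] = feat_df['feature_name'].where(
    feat_df['col_name_count'] == 1,
    other=feat_df['feature_name'] + '..' + feat_df['col_name_count'].astype(str))
# new_feature_name -> ['angle', 'angle..2', 'mass']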
Example #3
    def get_images(images, return_motion=False):
        if len(images[0].shape) == 0:
            data_dir = find_data_dir(work_dir, str(images[0]))
            paths = [[os.path.join(
                data_dir, os.path.split(str(x))[1]) for x in images]]
            fname = glob.glob(os.path.join(data_dir, 'rp_*.txt'))[0]
            motion = [fname]
        else:
            paths = []
            motion = []
            for session_scans in images:
                scans = []
                data_dir = find_data_dir(work_dir, str(session_scans[0]))
                for x in session_scans:
                    scans.append(
                        os.path.join(data_dir, os.path.split(str(x))[1]))
                paths.append(scans)
                fname = glob.glob(os.path.join(data_dir, 'rp_*.txt'))[0]
                motion.append(fname)

        if return_motion:
            return paths, motion
        return paths
Example #4
    def save_normalize(m0):
        # normalize bold
        normalize = m0

        doc['normalize']['bold'] = []

        for session_scans in np.split(
                normalize.resample, np.cumsum(doc['n_scans']))[:-1]:
            scans = []
            data_dir = find_data_dir(work_dir, str(session_scans[0]))

            for x in session_scans:
                scans.append(os.path.join(data_dir, os.path.split(str(x))[1]))
            doc['normalize']['bold'].append(scans)
Example #5
    def save_smooth(m0):
        # smooth
        smooth = m0

        doc['smooth'] = {}
        doc['smooth']['bold'] = []
        doc['smooth']['fwhm'] = float(smooth.fwhm)

        for session_scans in np.split(
                smooth.data, np.cumsum(doc['n_scans']))[:-1]:
            scans = []
            data_dir = find_data_dir(work_dir, str(session_scans[0]))

            for x in session_scans:
                scans.append(os.path.join(data_dir, os.path.split(str(x))[1]))
            doc['smooth']['bold'].append(scans)
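
Examples #4, #5, and #7 all lean on the same `np.split`/`np.cumsum` idiom to carve a flat list of scans into per-session chunks. A minimal sketch:

import numpy as np

n_scans = [3, 2]  # scans per session
scans = np.array(['scan1', 'scan2', 'scan3', 'scan4', 'scan5'])
# np.cumsum([3, 2]) == [3, 5]; splitting at those indices leaves an empty
# trailing chunk, which the [:-1] drops.
sessions = np.split(scans, np.cumsum(n_scans))[:-1]
# -> [array(['scan1', 'scan2', 'scan3']), array(['scan4', 'scan5'])]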
Example #6
    def get_path(path):
        data_dir = find_data_dir(work_dir, str(path))
        return os.path.join(data_dir, os.path.split(str(path))[1])
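
`find_data_dir(work_dir, path)` itself is not shown in these excerpts. Judging from its call sites, which always pair it with `os.path.split` to rebuild a local path, it appears to locate the directory under `work_dir` that actually holds a file whose recorded prefix may be stale. A hypothetical sketch, not the project's implementation:

import os

def find_data_dir(work_dir, path):
    """Hypothetical stand-in: find the directory under work_dir containing
    the basename of `path` (whose recorded directory may no longer exist)."""
    fname = os.path.split(str(path))[1]
    for root, _dirs, files in os.walk(work_dir):
        if fname in files:
            return root
    raise FileNotFoundError('%s not found under %s' % (fname, work_dir))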
Example #7
def load_intra(location, fix=None, **kwargs):
    doc = {}

    mat = load_matfile(location)['SPM']

    work_dir = os.path.split(os.path.realpath(location))[0]

    doc.update(_check_kwargs(work_dir, **kwargs))

    # doc['mat'] = mat

    doc['design_matrices'] = mat.xX.X.tolist()           # xX: model
    doc['design_conditions'] = [str(i) for i in mat.xX.name]
    doc['design_contrasts'] = {}

    doc['n_scans'] = mat.nscan.tolist() \
        if isinstance(mat.nscan.tolist(), list) else [mat.nscan.tolist()]
    doc['n_sessions'] = mat.nscan.size
    doc['tr'] = float(mat.xY.RT)    # xY: data
    doc['mask'] = os.path.join(work_dir, str(mat.VM.fname))  # VM: mask

    doc['beta_maps'] = []
    doc['c_maps'] = {}
    doc['t_maps'] = {}

    doc['condition_key'] = []
    doc['task_contrasts'] = {}
    doc['onsets'] = []

    swabold = np.split(mat.xY.P.tolist(), np.cumsum(doc['n_scans'])[:-1])

    doc['data'] = {}

    for session in swabold:
        session_dir = find_data_dir(work_dir, session[0])
        scans = [os.path.join(session_dir, os.path.split(s)[1].strip())
                 for s in session]
        doc['data'].setdefault('swabold', []).append(scans)

    # Each SPM preprocessing step prepends one letter to the filename, so
    # stripping 1, 2 or 3 prefix characters recovers the earlier stages.
    for n_prefix, key in ((1, 'wabold'), (2, 'abold'), (3, 'bold')):
        for s in doc['data']['swabold']:
            scans = [strip_prefix_filename(i, n_prefix) for i in s]
            doc['data'].setdefault(key, []).append(scans)

    doc['motion'] = []
    if doc['n_sessions'] > 1:
        for session in mat.Sess:
            doc['motion'].append(session.C.C.tolist())
    else:
        doc['motion'].append(mat.Sess.C.C.tolist())

    def get_condition_onsets(condition):
        onset_time = condition.ons.tolist()
        onset_duration = condition.dur.tolist()
        if not isinstance(onset_time, list):
            onset_time = [onset_time]
            onset_duration = [onset_duration]
        onset_weight = [1] * len(onset_time)

        return list(zip(onset_time, onset_duration, onset_weight))

    # mat.Sess is an array of sessions when there are several, or a single
    # object otherwise; normalize to a list and handle both cases at once.
    sessions = mat.Sess if hasattr(mat.Sess, '__iter__') else [mat.Sess]
    for session in sessions:
        onsets = {}
        condition_key = []
        for condition_id, condition in enumerate(session.U):
            k = 'cond%03i' % (condition_id + 1)
            onsets[k] = get_condition_onsets(condition)
            condition_key.append(str(condition.name))
        doc['condition_key'].append(condition_key)
        doc['onsets'].append(onsets)

    for c in mat.xCon:
        name = str(c.name)
        try:
            doc['c_maps'][name] = os.path.join(work_dir, str(c.Vcon.fname))
            doc['t_maps'][name] = os.path.join(work_dir, str(c.Vspm.fname))
            doc['design_contrasts'][name] = c.c.tolist()
        except Exception:
            # sometimes c.Vcon is an empty array
            pass

    for b in mat.Vbeta:
        doc['beta_maps'].append(os.path.join(work_dir, str(b.fname)))

    if 'subject_id' not in doc:
        doc['subject_id'] = hashlib.md5(work_dir.encode('utf-8')).hexdigest()

    def get_condition_index(name):
        for i, full_name in enumerate(doc['design_conditions']):
            if name in full_name:
                return i

    # find the indices of the actual experimental conditions in the
    # design matrix, not the additional regressors...
    ii = []
    for session in doc['condition_key']:
        ii.append([get_condition_index(name) for name in session])
    # redefine the contrasts with the experimental conditions
    for k, contrast in doc['design_contrasts'].items():
        doc['task_contrasts'][k] = []

        for per_session in ii:
            doc['task_contrasts'][k].append(
                np.array(contrast)[per_session].tolist())

    # attempt to guess condition names with the contrast names & values
    condition_key = [np.array(ck, dtype='U32') for ck in doc['condition_key']]
    for contrast_name, session_contrasts in doc['task_contrasts'].items():
        for ck, contrast in zip(condition_key, session_contrasts):
            contrast = np.array(contrast)
            if ((contrast < 0).sum() == 0 and len(contrast.shape) == 1
                and (contrast == np.abs(contrast).max()).sum() == 1):
                ck[np.array(contrast) > 0] = contrast_name
    doc['condition_key'] = condition_key

    # reformat SPM design per session
    (doc['design_matrices'],
     doc['design_conditions'],
     doc['design_contrasts']) = make_design_from_spm(
         doc['n_scans'],
         doc['design_matrices'],
         doc['design_conditions'],
         doc['design_contrasts'])

    if fix is not None:
        doc = fix_experiment(doc, fix)[0]
    return doc
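
The swabold/wabold/abold/bold ladder above follows SPM's convention of prepending one letter per preprocessing step (s = smoothed, w = warped/normalized, a = slice-time corrected). `strip_prefix_filename` is not included in these excerpts; a hypothetical sketch consistent with how it is called:

import os

def strip_prefix_filename(path, n):
    """Hypothetical stand-in: drop the first n preprocessing-prefix
    characters from the file's basename, keeping the directory."""
    dirname, fname = os.path.split(path)
    return os.path.join(dirname, fname[n:])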
Example #8
def main():

    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)

    raw_files = project_settings['raw_input_files']

    train_filepath = data_dir + '/' + raw_files['train']
    test_filepath = data_dir + '/' + raw_files['test']

    #for filepath in [train_filepath,test_filepath]: TODO: Figure this out
    df = pd.read_csv(train_filepath,
                     sep=",",
                     header=None,
                     skiprows=1,
                     keep_default_na=False)
    #df = pd.read_csv(train_filepath,sep=",",keep_default_na=False)

    #df = df.drop(64, axis=1)  # GarageQual: highly correlated with GarageCond,
    #which may be why it was originally dropped (see the much older comment below)
    df.columns = range(df.shape[1])  #possible duplicate?

    #df = df.drop([0,38,46],axis=1) #Drop lincombo and id column
    #df.columns = range(df.shape[1])  #possible duplicate?

    X = df.iloc[:, range(df.shape[1] - 1)]
    y = df.iloc[:, df.shape[1] - 1]
    #y = pd.Series(np.repeat(1,len(y_str))).where(y_str == 'yes', other = 0)

    data_dict = OrderedDict([
        ('id', 'numeric'),
        ('MSSubClass', [
            '20', '30', '40', '45', '50', '60', '70', '75', '80', '85', '90',
            '120', '160', '180', '190', 'NA'
        ]),  #Got rid of 150
        ('MSZoning', ['C (all)', 'FV', 'RH', 'RL', 'RM',
                      'NA']),  #Got rid of RP,I,C,A
        ('LotFrontage', 'numeric'),
        ('LotArea', 'numeric'),
        ('Street', ['Grvl', 'Pave']),
        ('Alley', ['Grvl', 'Pave', 'NA']),
        ('LotShape', ['Reg', 'IR1', 'IR2', 'IR3']),
        ('LandContour', ['Lvl', 'Bnk', 'HLS', 'Low']),
        ('Utilities', ['AllPub', 'NoSeWa']),  #Dropped NoSewr,ELO
        ('LotConfig', ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3']),
        ('LandSlope', ['Gtl', 'Mod', 'Sev']),
        ('Neighborhood', [
            'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
            'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
            'NAmes', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown',
            'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
            'Veenker'
        ]),
        ('Condition1', [
            'Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe',
            'RRAe'
        ]),
        ('Condition2',
         ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosA', 'PosN',
          'RRAe']),  #dropped RRNe
        ('BldgType', ['1Fam', '2fmCon', 'Duplex', 'TwnhsE',
                      'Twnhs']),  #Does not match data description file
        ('HouseStyle', [
            '1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf',
            'SFoyer', 'SLvl'
        ]),
        ('OverallQual', 'numeric'),
        ('OverallCond', 'numeric'),
        ('YearBuilt', 'numeric'),
        ('YearRemodAdd', 'numeric'),
        ('RoofStyl', ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed']),
        ('RoofMatl', [
            'ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv',
            'WdShake', 'WdShngl'
        ]),
        ('Exterior1st', [
            'AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
            'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco',
            'VinylSd', 'Wd Sdng', 'WdShing', "NA"
        ]),  #Dropped Other,PreCast
        (
            'Exterior2nd',
            [
                'AsbShng',
                'AsphShn',
                'Brk Cmn',
                'BrkFace',
                'CBlock',
                'CmentBd',
                'HdBoard',
                'ImStucc',
                'MetalSd',
                'Other',
                'Plywood',  #Dropped PreCast
                'Stone',
                'Stucco',
                'VinylSd',
                'Wd Sdng',
                'Wd Shng',
                "NA"
            ]),  #Does not match data desc file
        ('MasVnrType',
         ['BrkCmn', 'BrkFace', 'None', 'Stone',
          'NA']),  #Does not match data desc file (NA) #Dropped CBlock
        ('MasVnrArea', 'numeric'),
        ('ExterQual', ['Ex', 'Gd', 'TA',
                       'Fa']),  #TODO: Make this numeric? #Dropped Po
        ('ExterCond', ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
        ('Foundation', ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood']),
        ('BsmtQual', ['Ex', 'Gd', 'TA', 'Fa', 'NA']),  #Dropped Po
        ('BsmtCond', ['Gd', 'TA', 'Fa', 'Po', 'NA']),  #Dropped Ex
        ('BsmtExposure', ['Gd', 'Av', 'Mn', 'No', 'NA']),
        ('BsmtFinType1', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
        ('BsmtFinSF1', 'numeric'),
        ('BsmtFinType2', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
        ('BsmtFinSF2', 'numeric'),
        ('BsmtUnfSF', 'numeric'),
        ('TotalBsmtSF', 'numeric'),
        ('Heating', ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall']),
        ('HeatingQC', ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
        ('CentralAir', ['N', 'Y']),
        ('Electrical', ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix',
                        'NA']),  #Does not match data desc file (NA)
        ('1stFlrSF', 'numeric'),
        ('2ndFlrSF', 'numeric'),
        ('LowQualFinSF', 'numeric'),
        ('GrLivArea', 'numeric'),
        ('BsmtFullBath', 'numeric'),  #TODO: should these be cat?
        ('BsmtHalfBath', 'numeric'),
        ('FullBath', 'numeric'),
        ('HalfBath', 'numeric'),
        ('Bedroom', 'numeric'),
        ('Kitchen', 'numeric'),
        ('KitchenQual', ['Ex', 'Gd', 'TA', 'Fa', 'NA']),  #Dropped Po
        ('TotRmsAbvGrd', 'numeric'),
        ('Functional',
         ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev',
          'NA']),  #TODO: make numeric?  #dropped Sal
        ('Fireplaces', 'numeric'),
        ('FireplaceQu', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('GarageType',
         ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd',
          'NA']),  #Does not match data desc file
        ('GarageYrBlt', 'numeric'),
        ('GarageFinish', ['Fin', 'RFn', 'Unf', 'NA']),
        ('GarageCars', 'numeric'),
        ('GarageArea', 'numeric'),
        ('GarageQual', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('GarageCond', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('PavedDrive', ['Y', 'P', 'N']),
        ('WoodDeckSF', 'numeric'),
        ('OpenPorchSF', 'numeric'),
        ('EnclosedPorch', 'numeric'),
        ('3SsnPorch', 'numeric'),
        ('ScreenPorch', 'numeric'),
        ('PoolArea', 'numeric'),
        ('PoolQC', ['Ex', 'Gd', 'Fa', 'NA']),  #Dropped TA
        ('Fence', ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']),
        ('MiscFeature', ['Gar2', 'Othr', 'Shed', 'TenC', 'NA']),  #dropped Elev
        ('MiscVal', 'numeric'),
        ('MoSold',
         ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
          '12']),  #TODO: numeric?
        ('YrSold', 'numeric'),
        ('SaleType', [
            'WD', 'CWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth',
            'NA'
        ]),  #dropped VWD
        ('SaleCondition',
         ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']),
        #('SalePrice','numeric')
    ])

    X.columns = data_dict.keys()

    #Bad Formatting in source file
    #X.loc[:,'BldgType'] = X.loc[:,'BldgType'].copy().where(X.loc[:,'BldgType'] != '2fmCon',other='2FmCon')
    #X.loc[:,'BldgType'] = X.loc[:,'BldgType'].copy().where(X.loc[:,'BldgType'] != 'Duplex',other='Duplx')

    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = {v: i for i, v in enumerate(col_vals)}
            # A KeyError here means the raw data contains a level that is
            # missing from the data_dict above.
            X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                lambda x: val_int_map[x])
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)  # renamed to sparse_output=False in sklearn >= 1.2
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            #try:
            #    assert 'NA' not in pd.isnull(X.loc[:,col].value_counts().index)
            #except AssertionError:
            #    assert 1 == 0
            X.loc[:, col] = X.loc[:, col].copy().apply(
                lambda x: x if x != 'NA' else np.nan)
            Xe.loc[:, col_idx] = X.loc[:, col].astype(float)
            #Xe.loc[:,col_idx] = Xe.loc[:,col_idx].copy().fillna(Xe.loc[:,col_idx].mean()) #TODO: Make this a transformer
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            int_val_map = meta_int_val_map[col]
            # Turn the one-hot column for the 'NA' level into NaNs so that
            # missingness survives the encoding.
            for code in int_val_map:
                if int_val_map[code] == 'NA':
                    try:
                        Xt.iloc[:, code] = Xt.iloc[:, code].apply(
                            lambda x: np.nan if x == 1.0 else x)
                    except IndexError:
                        Xt.loc[:, code] = 0.0
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            if Xe.shape[1] == 0:
                Xe = Xt
            else:
                Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            assert len(int_val_map) == Xt_w
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    feature_names = [
        x.replace(' ', '').replace('(', '').replace(')', '')
        for x in feature_names
    ]

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath,
                                    index=False,
                                    header=None,
                                    sep="\t")

    split_perc = project_settings['train_test_split']
    if split_perc != 1:
        s1_dfs = [
            pd.DataFrame(x) for x in train_test_split(
                Xe, y, test_size=split_perc, random_state=42)
        ]
        s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    else:
        s1_dfs = [Xe, y]
        s1_names = ["X_train_val", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']

    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(X_train_val, y_train_val, test_size=perc)
    ]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
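
One detail worth isolating from the encoding loop above: the one-hot column for a categorical 'NA' level is converted to NaN, so missingness survives the encoding and can be imputed later. A compact sketch of the same idea (written against sklearn >= 1.2, where sparse=False became sparse_output=False):

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

codes = np.array([0, 1, 2, 1]).reshape(-1, 1)  # integer-coded column; 2 means 'NA'
enc = OneHotEncoder(sparse_output=False)
Xt = pd.DataFrame(enc.fit_transform(codes))
# Column 2 is the 'NA' indicator: replace its 1.0 entries with NaN.
Xt[2] = Xt[2].where(Xt[2] != 1.0, np.nan)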
Example #9
def main():

    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)

    raw_files = project_settings['raw_input_files']

    X_filepath = data_dir + '/' + raw_files['X']

    df = pd.read_csv(X_filepath, sep=";")

    X = df.iloc[:, range(16)]
    y_str = df.iloc[:, 16]
    y = pd.Series(np.repeat(1, len(y_str))).where(y_str == 'yes', other=0)

    data_dict = OrderedDict([
        ('age', 'numeric'),
        ('job', [
            'blue-collar', 'management', 'technician', 'admin.', 'services',
            'retired', 'self-employed', 'entrepreneur', 'unemployed',
            'housemaid', 'student', 'unknown'
        ]), ('marital', ['married', 'single', 'divorced']),
        ('education', ['primary', 'secondary', 'tertiary', 'unknown']),
        ('default', ['no', 'yes']), ('balance', 'numeric'),
        ('housing', ['no', 'yes']), ('loan', ['no', 'yes']),
        ('contact', ['cellular', 'unknown', 'telephone']), ('day', 'numeric'),
        ('month', [
            'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep',
            'oct', 'nov', 'dec'
        ]), ('duration', 'numeric'), ('campaign', 'numeric'),
        ('pdays', 'numeric'), ('previous', 'numeric'),
        ('poutcome', ['success', 'failure', 'other', 'unknown'])
    ])

    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = {v: i for i, v in enumerate(col_vals)}
            # TODO: fix the pandas SettingWithCopyWarning once and for all
            X.iloc[:, col_idx] = X.iloc[:, col_idx].apply(lambda x: val_int_map[x])
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col] = X.loc[:, col]
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath,
                                    index=False,
                                    header=None,
                                    sep="\t")

    split_perc = project_settings['train_test_split']
    s1_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(Xe, y, test_size=split_perc, random_state=42)
    ]
    s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']

    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(X_train_val, y_train_val, test_size=perc)
    ]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
Example #10
def main():

    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)

    raw_files = project_settings['raw_input_files']

    X_filepath = data_dir + '/' + raw_files['X']

    df = pd.read_csv(X_filepath, sep=";", header=None, skiprows=1)

    X = df.iloc[:, range(32)]
    y = df.iloc[:, 32]
    #y = pd.Series(np.repeat(1,len(y_str))).where(y_str == 'yes', other = 0)

    data_dict = OrderedDict([
        ('school', ['GP', 'MS']),
        ('sex', ['F', 'M']),
        ('age', 'numeric'),
        ('address', ['U', 'R']),
        ('famsize', ['LE3', 'GT3']),
        ('pstatus', ['T', 'A']),
        ('medu', ['0', '1', '2', '3', '4']),
        ('fedu', ['0', '1', '2', '3', '4']),
        ('mjob', ['teacher', 'health', 'services', 'at_home', 'other']),
        ('fjob', ['teacher', 'health', 'services', 'at_home', 'other']),
        ('reason', ['home', 'reputation', 'course', 'other']),
        ('guardian', ['mother', 'father', 'other']),
        ('traveltime', ['1', '2', '3', '4']),
        ('studytime', ['1', '2', '3', '4']),
        ('failures', ['0', '1', '2', '3']),
        ('schoolsup', ['yes', 'no']),
        ('famsup', ['yes', 'no']),
        ('paid', ['yes', 'no']),
        ('activities', ['yes', 'no']),
        ('nursery', ['yes', 'no']),
        ('higher', ['yes', 'no']),
        ('internet', ['yes', 'no']),
        ('romantic', ['yes', 'no']),
        ('famrel', 'numeric'),  #make cat?
        ('freetime', 'numeric'),
        ('goout', 'numeric'),
        ('dalc', 'numeric'),
        ('walc', 'numeric'),
        ('health', 'numeric'),
        ('absences', 'numeric'),
        ('g1', 'numeric'),
        ('g2', 'numeric'),
    ])

    X.columns = data_dict.keys()

    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i)
                                for i in range(len(col_vals))])
            X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                lambda x: val_int_map[x]).copy()
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col_idx] = X.loc[:, col]  #changed
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            if Xe.shape[1] == 0:
                Xe = Xt
            else:
                Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            assert len(int_val_map) == Xt_w
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath,
                                    index=False,
                                    header=None,
                                    sep="\t")

    split_perc = project_settings['train_test_split']
    if split_perc != 1:
        s1_dfs = [
            pd.DataFrame(x) for x in train_test_split(
                Xe, y, test_size=split_perc, random_state=42)
        ]
        s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    else:
        s1_dfs = [Xe, y]
        s1_names = ["X_train_val", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']

    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(X_train_val, y_train_val, test_size=perc)
    ]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
Example #11
def main():

    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)

    raw_files = project_settings['raw_input_files']

    X_filepath = data_dir + '/' + raw_files['X']

    df = pd.read_csv(X_filepath, sep=",", skiprows=1)

    X = df.iloc[:, range(24)]
    y = df.iloc[:, 24]
    #y = pd.Series(np.repeat(1,len(y_str))).where(y_str == 'yes', other = 0)

    data_dict = OrderedDict([
        ('id', 'numeric'),
        ('amt_given_credit', 'numeric'),
        ('gender', ['1', '2']),
        ('education', ['0', '1', '2', '3', '4', '5', '6']),
        ('marital', ['0', '1', '2', '3']),
        ('age', 'numeric'),
        ('hist_apr', 'numeric'), ('hist_may', 'numeric'),
        ('hist_jun', 'numeric'), ('hist_jul', 'numeric'),
        ('hist_aug', 'numeric'), ('hist_sep', 'numeric'),
        ('bill_sep', 'numeric'), ('bill_aug', 'numeric'),
        ('bill_jul', 'numeric'), ('bill_jun', 'numeric'),
        ('bill_may', 'numeric'), ('bill_apr', 'numeric'),
        ('prepay_sep', 'numeric'), ('prepay_aug', 'numeric'),
        ('prepay_jul', 'numeric'), ('prepay_jun', 'numeric'),
        ('prepay_may', 'numeric'), ('prepay_apr', 'numeric'),
    ])

    X.columns = data_dict.keys()

    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i)
                                for i in range(len(col_vals))])
            X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                lambda x: val_int_map[x]).copy()
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col] = X.loc[:, col]
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath,
                                    index=False,
                                    header=None,
                                    sep="\t")

    split_perc = project_settings['train_test_split']
    s1_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(Xe, y, test_size=split_perc, random_state=42)
    ]
    s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']

    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [
        pd.DataFrame(x)
        for x in train_test_split(X_train_val, y_train_val, test_size=perc)
    ]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
Example #12
def main():

    data_dir = find_data_dir(project_settings)
    raw_files = project_settings['raw_input_files']

    X_filepath = data_dir + '/' + raw_files['X']
    df = pd.read_csv(X_filepath, header=None)

    X = df.iloc[:, range(6)]
    y = df.iloc[:, 6]

    data_dict = OrderedDict([
        ('buying', ['vhigh', 'high', 'med', 'low']),
        ('maint', ['vhigh', 'high', 'med', 'low']),
        ('doors', ['2', '3', '4', '5more']),
        ('persons', ['2', '4', 'more']),
        ('lug_boot', ['small', 'med', 'big']),
        ('safety', ['low', 'med', 'high']),
        #('class', ['unacc', 'acc', 'good', 'vgood'])
    ])

    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        val_int_map = {v: i for i, v in enumerate(col_vals)}
        # TODO: fix the pandas SettingWithCopyWarning once and for all
        X.iloc[:, col_idx] = X.iloc[:, col_idx].apply(lambda x: val_int_map[x])
        meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        col_idx += 1

    enc = OneHotEncoder(sparse=False)
    Xe = pd.DataFrame(enc.fit_transform(X))
    feature_names = list()
    for col in meta_int_val_map:
        int_val_map = meta_int_val_map[col]
        for i in range(len(int_val_map)):
            dim_val = int_val_map[i]
            feature_name = col + '_' + dim_val
            feature_names.append(feature_name)
    assert len(feature_names) == Xe.shape[1]

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath,
                                    index=False,
                                    header=None,
                                    sep="\t")

    split_perc = project_settings['test_train_split']
    numpy_arrays = train_test_split(Xe,
                                    y,
                                    test_size=split_perc,
                                    random_state=42)

    for dataset_name, array_i in zip(["X_test", "X_train", "y_test", "y_train"],
                                     numpy_arrays):
        df_i = pd.DataFrame(array_i)
        rel_file_path = clean_input_files[dataset_name]
        abs_file_path = data_dir + '/' + rel_file_path
        df_i.to_csv(abs_file_path, header=None, sep="\t", index=False)
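
Every pipeline above resolves its data directory through `find_data_dir(project_settings)`, whose definition is not part of these excerpts. A plausible minimal version, with the settings keys ('repo_loc', 'project_name') being guesses for illustration only:

import os

def find_data_dir(project_settings):
    """Hypothetical stand-in: build the project's data directory from
    settings keys assumed here, not the project's real schema."""
    return os.path.join(project_settings['repo_loc'], 'projects',
                        project_settings['project_name'], 'data')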