Example #1
File: EDA_backend.py Project: nogur9/PTSD
 def impute(self, df):
     if self.knn:
         knn = KNN()
         return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
     else:
         mice = IterativeImputer()
         return pd.DataFrame(mice.fit_transform(df), columns=df.columns)
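A minimal sketch of the class context this method assumes. The imports, the knn flag, and the class name are inferences from the call sites, not the project's actual code; note that scikit-learn's IterativeImputer must be enabled via its experimental flag (fancyimpute also re-exports it):

import pandas as pd
from fancyimpute import KNN
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

class EDAImputer:
    def __init__(self, knn=True):
        # knn=True -> fancyimpute KNN; knn=False -> MICE-style IterativeImputer
        self.knn = knn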
Example #2
def impute_features(scaled_features):
    # Impute missing values
    from fancyimpute import KNN

    knn = KNN(k=5)
    imputed_values = knn.fit_transform(scaled_features.values)
    imputed_features = pd.DataFrame(imputed_values,
                                    index=scaled_features.index,
                                    columns=scaled_features.columns)
    return imputed_features
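As the parameter name suggests, the features are expected to be scaled beforehand: KNN imputation is distance-based, so unscaled columns with large ranges would dominate the neighbor search.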
Example #3
File: main.py Project: sachinvarriar/ditk
 def impute(self, trained_model, input):
     """
     Loads the input table and gives the imputed table
 
 	:param trained_model: trained model returned by train function - not used in our case
 	:param input: input table which needs to be imputed
 	:return:
 		X_filled_knn: imputed table as a numpy array
     """
     # Use 3 nearest rows which have a feature to fill in each row's missing features
     # will not use trained_model as training happens during imputation
     X_incomplete = input
     knnImpute = KNN(k=3)
     X_filled_knn = knnImpute.fit_transform(X_incomplete)
     return X_filled_knn
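Since the return value is a plain NumPy array, a caller that wants labeled output would wrap it again, e.g. pd.DataFrame(X_filled_knn, columns=...) (a usage suggestion, not part of the original project).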
Example #4
def KNNfill(df, usecols, predcol, knn_k=5):

    dfcpy = df.copy().fillna(value=float('NaN')).loc[:, usecols]
    minval = dfcpy.loc[dfcpy[predcol].notnull(), (predcol)].min()
    meanval = dfcpy.loc[dfcpy[predcol].notnull(), (predcol)].mean()
    maxval = dfcpy.loc[dfcpy[predcol].notnull(), (predcol)].max()

    predictor = KNN(k=knn_k, min_value=minval, max_value=maxval)
    print("Starting Imputation, Printing NaNs for Passed DataFrame::\n{}\n".
          format(dfcpy.isnull().sum()))
    print("{} values missing for {}".format(dfcpy[predcol].isnull().sum(),
                                            predcol))
    imputed_df = pd.DataFrame(data=predictor.fit_transform(dfcpy),
                              columns=usecols)
    imputed_df['orig_' + predcol] = dfcpy.loc[:, (predcol)]
    return imputed_df
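In KNNfill, the min_value/max_value arguments clamp every imputed value to the observed range of predcol, so the solver cannot produce out-of-range estimates.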
Example #5
def impute_df(df):
    # imputer = KNN()
    imputer = KNN(k=2)
    object_types = list(df.select_dtypes(include=['object']).columns)
    num_types = list(set(df.columns) - set(object_types))
    encoders_store = {}
    for column in num_types:
        skew = df[column].skew()
        if (-1 < skew < 1):
            df[column] = df[column].fillna(df[column].mean())
        else:
            df[column] = df[column].fillna(df[column].median())
    #create a for loop to iterate through each column in the data
    for columns in object_types:
        new = encode(df[columns])
        encoders_store[columns] = new[1]
    imputed_data = pd.DataFrame(np.round(imputer.fit_transform(df)),
                                columns=df.columns)
    for columns in object_types:
        imputed_data[columns] = encoders_store[columns].inverse_transform(
            np.array(imputed_data[columns]).reshape(-1, 1))
    return imputed_data
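The encode helper used above is not included in the snippet. A plausible sketch, under the assumption that it ordinal-encodes one object column and returns the encoded values together with the fitted encoder (the original presumably also writes the encoded values back into df before fit_transform runs):

from sklearn.preprocessing import OrdinalEncoder

def encode(col):
    # Hypothetical helper (not the project's actual code): ordinal-encode a
    # single object column, keep NaNs in place, and return the fitted encoder
    # so the caller can inverse_transform after imputation.
    enc = OrdinalEncoder()
    not_null = col.notnull()
    encoded = col.copy()
    encoded.loc[not_null] = enc.fit_transform(
        col.loc[not_null].to_numpy().reshape(-1, 1)).ravel()
    return encoded, enc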
Example #6
def tune():
    print("Tuning for k= 2 to 10")
    min_rms = float('inf')
    min_k = -1
    for k in range(2, 11):
        model = KNN(k, verbose=False)  # alternatives: SoftImpute, BiScaler, or SimpleFill
        #Read Data
        knndf = pd.DataFrame([],
                             columns=[
                                 'ptnum', 'time', 'X1', 'X2', 'X3', 'X4', 'X5',
                                 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12',
                                 'X13', 'timediff_1', 'timediff_5',
                                 'timediff_median'
                             ])
        for file in os.listdir("train_data/train_with_missing"):
            if file.endswith(".csv"):
                filepath = os.path.join("train_data/train_with_missing", file)
                df_pt = pd.read_csv(filepath)
                ptnum = int(file.split(".")[0])
                df_oth = df_pt.iloc[:, 1:]
                df_time = df_pt.iloc[:, 0]
                df_pt = pd.concat([df_oth, df_time], axis=1)
                df_pt["ptnum"] = ptnum
                mins, maxs = feature_engg(df_pt, returnmin=True)
                df_numeric = df_pt.select_dtypes(include=[float]).to_numpy()
                df_filled = pd.DataFrame(model.fit_transform(df_numeric),
                                         columns=[
                                             'X1', 'X2', 'X3', 'X4', 'X5',
                                             'X6', 'X7', 'X8', 'X9', 'X10',
                                             'X11', 'X12', 'X13', 'timediff_1',
                                             'timediff_5', 'timediff_median'
                                         ])
                df_filled["ptnum"] = df_pt["ptnum"]
                df_filled["time"] = df_pt["time"]

                for i in range(13):
                    name = df_filled.iloc[:, i].name
                    max_val = maxs[name]
                    min_val = mins[name]
                    df_filled[name] = df_filled[name].map(
                        lambda x: x * (max_val - min_val) + min_val)

                knndf = pd.concat([knndf, df_filled], ignore_index=True)

        knndf = knndf.sort_values(['ptnum', 'time'],
                                  ascending=[1, 1]).reset_index(drop=True)
        # Evaluate: df2 (ground truth) and df (masked data) are assumed to be
        # loaded at module level elsewhere in the original script
        imputed = knndf.copy()
        rmsList = []
        for i in range(1, 14):
            imputed["X" + str(i) + "_groundtruth"] = df2["X" + str(i)]
            imputed["X" + str(i) + "_masked"] = df["masked_X" + str(i)]
            rms = nrms(imputed["X" + str(i) + "_masked"],
                       imputed["X" + str(i)],
                       imputed["X" + str(i) + "_groundtruth"],
                       imputed["ptnum"])
            print(k, i, rms)
            rmsList.append(rms)
        print(k, rmsList)
        avgrms = np.mean(rmsList)
        print(avgrms)
        if avgrms < min_rms:
            min_rms = avgrms
            min_k = k
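    # The scraped snippet ends without using min_k; returning the best k and
    # its score is an assumed completion, not part of the original code.
    return min_k, min_rms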
Example #7
# Import KNN from fancyimpute
from fancyimpute import KNN

# Copy diabetes to diabetes_knn_imputed
diabetes_knn_imputed = diabetes.copy(deep=True)

# Initialize KNN
knn_imputer = KNN()

# Impute using fit_transform on diabetes_knn_imputed
diabetes_knn_imputed.iloc[:, :] = knn_imputer.fit_transform(diabetes_knn_imputed)
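Assigning through .iloc[:, :] writes the imputed array back into the existing DataFrame, so the index and column labels are preserved instead of being replaced by a bare NumPy array.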
Example #8
results = calc.pandas(mols.values())
results = results.set_index(pd.Index(mols.keys(), name='CID'))
results.head()

results.shape


# +
def fix(x):
    try:
        x = float(x)
    except (TypeError, ValueError):
        x = None
    return x


results = results.applymap(fix)
# -

frac_bad = results.isnull().mean()
good = frac_bad[frac_bad < 0.3].index
results = results.loc[:, good]

from fancyimpute import KNN
knn = KNN(k=5)
results[:] = knn.fit_transform(results.values)

results.to_csv('data/snitz-mordred.csv')

results.shape
Example #9
# Imports assumed by this snippet (it mirrors fancyimpute's documented example):
import numpy as np
from fancyimpute import SimpleFill, KNN, NuclearNormMinimization, SoftImpute, BiScaler

n, m = 200, 100  # matrix dimensions: an assumption, defined elsewhere in the original
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
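# A plausible completion following fancyimpute's documented SoftImpute/BiScaler
# recipe (an assumption; the scraped snippet ends above):
X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)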
Example #10
  # (snippet begins mid-loop) keep only rows with at least 50% observed values
  if n_filled / n_total < 0.5: continue
  input_array.append(input_array_tmp[k,:])
  
input_array = np.asarray(input_array)
input_shape = input_array.shape

print('Feature array shape: %s' % str(feature_shape))
print('Input shape for imputer: %s' % str(input_shape))

# gain() and gain_parameters come from the GAIN imputation code used earlier
# in this script
imputed_array = gain(input_array.astype(float), gain_parameters)

from fancyimpute import KNN

imp_mean = KNN(5)  # alternatively: IterativeImputer()
knn_imputed = imp_mean.fit_transform(input_array)  # result not used below
#imp_mean.transform(input_array)

df = pd.DataFrame(imputed_array)

plt.matshow(df.corr())
#plt.title('Correlations between laboratory parameters over time', y=-0.01)
plt.xticks(range(0,len(feature_rows)), feature_rows, rotation='vertical',fontsize='6')
plt.yticks(range(0,len(feature_rows)), feature_rows, fontsize='6')
plt.gca().xaxis.set_ticks_position('top')
plt.colorbar()

plt.tight_layout(pad=2)
plt.show()

imputed_array = imputed_array.transpose()
Example #11
X_new = X_new.drop(['Purpose_nan', 'Housing_nan'], axis=1)
Y_new = pd.get_dummies(Y, drop_first=True)
"""print(Y_new)
print(X_new)"""

# Treat the Job feature separately: it holds numbers but needs to be one-hot encoded
F = df.loc[:, 'Job']
F = pd.get_dummies(F, prefix='Job')
F = F.drop(['Job_0'], axis=1)
X_new = pd.concat([F, X_new], axis=1)  #Adding the job feature back to X_new

# Combine X_new, Y_new, and the 'Saving accounts' / 'Checking account' features
# so the missing values in those two columns can be imputed
df_new = pd.concat([X_new, Y_new, df_1, df_2], axis=1)

# Impute the missing data (imputer is assumed to be the KNN imputer created
# earlier in the script)
encode_data = pd.DataFrame(np.round(imputer.fit_transform(df_new)),
                           columns=df_new.columns)

#after imputing the missing data, getting dummy variables from the imputed features
F1 = encode_data.loc[:, 'Saving accounts']
F1 = pd.get_dummies(F1, prefix='Saving accounts')
F1 = F1.drop(['Saving accounts_3.0'], axis=1)
F2 = encode_data.loc[:, 'Checking account']
F2 = pd.get_dummies(F2, prefix='Checking account')
F2 = F2.drop(['Checking account_2.0'], axis=1)
encode_data = encode_data.drop(['Saving accounts', 'Checking account'], axis=1)

# Reassemble the full dataset by combining all the features
encode_data = pd.concat([encode_data, F1, F2], axis=1)
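Rounding the imputed matrix with np.round ensures the encoded categorical columns land back on valid integer codes before they are expanded into dummy variables; the trade-off is that genuinely continuous columns get rounded as well.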
Example #12
class Preprocess():
    def __init__(self, rootDir, word_dict, inv_words):
        self.rootDir_ = rootDir
        self.class_words_dict_ = word_dict
        self.inv_words_dict_ = inv_words
        self.imputer_ = KNN(k=1)
        self.enc_ = OrdinalEncoder()
        self.spanish_stemmer_ = SnowballStemmer('spanish')
        self.special_words_ = ['piez']
        self.stopwords_spanish_ = stopwords.words('spanish')
        self.df_ = pd.DataFrame(columns=[
            'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
            'Empaque', 'Contenido', 'UnidadMedida', 'LocalidadGeografica',
            'Fuente', 'precio', 'fecha'
        ])

        self.data_ = self.import_data()
        self.add_stop_words()
        self.preprocess('descripcion')
        self.categorize()
        self.append_df()
        self.join_marca_submarca_drop_null()
        self.imputation()
        self.inv_words_funct()
        self.drop_unused_columns()

    def import_data(self):
        '''
        Import all CSV files from a directory (no subfolders).
        '''
        data = {}
        path = self.rootDir_ + '*.csv'
        for fname in glob.glob(path):
            key = fname.split('\\')[1].split('.csv')[0]  # Windows path separator assumed
            data[key] = pd.read_csv(fname, index_col=0)
            try:
                data[key]['fecha'] = pd.to_datetime(data[key]['fecha'],
                                                    format='%d-%m-%Y')
            except KeyError:
                print("Check datetime values; no 'fecha' column was found.")
        return data

    def add_stop_words(self):
        new_stop_words = ['s']
        self.stopwords_spanish_.extend(new_stop_words)

        return self

    def tokenize(self, data):
        '''
        Input: the complete string
        Output: the tokenized string as a list of strings
        '''
        return word_tokenize(data)

    def remove_stopwords_punctuation(self, data):
        clean_description = []
        for word in data:
            if (word not in self.stopwords_spanish_
                    and word not in string.punctuation):
                clean_description.append(word)

        return clean_description

    def remove_accents(self, data):

        return [unidecode.unidecode(word) for word in data]

    def lowercasing(self, data):

        return [word.lower() for word in data]

    def stemming(self, data):

        return [self.spanish_stemmer_.stem(word) for word in data]

    def remove_duplicates(self, data):
        seen = set()
        result = []
        for item in data:
            if item not in seen:
                seen.add(item)
                result.append(item)

        return result

    def split_number_letter(self, data):
        result = []
        for word in data:
            match = re.match(r'([0-9]+)([a-z]+)', word, re.I)
            if match:
                for element in match.groups():
                    result.append(element)
            else:
                result.append(word)
        return result

    def remove_special_char(self, data):
        result = []
        for word in data:
            if (word not in self.special_words_):
                result.append(word)
        return result

    def preprocess(self, column_name):
        for values in self.data_.values():
            values[column_name] = values.apply(
                lambda row: self.tokenize(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_accents(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.lowercasing(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.split_number_letter(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_stopwords_punctuation(row[column_name]
                                                              ),
                axis=1)
            values[column_name] = values.apply(
                lambda row: self.stemming(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_special_char(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_duplicates(row[column_name]), axis=1)
        return self

    def append_df(self):
        for element in self.data_.keys():
            self.df_ = pd.concat([self.df_, self.data_.get(element)],
                                 ignore_index=True)

        return self

    def categorize(self):
        for base_key in self.data_.keys():
            self.data_.get(base_key).reset_index(drop=True, inplace=True)
            columns_to_add = [
                'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
                'Empaque', 'Contenido', 'UnidadMedida'
            ]
            for i in columns_to_add:
                self.data_.get(base_key)[i] = np.nan
            self.data_.get(base_key)['Fuente'] = base_key
            for row in range(len(self.data_.get(base_key))):
                for element in self.data_.get(base_key)['descripcion'][row]:
                    # assign the token to every category whose word list
                    # contains it (avoids chained .loc assignment)
                    for col in columns_to_add:
                        if element in self.class_words_dict_.get(col):
                            self.data_.get(base_key).loc[row, col] = element

        return self

    def join_marca_submarca_drop_null(self):
        self.df_['Submarca'].fillna('', inplace=True)
        self.df_['Marca'] = self.df_['Marca'] + self.df_['Submarca']
        self.df_.drop(['Submarca'], axis=1, inplace=True)
        self.df_.dropna(subset=['Tipo'], inplace=True)

        return self

    def imputation(self):
        self.df_.fillna('', inplace=True)
        self.df_.reset_index(drop=True, inplace=True)
        # default units for product types that commonly lack one
        default_units = {'huev': 'pz', 'tortill': 'pz', 'papel': 'roll',
                         'lech': 'l'}
        for row in range(len(self.df_)):
            if self.df_.loc[row, 'UnidadMedida'] == '':
                tipo = self.df_.loc[row, 'Tipo']
                if tipo in default_units:
                    self.df_.loc[row, 'UnidadMedida'] = default_units[tipo]
            if self.df_.loc[row, 'Contenido'] == '':
                self.df_.loc[row, 'Contenido'] = '1'
            if self.df_.loc[row, 'Marca'] == '':
                self.df_.loc[row, 'Marca'] = 'no_especificado'
            # join the non-empty Tipo_2 / Tipo_3 / Tipo_4 labels with '_';
            # fall back to 'no_especificado' when all three are empty
            parts = [self.df_.loc[row, col]
                     for col in ('Tipo_2', 'Tipo_3', 'Tipo_4')]
            joined = '_'.join(p for p in parts if p != '')
            self.df_.loc[row, 'Tipo_2'] = joined if joined else 'no_especificado'
        self.knn_imputer_for_empaque()

        return self

    def knn_imputer_for_empaque(self):
        data = self.df_.copy(deep=True)
        data.loc[data['Empaque'] == '', 'Empaque'] = np.nan  # avoid chained assignment
        # initialize variables
        ordinal_enc_dict = {}
        columns_to_encode = ['Tipo', 'Tipo_2', 'Empaque']
        # loop over columns to encode
        for col_name in data[columns_to_encode]:
            # create ordinal encoder for the column
            ordinal_enc_dict[col_name] = OrdinalEncoder()
            # select the non-null values in the column
            col = data[col_name]
            col_not_null = col[col.notnull()]
            reshaped_vals = col_not_null.values.reshape(-1, 1)
            # encode the non-null values of the column
            encoded_vals = ordinal_enc_dict[col_name].fit_transform(
                reshaped_vals)
            # store the values to non-null values of the column in data
            data.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
        # impute with KNN on the encoded columns, rounding back to integer codes
        data[columns_to_encode] = np.round(
            self.imputer_.fit_transform(data[columns_to_encode]))
        for col_name in data[columns_to_encode]:
            # reshape the data
            reshaped = data[col_name].values.reshape(-1, 1)
            # perform inverse transformation of the ordinally encoded columns
            data[col_name] = ordinal_enc_dict[col_name].inverse_transform(
                reshaped)

        self.df_ = data.copy(deep=True)

        return self

    def search_in_dict(self, data):
        for key, value in self.inv_words_dict_.items():
            if data in value:
                return key
        return data

    def inv_words_funct(self):
        column_name = [
            'Tipo', 'Tipo_2', 'Marca', 'Empaque', 'UnidadMedida', 'Contenido'
        ]
        for element in column_name:
            self.df_[element] = self.df_.apply(
                lambda row: self.search_in_dict(row[element]), axis=1)
        return self

    def drop_unused_columns(self):
        columns_to_drop = [
            'descripcion', 'producto', 'LocalidadGeografica', 'Tipo_3',
            'Tipo_4'
        ]
        self.df_.drop(columns_to_drop, axis=1, inplace=True)

        return self
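The knn_imputer_for_empaque method above shows the standard recipe for KNN-imputing categorical columns: ordinal-encode the non-null values, impute on the numeric codes, round to the nearest valid code, then inverse-transform back to the original labels.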
Example #13
with open(args.config) as f:
    config = json.load(f)

data_path = config["data_path"]  # ground-truth data
corrupt_data_path = config["corrupt_data_path"]  # data containing missing values
n_neighbor = config["n_neighbor"]
trial_ind = config["trial_ind"]

# LOAD DATA
data = pd.read_csv(data_path).values
data_missing = pd.read_csv(corrupt_data_path).values

n_col = data_missing.shape[1]  # dimensionality of the data space
non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
na_ind = np.where(np.isnan(data_missing))
na_count = len(na_ind[0])

knnImpute = KNN(k=n_neighbor)
print("Start KNN")
# older fancyimpute versions used .complete() instead of .fit_transform():
#X_impute_KNN = knnImpute.complete(Xdata_Missing)
data_impute_KNN = knnImpute.fit_transform(data_missing)
print("KNN finished")
ReconstructionErrorKNN = sum(
    ((data_impute_KNN[na_ind] - data[na_ind])**2)**0.5) / na_count
print('Reconstruction error (KNN):')
print(ReconstructionErrorKNN)

np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_KNN.csv",
           data_impute_KNN, delimiter=",")
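Note: since ((x)**2)**0.5 is simply |x|, the reconstruction error printed here is the mean absolute error over the missing entries, not an RMSE.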
Example #14
#%%
# dummify

alldata = pd.get_dummies(alldata, columns=['Age'], prefix='In_AgeGRP')

#%%
# Age - Use Sex, Pclass, Parch, SibSp, Prefix to Fill Age

minage = alldata.loc[alldata['Age'].notnull(), ('Age')].min()
maxage = alldata.loc[alldata['Age'].notnull(), ('Age')].max()
meanage = alldata.loc[alldata['Age'].notnull(), ('Age')].mean()

cols = ['Sex', 'Pclass', 'Parch', 'SibSp', 'Prefix', 'Age']
targetdf = alldata.fillna(value=float('NaN')).copy().loc[:, cols]
predictage = KNN(k=5, min_value=minage, max_value=maxage)
imp_agesdf = pd.DataFrame(data=predictage.fit_transform(targetdf),
                          columns=cols)

imp_agesdf['orig_ages'] = targetdf.loc[:, ('Age')]
imp_agesdf.loc[imp_agesdf['orig_ages'].isnull()].head()

# stats over the imputed result, including the newly filled ages
imp_minage = imp_agesdf['Age'].min()
imp_maxage = imp_agesdf['Age'].max()
imp_meanage = imp_agesdf['Age'].mean()

print("Min:{}->{},Mean:{}->{},Max:{}->{}".format(minage, imp_minage, medianage,
                                                 imp_medianage, maxage,
                                                 imp_maxage))

#%%
# SibSp and Parch -> Family Size
Example #15
from fancyimpute import KNN

KNN_imputer = KNN()

num_features = [
    'cod_municipio', 'feature_16', 'feature_17', 'feature_13', 'feature_14',
    'feature_15', 'feature_18', 'feature_04', 'feature_06', 'feature_07',
    'feature_09', 'feature_10'
]
df_knn = cenarios.copy()
df_knn = KNN_imputer.fit_transform(df_knn[num_features])
df_knn
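# Note: fit_transform returns a NumPy array, so df_knn is no longer a DataFrame
# at this point. A suggested tweak (not in the original) to keep the labels:
# df_knn = pd.DataFrame(df_knn, columns=num_features, index=cenarios.index)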
Example #16
# +
# Remove these from the Snitz data
df_snitz_dragon = df_dragon.loc[snitz_cids.difference(no_dragon)]

for nd in no_dragon:
    df_snitz_dragon.loc[nd, :] = 0

# +
# Remove bad features (too many NaNs) and impute remaining NaNs
frac_bad = df_snitz_dragon.isnull().mean()
good = frac_bad[frac_bad < 0.3].index
df_snitz_dragon = df_snitz_dragon.loc[:, good]

knn = KNN(k=5)
df_snitz_dragon[:] = knn.fit_transform(df_snitz_dragon.values)

# +
#from olfactometer.odorants import from_cids
#pubchem_data = from_cids([int(x) for x in snitz_cids])
#pd.DataFrame.from_dict(pubchem_data).set_index('CID').to_csv('data/snitz-odorant-info.csv')

# +
#df_snitz_mordred = pd.read_csv('data/snitz-mordred.csv').set_index('CID')
#df_snitz_mordred[:] = mms.fit_transform(df_snitz_mordred.values)
#df_snitz_mordred.head()
# -

df_snitz_features = df_snitz_dragon

# Normalize every molecule to have unit norm (to be unit vector in feature space)
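# A plausible next step matching the comment above (an assumption, left
# commented out):
# import numpy as np
# df_snitz_features[:] = df_snitz_features.values / np.linalg.norm(
#     df_snitz_features.values, axis=1, keepdims=True)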