def create_features(df):
    """Add molecule-level and (molecule, key)-level aggregate features.

    For each grouping key, a statistic of a base column (``dist`` or an
    atom-1 coordinate) is broadcast back onto every row via ``transform``,
    optionally followed by derived ``_diff`` (aggregate - base) and ``_div``
    (aggregate / base) columns.  Column names and creation order are kept
    identical to the original hand-written version so downstream feature
    ordering is unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Pair-level frame with ``id``, ``molecule_name``, ``atom_index_0``,
        ``atom_index_1``, ``atom_1``, ``type``, ``type_0``, ``dist`` and
        atom-1 coordinates ``x_1``/``y_1``/``z_1``.

    Returns
    -------
    pd.DataFrame
        The same frame with the new columns added, passed through
        ``reduce_mem_usage`` (project helper) for dtype downcasting.
    """
    # Simple per-molecule counts and distance statistics.
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')

    # (secondary group key, base column, statistic, derived ops), in the exact
    # order of the original implementation.  The generated column is
    # 'molecule_{key}_{col}_{agg}', with optional '_diff'/'_div' variants.
    specs = [
        ('atom_index_0', 'x_1', 'std', ()),
        ('atom_index_0', 'y_1', 'mean', ('diff', 'div')),
        ('atom_index_0', 'y_1', 'max', ('diff',)),
        ('atom_index_0', 'y_1', 'std', ()),
        ('atom_index_0', 'z_1', 'std', ()),
        ('atom_index_0', 'dist', 'mean', ('diff', 'div')),
        ('atom_index_0', 'dist', 'max', ('diff', 'div')),
        ('atom_index_0', 'dist', 'min', ('diff', 'div')),
        ('atom_index_0', 'dist', 'std', ('diff', 'div')),
        ('atom_index_1', 'dist', 'mean', ('diff', 'div')),
        ('atom_index_1', 'dist', 'max', ('diff', 'div')),
        ('atom_index_1', 'dist', 'min', ('diff', 'div')),
        ('atom_index_1', 'dist', 'std', ('diff', 'div')),
        ('atom_1', 'dist', 'mean', ()),
        ('atom_1', 'dist', 'min', ('diff', 'div')),
        ('atom_1', 'dist', 'std', ('diff',)),
        ('type_0', 'dist', 'std', ('diff',)),
        ('type', 'dist', 'mean', ('diff', 'div')),
        ('type', 'dist', 'max', ()),
        ('type', 'dist', 'min', ()),
        ('type', 'dist', 'std', ('diff',)),
    ]
    for key, col, agg, ops in specs:
        name = f'molecule_{key}_{col}_{agg}'
        df[name] = df.groupby(['molecule_name', key])[col].transform(agg)
        if 'diff' in ops:
            df[f'{name}_diff'] = df[name] - df[col]
        if 'div' in ops:
            # May produce inf/NaN when the base column is 0 — same as original.
            df[f'{name}_div'] = df[name] / df[col]

    df = reduce_mem_usage(df)
    return df
def add_qm9_features(df):
    """Merge pre-computed QM9 molecular properties onto the pair frame.

    Reads ``../input/data.covs.pickle``, drops columns that duplicate the
    pair frame or would leak the target (``scalar_coupling_constant``),
    left-joins on ``(molecule_name, id)`` (unmatched rows get NaNs), then
    applies the project-level ``dummies`` helper to ``type``/``atom_1``
    (presumably one-hot encoding — confirm against its definition).

    Parameters
    ----------
    df : pd.DataFrame
        Pair-level frame with ``molecule_name`` and ``id`` columns.

    Returns
    -------
    pd.DataFrame
        The enriched frame.
    """
    data_qm9 = pd.read_pickle('../input/data.covs.pickle')
    # Target leakage + columns already present or unused downstream.
    to_drop = [
        'type', 'linear', 'atom_index_0', 'atom_index_1',
        'scalar_coupling_constant', 'U', 'G', 'H', 'mulliken_mean', 'r2', 'U0'
    ]
    # Original passed both columns= and axis=1; axis is redundant when
    # columns= is given, so it was removed.
    data_qm9 = data_qm9.drop(columns=to_drop)
    data_qm9 = reduce_mem_usage(data_qm9, verbose=False)
    df = pd.merge(df, data_qm9, how='left', on=['molecule_name', 'id'])
    del data_qm9  # release before the wide dummies expansion
    df = dummies(df, ['type', 'atom_1'])
    return df
def get_train_test_data(use_prev=False, prev_data_version=None, prev_trial_no=None):
    """Build (or reload) the fully feature-engineered train/test frames.

    Parameters
    ----------
    use_prev : bool
        When True, skip feature engineering and reload frames pickled by a
        previous run identified by ``prev_data_version``/``prev_trial_no``.
    prev_data_version : str | None
        Data-version tag of the previous run (required when ``use_prev``).
    prev_trial_no : str | int | None
        Trial number of the previous run (required when ``use_prev``).

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The train and test frames.

    Relies on module-level globals not visible in this chunk: ``save_path``,
    ``use_cols``, ``DEBUG``, ``DATA_VERSION``, ``TRIAL_NO`` and the helpers
    ``unpickle``/``to_pickle``, ``map_atom_info``, ``dist12``,
    ``create_features``, ``angle_feature_conv``, ``map_ob_charges``,
    ``reduce_mem_usage``.
    """
    if use_prev:
        assert prev_data_version is not None
        assert prev_trial_no is not None
    file_folder = '../input'
    # NOTE(review): train.csv is read even on the use_prev path, where it is
    # immediately overwritten by the unpickled frame — wasted I/O.
    train = pd.read_csv(f'{file_folder}/train.csv')
    if not use_prev:
        test = pd.read_csv(f'{file_folder}/test.csv')
        structures = pd.read_csv(f'{file_folder}/structures.csv')
        scalar_coupling_contributions = pd.read_csv(
            f'{file_folder}/scalar_coupling_contributions.csv')
        # train_cos = unpickle(save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # test_cos = unpickle(save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # Pre-computed feature tables produced by earlier pipeline steps.
        train_add = unpickle(save_path / "train_006.df.pkl", )
        test_add = unpickle(save_path / "test_006.df.pkl", )
        babel_train = pd.read_csv(save_path / "babel_train.csv", usecols=use_cols.babel_cols)
        babel_test = pd.read_csv(save_path / "babel_test.csv", usecols=use_cols.babel_cols)
        # NOTE(review): mutates the shared use_cols.good_columns list in place —
        # calling this function twice appends the rdkit columns twice.
        use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
        rdkit_train = pd.read_csv(save_path / "rdkit_train.csv", usecols=use_cols.rdkit_cols)
        rdkit_test = pd.read_csv(save_path / "rdkit_test.csv", usecols=use_cols.rdkit_cols)
        coulomb_train = pd.read_csv(save_path / "coulomb_interaction_train.csv")
        coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")
        bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
        bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")
        ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)
        tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv", index_col=0)
        tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv", index_col=0)
        pca_feat = unpickle(save_path / "pca_feat_df.pkl")
        ####################################################################################################
        # Feature Engineering
        # Attach the coupling-contribution columns (targets' decomposition) to train.
        train = pd.merge(
            train,
            scalar_coupling_contributions,
            how='left',
            left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
            right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
        # Join xyz coordinates for both atoms of every coupling pair.
        train = map_atom_info(train, 0, structures)
        train = map_atom_info(train, 1, structures)
        test = map_atom_info(test, 0, structures)
        test = map_atom_info(test, 1, structures)
        train_p_0 = train[['x_0', 'y_0', 'z_0']].values
        train_p_1 = train[['x_1', 'y_1', 'z_1']].values
        test_p_0 = test[['x_0', 'y_0', 'z_0']].values
        test_p_1 = test[['x_1', 'y_1', 'z_1']].values
        # Euclidean pair distance plus squared per-axis components.
        train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
        test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
        train['dist_x'] = (train['x_0'] - train['x_1'])**2
        test['dist_x'] = (test['x_0'] - test['x_1'])**2
        train['dist_y'] = (train['y_0'] - train['y_1'])**2
        test['dist_y'] = (test['y_0'] - test['y_1'])**2
        train['dist_z'] = (train['z_0'] - train['z_1'])**2
        test['dist_z'] = (test['z_0'] - test['z_1'])**2
        # First character of the coupling type string.
        train['type_0'] = train['type'].apply(lambda x: x[0])
        test['type_0'] = test['type'].apply(lambda x: x[0])
        # L1 (Manhattan) distance between the two atoms.
        train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1, ord=1)
        test['abs_dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1, ord=1)
        # NOTE(review): dist12 receives no frame argument — presumably it
        # mutates the global train/test; confirm against its definition.
        dist12('dist_xy', 'x', 'y')
        dist12('dist_xz', 'x', 'z')
        dist12('dist_yz', 'y', 'z')
        # Per-molecule atom counts, one column per element symbol.
        atom_count = structures.groupby(['molecule_name', 'atom']).size().unstack(fill_value=0)
        train = pd.merge(train, atom_count, how='left', left_on='molecule_name', right_on='molecule_name')
        test = pd.merge(test, atom_count, how='left', left_on='molecule_name', right_on='molecule_name')
        train = create_features(train)
        test = create_features(test)
        # Angle features plus all pre-computed tables, joined on id or molecule.
        angle_df_train, angle_df_test = angle_feature_conv(structures)
        train = train.merge(angle_df_train, on="id", how="left")
        test = test.merge(angle_df_test, on="id", how="left")
        train = train.merge(train_add, on="id", how="left")
        test = test.merge(test_add, on="id", how="left")
        # train = train.merge(train_cos, on="id", how="left")
        # test = test.merge(test_cos, on="id", how="left")
        train = train.merge(babel_train, on="id", how="left")
        test = test.merge(babel_test, on="id", how="left")
        train = train.merge(rdkit_train, on="id", how="left")
        test = test.merge(rdkit_test, on="id", how="left")
        train = train.merge(coulomb_train, on="id", how="left")
        test = test.merge(coulomb_test, on="id", how="left")
        train = train.merge(bond_calc_train, on="id", how="left")
        test = test.merge(bond_calc_test, on="id", how="left")
        train = train.merge(tda_radius_df, on="molecule_name", how="left")
        test = test.merge(tda_radius_df, on="molecule_name", how="left")
        train = train.merge(tda_radius_df_03, on="molecule_name", how="left")
        test = test.merge(tda_radius_df_03, on="molecule_name", how="left")
        train = train.merge(pca_feat, on="molecule_name", how="left")
        test = test.merge(pca_feat, on="molecule_name", how="left")
        train = map_ob_charges(train, ob_charges, 0)
        train = map_ob_charges(train, ob_charges, 1)
        test = map_ob_charges(test, ob_charges, 0)
        test = map_ob_charges(test, ob_charges, 1)
        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)
        # Label-encode remaining categorical columns, fitting on the union of
        # train and test values so the integer codes are consistent.
        for f in ['atom_1', 'type_0', 'type']:
            if f in use_cols.good_columns:
                lbl = LabelEncoder()
                lbl.fit(list(train[f].values) + list(test[f].values))
                train[f] = lbl.transform(list(train[f].values))
                test[f] = lbl.transform(list(test[f].values))
        # Persist the engineered frames for later reuse via use_prev=True.
        Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        to_pickle(
            save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            train)
        to_pickle(
            save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            test)
    else:
        sample_loaded = False
        prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
        if DEBUG:  # v003_033
            # In debug mode, prefer the small pre-sampled pickles if present.
            train_path = Path(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            test_path = Path(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            if train_path.exists() and test_path.exists():
                print("sample loading")
                train = unpickle(train_path)
                test = unpickle(test_path)
                sample_loaded = True
                print("sample load finish")
        if not sample_loaded:
            # Full previous dataset.  (Typo "dataest" is a runtime string and
            # is kept verbatim.)
            print(f"loading previous dataest")
            print("train loading")
            train: pd.DataFrame = unpickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic.pkl", )
            assert "scalar_coupling_constant" in train.columns
            print("test loading")
            test: pd.DataFrame = unpickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic.pkl", )
            print(f"loading finished")
        if DEBUG and not sample_loaded:
            # Down-sample and cache the sampled frames for faster debug reloads.
            n_sample = 5000
            print(f"sampling {n_sample} rows.")
            train = train.sample(n=n_sample)
            test = test.sample(n=n_sample)
            Path(
                f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
            ).mkdir(parents=True, exist_ok=True)
            to_pickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                train)
            to_pickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                test)
            print("saved.")
    ###################################################################################################
    # add additional feature for trying
    # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
# NOTE(review): this chunk begins mid-statement — the opening of the merge
# call that the `how="left")` below closes was lost in extraction.  It appears
# to be an older/alternate version of the feature-assembly section: unlike the
# block above it uses bare `good_columns` (not use_cols.good_columns), a
# hard-coded v003 ob_charges path, and a 2-argument map_ob_charges — confirm
# against the rest of the file before reusing.
how="left")  # .merge(test_cos, on="id", how="left")
# train = train.merge(train_angle_add, on="id", how="left")
# test = test.merge(test_angle_add, on="id", how="left")
train = train.merge(train_add, on="id", how="left")
test = test.merge(test_add, on="id", how="left")
train = train.merge(babel_train, on="id", how="left")
test = test.merge(babel_test, on="id", how="left")
# Open Babel charges joined onto both frames for each atom of the pair.
ob_charges = pd.read_csv("../processed/v003/ob_charges.csv", index_col=0)
train = map_ob_charges(train, 0)
train = map_ob_charges(train, 1)
test = map_ob_charges(test, 0)
test = map_ob_charges(test, 1)
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
# Label-encode categoricals with codes fit on the union of train and test.
for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))
to_pickle(save_path / f"train_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
to_pickle(save_path / f"test_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
# Model matrix and target vector.
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
def transform(self, X):
    """sklearn-style transform: derive pair-distance and per-molecule
    aggregate features on X in place, then return it.

    Expects a merged pair frame with per-atom coordinate columns
    ``x_x/y_x/z_x`` (atom 0) and ``x_y/y_y/z_y`` (atom 1), plus
    ``molecule_name``, ``type``, ``atom_x``/``atom_y``, ``atom_index_0/1``
    and pre-computed ``dist_mean_*`` / ``dist_mean_bond_*`` /
    ``dist_mean_no_bond_*`` columns — TODO confirm against the pipeline that
    feeds this transformer (its class header is outside this excerpt).
    """
    # Pairwise Euclidean distance and signed/absolute per-axis deltas.
    X['distance'] = np.linalg.norm(X[['x_x', 'y_x', 'z_x']].values - X[['x_y', 'y_y', 'z_y']].values, axis=1)
    X['x_dist'] = X['x_x'] - X['x_y']
    X['y_dist'] = X['y_x'] - X['y_y']
    X['z_dist'] = X['z_x'] - X['z_y']
    X['x_dist_abs'] = np.absolute(X['x_dist'])
    X['y_dist_abs'] = np.absolute(X['y_dist'])
    X['z_dist_abs'] = np.absolute(X['z_dist'])
    # Inverse-cube distance; infinite if distance is exactly 0.
    X['inv_distance3'] = 1 / (X['distance']**3)
    # Bounding-box extent of each molecule along each axis (atom-0 coords).
    X['dimension_x'] = np.absolute(
        X.groupby(['molecule_name'])['x_x'].transform('max') -
        X.groupby(['molecule_name'])['x_x'].transform('min'))
    X['dimension_y'] = np.absolute(
        X.groupby(['molecule_name'])['y_x'].transform('max') -
        X.groupby(['molecule_name'])['y_x'].transform('min'))
    X['dimension_z'] = np.absolute(
        X.groupby(['molecule_name'])['z_x'].transform('max') -
        X.groupby(['molecule_name'])['z_x'].transform('min'))
    # Per-molecule summaries of the precomputed dist_mean* columns.
    X['molecule_dist_mean_x'] = X.groupby(
        ['molecule_name'])['dist_mean_x'].transform('mean')
    X['molecule_dist_mean_y'] = X.groupby(
        ['molecule_name'])['dist_mean_y'].transform('mean')
    X['molecule_dist_mean_bond_x'] = X.groupby(
        ['molecule_name'])['dist_mean_bond_x'].transform('mean')
    X['molecule_dist_mean_bond_y'] = X.groupby(
        ['molecule_name'])['dist_mean_bond_y'].transform('mean')
    X['molecule_dist_range_x'] = X.groupby(['molecule_name'])['dist_mean_x'].transform('max') - \
        X.groupby(['molecule_name'])['dist_mean_x'].transform('min')
    X['molecule_dist_range_y'] = X.groupby(['molecule_name'])['dist_mean_y'].transform('max') - \
        X.groupby(['molecule_name'])['dist_mean_y'].transform('min')
    X['molecule_dist_std_x'] = X.groupby(
        ['molecule_name'])['dist_mean_x'].transform('std')
    X['molecule_dist_std_y'] = X.groupby(
        ['molecule_name'])['dist_mean_y'].transform('std')
    # Aggregates of the pair distance grouped by element symbol of each atom.
    X['molecule_atom_0_dist_mean'] = X.groupby(
        ['molecule_name', 'atom_x'])['distance'].transform('mean')
    X['molecule_atom_1_dist_mean'] = X.groupby(
        ['molecule_name', 'atom_y'])['distance'].transform('mean')
    X['molecule_atom_0_dist_std_diff'] = X.groupby([
        'molecule_name', 'atom_x'
    ])['distance'].transform('std') - X['distance']
    X['molecule_atom_1_dist_std_diff'] = X.groupby([
        'molecule_name', 'atom_y'
    ])['distance'].transform('std') - X['distance']
    X['molecule_type_dist_min'] = X.groupby(
        ['molecule_name', 'type'])['distance'].transform('min')
    X['molecule_type_dist_max'] = X.groupby(
        ['molecule_name', 'type'])['distance'].transform('max')
    X['molecule_dist_mean_no_bond_x'] = X.groupby(
        ['molecule_name'])['dist_mean_no_bond_x'].transform('mean')
    X['molecule_dist_mean_no_bond_y'] = X.groupby(
        ['molecule_name'])['dist_mean_no_bond_y'].transform('mean')
    # Aggregates of distance grouped by atom index within the molecule.
    X['molecule_atom_index_0_dist_min'] = X.groupby([
        'molecule_name', 'atom_index_0'
    ])['distance'].transform('min')  # new variable - dont include
    X['molecule_atom_index_0_dist_std'] = X.groupby([
        'molecule_name', 'atom_index_0'
    ])['distance'].transform('std')  # new variable - dont include
    X['molecule_atom_index_0_dist_min_div'] = X[
        'molecule_atom_index_0_dist_min'] / X[
            'distance']  # new variable - include
    X['molecule_atom_index_0_dist_std_div'] = X[
        'molecule_atom_index_0_dist_std'] / X[
            'distance']  # new variable - include
    X['molecule_atom_index_0_dist_mean'] = X.groupby([
        'molecule_name', 'atom_index_0'
    ])['distance'].transform('mean')  # new variable - include
    X['molecule_atom_index_0_dist_max'] = X.groupby([
        'molecule_name', 'atom_index_0'
    ])['distance'].transform('max')  # new variable - include
    X['molecule_atom_index_0_dist_mean_diff'] = X[
        'molecule_atom_index_0_dist_mean'] - X[
            'distance']  # new variable - include
    X['molecule_atom_index_1_dist_mean'] = X.groupby([
        'molecule_name', 'atom_index_1'
    ])['distance'].transform('mean')  # new variable - include
    X['molecule_atom_index_1_dist_max'] = X.groupby([
        'molecule_name', 'atom_index_1'
    ])['distance'].transform('max')  # new variable - include
    X['molecule_atom_index_1_dist_min'] = X.groupby([
        'molecule_name', 'atom_index_1'
    ])['distance'].transform('min')  # new variable - include
    X['molecule_atom_index_1_dist_std'] = X.groupby([
        'molecule_name', 'atom_index_1'
    ])['distance'].transform('std')  # new variable - dont include
    X['molecule_atom_index_1_dist_min_div'] = X[
        'molecule_atom_index_1_dist_min'] / X[
            'distance']  # new variable - include
    X['molecule_atom_index_1_dist_std_diff'] = X[
        'molecule_atom_index_1_dist_std'] - X[
            'distance']  # new variable - include
    X['molecule_atom_index_1_dist_mean_div'] = X[
        'molecule_atom_index_1_dist_mean'] / X[
            'distance']  # new variable - include
    # NOTE(review): subtracts from the *_min_div ratio, not the raw *_min —
    # every other *_diff column uses the raw aggregate.  Looks like a bug, but
    # is kept as-is because changing it would alter a trained feature.
    X['molecule_atom_index_1_dist_min_diff'] = X[
        'molecule_atom_index_1_dist_min_div'] - X[
            'distance']  # new variable - include
    # Integer-encode the element-symbol columns (fit per column, in place).
    le = LabelEncoder()
    for feat in ['atom_x', 'atom_y']:
        le.fit(X[feat])
        X[feat] = le.transform(X[feat])
    X = reduce_mem_usage(X, verbose=False)
    return X
def transform(self, X):
    """sklearn-style transform over the structures frame (one row per atom):
    estimate bonds from atomic-radius overlap against up to 29 neighbouring
    rows, and derive distance-matrix statistics.  Returns the mutated X.

    Uses np.roll to compare each atom with the next i+1 rows; ``mask``
    restricts comparisons to rows of the same molecule.  Assumes rows of a
    molecule are contiguous and molecules have at most 29 atoms — TODO
    confirm (QM9/CHAMPS structures satisfy this).
    """
    # Covalent radius per atom, looked up from the per-element table.
    atom_rad = [self.atomic_radius[x] for x in X['atom'].values]
    X['rad'] = atom_rad
    position = X[['x', 'y', 'z']].values
    p_temp = position
    molec_name = X['molecule_name'].values
    m_temp = molec_name
    radius = X['rad'].values
    r_temp = radius
    bond = 0
    # NOTE(review): dist_keep/dist_bond/dist_no_bond are accumulated or
    # initialised but never read after the loop — dead state kept verbatim.
    dist_keep = 0
    dist_bond = 0
    no_bond = 0
    dist_no_bond = 0
    # Columns 2*i / 2*i+1 hold the forward and back-rolled distance to the
    # i-th neighbour; zeros mean "no valid pair" and are NaN-masked below.
    dist_matrix = np.zeros((X.shape[0], 2 * 29))
    dist_matrix_bond = np.zeros((X.shape[0], 2 * 29))
    dist_matrix_no_bond = np.zeros((X.shape[0], 2 * 29))
    for i in range(29):
        # Shift neighbour arrays one more row; mask pairs across molecules.
        p_temp = np.roll(p_temp, -1, axis=0)
        m_temp = np.roll(m_temp, -1, axis=0)
        r_temp = np.roll(r_temp, -1, axis=0)
        mask = (m_temp == molec_name)
        dist = np.linalg.norm(position - p_temp, axis=1) * mask
        dist_temp = np.roll(np.linalg.norm(position - p_temp, axis=1) * mask, i + 1, axis=0)
        # NOTE(review): diff_radius_dist(_temp) are computed but never used.
        diff_radius_dist = (dist - (radius + r_temp)) * (dist < (radius + r_temp)) * mask
        diff_radius_dist_temp = np.roll(diff_radius_dist, i + 1, axis=0)
        # A "bond" is a pair closer than the sum of covalent radii.
        bond += (dist < (radius + r_temp)) * mask
        bond_temp = np.roll((dist < (radius + r_temp)) * mask, i + 1, axis=0)
        no_bond += (dist >= (radius + r_temp)) * mask
        no_bond_temp = np.roll((dist >= (radius + r_temp)) * mask, i + 1, axis=0)
        bond += bond_temp
        no_bond += no_bond_temp
        dist_keep += dist * mask
        dist_matrix[:, 2 * i] = dist
        dist_matrix[:, 2 * i + 1] = dist_temp
        dist_matrix_bond[:, 2 * i] = dist * (dist < (radius + r_temp)) * mask
        dist_matrix_bond[:, 2 * i + 1] = dist_temp * bond_temp
        # NOTE(review): strict '>' here (vs '>=' for no_bond above) drops
        # exact-boundary pairs from this column only — kept verbatim.
        dist_matrix_no_bond[:, 2 * i] = dist * (dist > (radius + r_temp)) * mask
        dist_matrix_no_bond[:, 2 * i + 1] = dist_temp * no_bond_temp
    X['n_bonds'] = bond
    X['n_no_bonds'] = no_bond
    # Zero entries mean "no pair": mask to NaN, then use nan-aggregations.
    X['dist_mean'] = np.nanmean(np.where(dist_matrix == 0, np.nan, dist_matrix), axis=1)
    X['dist_median'] = np.nanmedian(np.where(dist_matrix == 0, np.nan, dist_matrix), axis=1)
    # NOTE(review): the bond/no_bond statistics below take the mask from
    # dist_matrix_bond/dist_matrix_no_bond but values from the FULL
    # dist_matrix.  Since masked positions are zero in both, the selected
    # values coincide where the masks agree, but this looks like it was meant
    # to read the bond-specific matrix — confirm before reuse.
    X['dist_std_bond'] = np.nanstd(np.where(dist_matrix_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_mean_bond'] = np.nanmean(np.where(dist_matrix_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_median_bond'] = np.nanmedian(np.where(dist_matrix_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_mean_no_bond'] = np.nanmean(np.where(dist_matrix_no_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_std_no_bond'] = np.nanstd(np.where(dist_matrix_no_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_median_no_bond'] = np.nanmedian(np.where(
        dist_matrix_no_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_std'] = np.nanstd(np.where(dist_matrix == 0, np.nan, dist_matrix), axis=1)
    X['dist_min'] = np.nanmin(np.where(dist_matrix == 0, np.nan, dist_matrix), axis=1)
    X['dist_max'] = np.nanmax(np.where(dist_matrix == 0, np.nan, dist_matrix), axis=1)
    X['range_dist'] = np.absolute(X['dist_max'] - X['dist_min'])
    X['dist_bond_min'] = np.nanmin(np.where(dist_matrix_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_bond_max'] = np.nanmax(np.where(dist_matrix_bond == 0, np.nan, dist_matrix), axis=1)
    X['range_dist_bond'] = np.absolute(X['dist_bond_max'] - X['dist_bond_min'])
    X['dist_no_bond_min'] = np.nanmin(np.where(dist_matrix_no_bond == 0, np.nan, dist_matrix), axis=1)
    X['dist_no_bond_max'] = np.nanmax(np.where(dist_matrix_no_bond == 0, np.nan, dist_matrix), axis=1)
    X['range_dist_no_bond'] = np.absolute(X['dist_no_bond_max'] - X['dist_no_bond_min'])
    # Count of distinct bonded distances (rounded to 5 decimals).
    X['n_diff'] = pd.DataFrame(np.around(dist_matrix_bond, 5)).nunique(axis=1).values  # 5
    X = reduce_mem_usage(X, verbose=False)
    return X
# Top-level driver: load raw CSVs, run the two feature pipelines, and produce
# the engineered train/test frames plus the target vector y.

# Categorical feature names and per-element physical constants consumed by the
# structure-level transformer.
cat_features = ['type', 'atom_x', 'atom_y']
atomic_radius = {'H': 0.43, 'C': 0.82, 'N': 0.8, 'O': 0.78, 'F': 0.76}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}

t0 = time()  # wall-clock start; presumably reported later, outside this chunk

# Stage 1: structure-level properties; stage 2: pair-level features.
pipeline_model1 = make_pipeline(
    MoreStructureProperties(atomic_radius, electronegativity))
pipeline_model2 = make_pipeline(MakeMoreFeatures())

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# The original read structures.csv a first time before building the pipelines
# and discarded the result unused; that dead read was removed.
struct = pd.read_csv('../input/structures.csv')
structures_yukawa = pd.read_csv('../input/structures_yukawa.csv')
# Yukawa potential columns are row-aligned with structures.csv.
struct = pd.concat([struct, structures_yukawa], axis=1)
del structures_yukawa
struct = reduce_mem_usage(struct, verbose=False)
gc.collect()

train = get_features(train, struct.copy())
test = get_features(test, struct.copy())
y = train['scalar_coupling_constant']
del struct  # free the large combined frame before the next stage
gc.collect()

# Re-read the raw structures (the combined frame was deleted above) and run
# the structure-property stage, then the pair-feature stage on both frames.
struct = pd.read_csv('../input/structures.csv')
struct = pipeline_model1.fit_transform(struct)
train = feat_from_structures(train, struct)
train = pipeline_model2.fit_transform(
    train.drop(['scalar_coupling_constant'], axis=1),
    train['scalar_coupling_constant'])
test = feat_from_structures(test, struct)
test = pipeline_model2.transform(test)