def main():
    """Compute the train features from train_dropped.csv and questions.csv.

    Both tables are handed to ``get_features``; the result is stored via
    ``save_features`` with ``data_type='train'``.
    """
    train_path = '../data/processed/train_dropped.csv'
    questions_path = const.INPUT_DATA_DIR / 'questions.csv'

    train_df = pd.read_csv(train_path, dtype=const.DTYPE)
    question_df = pd.read_csv(questions_path)

    features = get_features(train_df, question_df)
    save_features(features, data_type='train')
def main():
    """Compute and save train/test features from the feather inputs.

    ``get_features`` receives both frames and returns a (train, test) pair;
    each half is saved under its own ``data_type``.
    """
    train_df = pd.read_feather('../data/input/train_data.feather')
    test_df = pd.read_feather('../data/input/test_data.feather')

    feature_pair = get_features(train_df, test_df)
    train_features_df, test_features_df = feature_pair

    save_features(train_features_df, data_type='train')
    save_features(test_features_df, data_type='test')
def main():
    """Compute and save train/test features from the CSV inputs.

    Reads train_concated.csv and test.csv, passes both to ``get_features``
    and saves the two returned frames as 'train' and 'test'.
    """
    train_path = '../data/input/train_concated.csv'
    test_path = '../data/input/test.csv'

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_features_df, test_features_df = get_features(train_df, test_df)

    save_features(train_features_df, data_type='train')
    save_features(test_features_df, data_type='test')
def main():
    """Attach each row's ``part`` to train.csv, then compute and save features.

    ``part`` is looked up from questions.csv by mapping ``content_id``
    through a question_id -> part dictionary; ids missing from that
    dictionary come out as NaN (standard ``Series.map`` behaviour).
    """
    train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                           dtype=const.DTYPE)
    questions_df = pd.read_csv('../data/input/questions.csv')

    # Two-column array of (question_id, part) pairs -> lookup dictionary.
    id_part_pairs = questions_df[['question_id', 'part']].values
    part_by_question = dict(id_part_pairs)
    train_df['part'] = train_df['content_id'].map(part_by_question)

    features = get_features(train_df)
    save_features(features, data_type='train')
def main():
    """Left-join questions.csv onto train_dropped.csv, then build features.

    The questions table's ``question_id`` column is renamed to
    ``content_id`` so it can serve as the join key. The merged frame goes
    to ``get_features`` and the result is saved as 'train'.
    """
    train_df = pd.read_csv('../data/processed/train_dropped.csv',
                           dtype=const.DTYPE)
    question_df = pd.read_csv(const.INPUT_DATA_DIR / 'questions.csv',
                              dtype=const.DTYPE)

    # Align the key name with the train table before joining.
    question_df = question_df.rename(columns={'question_id': 'content_id'})
    merged_df = train_df.merge(question_df, on='content_id', how='left')

    features = get_features(merged_df)
    save_features(features, data_type='train')
def main():
    """Join questions.csv onto train.csv and build features from four columns.

    After a left join on ``content_id`` (the questions table's
    ``question_id`` renamed), only ``user_id``, ``content_id``, ``part``
    and ``prior_question_elapsed_time`` are passed to ``get_features``.
    """
    train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                           dtype=const.DTYPE)
    question_df = pd.read_csv(const.INPUT_DATA_DIR / 'questions.csv')

    # Align the key name with the train table before joining.
    question_df = question_df.rename(columns={'question_id': 'content_id'})
    merged_df = train_df.merge(question_df, on='content_id', how='left')

    feature_input_cols = [
        'user_id',
        'content_id',
        'part',
        'prior_question_elapsed_time',
    ]
    features = get_features(merged_df[feature_input_cols])
    save_features(features, data_type='train')
def main():
    """Compute features on train+test stacked together, then split and save.

    The two frames are concatenated row-wise with a fresh index, passed to
    ``get_features`` as one frame, and the result is cut back by position:
    the first ``len(train_df)`` rows are saved as 'train', the rest as
    'test'.
    """
    train_df = dh.load('../data/input/train_concated.csv')
    test_df = dh.load('../data/input/test.csv')

    stacked_df = pd.concat(
        [train_df, test_df],
        axis=0,
        sort=False,
        ignore_index=True,
    )
    stacked_features = get_features(stacked_df)

    # Positional split point between the train rows and the test rows.
    n_train = len(train_df)
    train_features_df = stacked_features.iloc[:n_train]
    test_features_df = stacked_features.iloc[n_train:]

    save_features(train_features_df, data_type='train')
    save_features(test_features_df, data_type='test')
def main():
    """Add a capped attempt count and lecture columns, then build features.

    Steps, in order:
      1. read train_dropped.csv and lectures.csv (``lecture_id`` renamed to
         ``content_id``);
      2. read the previously saved ``dropped___attempt_c`` feature and store
         it on the train frame, keeping values <= 3 and replacing every
         other value with 4;
      3. left-join the lectures table on ``content_id``;
      4. run ``get_features`` and save the result as 'train'.
    """
    train_df = pd.read_csv('../data/processed/train_dropped.csv',
                           dtype=const.DTYPE)
    lectures_df = pd.read_csv(const.INPUT_DATA_DIR / 'lectures.csv')
    lectures_df = lectures_df.rename(columns={'lecture_id': 'content_id'})

    saved_feature_df = pd.read_feather(
        '../features/dropped___attempt_c_train.feather')
    attempt_c = saved_feature_df['dropped___attempt_c'].values

    # Values that fail the "<= 3" test collapse into the single bucket 4.
    capped_attempt_c = np.where(attempt_c <= 3, attempt_c, 4)
    train_df['dropped___attempt_c'] = capped_attempt_c

    merged_df = train_df.merge(lectures_df, on='content_id', how='left')

    features = get_features(merged_df)
    save_features(features, data_type='train')
def main():
    """Attach image height/width tables to train/test, then build features.

    The train size table is train_image_size.csv stacked with the
    ``image_name``/``height``/``width`` columns of train_2019.csv; the test
    size table is test_image_size.csv. Each is left-joined on
    ``image_name`` before the frames go to ``get_features``.
    """
    train_df = dh.load('../data/input/train_concated.csv')
    test_df = dh.load('../data/input/test.csv')

    size_2020_df = pd.read_csv('../data/input/train_image_size.csv')
    size_2019_df = pd.read_csv(
        '../data/input/train_2019.csv',
        usecols=['image_name', 'height', 'width'],
    )
    train_size_df = pd.concat(
        [size_2020_df, size_2019_df],
        axis=0,
        sort=False,
        ignore_index=True,
    )
    test_size_df = pd.read_csv('../data/input/test_image_size.csv')

    train_df = pd.merge(train_df, train_size_df, on='image_name', how='left')
    test_df = pd.merge(test_df, test_size_df, on='image_name', how='left')

    train_features_df, test_features_df = get_features(train_df, test_df)

    save_features(train_features_df, data_type='train')
    save_features(test_features_df, data_type='test')
def main():
    """Compute features from train.csv and save them as 'train'."""
    train_path = const.INPUT_DATA_DIR / 'train.csv'
    train_df = pd.read_csv(train_path, dtype=const.DTYPE)

    features = get_features(train_df)
    save_features(features, data_type='train')
def main():
    """Compute features from train_dropped.csv and save them as 'train'."""
    train_path = '../data/processed/train_dropped.csv'
    train_df = pd.read_csv(train_path, dtype=const.DTYPE)

    features = get_features(train_df)
    save_features(features, data_type='train')