# Shared imports assumed by the snippets below; `utils` and `etl` are the
# course-provided helper modules, and paths follow the homework layout.
import numpy as np
import pandas as pd
import utils
import etl
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression


def my_features():
    # Get train data from svmlight_file
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Read in the test events and feature_map data
    events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # Aggregate test events data using the aggregate_events method from etl.py
    aggregated_events = etl.aggregate_events(
        events, None, feature_map, "../data/test/test_aggregated_events.csv")

    # Create the test features
    patient_features = create_test_features(aggregated_events)

    # Generate the test features file
    save_test_features(patient_features, "../deliverables/test_features.txt")

    # Get test data from the svmlight file created above
    X_test, patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")

    return X_train, Y_train, X_test
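# The two helpers called above are not defined in this snippet. What follows
# is a minimal sketch of what they plausibly do, assuming the aggregated
# frame has patient_id / feature_id / feature_value columns as produced by
# etl.aggregate_events; treat the bodies as illustrative, not the original
# implementation.
def create_test_features(aggregated_events):
    # Group each patient's (feature_id, feature_value) pairs into a dict.
    return aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()


def save_test_features(patient_features, path):
    # Write one "patient_id feature:value ..." line per patient, features
    # sorted by id, matching the deliverable format used in the snippets below.
    with open(path, 'w') as f:
        for pid in sorted(patient_features):
            pairs = sorted(patient_features[pid])
            line = str(int(pid)) + ' ' + ' '.join(
                '%d:%.6f' % (int(fid), val) for fid, val in pairs)
            f.write(line + ' \n')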
def my_features():
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    filepath = '../data/test/'
    filtered_events = pd.read_csv(filepath + 'events.csv')[[
        'patient_id', 'event_id', 'value'
    ]]
    feature_map = pd.read_csv(filepath + 'event_feature_map.csv')
    aggregated_events = etl.aggregate_events(filtered_events, None,
                                             feature_map, '')

    # Collect each patient's (feature_id, feature_value) tuples.
    patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: [tuple(row) for row in g.values]).to_dict()

    # Use the patient_id itself as a dummy "label" so that save_svmlight
    # writes "patient_id feature:value ..." lines to test_features.txt.
    events_mortality = pd.DataFrame(aggregated_events['patient_id'])
    events_mortality['label'] = aggregated_events['patient_id']
    mortality = events_mortality.set_index('patient_id')['label'].to_dict()
    etl.save_svmlight(patient_features, mortality,
                      '../deliverables/test_features.txt',
                      '../deliverables/features.txt')
    X_test = load_svmlight_file('../deliverables/test_features.txt',
                                n_features=3190)[0]

    # Select informative features with a gradient-boosting model fit on the
    # training data, then apply the same mask to both train and test.
    clf = GradientBoostingClassifier()
    clf = clf.fit(X_train, Y_train)
    model = SelectFromModel(clf, prefit=True)
    X_train_n = model.transform(X_train)
    X_test_n = model.transform(X_test)
    return X_train_n.todense(), Y_train, X_test_n.todense()
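# A minimal sketch of tuning the SelectFromModel cutoff used above: the
# threshold defaults to the mean feature importance of the fitted model,
# and get_support() reports which columns survive. Names mirror the snippet
# above; the 'median' threshold is just an illustrative alternative.
def inspect_selection(clf, X):
    model = SelectFromModel(clf, prefit=True, threshold='median')
    kept = model.get_support().sum()
    print('%d of %d features kept' % (kept, X.shape[1]))
    return model.transform(X)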
def my_features():
    test_path = '../data/test/'
    deliverables_path = '../deliverables/'

    events = pd.read_csv(test_path + 'events.csv')
    feature_map = pd.read_csv(test_path + 'event_feature_map.csv')

    # Aggregate the event values for each patient
    aggregated_events = etl.aggregate_events(events, None, feature_map,
                                             deliverables_path)

    # Build patient_features: patient_id -> list of (feature_id, feature_value)
    patient_features = {}
    for index, row in aggregated_events.iterrows():
        if not patient_features.get(row['patient_id']):
            patient_features[row['patient_id']] = [(row['feature_id'],
                                                    row['feature_value'])]
        else:
            patient_features[row['patient_id']].append(
                (row['feature_id'], row['feature_value']))

    # Write both the deliverable ("patient_id feat:val") and an svmlight
    # file with a dummy label of 1; features are sorted by id.
    line = ''
    line_svm = ''
    for key, value in sorted(patient_features.items()):
        line += str(int(key)) + ' '
        line_svm += str(1) + ' '
        value = sorted(value)
        for item in value:
            line += str(int(item[0])) + ":" + str(format(item[1], '.6f')) + ' '
            line_svm += str(int(item[0])) + ":" + str(format(item[1],
                                                             '.6f')) + ' '
        line += '\n'
        line_svm += '\n'

    deliverable2 = open(deliverables_path + 'test_features.txt', 'w')
    deliverable2.write(line)
    deliverable2.close()
    svm_file = open(deliverables_path + 'test_mymodel_features.train', 'w')
    svm_file.write(line_svm)
    svm_file.close()

    data_train = load_svmlight_file(deliverables_path +
                                    'test_mymodel_features.train',
                                    n_features=3190)
    X_test = data_train[0]
    print(X_test.shape)

    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    return X_train, Y_train, X_test
def my_features():
    train_path = '../data/train/'
    test_path = '../data/test/'

    train_events = pd.read_csv(train_path + 'events.csv')
    train_mortality = pd.read_csv(train_path + 'mortality_events.csv')
    train_feature_map = pd.read_csv(train_path + 'event_feature_map.csv')
    test_events = pd.read_csv(test_path + 'events.csv')
    test_feature_map = pd.read_csv(test_path + 'event_feature_map.csv')

    # Build the training features with the full etl pipeline.
    patient_features, mortality = etl.create_features(train_events,
                                                      train_mortality,
                                                      train_feature_map)
    etl.save_svmlight(patient_features, mortality,
                      '../others/features_svmlight.train',
                      '../others/features.train')
    X_train, Y_train = utils.get_data_from_svmlight(
        "../others/features_svmlight.train")

    # Aggregate the test events; a left merge against the training
    # mortality table labels each test patient, defaulting to 0.
    deliverables_path = '../others/'
    aggregated_events = etl.aggregate_events(
        test_events[['patient_id', 'event_id', 'value']], train_mortality,
        test_feature_map, deliverables_path)
    merged = pd.merge(test_events,
                      train_mortality,
                      on='patient_id',
                      suffixes=['_x', '_y'],
                      how='left')
    merged.fillna(0, inplace=True)
    test_patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: [tuple(row) for row in g.values]).to_dict()
    test_mortality = merged.groupby('patient_id')['label'].apply(
        lambda x: x.unique()[0]).to_dict()
    etl.save_svmlight(test_patient_features, test_mortality,
                      '../others/features_svmlight.test',
                      '../others/features.test')

    # Write the deliverable: "patient_id feature:value ..." per patient.
    deliverable1 = open('../deliverables/test_features.txt', 'w')
    for i in sorted(test_patient_features.keys()):
        deliverable1.write(str(int(i)))
        for j in sorted(test_patient_features[i]):
            deliverable1.write(' ' + str(int(j[0])) + ':' + '%.6f' % (j[1]))
        deliverable1.write(' \n')
    deliverable1.close()

    X_test, Y_test = utils.get_data_from_svmlight(
        '../others/features_svmlight.test')
    return X_train, Y_train, X_test
def my_features():
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    deliverables_path = '../deliverables/'
    test_events = pd.read_csv('../data/test/events.csv')
    test_events_map = pd.read_csv('../data/test/event_feature_map.csv')
    test_aggregated_events = etl.aggregate_events(test_events, None,
                                                  test_events_map,
                                                  deliverables_path)

    # Build patient_features for the test data:
    # patient_id -> list of (feature_id, feature_value) tuples
    test_patient_features = test_aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    # Write test_features.txt ("patient_id feat:val") and an svmlight file
    # with a dummy label of 1; features are sorted by id within a patient.
    line_svm = ''
    line_test = ''
    for key in sorted(test_patient_features):
        line_svm += '1 '
        line_test += str(int(key)) + ' '
        for tup in sorted(test_patient_features[key]):
            line_svm += str(int(tup[0])) + ':' + str("{:.6f}".format(
                tup[1])) + ' '
            line_test += str(int(tup[0])) + ':' + str("{:.6f}".format(
                tup[1])) + ' '
        line_svm += '\n'
        line_test += '\n'

    test_featuresfile = open(deliverables_path + 'test_features.txt', 'wb')
    test_svmlightfile = open(deliverables_path + 'test_mymodel_svm.train',
                             'wb')
    test_svmlightfile.write(bytes(line_svm, 'UTF-8'))  # Use 'UTF-8'
    test_featuresfile.write(bytes(line_test, 'UTF-8'))
    # Close (and flush) both files before reading the svmlight file back.
    test_featuresfile.close()
    test_svmlightfile.close()

    test_data = load_svmlight_file(deliverables_path +
                                   'test_mymodel_svm.train',
                                   n_features=3190)
    X_test = test_data[0]
    return X_train, Y_train, X_test
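# The manual string building above can also be done with sklearn's own
# writer. A minimal sketch, assuming the same test_patient_features dict
# and 1-based feature ids (hence zero_based=False); the all-ones label
# vector mirrors the hard-coded '1 ' prefix above.
from scipy.sparse import dok_matrix
from sklearn.datasets import dump_svmlight_file

def write_svmlight(patient_features, path, n_features=3190):
    pids = sorted(patient_features)
    X = dok_matrix((len(pids), n_features))
    for row, pid in enumerate(pids):
        for fid, val in patient_features[pid]:
            X[row, int(fid) - 1] = val  # shift 1-based ids to 0-based columns
    dump_svmlight_file(X.tocsr(), np.ones(len(pids)), path, zero_based=False)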
def my_features():
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    events_test = pd.read_csv('../data/test/events.csv')
    feature_map_test = pd.read_csv('../data/test/event_feature_map.csv')
    deliverables_path = '../deliverables/'
    aggregated_events_test = etl.aggregate_events(events_test, None,
                                                  feature_map_test,
                                                  deliverables_path)

    # patient_id -> list of (feature_id, feature_value) tuples
    patient_features_test = aggregated_events_test.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]]
    patient_features_test = patient_features_test.apply(
        lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    op_file = deliverables_path + 'features_svmlight.test'
    op_deliverable = deliverables_path + 'test_features.txt'
    deliverable1 = open(op_file, 'wb')
    deliverable2 = open(op_deliverable, 'wb')

    # line1: svmlight rows with dummy label 1; line2: "patient_id feat:val"
    line1 = line2 = ''
    for key in sorted(patient_features_test.keys()):
        line1 += '1 '
        line2 += str(int(key)) + ' '
        for value in sorted(patient_features_test[key]):
            line1 += str(int(value[0])) + ':' + str("{:.6f}".format(
                value[1])) + ' '
            line2 += str(int(value[0])) + ':' + str("{:.6f}".format(
                value[1])) + ' '
        line1 += '\n'
        line2 += '\n'
    deliverable1.write(bytes(line1, 'UTF-8'))  # Use 'UTF-8'
    deliverable2.write(bytes(line2, 'UTF-8'))
    # Close both files so the svmlight file is flushed before reading it.
    deliverable1.close()
    deliverable2.close()

    X_test = load_svmlight_file(deliverables_path + 'features_svmlight.test',
                                n_features=3190)[0]
    return X_train, Y_train, X_test
def my_features(filepath_train, filepath_test):
    events, mortality, feature_map = etl.read_csv(filepath_train)
    events_test = pd.read_csv(filepath_test + 'events.csv')
    feature_map_test = pd.read_csv(filepath_test + 'event_feature_map.csv')
    deliverables_path = 'C:/Users/yyan/Downloads/'

    # Run the etl pipeline on the training data.
    indx_date = etl.calculate_index_date(events, mortality, '')
    filtered_events = etl.filter_events(events, indx_date, '')
    aggregated_events = etl.aggregate_events(filtered_events, mortality,
                                             feature_map, deliverables_path)

    # Keep only features observed for at least n patients.
    feature_count = aggregated_events.groupby(by=['feature_id']).count()
    n = 600
    selected_features = list(
        feature_count[feature_count['patient_id'] >= n].index)
    aggregated_events = aggregated_events[aggregated_events['feature_id'].isin(
        selected_features)]

    df = aggregated_events.join(mortality.set_index('patient_id'),
                                on='patient_id',
                                lsuffix='',
                                rsuffix='_r')
    patient_features = df.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features.columns = ['features']
    patient_features = patient_features.groupby(
        by=['patient_id'])['features'].apply(np.array)
    mortality = df.fillna(0).drop_duplicates().set_index(
        'patient_id')['label'].to_dict()

    # Wide patient x feature matrix and per-patient labels for training.
    s = aggregated_events.pivot_table(index='patient_id',
                                      columns='feature_id',
                                      values='feature_value').fillna(0)
    l = df[['patient_id', 'label']].fillna(0).drop_duplicates()

    # Aggregate the test events: sum DIAG/DRUG values, count LAB events,
    # then normalize each feature by its maximum.
    df_test = events_test.join(feature_map_test.set_index('event_id'),
                               on='event_id',
                               lsuffix='',
                               rsuffix='_r')
    sub_sum = df_test[df_test['event_id'].str.startswith(
        ('DIAG', 'DRUG')) == True].groupby(by=['patient_id', 'idx']).sum()
    sub_count = df_test[df_test['event_id'].str.startswith(
        ('LAB')) == True].groupby(by=['patient_id', 'idx']).count()
    sub_count = sub_count[['value']]
    columns = ['patient_id', 'feature_id', 'feature_value']
    agg_events = pd.concat([sub_sum, sub_count]).reset_index()
    agg_events.columns = columns
    agg_events['feature_value'] = agg_events[
        'feature_value'] / agg_events.groupby(
            ['feature_id'])['feature_value'].transform('max')

    X_train = s
    Y_train = l.set_index('patient_id')

    # Fit an L1-regularized logistic regression; note that the "selection"
    # below only drops the first column rather than thresholding on the
    # fitted coefficients.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(X_train, Y_train.values.ravel())
    coef = clf.coef_
    selected_features = pd.DataFrame(
        coef, columns=X_train.columns).columns.delete(0)
    X_train = X_train[selected_features]
    agg_events = agg_events[agg_events['feature_id'].isin(
        selected_features)].fillna(0)

    patient_features_test = agg_events.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features_test.columns = ['features']
    patient_features_test = patient_features_test.groupby(
        by=['patient_id'])['features'].apply(np.array)
    X_test = agg_events.pivot_table(index='patient_id',
                                    columns='feature_id',
                                    values='feature_value').fillna(0)

    # Write the deliverable: "patient_id feature:value ..." per patient.
    deliverable = open(
        'C:/Users/yyan/Downloads/homework1/deliverables/test_features.txt',
        'wb')
    for k in patient_features_test.keys():
        f_k = sorted(patient_features_test[k], key=lambda tup: tup[0])
        line = utils.bag_to_svmlight(f_k) + " " + "\n"
        deliverable.write(bytes(str(k).replace('.0', "") + " " + line,
                                'UTF-8'))
    deliverable.close()
    return X_train, Y_train, X_test
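# The fitted L1 coefficients above are never actually used to prune
# features. A minimal sketch of coefficient-based selection, assuming the
# same wide X_train frame and fitted clf: features whose coefficient was
# driven to zero by the L1 penalty are dropped.
def select_nonzero_features(clf, X):
    mask = (clf.coef_ != 0).ravel()  # one boolean per feature column
    return X.loc[:, X.columns[mask]]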