def main():
    """Split ``train.csv`` chronologically into a train and a test CSV.

    The rows are ordered by ``PostCreationDate`` and cut into three
    near-equal parts with ``np.array_split``. The first two parts are
    written to ``train_data.csv`` and the last part to ``test_data.csv``,
    both under ``cu.data_path``.
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("get data")
    data = cu.get_dataframe("train.csv")

    print("sort by creation date")
    if hasattr(data, "sort_values"):
        # Newer pandas: sort_values replaces the removed sort_index(by=...).
        data = data.sort_values(by="PostCreationDate")
    else:
        # Older pandas releases only offer the original spelling.
        data = data.sort_index(by="PostCreationDate")

    print("cut off")
    header = cu.get_header("train.csv")
    splits = np.array_split(data, 3)
    train_data = pd.concat(splits[:2])  # earliest two thirds
    test_data = splits[2]  # latest third
    # cutoff = datetime.datetime(2012, 7, 18)

    print("write to csv")
    # NOTE(review): write_sample and the to_csv call below both target a file
    # named "train_data.csv"; presumably one of them is redundant -- confirm
    # where cu.write_sample writes before removing either.
    cu.write_sample("train_data.csv", header, train_data)
    train_data.to_csv(
        os.path.join(cu.data_path, "train_data.csv"), index=False, header=header
    )
    test_data.to_csv(
        os.path.join(cu.data_path, "test_data.csv"), index=False, header=header
    )
def main():
    """Sample ``train.csv`` and ``train-A.csv`` and write each sample out.

    Each source file is looked up under ``cu.data_path``, passed to
    ``sample_train``, and the returned header and sample are handed to
    ``cu.write_sample`` under the matching output name.
    """
    jobs = (
        ("train.csv", "train-sample1.csv"),
        ("train-A.csv", "train-A-sample1.csv"),
    )
    for source_name, target_name in jobs:
        source_path = os.path.join(cu.data_path, source_name)
        header, sample = sample_train(source_path)
        cu.write_sample(target_name, header, sample)
def main():
    """Write the header and selected lines of ``train_file`` to ``output_file``.

    ``train_file``, ``lines`` and ``output_file`` are not defined in this
    function; they are resolved from the enclosing (module) scope.
    """
    print("Reading the data", train_file)
    column_names = cu.get_header(train_file)
    # presumably `lines` selects which rows to pull -- confirm in cu.get_lines
    selected = cu.get_lines(train_file, lines)
    cu.write_sample(output_file, column_names, selected)
def main():
    """Sample ``train.csv`` and ``train-A.csv`` and persist both samples.

    The first sample is written with ``cu.write_sample``; the second is
    written with ``save_sample``.
    """
    full_path = os.path.join(cu.data_path, "train.csv")
    full_header, full_sample = sample_train(full_path)
    cu.write_sample("train-sample1.csv", full_header, full_sample)

    subset_path = os.path.join(cu.data_path, "train-A.csv")
    subset_header, subset_sample = sample_train(subset_path)
    # NOTE(review): this output goes through save_sample rather than
    # cu.write_sample like the one above -- confirm the difference is intended.
    save_sample("train-A-sample1.csv", subset_header, subset_sample)
def main():
    """Sample ``train_data.csv`` and write the result as ``train-sample.csv``."""
    # Disabled alternative that sampled the full training file instead:
    # header, sample = sample_train("train.csv")
    # cu.write_sample("train-sample1.csv", header, sample)
    column_names, rows = sample_train("train_data.csv")
    cu.write_sample("train-sample.csv", column_names, rows)