def sample_train(input_file):
    """Build a class-balanced, shuffled training sample from *input_file*.

    Open questions are reservoir-sampled down to the number of closed
    questions, every closed question is appended, and the combined rows
    are shuffled in place.

    Returns a ``(header, rows)`` tuple.
    """
    n_closed = cu.get_closed_count(input_file)
    rows = reservoir_sample(cu.iter_open_questions(input_file), n_closed)
    for closed_row in cu.iter_closed_questions(input_file):
        rows.append(closed_row)
    random.shuffle(rows)
    return cu.get_header(input_file), rows
def main():
    """Split train.csv into two time-based partitions.

    Rows created strictly before 2012-07-18 go to ``train-A.csv``; the
    rest go to ``train-B.csv``.  ``to_csv`` writes the DataFrame's own
    column header, so no separate header lookup is needed (the original
    fetched one via ``cu.get_header`` and never used it — removed).
    """
    data = cu.get_dataframe("train.csv")
    # NOTE(review): sort_index(by=...) is long deprecated; modern pandas
    # spells this data.sort_values("PostCreationDate").  Left as-is for
    # compatibility with the pandas version this script was written for.
    data = data.sort_index(by="PostCreationDate")
    cutoff = datetime.datetime(2012, 7, 18)
    # assumes PostCreationDate was parsed to datetimes by cu.get_dataframe — TODO confirm
    before = data[data["PostCreationDate"] < cutoff]
    after = data[data["PostCreationDate"] >= cutoff]
    before.to_csv(os.path.join(cu.data_path, "train-A.csv"), index=False)
    after.to_csv(os.path.join(cu.data_path, "train-B.csv"), index=False)
def sample_train(input_file):
    """Assemble a shuffled, class-balanced sample, printing progress.

    Reservoir-samples the open questions down to the closed-question
    count, adds all closed questions, shuffles, and returns the file's
    header together with the shuffled rows as ``(header, rows)``.
    """
    print("get closed question count")
    target_size = cu.get_closed_count(input_file)
    print("sample open questions")
    combined = reservoir_sample(cu.iter_open_questions(input_file), target_size)
    print("get all closed questions")
    combined.extend(cu.iter_closed_questions(input_file))
    print("shuffle all the data")
    random.shuffle(combined)
    header = cu.get_header(input_file)
    return header, combined
def main():
    """Split train.csv chronologically into train (first 2/3) and test (last 1/3).

    Rows are ordered by PostCreationDate before splitting, so the test
    partition is the chronologically latest third of the data.  Both
    partitions are written under ``cu.data_path``.
    """
    print("get data")
    data = cu.get_dataframe("train.csv")
    print("sort by creation date")
    # NOTE(review): sort_index(by=...) is long deprecated; modern pandas
    # spells this data.sort_values("PostCreationDate").  Left as-is for
    # compatibility with the pandas version this script was written for.
    data = data.sort_index(by="PostCreationDate")
    print("cut off")
    header = cu.get_header("train.csv")
    # Three roughly equal chunks: first two become train, last becomes test.
    splits = np.array_split(data, 3)
    train_data = pd.concat([splits[0], splits[1]])
    test_data = splits[2]
    print("write to csv")
    # FIX(review): the original also called
    #   cu.write_sample("train_data.csv", header, train_data)
    # here, whose output was immediately overwritten by the to_csv call
    # below — redundant double write removed (presumes write_sample
    # targets cu.data_path like the to_csv below; confirm in
    # competition_utilities).
    train_data.to_csv(os.path.join(cu.data_path, "train_data.csv"), index=False, header=header)
    test_data.to_csv(os.path.join(cu.data_path, "test_data.csv"), index=False, header=header)
def main():
    """Write a fixed-size head sample of the training data.

    Reads the header and the first ``lines`` records of ``train_file``
    (presumably — exact semantics of ``cu.get_lines`` not visible here;
    TODO confirm) and writes them to ``output_file`` via
    ``cu.write_sample``.  Relies on the module-level names
    ``train_file``, ``lines`` and ``output_file``.
    """
    print("Reading the data", train_file)
    column_names = cu.get_header(train_file)
    sampled_records = cu.get_lines(train_file, lines)
    cu.write_sample(output_file, column_names, sampled_records)
# NOTE(review): fragment — this chunk opens mid-function (the tail of what
# the __main__ block below calls sample_by_class); its def line and loop
# headers lie outside the visible span, so the indentation here is a
# reconstruction — TODO confirm against the full file.
                # Emit the question's field value, or a quoted-empty
                # placeholder when the field is absent/falsy.
                values.append(q[field])
            else:
                values.append("''")
        writer.writerow(values)
        i = i + 1
    print "written out total for this class: " + str(i)
    return i

if __name__=="__main__":
    # Script entry point: write a per-class-capped sample of the input
    # file, one header row followed by sampled question rows.
    start = time.time()
    filename_in = train_file
    filename_out = os.path.join(main_path, "data", output_sampled_file)
    writer = csv.writer(open(filename_out, "w"), lineterminator="\n")
    writer.writerow(cu.get_header(filename_in))
    total_written = 0
    # output_all_entries == 0 means "cap each class"; otherwise -1 is
    # presumably an unlimited sentinel understood by sample_by_class —
    # TODO confirm.
    if output_all_entries == 0:
        per_class_limit = cu.output_rows_limit / len(question_status)
    else:
        per_class_limit = -1
    for status in question_status:
        total_written = total_written + sample_by_class(writer,status,per_class_limit)
    print "total rows written:" + str(total_written)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)
import nltk
import nltk.stem.snowball as snowball
import competition_utilities as cu

# NOTE(review): `logging` and `pd` are used below but their imports are
# not visible in this chunk — presumably imported earlier in the file.
DATA_DIR = '../data/'
RESOURCES_DIR = './resources/'
file_name = 'train.csv'

logging.basicConfig( level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s' )
log = logging.getLogger(__name__)

log.info( "π: read data" )
header = cu.get_header( file_name )
# Column 14 of each row is read as the question's open/closed status —
# presumably OpenStatus; verify against the CSV header.
open_status = [ r[14] for r in cu.get_reader( file_name ) ]

def generate_tags():
    # Group the tag columns (8..12) of every row by the row's status.
    # NOTE(review): this chunk ends without a return statement — the
    # function appears truncated here (likely continues past this span);
    # loop nesting below is reconstructed — TODO confirm.
    log.info( "π: read tags" )
    tags = [ r[8:13] for r in cu.get_reader( file_name ) ]
    log.info( "π: process tags" )
    res = {}
    for st in pd.Series( open_status ).unique():
        # res.setdefault( st, set() )
        res.setdefault( st, [] )
    for i,x in enumerate( open_status ):
        # res[x] = res[x].union( tags[i] )
        res[x].extend( tags[i] )
# NOTE(review): fragment — this chunk opens mid-function (the tail of what
# the __main__ block below calls sample_by_class); its def line and loop
# headers lie outside the visible span, so the indentation here is a
# reconstruction — TODO confirm against the full file.
            else:
                # Quoted-empty placeholder for a missing/falsy field.
                values.append("''")
        writer.writerow(values)
        i = i + 1
    print "written out total for this class: " + str(i)
    return i

if __name__ == "__main__":
    # Script entry point: write a per-class-capped sample of the input
    # file, one header row followed by sampled question rows.
    start = time.time()
    filename_in = train_file
    filename_out = os.path.join(main_path, "data", output_sampled_file)
    writer = csv.writer(open(filename_out, "w"), lineterminator="\n")
    writer.writerow(cu.get_header(filename_in))
    total_written = 0
    # output_all_entries == 0 means "cap each class"; otherwise -1 is
    # presumably an unlimited sentinel understood by sample_by_class —
    # TODO confirm.
    if output_all_entries == 0:
        per_class_limit = cu.output_rows_limit / len(question_status)
    else:
        per_class_limit = -1
    for status in question_status:
        total_written = total_written + sample_by_class(
            writer, status, per_class_limit)
    print "total rows written:" + str(total_written)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)