def get_data():
    """Prepare the module-level ``train`` and ``test`` datasets for use.

    Both globals are passed through
    ``u.normalize_test_set_classification_scheme``; ``train`` is then
    passed through ``u.reduce_dataset`` with the argument 3000. The
    processed arrays are written back to the module-level names, so
    calling this function again re-processes already-processed data.

    Returns:
        An 8-tuple ``(docs_test, y_test, docs_train, y_train,
        docs_train_subjectivity, y_train_subjectivity,
        docs_train_polarity, y_train_polarity)``. The first four items
        are single columns sliced from ``test`` and ``train``; the last
        four are the values produced by
        ``u.generate_two_part_dataset(train)``.
    """
    global train, test

    normalize = u.normalize_test_set_classification_scheme
    test = normalize(test)
    train = normalize(train)

    # Normalize data?
    # NOTE(review): 3000 presumably caps the training set size -- confirm
    # against u.reduce_dataset.
    train = u.reduce_dataset(train, 3000)

    # To compensate for poor TSV data structure: the column to read depends
    # on how many fields the first row has (index 4 when it has more than
    # four fields, index 3 otherwise).
    if len(test[0]) > 4:
        test_doc_col = 4
    else:
        test_doc_col = 3
    if len(train[0]) > 4:
        train_doc_col = 4
    else:
        train_doc_col = 3

    # The y-values are taken from the column immediately before the docs.
    docs_test = test[:, test_doc_col]
    y_test = test[:, test_doc_col - 1]
    docs_train = train[:, train_doc_col]
    y_train = train[:, train_doc_col - 1]

    (
        docs_train_subjectivity,
        y_train_subjectivity,
        docs_train_polarity,
        y_train_polarity,
    ) = u.generate_two_part_dataset(train)

    return (
        docs_test,
        y_test,
        docs_train,
        y_train,
        docs_train_subjectivity,
        y_train_subjectivity,
        docs_train_polarity,
        y_train_polarity,
    )
def read_tsv(filename):
    """Read a tab-separated file and return its rows in normalized form.

    The file content is decoded as ISO-8859-16 and split on newline
    characters. Empty lines are dropped, every remaining line is split on
    tab characters, and the resulting rows are packed into a NumPy array.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        The result of ``u.normalize_test_set_classification_scheme``
        applied to the array of rows.
    """
    # Read raw bytes and decode explicitly: this works on both Python 2 and
    # Python 3 (a text-mode read() on Python 3 returns str, which has no
    # .decode), and the with-block guarantees the handle is closed.
    # NOTE: binary mode does not translate line endings, so a carriage
    # return from a CRLF file stays at the end of the last field of a row.
    with open(filename, "rb") as handle:
        text = handle.read().decode("ISO8859-16")

    rows = [line.split("\t") for line in text.split("\n") if line]
    return u.normalize_test_set_classification_scheme(np.array(rows))