def save_file_vector(file_path, vocab, out_file_path, header):
    """Write the lowbow vectors of a single file to a CSV file.

    The output starts with ``header`` as its first row, followed by one row
    per pivot position ``mu``; each row holds the values returned by
    ``lowbow_single`` for that position.

    Args:
        file_path: Path of the input file, handed to ``preprocess_file``.
        vocab: Vocabulary passed through to ``preprocess_file`` and
            ``lowbow_single``.
        out_file_path: Path of the CSV file to write; an existing file is
            truncated.
        header: Sequence written as the first CSV row.
    """
    log.debug("Saving vector for file {0}".format(file_path))
    # NOTE(review): on Python 3 the csv docs recommend opening the file with
    # newline=''; left unchanged because the target Python version is not
    # visible from this block.
    with open(out_file_path, 'w+') as out_file_handle:
        out_file = csv.writer(out_file_handle)
        out_file.writerow(header)
        data = preprocess_file(file_path, vocab)
        # Exactly `pivot_count` evenly spaced positions strictly inside
        # (0, 1). linspace is used instead of arange with a fractional step
        # because the length of such an arange is not numerically stable and
        # can include an extra element next to 1.0.
        # Assumes parameters.pivot_count is an integer -- TODO confirm.
        pivots = np.linspace(0, 1, parameters.pivot_count + 2)[1:-1]
        for mu in pivots:
            result = lowbow_single(data, vocab, mu, parameters.c,
                                   parameters.sigma)
            out_file.writerow(list(result))
            log.debug("Got vector for mu={0}".format(mu))
def save_dataset_total(data_class, vocab):
    """Write one CSV with the lowbow feature rows of a whole data class.

    Walks ``<data_root>/<topic>/<data_class>/<label>/<file>`` for every topic
    in ``parameters.topics`` (topics whose directory does not exist are
    skipped) and writes the result to
    ``<out_root>/<data_class>_data.csv``. Each file contributes one row: the
    ``lowbow_single`` vectors for all pivot positions concatenated in order,
    followed by the name of the label directory. No header row is written.

    Args:
        data_class: Name of the sub-directory to process inside each topic;
            also used to build the output file name.
        vocab: Vocabulary passed through to ``preprocess_file`` and
            ``lowbow_single``.

    Raises:
        OSError: from ``os.listdir`` if a topic directory exists but has no
            ``data_class`` sub-directory.
    """
    log.debug('total: starting with {0}ing data'.format(data_class))
    csv_out_path = os.path.join(parameters.out_root,
                                '{0}_data.csv'.format(data_class))

    # Only used for progress logging. Counts every file below data_root whose
    # directory path contains `data_class`, regardless of parameters.topics.
    file_count = sum(
        len(files)
        for root, _sub_folders, files in os.walk(parameters.data_root)
        if data_class in root
    )

    # Exactly `pivot_count` evenly spaced positions strictly inside (0, 1).
    # linspace is used instead of arange with a fractional step because the
    # length of such an arange is not numerically stable; an extra pivot
    # would change the number of feature columns in every row.
    # Assumes parameters.pivot_count is an integer -- TODO confirm.
    pivots = np.linspace(0, 1, parameters.pivot_count + 2)[1:-1]

    cnt = 1
    # NOTE(review): on Python 3 the csv docs recommend opening the file with
    # newline=''; left unchanged because the target Python version is not
    # visible from this block.
    with open(csv_out_path, 'w+') as out_file_handle:
        out_file = csv.writer(out_file_handle)
        for topic in parameters.topics:
            path = os.path.join(parameters.data_root, topic)
            if not os.path.exists(path):
                continue
            actual_path = os.path.join(path, data_class)
            for label in os.listdir(actual_path):
                label_path = os.path.join(actual_path, label)
                for file_name in os.listdir(label_path):
                    log.debug("Saving vectors for file {0}, {1}/{2}".format(
                        file_name, cnt, file_count))
                    file_path = os.path.join(label_path, file_name)
                    data = preprocess_file(file_path, vocab)
                    attributes = []
                    for mu in pivots:
                        attributes.extend(
                            lowbow_single(data, vocab, mu, parameters.c,
                                          parameters.sigma))
                        log.debug("Got vector for mu={0}".format(mu))
                    # The label directory name is the last column of the row.
                    out_file.writerow(attributes + [label])
                    cnt += 1