def load_count_vec_data(spark, hdfs_dir_name, dict_dir_name):
    """Load a previously saved count-vector dataset.

    Reads the RDD count-vector matrix from HDFS and the column dictionary
    from the local dictionary directory.

    :param spark: active Spark session used to read the matrix from HDFS.
    :param hdfs_dir_name: HDFS directory containing 'count_vector_matrix'.
    :param dict_dir_name: local directory containing 'col_dict.json'.
    :return: tuple (col_dict, count_vector_matrix).
    """
    hdfs_base = add_slash_to_dir(hdfs_dir_name)
    dict_base = add_slash_to_dir(dict_dir_name)
    matrix = load_count_vector_matrix(spark, hdfs_base + 'count_vector_matrix')
    columns = load_dict(dict_base + 'col_dict.json')
    return columns, matrix
def save_dict(output_dir_dict, out_dict, filename='col_dict.json'):
    """Serialize a dictionary to JSON inside the given directory.

    Creates the directory first if it does not exist.

    :param output_dir_dict: target directory (slash appended if missing).
    :param out_dict: JSON-serializable dictionary to write.
    :param filename: output file name (default 'col_dict.json').
    """
    output_dir_dict = add_slash_to_dir(output_dir_dict)
    output_name_dict = output_dir_dict + filename
    make_sure_path_exists(output_dir_dict)
    # Use a context manager so the file handle is closed even if
    # json.dump raises; the original leaked the open handle.
    with open(output_name_dict, mode='w') as f:
        json.dump(out_dict, f)
def save_rdd_mat(output_dir_rdd, rdd_matrix, filename='count_vector_matrix'):
    """Persist an RDD matrix as text files under the given directory.

    :param output_dir_rdd: target directory (slash appended if missing).
    :param rdd_matrix: Spark RDD to save via saveAsTextFile.
    :param filename: subdirectory name for the saved RDD
        (default 'count_vector_matrix').
    """
    target_path = add_slash_to_dir(output_dir_rdd) + filename
    rdd_matrix.saveAsTextFile(target_path)
    print('***** RDD matrix saved. *****')
def load_id_list(dir_name):
    """Load the pickled test-ID list from 'test_ids.pickle' in dir_name.

    :param dir_name: directory containing the pickle file
        (slash appended if missing).
    :return: the unpickled object (the saved ID list).
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original left the handle open.
    # NOTE(review): pickle.load is unsafe on untrusted input — this assumes
    # the file was produced by save_id_list in a trusted environment.
    with open(add_slash_to_dir(dir_name) + 'test_ids.pickle', mode='rb') as f:
        return pickle.load(f)
def save_id_list(id_list, dir_name):
    """Pickle the given ID list to 'test_ids.pickle' in dir_name.

    :param id_list: picklable object (the ID list) to save.
    :param dir_name: target directory (slash appended if missing).
    """
    # Use a context manager so the file is flushed and closed even on
    # error; the original leaked the open handle, risking a truncated file.
    with open(add_slash_to_dir(dir_name) + 'test_ids.pickle', mode='wb') as f:
        pickle.dump(id_list, f)