import glob
import os
import numpy as np
from multiprocessing import Pool

# CsvUtility, WordIndexMap, get_sgns_embedding, __filter_events and
# get_corpus_contend_thread are project-local helpers assumed to be importable
# from elsewhere in this repository.


def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path,
                            seq_max, vec_len, sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print "word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events)

    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))

    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)

    # feature_tensor: summed SGNS vectors per (sample, sequence step);
    # feature_count_tensor: number of embedded events contributing to each step;
    # result_tensor: multi-hot labels over the prediction dictionary.
    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))

    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    # Count event codes that have no SGNS embedding.
                    if event_code in find_nan:
                        find_nan[event_code] += 1
                    else:
                        find_nan[event_code] = 1
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1
        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(filter_event))
    print 'words not in docs:', len(find_nan)

    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path,
                                  'feature2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path,
                                  'predict2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path,
                                   'feature_matrix_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path,
                                   'result_matrix_seq_embedding' + str(vec_len) + '.csv')
    return feature_tensor, feature_count_tensor, result_tensor
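
# Follow-up sketch (illustrative, not part of the original code): the feature
# tensor returned above holds *summed* SGNS vectors per sequence step, so a
# natural next step is to average each step by its event count. np.maximum
# guards against division by zero for steps with no embedded events.
def average_sequence_embeddings(feature_tensor, feature_count_tensor):
    counts = np.maximum(feature_count_tensor, 1)       # (n_samples, seq_max)
    return feature_tensor / counts[:, :, np.newaxis]   # (n_samples, seq_max, vec_len)
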
def __change_Movie_word_index__(self, gamma, word2index_pickle):
    # Remap the word-indexed columns of gamma from gensim dictionary ids to the
    # indices stored in word2index_pickle, accumulating columns that map to the
    # same target index.
    feature_word2id = CsvUtility.read_pickle(word2index_pickle, 'r')
    print 'feature size: ', len(feature_word2id)
    change_index_result = np.zeros((gamma.shape[0], len(feature_word2id)))
    for j in range(gamma.shape[1]):
        new_index = feature_word2id[self.dictionary.__getitem__(j)]
        for i in range(gamma.shape[0]):
            change_index_result[i][new_index] += gamma[i][j]
        if j % 1000 == 0:
            print j, 'line'
    print 'after changing the size of result: ', change_index_result.shape
    return change_index_result
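
# Vectorized sketch of the same re-indexing (an illustrative alternative, not
# the original method). `dictionary` is assumed to map column id -> word, as
# self.dictionary does above; np.add.at accumulates duplicate target columns
# exactly like the explicit double loop.
def remap_gamma_columns(gamma, dictionary, feature_word2id):
    col_map = np.array([feature_word2id[dictionary[j]] for j in range(gamma.shape[1])])
    remapped = np.zeros((gamma.shape[0], len(feature_word2id)))
    np.add.at(remapped, (slice(None), col_map), gamma)
    return remapped
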
def _load_and_process_metadata(sentence_dir, movie_review_path, num_processor=8):
    # Extract the filenames.
    sentence_filenames = glob.glob(os.path.join(sentence_dir, "*/*"))
    print(len(sentence_filenames))
    # print(sentence_filenames[-10:])
    # sentence_filenames = sentence_filenames[-20:]
    word2index = CsvUtility.read_pickle(movie_review_path + '/new_word2index.pkl', 'r')
    index2word = CsvUtility.read_pickle(movie_review_path + '/new_index2word.pkl', 'r')

    # Break the files into num_processor batches.
    spacing = np.linspace(0, len(sentence_filenames), num_processor + 1).astype(int)
    ranges = []
    for i in xrange(len(spacing) - 1):
        ranges.append([spacing[i], spacing[i + 1]])

    # Process each batch of filenames in its own worker process.
    p = Pool(num_processor)
    res = []
    for i in range(num_processor):
        start = ranges[i][0]
        end = ranges[i][1]
        res.append(p.apply_async(get_corpus_contend_thread,
                                 args=(str(i), sentence_filenames[start:end], word2index)))
        print(str(i) + ' processor started !')
    # get_corpus_contend(sentence_filenames, word2index)
    p.close()
    p.join()
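
# Follow-up sketch (illustrative, not part of the original code): the
# apply_async handles stored in `res` above are never read back, so any value
# returned by get_corpus_contend_thread is discarded. A generic helper like the
# one below would gather them, assuming each worker returns a list of processed
# documents (the worker itself is not shown in this snippet).
def collect_pool_results(async_results):
    # .get() blocks until the corresponding worker batch has finished.
    batches = [r.get() for r in async_results]
    return [doc for batch in batches for doc in batch]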