Example #1
def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path, seq_max, vec_len, sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print "word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events)
    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))

    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)

    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))

    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    # Sum the SGNS embeddings of all event codes in this time step.
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    # Count event codes that have no embedding vector.
                    find_nan[event_code] = find_nan.get(event_code, 0) + 1
        # Multi-hot encode the prediction targets for this sample.
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print('complete {0} of {1}'.format(i_iter, len(filter_event)))
    print('words not in docs:', len(find_nan))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path, 'feature2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path, 'predict2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path, 'feature_matrix_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path, 'result_matrix_seq_embedding'+str(vec_len)+'.csv')

    return feature_tensor, feature_count_tensor, result_tensor
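
A minimal usage sketch for the loader above, assuming the project's CsvUtility, WordIndexMap, and get_sgns_embedding helpers are importable; the file paths and dimensions below are hypothetical placeholders, not the repository's real data layout:

# Hypothetical paths and sizes; point these at the project's preprocessed pickles and SGNS vectors.
feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(
    data_pickle_path='data/mimic_events.pkl',
    word_dict_path='data/word_dict.pkl',
    predict_dict_path='data/predict_dict.pkl',
    seq_max=40,
    vec_len=100,
    sgns_path='data/sgns_vectors',
    save_path='data/output',
    save=False)
print(feature_tensor.shape, feature_count_tensor.shape, result_tensor.shape)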
Example #2
File: lda.py  Project: ykpku/LDA-Reg-TKDE
def __change_Movie_word_index__(self, gamma, word2index_pickle):
    # Remap the columns of the topic-word matrix gamma from the corpus
    # dictionary's ids onto the feature word2index ids.
    feature_word2id = CsvUtility.read_pickle(word2index_pickle, 'r')
    print('feature size: ', len(feature_word2id))
    change_index_result = np.zeros((gamma.shape[0], len(feature_word2id)))
    for j in range(gamma.shape[1]):
        new_index = feature_word2id[self.dictionary[j]]
        for i in range(gamma.shape[0]):
            change_index_result[i][new_index] += gamma[i][j]
        if j % 1000 == 0:
            print(j, 'line')
    print('after changing the size of result: ', change_index_result.shape)
    return change_index_result
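
For illustration only, the inner loop over topics can be replaced by accumulating whole columns at once; this sketch uses a toy gamma matrix and a made-up id mapping rather than the class's real dictionary:

import numpy as np

# Toy data: 2 topics x 3 dictionary columns remapped onto a 4-word feature vocabulary.
gamma = np.array([[0.1, 0.2, 0.3],
                  [0.4, 0.5, 0.6]])
new_index = [2, 0, 2]              # hypothetical mapping: gamma column j -> feature column new_index[j]
changed = np.zeros((gamma.shape[0], 4))
for j, target in enumerate(new_index):
    changed[:, target] += gamma[:, j]   # columns that map to the same id accumulate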
Example #3
def _load_and_process_metadata(sentence_dir,
                               movie_review_path,
                               num_processor=8):

    # Extract the filenames.
    sentence_filenames = glob.glob(os.path.join(sentence_dir, "*/*"))
    print(len(sentence_filenames))
    # print(sentence_filenames[-10:])
    # sentence_filenames = sentence_filenames[-20:]

    word2index = CsvUtility.read_pickle(
        movie_review_path + '/new_word2index.pkl', 'r')
    index2word = CsvUtility.read_pickle(
        movie_review_path + '/new_index2word.pkl', 'r')

    # Break the files into num_processor batches.
    spacing = np.linspace(0, len(sentence_filenames),
                          num_processor + 1).astype(int)
    ranges = []
    for i in range(len(spacing) - 1):
        ranges.append([spacing[i], spacing[i + 1]])

    p = Pool(num_processor)
    res = []
    for i in range(num_processor):
        start = ranges[i][0]
        end = ranges[i][1]
        res.append(
            p.apply_async(get_corpus_contend_thread,
                          args=(str(i), sentence_filenames[start:end],
                                word2index)))
        print(str(i) + ' processor started !')

    # get_corpus_contend(sentence_filenames, word2index)
    p.close()
    p.join()
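
A hedged call sketch for the loader above; the directories are hypothetical, and get_corpus_contend_thread plus the new_word2index.pkl / new_index2word.pkl pickles are assumed to be produced elsewhere in the project:

# Hypothetical layout: a directory of sentence files, with the word-index pickles beside the reviews.
_load_and_process_metadata(sentence_dir='data/sentences',
                           movie_review_path='data/movie_review',
                           num_processor=4)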