def select_cropus(input_corpus_path, input_label_corpus_path, data_size):
    """Load a corpus + labels, pad/process them, shuffle, and split 90/10.

    Args:
        input_corpus_path: path of the text corpus file (one sample per line).
        input_label_corpus_path: path of the label file (aligned line-by-line).
        data_size: maximum number of samples to use.

    Returns:
        (train_corpus_array, train_labels_array,
         eval_corpus_array, eval_labels_array) as numpy arrays.
    """
    corpus_list = read_list(input_corpus_path)
    label_list = read_list(input_label_corpus_path)
    corpus_list = corpus_list[:data_size]
    label_list = label_list[:data_size]
    # BUG FIX: the files may contain fewer than data_size lines; clamping to
    # the real length keeps the permutation indices in range (the original
    # raised IndexError when len(corpus_list) < data_size).
    data_size = min(data_size, len(corpus_list), len(label_list))
    raw_corpus_list = [i.split() for i in corpus_list]
    raw_label_list = [i.split() for i in label_list]
    padd_corpus_list = padd_corpus(raw_corpus_list)
    processed_label_list = process_y(raw_label_list)
    raw_corpus_array = np.array(padd_corpus_list)
    raw_label_array = np.array(processed_label_list)
    # Shuffle samples and labels with the same index permutation so they
    # stay aligned.
    shuffle_indices = np.random.permutation(np.arange(data_size))
    corpus_array = raw_corpus_array[shuffle_indices]
    labels_array = raw_label_array[shuffle_indices]
    train_size = int(data_size * 0.9)
    print('data_size:', data_size)
    print('train_size:', train_size)
    train_corpus_array = corpus_array[:train_size]
    train_labels_array = labels_array[:train_size]
    eval_corpus_array = corpus_array[train_size:]
    eval_labels_array = labels_array[train_size:]
    return train_corpus_array, train_labels_array, eval_corpus_array, eval_labels_array
def _collect_md5_set(lines):
    """Return the set of first-column tokens that are 32-char md5 digests."""
    md5_len = 32  # length of a hex md5 digest, e.g. '22d68441d949b94907c41309ffbf01b1'
    md5_set = set()
    for line in lines:
        line_list = line.split('\t')
        if len(line_list) == 2 and len(line_list[0]) == md5_len:
            md5_set.add(line_list[0])
    return md5_set


def compare_file(file_1_path, file_2_path):
    """Diff the md5 key columns of two tab-separated files.

    Writes the md5s present only in file 2 (additive) and only in file 1
    (lacking) to fixed result files, and prints both set sizes.
    """
    # The two collection loops were byte-identical; extracted into a helper.
    md5_1_set = _collect_md5_set(read_list(file_1_path))
    md5_2_set = _collect_md5_set(read_list(file_2_path))
    additive_set = md5_2_set - md5_1_set
    lack_set = md5_1_set - md5_2_set
    print('additive_set size:', len(additive_set))
    print('lack_set size:', len(lack_set))
    write_list(list(additive_set), '../../result_data/temp/additive_set.txt')
    write_list(list(lack_set), '../../result_data/temp/lack_set.txt')
def build_train_corpus(corpus_path, label_path, output_x_path, output_y_path):
    """Join a corpus file with a label file by topic id and write x/y files.

    Args:
        corpus_path: tab-separated file of (topic_id, title_id_line).
        label_path: tab-separated file of (topic_id, topic_id_line).
        output_x_path: destination for the matched title lines (x).
        output_y_path: destination for the matched label lines (y).
    """
    input_x_list = []
    input_y_list = []
    corpus_id_topic_map = {}
    for line in read_list(label_path):
        line_list = line.split('\t')
        # BUG FIX: a line without a tab made line_list[1] raise IndexError;
        # skip malformed lines instead of crashing.
        if len(line_list) < 2:
            continue
        corpus_id_topic_map[line_list[0]] = line_list[1]
    print('corpus_id_topic_map size:', len(corpus_id_topic_map))
    with codecs.open(corpus_path, 'r', "utf-8") as read_file:
        for line in read_file:
            line_list = line.split('\t')
            if len(line_list) < 2:
                continue  # same malformed-line guard as above
            topic_id = line_list[0]
            # Keep only corpus lines whose topic id has a label.
            if topic_id in corpus_id_topic_map:
                input_x_list.append(line_list[1].strip())
                input_y_list.append(corpus_id_topic_map[topic_id])
    write_list(input_x_list, output_x_path)
    write_list(input_y_list, output_y_path)
    print('input_x_list size:', len(input_x_list))
    print('input_y_list size:', len(input_y_list))
def word_statistic(input_path, output_path):
    """Collect the unique word ids of a corpus and write them sorted.

    Each input line is tab-separated: id, title[, content]; title/content are
    comma-separated word tokens like 'w123'. Tokens are deduplicated, the
    'w' prefix stripped, and the integer ids written sorted to output_path.
    """
    all_word_set = set()
    pending_words = []  # buffer of tokens not yet folded into the set
    line_num = 0
    for line in read_list(input_path):
        line_num += 1
        line_list = line.split("\t")
        title = line_list[1]
        new_line = title
        if len(line_list) == 3:
            new_line = title + "," + line_list[2]
        pending_words += new_line.split(',')
        # Periodically fold the buffer into the set to keep the list small.
        if len(pending_words) > 10000:
            all_word_set |= set(pending_words)
            pending_words = []
        if line_num % 10000 == 0:
            print("line_num:", line_num)
    # BUG FIX: the original overwrote the buffer with list(all_word_set),
    # silently dropping every word accumulated since the last flush.
    all_word_set |= set(pending_words)
    all_word_list = [int(w.replace("w", "")) for w in all_word_set if 'w' in w]
    all_word_list.sort()
    print('all_word_list size:', len(all_word_list))
    write_list(all_word_list, output_path)
def read_question_train_set(input_path):
    """Report how many lines the question train set at input_path contains."""
    lines = read_list(input_path)
    # The original loop only incremented a counter per line; the final count
    # is simply the number of lines read.
    print("iter_num:", len(lines))
def word_embeding_statistic(input_path):
    """Debug helper: print the first 10 lines of the embedding file.

    NOTE: terminates the whole process via exit() once 10 lines have been
    printed; the trailing count is only reached for shorter files.
    """
    lines = read_list(input_path)
    iter_num = 0
    for iter_num, line in enumerate(lines, start=1):
        print(line)
        if iter_num == 10:
            exit()
    print("iter_num:", iter_num)
def compare_two_files(input_path_1, input_path_2):
    """Compare two tokenised files line by line and print word mismatches.

    Exits immediately if the files differ in line count.
    """
    lines_a = read_list(input_path_1)
    lines_b = read_list(input_path_2)
    if len(lines_a) != len(lines_b):
        print('Error:line size is not equal!exit!')
        exit()
    for line_a, line_b in zip(lines_a, lines_b):
        words_a, _parts_a = get_word_list(line_a)
        words_b, _parts_b = get_word_list(line_b)
        is_same_list, diff_idx_a, diff_idx_b = compare_two_lines_words(words_a, words_b)
        diff_words_a = [words_a[idx] for idx in diff_idx_a]
        diff_words_b = [words_b[idx] for idx in diff_idx_b]
        print('*************************************************************')
        # NOTE(review): a non-empty is_same_list gates the "not same" output —
        # the name suggests the opposite; verify compare_two_lines_words'
        # contract before relying on this.
        if len(is_same_list) != 0:
            print('not_same_word_list_1:', diff_words_a)
            print('not_same_word_list_2:', diff_words_b)
        else:
            print('two lines is same')
def statistic_dict_size(dict_path):
    """Print the number of entries in the dictionary file at dict_path."""
    # Renamed the local: the original bound `dict`, shadowing the builtin.
    entries = read_list(dict_path)
    print('dict size:', len(entries))