Example #1
import numpy as np

def select_corpus(input_corpus_path, input_label_corpus_path, data_size):
    # Load the raw corpus and its labels, one example per line.
    corpus_list = read_list(input_corpus_path)
    label_list = read_list(input_label_corpus_path)

    # Clamp data_size to the available data so the permutation below
    # cannot produce out-of-range indices.
    data_size = min(data_size, len(corpus_list), len(label_list))
    corpus_list = corpus_list[:data_size]
    label_list = label_list[:data_size]

    # Tokenize on whitespace.
    raw_corpus_list = [i.split() for i in corpus_list]
    raw_label_list = [i.split() for i in label_list]

    # Pad the inputs to a uniform length and encode the labels.
    padd_corpus_list = padd_corpus(raw_corpus_list)
    processed_label_list = process_y(raw_label_list)

    raw_corpus_array = np.array(padd_corpus_list)
    raw_label_array = np.array(processed_label_list)

    # Shuffle inputs and labels with the same permutation.
    shuffle_indices = np.random.permutation(np.arange(data_size))
    corpus_array = raw_corpus_array[shuffle_indices]
    labels_array = raw_label_array[shuffle_indices]

    # 90/10 train/eval split.
    train_size = int(data_size * 0.9)
    print('data_size:', data_size)
    print('train_size:', train_size)

    train_corpus_array = corpus_array[:train_size]
    train_labels_array = labels_array[:train_size]
    eval_corpus_array = corpus_array[train_size:]
    eval_labels_array = labels_array[train_size:]

    return train_corpus_array, train_labels_array, eval_corpus_array, eval_labels_array
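
All of the examples on this page call helper functions such as read_list and write_list that are defined elsewhere in the project. A minimal sketch of what they might look like, assuming newline-delimited UTF-8 files (the codecs usage mirrors Example #3); padd_corpus and process_y stay project-specific and are not sketched:

import codecs

def read_list(path):
    # Read a UTF-8 text file into a list of lines without trailing newlines.
    with codecs.open(path, 'r', 'utf-8') as f:
        return [line.rstrip('\n') for line in f]

def write_list(items, path):
    # Write one item per line, converting each item to a string.
    with codecs.open(path, 'w', 'utf-8') as f:
        for item in items:
            f.write('%s\n' % item)

With helpers like these in place, select_corpus returns a shuffled 90/10 train/eval split of the first data_size examples.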
Example #2
def compare_file(file_1_path, file_2_path):
    corpus_list_1 = read_list(file_1_path)
    corpus_list_2 = read_list(file_2_path)

    def collect_md5s(lines):
        # Keep the first field of each tab-separated line when it has the
        # length of a hex MD5 digest (32 characters).
        md5_set = set()
        for line in lines:
            line_list = line.split('\t')
            if len(line_list) == 2 and len(line_list[0]) == 32:
                md5_set.add(line_list[0])
        return md5_set

    md5_1_set = collect_md5s(corpus_list_1)
    md5_2_set = collect_md5s(corpus_list_2)

    # Hashes present only in file 2 (added) and only in file 1 (missing).
    additive_set = md5_2_set - md5_1_set
    lack_set = md5_1_set - md5_2_set

    print('additive_set size:', len(additive_set))
    print('lack_set size:', len(lack_set))
    write_list(list(additive_set), '../../result_data/temp/additive_set.txt')
    write_list(list(lack_set), '../../result_data/temp/lack_set.txt')
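
Note that the filter above only checks that the first field is 32 characters long, the length of a hex MD5 digest. A stricter, hypothetical variant could also verify that every character is a hex digit:

import re

MD5_RE = re.compile(r'^[0-9a-fA-F]{32}$')

def looks_like_md5(token):
    # True only for a 32-character hexadecimal string.
    return MD5_RE.match(token) is not None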
Example #3
import codecs

def build_train_corpus(corpus_path, label_path, output_x_path, output_y_path):
    input_x_list = []
    input_y_list = []

    # Build a topic-id -> label-line map from the label file.
    label_list = read_list(label_path)
    corpus_id_topic_map = {}
    for line in label_list:
        line_list = line.split('\t')
        topic_id = line_list[0]
        topic_id_line = line_list[1]
        corpus_id_topic_map[topic_id] = topic_id_line
    print('corpus_id_topic_map size:', len(corpus_id_topic_map))

    # Pair every corpus line that has a label with that label.
    iter_num = 0
    with codecs.open(corpus_path, 'r', 'utf-8') as read_file:
        for line in read_file:
            iter_num += 1
            line_list = line.split('\t')
            if len(line_list) < 2:
                continue  # skip malformed lines without a tab-separated title
            topic_id = line_list[0]
            title_id_line = line_list[1]
            if topic_id in corpus_id_topic_map:
                input_x_list.append(title_id_line.strip())
                input_y_list.append(corpus_id_topic_map[topic_id])
    write_list(input_x_list, output_x_path)
    write_list(input_y_list, output_y_path)
    print('input_x_list size:', len(input_x_list))
    print('input_y_list size:', len(input_y_list))
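
A rough usage sketch (the file paths here are hypothetical; both inputs are expected to be tab-separated with the question id in the first column):

# hypothetical paths following the layout used elsewhere on this page
build_train_corpus('../../raw_data/question_train_set.txt',
                   '../../raw_data/question_topic_train_set.txt',
                   '../../result_data/input_x.txt',
                   '../../result_data/input_y.txt')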
Example #4
def word_statistic(input_path, output_path):
    all_word_set = set()
    all_word_list = []
    line_num = 0
    corpus_list = read_list(input_path)
    for line in corpus_list:
        line_num += 1
        line_list = line.split('\t')
        # line_list[0] is the example id; it is not needed here.
        title = line_list[1]
        new_line = title
        if len(line_list) == 3:
            content = line_list[2]
            new_line = title + ',' + content
        all_word_list += new_line.split(',')

        # Periodically fold the buffer into the set to bound memory use.
        if len(all_word_list) > 10000:
            all_word_set |= set(all_word_list)
            all_word_list = []

        if line_num % 10000 == 0:
            print('line_num:', line_num)

    # Flush the words still sitting in the buffer.
    all_word_set |= set(all_word_list)

    # Keep only tokens like 'w305', sorted by their integer id.
    all_word_list = [int(i.replace('w', '')) for i in all_word_set if 'w' in i]
    all_word_list.sort()
    print('all_word_list size:', len(all_word_list))
    # write_list presumably expects strings, so convert the ids back.
    write_list([str(i) for i in all_word_list], output_path)
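
The final normalization step strips the leading 'w' from tokens such as w305 and keeps unique integer ids. In isolation it behaves like this (the token values are made up):

tokens = {'w305', 'w13', 'UNK'}
ids = sorted(int(t.replace('w', '')) for t in tokens if 'w' in t)
print(ids)  # [13, 305] -- non-word tokens like 'UNK' are dropped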
Example #5
def read_question_train_set(input_path):
    # Count the lines in the training set; per-line inspection is left
    # commented out for debugging.
    lines = read_list(input_path)
    iter_num = 0
    for line in lines:
        iter_num += 1
        # line_list = line.split('\t')
        # print(line_list)
        # if iter_num == 10:
        #     exit()
    print('iter_num:', iter_num)
Example #6
def word_embedding_statistic(input_path):
    # Print the first 10 lines of the embedding file for inspection.
    lines = read_list(input_path)
    iter_num = 0
    for line in lines:
        iter_num += 1
        print(line)
        if iter_num == 10:
            break  # stop after a short preview instead of killing the process
    print('iter_num:', iter_num)
Example #7
def compare_two_files(input_path_1, input_path_2):
    line_list_1 = read_list(input_path_1)
    line_list_2 = read_list(input_path_2)
    if len(line_list_1) != len(line_list_2):
        print('Error: line counts are not equal, exiting!')
        exit()

    for i in range(len(line_list_1)):
        # Compare the two segmentations of the same line word by word.
        word_list_1, part_speech_list_1 = get_word_list(line_list_1[i])
        word_list_2, part_speech_list_2 = get_word_list(line_list_2[i])
        is_same_list, not_same_list_1, not_same_list_2 = compare_two_lines_words(word_list_1, word_list_2)
        not_same_word_list_1 = [word_list_1[x] for x in not_same_list_1]
        not_same_word_list_2 = [word_list_2[x] for x in not_same_list_2]
        print('*************************************************************')
        # Report the differing words, or confirm the two lines match.
        if len(not_same_word_list_1) != 0 or len(not_same_word_list_2) != 0:
            print('not_same_word_list_1:', not_same_word_list_1)
            print('not_same_word_list_2:', not_same_word_list_2)
        else:
            print('the two lines are the same')

def statistic_dict_size(dict_path):
    # Use a name that does not shadow the built-in dict; the file holds
    # one dictionary entry per line.
    word_list = read_list(dict_path)
    print('dict size:', len(word_list))
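
Assuming two files that hold different segmentations of the same corpus, one line per sentence, a hypothetical invocation could be:

# hypothetical paths: two segmentations of the same corpus
compare_two_files('../../result_data/temp/seg_v1.txt',
                  '../../result_data/temp/seg_v2.txt')
statistic_dict_size('../../result_data/temp/word_dict.txt')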