Пример #1
0
def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Multiply every batch matrix by a random Gaussian matrix and save it.

    Input batches are read from ``read_directory`` as ``1.txt`` .. ``N.txt``,
    where N is the number of files found under that directory (os.walk, so
    sub-directories are counted too). Each batch is loaded with
    ``get_text_to_nparray(..., 'int')``, transposed, and left-multiplied by
    a freshly drawn ``sample_size x data_dimension`` matrix.

    :param read_directory: directory holding the numbered input batches.
    :param write_directory1: directory receiving one result matrix per
        batch, written as ``<index>.txt`` with space separated values.
    :param write_directory2: directory receiving ``sample_time.txt`` (one
        elapsed time per batch) and ``ratio.txt`` (one ratio per batch).
    '''
    # Built-in sum always yields a plain int. np.sum over an empty list
    # (os.walk yields nothing for a missing directory) is the float 0.0,
    # which range() rejects with a confusing TypeError.
    file_number = sum(len(files) for _, _, files in os.walk(read_directory))
    sample_size = 250

    # The standard deviation of the matrix entries depends only on
    # sample_size, so compute it once instead of once per entry.
    sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))

    # time.clock was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    sample_time = []
    ratio = []

    for batch in range(1, file_number + 1):
        vsm_matrix = get_text_to_nparray(
            read_directory + '/' + str(batch) + '.txt', 'int')
        vsm_matrix = vsm_matrix.T

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Batch: %d' % batch)
        start = timer()

        data_dimension = vsm_matrix.shape[0]

        # Entries are drawn row by row from the ``random`` module so a
        # seeded run produces the same matrix as before.
        # NOTE(review): the mean is 1, not 0 -- confirm this is intended.
        Q = np.zeros((sample_size, data_dimension))
        for k in range(sample_size):
            for j in range(data_dimension):
                Q[k, j] = random.gauss(1, sigma)

        sample_result = np.dot(Q, vsm_matrix)

        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        # The timed region covers matrix generation and the product only,
        # not reading the input or writing the output.
        interval = timer() - start
        print('Time: %f' % interval)
        sample_time.append(str(interval))

        write_result = [" ".join([str(x) for x in row])
                        for row in sample_result]
        quick_write_list_to_text(write_result,
                                 write_directory1 + '/' + str(batch) + '.txt')

    quick_write_list_to_text(sample_time,
                             write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Multiply each numbered batch matrix by a random Gaussian matrix.

    The number of batches is the number of files found by os.walk under
    ``read_directory``; batch ``i`` is read from ``<i>.txt`` via
    ``get_text_to_nparray(..., 'int')`` and transposed before use.

    :param read_directory: directory with the input files ``1.txt`` .. ``N.txt``.
    :param write_directory1: directory where each result matrix is written
        as ``<i>.txt``, one space separated row per line.
    :param write_directory2: directory where ``sample_time.txt`` and
        ``ratio.txt`` are written, each with one value per batch.
    '''
    # Use the built-in sum: np.sum([]) returns the float 0.0 when os.walk
    # yields nothing, and range() cannot take a float.
    file_number = sum(len(files) for _, _, files in os.walk(read_directory))
    sample_size = 250

    # Loop-invariant: the per-entry standard deviation only depends on sample_size.
    sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))

    # time.clock no longer exists on Python 3.8+; prefer perf_counter and
    # keep clock only as the fallback for older interpreters.
    timer = getattr(time, 'perf_counter', None) or time.clock

    sample_time = []
    ratio = []

    for i in range(file_number):
        batch_name = str(i + 1) + '.txt'
        vsm_matrix = get_text_to_nparray(read_directory + '/' + batch_name, 'int')
        vsm_matrix = vsm_matrix.T

        # print(...) with a single argument is valid on both Python 2 and 3.
        print('Batch: %d' % (i + 1))
        start = timer()

        data_dimension = vsm_matrix.shape[0]

        # Fill row by row with random.gauss so the sequence of draws (and
        # therefore any seeded result) is unchanged.
        # NOTE(review): entries are centred on 1 rather than 0 -- confirm intent.
        Q = np.zeros((sample_size, data_dimension))
        for k in range(Q.shape[0]):
            for j in range(Q.shape[1]):
                Q[k, j] = random.gauss(1, sigma)

        sample_result = np.dot(Q, vsm_matrix)

        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        # Elapsed time spans matrix generation and the dot product only.
        interval = timer() - start
        print('Time: %f' % interval)
        sample_time.append(str(interval))

        write_result = [" ".join([str(x) for x in each]) for each in sample_result]
        quick_write_list_to_text(write_result, write_directory1 + '/' + batch_name)

    quick_write_list_to_text(sample_time, write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
def merge_batch(read_directory1, read_directory2, read_directory3,
                read_directory4, read_filename, write_directory1,
                write_directory2):
    '''
    Merge groups of source batches into larger batches over one word list.

    Every line of ``read_filename`` holds the whitespace separated names of
    the source batches forming one merged batch; merged batches are
    numbered from 1 in line order. Each row of every source matrix is
    re-expressed over the merged batch's word list: a word keeps the row's
    value when that value is greater than 0.0001 and gets 0 otherwise.

    :param read_directory1: directory of source matrices ``<name>.txt``,
        loaded with ``get_text_to_nparray(..., 'int')``.
    :param read_directory2: directory of merged word lists ``<i>.txt``,
        one word per line.
    :param read_directory3: directory of source word lists ``<name>.txt``;
        the first whitespace separated token of each line is the word for
        the matrix column with the same index.
    :param read_directory4: directory of ``<name>.txt`` files read with
        ``get_text_to_single_list``; their entries are concatenated per
        merged batch (presumably id/time records -- confirm with caller).
    :param read_filename: file describing which source batches to merge.
    :param write_directory1: directory receiving the merged rows as
        ``<i>.txt``.
    :param write_directory2: directory receiving the concatenated
        ``read_directory4`` entries as ``<i>.txt``.
    '''
    # Context managers close every file even if parsing raises.
    with open(read_filename) as f:
        all_batch_index = [line.split() for line in f]

    for i, batch_names in enumerate(all_batch_index):
        with open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb') as f1:
            this_word_list = [line.strip() for line in f1]

        result = []
        result_id_time = []

        for name in batch_names:
            with open(read_directory3 + '/' + name + '.txt', 'rb') as f2:
                word_list = [line.split()[0] for line in f2]

            vsm_nparray = get_text_to_nparray(
                read_directory1 + '/' + name + '.txt', 'int')

            id_time = []
            get_text_to_single_list(
                id_time, read_directory4 + '/' + name + '.txt')
            result_id_time.extend(id_time)

            for each in vsm_nparray:
                # Map each word of this source batch to its value in the row.
                tf_dict = {}
                for k, value in enumerate(each):
                    if value > 0.0001:
                        tf_dict[word_list[k]] = value

                # dict.get is a hash lookup; testing against .keys() scans
                # a list on Python 2 and made this step quadratic.
                this_line = [str(tf_dict.get(word, 0))
                             for word in this_word_list]

                # Join each row into one string so it is easy to write out.
                result.append(" ".join(this_line))

        quick_write_list_to_text(result,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(result_id_time,
                                 write_directory2 + '/' + str(i + 1) + '.txt')
def merge_batch(read_directory1, read_directory2, read_directory3, read_directory4, read_filename, write_directory1, write_directory2):
    '''
    Combine several source batches into merged batches with a common word list.

    ``read_filename`` has one line per merged batch, listing the names of
    the source batches it is built from. For merged batch ``i`` (1-based),
    each row of each listed source matrix is rewritten in the order of the
    word list ``read_directory2/<i>.txt``: values greater than 0.0001 are
    kept, every other word is written as 0.

    :param read_directory1: source matrices, ``<name>.txt``, loaded with
        ``get_text_to_nparray(..., 'int')``.
    :param read_directory2: merged word lists, ``<i>.txt``, one word per line.
    :param read_directory3: source word lists, ``<name>.txt``; the first
        token of line k names column k of the matching source matrix.
    :param read_directory4: ``<name>.txt`` files loaded with
        ``get_text_to_single_list`` and concatenated per merged batch
        (looks like id/time records -- confirm with caller).
    :param read_filename: file listing the source batches of each merged batch.
    :param write_directory1: output directory for the merged rows, ``<i>.txt``.
    :param write_directory2: output directory for the concatenated
        ``read_directory4`` entries, ``<i>.txt``.
    '''
    # ``with`` guarantees the handles are released if a later step fails.
    with open(read_filename) as f:
        all_batch_index = [line.split() for line in f]

    for i, source_names in enumerate(all_batch_index):
        merged_name = str(i + 1) + '.txt'

        with open(read_directory2 + '/' + merged_name, 'rb') as f1:
            this_word_list = [line.strip() for line in f1]

        result = []
        result_id_time = []

        for source in source_names:
            with open(read_directory3 + '/' + source + '.txt', 'rb') as f2:
                word_list = [line.split()[0] for line in f2]

            vsm_nparray = get_text_to_nparray(read_directory1 + '/' + source + '.txt', 'int')

            id_time = []
            get_text_to_single_list(id_time, read_directory4 + '/' + source + '.txt')
            result_id_time.extend(id_time)

            for each in vsm_nparray:
                # Values of this row keyed by the source batch's words.
                tf_dict = {}
                for k, value in enumerate(each):
                    if value > 0.0001:
                        tf_dict[word_list[k]] = value

                # Look words up with dict.get: a membership test against
                # .keys() is a linear list scan on Python 2.
                this_line = [str(tf_dict.get(word, 0)) for word in this_word_list]

                # Join each row into one string so it is easy to write out.
                result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory1 + '/' + merged_name)
        quick_write_list_to_text(result_id_time, write_directory2 + '/' + merged_name)