示例#1
0
def mergeTestResult(original, IDcol, files):
    """Merge per-split prediction files with their row IDs into one file.

    For every split index ``k`` in ``range(files)`` this reads
    ``result_split_<k>.csv`` (comma separated, first row dropped as the
    column title) and ``<original>_sub_<k>.txt``, pairs column ``IDcol`` of
    the latter with the prediction rows, and collects ``[ID, first
    prediction value]`` pairs.  The pairs are sorted by ID and written to
    ``result_split_final.txt``.

    Args:
        original: file name prefix of the split test files.
        IDcol: index of the ID column inside each split test file.
        files: number of splits to merge.
    """
    collected = []

    for splitNo in range(files):
        suffix = str(splitNo)

        # predictions of this split, without the header row
        predictions = RD.loadArray('result_split_' + suffix + '.csv', ',')[1:]

        # IDs of this split, each wrapped in its own row:
        # [a, b, c, ...] -> [[a], [b], [c], ...]
        splitRows = RD.loadArray(original + '_sub_' + suffix + '.txt')
        idColumn = [[value] for value in np.array(splitRows)[:, IDcol]]

        # put the ID column to the left of the prediction columns
        paired = np.concatenate(
            (np.array(idColumn), np.array(predictions)), axis=1)

        print('\n <<< IDs-Result for file ' + suffix + ' >>>')
        print(np.array(paired))

        # keep only the ID and the first prediction column
        collected.extend([row[0], row[1]] for row in paired)

    # order by ID (stable sort), then store
    collected.sort(key=lambda pair: pair[0])
    RD.saveArray('result_split_final.txt', collected)
示例#2
0
def mergeTrain(TRI, TRO, TRIO):
    """Join training input and output column-wise and save the result.

    Args:
        TRI: tab-separated file holding the training input rows.
        TRO: tab-separated file holding the training output rows.
        TRIO: name of the file that receives the joined rows.
    """
    # load both tables as arrays (input first, then output)
    parts = [np.array(RD.loadArray(name, '\t')) for name in (TRI, TRO)]

    # output columns go to the right of the input columns; the result is
    # saved tab-separated, with 500 passed through as saveArray's 4th argument
    RD.saveArray(TRIO, np.concatenate(parts, axis=1), '\t', 500)
示例#3
0
def deepLearningQ_training(Q, deviceName, epoch, printed):
    """Train the model returned by ``defineModel()`` on the Q table.

    Each entry of ``Q`` is indexed as
    ``[state, action_reward, i (UAV/cluster index), k (device index)]``:

        Q Table = [[[s0], [Q00, Q01, ...]], [[s1], [Q10, Q11, ...]], ...]
        input   = converted version of [[s0], [s1], ...]
        output  = original  version of [[Q00, Q01, ...], [Q10, Q11, ...], ...]
        where s = [q[n][l], {a[n][l][k_l]}, {R[n][k_l]}] and Q = reward

    Side effects: writes ``Q_input.txt``, ``Q_output.txt``,
    ``Q_input_normalized.txt`` and ``Q_output_normalized.txt`` when ``Q`` is
    not empty, then reloads the two normalized files and trains on them.

    Args:
        Q: list of Q table entries as described above.
        deviceName: not used in this function body.
        epoch: not used in this function body.
        printed: not used in this function body.
    """
    model = defineModel()

    # input array: flatten every state; a state that cannot be flattened
    # (e.g. because it is already a 1d array) is used as it is
    inputData = []
    for entry in Q:
        try:
            inputData.append(stateTo1dArray(entry[0], entry[3]))
        except Exception:
            inputData.append(entry[0])

    # output array (as original)
    outputData = [entry[1] for entry in Q]

    # inputData and outputData always have the same length (one item per
    # entry of Q), so a single emptiness check covers both
    if inputData:
        # raw data
        RD.saveArray('Q_input.txt', inputData)
        RD.saveArray('Q_output.txt', outputData)

        # normalized data
        sizeTag = str(len(inputData))
        normalizedInputData = normalize(inputData, False, 'input' + sizeTag, True)
        normalizedOutputData = normalize(outputData, False, 'output' + sizeTag, True)
        RD.saveArray('Q_input_normalized.txt', normalizedInputData)
        RD.saveArray('Q_output_normalized.txt', normalizedOutputData)

    # load the normalized training data; this is the step the
    # "does not exist" message is about, so only this step reports it
    try:
        Q_input_normalized = np.array(
            RD.loadArray('Q_input_normalized.txt', '\t')).astype(float)
        Q_output_normalized = np.array(
            RD.loadArray('Q_output_normalized.txt', '\t')).astype(float)
    except Exception:
        print('[train] Q_input_normalized.txt or Q_output_normalized.txt does not exist.')
        return

    # train using deep learning (need: modelConfig.txt)
    # DON'T NEED TO APPLY SIGMOID to training output data, because
    # DLmain.deeplearning applies it
    try:
        trainDataWithModel(Q_input_normalized, Q_output_normalized, model, 15)
    except Exception as error:
        # still best-effort, but a training failure is no longer reported
        # as a missing file
        print('[train] training failed: ' + str(error))
示例#4
0
def extractTrainAndTest(testRate):
    """Randomly split ``train_test.csv`` into ``train.csv`` and ``test.csv``.

    The first row of the source file is dropped.  Every remaining row draws
    one ``random.random()`` value: it goes to the training set when the draw
    is ``>= testRate`` and to the test set otherwise.  Both output files
    start with a generated header row: ``label`` followed by ``<i>_<j>`` for
    a 32 x 32 grid.

    Args:
        testRate: threshold compared against each random draw.
    """
    # all data rows (the first row of the source file is skipped)
    dataRows = RD.loadArray('train_test.csv', splitter=',')[1:]

    # header row: 'label', '0_0', '0_1', ..., '31_31'
    imgSize = 32
    header = ['label'] + [
        str(r) + '_' + str(c) for r in range(imgSize) for c in range(imgSize)
    ]

    # both outputs begin with the (shared) header row
    trainingData = [header]
    testData = [header]

    # one random draw per row decides where the row goes
    for row in dataRows:
        bucket = trainingData if random.random() >= testRate else testData
        bucket.append(row)

    # save file (trainingData and testData)
    for name, content in (('train.csv', trainingData), ('test.csv', testData)):
        RD.saveArray(name, content, splitter=',', saveSize=1000)
示例#5
0
def readCSV(fn, colRange, rowRange, delimiter=','):
    """Load a delimited file and return a rectangular slice of it.

    Args:
        fn: name of the file to load.
        colRange: ``[start, end]`` of the columns to keep; ``end`` may be
            ``None`` to keep everything from ``start`` on.
        rowRange: ``[start, end]`` of the rows to keep; ``end`` may be
            ``None`` to keep everything from ``start`` on.
        delimiter: separator passed to ``RD.loadArray``.

    Returns:
        ``np.array(loaded)[rowStart:rowEnd, colStart:colEnd]``.
    """
    loaded = RD.loadArray(fn, delimiter)

    colStart, colEnd = colRange[0], colRange[1]
    rowStart, rowEnd = rowRange[0], rowRange[1]

    # a slice bound of None means "open ended", so one expression covers
    # every combination of given / missing end values
    return np.array(loaded)[rowStart:rowEnd, colStart:colEnd]
示例#6
0
def extractTrainAndTest():
    """Split ``train_test.csv`` into training and test files by label list.

    Row ``i`` of the data (first row of the source file dropped) is paired
    with row ``i`` of ``label_list.csv``.  A row belongs to the test set when
    the first value of its label-list row is ``>= 120`` and to the training
    set otherwise.  The first cell of every data row is replaced in place by
    its value modulo 2, as a string.

    Writes ``train.csv`` / ``test.csv`` (with a generated 64 x 64 header row)
    and ``trainLabels.csv`` / ``testLabels.csv``.
    """
    # all data rows (the first row of the source file is skipped)
    dataRows = RD.loadArray('train_test.csv', splitter=',')[1:]

    # load label list
    labelList = RD.loadArray('label_list.csv')

    # header row: 'label', '0_0', '0_1', ..., '63_63'
    imgWidth = 64
    imgHeight = 64
    header = ['label'] + [
        str(r) + '_' + str(c)
        for r in range(imgHeight) for c in range(imgWidth)
    ]

    # both data outputs begin with the (shared) header row
    trainingData = [header]
    testData = [header]
    trainingLabel = []
    testLabel = []

    for i, row in enumerate(dataRows):
        label = labelList[i]

        # train or test
        isTrainRow = int(label[0]) < 120

        # car: 0, bus: 1
        row[0] = str(int(row[0]) % 2)

        # append to training/test data
        if isTrainRow:
            trainingData.append(row)
            trainingLabel.append(label)
        else:
            testData.append(row)
            testLabel.append(label)

    # save file (trainingData and testData)
    RD.saveArray('train.csv', trainingData, splitter=',', saveSize=500)
    RD.saveArray('test.csv', testData, splitter=',', saveSize=500)
    RD.saveArray('trainLabels.csv', trainingLabel)
    RD.saveArray('testLabels.csv', testLabel)
示例#7
0
def readMNISTData():
    """Load the MNIST training and test arrays, printing each one.

    Returns:
        Tuple ``(train_input, train_output, test_input, test_output)`` of
        NumPy arrays built from the four ``mnist_*.txt`` files.
    """
    print('reading MNIST data...')

    # (input file, output file) for the training data, then the test data
    filePairs = (
        ('mnist_train_input.txt', 'mnist_train_output.txt'),
        ('mnist_test_input.txt', 'mnist_test_output.txt'),
    )

    loaded = []
    for inputFile, outputFile in filePairs:
        # load both arrays of the pair first, then print them
        inputs = np.array(RD.loadArray(inputFile))
        outputs = np.array(RD.loadArray(outputFile))
        print(inputs)
        print(outputs)
        loaded.extend((inputs, outputs))

    return tuple(loaded)
示例#8
0
def useTestOutput(fn, threshold):
    """Convert a test-output file into ``to_submit.txt``.

    The first value of every row of ``fn`` is read as a float.  Without a
    threshold the float itself is written; with a threshold the row becomes
    ``0`` when the value is below it and ``1`` otherwise.

    Args:
        fn: name of the file holding the test output.
        threshold: cut-off value, or ``None`` to keep the raw values.
    """

    def toRow(value):
        # raw value when no threshold is given, otherwise a 0/1 decision
        if threshold is None:
            return [value]
        return [0] if value < threshold else [1]

    # read file and build the final result row by row
    finalResult = [toRow(float(row[0])) for row in RD.loadArray(fn)]

    # write file
    RD.saveArray('to_submit.txt', finalResult)
示例#9
0
def compare(finalResult, validName, validationCol, trainValid_validRows):
    """Compare predictions with validation data and report MAE and MSE.

    Prediction ``finalResult[i]`` is compared with the value in row
    ``trainValid_validRows[i]``, column ``validationCol`` of the array loaded
    from ``validName``.  The result is printed and written to
    ``result_valid.txt`` (one ``pred=... val=...`` line per row, followed by
    the MAE / MSE summary).

    Args:
        finalResult: predictions; every item is replaced in place by its
            float value.
        validName: name of the file holding the validation data.
        validationCol: column of the validation data holding the true value.
        trainValid_validRows: for each prediction, the row index of the
            matching validation row.

    Raises:
        AssertionError: if ``finalResult`` and ``trainValid_validRows``
            differ in length.
        ZeroDivisionError: if ``finalResult`` is empty.
    """
    # both sequences must describe the same rows; raised explicitly so the
    # check also runs when Python is started with -O
    dataLen = len(finalResult)
    if dataLen != len(trainValid_validRows):
        raise AssertionError(
            'finalResult and trainValid_validRows must have the same length')

    # read data
    validData = RD.loadArray(validName)  # original validation data array

    # The true values are kept as plain Python floats.  Writing a float into
    # a NumPy array coerces it to the array's dtype (a string array can cut
    # off characters, an integer array drops the fraction), which would
    # distort both the reported value and the computed error.
    validResult = [
        float(validData[rowIndex][validationCol])
        for rowIndex in trainValid_validRows
    ]

    # compute MAE and MSE
    lines = []
    absErrorSum = 0.0
    sqErrorSum = 0.0
    for i, actual in enumerate(validResult):
        predicted = float(finalResult[i])
        finalResult[i] = predicted  # in-place conversion, visible to caller

        diff = predicted - actual
        absErrorSum += abs(diff)
        sqErrorSum += diff * diff

        lines.append('pred=' + str(predicted) + ' val=' + str(actual) + '\n')

    MAE = absErrorSum / dataLen
    MSE = sqErrorSum / dataLen

    # print and write result
    print('\n **** validation result ****')
    print('MAE=' + str(MAE))
    print('MSE=' + str(MSE))

    with open('result_valid.txt', 'w') as f:
        f.write(''.join(lines) + '\nMAE = ' + str(MAE) + ', MSE = ' + str(MSE))
示例#10
0
import sys
sys.path.insert(0, '../../AI_BASE')

import math
import numpy as np
import readData as RD

if __name__ == '__main__':

    # number of prediction files to average
    count = 1

    # element-wise sum of column 5 of every prediction file
    answer_array = None
    for fileNo in range(count):
        fileName = 'bert_valid_result_count_' + str(fileNo) + '.txt'
        column = np.array(RD.loadArray(fileName))[:, 5:6].astype(float)

        if answer_array is None:
            answer_array = column
        else:
            answer_array += column

    # sum -> average, then store
    answer_array /= count
    RD.saveArray('final_answer.txt', answer_array)
示例#11
0
        # convert : 0 ~ 255 -> -1 ~ 2
        imgArr = np.round_(3 * (imgArr / 255) - 1, 2)

        inputArr.append(imgArr)


if __name__ == '__main__':

    # SUBMISSION
    # prediction of 'survived' column
    # NOTE(review): the 'survived' wording does not match the image
    # directories configured below; it looks carried over from another
    # script - confirm and reword.

    # rows: hard-coded row counts of the training and test data
    train_rows = 61578
    test_rows = 79975
    # validation rate: first value of the first row of val_rate.txt
    valid_rate = float(RD.loadArray('val_rate.txt')[0][0])

    # directories
    train_input_dir = 'images_training_rev1/images_training_rev1'
    train_output_file = 'training_solutions_rev1/training_solutions_rev1.csv'
    test_input_dir = 'images_test_rev1/images_test_rev1'

    # set data to validate (array 'isValid')
    # one flag per training row, all False to begin with
    isValid = []
    for i in range(train_rows):
        isValid.append(False)

    # draw random training-row indices while count is below
    # train_rows * valid_rate; the rest of the loop body (including how
    # count is advanced) lies beyond the visible part of this script
    count = 0
    while count < train_rows * valid_rate:
        rand = random.randint(0, train_rows - 1)
示例#12
0
        text_models.append(text_model)

    # train / test result array
    train_result = [[0 for j in range(6)] for i in range(rows_to_train)]
    test_result = [[0 for j in range(6)] for i in range(rows_to_test)]

    # train / test max length

    # for donorschoose-application-screening,
    # max_length_train = 132, 2183, 818, 387, 224, 234
    # max_length_test  = 159, 2583, 993, 245, 163, 107
    # max_length       = 159, 2583, 993, 387, 224, 234

    try:
        max_lengths_train = RD.loadArray('bert_max_lengths_train.txt')[0]
        max_lengths_test = RD.loadArray('bert_max_lengths_test.txt')[0]
        max_lengths = RD.loadArray('bert_max_lengths.txt')[0]

    except:
        max_lengths_train = []
        max_lengths_test = []
        max_lengths = []

        for i in range(6):
            print('checking max length : ' + str(i))

            max_lengths_train.append(
                convertForBert(data_to_train[i], None, print_interval,
                               tokenizer, None))
            max_lengths_test.append(
示例#13
0
# train_converted.csv -> train_input.txt, train_output_0.txt, and train_output_1.txt
# test_converted.csv  -> test_input.txt
#                     -> my_extract_result.txt

if __name__ == '__main__':

    # hard-coded data set dimensions
    train_rows = 2400
    test_rows = 600
    input_cols_cat = 1  # count of leading input columns handled as categorical below
    input_cols_cont = 29

    result = ''

    # TRAIN
    # every slice below starts at row 1, i.e. row 0 of the CSV is skipped
    train_data = RD.loadArray('train_converted.csv', ',')
    train_input = np.array(train_data)[1:, 1:31]  # columns 1..30
    train_output0 = np.array(train_data)[1:,
                                         31:32]  # formation_energy_ev_natom
    train_output1 = np.array(train_data)[1:, 32:33]  # bandgap_energy_ev

    # AVG and STD for each column of CONTINUOUS training input
    cont_avgs = []
    cont_stds = []

    # categorical
    # the categorical columns get -1 as placeholder for both avg and std
    for i in range(input_cols_cat):
        cont_avgs.append(-1)
        cont_stds.append(-1)

    # continuous
示例#14
0
    final.close()


if __name__ == '__main__':

    use_n_sub = True  # use n_sub mode
    size = 25  # the number of rows/columns in each input data
    inputLength = size * size  # length of input vector

    # read array from final_X.csv

    # when using n_sub mode
    if use_n_sub == True:

        try:
            sub0 = RD.loadArray('final_0.csv', ',')
            sub1 = RD.loadArray('final_1.csv', ',')
            sub2 = RD.loadArray('final_2.csv', ',')
            sub3 = RD.loadArray('final_3.csv', ',')
            sub4 = RD.loadArray('final_4.csv', ',')
        except:
            weight = [1]
            readTestOutput('test_id_sub_0.txt', inputLength,
                           'test_output_n_sub_0.txt', 'final_0.csv', weight)
            readTestOutput('test_id_sub_1.txt', inputLength,
                           'test_output_n_sub_1.txt', 'final_1.csv', weight)
            readTestOutput('test_id_sub_2.txt', inputLength,
                           'test_output_n_sub_2.txt', 'final_2.csv', weight)
            readTestOutput('test_id_sub_3.txt', inputLength,
                           'test_output_n_sub_3.txt', 'final_3.csv', weight)
            readTestOutput('test_id_sub_4.txt', inputLength,
示例#15
0
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD
import numpy as np

if __name__ == '__main__':

    train_input_fn = 'train_input.txt'
    train_output_fn = 'train_output.txt'
    # when True the output rows are reversed as well (see the loop below)
    augment_test = False

    # read training input and output
    train_input = RD.loadArray(train_input_fn)
    train_output = RD.loadArray(train_output_fn)
    # row count BEFORE augmentation; the loop below appends to the same lists
    train_rows = len(train_input)

    print(' ==== before augmentation ====')
    print(np.shape(train_input))
    print(np.array(train_input))
    print('')
    print(np.shape(train_output))
    print(np.array(train_output))

    # augment training input and output
    # for every original row, append a copy of the input with its values in
    # reverse order; the matching output row is appended reversed when
    # augment_test is True and unchanged otherwise
    for i in range(train_rows):
        train_input.append(train_input[i][::-1])

        if augment_test == True:
            train_output.append(train_output[i][::-1])
        else:
            train_output.append(train_output[i])
示例#16
0
def _allReadable(fileNames):
    """Return True if every file in *fileNames* can be opened for reading."""
    for fileName in fileNames:
        try:
            with open(fileName, 'r'):
                pass
        except OSError:
            return False
    return True


def readAllSubs(size):
    """Create the id/input/output files and their per-delta splits if missing.

    Step 1: unless ``train_id.txt``, ``train_input.txt``,
    ``train_output.txt``, ``test_id.txt`` and ``test_input.txt`` can all be
    opened, they are (re)written from ``train.csv`` and ``test.csv``.

    Step 2: unless all ``*_sub_0.txt`` .. ``*_sub_4.txt`` files of those five
    files can be opened, the five files are split with ``RD.splitArray``.

    Args:
        size: number of rows/columns of one board; a board occupies
            ``size * size`` columns.
    """
    # read train and test data
    train = RD.loadArray('train.csv', ',')
    test = RD.loadArray('test.csv', ',')

    baseNames = ['train_id', 'train_input', 'train_output',
                 'test_id', 'test_input']

    # write id-delta, input and output of training data
    # write id-delta and input         of test     data
    if not _allReadable([name + '.txt' for name in baseNames]):
        trainArray = np.array(train)  # converted once, sliced several times
        testArray = np.array(test)
        cells = size * size

        # train.csv -> id, delta, start1~625, stop1~625 (if size=25)
        #   -> train_id.txt     : extract id and delta
        #   -> train_input.txt  : extract delta and stop1~625 (if size=25)
        #   -> train_output.txt : extract delta and start1~625 (if size=25)
        RD.saveArray('train_id.txt', trainArray[:, 0:2])
        RD.saveArray(
            'train_input.txt',
            np.concatenate(
                [trainArray[:, 1:2], trainArray[:, cells + 2:2 * cells + 2]],
                axis=1))
        RD.saveArray('train_output.txt', trainArray[:, 1:cells + 2])

        # test.csv -> id, delta, stop1~625 (if size=25)
        #   -> test_id.txt      : extract id and delta
        #   -> test_input.txt   : extract delta and stop1~625 (if size=25)
        RD.saveArray('test_id.txt', testArray[:, 0:2])
        RD.saveArray('test_input.txt', testArray[:, 1:cells + 2])

    # split train and test data into files
    subNames = [
        name + '_sub_' + str(i) + '.txt'
        for i in range(5) for name in baseNames
    ]
    if not _allReadable(subNames):
        # order of delta (1, 2, 3, 4, 5)
        deltaOrder = [[1], [2], [3], [4], [5]]

        # train_id_sub_X.txt     : id             of training data with delta X
        # train_input_sub_X.txt  : input  (stop)  of training data with delta X
        # train_output_sub_X.txt : output (start) of training data with delta X
        # test_id_sub_X.txt      : id             of test data with delta X
        # test_input_sub_X.txt   : input  (stop)  of test data with delta X
        # (the second argument is [1] for the id files and [0] for the
        #  others: delta is column 1 of the id files and column 0 elsewhere)
        RD.splitArray('train_id.txt', [1], deltaOrder, True)
        RD.splitArray('train_input.txt', [0], deltaOrder, True)
        RD.splitArray('train_output.txt', [0], deltaOrder, True)
        RD.splitArray('test_id.txt', [1], deltaOrder, True)
        RD.splitArray('test_input.txt', [0], deltaOrder, True)
示例#17
0
    # meta info
    # file names of training input/output and test input/output
    # (TEO is a one-element list, unlike the other file names)
    TRI = 'train_input.txt'
    TRO = 'train_output.txt'
    TEI = 'test_input.txt'
    TEO = ['test_output.txt']

    TE_real = None
    TE_report = 'report_test.txt'
    VAL_rate = 0.0
    VAL_report = 'report_val.txt'
    modelConfig = 'model_config.txt'

    # load array
    # all three files are read tab-separated with UTF8=False and type_='f'
    print('loading training input...')
    TRI_array = RD.loadArray(TRI, '\t', UTF8=False, type_='f')

    print('loading training output...')
    TRO_array = RD.loadArray(TRO, '\t', UTF8=False, type_='f')

    print('loading test input...')
    TEI_array = RD.loadArray(TEI, '\t', UTF8=False, type_='f')

    # user data
    # read interactively; epoch and printed must be entered as integers
    deviceName = input('device name (for example, cpu:0 or gpu:0)')
    epoch = int(input('epoch'))
    printed = int(input('printed? (0 -> do not print)'))

    # print mode
    # VAL_rate is fixed to 0.0 above, so this branch is not taken as written
    if VAL_rate > 0.0:
        print('VALIDATION mode')
示例#18
0
import used_model

if __name__ == '__main__':

    import warnings
    # NOTE(review): filterwarnings() inserts each new filter at the front of
    # the filter list, so the later 'always' filter is matched before the
    # 'ignore' one and warnings are shown, not suppressed - confirm which of
    # the two was intended and drop the other.
    warnings.filterwarnings('ignore')
    warnings.filterwarnings('always')

    # training input and test input:
    # NORMALIZED using avg and stddev of each training input column

    # meta info
    TE_real = None
    TE_report = 'report_test.txt'
    # validation rate: first value of the first row of val_rate.txt
    VAL_rate = float(RD.loadArray('val_rate.txt')[0][0])
    VAL_report = 'report_val.txt'
    modelConfig = 'model_config.txt'
    # selects between the augmented and the plain training files below
    augmented = False

    if augmented == True:
        TRI = 'train_input_augmented.txt'
        TRO = 'train_output_augmented.txt'
    else:
        TRI = 'train_input.txt'
        TRO = 'train_output.txt'
    TEI = 'test_input.txt'
    TEO = 'test_predict.txt'

    # user data
    deviceName = input('device name (for example, cpu:0 or gpu:0)')
示例#19
0
# open sample_submission.csv refer to
# https://stackoverflow.com/questions/53410490/i-am-getting-an-error-as-name-while-opening-csv-file-in-excel-2016

import numpy as np
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD

if __name__ == '__main__':

    # final result
    # text of the submission, starting with the header line 'Id,Votes'
    finalResult = 'Id,Votes\n'

    sampleSub = RD.loadArray('sample_submission.csv', ',')

    # hard-coded settings
    # NOTE(review): 'results' is presumably the number of predicted rows and
    # 'times' the number of prediction runs to combine - confirm against the
    # rest of the script, which is not visible here.
    results = 22956
    times = 16
    algorithm = 'deepLearning'

    # avg and std for votes
    avgs_and_stds = RD.loadArray('train_output_avg_and_std.txt')
    print(avgs_and_stds)

    # sum of prediction of useful votes for each review
    final_sum = []

    # read file
    # one pass per run index 0 .. times-1
    for count in range(times):

        print('count = ' + str(count))
示例#20
0
def _removeColumn(col, errorPrefix, fcols, originalFcols, df_array):
    """Drop column *col* from the name list and from the data array.

    Both removals are best-effort: a failure is printed with *errorPrefix*
    and processing continues.  Returns the (possibly reduced) data array.
    """
    try:
        fcols.remove(col)
    except Exception:
        print(errorPrefix + ' (0) : ' + str(col))
    try:
        # NOTE(review): the position is looked up in the ORIGINAL column
        # order although df_array may already have lost columns; presumably
        # getIndex returns a list position - confirm the result is still
        # right when more than one column is removed.
        df_array = np.delete(df_array, getIndex(col, originalFcols), 1)
    except Exception:
        print(errorPrefix + ' (1) : ' + str(col))
    return df_array


def makeDataFrame(fn, validExcept, rows, ftype, fcols, isTrain, target,
                  exceptCols, useLog, logConstant):
    """Load a data file and return it as a DataFrame of the columns to use.

    Args:
        fn: name of the file to load.
        validExcept: column names to remove from a 'txt' file before use
            (used in training but not in validation), or None.
        rows: positions of the rows to keep (passed to ``.iloc``), or None
            to keep all rows.
        ftype: 'json', 'csv' or 'txt'.
        fcols: column names of a 'txt' file, or None to generate
            'col0', 'col1', ...  NOTE: names removed for a 'txt' file are
            removed from this list in place.
        isTrain: True for training data (the target column is kept and
            reported), False for test data (the target column is removed
            from a 'txt' file).
        target: name of the target column.
        exceptCols: column names to leave out.
        useLog: if True, replace every value x of the non-target columns
            whose name does not start with 'CT_' by
            ``log2(max(0, x) + logConstant)``.
        logConstant: constant added before taking the logarithm.

    Returns:
        Tuple ``(dataSetDF, targetCol)``.  ``targetCol`` is the position of
        the target column within ``dataSetDF`` when it was found in training
        mode; otherwise it is ``target`` (training mode) or -1.

    Raises:
        ValueError: if ``ftype`` is not one of 'json', 'csv', 'txt'.
    """
    print('')
    print('+============================+')
    print('|  Function : makeDataFrame  |')
    print('+============================+')

    # the flags keep the exact comparison semantics of '== True' / '== False'
    trainMode = (isTrain == True)
    testMode = (isTrain == False)

    # copy fcols (original fcols); column positions are looked up in it
    originalFcols = [] if fcols is None else list(fcols)

    # open and show plt data
    if ftype == 'json':  # type is 'json'
        with open(fn, 'r') as jf:
            df_loaded = json.load(jf)
        df_data = pd.DataFrame(df_loaded)

        # NOT TESTED FOR THIS CASE - SO THERE MAY BE SOME ERRORS

    elif ftype == 'csv':  # type is 'csv'
        df_data = pd.read_csv(fn)

        # NOT TESTED FOR THIS CASE - SO THERE MAY BE SOME ERRORS

    elif ftype == 'txt':  # type is 'txt'
        df_array = RD.loadArray(fn)

        print('\n<<< [before] fcols >>>')
        print(fcols)
        print('\n<<< [before] data array [0:5] >>>')
        print(df_array[:5])

        # remove columns indexed by elements of validExcept
        # (used in training but not used in validation)
        if validExcept is not None:
            for validExceptCol in validExcept:
                df_array = _removeColumn(validExceptCol,
                                         'validExcept remove error', fcols,
                                         originalFcols, df_array)

        # remove target column for test data
        if testMode:
            df_array = _removeColumn(target, 'target      remove error',
                                     fcols, originalFcols, df_array)

        # remove except column from both fcols and df_array
        for exceptCol in exceptCols:
            df_array = _removeColumn(exceptCol, 'exceptCol   remove error',
                                     fcols, originalFcols, df_array)

        print('\n<<< [after] fcols >>>')
        print(fcols)
        print('\n<<< [after] data array [0:5] >>>')
        print(df_array[:5])

        # make dataframe using df_array
        if fcols is not None:
            df_data = pd.DataFrame(data=df_array, columns=fcols)
        else:
            cols = ['col' + str(i) for i in range(len(df_array[0]))]
            df_data = pd.DataFrame(data=df_array, columns=cols)

    else:
        # fail here with a clear message instead of a NameError further down
        raise ValueError("ftype must be 'json', 'csv' or 'txt': " + str(ftype))

    targetCol = -1  # index of target column

    # extract column name before change into np.array
    dataCols = np.array(df_data.columns)

    # change df_data into np.array and back, keeping the column names
    df_data = pd.DataFrame(df_data.to_numpy(), columns=dataCols)
    print('\n<<< [0] df_data.shape >>>')
    print('columns : ' + str(df_data.columns))
    print('shape   : ' + str(df_data.shape))

    # create data
    if trainMode: targetCol = target  # target column name

    dataPart = []  # data columns
    extractCols = []  # extracted columns = data columns + target column
    extractColInfo = []  # info about extracted columns ('data' / 'target')

    for col in dataCols:

        # except for these columns
        if col in exceptCols: continue

        if trainMode and col == targetCol:
            # the target column (training mode only)
            extractCols.append(col)
            extractColInfo.append('target')

            # from now on targetCol is the position of the target column
            targetCol = len(extractCols) - 1
        else:
            # every other column is a data column
            dataPart.append(col)
            extractCols.append(col)
            extractColInfo.append('data')

    print('\n<<< [1] dataPart and extractCols >>>')
    for col in dataPart:
        print(col)
    print('')
    for col, info in zip(extractCols, extractColInfo):
        print(col + ' : ' + info)

    dataSetDF = df_data[extractCols]

    # change dataSetDF into float type (left as it is if not possible)
    try:
        dataSetDF = dataSetDF.astype(float)
    except Exception:
        pass

    # print dataFrame
    print('\n<<< [2] dataSetDF >>>')
    print(dataSetDF)

    # apply log for extractCols if useLog is true
    if useLog == True:

        # using log: x -> log2(x + logConstant)
        for col in extractCols:

            if col == target or col[:3] == 'CT_':
                continue  # except for target column and CT_ columns

            # cell by cell, so a value that cannot be converted raises here
            for i in range(len(dataSetDF)):
                dataSetDF.at[i, col] = math.log(
                    max(0, dataSetDF.at[i, col]) + logConstant, 2)

        print('\n<<< [2-3] dataSetDF log applied >>>')
        print(dataSetDF)

    # again, change dataSetDF into float type
    try:
        dataSetDF = dataSetDF.astype(float)
    except Exception:
        pass

    print('\n<<< [2-4] dataSetDF original >>>')
    print(dataSetDF)

    # remove rows not included in 'rows'
    if rows is not None:
        dataSetDF = dataSetDF.iloc[rows]

        print('\n<<< [2-5] dataSetDF after row extraction >>>')
        print(dataSetDF)

    print('')
    print('+========================+')
    print('|  Exit : makeDataFrame  |')
    print('+========================+')

    # return dataFrame
    return (dataSetDF, targetCol)
示例#21
0
def valid(fn, thresholdList, size, n, modelName, validRate, use_n_sub):
    """Validate a model by applying it `delta` times and report binary loss.

    The row IDs validated for delta = 1 are parsed from the validation report
    `fn`. For each of delta = 2..5 the same number of distinct rows is drawn
    at random. Every selected training input row is passed through the model
    `delta` times (inverse sigmoid applied after each pass), and the result is
    compared with the stored training output for every threshold.

    A global ID encodes both values as
    (delta - 1) * 625000 + (row index inside that delta's file).

    Args:
        fn: path of the validation report. Every line except the last 9 must
            start with '[<integer id>]'. The part of `fn` before its first
            '.' is used as the stem of the two files written.
        thresholdList: list of thresholds used for the binary loss.
        size: unused; kept for interface compatibility.
        n: unused; kept for interface compatibility.
        modelName: model name handed to DL.getTestResult.
        validRate: unused; kept for interface compatibility.
        use_n_sub: when True the 'train_*_n_sub_<k>.txt' files are loaded,
            otherwise the 'train_*_sub_<k>.txt' files.

    Side effects:
        Writes '<stem>_repeatDelta.txt' (average loss per threshold) and
        '<stem>_repeatDelta_compare.txt' (one line per validated row with the
        validation output and the training output).
    """

    # rows per delta:
    #     delta 1 = 000000 ~ 624999
    #     delta 2 = 625000 ~ 1.249M
    #     delta 3 = 1.250M ~ 1.874M
    #     delta 4 = 1.875M ~ 2.499M
    #     delta 5 = 2.500M ~ 3.124M (total 3,125,000 rows)
    totalCount = 25 * 25 * 1000

    # the last 9 lines of the report are not parsed as IDs
    # (presumably a summary footer -- confirm against the report writer)
    footerLines = 9

    ### read IDs to validate (delta = 1), from the validation report
    with open(fn, 'r') as report:
        reportLines = report.readlines()

    validCount = max(0, len(reportLines) - footerLines)

    # each ID line looks like '[<id>] ...'
    idToValidate = [int(line.split(']')[0][1:])
                    for line in reportLines[:validCount]]

    # load ID and training input/output files
    # NOTE(review): the ID arrays are not used below; they are still loaded so
    # that the progress output and the failure on a missing file are unchanged.
    print('<00> loading ID files...')
    print('use_n_sub:', use_n_sub)
    for d in range(5):
        RD.loadArray('train_id_sub_' + str(d) + '.txt')
        print('id ' + str(d) + ' finished')

    print('<01> loading training input/output files...')
    if use_n_sub == True:
        tag = 'n_sub'
    else:
        tag = 'sub'

    # index d holds the data for delta = d + 1
    trainInputData = []
    for d in range(5):
        trainInputData.append(
            RD.loadArray('train_input_' + tag + '_' + str(d) + '.txt'))
        print(tag + ' ' + str(d) + ' input finished')

    trainOutputData = []
    for d in range(5):
        trainOutputData.append(
            RD.loadArray('train_output_' + tag + '_' + str(d) + '.txt'))
        print(tag + ' ' + str(d) + ' output finished')

    # extract data to validate for delta = 1, using the IDs from the report
    print('<02> extracting data to validate, from training input when delta=1...')
    inputDataToValidate = [trainInputData[0][rowId] for rowId in idToValidate]

    # randomly select (validate count of when delta=1) rows for delta = 2 to 5
    print('<03> randomly select data to validate, from training input when delta=2~5...')
    for d in range(1, 5):

        # distinct row indices inside this delta's file; random.sample raises
        # ValueError instead of looping forever if more rows are requested
        # than exist
        for localId in random.sample(range(totalCount), validCount):
            idToValidate.append(d * totalCount + localId)
            inputDataToValidate.append(trainInputData[d][localId])

    ### validate (get output for validation input) using model of modelName
    print('<04> validate when delta=1~5...')
    inputDataToValidate = np.array(inputDataToValidate).astype('float')
    rows = len(inputDataToValidate)

    # validOutputs[i] corresponds to idToValidate[i] / inputDataToValidate[i]
    validOutputs = []

    for i in range(rows):
        if i % 1000 == 0: print(i, '/', rows)

        # delta is 1-based: rows of the first file need one model pass
        delta = idToValidate[i] // totalCount + 1

        # initialize valid output as (a copy of) the input data
        validOutput = copy.deepcopy(inputDataToValidate[i])

        # feed the output back into the model, `delta` times in total
        for _ in range(delta):
            validOutput = DL.getTestResult(modelName, validOutput, 0)

            # inverse sigmoid on every value of every output row
            for r in range(len(validOutput)):
                for c in range(len(validOutput[0])):
                    validOutput[r][c] = helper.invSigmoid(validOutput[r][c])

        validOutputs.append(validOutput)

    ### compute loss (binary), compared with the training output data
    print('<05> comparing the result with corresponding training output data...')
    avgLoss = []
    elementsInEachRow = len(validOutputs[0])

    for thr in thresholdList:
        print('threshold =', thr)

        # number of mismatching elements for this threshold
        sumLossForThr = 0

        for i in range(rows):
            delta = idToValidate[i] // totalCount + 1
            ID = idToValidate[i] % totalCount
            TO = trainOutputData[delta - 1][ID]

            for j in range(elementsInEachRow):
                if validOutputs[i][j] >= thr and TO[j] == 0:
                    sumLossForThr += 1
                elif validOutputs[i][j] < thr and TO[j] == 1:
                    sumLossForThr += 1

        # average loss over all compared elements
        avgLoss.append(sumLossForThr / (rows * elementsInEachRow))

    ### write validation report (fn = report.txt -> report_repeatDelta.txt)
    print('<06> writing validation report...')
    report_fn = fn.split('.')[0] + '_repeatDelta.txt'

    rfContent = ''
    for thr, loss in zip(thresholdList, avgLoss):
        rfContent += '[thr = ' + str(thr) + '] loss=' + str(loss) + '\n'

    with open(report_fn, 'w') as rf:
        rf.write(rfContent)

    ### write (validation output) + (actual training output)
    print('<07> writing validation and actual training output...')
    compare_fn = fn.split('.')[0] + '_repeatDelta_compare.txt'

    cfLines = []
    for i in range(rows):
        delta = idToValidate[i] // totalCount + 1
        ID = idToValidate[i] % totalCount

        # one line per validated row
        cfLines.append(str(delta) + '\t' + str(ID) + '\t' +
                       str(np.array(validOutputs[i])) + '|\t' +
                       str(np.array(trainOutputData[delta - 1][ID])) + '\n')

    with open(compare_fn, 'w') as cf:
        cf.write(''.join(cfLines))
示例#22
0
                i) + '.txt'  # file to make
            modelConfig = 'model_n_sub_' + str(i) + '.txt'  # file to make
            modelName = 'model_n_sub_' + str(i)  # model name

        else:  # do not use n-sub mode ( -> use normal mode)
            trainIName = 'train_input_sub_' + str(i) + '.txt'
            trainOName = 'train_output_sub_' + str(i) + '.txt'
            testIName = 'test_input_sub_' + str(i) + '.txt'
            testOName = 'test_output_sub_' + str(i) + '.txt'  # file to make
            testReport = 'test_report_sub_' + str(i) + '.txt'  # file to make
            validReport = 'valid_report_sub_' + str(i) + '.txt'  # file to make
            modelConfig = 'model_sub_' + str(i) + '.txt'  # file to make
            modelName = 'model_sub_' + str(i)  # model name

        # load arrays (no difference between normal and n-sub mode)
        train_id_list = RD.loadArray('train_id_sub_' + str(i) + '.txt')
        trainI_array = RD.loadArray(trainIName)
        trainO_array = RD.loadArray(trainOName)

        if validRate == 0.0:  # for test mode
            test_id_list = RD.loadArray('test_id_sub_' + str(i) + '.txt')
            testI_array = RD.loadArray(testIName)

        # print training and test array (consider n-sub mode)
        if verbose == True:
            for j in range(5):
                trainI_ = np.array(trainI_array)[j]
                trainO_ = np.array(trainO_array)[j]
                if validRate == 0.0:
                    testI_ = np.array(testI_array)[j]  # for test mode
示例#23
0
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD
import numpy as np

if __name__ == '__main__':

    # load the raw test output
    testResult = RD.loadArray('test_output.txt')

    # final result: one single-element row per input row, holding the first
    # value of that row converted to float and shifted by 8.0
    finalResult = [[float(row[0]) + 8.0] for row in testResult]

    # write the file to submit
    RD.saveArray('to_submit.txt', finalResult)
示例#24
0
import sys
sys.path.insert(0, '../../AI_BASE')
import readData as RD
import numpy as np

if __name__ == '__main__':

    # dataset dimensions
    train_rows = 300000
    test_rows = 200000
    input_cols_cat = 19
    input_cols_cont = 11

    result = ''

    # TRAIN: drop the first row and the first column; columns 1..30 are the
    # input, everything from column 31 on is the output
    train_data = RD.loadArray('train.csv', ',')
    train_table = np.array(train_data)
    train_input = train_table[1:, 1:31]
    train_output = train_table[1:, 31:]

    # AVG and STD for each column of the training input;
    # the categorical columns get the placeholder -1
    cont_avgs = [-1] * input_cols_cat
    cont_stds = [-1] * input_cols_cat

    # continuous columns follow the categorical ones
    for i in range(input_cols_cat, input_cols_cat + input_cols_cont):
        thisCol = train_input[:, i].astype(float)
示例#25
0

if __name__ == '__main__':

    files = 64

    # one entry per result file, each produced by getFinalResult
    finalResults = []

    # element-wise sum of the test results of all files
    sumTestResults = []

    for i in range(files):

        # first 9 columns of this file's test output, as float
        loaded = RD.loadArray('test_output_' + str(i) + '.txt')
        testResult = np.array(loaded)[:, :9].astype(float)

        # independent copy, so the running sum never aliases testResult
        snapshot = np.array(copy.deepcopy(list(testResult)))
        sumTestResults = snapshot if i == 0 else sumTestResults + snapshot

        finalResult = getFinalResult(testResult)
        finalResults.append(finalResult)

    # save the sum of test results
    RD.saveArray('sumTestResults.txt', sumTestResults)

    # write final result
    # USE THE RIGHTMOST COLUMN OF to_submit.txt AS FINAL RESULT
示例#26
0
def _firstRowById(table, count):
    """Map the ID in column 0 to its row, over the first `count` rows of `table`.

    When an ID occurs more than once, the first row wins.
    """
    lookup = {}
    for row in table[:count]:
        lookup.setdefault(row[0], row)
    return lookup


def writeFinalInput(trainTest, rows):
    """Join the review rows with user, business and check-in data and save them.

    Loads the four 'yelp_<trainTest>_set_*.txt' files and builds one output
    row per review (11 values):
        review columns 2..4, user columns 1..2, business columns 1..5,
        check-in column 1.
    Values from a table with no row for the review's user/business ID are
    replaced by 0.

    Each lookup uses a dictionary built once, instead of scanning the whole
    table for every review. A duplicated ID contributes its first row only,
    so every output row has the same width.

    Args:
        trainTest: 'training' or 'test'; selects the input files and the
            output file ('train_input.txt' / 'test_input.txt'). For any other
            value the result is built but nothing is saved.
        rows: row counts to use, indexed as
            [business, checkin, review, user].
    """

    business = RD.loadArray('yelp_' + str(trainTest) + '_set_business.txt')
    checkin = RD.loadArray('yelp_' + str(trainTest) + '_set_checkin.txt')
    review = RD.loadArray('yelp_' + str(trainTest) + '_set_review.txt')
    user = RD.loadArray('yelp_' + str(trainTest) + '_set_user.txt')

    # ID -> row lookups (assumes the ID cells are hashable, e.g. strings)
    userById = _firstRowById(user, rows[3])
    businessById = _firstRowById(business, rows[0])
    checkinById = _firstRowById(checkin, rows[1])

    finalInput = []

    for i in range(rows[2]):
        if i % 500 == 0: print('row : ' + str(i))

        reviewRow = review[i]

        # review info
        # columns : ['user_id', 'business_id', 'text', 'stars', 'date']
        thisRow = [reviewRow[j] for j in range(2, 5)]

        # user info
        # columns : ['user_id', 'review_count', 'average_stars']
        userRow = userById.get(reviewRow[0])
        if userRow is None:
            thisRow.extend([0] * 2)
        else:
            thisRow.extend(userRow[k] for k in range(1, 3))

        # business info
        # columns : ['business_id', 'review_count', 'longitude', 'stars', 'latitude', 'open']
        businessid = reviewRow[1]
        businessRow = businessById.get(businessid)
        if businessRow is None:
            thisRow.extend([0] * 5)
        else:
            thisRow.extend(businessRow[k] for k in range(1, 6))

        # check-in info
        # columns : ['business_id', 'checkin_info']
        checkinRow = checkinById.get(businessid)
        if checkinRow is None:
            thisRow.append(0)
        else:
            thisRow.append(checkinRow[1])

        finalInput.append(thisRow)

    if trainTest == 'training':
        RD.saveArray('train_input.txt', finalInput)
    elif trainTest == 'test':
        RD.saveArray('test_input.txt', finalInput)
示例#27
0
def _wrappedWindows(flatGrid, size, pad, windowLen):
    """Return every window of a wrap-padded size*size grid, in row-major order.

    `flatGrid` is reshaped to (size, size) and padded by `pad` cells on every
    side with wrap-around. For each of the size*size cells, the
    (2*pad+1) x (2*pad+1) window around it is flattened to `windowLen` values.
    """
    padded = np.pad(
        np.array(flatGrid).reshape(size, size), ((pad, pad), (pad, pad)),
        'wrap')
    side = 2 * pad + 1

    return [
        list(padded[j:j + side, k:k + side].reshape(windowLen))
        for j in range(size) for k in range(size)
    ]


def makeData(delta, n, n_, size, limitLen, writeTestInput):
    """Split each size*size board into per-cell windows and save them.

    Reads 'train_input_sub_<delta-1>.txt', 'train_output_sub_<delta-1>.txt'
    and 'test_input_sub_<delta-1>.txt'. For every cell of every board it
    emits the flattened window around that cell (boards wrap around at the
    edges), then writes 'train_input_n_sub_<delta-1>.txt',
    'train_output_n_sub_<delta-1>.txt' and, if requested,
    'test_input_n_sub_<delta-1>.txt'.

    The float conversion is done once before the loops rather than on every
    iteration. Only the first `limitLen` training rows are converted, since
    later rows are never used.

    Args:
        delta: 1-based delta; selects the '<delta-1>' files.
        n: window side length for training/test input (n*n values per row).
        n_: window side length for training output (n_*n_ values per row).
        size: side length of the square board stored in each input row.
        limitLen: maximum number of training rows to process.
        writeTestInput: when True the test input is reshaped and saved too.
    """

    # window size
    ws = int((n - 1) / 2)  # for training/test input
    ws_ = int((n_ - 1) / 2)  # for training/test output

    # read data
    trainInput = RD.loadArray('train_input_sub_' + str(delta - 1) + '.txt')
    trainOutput = RD.loadArray('train_output_sub_' + str(delta - 1) + '.txt')
    testInput = RD.loadArray('test_input_sub_' + str(delta - 1) + '.txt')

    trainLen = min(len(trainInput), limitLen)
    testLen = len(testInput)

    # numeric versions of the rows that are actually used
    trainInput = np.array(trainInput[:trainLen]).astype('float')
    trainOutput = np.array(trainOutput[:trainLen]).astype('float')

    # input / output data to make
    trainInputData = []
    trainOutputData = []

    # reshape training data
    for i in range(trainLen):
        if i % 10 == 0:
            print('makeData (training) : ' + str(i) + ' / ' + str(trainLen))

        trainInputData.extend(_wrappedWindows(trainInput[i], size, ws, n * n))
        trainOutputData.extend(
            _wrappedWindows(trainOutput[i], size, ws_, n_ * n_))

    # reshape test data
    if writeTestInput == True:
        testInputData = []
        testInput = np.array(testInput).astype('float')

        for i in range(testLen):
            if i % 10 == 0:
                print('makeData (test) : ' + str(i) + ' / ' + str(testLen))

            testInputData.extend(
                _wrappedWindows(testInput[i], size, ws, n * n))

    # save as file
    # [ADDED] saveSize=10000
    RD.saveArray('train_input_n_sub_' + str(delta - 1) + '.txt',
                 trainInputData,
                 saveSize=10000)
    RD.saveArray('train_output_n_sub_' + str(delta - 1) + '.txt',
                 trainOutputData,
                 saveSize=10000)
    if writeTestInput == True:
        RD.saveArray('test_input_n_sub_' + str(delta - 1) + '.txt',
                     testInputData)
示例#28
0
        else:
            _ = open('final_input.txt', 'r')
            _.close()

            _ = open('final_output.txt', 'r')
            _.close()

            _ = open('final_input_test.txt', 'r')
            _.close()

    except:

        # using PCA
        if usePCA == True:
            team_info_pca = RD.loadArray('team_info_pca.txt', '\t')

        # TRAINING INPUT : using raw_result

        final_input = []
        final_output = []

        for i in range(len(raw_result)):

            team0 = raw_result[i][0]
            team1 = raw_result[i][1]
            season = raw_result[i][4]

            team0_ = team0 - N_start
            team1_ = team1 - N_start
示例#29
0
    # TRAIN AND TEST USING LIGHTGBM

    # execute deep learning
    TRI = 'final_input.txt'
    TRO = 'final_output.txt'
    TEI = 'final_input_test.txt'
    TEO = ['final_output_test.txt']

    TE_real = None
    TE_report = 'report_test.txt'
    VAL_rate = 0.0
    VAL_report = 'report_val.txt'

    # load array
    TRI_array = RD.loadArray(TRI, '\t')
    TRO_array = RD.loadArray(TRO, '\t')
    TEI_array = RD.loadArray(TEI, '\t')

    # create Pandas DataFrame
    # tv_input  : test / validation input
    # tv_output : test / validation output
    (train_input, train_output, tv_input,
     tv_output) = create_dataframe(TRI_array, TRO_array, TEI_array, TEO,
                                   TE_report, VAL_rate, VAL_report)

    # convert to lightgbm dataset
    train_ds = lgb.Dataset(train_input, label=train_output)
    test_ds = lgb.Dataset(tv_input, label=tv_output)

    # set parameters
示例#30
0
    TEI = 'test_input.txt'

    # merge train input and output
    try:
        _ = open(TRIO, 'r')
        _.close()
    except:
        mergeTrain(TRI, TRO, TRIO)

    # K-means clustering
    finalResult = None
    trainName = 'train_IO.txt'
    testName = 'test_input.txt'
    ftype = 'txt'

    TRIO_array = RD.loadArray(TRIO, '\t')
    TEI_array = RD.loadArray(TEI, '\t')
    
    dfTrain = pd.DataFrame(TRIO_array)
    dfTest = pd.DataFrame(TEI_array)
    
    dfTestWeight = None
    caseWeight = False
    targetCol = 14
    targetIndex = 14

    k = 100
    useAverage = True

    # execute algorithm
    AIBASE_KNN.kNN(dfTrain, dfTest, dfTestWeight, caseWeight,