Exemplo n.º 1
0
def readTestingData(testDataFname, fieldMaps):
    """
    Read the test data file and encode it as a feature matrix.

    Numeric columns are passed through unchanged; every text column is
    replaced by the code looked up in the matching entry of ``fieldMaps``.

    @param testDataFname: name of the testing data file
    @param fieldMaps: mapping from text field name ('sex', 'name', 'ticket',
        'cabin', 'embarked') to a lookup that is indexed with the raw text
        value (presumably the maps produced by readTrainingData -- confirm
        against the caller)
    @return: (DatasetPair with just X, sampleWeights)
    """

    fieldNames = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'weight']
    nameMap = {fieldName: idx for idx, fieldName in enumerate(fieldNames)}
    # Builtin int/float are used on purpose: np.int / np.float were plain
    # aliases of them and no longer exist in NumPy >= 1.24.
    dataTypes = np.array([int, '|S82', '|S82', float, int, int, '|S82', float, '|S82', '|S82', int])
    outputFieldNames = []

    # Materialize the rows: on Python 3 ``map`` would yield a one-shot
    # iterator, but the rows are scanned once per extracted column below.
    data = [list(row) for row in csv2dict(testDataFname, hasHeader=True, dataTypes=dataTypes,
                                          colIndices=None, defaultNumValue=float('nan'))]
    all_x = []

    # sample weights
    sampleWeights = np.array(getCol(data, [nameMap['weight']]))

    # attach numerical fields first
    for name in ['pclass', 'age', 'sibsp', 'parch', 'fare']:
        outputFieldNames.append(name)
        all_x.append(getCol(data, [nameMap[name]]))

    # attach text fields, translated through the supplied per-field maps
    for name in ['sex', 'name', 'ticket', 'cabin', 'embarked']:
        outputFieldNames.append(name)
        fieldMap = fieldMaps[name]
        all_x.append([fieldMap[v] for v in getCol(data, [nameMap[name]])])

    # Transpose the per-column lists into one row per sample. ``list`` is
    # required because np.array() cannot consume a Python 3 zip iterator.
    all_x = np.array(list(zip(*all_x)))

    return DatasetPair(all_x, fieldNames=outputFieldNames), sampleWeights
Exemplo n.º 2
0
def compareResultsToTrueResults(testResults, trueResFname):
    """
    Score a vector of predictions against the true results stored in a file.

    @param testResults: vector of test results
    @param trueResFname: name of the file containing a column of true results
    @return: the evaluation score, i.e. the value of evaluate(testResults, y_true)
    """

    # Builtin int replaces np.int, which was only an alias of it and has been
    # removed from NumPy >= 1.24.
    trueRows = csv2dict(trueResFname, hasHeader=True, dataTypes=[int])
    y_true = getCol(trueRows, 0)
    return evaluate(testResults, y_true)
Exemplo n.º 3
0
def readTrainingData(trainDataFname, testDataFname):
    """
    Read the training data file and encode it as features, labels and weights.

    Numeric columns are passed through unchanged. Each text column is
    integerized over the training values followed by the test values, so the
    returned maps cover values from both files.

    @param trainDataFname: name of the training data file
    @param testDataFname: name of the testing data file. used here only to get the string->index mapping of the text columns
    @return: allDataPair, fieldMaps, sampleWeights
    """

    fieldNames = ['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'weight']
    nameMap = {fieldName: idx for idx, fieldName in enumerate(fieldNames)}
    # Builtin int/float are used on purpose: np.int / np.float were plain
    # aliases of them and no longer exist in NumPy >= 1.24.
    dataTypes = np.array([int, int, '|S82', '|S82', float, int, int, '|S82', float, '|S82', '|S82', int])
    outputFieldNames = []

    # ------ read original data ---------
    # Materialize the rows: on Python 3 ``map`` would yield a one-shot
    # iterator with no len(), but the rows are scanned once per column and
    # counted below.
    data = [list(row) for row in csv2dict(trainDataFname, hasHeader=True, dataTypes=dataTypes,
                                          colIndices=None, defaultNumValue=float('nan'))]
    all_y = np.array(getCol(data, [nameMap['survived']]))
    all_x = []

    # sample weights
    sampleWeights = np.array(getCol(data, [nameMap['weight']]))

    # attach numerical fields first
    for name in ['pclass', 'age', 'sibsp', 'parch', 'fare']:
        outputFieldNames.append(name)
        all_x.append(getCol(data, [nameMap[name]]))

    # attach text fields
    testData = csv2dict(testDataFname, hasHeader=True)
    numTrainingDataPts = len(data)
    fieldMaps = {}
    for name in ['sex', 'name', 'ticket', 'cabin', 'embarked']:
        outputFieldNames.append(name)
        trainIdx = nameMap[name]
        # The test file is indexed one column to the left of the training
        # file (presumably because it has no leading 'survived' column).
        testValues = list(testData[:, trainIdx - 1])
        col, fieldMap = integerizeList(getCol(data, [trainIdx]) + testValues)
        # Only the leading training portion of the encoded column is kept.
        all_x.append(col[:numTrainingDataPts])
        fieldMaps[name] = fieldMap

    # Transpose the per-column lists into one row per sample and bundle them
    # with the labels. ``list`` is required because np.array() cannot consume
    # a Python 3 zip iterator.
    allDataPair = DatasetPair(np.array(list(zip(*all_x))), all_y, outputFieldNames)

    return allDataPair, fieldMaps, sampleWeights