Example #1
import csv
import os

# NOTE: removeHeaderRows and the validation module used below are project-local helpers, assumed to be importable alongside this function.

def inputFiles(trainingFile, testingFile):

    # we have two "header" rows before the data actually starts, which will throw off our csv parser
    trainingFile, dataDescriptionLine = removeHeaderRows(trainingFile)
    testingFile, testingDataDescriptionLine = removeHeaderRows(testingFile)


    # grab the dataDescription row and make it lowercase
    expectedRowLength = len( dataDescriptionLine )
    dataDescriptionRaw = [x.lower() for x in dataDescriptionLine]
    hasID, testHeaderValidationLength, hasCustomValidationSplit = validation.dataDescription( dataDescriptionRaw )

    # the user told us whether this is 'output regression' or 'output category'
    # we need to split out the problem type (regression, category, or multi-category), and leave only 'output'
    dataDescription = []
    for columnType in dataDescriptionRaw:
        if columnType.startswith('output'):
            dataDescription.append('output')
            # everything after 'output ' is the problem type (e.g. 'regression' or 'category')
            problemType = columnType[7:]
        elif columnType.startswith('groupby '):
            dataDescription.append(columnType[8:])
        else:
            dataDescription.append(columnType)


    testingDataDescription = [x.lower() for x in testingDataDescriptionLine]


    # we will break out separately the ID column, the output column, and then the rest of the data
    outputData = []
    idColumn = []
    validationSplitColumn = []
    outputColumn = []

    with open(trainingFile, newline='') as trainingInput:
        # detect the "dialect" of this type of csv file
        try:
            dialect = csv.Sniffer().sniff(trainingInput.read(1024))
        except csv.Error:
            # fall back to the default 'excel' dialect if sniffing fails
            dialect = 'excel'
        # rewind so the reader starts from the top of the file
        trainingInput.seek(0)
        trainingRows = csv.reader(trainingInput, dialect)

        rowCount = 0
        for row in trainingRows:
            # grab the header row and make it lowercase
            if rowCount == 0:
                validation.rowLength( row, expectedRowLength, rowCount )
                headerRow = [x.lower() for x in row]

            else:
                validation.rowLength( row, expectedRowLength, rowCount )
                trimmedRow = []
                if not hasID:
                    # while we won't be using these IDs, we do need to make sure our idColumn has the right number of rows, so we put synthetic ones in here
                    idColumn.append(rowCount + 8000000000)

                for idx, val in enumerate(row):
                    if dataDescription[idx] == 'id':
                        idColumn.append(val)
                    elif dataDescription[idx] == 'validation split':
                        validationSplitColumn.append(val)
                    elif dataDescription[idx] == 'output':
                        # TODO: add in some error handling to make sure everything in the outputColumn is the same type.
                        try:
                            outputColumn.append(float(val))
                        except ValueError:
                            outputColumn.append(val)
                    elif dataDescription[idx] == 'ignore':
                        # some columns contain data we do not want to use. It is easier to drop them here than to make the user open the dataset in another program just to delete a column.
                        pass
                    else:
                        trimmedRow.append(val)

                outputData.append(trimmedRow)
            # keep track of which row we are on for error logging purposes
            rowCount += 1

        # keep track of how long our training data set is so we can split back out again later
        trainingLength = len(outputData)


    # TODO TODO TODO: properly handle paring off the top line from our testing csv file

    with open(testingFile, newline='') as testingInput:
        # detect the dialect of the csv file
        try:
            dialect = csv.Sniffer().sniff(testingInput.read(1024))
        except csv.Error:
            dialect = 'excel'
        testingInput.seek(0)

        testingRows = csv.reader(testingInput, dialect)
        testingRowCount = 0


        # start missingOutputIndex at infinity so it can never match a real column index
        missingOutputIndex = float('inf')

        for row in testingRows:
            if testingRowCount == 0:
                testingHeader = [x.lower() for x in row]
                

                # check to make sure that with all the IGNOREs considered, we have the right number of columns
                colsValidated = validation.testingHeaderRow( row, expectedRowLength, headerRow )
                if not colsValidated:
                    # if not, assume that the missing column is the output column, and store that index position
                    missingOutputIndex = dataDescription.index('output')
                expectedTestingRowLength = len( row )
            else:
                # build up each row in the testing dataset
                validation.testingRowLength( row, expectedTestingRowLength, testingRowCount )
                trimmedRow = []
                for idx, val in enumerate(row):
                    if testingDataDescription[idx] == 'id':
                        idColumn.append(val)
                    elif testingDataDescription[idx] == 'output':
                        outputColumn.append(val)
                    elif testingDataDescription[idx] == 'ignore':
                        pass
                    else:
                        trimmedRow.append(val)
                # NOTE: we are appending both the training and the testing data into one dataset
                # this ensures we will be processing them consistently
                # if we treated them separately, it could cause an issue if we have a feature present in the testing data but not the training data, for example
                outputData.append(trimmedRow)
            testingRowCount += 1


    # clean up the header-stripped copies created by removeHeaderRows
    os.remove(trainingFile)
    os.remove(testingFile)


    try:
        idHeader = headerRow[dataDescription.index('id')]
    except ValueError:
        idHeader = testingHeader[testingDataDescription.index('id')]

    return dataDescription, headerRow, trainingLength, outputData, idColumn, outputColumn, idHeader, problemType, dataDescriptionRaw, hasCustomValidationSplit, validationSplitColumn
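
For context, here is a minimal sketch (with made-up file names and contents) of the two-header-row layout inputFiles expects, followed by a hypothetical call. The first row is the data-description row: anything other than the special tokens (id, output <problem type>, ignore, groupby <type>, validation split) is passed through as a normal feature column. Note that the function deletes both input files once it has read them.

# train.csv -- row 1 is the data-description row, row 2 is the column-name header:
#
#   id,continuous,output regression,ignore
#   house_id,sqft,price,notes
#   1,850,199000,old listing
#   2,1200,285000,
#
# test.csv follows the same layout, typically without the output column.

(dataDescription, headerRow, trainingLength, outputData, idColumn,
 outputColumn, idHeader, problemType, dataDescriptionRaw,
 hasCustomValidationSplit, validationSplitColumn) = inputFiles('train.csv', 'test.csv')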
Example #2
import csv

# NOTE: the validation module used below is a project-local helper, assumed to be importable here.

def datasets(X, joinFileName, XHeaderRow, dataDescription, args, groupByIndices, dateIndices):

    # TODO: read in and remove the first row. having two "header" rows appears to be throwing off the sniffer when we have multiple commas in a single column, even if they are quote-enclosed.
    # write all but the first row (or two, possibly) to the temp.csv file.
    # read in that temp.csv file later.
    # with open(joinFileName, 'rU') as f:
    #     with open(args['outputFolder'] + 'temp.csv','w+') as f1:
    #         f.next() # skip header line
    #         for line in f:
    #             f1.write(line)

    # 1. read in data in joinFileName
    joinDict = {}
    with open(joinFileName, newline='') as joinFile:
        # detect the "dialect" of this type of csv file
        try:
            dialect = csv.Sniffer().sniff(joinFile.read(2048))
        except csv.Error:
            dialect = "excel"
        # rewind even when sniffing fails, so the reader starts from the top of the file
        joinFile.seek(0)
        joinRows = csv.reader(joinFile, dialect)

        rowCount = 0
        joinedValsLength = -1

        for row in joinRows:
            if rowCount < 2:
                # grab the joinDataDescription row and the header row, and make them both lowercase
                if rowCount == 0:
                    expectedRowLength = len(row)
                    # 2. get joinDataDescription row from that file
                    joinDataDescription = [x.lower() for x in row]
                    validation.joinDataDescription(joinDataDescription)

                else:
                    validation.rowLength(row, expectedRowLength, rowCount)
                    # 3. get header row from that file
                    headerRow = [x.lower() for x in row]

                    joinOnIdColumn = False
                    # determine joinIndex
                    try:
                        # 4. see if we have an args['on'] property to join the files on
                        joinHeader = args["on"].lower()
                        joinIndex = headerRow.index(joinHeader)
                        xJoinIndex = XHeaderRow.index(joinHeader)
                    except (KeyError, ValueError):
                        try:
                            # see if our idColumn is what we're joining on (which seems likely to come up at some point)
                            joinIndex = headerRow.index(args["idHeader"])
                            joinOnIdColumn = True
                        except (KeyError, ValueError):
                            # 5. otherwise, join on a headerRow name that appears in both files (the last shared name wins)
                            for x in headerRow:
                                if x in XHeaderRow:
                                    joinHeader = x
                            joinIndex = headerRow.index(joinHeader)
                            xJoinIndex = XHeaderRow.index(joinHeader)
            else:
                validation.rowLength(row, expectedRowLength, rowCount)
                trimmedRow = []
                joinVal = row[joinIndex]

                for idx, val in enumerate(row):
                    if joinDataDescription[idx] != "id" and joinDataDescription[idx] != "ignore" and idx != joinIndex:
                        trimmedRow.append(val)
                joinDict[joinVal] = trimmedRow
                if len(trimmedRow) > joinedValsLength:
                    joinedValsLength = len(trimmedRow)

            # keep track of which row we are on for error logging purposes
            rowCount += 1

    newX = []
    # 5. join the files
    blankVals = [None] * joinedValsLength
    for rowIndex, row in enumerate(X):
        if joinOnIdColumn:
            try:
                # NOTE: idColumn is not defined in this function; it is assumed to come from the surrounding module
                joinID = idColumn[rowIndex]
                newVals = joinDict[joinID]
                newX.append(row + newVals)
            except (KeyError, IndexError):
                # append blank values so all rows still have the same number of columns
                newX.append(row + blankVals)
        else:
            try:
                joinID = row[xJoinIndex]
                newVals = joinDict[joinID]
                newX.append(row + newVals)
            except KeyError:
                # append blank values so all rows still have the same number of columns
                newX.append(row + blankVals)
        # free each original row as we go to keep memory usage down
        X[rowIndex] = None
        del row

        # just do it myself so we have more control and don't need to convert to dataFrames and back
        # read the join file into a dict
        # the keys for the dict will be the matching column
        # ignore any ignore columns
        # remove any ignored columns from the header row and dataDescription row
        # iterate through X. for each row:
        # append all the values in the joinFile dict for that id

    # append header rows
    for idx, name in enumerate(headerRow):
        if joinDataDescription[idx] != "id" and joinDataDescription[idx] != "ignore" and idx != joinIndex:
            XHeaderRow.append(name)

    # append dataDescription rows, and our groupByIndices and dateIndices
    originalDataDescriptionLength = len(dataDescription)
    appendedCount = 0
    for idx, name in enumerate(joinDataDescription):
        if name != "id" and name != "ignore" and idx != joinIndex:
            if name[0:7] == "groupby":
                # append only the non groupby part of this name
                dataDescription.append(name[8:])
                # index the new column by its position in dataDescription, not its position in the join file (id/ignore/join columns are skipped)
                groupByIndices.append(appendedCount + originalDataDescriptionLength)
            elif name == "date":
                dataDescription.append(name)
                dateIndices.append(appendedCount + originalDataDescriptionLength)
            else:
                dataDescription.append(name)
            appendedCount += 1

    del X
    return newX, dataDescription, XHeaderRow, groupByIndices, dateIndices
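
The core technique above is a dict-based left join: the join file is read into a dict keyed on the join column, and each row of X is extended with the matching values, or with blanks when there is no match. A self-contained toy sketch of that idea, with all names and data made up for illustration:

# toy main dataset; column 0 is the join column
X = [['store_1', '2016-01-02', 10],
     ['store_2', '2016-01-02', 12],
     ['store_9', '2016-01-02', 7]]   # store_9 has no match in the join file

# the join file, already read into a dict keyed on the join column,
# with id/ignore/join columns stripped out (as joinDict is built above)
joinDict = {'store_1': ['urban', 'CA'],
            'store_2': ['rural', 'OR']}

joinedValsLength = 2
blankVals = [None] * joinedValsLength

newX = []
for row in X:
    try:
        newX.append(row + joinDict[row[0]])
    except KeyError:
        # no match: pad with blanks so every row keeps the same width
        newX.append(row + blankVals)

# newX[2] == ['store_9', '2016-01-02', 7, None, None]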