示例#1
0
    def read_bandit_data(self, repo_dir, instance_type):
        """Load logged bandit-feedback instances from a text file.

        The input path is ``repo_dir`` itself when it ends with ``.txt``;
        otherwise it is ``repo_dir + instance_type + '.txt'``.

        File layout (anything after a ``#`` on a line is ignored):

        * Header line: ``numRecords numFeatures numLabels``.
        * ``MultiClass`` / ``MultiLabel`` record (one line each):
          ``action loss propensity [featureIdx:value ...]``.
          For ``MultiLabel`` the action is a comma-separated list of label
          indices, or ``-1`` for "no label set".
        * ``Brute`` record: a line ``action loss propensity numActions``
          followed by ``numActions`` lines ``row [featureIdx:value ...]``.

        Args:
            repo_dir: Path of a ``.txt`` file, or a prefix to which
                ``instance_type + '.txt'`` is appended.
            instance_type: One of ``'MultiClass'``, ``'MultiLabel'`` or
                ``'Brute'``.

        Returns:
            A list with one ``Instance.*`` object per record; ``set`` has
            been called on each with
            ``(propensity, loss, features, sampledY)``.

        Raises:
            SystemExit: If the input file does not exist (exit status 0,
                kept unchanged from the original behaviour).
            ValueError: If a record is read for an unsupported
                ``instance_type``.
        """
        def strip_comment(line):
            # Drop everything from the first '#' onwards.
            return line.split('#', 1)[0]

        def fill_row(matrix, row, featureTokens):
            # Each token has the form "<column index>:<value>".
            for token in featureTokens:
                idVal = token.split(':')
                matrix[row, int(idVal[0])] = float(idVal[1])

        if repo_dir.endswith('.txt'):
            fileName = repo_dir
        else:
            fileName = repo_dir + instance_type + '.txt'

        if not os.path.exists(fileName):
            print("Dataset:read_bandit_data\t[ERR]\tInput file not found at ",
                  fileName,
                  flush=True)
            # NOTE: exit status 0 is preserved so existing callers/scripts
            # observe the same behaviour as before.
            sys.exit(0)

        # Context manager guarantees the handle is closed even if reading fails.
        with open(fileName, 'r') as f:
            allLines = f.readlines()

        tokens = strip_comment(allLines[0]).split()
        numRecords = int(tokens[0])
        numFeatures = int(tokens[1])
        numLabels = int(tokens[2])

        # Index of the most recently consumed line; Brute records span
        # several lines, so this can advance faster than the record counter.
        currIndex = 0
        instanceList = []
        print("Dataset:read_bandit_data\t[LOG]\tFilename: %s Number of instances: %d" %\
                (fileName, numRecords), flush=True)

        for i in range(numRecords):
            currIndex += 1
            tokens = strip_comment(allLines[currIndex]).split()
            sampledAction = tokens[0]
            sampledLoss = float(tokens[1])
            sampledPropensity = float(tokens[2])

            if instance_type == 'MultiClass':
                newInstance = Instance.MultiClass(numLabels, numFeatures)
                sampledY = int(sampledAction)
                instanceFeature = scipy.sparse.csr_matrix(
                    (1, numFeatures), dtype=numpy.longdouble)
                fill_row(instanceFeature, 0, tokens[3:])
            elif instance_type == 'MultiLabel':
                newInstance = Instance.MultiLabel(numLabels, numFeatures)
                # Builtin int replaces the removed numpy.int alias (same dtype).
                sampledY = numpy.zeros(numLabels, dtype=int)
                if sampledAction != '-1':
                    for eachLabel in sampledAction.split(','):
                        sampledY[int(eachLabel)] = 1
                instanceFeature = scipy.sparse.csr_matrix(
                    (1, numFeatures), dtype=numpy.longdouble)
                fill_row(instanceFeature, 0, tokens[3:])
            elif instance_type == 'Brute':
                newInstance = Instance.Brute(numFeatures)
                sampledY = int(sampledAction)
                numActions = int(tokens[3])
                instanceFeature = scipy.sparse.csr_matrix(
                    (numActions, numFeatures), dtype=numpy.longdouble)
                for k in range(numActions):
                    currIndex += 1
                    actionTokens = strip_comment(allLines[currIndex]).split()
                    # First token is the row this action's features go into.
                    fill_row(instanceFeature, int(actionTokens[0]),
                             actionTokens[1:])
            else:
                # Previously this fell through to None.set(...) and failed
                # with an opaque AttributeError.
                raise ValueError(
                    "Dataset:read_bandit_data unsupported instance_type: %r"
                    % (instance_type,))

            newInstance.set(sampledPropensity, sampledLoss, instanceFeature,
                            sampledY)
            instanceList.append(newInstance)
            if i % 20 == 0:
                print(".", flush=True, end='')
        print('')
        print("Dataset:read_bandit_data\t[LOG]\tFinished loading filename: %s Number of instances: %d" %\
                (fileName, numRecords), flush=True)
        return instanceList