示例#1
0
    def generate_criteo_stream(self, file_name, feature_ids=None):
        if not os.path.exists(file_name):
            print(
                "Dataset:generate_criteo_stream\t[ERR]\tInput file not found at ",
                file_name,
                flush=True)
            sys.exit(0)

        instances = []

        instanceLines = []

        featureIDs = None
        unseenFeatures = False
        if feature_ids is not None:
            featureIDs = feature_ids
        else:
            featureIDs = collections.Counter()
            unseenFeatures = True

        f = None
        if file_name.endswith('.gz'):
            f = gzip.open(file_name, 'rt')
        else:
            f = open(file_name, 'r')

        line = None
        while True:
            line = next(f, None)
            if line is None:
                break

            line = line.strip()
            if len(line) > 0 and line[0] == 's':
                #Process the previous example
                if len(instanceLines) > 0:
                    numCandidates = len(
                        instanceLines
                    ) - 2  #One header, and one trailing empty line
                    header = instanceLines[0]
                    sharedFeatures = header[header.index('|') + 2:]
                    sharedCols = []
                    sharedVals = []

                    tokens = sharedFeatures.split(' ')
                    for token in tokens:
                        val = 1
                        featIDStr = token
                        if ':' in token:
                            tempToken = token.split(':', 1)
                            featIDStr = tempToken[0]
                            val = int(tempToken[1])

                        featID = None
                        if featIDStr in featureIDs:
                            featID = featureIDs[featIDStr]
                        elif unseenFeatures:
                            featID = len(featureIDs)
                            featureIDs[featIDStr] = featID
                        else:
                            continue

                        sharedCols.append(featID)
                        sharedVals.append(val)

                    selectedLine = instanceLines[1]
                    splits = selectedLine.split('|', 1)
                    labelInfo = splits[0]
                    toks = labelInfo.split(':', 2)
                    loss = float(toks[1])
                    propensity = float(toks[2])
                    chosenAction = 0

                    numSharedCols = len(sharedCols)
                    rows = []
                    cols = []
                    vals = []

                    for j in range(numCandidates):
                        featLine = None
                        if j == 0:
                            featLine = splits[1].split(' ')
                        else:
                            featLine = instanceLines[j + 1].split(' ')

                        rows.extend([j] * numSharedCols)
                        cols.extend(sharedCols)
                        vals.extend(sharedVals)

                        for k in range(1, len(featLine)):
                            val = 1
                            featIDStr = featLine[k]
                            if ':' in featIDStr:
                                tempToken = featIDStr.split(':', 1)
                                featIDStr = tempToken[0]
                                val = int(tempToken[1])

                            featID = None
                            if featIDStr in featureIDs:
                                featID = featureIDs[featIDStr]
                            elif unseenFeatures:
                                featID = len(featureIDs)
                                featureIDs[featIDStr] = featID
                            else:
                                continue

                            rows.append(j)
                            cols.append(featID)
                            vals.append(val)

                    currInstance = Instance.Brute(73989)
                    x = scipy.sparse.coo_matrix((vals, (rows, cols)),
                                                shape=(numCandidates, 73989),
                                                dtype=numpy.int)
                    x = x.tocsr()
                    currInstance.set(propensity, loss, x, 0)
                    instances.append(currInstance)

                    if len(instances) % 10000 == 0:
                        print('.', end='', flush=True)

                    instanceLines.clear()

            instanceLines.append(line)

        f.close()

        #Process the final example
        if len(instanceLines) > 0:
            numCandidates = len(
                instanceLines) - 2  #One header, and one trailing empty line
            header = instanceLines[0]
            sharedFeatures = header[header.index('|') + 2:]
            sharedCols = []
            sharedVals = []

            tokens = sharedFeatures.split(' ')
            for token in tokens:
                val = 1
                featIDStr = token
                if ':' in token:
                    tempToken = token.split(':', 1)
                    featIDStr = tempToken[0]
                    val = int(tempToken[1])

                featID = None
                if featIDStr in featureIDs:
                    featID = featureIDs[featIDStr]
                elif unseenFeatures:
                    featID = len(featureIDs)
                    featureIDs[featIDStr] = featID
                else:
                    continue

                sharedCols.append(featID)
                sharedVals.append(val)

            selectedLine = instanceLines[1]
            splits = selectedLine.split('|', 1)
            labelInfo = splits[0]
            toks = labelInfo.split(':', 2)
            loss = float(toks[1])
            propensity = float(toks[2])
            chosenAction = 0

            numSharedCols = len(sharedCols)
            rows = []
            cols = []
            vals = []

            for j in range(numCandidates):
                featLine = None
                if j == 0:
                    featLine = splits[1].split(' ')
                else:
                    featLine = instanceLines[j + 1].split(' ')

                rows.extend([j] * numSharedCols)
                cols.extend(sharedCols)
                vals.extend(sharedVals)

                for k in range(1, len(featLine)):
                    val = 1
                    featIDStr = featLine[k]
                    if ':' in featIDStr:
                        tempToken = featIDStr.split(':', 1)
                        featIDStr = tempToken[0]
                        val = int(tempToken[1])

                    featID = None
                    if featIDStr in featureIDs:
                        featID = featureIDs[featIDStr]
                    elif unseenFeatures:
                        featID = len(featureIDs)
                        featureIDs[featIDStr] = featID
                    else:
                        continue

                    rows.append(j)
                    cols.append(featID)
                    vals.append(val)

            currInstance = Instance.Brute(73989)
            x = scipy.sparse.coo_matrix((vals, (rows, cols)),
                                        shape=(numCandidates, 73989),
                                        dtype=numpy.int)
            x = x.tocsr()
            currInstance.set(propensity, loss, x, 0)
            instances.append(currInstance)

        return instances, featureIDs
示例#2
0
    def read_bandit_data(self, repo_dir, instance_type):
        if repo_dir.endswith('.txt'):
            fileName = repo_dir
        else:
            fileName = repo_dir + instance_type + '.txt'

        if not os.path.exists(fileName):
            print("Dataset:read_bandit_data\t[ERR]\tInput file not found at ",
                  fileName,
                  flush=True)
            sys.exit(0)

        f = open(fileName, 'r')
        allLines = f.readlines()
        f.close()

        header = allLines[0]
        commentIndex = header.find('#')
        if commentIndex >= 0:
            header = header[:commentIndex]
        tokens = header.split()
        numRecords = int(tokens[0])
        numFeatures = int(tokens[1])
        numLabels = int(tokens[2])

        currIndex = 0
        instanceList = []
        print("Dataset:read_bandit_data\t[LOG]\tFilename: %s Number of instances: %d" %\
                (fileName, numRecords), flush=True)

        for i in range(numRecords):
            currIndex += 1
            currentLine = allLines[currIndex]
            commentIndex = currentLine.find('#')
            if commentIndex >= 0:
                currentLine = currentLine[:commentIndex]
            tokens = currentLine.split()
            sampledAction = tokens[0]
            sampledLoss = float(tokens[1])
            sampledPropensity = float(tokens[2])

            newInstance = None
            sampledY = None
            instanceFeature = None
            if instance_type == 'MultiClass':
                newInstance = Instance.MultiClass(numLabels, numFeatures)
                sampledY = int(sampledAction)
                instanceFeature = scipy.sparse.csr_matrix(
                    (1, numFeatures), dtype=numpy.longdouble)
            elif instance_type == 'MultiLabel':
                newInstance = Instance.MultiLabel(numLabels, numFeatures)
                sampledY = numpy.zeros(numLabels, dtype=numpy.int)
                if sampledAction != '-1':
                    for eachLabel in sampledAction.split(','):
                        sampledY[int(eachLabel)] = 1
                instanceFeature = scipy.sparse.csr_matrix(
                    (1, numFeatures), dtype=numpy.longdouble)
            elif instance_type == 'Brute':
                newInstance = Instance.Brute(numFeatures)
                sampledY = int(sampledAction)

            if instance_type == 'MultiClass' or instance_type == 'MultiLabel':
                for j in range(3, len(tokens)):
                    idVal = tokens[j].split(':')
                    instanceFeature[0, int(idVal[0])] = float(idVal[1])

            elif instance_type == 'Brute':
                numActions = int(tokens[3])
                instanceFeature = scipy.sparse.csr_matrix(
                    (numActions, numFeatures), dtype=numpy.longdouble)
                for k in range(numActions):
                    currIndex += 1
                    currentAction = allLines[currIndex]
                    commentIndex = currentAction.find('#')
                    if commentIndex >= 0:
                        currentAction = currentAction[:commentIndex]
                    tokens = currentAction.split()
                    currentRow = int(tokens[0])
                    for j in range(1, len(tokens)):
                        idVal = tokens[j].split(':')
                        instanceFeature[currentRow,
                                        int(idVal[0])] = float(idVal[1])

            newInstance.set(sampledPropensity, sampledLoss, instanceFeature,
                            sampledY)
            instanceList.append(newInstance)
            if i % 20 == 0:
                print(".", flush=True, end='')
        print('')
        print("Dataset:read_bandit_data\t[LOG]\tFinished loading filename: %s Number of instances: %d" %\
                (fileName, numRecords), flush=True)
        return instanceList