示例#1
0
    def __init__(self, filename, separator = None):
        """Read a rating database file and build the user/item matrix.

        Each line of the database file has to look like this:
            UserID<separator>ItemID<separator>Rating
        If a line holds only a UserID and an ItemID, the rating is set to 1.
        Fields after the rating are ignored. If the rating is omitted but
        something else follows the ItemID, that field is taken as the
        rating (a non-numeric value there raises ValueError).
        Blank lines are skipped and are not counted as transactions.

        Args:
            filename: Path of the database file to read.
            separator: String of candidate delimiter characters handed to
                csv.Sniffer; None lets the sniffer guess the delimiter.

        Attributes set:
            R: dict mapping internal user id -> set of (item id, rating).
            uidDict, iidDict: helper.idOrigDict instances; their add()
                turns an original id from the file into the id used as
                key of R and as matrix index.
            dbfile: The opened database file.
            numberOfTransactions: Number of non-blank lines read.
            matrix: np.matrix with (getMaxUid() + 1) rows and
                (getMaxIid() + 1) columns, holding the rating of each
                (user, item) pair and 0 elsewhere.
        """
        self.R = {}
        self.uidDict = helper.idOrigDict()
        self.iidDict = helper.idOrigDict()
        # NOTE(review): the handle stays open on the instance; it is not
        # closed here because other methods may still read from it -- confirm.
        self.dbfile = open(filename, "r")
        self.numberOfTransactions = 0

        # Detect the CSV dialect from the first 1024 characters, then rewind
        # so the reader starts at the first line again.
        dialect = csv.Sniffer().sniff(self.dbfile.read(1024),
                delimiters = separator)
        self.dbfile.seek(0)
        csvReader = csv.reader(self.dbfile, dialect)

        print("Start reading the database.")
        for split in csvReader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing with an IndexError on split[0].
            if not split:
                continue

            self.numberOfTransactions += 1
            origUid = split[0]
            origIid = split[1]

            # A missing third column means "rated", i.e. a rating of 1.
            try:
                rating = int(float(split[2]))
            except IndexError:
                rating = 1

            if self.numberOfTransactions % 10000 == 0:
                print("%r Lines read." % self.numberOfTransactions)

            uid = self.uidDict.add(origUid)
            iid = self.iidDict.add(origIid)

            # Create the user's rating set on first sight, then add to it.
            self.R.setdefault(uid, set()).add((iid, rating))

        self.matrix = np.matrix(np.zeros((
            self.getMaxUid() + 1, self.getMaxIid() + 1)))
        # items() works on both Python 2 and 3 (iterkeys() is Python 2 only).
        for uid, ratings in self.R.items():
            for iid, rating in ratings:
                self.matrix[uid, iid] = rating
示例#2
0
    def __init__(self, filename, separator = None):
        """Read in a database file whose IDs are already integers.

        Each line of the database file has to look like this:
            UserID<separator>ItemID<separator>Rating
        If a line holds only a UserID and an ItemID, the rating is set to 1.
        Fields after the rating are ignored. If the rating is omitted but
        something else follows the ItemID, that field is taken as the
        rating (only IndexError is caught, so a non-numeric value there
        raises ValueError).

        UserID and ItemID are converted with int() and used directly as
        keys; the largest ones seen are kept in maxUid and maxIid.

        Args:
            filename: Path of the database file to read.
            separator: String of candidate delimiter characters handed to
                csv.Sniffer; None lets the sniffer guess the delimiter.
        """
        # R maps a user id to the set of (item id, rating) tuples of that user.
        self.R = {}
        # NOTE(review): both id dictionaries are created but not filled in
        # the code visible here -- presumably used by other methods; confirm.
        self.uidDict = helper.idOrigDict()
        self.iidDict = helper.idOrigDict()
        # The file handle is stored on the instance and not closed here.
        self.dbfile = open(filename, 'r')
        self.numberOfTransactions = 0
        # Largest ids seen so far; both start at 0.
        self.maxUid = 0
        self.maxIid = 0
        # No matrix is built in the code visible here.
        self.matrix = None
        # Detect the CSV dialect from the first 1024 characters, then rewind
        # so the reader starts at the first line again.
        dialect = csv.Sniffer().sniff(self.dbfile.read(1024), 
                delimiters = separator)
        self.dbfile.seek(0)
        csvReader = csv.reader(self.dbfile, dialect)

        print("Start reading the database.")
        for split in csvReader:
            # Every row returned by the reader counts as one transaction.
            self.numberOfTransactions += 1
            uid = int(split[0])
            iid = int(split[1])

            # Track the largest user and item id.
            if uid > self.maxUid:
                self.maxUid = uid
            if iid > self.maxIid:
                self.maxIid = iid

            # A missing third column means a rating of 1; going through
            # float() first lets values such as "4.0" be truncated to 4.
            try:
                rating = int(float(split[2]))
            except IndexError:
                rating = 1

            # Progress report every 100000 rows.
            if self.numberOfTransactions % 100000 == 0:
                print("%r Lines read." % self.numberOfTransactions)

            # Add the rating to the user's set, creating the set the first
            # time this user id is seen.
            if uid in self.R:
                self.R[uid].add((iid, rating))
            else:
                self.R[uid] = {(iid, rating)}