def randomGenerateDataSet(ratingFile, trainRatio=0.75):
    """Randomly split rating rows into 'RDTrain.dat' and 'RDTest.dat'.

    Every row is assigned independently: a uniform draw from [0, 1) that is
    greater than ``trainRatio`` sends the row to the test file, any other
    draw sends it to the training file. Rows are written unchanged.

    Args:
        ratingFile: Iterable yielding one rating row per iteration.
        trainRatio: Threshold applied to the random draw. The default of
            0.75 reproduces the previously hard-coded split.
    """
    from contextlib import closing

    # Seed once per call; re-seeding for every row added no randomness and
    # repeated the same work on each iteration.
    random.seed()

    # closing() guarantees both outputs are closed (and therefore flushed)
    # even if iteration or a write fails.
    # NOTE(review): assumes utils.createWriteFile returns a file-like object
    # exposing close() -- confirm against utils.
    with closing(utils.createWriteFile('RDTrain.dat')) as trainSet, \
            closing(utils.createWriteFile('RDTest.dat')) as testSet:
        for row in ratingFile:
            if random.random() > trainRatio:
                testSet.write(row)
            else:
                trainSet.write(row)
def splitDatasetWithUsers(splitRatio, ratingFile, dataset, savePrefix):
    """Split ratings into train/test files so each user lands in one file.

    Rows are grouped by user ID, the users are shuffled, and the first
    ``len(users) * splitRatio`` users of the shuffled order have all their
    rows written to ``savePrefix + 'splitedUserTrain.dat'``; the remaining
    users go to ``savePrefix + 'splitedUserTest.dat'``.

    Three lines are printed: the number of distinct users, then
    "<train users> <test users>", then the number of users that have ten
    or fewer rows.

    Args:
        splitRatio: Fraction of users assigned to the training file.
        ratingFile: Iterable yielding one rating row per iteration.
        dataset: One of 'eachMovie6', 'eachMovie' or 'movieLens'; selects
            the row parser.
        savePrefix: String prepended to both output file names.

    Raises:
        KeyError: If ``dataset`` is not one of the supported names.
    """
    from contextlib import closing

    parsers = {
        'eachMovie6': convertStringToRatingInfoForEachMovie,
        'eachMovie': convertStringToRatingInfoForEachMovie,
        'movieLens': convertStringToRatingInfo
    }
    # Resolve the parser once instead of on every row.
    parseRow = parsers[dataset]

    rowsByUser = {}
    for row in ratingFile:
        # `separator` is not a parameter of this function; it is resolved
        # from the enclosing (module) scope.
        fields = parseRow(row, separator)
        # Only the user ID (first field) is needed here. Indexing instead of
        # unpacking a fixed number of names keeps this working whether the
        # parser returns four or five fields.
        rowsByUser.setdefault(int(fields[0]), []).append(row)

    # list(...) so the shuffle works on both Python 2 and Python 3.
    userIDList = list(rowsByUser)
    random.shuffle(userIDList)
    splitedIndex = len(userIDList) * splitRatio

    trainUsers = 0
    smallUsers = 0
    # NOTE(review): assumes utils.createWriteFile returns a file-like object
    # exposing close() -- confirm against utils.
    with closing(utils.createWriteFile(
            savePrefix + 'splitedUserTrain.dat')) as trainSet, \
            closing(utils.createWriteFile(
                savePrefix + 'splitedUserTest.dat')) as testSet:
        print(len(userIDList))
        # Split on the position in the shuffled order. Comparing the user ID
        # itself against the split index made the shuffle irrelevant and
        # tied the achieved ratio to how user IDs happen to be numbered.
        for position, userID in enumerate(userIDList, 1):
            rows = rowsByUser[userID]
            if len(rows) <= 10:
                smallUsers += 1
            if position > splitedIndex:
                target = testSet
            else:
                target = trainSet
                trainUsers += 1
            for row in rows:
                target.write(row)

    # %-formatting yields the same text under Python 2 and Python 3.
    print('%s %s' % (trainUsers, len(userIDList) - trainUsers))
    print(smallUsers)
def splitDataWithACertainTime(timestampArray, ratingDict, separator, splitRatio):
    """Split ratings into 'train.dat' and 'test.dat' by position in a key list.

    ``timestampArray`` is walked in the order given (presumably sorted
    chronologically -- not enforced here). Rows whose zero-based position is
    at most ``splitRatio * len(timestampArray)`` are written to 'train.dat'
    and their user and item IDs are remembered. Every later row is written
    to 'test.dat' and checked against the remembered IDs.

    One line is printed with four counts for the test rows: total ratings,
    ratings by a user unseen in training, ratings of an item unseen in
    training, and ratings where both user and item are unseen.

    Args:
        timestampArray: Sequence of keys into ``ratingDict``.
        ratingDict: Mapping from each key to its raw rating row string.
        separator: Field separator passed to ``convertStringToRatingInfo``.
        splitRatio: Fraction of the sequence assigned to the training file.
    """
    from contextlib import closing

    total = len(timestampArray)
    # The original condition read the undefined name `split`; the threshold
    # is meant to come from the `splitRatio` parameter.
    trainLimit = splitRatio * total

    # Only membership is ever queried, so sets replace the count dicts.
    seenUsers = set()
    seenItems = set()
    ratings = 0
    ratingsWithNewUser = 0
    ratingsWithNewItem = 0
    newToNew = 0

    # NOTE(review): assumes utils.createWriteFile returns a file-like object
    # exposing close() -- confirm against utils.
    with closing(utils.createWriteFile('train.dat')) as trainFile, \
            closing(utils.createWriteFile('test.dat')) as testFile:
        for count, key in enumerate(timestampArray):
            string = ratingDict[key]
            userID, itemID, rating, timestamp = convertStringToRatingInfo(
                string, separator)

            if count <= trainLimit:
                seenUsers.add(userID)
                seenItems.add(itemID)
                trainFile.write(string)
                continue

            ratings += 1
            isNewUser = userID not in seenUsers
            isNewItem = itemID not in seenItems
            if isNewUser:
                ratingsWithNewUser += 1
            if isNewItem:
                ratingsWithNewItem += 1
                if isNewUser:
                    newToNew += 1
            testFile.write(string)

    # %-formatting yields the same text under Python 2 and Python 3.
    print('%s %s %s %s' % (
        ratings, ratingsWithNewUser, ratingsWithNewItem, newToNew))
import utils if __name__ == "__main__": inputFile = 'inputQ3' inputFile = 'C-small-2-attempt0.in' inputFile = 'C-large.in' #inputFile = "D-small-attempt0.in" #inputFile = "C-small-attempt0.in" #inputFile = "A-large.in.txt" #inputFile = "inputQ3" outputFile = "outputQ3" inputData = utils.createReadFile(inputFile) outputData = utils.createWriteFile(outputFile) cases = inputData.next() cases = cases.strip() print cases for index in range(1, int(cases) + 1): print "case ", index outputString = "Case #" + str(index) + ": " rowData = inputData.next() rowData = rowData.strip() strs = rowData.split(' ') N = int(strs[0]) K = int(strs[1]) array = {} large = N array[large] = 1 i = 0 while i < K: selectN = large