Example #1
def runEntry(entry, students, helpers, assignName, sourceSuffix, resultsSuffix, allowPartners):
	# create an empty PairResults object
	resultFilename = common.makeFilenameSafe(entry["sources"][0]) + resultsSuffix
	results = common.PairResults(assignName, resultFilename, helpers)

	# for each pair of students
	for i in range(len(students)):
		for j in range(i + 1):
			if i != j:
				# get the two students in this pair
				student1 = students[i]
				student2 = students[j]

				# get both file paths
				safeFilename = common.makeFilenameSafe(entry["sources"][0]) + sourceSuffix
				path1 = helpers.getPreprocessedPath(student1, assignName, safeFilename)
				path2 = helpers.getPreprocessedPath(student2, assignName, safeFilename)

				# make sure both paths exist
				if path1 != None and path2 != None:
					editDistance = runEditDistance(path1, path2)

					# save the pair result
					result = common.PairResult(student1, student2, editDistance)
					results.add(result)

	# flush results to disk
	results.finish()
	helpers.printf("Finished '{}/{}'!\n".format(assignName, entry["sources"][0]))
Example #2
def runAssignment(assignment, students, args, helpers):
	clusters = {}
	allowPartners = assignment.args["allowPartners"]

	# for each file
	files = assignment.args["files"]
	for filename in files:
		# find collisions
		for student in students:
			safeFilename = common.makeFilenameSafe(filename) + args["sourceSuffix"]
			studentText = helpers.readFromPreprocessed(student, assignment.name, safeFilename)

			if studentText != None:
				hashedText = hashText(studentText)

				if hashedText not in clusters:
					clusters[hashedText] = common.Cluster(allowPartners, filename, 100)

				member = common.Member(student, assignment.name, helpers)
				clusters[hashedText].add(member)

	# process the clusters
	clusterArray = []
	for key in clusters:
		clusterArray.append(clusters[key])

	# write the results for this assignment
	# clustersToStandardJSON will make sure that there are at least two people in the cluster
	common.clustersToStandardJSON(clusterArray, assignment.name, args["resultsSuffix"], helpers)

	# say we're done with this assignment
	helpers.printf("Finished assignment '{}'...\n".format(assignment.name))
Example #3
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    threshold = assignment.args["threshold"]
    above = args["above"]
    minThreshold = None
    if "minThreshold" in assignment.args:
        minThreshold = assignment.args["minThreshold"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath != None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects
        for pair in rawData.iterate():
            data.append(pair)

        # get the mean
        mean = getMean(data)

        # get the deviation
        deviation = getDeviation(data, mean)
        helpers.printf("{}/{}: mean {}, deviation {}\n".format(
            assignName, filename, mean, deviation))

        # filter out data
        filtered = filterData(data, mean, deviation, threshold, above,
                              minThreshold)

        # create the clusters
        clusters = createClusters(filtered, filename, assignName,
                                  allowPartners, helpers)

        # flush to disk
        common.clustersToStandardJSON(
            clusters, assignName,
            common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}', with {} results!\n".format(
            assignName, len(clusters)))
Example #4
def runEntry(filename, students, helpers, assignment, args, allowPartners):
	# get the data
	assignName = assignment.name
	sourceSuffix = args["sourceSuffix"]
	resultsSuffix = args["resultsSuffix"]
	threshold = assignment.args["threshold"]
	above = args["above"]
	minThreshold = None
	if "minThreshold" in assignment.args:
		minThreshold = assignment.args["minThreshold"]

	safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
	filepath = helpers.getProcessedPath(assignName, safeFilename)

	if filepath != None:
		rawData = common.PairResults(assignName, safeFilename, helpers)
		data = []

		# convert into python objects
		for pair in rawData.iterate():
			data.append(pair)

		# get the mean
		mean = getMean(data)

		# get the deviation
		deviation = getDeviation(data, mean)
		helpers.printf("{}/{}: mean {}, deviation {}\n".format(assignName, filename, mean, deviation))

		# filter out data
		filtered = filterData(data, mean, deviation, threshold, above, minThreshold)

		# create the clusters
		clusters = createClusters(filtered, filename, assignName, allowPartners, helpers)

		# flush to disk
		common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

		# all done!
		helpers.printf("Finished '{}', with {} results!\n".format(assignName, len(clusters)))
Example #5
def getStats(students, assign, filename, helpers):
    # gather student stats into an array
    studentDict = {}
    array = []

    for student in students:
        safeFilename = common.makeFilenameSafe(filename) + "stats.json"
        path = helpers.getPreprocessedPath(student, assign.name, safeFilename)
        if path != None:
            json = io.readJSON(path)
            studentDict[student] = json
            array.append(json)

    return (studentDict, array)
Example #6
def getStats(students, assign, filename, helpers):
	# gather student stats into an array
	studentDict = {}
	array = []

	for student in students:
		safeFilename = common.makeFilenameSafe(filename) + "stats.json"
		path = helpers.getPreprocessedPath(student, assign.name, safeFilename)
		if path != None:
			json = io.readJSON(path)
			studentDict[student] = json
			array.append(json)

	return (studentDict, array)
Example #7
def doAssignment(students, assign, helpers):
	helpers.printf("tokenizing '{}' in parellel...\n".format(assign.name))

	# for each student
	for student in students:
		# for each specified file
		files = assign.args["files"]
		for filename in files:
			path = helpers.getAssignmentPath(student, assign.name, filename)
			if path != None:
				# tokenize the file
				result = tokenizer.simple(path)

				# write the result
				safeFilename = common.makeFilenameSafe(filename) + "tokenized.txt"
				helpers.writeToPreprocessed(result, student, assign.name, safeFilename)

	# all done!
	helpers.printf("Finished '{}'!\n".format(assign.name))
Example #8
def run(students, assignments, args, helpers):
	# for each assignment
	for assign in assignments:
		# for each student
		for student in students:
			# for each specified file
			files = assign.args["files"]
			for filename in files:
				# read the raw text
				rawText = helpers.readFromAssignment(student, assign.name, filename)

				if rawText != None:
					# make a friendly filename for saving
					safeFilename = common.makeFilenameSafe(filename)

					# mangle it, write the mangled text
					mangle(rawText, student, assign.name, safeFilename, helpers)

	# all done
	return True
Example #9
def runFile(students, assign, helpers):
	helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

	# for each student
	for student in students:
		# for each specified file
		files = assign.args["files"]
		for filename in files:
			# get the path
			path = helpers.getAssignmentPath(student, assign.name, filename)

			if path != None:
				# get the identifiers
				text = tokenize(path)

				# write to a file
				safeFilename = common.makeFilenameSafe(filename) + "identifiers.txt"
				helpers.writeToPreprocessed(text, student, assign.name, safeFilename)

	# all done
	helpers.printf("Finished '{}'!\n".format(assign.name))
Example #10
File: lazy.py Project: kmack3/Algae
def run(students, assignments, args, helpers):
    # for each assignment
    for assign in assignments:
        # for each student
        for student in students:
            # for each specified file
            files = assign.args["files"]
            for filename in files:
                # read the raw text
                rawText = helpers.readFromAssignment(student, assign.name,
                                                     filename)

                if rawText != None:
                    # make a friendly filename for saving
                    safeFilename = common.makeFilenameSafe(filename)

                    # mangle it, write the mangled text
                    mangle(rawText, student, assign.name, safeFilename,
                           helpers)

    # all done
    return True
Example #11
def doAssignment(students, assign, helpers, compress):
	helpers.printf("processing '{}' in parellel...\n".format(assign.name))

	# for each student
	for student in students:
		# for each entry
		entries = assign.args["entries"]
		for entry in entries:
			entryPoint = entry["entryPoint"]
			path = helpers.getAssignmentPath(student, assign.name, entryPoint)
			sources = entry["sources"]

			if path != None:
				# tokenize the file
				result = tokenizer.mted(path, sources, compress)

				# write the result
				safeFilename = common.makeFilenameSafe(sources[0]) + "mted.txt"
				helpers.writeToPreprocessed(result, student, assign.name, safeFilename)

	# all done
	helpers.printf("Finished '{}'!\n".format(assign.name))
Example #12
def runAssignment(students, assign, helpers):
	helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

	# for each student
	for student in students:
		# for each specified file
		files = assign.args["files"]
		for filename in files:
			# get the path
			path = helpers.getAssignmentPath(student, assign.name, filename)

			if path != None:
				# get the stats
				stats = genStats(path, helpers)

				# write to a file
				safeFilename = common.makeFilenameSafe(filename) + "stats.json"
				text = io.getJSONString(stats, True)
				helpers.writeToPreprocessed(text, student, assign.name, safeFilename)

	# all done
	helpers.printf("Finished '{}'!\n".format(assign.name))
Example #13
def doAssignment(students, assign, helpers):
	helpers.printf("processing '{}' in parellel...\n".format(assign.name))

	# for each student
	for student in students:
		# for each entry
		entries = assign.args["entries"]
		for entry in entries:
			sources = entry["sources"]

			# try to read the text
			text = helpers.readFromAssignment(student, assign.name, sources[0])

			if text != None:
				# tokenize the file
				result = tokenize(text)

				# write the result
				safeFilename = common.makeFilenameSafe(sources[0]) + "vted.txt"
				helpers.writeToPreprocessed(result, student, assign.name, safeFilename)

	# all done
	helpers.printf("Finished '{}'!\n".format(assign.name))
Example #14
File: mted.py Project: kmack3/Algae
def doAssignment(students, assign, helpers, compress):
    helpers.printf("processing '{}' in parellel...\n".format(assign.name))

    # for each student
    for student in students:
        # for each entry
        entries = assign.args["entries"]
        for entry in entries:
            entryPoint = entry["entryPoint"]
            path = helpers.getAssignmentPath(student, assign.name, entryPoint)
            sources = entry["sources"]

            if path != None:
                # tokenize the file
                result = tokenizer.mted(path, sources, compress)

                # write the result
                safeFilename = common.makeFilenameSafe(sources[0]) + "mted.txt"
                helpers.writeToPreprocessed(result, student, assign.name,
                                            safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
Example #15
def runAssignment(assignment, students, args, helpers):
    clusters = {}
    allowPartners = assignment.args["allowPartners"]

    # for each file
    files = assignment.args["files"]
    for filename in files:
        # find collisions
        for student in students:
            safeFilename = common.makeFilenameSafe(
                filename) + args["sourceSuffix"]
            studentText = helpers.readFromPreprocessed(student,
                                                       assignment.name,
                                                       safeFilename)

            if studentText != None:
                hashedText = hashText(studentText)

                if hashedText not in clusters:
                    clusters[hashedText] = common.Cluster(
                        allowPartners, filename, 100)

                member = common.Member(student, assignment.name, helpers)
                clusters[hashedText].add(member)

    # process the clusters
    clusterArray = []
    for key in clusters:
        clusterArray.append(clusters[key])

    # write the results for this assignment
    # clustersToStandardJSON will make sure that there are at least two people in the cluster
    common.clustersToStandardJSON(clusterArray, assignment.name,
                                  args["resultsSuffix"], helpers)

    # say we're done with this assignment
    helpers.printf("Finished assignment '{}'...\n".format(assignment.name))
Example #16
File: mips.py Project: kmack3/Algae
def doAssignment(students, assign, helpers):
    helpers.printf("processing '{}' in parellel...\n".format(assign.name))

    # for each student
    for student in students:
        # for each entry
        entries = assign.args["entries"]
        for entry in entries:
            sources = entry["sources"]

            # try to read the text
            text = helpers.readFromAssignment(student, assign.name, sources[0])

            if text != None:
                # process the file
                result = processMIPS(text)

                # write the result
                safeFilename = common.makeFilenameSafe(sources[0]) + "mips.txt"
                helpers.writeToPreprocessed(result, student, assign.name,
                                            safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
Example #17
def runFile(students, assign, helpers):
    helpers.printf("Processing assignment '{}' in parellel...\n".format(
        assign.name))

    # for each student
    for student in students:
        # for each specified file
        files = assign.args["files"]
        for filename in files:
            # get the path
            path = helpers.getAssignmentPath(student, assign.name, filename)

            if path != None:
                # get the identifiers
                text = tokenize(path)

                # write to a file
                safeFilename = common.makeFilenameSafe(
                    filename) + "identifiers.txt"
                helpers.writeToPreprocessed(text, student, assign.name,
                                            safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
Example #18
def runAssignment(students, assign, helpers):
    helpers.printf("Processing assignment '{}' in parellel...\n".format(
        assign.name))

    # for each student
    for student in students:
        # for each specified file
        files = assign.args["files"]
        for filename in files:
            # get the path
            path = helpers.getAssignmentPath(student, assign.name, filename)

            if path != None:
                # get the stats
                stats = genStats(path, helpers)

                # write to a file
                safeFilename = common.makeFilenameSafe(filename) + "stats.json"
                text = io.getJSONString(stats, True)
                helpers.writeToPreprocessed(text, student, assign.name,
                                            safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
Example #19
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
	assignName = assignment.name
	files = assignment.args["files"]
	allowPartners = assignment.args["allowPartners"]
	threshold = args["threshold"] * float(len(students))
	sourceSuffix = args["sourceSuffix"]
	resultsSuffix = args["resultsSuffix"]

	helpers.printf("Running assignment '{}' in parellel...\n".format(assignName))

	for filename in files:
		# build the index
		index = InvertedIndex()

		for student in students:
			# try to read the file
			safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
			text = helpers.readFromPreprocessed(student, assignName, safeFilename)
			if text != None:
				# generate the keys
				keys = genKeys(text)

				# add to the index
				for key in keys:
					index.add(key, student)

		# prune and weight
		index.prune(threshold)
		index.weight(weightFun, len(students))

		# build the denormalized pair results
		resultFilename = common.makeFilenameSafe(filename) + "raw_" + resultsSuffix
		results = common.PairResults(assignName, resultFilename, helpers)

		seen = []
		for student in students:
			# retrieve the keys
			safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
			text = helpers.readFromPreprocessed(student, assignName, safeFilename)
			if text != None:
				# generate the keys
				keys = genKeys(text)

				# get the member (for the partner)
				member = common.Member(student, assignName, helpers)
				partner = member.partner

				# handle allowPartners
				if not allowPartners:
					partner = None

				# get the score results
				studentResults = index.scoreStudent(student, partner, keys)

				# add to pair results
				for other in studentResults:
					if other not in seen:
						pair = common.PairResult(student, other, studentResults[other])
						results.add(pair)

			# prevent duplicates
			seen.append(student)

		# normalize the scores to range 0-100
		results.finish()

		biggest = 0.0
		for pair in results.iterate():
			if pair.score > biggest:
				biggest = float(pair.score)

		# flush to disk
		finalResultFilename = common.makeFilenameSafe(filename) + resultsSuffix
		finalResults = common.PairResults(assignName, finalResultFilename, helpers)

		for pair in results.iterate():
			pair.score = (float(pair.score) / biggest) * 100.0
			finalResults.add(pair)

		finalResults.finish()

	# all done
	helpers.printf("Finished '{}'!\n".format(assignName))
Example #20
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
    assignName = assignment.name
    files = assignment.args["files"]
    allowPartners = assignment.args["allowPartners"]
    threshold = args["threshold"] * float(len(students))
    sourceSuffixes = ["tokenized.txt", "identifiers.txt", "literals.txt"]
    resultsSuffix = args["resultsSuffix"]

    helpers.printf(
        "Running assignment '{}' in parellel...\n".format(assignName))

    for filename in files:
        indexes = [InvertedIndex(), InvertedIndex(), InvertedIndex()]

        # for each type of Data
        for i in range(3):
            sourceSuffix = sourceSuffixes[i]
            curWeightFun = weightFun[i]
            curGenKeys = genKeys[i]
            index = indexes[i]

            for student in students:
                # try to read the file
                safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeFilename)
                if text != None:
                    # generate the keys
                    keys = curGenKeys(text)

                    # add to the index
                    for key in keys:
                        index.add(key, student)

            # prune and weight
            index.prune(threshold)
            index.weight(curWeightFun, len(students))

        # build the denormalized pair results
        resultFilename = common.makeFilenameSafe(
            filename) + "raw_" + resultsSuffix
        results = common.PairResults(assignName, resultFilename, helpers)

        seen = []
        for student in students:
            combined = {}

            for i in range(3):
                # retrieve the keys
                safeFilename = common.makeFilenameSafe(
                    filename) + sourceSuffixes[i]
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeFilename)
                index = indexes[i]

                if text != None:
                    # generate the keys
                    keys = genKeys[i](text)

                    # get the member (for the partner)
                    member = common.Member(student, assignName, helpers)
                    partner = member.partner

                    # handle allowPartners
                    if not allowPartners:
                        partner = None

                    # get the score results
                    studentResults = index.scoreStudent(student, partner, keys)

                    # add to results
                    for other in studentResults:
                        if other in combined:
                            # add the score
                            combined[other] += studentResults[other]
                        else:
                            # create the entry
                            combined[other] = studentResults[other]

            # add to pair results
            for other in combined:
                if other not in seen:
                    pair = common.PairResult(student, other, combined[other])
                    results.add(pair)

            # prevent duplicates
            seen.append(student)

        # normalize the scores to range 0-100
        results.finish()

        biggest = 0.0
        for pair in results.iterate():
            if pair.score > biggest:
                biggest = float(pair.score)

        # flush to disk
        finalResultFilename = common.makeFilenameSafe(filename) + resultsSuffix
        finalResults = common.PairResults(assignName, finalResultFilename,
                                          helpers)

        for pair in results.iterate():
            pair.score = (float(pair.score) / biggest) * 100.0
            finalResults.add(pair)

        finalResults.finish()

    # all done
    helpers.printf("Finished '{}'!\n".format(assignName))
Example #21
def runEntry(filename, students, helpers, assignment, args, allowPartners):
	# get the data
	assignName = assignment.name
	sourceSuffix = args["sourceSuffix"]
	resultsSuffix = args["resultsSuffix"]
	percent = float(args["percent"]) / 100.0
	top = args["top"]

	safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
	filepath = helpers.getProcessedPath(assignName, safeFilename)

	if filepath != None:
		rawData = common.PairResults(assignName, safeFilename, helpers)
		data = []

		# convert into python objects
		i = 0
		for pair in rawData.iterate():
			data.append(pair)

			i += 1
			if i % 100000 == 0:
				gc.collect()

		# sort them
		data.sort(sortFun)

		# calculate and print stats
		mean = getMean(data)
		dev = getDeviation(data, mean)
		helpers.printf("{}/{} mean: {}, std. devation: {}\n".format(assignName, filename, mean, dev))

		# take the top (or bottom) percent of results
		takeNum = math.floor(float(len(data)) * percent)
		if "maxResults" in args:
			takeNum = min(args["maxResults"], takeNum)

		if top:
			data = data[::-1] # conveniently reverse

		results = []
		taken = 0
		index = 0
		while taken < takeNum:
			current = data[index]

			member1 = common.Member(current.pair[0], assignName, helpers)
			member2 = common.Member(current.pair[1], assignName, helpers)
			cluster = common.Cluster(allowPartners, filename, current.score)
			cluster.add(member1)
			cluster.add(member2)

			if not cluster.hasCheating():
				# students are partners, ignore
				index += 1
				continue

			# take this entry
			taken += 1
			index += 1
			results.append(current)

			if index % 50000 == 0:
				gc.collect()

		# create the clusters
		clusters = createClusters(results, filename, assignName, allowPartners, helpers)

		# group pairs if necessary
		if "groupPairs" in args and args["groupPairs"] == True:
			clusters = common.groupPairClusters(clusters, top)

		# free up RAM
		gc.collect()

		# flush to disk
		common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

		# all done!
		helpers.printf("Finished '{}/{}', with {} results!\n".format(assignName, filename, len(clusters)))
Example #22
def run(students, assignments, args, helpers):
    # for each assignment
    for assign in assignments:
        # for each student
        for student in students:
            cwd = os.getcwd()
            helpers.makeStudentPath(student, assign.name)
            os.chdir(helpers.getStudentPath(student, assign.name))

            command = "cp ../{0}*{1} .".format(student, args['input'])
            p = subprocess.Popen(command,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
            o, e = p.communicate()

            newest = ""
            newestNum = 0
            for f in os.listdir(os.getcwd()):
                m = re.match("{0}".format(student), f)

                # Given the set of student submissions, pick the newest
                if (m):
                    num = f.split('_')[1]
                    if (num > newestNum):
                        newestNum = num
                        newest = f

            if (newest != ""):
                if (args['input'] == "tar"):
                    command = "tar xf {0}".format(newest)
                else:
                    print "INPUT TYPE NOT SUPPORTED - {0}".format(
                        args['input'])
                p = subprocess.Popen(command,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     shell=True)
                o, e = p.communicate()

                # for each specified file
                files = assign.args["files"]
                for filename in files:
                    # read the raw text
                    rawText = helpers.readFromAssignment(
                        student, assign.name, filename)

                    if rawText != None:
                        # make a friendly filename for saving
                        safeFilename = common.makeFilenameSafe(filename)

                        # mangle it, write the mangled text
                        mangle(rawText, student, assign.name, safeFilename,
                               helpers)

            # Delete the other input files then go back
            command = "rm {0}*{1}".format(student, args['input'])
            p = subprocess.Popen(command,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
            o, e = p.communicate()
            os.chdir(cwd)

    # all done
    return True
Example #23
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    percent = float(args["percent"]) / 100.0
    top = args["top"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath != None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects
        i = 0
        for pair in rawData.iterate():
            data.append(pair)

            i += 1
            if i % 100000 == 0:
                gc.collect()

        # sort them
        data.sort(sortFun)

        # calculate and print stats
        mean = getMean(data)
        dev = getDeviation(data, mean)
        helpers.printf("{}/{} mean: {}, std. devation: {}\n".format(
            assignName, filename, mean, dev))

        # take the top (or bottom) percent of results
        takeNum = math.floor(float(len(data)) * percent)
        if "maxResults" in args:
            takeNum = min(args["maxResults"], takeNum)

        if top:
            data = data[::-1]  # conveniently reverse

        results = []
        taken = 0
        index = 0
        while taken < takeNum:
            current = data[index]

            member1 = common.Member(current.pair[0], assignName, helpers)
            member2 = common.Member(current.pair[1], assignName, helpers)
            cluster = common.Cluster(allowPartners, filename, current.score)
            cluster.add(member1)
            cluster.add(member2)

            if not cluster.hasCheating():
                # students are partners, ignore
                index += 1
                continue

            # take this entry
            taken += 1
            index += 1
            results.append(current)

            if index % 50000 == 0:
                gc.collect()

        # create the clusters
        clusters = createClusters(results, filename, assignName, allowPartners,
                                  helpers)

        # group pairs if necessary
        if "groupPairs" in args and args["groupPairs"] == True:
            clusters = common.groupPairClusters(clusters, top)

        # free up RAM
        gc.collect()

        # flush to disk
        common.clustersToStandardJSON(
            clusters, assignName,
            common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}/{}', with {} results!\n".format(
            assignName, filename, len(clusters)))