def runEntry(entry, students, helpers, assignName, sourceSuffix, resultsSuffix, allowPartners):
    """Compute the edit distance for every pair of students on one entry's source file.

    Writes a PairResult per student pair into a PairResults store and flushes
    it to disk when done.

    entry          -- dict with a "sources" list; only sources[0] is used here
    students       -- list of student identifiers
    helpers        -- project helper object (paths, printf, result storage)
    assignName     -- assignment name used for paths and logging
    sourceSuffix   -- suffix of the preprocessed source file to compare
    resultsSuffix  -- suffix for the results file
    allowPartners  -- accepted for interface parity; not used in this body
                      (NOTE(review): partner filtering appears to happen in a
                      later pipeline stage — confirm)
    """
    # create an empty PairResults object to accumulate scores
    resultFilename = common.makeFilenameSafe(entry["sources"][0]) + resultsSuffix
    results = common.PairResults(assignName, resultFilename, helpers)

    # the preprocessed filename is identical for every pair — hoist it out of the loop
    safeFilename = common.makeFilenameSafe(entry["sources"][0]) + sourceSuffix

    # for each unordered pair of distinct students (j < i already implies i != j)
    for i in range(len(students)):
        for j in range(i):
            student1 = students[i]
            student2 = students[j]

            # get both file paths
            path1 = helpers.getPreprocessedPath(student1, assignName, safeFilename)
            path2 = helpers.getPreprocessedPath(student2, assignName, safeFilename)

            # make sure both paths exist
            if path1 is not None and path2 is not None:
                editDistance = runEditDistance(path1, path2)

                # save the pair result
                result = common.PairResult(student1, student2, editDistance)
                results.add(result)

    # flush results to disk
    results.finish()
    helpers.printf("Finished '{}/{}'!\n".format(assignName, entry["sources"][0]))
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Filter one file's pair results by mean/standard-deviation threshold and
    write the resulting clusters to disk as JSON.

    filename      -- source filename this entry covers
    students      -- list of student identifiers (unused directly here; kept
                     for interface parity with the other runEntry variants)
    helpers       -- project helper object (paths, printf, result storage)
    assignment    -- assignment object; .name and .args["threshold"] are read
    args          -- tool args; reads sourceSuffix, resultsSuffix, above
    allowPartners -- forwarded to createClusters
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    threshold = assignment.args["threshold"]
    above = args["above"]

    # optional lower bound on scores; `in` works on Python 2 and 3
    # (dict.has_key was removed in Python 3) and matches usage elsewhere in this file
    minThreshold = None
    if "minThreshold" in assignment.args:
        minThreshold = assignment.args["minThreshold"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)

        # convert into python objects
        data = []
        for pair in rawData.iterate():
            data.append(pair)

        # get the mean
        mean = getMean(data)

        # get the deviation
        deviation = getDeviation(data, mean)

        helpers.printf("{}/{}: mean {}, deviation {}\n".format(
            assignName, filename, mean, deviation))

        # filter out data
        filtered = filterData(data, mean, deviation, threshold, above, minThreshold)

        # create the clusters
        clusters = createClusters(filtered, filename, assignName, allowPartners, helpers)

        # flush to disk
        common.clustersToStandardJSON(
            clusters, assignName,
            common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}', with {} results!\n".format(
            assignName, len(clusters)))
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
    """Score every file of an assignment with three inverted indexes
    (tokens, identifiers, literals), combine the per-index scores per student
    pair, then normalize scores to 0-100 and flush to disk.

    students   -- list of student identifiers
    assignment -- assignment object; reads .name, .args["files"],
                  .args["allowPartners"]
    args       -- tool args; reads "threshold" and "resultsSuffix"
    helpers    -- project helper object (I/O, printf, result storage)
    weightFun  -- sequence of three weighting functions, one per index type
    genKeys    -- sequence of three key-generator functions, one per index type
    """
    assignName = assignment.name
    files = assignment.args["files"]
    allowPartners = assignment.args["allowPartners"]

    # prune threshold scales with class size
    threshold = args["threshold"] * float(len(students))

    sourceSuffixes = ["tokenized.txt", "identifiers.txt", "literals.txt"]
    resultsSuffix = args["resultsSuffix"]

    helpers.printf(
        "Running assignment '{}' in parallel...\n".format(assignName))

    for filename in files:
        indexes = [InvertedIndex(), InvertedIndex(), InvertedIndex()]

        # invariant for this filename — hoisted out of the per-student loops
        safeBase = common.makeFilenameSafe(filename)

        # for each type of data: populate, prune, and weight one index
        for i in range(3):
            sourceSuffix = sourceSuffixes[i]
            curWeightFun = weightFun[i]
            curGenKeys = genKeys[i]
            index = indexes[i]

            for student in students:
                # try to read the file
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeBase + sourceSuffix)
                if text is not None:
                    # generate the keys and add them to the index
                    keys = curGenKeys(text)
                    for key in keys:
                        index.add(key, student)

            # prune and weight
            index.prune(threshold)
            index.weight(curWeightFun, len(students))

        # build the denormalized pair results
        resultFilename = safeBase + "raw_" + resultsSuffix
        results = common.PairResults(assignName, resultFilename, helpers)

        # students already emitted — a set gives O(1) membership tests
        # (the original list made the pairing loop quadratic in lookups)
        seen = set()

        for student in students:
            combined = {}

            # distinct name from the outer index loop to avoid confusion
            for k in range(3):
                # retrieve the keys
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeBase + sourceSuffixes[k])
                index = indexes[k]

                if text is not None:
                    # generate the keys
                    keys = genKeys[k](text)

                    # get the member (for the partner)
                    member = common.Member(student, assignName, helpers)
                    partner = member.partner

                    # handle allowPartners
                    if not allowPartners:
                        partner = None

                    # get the score results
                    studentResults = index.scoreStudent(student, partner, keys)

                    # merge into the combined per-pair scores
                    for other in studentResults:
                        if other in combined:
                            combined[other] += studentResults[other]
                        else:
                            combined[other] = studentResults[other]

            # add to pair results
            for other in combined:
                if other not in seen:
                    pair = common.PairResult(student, other, combined[other])
                    results.add(pair)

            # prevent duplicates
            seen.add(student)

        # normalize the scores to range 0-100
        results.finish()
        biggest = 0.0
        for pair in results.iterate():
            if pair.score > biggest:
                biggest = float(pair.score)
        if biggest == 0.0:
            # every score was zero — avoid ZeroDivisionError below
            biggest = 1.0

        # flush to disk
        finalResultFilename = safeBase + resultsSuffix
        finalResults = common.PairResults(assignName, finalResultFilename, helpers)
        for pair in results.iterate():
            pair.score = (float(pair.score) / biggest) * 100.0
            finalResults.add(pair)
        finalResults.finish()

    # all done
    # NOTE(review): the flattened source made the original placement of this
    # message ambiguous; it formats only assignName, so it is emitted once
    # after all files — confirm against version history
    helpers.printf("Finished '{}'!\n".format(assignName))
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Take the top (or bottom) percent of pair results for one file,
    drop partner pairs, cluster what remains, and write the clusters to disk.

    filename      -- source filename this entry covers
    students      -- list of student identifiers (unused directly here; kept
                     for interface parity with the other runEntry variants)
    helpers       -- project helper object (paths, printf, result storage)
    assignment    -- assignment object; .name is read
    args          -- tool args; reads sourceSuffix, resultsSuffix, percent,
                     top, and optional maxResults / groupPairs
    allowPartners -- forwarded to Cluster/createClusters for partner handling
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    percent = float(args["percent"]) / 100.0
    top = args["top"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)
    if filepath is None:
        # nothing processed for this file — nothing to do
        return

    rawData = common.PairResults(assignName, safeFilename, helpers)

    # convert into python objects, collecting garbage periodically
    # to bound peak memory on very large result sets
    data = []
    i = 0
    for pair in rawData.iterate():
        data.append(pair)
        i += 1
        if i % 100000 == 0:
            gc.collect()

    # sort them (cmp-style comparator, consistent with this module's Python 2 style)
    data.sort(sortFun)

    # calculate and print stats
    mean = getMean(data)
    dev = getDeviation(data, mean)
    helpers.printf("{}/{} mean: {}, std. deviation: {}\n".format(
        assignName, filename, mean, dev))

    # take the top (or bottom) percent, optionally capped by maxResults
    takeNum = math.floor(float(len(data)) * percent)
    if "maxResults" in args:
        takeNum = min(args["maxResults"], takeNum)

    if top:
        data = data[::-1]  # conveniently reverse

    results = []
    taken = 0
    index = 0
    total = len(data)

    # also stop at the end of the data: skipping enough partner pairs could
    # otherwise push index past the list (IndexError in the original)
    while taken < takeNum and index < total:
        current = data[index]
        member1 = common.Member(current.pair[0], assignName, helpers)
        member2 = common.Member(current.pair[1], assignName, helpers)
        cluster = common.Cluster(allowPartners, filename, current.score)
        cluster.add(member1)
        cluster.add(member2)

        if not cluster.hasCheating():
            # students are partners, ignore
            index += 1
            continue

        # take this entry
        taken += 1
        index += 1
        results.append(current)
        if index % 50000 == 0:
            gc.collect()

    # create the clusters
    clusters = createClusters(results, filename, assignName, allowPartners, helpers)

    # group pairs if necessary; `in` works on Python 2 and 3 (has_key does not)
    # and matches the "maxResults" check above
    if "groupPairs" in args and args["groupPairs"] == True:
        clusters = common.groupPairClusters(clusters, top)

    # free up RAM
    gc.collect()

    # flush to disk
    common.clustersToStandardJSON(
        clusters, assignName,
        common.makeFilenameSafe(filename) + resultsSuffix, helpers)

    # all done!
    helpers.printf("Finished '{}/{}', with {} results!\n".format(
        assignName, filename, len(clusters)))