Example #1
def runEntry(entry, students, helpers, assignName, sourceSuffix, resultsSuffix, allowPartners):
	# create an empty PairResults object
	resultFilename = common.makeFilenameSafe(entry["sources"][0]) + resultsSuffix
	results = common.PairResults(assignName, resultFilename, helpers)

	# for each pair of students
	for i in range(len(students)):
		for j in range(i + 1):
			if i != j:
				# grab the two students in this pair
				student1 = students[i]
				student2 = students[j]

				# get both file paths
				safeFilename = common.makeFilenameSafe(entry["sources"][0]) + sourceSuffix
				path1 = helpers.getPreprocessedPath(student1, assignName, safeFilename)
				path2 = helpers.getPreprocessedPath(student2, assignName, safeFilename)

				# make sure both paths exist
				if path1 is not None and path2 is not None:
					editDistance = runEditDistance(path1, path2)

					# save the pair result
					result = common.PairResult(student1, student2, editDistance)
					results.add(result)

	# flush results to disk
	results.finish()
	helpers.printf("Finished '{}/{}'!\n".format(assignName, entry["sources"][0]))
Example #2
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    threshold = assignment.args["threshold"]
    above = args["above"]
    minThreshold = None
    if assignment.args.has_key("minThreshold"):
        minThreshold = assignment.args["minThreshold"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects
        for pair in rawData.iterate():
            data.append(pair)

        # get the mean
        mean = getMean(data)

        # get the deviation
        deviation = getDeviation(data, mean)
        helpers.printf("{}/{}: mean {}, deviation {}\n".format(
            assignName, filename, mean, deviation))

        # filter out data
        filtered = filterData(data, mean, deviation, threshold, above,
                              minThreshold)

        # create the clusters
        clusters = createClusters(filtered, filename, assignName,
                                  allowPartners, helpers)

        # flush to disk
        common.clustersToStandardJSON(
            clusters, assignName,
            common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}', with {} results!\n".format(
            assignName, len(clusters)))
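
Note: `getMean` and `getDeviation` are used above but not shown in the listing. A minimal sketch, assuming every `PairResult` exposes a numeric `score` attribute, could be:

import math

def getMean(data):
    # average score across all pairs (0.0 for an empty data set)
    if len(data) == 0:
        return 0.0
    return sum(float(pair.score) for pair in data) / float(len(data))

def getDeviation(data, mean):
    # population standard deviation of the pair scores
    if len(data) == 0:
        return 0.0
    variance = sum((float(pair.score) - mean) ** 2 for pair in data) / float(len(data))
    return math.sqrt(variance)
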
Example #3
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
    assignName = assignment.name
    files = assignment.args["files"]
    allowPartners = assignment.args["allowPartners"]
    threshold = args["threshold"] * float(len(students))
    sourceSuffixes = ["tokenized.txt", "identifiers.txt", "literals.txt"]
    resultsSuffix = args["resultsSuffix"]

    helpers.printf(
        "Running assignment '{}' in parellel...\n".format(assignName))

    for filename in files:
        indexes = [InvertedIndex(), InvertedIndex(), InvertedIndex()]

        # for each type of Data
        for i in range(3):
            sourceSuffix = sourceSuffixes[i]
            curWeightFun = weightFun[i]
            curGenKeys = genKeys[i]
            index = indexes[i]

            for student in students:
                # try to read the file
                safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeFilename)
                if text is not None:
                    # generate the keys
                    keys = curGenKeys(text)

                    # add to the index
                    for key in keys:
                        index.add(key, student)

            # prune and weight
            index.prune(threshold)
            index.weight(curWeightFun, len(students))

        # build the denormalized pair results
        resultFilename = common.makeFilenameSafe(
            filename) + "raw_" + resultsSuffix
        results = common.PairResults(assignName, resultFilename, helpers)

        seen = []
        for student in students:
            combined = {}

            for i in range(3):
                # retrieve the keys
                safeFilename = common.makeFilenameSafe(
                    filename) + sourceSuffixes[i]
                text = helpers.readFromPreprocessed(student, assignName,
                                                    safeFilename)
                index = indexes[i]

                if text is not None:
                    # generate the keys
                    keys = genKeys[i](text)

                    # get the member (for the partner)
                    member = common.Member(student, assignName, helpers)
                    partner = member.partner

                    # handle allowPartners
                    if not allowPartners:
                        partner = None

                    # get the score results
                    studentResults = index.scoreStudent(student, partner, keys)

                    # add to results
                    for other in studentResults:
                        if other in combined:
                            # add the score
                            combined[other] += studentResults[other]
                        else:
                            # create the entry
                            combined[other] = studentResults[other]

            # add to pair results
            for other in combined:
                if other not in seen:
                    pair = common.PairResult(student, other, combined[other])
                    results.add(pair)

            # prevent duplicates
            seen.append(student)

        # normalize the scores to range 0-100
        results.finish()

        biggest = 0.0
        for pair in results.iterate():
            if pair.score > biggest:
                biggest = float(pair.score)

        # avoid dividing by zero if every score happens to be zero
        if biggest == 0.0:
            biggest = 1.0

        # flush to disk
        finalResultFilename = common.makeFilenameSafe(filename) + resultsSuffix
        finalResults = common.PairResults(assignName, finalResultFilename,
                                          helpers)

        for pair in results.iterate():
            pair.score = (float(pair.score) / biggest) * 100.0
            finalResults.add(pair)

        finalResults.finish()

    # all done
    helpers.printf("Finished '{}'!\n".format(assignName))
Example #4
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    percent = float(args["percent"]) / 100.0
    top = args["top"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects
        i = 0
        for pair in rawData.iterate():
            data.append(pair)

            i += 1
            if i % 100000 == 0:
                gc.collect()

        # sort them
        data.sort(sortFun)

        # calculate and print stats
        mean = getMean(data)
        dev = getDeviation(data, mean)
        helpers.printf("{}/{} mean: {}, std. devation: {}\n".format(
            assignName, filename, mean, dev))

        # take the top (or bottom) percent of the results
        takeNum = math.floor(float(len(data)) * percent)
        if "maxResults" in args:
            takeNum = min(args["maxResults"], takeNum)

        if top:
            data = data[::-1]  # reverse so the highest scores come first

        results = []
        taken = 0
        index = 0
        while taken < takeNum and index < len(data):
            current = data[index]

            member1 = common.Member(current.pair[0], assignName, helpers)
            member2 = common.Member(current.pair[1], assignName, helpers)
            cluster = common.Cluster(allowPartners, filename, current.score)
            cluster.add(member1)
            cluster.add(member2)

            if not cluster.hasCheating():
                # students are partners, ignore
                index += 1
                continue

            # take this entry
            taken += 1
            index += 1
            results.append(current)

            if index % 50000 == 0:
                gc.collect()

        # create the clusters
        clusters = createClusters(results, filename, assignName, allowPartners,
                                  helpers)

        # group pairs if necessary
        if args.has_key("groupPairs") and args["groupPairs"] == True:
            clusters = common.groupPairClusters(clusters, top)

        # free up RAM
        gc.collect()

        # flush to disk
        common.clustersToStandardJSON(
            clusters, assignName,
            common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}/{}', with {} results!\n".format(
            assignName, filename, len(clusters)))
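
Note: `sortFun` is referenced by `data.sort(sortFun)` above but not shown. A minimal sketch, assuming the pairs are ordered by ascending score with a Python 2 cmp-style comparator as that call implies, could be:

def sortFun(a, b):
    # order PairResult objects by ascending score (Python 2 cmp-style)
    if a.score < b.score:
        return -1
    if a.score > b.score:
        return 1
    return 0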