/
main.py
303 lines (280 loc) · 12.6 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 2 12:55:55 2019
@author: s-moh
"""
import re
import json
import xlsxwriter
import pandas as pd
from glob import glob
from HomeworkTask import HomeworkTask
from Submission import Submission
from pathlib import Path
import argparse
############ Filtering Submissions to keep most recent one only #################
def _parse_submission_name(file_name):
    """Split a notebook file name of the form ``ID_TRIAL.ext``.

    Returns a ``(student_id, trial_no)`` tuple of strings, or ``None`` when
    the name does not contain exactly one ``.`` and exactly one ``_``.
    """
    name_parts = file_name.split(".")
    if len(name_parts) != 2:
        return None
    id_parts = name_parts[0].split("_")
    if len(id_parts) != 2:
        return None
    return id_parts[0], id_parts[1]


def _trial_sort_key(trial_no):
    """Return a key that orders numeric trials by value (so 10 > 2).

    Non-numeric trial labels sort below every numeric one, alphabetically.
    """
    if trial_no.isdecimal():
        return (1, int(trial_no), "")
    return (0, 0, trial_no)


def filter_submissions(path, rerun_flag):
    """Load the most recent notebook submission of every student.

    Notebooks are looked up as ``*.ipynb`` directly inside *path* and are
    expected to be named ``ID_TRIAL.ipynb``. For each student ID the trials
    are tried from the highest trial number downwards and the first one for
    which ``Submission(...)`` can be constructed is kept; the remaining
    (older) trials of that student are ignored.

    Side effects: ``FailedNBs.txt`` in the current working directory is
    truncated at the start, and the student ID is appended to it every time
    constructing a ``Submission`` raises. Files whose name does not follow
    the ``ID_TRIAL.ipynb`` pattern are reported on stdout and skipped.

    Parameters:
        path: directory object providing ``glob`` (a ``pathlib.Path``).
        rerun_flag: forwarded unchanged to ``Submission``.

    Returns:
        list of ``Submission`` objects, at most one per student ID, ordered
        by descending student ID string.
    """
    # Start every run with an empty failure log.
    with open("FailedNBs.txt", "w") as f:
        f.write('')

    candidates = []
    for notebook in path.glob('*.ipynb'):
        parsed = _parse_submission_name(notebook.name)
        if parsed is None:
            # A single oddly named file must not abort the whole run.
            print('Error 1.0: Skipping notebook with unexpected file name '
                  '(expected ID_TRIAL.ipynb) -->', notebook.name)
            continue
        student_id, trial_no = parsed
        candidates.append((student_id, _trial_sort_key(trial_no), trial_no, notebook))

    # Newest trial first within each student; trials compare numerically so
    # that trial 10 is considered more recent than trial 2.
    candidates.sort(key=lambda c: (c[0], c[1], c[3].name), reverse=True)

    final_submissions = []
    accepted_ids = set()
    for student_id, _, trial_no, notebook in candidates:
        if student_id in accepted_ids:
            continue
        try:
            processed_submission = Submission(student_id, str(notebook), trial_no, rerun_flag)
        except Exception as e:
            # The ID is not marked as accepted, so the next older trial of
            # the same student (if any) is tried on a later iteration.
            with open("FailedNBs.txt", "a") as f:
                f.write(str(student_id) + '\n')
            print('Error 1.1: Error Reading Submission of Student ID ' + student_id + '-->', e)
        else:
            final_submissions.append(processed_submission)
            accepted_ids.add(student_id)
    return final_submissions
############ END: Filtering Submissions to keep most recent one only #################
# The path of the JSON configuration file can be given with -conf
# (defaults to conf.json in the current working directory).
parser = argparse.ArgumentParser()
parser.add_argument('-conf', type=str, default='conf.json', help="JSON configuration file, that has information about the HW tasks and scores. (default json given)")
args = parser.parse_args()
path_conf = args.conf

#### BEGIN: READ CONFIGURATION ########
try:
    with open(path_conf, 'r', encoding="utf8") as f:
        data = json.load(f)
    hw_no = data['HW_NO']  # Homework number
    path = Path(data['HW_Path'])  # HW submission folder
    student_ids_path = Path(data['student_ids'])  # Path to the student IDs file
    rerun_flag = data['Rerun']
    # Optional key; 2 is used when the configuration does not provide it.
    plagiarsim_threshold = data.get('plag_threshold', 2)
    task_begin_flag = ''
    task_end_flag = ''
except Exception as e:
    print('Error 0.1: Error Reading JSON Configuration File - ', e)
    # Nothing below can work without the configuration; stop here instead of
    # failing later with an unrelated NameError.
    raise SystemExit(1)

# Build one HomeworkTask per configured task. The "Timing" entry is not a
# graded task (it is only consumed by option 3), so it is skipped here.
hwTasks = []
for task in data['Tasks']:
    if task['Task_NO'] == "Timing":
        continue
    hwTasks.append(HomeworkTask(hw_no, task['Task_NO'], task['Grade'], task['Task_Begin_Flag'], task['Task_End_Flag']))
#### END: READ CONFIGURATION #########
# Ask which action to run; each `if opt == N:` section below handles one.
try:
    opt = int(input("Enter:\n 1 to Collect Solutions of each task in a separate notebook\n 2 to Collect Grades to an excel sheet\n 3 to collect timings\n 4 to check plagiarism:\n"))
except ValueError:
    # A non-numeric answer used to end in a raw traceback.
    print('Error 0.2: Invalid option - please enter a number between 1 and 4.')
    raise SystemExit(1)
if opt not in (1, 2, 3, 4):
    # None of the sections below matches, so say so instead of exiting silently.
    print('Error 0.2: Unknown option', opt, '- nothing to do.')
############ BEGIN: Collect Solutions of a specific task #################
if opt == 1:
    # Step 1: keep only the most recent submission of every student.
    print('Filtering Submissions.Please Wait...')
    try:
        filtered_submissions = filter_submissions(path, rerun_flag)
    except Exception as e:
        print("Error 1.2: An exception occured during running notebooks ", e)
    else:
        # Step 2 only runs when step 1 succeeded; otherwise there is no
        # submission list to read from and the resulting error message
        # would be misleading.
        print('Creating Tasks Notebooks. Please Wait...')
        try:
            for task in hwTasks:
                # One notebook per task, filled with every student's solution.
                task.create_task_ipynb()
                print(task.task_dir)
                for sol in filtered_submissions:
                    solution = sol.find_task(task.task_no, task.task_grade, task.begin_flag, task.end_flag, task.task_dir)
                    task.append_solution(solution)
        except Exception as e:
            print("Error 1.3: An exception occured during creating tasks notebooks ", e)
############ END: Collect Solutions of a specific task #################
############ BEGIN: Save to Excel Sheet #################
if opt == 2:
    # Read the student IDs (one per line) from the configured IDs file.
    try:
        # Read-only access is sufficient; the context manager closes the file.
        with open(student_ids_path, "r") as student_ids_file:
            student_ids = [line.rstrip('\n') for line in student_ids_file]
    except Exception as e:
        print("Error 2.1: An exception occurred During Reading Students IDs... Make sure of its path!", e)
        # Without the ID list the following steps cannot run.
        raise SystemExit(1)

    # Create the grades workbook and fill column A with the student IDs.
    try:
        workbook = xlsxwriter.Workbook('Grades_HW' + str(hw_no) + '.xlsx')
        worksheet = workbook.add_worksheet()
        worksheet.write('A1', 'ID')
        for row, student_id in enumerate(student_ids, start=2):
            worksheet.write('A' + str(row), student_id)
    except Exception as e:
        print("Error 2.2: Failed to create an Excel Sheet for the grades!", e)
        # Without a worksheet there is nothing to add the grades to.
        raise SystemExit(1)

    # Add one column per task, starting at column B.
    try:
        taskLetter = 66  # B = 66 in ASCII (to start with column B)
        for task in hwTasks:
            id_grade, id_comments = task.extract_results()
            if taskLetter < 91:
                taskColHeader = chr(taskLetter)
            else:
                # Past column Z continue with AA, AB, ...
                taskColHeader = 'A' + chr(taskLetter - 26)
            worksheet.write(taskColHeader + '1', str(task.task_no))  # task number as column header
            for row, student_id in enumerate(student_ids, start=2):
                if student_id not in id_grade:  # no result for this student in this task
                    id_grade[student_id] = 0
                    id_comments[student_id] = ''
                worksheet.write(taskColHeader + str(row), str(id_grade[student_id]))  # write the grade
                if len(id_comments[student_id]) > 10:  # only comments longer than 10 characters are attached
                    worksheet.write_comment(taskColHeader + str(row), id_comments[student_id])
            taskLetter += 1  # move to the next column for the next task
        workbook.close()
        print('Done: Adding Grades to the Excel Sheet')
    except Exception as e:
        print("Error 2.3: Something went wrong when adding the tasks grades!", e)
############ END: Save to Excel Sheet #################
############ BEGIN: Calculate AVG Time #################
if opt == 3:
    # Locate the "Timing" entry of the configuration: it provides the flags
    # delimiting the timing section and, optionally, the number of timing
    # fields every submission is expected to contain.
    task_beg = None
    task_end = None
    task_nr_fields = None
    for task in data['Tasks']:
        if task['Task_NO'] == "Timing":
            task_beg, task_end = task['Task_Begin_Flag'], task['Task_End_Flag']
            # used as a sanity check of the number of fields expected for the timing
            if 'TASK_NR_FIELDS' in task:
                task_nr_fields = int(task['TASK_NR_FIELDS'])
            else:
                task_nr_fields = int(input("How many fields timing has for this homework (enter an integer):"))
    if task_nr_fields is None:
        print('Error 3.0: No task with Task_NO "Timing" found in the configuration file.')
        raise SystemExit(1)

    print("Timing extraction setting review: ")
    print(" Beg tag: ", task_beg)
    print(" End tag: ", task_end)
    print(" path: ", path)
    print('Filtering Submissions.Please Wait...')
    filtered_submissions = filter_submissions(path, 0)

    p1 = re.compile(r"Task \w+|TOTAL", flags=re.IGNORECASE)  # heading cell: task label or TOTAL
    # Hours cell. The number may use '.' or ',' as decimal separator and may
    # carry a '+' (as in "5+ hours"); it is captured separately so that the
    # case-insensitive " hours" suffix never reaches float().
    p2 = re.compile(r"(\d+[.,\d+]*) hours", flags=re.IGNORECASE)
    p3 = re.compile(r"\w*\d+")  # student ID

    def _parse_hours(text):
        """Return the hours stated in *text* as float, or 0 when none is found."""
        found = p2.search(text)
        if found is None:
            return 0
        number = re.sub('[+#$%^&*(<>-]', '', found.group(1))  # remove symbols like plus in "5+ hours"
        try:
            return float(number.replace(',', '.'))
        except ValueError:
            print("could not read hours from %r, using 0" % found.group(0))
            return 0

    timings = {}
    # Every accepted timing row has exactly task_nr_fields entries, so that is
    # also the width used for students without a usable timing.
    nr_tasks = task_nr_fields
    cols = None
    for sol in filtered_submissions:
        timing = []
        columns = []
        # NOTE(review): `task` is the entry the configuration loop above
        # visited last, which is the Timing entry only if it is listed last
        # in the configuration - confirm this is what find_task expects.
        solution = sol.find_task(task['Task_NO'], "1", task_beg, task_end)
        next_is_hour = False
        last_cell = None
        for s in solution:
            last_cell = s
            expecting_hours, next_is_hour = next_is_hour, False
            if not s['source']:
                # Empty cell: nothing to read from it.
                continue
            first_line = s['source'][0]
            if expecting_hours:
                # The cell following a heading holds the hours.
                timing.append(_parse_hours(first_line))
                continue
            rslt = p1.search(first_line)
            if rslt is not None:
                label = rslt.group(0)
                if label[:5].lower() == 'task ':
                    label = label[5:]
                if label.upper() == 'TOTAL':
                    columns.append('TT')
                else:
                    columns.append("T%s" % label)
                next_is_hour = True

        # The student ID is read from the first line of the last cell that
        # find_task returned.
        if last_cell is None or not last_cell['source']:
            print("ignore timing of a submission: no cell to read the student ID from")
            continue
        rslt = p3.search(last_cell['source'][0])
        if rslt is None:
            print("ignore timing of a submission: no student ID found in %r" % last_cell['source'][0])
            continue
        std_id = rslt.group(0)

        if task_nr_fields == len(timing):
            timings[std_id] = timing
        else:
            print("ignore timing from %s, expected %i fields in timing but found %i " % (std_id,
                  task_nr_fields, len(timing)))
        # TODO: the column labels are taken from the submissions themselves
        # (assuming students do not change the headings); only a label list
        # of the expected width is used so it always fits the data.
        if len(columns) == task_nr_fields:
            cols = columns

    # Read the student IDs (one per line) from the configured IDs file.
    try:
        with open(student_ids_path, "r") as student_ids_file:
            student_ids = [line.rstrip('\n') for line in student_ids_file]
    except Exception as e:
        print("Error 2.1: An exception occurred During Reading Students IDs... Make sure of its path!", e)
        raise SystemExit(1)

    # For students without a usable timing replace the timing with zeros.
    print("Matching students..")
    failed_extractions = []
    for student_id in student_ids:
        if student_id not in timings:
            failed_extractions.append(student_id)
            timings[student_id] = [0] * nr_tasks
    if failed_extractions:
        print("Failed to extract timings for %i students: " % len(failed_extractions), failed_extractions)

    # Keep only the students of the IDs file, in the order of that file;
    # timings of any other student are dropped.
    timings_aranged = {student_id: timings[student_id] for student_id in student_ids}

    # Save as excel file.
    df = pd.DataFrame.from_dict(timings_aranged, orient='index', columns=cols)
    filename = 'Timings_HW' + str(hw_no) + '.xlsx'
    df.to_excel(filename)
    print("Number of (current) submissions: ", len(filtered_submissions))
    print("Number of records for timings to store (whole class): ", len(df))
    print("Successfully wrote to file: ", filename)
############ END: Calculate AVG Time #################
############ BEGIN: Similarity Checker #################
if opt == 4:
    # Run the similarity check of every task, list the suspicious pairs per
    # task in the report file and finish with how often each pair showed up.
    report_name = "Plagiarism_Suspects.txt"
    suspects_total = 0
    suspects_freq = {}

    # (Re)create the report with the header of the per-task section.
    with open(report_name, "w") as report:
        report.write('SUSPECTS PER TASK\nStudent1,Student2,Similarity_Score,Task\n')

    for task in hwTasks:
        with open(report_name, "a") as report:
            suspects = task.similarity_calculator(plagiarsim_threshold)
            for entry in suspects:
                # entry[0], entry[1]: the two students; entry[2]: score, written times 100.
                pair = ','.join((str(entry[0]), str(entry[1])))
                report.write(','.join((pair, str(entry[2] * 100), str(task.task_no))) + '\n')
                suspects_freq[pair] = suspects_freq.get(pair, 0) + 1
        suspects_total = suspects_total + len(suspects)

    # Second section: number of tasks in which each pair was reported.
    with open(report_name, "a") as report:
        report.write('#########################################\nFREQUENCY OF SAME SUSPECTS\nStudent1,Student2,Frequency\n')
        report.writelines(pair + ',' + str(count) + '\n' for pair, count in suspects_freq.items())

    print('Done: Checking for Plagiarism --> ', suspects_total, ' Suspections Found!')
############ END: Similarity Checker #################