def process_item(self, item, spider):
    if isinstance(item, CoursesItem):
        course_code = ''.join(item['code'])
        course_time = item['link'][0].split("/")
        year = course_time[1]
        semester = ''.join(course_time[-1])
        # Check if the entry already exists
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print "course record not found, creating"
            try:
                with db.transaction():
                    Course.create(
                        code=course_code,
                        name=''.join(item['title']),
                        year=year,
                        semester=semester,
                        url=''.join(item['link']),
                        path='raw_data' + ''.join(item['link']))
            except peewee.OperationalError as e:
                print "Could not create a record for {} due to {}".format(
                    course_code, e)
    return item
def process_item(self, item, spider):
    if isinstance(item, CoursesItem):
        course_code = ''.join(item['code'])
        year = item['year']
        semester = item['semester']
        # Check if the entry already exists
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print "course record not found, creating"
            with db.atomic():
                try:
                    Course.create(code=course_code,
                                  name=''.join(item['title']),
                                  year=year,
                                  semester=semester,
                                  url=''.join(item['link']),
                                  path='raw_data' + ''.join(item['link']))
                except peewee.OperationalError as e:
                    print "Could not create a record for {} due to {}".format(
                        course_code, e)
    return item
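# A minimal alternative sketch (not the pipeline's actual code): recent peewee
# versions provide Model.get_or_create, which collapses the select-then-create
# pattern above into a single call. `upsert_course` is a hypothetical helper;
# the field names mirror the Course model used above, and `defaults` holds the
# values that are only needed when a new row is inserted.
def upsert_course(course_code, year, semester, title, link):
    course, created = Course.get_or_create(
        code=course_code,
        year=year,
        semester=semester,
        defaults={'name': title,
                  'url': link,
                  'path': 'raw_data' + link})
    return course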
def lda_over_courses(self):
    """
    Perform LDA over all courses, without material/lecture-level details.
    """
    courses = Course.select()
    courses_size = Course.select(Course.code).distinct().count()
    courses_dict = []
    for course in courses:
        course_words = CourseWord.select().where(CourseWord.course == course)
        courses_dict.append(dict([(x.word, x.count) for x in course_words]))
    print "Performing LDA over all courses.."
    model, vocab = self.__perform_lda_default(courses_dict, courses_size)

    log_likelihoods = []
    for i, x in enumerate(model.loglikelihoods_):
        row_dict = {'iteration': i * 10, 'loglikelihood': round(x, 2)}
        log_likelihoods.append(row_dict)

    norm_topic_word_rows = self.__resolve_topic_words(
        self.__normalize(model.topic_word_), vocab, 2)
    topic_word_rows = self.__resolve_topic_words(model.topic_word_, vocab, 1)

    # Document-topic distributions
    doc_topic = model.doc_topic_
    course_topic_rows = []
    for i in range(courses.count()):
        top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        for top_topic, top_weight in zip(top_topics, topic_probs):
            row_dict = {
                'course': courses[i],
                'topic': top_topic,
                'weight': round(top_weight * 100, 2)
            }
            course_topic_rows.append(row_dict)
        if self.debug:
            doc_topic_str = ", ".join([
                str(x) + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topics, topic_probs)
            ])
            print("{} (top {} topics: {})".format(
                courses[i].name.encode('utf-8'), self.n_top_topic,
                doc_topic_str))

    with db.atomic():
        self.__insert_rows(LDALogLikelihood, log_likelihoods)
        self.__insert_rows(TopicWord, norm_topic_word_rows)
        self.__insert_rows(TopicWord, topic_word_rows)
        self.__insert_rows(CourseTopic, course_topic_rows)
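# The slice `np.argsort(a)[:-n - 1:-1]` used above picks the indices of the
# n largest entries in descending order: argsort is ascending, and the
# negative-step slice walks it backwards from the end. A small worked example:
import numpy as np

doc_topic_row = np.array([0.05, 0.60, 0.10, 0.25])
n = 2
top = np.argsort(doc_topic_row)[:-n - 1:-1]
print(top)                    # [1 3] -> the two most probable topics
print(doc_topic_row[top])     # [0.6 0.25] -> their probabilities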
def extract_all_lectures_tokens_per_course(self):
    # Tokenize and clean each lecture separately
    result_data = {}
    for course in Course.select():
        tokens = self.extract_all_lectures_tokens_per_course_helper(course.id)
        result_data[course.code] = tokens
    return result_data
def process_item(self, item, spider):
    if isinstance(item, DataItem):
        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])

        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            course = None
            print "Non-existing course: {}".format(course_code)

        if not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print "Could not create directory: {} due to {}".format(
                    dir_name, e)

        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        # If there is no lecture record and no scraped content, download the
        # data (pdf, pptx, etc.) from the url
        if not lecture.exists() and len(content) == 0:
            filename = os.path.basename(url)
            path = dir_name + filename
            print "Saving {} => {}".format(url, path)
            try:
                urllib.urlretrieve(url, path)
            except IOError as e:
                print "Could not save file: {} into {}. Cause {}".format(
                    url, path, e)

        if not lecture.exists():
            print "Lecture record not found, creating ..."
            try:
                title = self.__get_title(url)
                with db.transaction():
                    Lecture.create(course=course,
                                   url=url,
                                   path=path,
                                   name=title,
                                   content=content)
            except peewee.OperationalError as e:
                print "Could not create a record for course {} lecture {} due to {}".format(
                    course_code, url, e)
        elif len(content) > 0:
            try:
                with db.transaction():
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.save()
            except peewee.OperationalError as e:
                print e
    return item
def __remove_duplicates():
    """
    Removes all duplicate material within the scope of a single course.
    Currently duplicate detection is based on the material name: if two
    names match once their extensions are stripped, we remove one of the
    duplicates (preferably the .pdf one, since extracting text from pdf
    is more prone to errors).
    """
    print "removing duplicates: {}".format(len(Course.select()))
    lectures_to_delete = []
    for course in Course.select():
        print "Course: {}".format(course.code)
        lectures = {}
        print "Lectures: {}".format(
            len(Lecture.select().where(Lecture.course == course)))
        for lecture in Lecture.select().where(Lecture.course == course):
            extension = __resolve_extension(lecture.name)
            if not extension:
                continue
            # Get the lecture name without its extension
            pure_name = lecture.name[:-len(extension)]
            if pure_name in lectures:
                existing = lectures[pure_name]
                if existing.name.endswith('.pdf'):
                    # Prefer any other format over .pdf
                    lectures_to_delete.append(existing)
                    lectures[pure_name] = lecture
                else:
                    lectures_to_delete.append(lecture)
            else:
                lectures[pure_name] = lecture

    # Delete the collected duplicates in a single transaction
    try:
        with db.transaction():
            for lecture in lectures_to_delete:
                lecture.delete_instance()
    except peewee.OperationalError as e:
        print e
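# __resolve_extension is referenced above but not shown here. A minimal sketch
# of what it plausibly does, assuming extensions are resolved with
# os.path.splitext (an illustration, not the project's actual helper):
import os


def __resolve_extension(name):
    # Returns the extension including the dot (e.g. '.pdf'), or '' when the
    # name has none, which the caller treats as "skip this lecture".
    return os.path.splitext(name)[1]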
def __persist(self, results):
    rows = []
    for k, v in results.items():
        course = Course.select().where(Course.code == k.code,
                                       Course.year == k.year,
                                       Course.semester == k.semester)
        if not course.exists():
            print "Non-existing course in SIS data: {}".format(k)
            continue
        rows.append({
            'course': course.first(),
            'url': '',
            'path': self.filename,
            'name': 'SISdata',
            'content': v.decode('latin-1').encode('utf-8'),
            'time': datetime.datetime.now(),
            'size': 0
        })
    with db.atomic():
        Lecture.insert_many(rows).execute()
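# The decode/encode chain above re-encodes raw SIS bytes from latin-1 to
# utf-8 before storage (Python 2: str -> unicode -> str). A tiny example:
raw = '\xe4'                                  # 'ä' encoded as latin-1
utf8 = raw.decode('latin-1').encode('utf-8')
print(utf8 == '\xc3\xa4')                     # True: 'ä' encoded as utf-8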
def lda_over_lectures(self):
    """
    Perform LDA over lectures within the scope of an individual course.
    Essentially we run as many LDA models as there are courses.
    """
    lectures = []
    for course in Course.select():
        course_lectures = list(
            Lecture.select().where(Lecture.course == course))
        lda_tools = [
            DictVectorizer(),
            lda.LDA(n_topics=len(course_lectures), n_iter=1000,
                    random_state=1)
        ]
        lectures.append((course, course_lectures, LectureWord, lda_tools))

    res = self.pool.map(self.__lda_for_course_material, lectures)
    with db.atomic():
        LectureTopicWord.insert_many([x for y in res for x in y[0]]).execute()
        LectureTopic.insert_many([x for y in res for x in y[1]]).execute()
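# `res` above holds one element per course, and each element appears to be a
# pair of row lists (topic-word rows, topic rows) as produced by
# __lda_for_course_material. The nested comprehensions simply flatten the
# per-course lists before the bulk insert. An illustrative toy example:
res = [(['w1', 'w2'], ['t1']), (['w3'], ['t2', 't3'])]
flat_words = [x for y in res for x in y[0]]   # ['w1', 'w2', 'w3']
flat_topics = [x for y in res for x in y[1]]  # ['t1', 't2', 't3']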
def __get_courses(course_id=0):
    if course_id:
        courses = Course.select().where(Course.id == course_id)
    else:
        courses = Course.select()
    return list(courses)
def process_item(self, item, spider):
    if isinstance(item, DataItem):
        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])
        prefix = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__))) + '/'

        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print "Non-existing course: {}".format(course_code)
            return

        if len(content) == 0 and not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print "Could not create directory: {} due to {}".format(
                    dir_name, e)

        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        file_size = 0
        # If no content was scraped, download the data (pdf, pptx, etc.)
        # from the url
        if len(content) == 0:
            try:
                info = urllib.urlopen(url).info()
                if 'Content-Length' in info:
                    file_size = float(info['Content-Length'])
            except Exception as e:
                print "Failed to retrieve file size for {} due to {}".format(
                    url, e)

            if not lecture.exists():
                path = self.__download(url, dir_name)
            else:
                lecture_instance = lecture.first()
                # Re-download only if the file has been updated
                if lecture_instance.size == 0 or lecture_instance.size != file_size:
                    os.remove(prefix + lecture_instance.path)
                    self.__download(url, dir_name)
                else:
                    # No need to re-extract content later
                    content = lecture_instance.content

        if not lecture.exists():
            print "Lecture record not found, creating ..."
            title = self.__get_title(url)
            with db.atomic():
                try:
                    Lecture.create(course=course,
                                   url=url,
                                   path=path,
                                   name=title,
                                   content=content,
                                   size=file_size,
                                   time=datetime.datetime.now())
                except peewee.OperationalError as e:
                    print "Could not create a record for course {} lecture {} due to {}".format(
                        course_code, url, e)
        else:
            with db.atomic():
                try:
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.time = datetime.datetime.now()
                    lecture_instance.save()
                except peewee.OperationalError as e:
                    print e
    return item
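# __download is a pipeline method that is not shown here. A plausible sketch,
# assuming it mirrors the urllib.urlretrieve logic of the older pipeline
# version above and returns the local path it saved to (illustrative only):
def __download(self, url, dir_name):
    filename = os.path.basename(url)
    path = dir_name + filename
    print "Saving {} => {}".format(url, path)
    try:
        urllib.urlretrieve(url, path)
    except IOError as e:
        print "Could not save file: {} into {}. Cause {}".format(url, path, e)
    return path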
def lda_over_courses(n_top_words, n_top_topic):
    courses = Course.select()
    courses_size = Course.select(Course.code).distinct().count()
    courses_dict = []
    for course in courses:
        course_words = CourseWord.select().where(CourseWord.course == course)
        courses_dict.append(dict([(x.word, x.count) for x in course_words]))
    print "Performing LDA over all courses.."
    model, vocab = perform_lda(courses_dict, courses_size)

    for i, x in enumerate(model.loglikelihoods_):
        try:
            with db.transaction() as txn:
                LDALogLikelihood.create(iteration=i * 10,
                                        loglikelihood=round(x, 2))
                txn.commit()
        except peewee.OperationalError as e:
            print "Could not create a record for loglikelihood {}, {}".format(
                x, e)

    # Iterate over the topic-word distributions
    for i, topic_dist in enumerate(model.topic_word_):
        top_topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        top_word_probs = topic_dist[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction() as txn:
                    TopicWord.create(topic=i,
                                     word=top_word,
                                     weight=round(top_weight * 100, 2))
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for topic {}, word {}, {}".format(
                    i, top_word, e)
        top_word_str = ", ".join(
            [remove_accents(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topic_words, top_word_probs)])
        print('Topic {}: {}'.format(i, top_word_str))

    # Document-topic distributions
    doc_topic = model.doc_topic_
    for i in range(courses_size):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction() as txn:
                    CourseTopic.create(course=courses[i],
                                       topic=top_topic,
                                       weight=round(top_weight * 100, 2))
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for course {}, topic {}, {}".format(
                    remove_accents(courses[i].name), i, e)
        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(remove_accents(courses[i].name),
                                              n_top_topic, doc_topic_str))
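# perform_lda is referenced above but defined elsewhere. A minimal sketch of a
# compatible implementation, assuming sklearn's DictVectorizer and the `lda`
# package (the same tools the class-based variant above constructs); the
# function name and defaults are taken from this call site, not confirmed
# against the project's actual definition:
import lda
import numpy as np
from sklearn.feature_extraction import DictVectorizer


def perform_lda(word_count_dicts, n_topics, n_iter=1000):
    # Turn per-document {word: count} dicts into a sparse count matrix
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(word_count_dicts).astype(np.int64)
    # Fit the topic model; loglikelihoods_, topic_word_ and doc_topic_
    # are then available on the fitted model, as used by the caller
    model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=1)
    model.fit(X)
    return model, vectorizer.get_feature_names()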
def lda_over_lectures(n_top_words, n_top_topic):
    courses = Course.select()
    for course in courses:
        print("LDA for course: " + course.name)
        lda_for_course_material(course, n_top_words, n_top_topic)
def getCourses(self, courseId=0):
    if courseId:
        courses = Course.select().where(Course.id == courseId)
    else:
        courses = Course.select()
    return list(courses)
def getCourseRecord(self, courseId):
    try:
        return Course.select().where(Course.id == courseId).get()
    except Course.DoesNotExist:
        return None
import os
import sys


def is_valid_semester(course_entry, allowed):
    return any([x[0] == course_entry.year and x[1] == course_entry.semester
                for x in allowed])


if __name__ == '__main__':
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        path = prefix + lec.path
        if lec.path and os.path.exists(path):
            os.remove(path)

    semesters = []
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])

    courses = Course.select()
    with db.atomic():
        for lec in lectures:
            lec.delete_instance()
        for course in courses:
            if is_valid_semester(course, semesters):
                continue
            course.delete_instance(recursive=True)
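# parse_semesters is imported from elsewhere in the project; is_valid_semester
# only requires it to return (year, semester) pairs. A hypothetical sketch,
# assuming a CLI argument like "2015/spring,2015/fall" (the real input format
# may differ):
def parse_semesters(arg):
    pairs = []
    for token in arg.split(','):
        year, semester = token.split('/')
        pairs.append((year, semester))
    return pairs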