def extract_all_lectures_tokens(self):
    print "extract_all_lectures_tokens"
    print "lectures total: {}".format(len(Lecture.select()))
    # Tokenize and clean each lecture separately
    # result_data = [x for x in (self.pool.map(self.__extract_lecture_tokens, Lecture.select())) if x]
    result_data = []
    i = 1
    for x in Lecture.select().order_by(Lecture.id):
        y = self.__extract_lecture_tokens(x)
        if y:
            print "index: {}".format(i)
            result_data.append(y)
            i += 1
    print "result_data {}".format(result_data)
    # Create acronym dictionary and replace acronyms with definitions
    self.__create_acronym_dict(result_data)
    result_data = self.pool.map(self.__replace_acronyms, result_data)
    # Perform co-occurrence over entire word corpus, filter by course code limit
    docs = [(y[0].course.code, y[2]) for y in result_data]
    self.co_occurring_words = self.co_occ.find_co_occurring_words(
        docs, self.acronyms)
    print "Co-occurring words:", self.co_occurring_words, "; total count:", len(
        self.co_occurring_words)
    # Re-count co-occurring words and remove 'standalone' words
    print "done"
    return self.pool.map(self.__adjust_lecture_counts, result_data)

def process_item(self, item, spider):
    if isinstance(item, DataItem):
        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            course = None
            print "Non-existing course: {}".format(course_code)
        if not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print "Could not create directory: {} due to {}".format(dir_name, e)
        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        # If there is no lecture record and no content, download the data
        # (pdf, pptx, etc.) according to the url
        if not lecture.exists() and len(content) == 0:
            filename = os.path.basename(url)
            path = dir_name + filename
            print "Saving {} => {}".format(url, path)
            try:
                urllib.urlretrieve(url, path)
            except IOError as e:
                print "Could not save file: {} into {}. Cause {}".format(url, path, e)
        if not lecture.exists():
            print "Lecture record not found, creating ..."
            try:
                title = self.__get_title(url)
                with db.transaction():
                    Lecture.create(course=course,
                                   url=url,
                                   path=path,
                                   name=title,
                                   content=content)
            except peewee.OperationalError as e:
                print "Could not create a record for course {} lecture {} due to {}".format(course_code, url, e)
        elif len(content) > 0:
            try:
                with db.transaction():
                    # The select query cannot be mutated directly;
                    # update the matching row instead
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.save()
            except peewee.OperationalError as e:
                print e
    return item

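# `__get_title` is called above but not shown in this section. A minimal
# hypothetical sketch, assuming the title is simply the unquoted basename of
# the URL; the `name` field elsewhere keeps its file extension, which matches
# this reading:
def __get_title(self, url):
    # e.g. 'http://host/slides/Lecture%201.pdf' -> 'Lecture 1.pdf'
    return urllib.unquote(os.path.basename(url))
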
def __count_ext(self):
    extensions = {}
    lectures = Lecture.select().where(Lecture.content == '')
    for lecture in lectures:
        clean_url = None
        if lecture.url.startswith('https'):
            clean_url = lecture.url[8:]
        elif lecture.url.startswith('http'):
            clean_url = lecture.url[7:]
        ext = lecture.name.split('.')[-1]
        if ext in self.blacklist \
                or (not ext.isalpha()) \
                or (not clean_url) \
                or clean_url.find('/') < 0 \
                or ext.strip() == '' \
                or lecture.name.find('.') < 0:
            continue
        if ext in extensions:
            extensions[ext] += 1
        else:
            extensions[ext] = 1
    return {
        k: v
        for k, v in extensions.iteritems() if v > 1 and len(k) < 5
    }

def extract_text(self):
    lectures = Lecture.select().where(Lecture.content == '',
                                      Lecture.url % "*docx")
    result_lectures = self.pool.map(self.__convert, lectures)
    for lecture in result_lectures:
        if lecture:
            try:
                with db.transaction():
                    lecture.save()
            except peewee.OperationalError as e:
                print e

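# `__convert` is mapped over the selected lectures above but is not shown in
# this section. A hypothetical sketch using python-docx, assuming the class
# exposes a `self.prefix` base path as in the pptx extractor below; it returns
# the lecture with its content filled in, or None so the caller can skip it:
def __convert(self, lecture):
    from docx import Document
    full_path = self.prefix + lecture.path
    if not os.path.exists(full_path):
        print "File not found: {0}".format(lecture.path)
        return None
    try:
        # Concatenate the plain text of every paragraph in the document
        doc = Document(full_path)
        lecture.content = '\n'.join(p.text for p in doc.paragraphs)
        return lecture
    except Exception as e:
        print "Could not convert {} due to {}".format(lecture.path, e)
        return None
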
def extract_text(self):
    lectures = Lecture.select().where(Lecture.content != '',
                                      Lecture.path == "")
    for lecture in list(lectures):
        soup = BeautifulSoup(lecture.content)
        print lecture.url
        lecture.content = soup.get_text()
        lecture.path = 'html2txt'
        try:
            with db.transaction():
                lecture.save()
        except peewee.OperationalError as e:
            print e

def extract_text(self):
    lectures = Lecture.select().where(Lecture.content == '',
                                      Lecture.url % "*pptx")
    for lecture in list(lectures):
        if not os.path.exists(self.prefix + lecture.path):
            print "File not found: {0}".format(lecture.path)
            continue
        print lecture.url
        lecture.content = self.__convert(self.prefix + lecture.path)
        try:
            with db.transaction():
                lecture.save()
        except peewee.OperationalError as e:
            print e

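# The pptx `__convert` helper is likewise not shown. A hypothetical sketch
# using python-pptx, assuming it takes an absolute path and returns the
# extracted slide text:
def __convert(self, path):
    from pptx import Presentation
    texts = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            # Only shapes with a text frame carry visible text
            if shape.has_text_frame:
                texts.append(shape.text_frame.text)
    return '\n'.join(texts)
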
def __remove_duplicates():
    """
    Removes all duplicate material within the scope of a single course.
    Currently duplicate detection works based on material name, i.e. if the
    names without extensions match, we remove one of the duplicates
    (preferably the .pdf one, since extracting text from pdf is more prone
    to errors).
    """
    print "removing duplicates: {}".format(len(Course.select()))
    lectures_to_delete = []
    for course in Course.select():
        print "Course: {}".format(course.code)
        lectures = {}
        print "Lectures: {}".format(
            len(Lecture.select().where(Lecture.course == course)))
        for lecture in Lecture.select().where(Lecture.course == course):
            extension = __resolve_extension(lecture.name)
            if not extension:
                continue
            # Get lecture name without extension
            pure_name = lecture.name[:-len(extension)]
            if pure_name in lectures:
                existing = lectures[pure_name]
                if existing.name.endswith('.pdf'):
                    # Prefer anything to the .pdf version
                    lectures_to_delete.append(existing)
                    lectures[pure_name] = lecture
                else:
                    lectures_to_delete.append(lecture)
            else:
                lectures[pure_name] = lecture  # Initial insert
    try:
        with db.transaction():
            for lecture in lectures_to_delete:
                lecture.delete_instance()
    except peewee.OperationalError as e:
        print e

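# `__resolve_extension` is used above but defined elsewhere. A hypothetical
# sketch, assuming it returns the trailing extension including the dot (so
# that `lecture.name[:-len(extension)]` strips it) and None for names without
# a usable extension:
def __resolve_extension(name):
    if '.' not in name:
        return None
    ext = '.' + name.rsplit('.', 1)[-1]
    # Mirror the sanity checks from __count_ext: short, alphabetic extensions only
    if len(ext) > 5 or not ext[1:].isalpha():
        return None
    return ext
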
def __persist(self, results):
    rows = []
    for k, v in results.items():
        course = Course.select().where(Course.code == k.code,
                                       Course.year == k.year,
                                       Course.semester == k.semester)
        if not course.exists():
            print "Non-existing course in SIS data: {}".format(k)
            continue
        rows.append({
            'course': course.first(),  # resolve the query to a model instance
            'url': '',
            'path': self.filename,
            'name': 'SISdata',
            'content': v.decode('latin-1').encode('utf-8'),
            'time': datetime.datetime.now(),
            'size': 0
        })
    with db.atomic():
        Lecture.insert_many(rows).execute()

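# For reference, `__persist` assumes `results` maps a key carrying `code`,
# `year` and `semester` attributes to latin-1 encoded raw SIS text. A
# hypothetical illustration of that shape (the key type and values here are
# made up for the example):
import collections

CourseKey = collections.namedtuple('CourseKey', ['code', 'year', 'semester'])
results = {CourseKey('CS101', '2015', 'spring'): 'raw SIS page text'}
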
def extract_all_lectures_tokens(self):
    # Tokenize and clean each lecture separately
    result_data = [
        x for x in (
            self.pool.map(self.__extract_lecture_tokens, Lecture.select()))
        if x
    ]
    # Create acronym dictionary and replace acronyms with definitions
    self.__create_acronym_dict(result_data)
    result_data = self.pool.map(self.__replace_acronyms, result_data)
    # Perform co-occurrence over entire word corpus, filter by course code limit
    docs = [(y[0].course.code, y[2]) for y in result_data]
    self.co_occurring_words = self.co_occ.find_co_occurring_words(
        docs, self.acronyms)
    print "Co-occurring words:", self.co_occurring_words, "; total count:", len(
        self.co_occurring_words)
    # Re-count co-occurring words and remove 'standalone' words
    return self.pool.map(self.__adjust_lecture_counts, result_data)

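# `__replace_acronyms` is not shown in this section. A hypothetical sketch of
# the per-item expansion step, assuming each result_data entry is a tuple of
# (lecture, word counts, token list) as the y[0]/y[2] indexing above suggests,
# and that self.acronyms maps an acronym to its definition string:
def __replace_acronyms(self, item):
    lecture, counts, tokens = item
    expanded = []
    for token in tokens:
        if token in self.acronyms:
            # Replace the acronym with the tokens of its definition
            expanded.extend(self.acronyms[token].split())
        else:
            expanded.append(token)
    return (lecture, counts, expanded)
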
def lda_over_lectures(self):
    """
    Perform LDA over lectures within the scope of an individual course.
    Basically we perform as many LDA modellings as there are courses.
    """
    lectures = []
    for course in Course.select():
        course_lectures = list(
            Lecture.select().where(Lecture.course == course))
        lda_tools = [
            DictVectorizer(),
            lda.LDA(n_topics=len(course_lectures), n_iter=1000,
                    random_state=1)
        ]
        lectures.append((course, course_lectures, LectureWord, lda_tools))
    res = self.pool.map(self.__lda_for_course_material, lectures)
    with db.atomic():
        LectureTopicWord.insert_many([x for y in res for x in y[0]]).execute()
        LectureTopic.insert_many([x for y in res for x in y[1]]).execute()

def process_item(self, item, spider):
    if isinstance(item, DataItem):
        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])
        prefix = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__))) + '/'
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print "Non-existing course: {}".format(course_code)
            return
        if len(content) == 0 and not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print "Could not create directory: {} due to {}".format(
                    dir_name, e)
        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        file_size = 0
        # If there is no lecture record and no content, download the data
        # (pdf, pptx, etc.) according to the url
        if len(content) == 0:
            try:
                info = urllib.urlopen(url).info()
                if 'Content-Length' in info:
                    file_size = float(info['Content-Length'])
            except Exception as e:
                print "Failed to retrieve file size for {} due to {}".format(
                    url, e)
            if not lecture.exists():
                path = self.__download(url, dir_name)
            else:
                lecture_instance = lecture.first()
                # Re-download only if the file has been updated
                if lecture_instance.size == 0 or lecture_instance.size != file_size:
                    os.remove(prefix + lecture_instance.path)
                    self.__download(url, dir_name)
                else:
                    # No need to re-extract content later
                    content = lecture_instance.content
        if not lecture.exists():
            print "Lecture record not found, creating ..."
            title = self.__get_title(url)
            with db.atomic():
                try:
                    Lecture.create(course=course,
                                   url=url,
                                   path=path,
                                   name=title,
                                   content=content,
                                   size=file_size,
                                   time=datetime.datetime.now())
                except peewee.OperationalError as e:
                    print "Could not create a record for course {} lecture {} due to {}".format(
                        course_code, url, e)
        else:
            with db.atomic():
                try:
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.time = datetime.datetime.now()
                    lecture_instance.save()
                except peewee.OperationalError as e:
                    print e
    return item

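# `__download` is not shown here, but the earlier version of process_item
# above inlines the same retrieval logic; a sketch reconstructed from it,
# returning the local path the file was saved to:
def __download(self, url, dir_name):
    filename = os.path.basename(url)
    path = dir_name + filename
    print "Saving {} => {}".format(url, path)
    try:
        urllib.urlretrieve(url, path)
    except IOError as e:
        print "Could not save file: {} into {}. Cause {}".format(url, path, e)
    return path
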
def lda_over_all_material(self):
    """
    Perform LDA over all material without any course limitations.
    The topic count is 1/10 of the material count.
    """
    lectures = Lecture.select()
    lectures_dict = []
    for lecture in lectures:
        lecture_words = LectureWord.select().where(
            LectureWord.lecture == lecture)
        lectures_dict.append(
            dict([(x.word, x.count) for x in lecture_words]))
    topic_count = int(len(lectures_dict) / 10)
    print "Performing LDA over all material.."
    model, vocab = self.__perform_lda_default(lectures_dict, topic_count)
    topic_word_rows = []
    # Iterate over topic-word distributions
    for i, topic_dist in enumerate(model.topic_word_):
        top_topic_words = np.array(vocab)[self.__max_values(
            topic_dist, self.n_top_words)]
        top_word_probs = topic_dist[np.argsort(
            topic_dist)][:-self.n_top_words - 1:-1]
        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            row_dict = {
                'topic': i,
                'word': top_word,
                'weight': round(top_weight * 100, 2)
            }
            topic_word_rows.append(row_dict)
        if self.debug:
            top_word_str = ", ".join([
                x.encode('utf-8') + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topic_words, top_word_probs)
            ])
            print('Topic {}: {}'.format(i, top_word_str))
    # Document-topic distributions
    doc_topic = model.doc_topic_
    lecture_topic_rows = []
    for i in range(lectures.count()):
        top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        for top_topic, top_weight in zip(top_topics, topic_probs):
            rounded_weight = round(top_weight * 100, 2)
            if rounded_weight < 10:
                continue
            row_dict = {
                'lecture': lectures[i],
                'topic': top_topic,
                'weight': rounded_weight
            }
            lecture_topic_rows.append(row_dict)
        if self.debug:
            doc_topic_str = ", ".join([
                str(x) + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topics, topic_probs)
            ])
            print("{} (top {} topics: {})".format(
                lectures[i].name.encode('utf-8'), self.n_top_topic,
                doc_topic_str))
    with db.atomic():
        self.__insert_rows(MaterialTopicWord, topic_word_rows)
        self.__insert_rows(MaterialTopic, lecture_topic_rows)

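# `__perform_lda_default` is not shown in this section. A hypothetical sketch
# mirroring the tooling set up in lda_over_lectures (sklearn's DictVectorizer
# plus the `lda` package), returning the fitted model and its vocabulary:
def __perform_lda_default(self, lectures_dict, topic_count):
    vectorizer = DictVectorizer()
    # Turn the per-lecture {word: count} dicts into an integer document-term matrix
    matrix = vectorizer.fit_transform(lectures_dict).toarray().astype('int64')
    model = lda.LDA(n_topics=topic_count, n_iter=1000, random_state=1)
    model.fit(matrix)
    return model, vectorizer.get_feature_names()
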
def lda_for_course_material(course, n_top_words, n_top_topic):
    lectures = Lecture.select().where(Lecture.course == course)
    lectures_size = Lecture.select().where(Lecture.course == course).count()
    lecture_dict = []
    for lecture in lectures:
        lecture_words = LectureWord.select().where(
            LectureWord.lecture == lecture)
        lecture_dict.append(dict([(x.word, x.count) for x in lecture_words]))
    if not lecture_dict:
        return
    model, vocab = perform_lda(lecture_dict, lectures_size)
    for i, topic_dist in enumerate(model.topic_word_):
        top_topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        top_word_probs = topic_dist[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction():
                    LectureTopicWord.create(course=course,
                                            topic=i,
                                            word=top_word,
                                            weight=round(top_weight * 100, 2))
            except peewee.OperationalError as e:
                print "Could not create a record for topic {}, word {}, {}".format(i, top_word, e)
        top_word_str = ", ".join(
            [remove_accents(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topic_words, top_word_probs)])
        print('Topic {}: {}'.format(i, top_word_str))
    # Document-topic distributions
    doc_topic = model.doc_topic_
    for i in range(lectures_size):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        # Substitute this
        title = remove_accents(lectures[i].path.split("/")[-1])
        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction():
                    LectureTopic.create(lecture=lectures[i],
                                        topic=top_topic,
                                        weight=round(top_weight * 100, 2))
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, topic {1}, {2}" \
                    .format(remove_accents(lectures[i].name), i, e)
        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(title, n_top_topic, doc_topic_str))

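# `remove_accents` is used above but defined elsewhere. A hypothetical sketch
# using unicodedata, assuming its purpose is to keep console output ASCII-safe:
def remove_accents(text):
    import unicodedata
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    # Decompose accented characters, then drop the combining marks
    nfkd = unicodedata.normalize('NFKD', text)
    return nfkd.encode('ascii', 'ignore')
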
def getLectureRecord(self, lectureId):
    try:
        return Lecture.select().where(Lecture.id == lectureId).get()
    except Exception:
        return None

def getLectures(self, course):
    lectures = Lecture.select().where(Lecture.course == course)
    return list(lectures)

from utils.SemesterUtils import parse_semesters
import os
import sys


def is_valid_semester(course_entry, allowed):
    return any([
        x[0] == course_entry.year and x[1] == course_entry.semester
        for x in allowed
    ])


if __name__ == '__main__':
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        path = prefix + lec.path
        if lec.path and os.path.exists(path):
            os.remove(path)
    semesters = []
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])
    courses = Course.select()
    with db.atomic():
        for lec in lectures:
            lec.delete_instance()
        for course in courses:
            if is_valid_semester(course, semesters):
                continue