예제 #1
0
    def process_item(self, item, spider):
        """
        Store a scraped course in the database unless it is already there.

        Only ``CoursesItem`` instances are handled; any other item is
        passed through untouched. Year and semester are taken from the
        first entry of ``item['link']`` split on "/": the second piece is
        the year and the last piece is the semester.

        :param item: scraped item; for a ``CoursesItem`` the 'code',
            'title' and 'link' fields are read.
        :param spider: spider that produced the item (not used here).
        :return: the item that was passed in.
        """
        if not isinstance(item, CoursesItem):
            return item

        code = ''.join(item['code'])
        link_parts = item['link'][0].split("/")
        year = link_parts[1]
        semester = ''.join(link_parts[-1])

        # Look for a record with the same code/year/semester
        existing = Course.select().where(
            Course.code == code,
            Course.year == year,
            Course.semester == semester)
        if existing.exists():
            return item

        print("course record not found, creating")
        try:
            with db.transaction():
                Course.create(
                    code=code,
                    name=''.join(item['title']),
                    year=year,
                    semester=semester,
                    url=''.join(item['link']),
                    # NOTE(review): 'raw_data' acts as the join separator
                    # here, it is not a prefix - confirm this is intended.
                    path='raw_data'.join(item['link'])
                )
        except peewee.OperationalError as e:
            print("Could not create a record for {} due to {}".format(code, e))

        return item
예제 #2
0
    def process_item(self, item, spider):
        """
        Store a scraped course in the database unless it is already there.

        Only ``CoursesItem`` instances are handled; any other item is
        passed through untouched. Year and semester come straight from
        the item's 'year' and 'semester' fields.

        :param item: scraped item; for a ``CoursesItem`` the 'code',
            'title', 'link', 'year' and 'semester' fields are read.
        :param spider: spider that produced the item (not used here).
        :return: the item that was passed in.
        """
        if not isinstance(item, CoursesItem):
            return item

        code = ''.join(item['code'])
        year, semester = item['year'], item['semester']

        # Look for a record with the same code/year/semester
        matches = Course.select().where(
            Course.code == code,
            Course.year == year,
            Course.semester == semester)

        if not matches.exists():
            print("course record not found, creating")
            with db.atomic():
                try:
                    Course.create(
                        code=code,
                        name=''.join(item['title']),
                        year=year,
                        semester=semester,
                        url=''.join(item['link']),
                        # NOTE(review): 'raw_data' acts as the join
                        # separator here, not a prefix - confirm intent.
                        path='raw_data'.join(item['link']))
                except peewee.OperationalError as e:
                    print("Could not create a record for {} due to {}".format(
                        code, e))

        return item
예제 #3
0
    def lda_over_courses(self):
        """
        Run LDA with one document per course (no material/lecture level
        details) and store the outcome.

        Inside a single atomic block this saves the log-likelihood trace,
        the topic-word rows built from both the normalized and the
        unmodified topic-word matrix, and the top ``self.n_top_topic``
        topics of every course. With ``self.debug`` set, a one-line topic
        summary per course is printed as well.
        """
        courses = Course.select()
        courses_size = Course.select(Course.code).distinct().count()

        # One {word: count} mapping per course, in query order
        courses_dict = []
        for course in courses:
            word_rows = CourseWord.select().where(CourseWord.course == course)
            courses_dict.append({row.word: row.count for row in word_rows})

        print("Performing LDA over all courses..")
        model, vocab = self.__perform_lda_default(courses_dict, courses_size)

        # NOTE(review): the "* 10" presumably mirrors the interval at which
        # the model records log-likelihoods - confirm against the LDA setup.
        log_likelihoods = [
            {'iteration': step * 10, 'loglikelihood': round(value, 2)}
            for step, value in enumerate(model.loglikelihoods_)
        ]

        normalized = self.__normalize(model.topic_word_)
        norm_topic_word_rows = self.__resolve_topic_words(normalized, vocab, 2)
        topic_word_rows = self.__resolve_topic_words(
            model.topic_word_, vocab, 1)

        # Document-topic distributions
        doc_topic = model.doc_topic_
        course_topic_rows = []
        for idx in range(courses.count()):
            # Indices of the n_top_topic largest weights, biggest first
            best = np.argsort(doc_topic[idx])[:-self.n_top_topic - 1:-1]
            ranked = list(zip(best, doc_topic[idx][best]))

            course_topic_rows.extend(
                {
                    'course': courses[idx],
                    'topic': topic,
                    'weight': round(prob * 100, 2)
                }
                for topic, prob in ranked)

            if self.debug:
                summary = ", ".join(
                    str(topic) + "(" + str(round(prob * 100, 2)) + "%)"
                    for topic, prob in ranked)
                print("{} (top {} topics: {})".format(
                    courses[idx].name.encode('utf-8'), self.n_top_topic,
                    summary))

        with db.atomic():
            self.__insert_rows(LDALogLikelihood, log_likelihoods)
            self.__insert_rows(TopicWord, norm_topic_word_rows)
            self.__insert_rows(TopicWord, topic_word_rows)
            self.__insert_rows(CourseTopic, course_topic_rows)
예제 #4
0
    def extract_all_lectures_tokens_per_course(self):
        """
        Tokenize and clean the lectures of every course.

        :return: dict mapping each course code to whatever
            ``extract_all_lectures_tokens_per_course_helper`` returns for
            that course's id. If several courses share a code, the entry
            of the last one iterated wins.
        """
        result_data = {}

        for course in Course.select():
            tokens = self.extract_all_lectures_tokens_per_course_helper(
                course.id)
            # Fix: store into the dict that is actually returned; the
            # previous code wrote to an undefined name and raised NameError.
            result_data[course.code] = tokens
        return result_data
예제 #5
0
    def process_item(self, item, spider):
        """
        Download (when needed) and store the lecture described by a
        ``DataItem``.

        Items of any other type are passed through untouched. For a
        ``DataItem``:

        * the target directory ``'raw_data' + path + '/'`` is created if
          missing;
        * when there is no lecture record for (course, url) and the item
          carries no content, the file behind the url is downloaded into
          that directory;
        * a missing lecture record is created; an existing one gets its
          content overwritten when the item carries content.

        Directory, download and database errors are printed rather than
        raised.

        :param item: scraped item; for a ``DataItem`` the 'link', 'path',
            'course_code', 'content', 'year' and 'semester' fields are read.
        :param spider: spider that produced the item (not used here).
        :return: the item that was passed in.
        """
        if not isinstance(item, DataItem):
            return item

        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])

        course = Course.select().where(
            Course.code == course_code,
            Course.year == year,
            Course.semester == semester)
        if not course.exists():
            # Processing deliberately continues with no course attached
            course = None
            print("Non-existing course: {}".format(course_code))

        if not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print("Could not create directory: {} due to {}".format(
                    dir_name, e))

        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        # Evaluate once: nothing below creates a lecture before the check
        # is needed again, so repeating the query was redundant.
        lecture_exists = lecture.exists()

        if not lecture_exists:
            # No lecture record and no content: download the data
            # (pdf, pptx, etc.) the url points to.
            if not content:
                filename = os.path.basename(url)
                path = dir_name + filename
                print("Saving {} => {}".format(url, path))
                try:
                    urllib.urlretrieve(url, path)
                except IOError as e:
                    print("Could not save file: {} into {}. Cause {}".format(
                        url, path, e))

            print("Lecture record not found, creating ...")
            try:
                title = self.__get_title(url)
                with db.transaction():
                    Lecture.create(
                        course=course,
                        url=url,
                        path=path,
                        name=title,
                        content=content
                    )
            except peewee.OperationalError as e:
                print("Could not create a record for course {} lecture {} "
                      "due to {}".format(course_code, url, e))
        elif content:
            try:
                with db.transaction():
                    # Fix: `lecture` is a select query, not a row. Fetch the
                    # model instance before assigning and saving; the old
                    # code called save() on the query object itself.
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.save()
            except peewee.OperationalError as e:
                print(e)
        return item
예제 #6
0
def __remove_duplicates():
    """
    Delete duplicate lecture material, one course at a time.

    Two lectures of a course count as duplicates when their names are
    equal once the extension reported by ``__resolve_extension`` is cut
    off. Of such a pair, the one seen first is dropped if its name ends
    in '.pdf' (extracting text from pdf is more prone to errors);
    otherwise the one seen later is dropped. Lectures for which no
    extension is resolved are ignored. Database errors during deletion
    are printed, not raised.
    """
    print("removing duplicates: {}".format(len(Course.select())))
    doomed = []
    for course in Course.select():
        print("Course: {}".format(course.code))
        print("Lectures: {}".format(
            len(Lecture.select().where(Lecture.course == course))))

        kept_by_stem = {}
        for lecture in Lecture.select().where(Lecture.course == course):
            extension = __resolve_extension(lecture.name)
            if not extension:
                continue

            # Lecture name with the extension stripped
            stem = lecture.name[:-len(extension)]
            if stem not in kept_by_stem:
                kept_by_stem[stem] = lecture  # first time this name is seen
                continue

            earlier = kept_by_stem[stem]
            if earlier.name.endswith('.pdf'):
                # Prefer anything to the .pdf variant
                doomed.append(earlier)
                kept_by_stem[stem] = lecture
            else:
                doomed.append(lecture)

    try:
        with db.transaction():
            for lecture in doomed:
                lecture.delete_instance()
    except peewee.OperationalError as e:
        print(e)
예제 #7
0
    def __persist(self, results):
        """
        Store SIS data as one ``Lecture`` row per known course.

        :param results: mapping whose keys expose ``code``, ``year`` and
            ``semester`` and whose values are byte strings; the values are
            decoded as latin-1 and re-encoded as utf-8 before storage.

        Keys with no matching course are reported and skipped. When
        nothing is left to store, no insert is issued at all.
        """
        rows = []
        for k, v in results.items():
            course = Course.select().where(Course.code == k.code,
                                           Course.year == k.year,
                                           Course.semester == k.semester)
            if not course.exists():
                print("Non-existing course in SIS data: {}".format(k))
                continue

            rows.append({
                'course': course,
                'url': '',
                'path': self.filename,
                'name': 'SISdata',
                'content': v.decode('latin-1').encode('utf-8'),
                'time': datetime.datetime.now(),
                'size': 0
            })

        # Fix: never execute a bulk insert with an empty row list; there is
        # nothing to write, and an empty INSERT is at best a wasted query.
        if not rows:
            return

        with db.atomic():
            Lecture.insert_many(rows).execute()
예제 #8
0
    def lda_over_lectures(self):
        """
        Perform LDA over lectures within the scope of an individual course.

        One LDA model is prepared per course, with as many topics as the
        course has lectures. The jobs are handed to ``self.pool.map`` and
        the returned rows are bulk-inserted in one atomic block: the first
        element of every result goes to ``LectureTopicWord``, the second
        to ``LectureTopic``.
        """
        jobs = []
        for course in Course.select():
            course_lectures = list(
                Lecture.select().where(Lecture.course == course))
            vectorizer = DictVectorizer()
            model = lda.LDA(n_topics=len(course_lectures),
                            n_iter=1000,
                            random_state=1)
            jobs.append(
                (course, course_lectures, LectureWord, [vectorizer, model]))

        res = self.pool.map(self.__lda_for_course_material, jobs)

        # Flatten the per-course row lists before inserting
        topic_word_rows = [row for outcome in res for row in outcome[0]]
        lecture_topic_rows = [row for outcome in res for row in outcome[1]]

        with db.atomic():
            LectureTopicWord.insert_many(topic_word_rows).execute()
            LectureTopic.insert_many(lecture_topic_rows).execute()
예제 #9
0
 def __get_courses(course_id=0):
     """
     Return courses as a list.

     :param course_id: id of the single course to fetch; any falsy value
         (the default 0 included) selects every course.
     :return: list of the matching ``Course`` rows.
     """
     query = Course.select()
     if course_id:
         query = query.where(Course.id == course_id)
     return list(query)
예제 #10
0
    def process_item(self, item, spider):
        """
        Download (or re-download) and store the lecture described by a
        ``DataItem``.

        Items of any other type are returned untouched. For a ``DataItem``
        whose course is not in the database a message is printed and the
        method returns ``None``; otherwise the item itself is returned.

        :param item: scraped item; for a ``DataItem`` the 'link', 'path',
            'course_code', 'content', 'year' and 'semester' fields are read.
        :param spider: spider that produced the item (not used here).
        """
        if isinstance(item, DataItem):
            url = ''.join(item['link'])
            dir_name = 'raw_data' + ''.join(item['path']) + '/'
            course_code = ''.join(item['course_code'])
            content = ''.join(item['content'])
            path = ''
            year = ''.join(item['year'])
            semester = ''.join(item['semester'])
            # Directory two levels above this file, with a trailing slash;
            # stored lecture paths are resolved against it below.
            prefix = os.path.dirname(os.path.dirname(
                os.path.abspath(__file__))) + '/'

            course = Course.select().where(Course.code == course_code,
                                           Course.year == year,
                                           Course.semester == semester)
            if not course.exists():
                print "Non-existing course: {}".format(course_code)
                # NOTE(review): this returns None rather than the item -
                # confirm that is what the pipeline is meant to hand on.
                return

            # The download directory is only needed when the item carries
            # no inline content.
            if len(content) == 0 and not os.path.exists(dir_name):
                try:
                    os.makedirs(dir_name)
                except OSError as e:
                    print "Could not create directory: {} due to {}".format(
                        dir_name, e)

            lecture = Lecture.select().where(Lecture.course == course,
                                             Lecture.url == url)
            file_size = 0
            # No inline content: the data (pdf, pptx, etc.) behind the url
            # has to be downloaded, unless an up-to-date copy is recorded.
            if len(content) == 0:
                # Read the remote size from the Content-Length header;
                # file_size stays 0 when the lookup fails or the header
                # is absent.
                try:
                    info = urllib.urlopen(url).info()
                    if 'Content-Length' in info:
                        file_size = float(info['Content-Length'])
                except Exception as e:
                    print "Failed to retrieve file size for {} due to {}".format(
                        url, e)
                if not lecture.exists():
                    path = self.__download(url, dir_name)
                else:
                    lecture_instance = lecture.first()

                    # Re-download only if the file has been updated
                    # (recorded size is 0 or differs from the remote size)
                    if lecture_instance.size == 0 or lecture_instance.size != file_size:
                        # NOTE(review): os.remove raises OSError when the
                        # stored file is already gone; nothing catches it.
                        os.remove(prefix + lecture_instance.path)
                        self.__download(url, dir_name)
                    else:
                        content = lecture_instance.content  # No need to re-extract content later

            if not lecture.exists():
                print "Lecture record not found, creating ..."
                title = self.__get_title(url)
                with db.atomic():
                    try:
                        Lecture.create(course=course,
                                       url=url,
                                       path=path,
                                       name=title,
                                       content=content,
                                       size=file_size,
                                       time=datetime.datetime.now())
                    except peewee.OperationalError as e:
                        print "Could not create a record for course {} lecture {} due to {}".format(
                            course_code, url, e)
            else:
                # Existing record: overwrite content and timestamp. After
                # a re-download above, `content` is still the empty string.
                # NOTE(review): `size` is not written back here - confirm
                # it is refreshed elsewhere, otherwise a changed file is
                # re-downloaded on every run.
                with db.atomic():
                    try:
                        lecture_instance = lecture.first()
                        lecture_instance.content = content
                        lecture_instance.time = datetime.datetime.now()
                        lecture_instance.save()
                    except peewee.OperationalError as e:
                        print e
        return item
예제 #11
0
def lda_over_courses(n_top_words, n_top_topic):
    """
    Run LDA with one document per course, store and print the results.

    Stores one ``LDALogLikelihood`` row per recorded log-likelihood, the
    ``n_top_words`` heaviest words of every topic as ``TopicWord`` rows,
    and the ``n_top_topic`` heaviest topics of every course as
    ``CourseTopic`` rows. Each row is written in its own transaction;
    database errors are printed and the run continues.

    :param n_top_words: number of top words kept per topic.
    :param n_top_topic: number of top topics kept per course.
    """
    courses = Course.select()
    courses_size = Course.select(Course.code).distinct().count()
    courses_dict = []
    for course in courses:
        course_words = CourseWord.select().where(CourseWord.course == course)
        courses_dict.append(dict([(x.word, x.count) for x in course_words]))

    print("Performing LDA over all courses..")
    model, vocab = perform_lda(courses_dict, courses_size)

    # NOTE(review): the "* 10" presumably mirrors the interval at which the
    # model records log-likelihoods - confirm against perform_lda.
    for i, x in enumerate(model.loglikelihoods_):
        try:
            with db.transaction() as txn:
                LDALogLikelihood.create(
                    iteration=i * 10,
                    loglikelihood=round(x, 2),
                )
                txn.commit()
        except peewee.OperationalError as e:
            print("Could not create a record for loglikelihood {}, {}".format(
                x, e))

    # Iterate over topic word distributions
    vocab_array = np.array(vocab)  # loop-invariant, build once
    for i, topic_dist in enumerate(model.topic_word_):
        # Indices of the n_top_words largest weights, biggest first.
        # Computed once and reused for both words and probabilities.
        top_idx = np.argsort(topic_dist)[:-n_top_words - 1:-1]
        top_topic_words = vocab_array[top_idx]
        top_word_probs = topic_dist[top_idx]

        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction() as txn:
                    TopicWord.create(
                        topic=i,
                        word=top_word,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print("Could not create a record for topic {}, word {}, {}"
                      .format(i, top_word, e))

        top_word_str = ", ".join(
            [remove_accents(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topic_words, top_word_probs)])

        print('Topic {}: {}'.format(i, top_word_str))

    # Document-topic distributions
    doc_topic = model.doc_topic_

    # Fix: walk every modelled document. The distinct-code count can be
    # smaller than the number of courses fed to the model, which silently
    # skipped the trailing courses.
    for i in range(len(courses_dict)):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]

        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction() as txn:
                    CourseTopic.create(
                        course=courses[i],
                        topic=top_topic,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                # Fix: report the topic that failed, not the course index
                print("Could not create a record for course {0}, topic {1}, {2}"
                      .format(remove_accents(courses[i].name), top_topic, e))

        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)"
             for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(
            remove_accents(courses[i].name), n_top_topic, doc_topic_str))
예제 #12
0
def lda_over_lectures(n_top_words, n_top_topic):
    """
    Run the per-course material LDA for every course in the database.

    :param n_top_words: passed through to ``lda_for_course_material``.
    :param n_top_topic: passed through to ``lda_for_course_material``.
    """
    for current in Course.select():
        # Announce the course before its model is built
        print("LDA for course: " + current.name)
        lda_for_course_material(current, n_top_words, n_top_topic)
예제 #13
0
 def getCourses(self, courseId=0):
     """
     Return courses as a list.

     :param courseId: id of the single course to fetch; any falsy value
         (the default 0 included) selects every course.
     :return: list of the matching ``Course`` rows.
     """
     selection = Course.select()
     if courseId:
         selection = selection.where(Course.id == courseId)
     return list(selection)
예제 #14
0
 def getCourseRecord(self, courseId):
     """
     Fetch a single course by id.

     :param courseId: value compared against ``Course.id``.
     :return: the matching ``Course`` row, or None when the lookup raises
         any ``Exception`` (a missing row included).
     """
     try:
         record = Course.select().where(Course.id == courseId).get()
     except Exception:
         record = None
     return record
예제 #15
0
import sys


def is_valid_semester(course_entry, allowed):
    """
    Tell whether a course falls into one of the allowed semesters.

    :param course_entry: object exposing ``year`` and ``semester``.
    :param allowed: iterable of indexable entries whose item 0 is a year
        and item 1 a semester.
    :return: True if some entry matches both the year and the semester of
        ``course_entry``; False otherwise (an empty ``allowed`` included).
    """
    # A generator lets any() stop at the first match instead of building
    # the full list of comparisons up front.
    return any(
        x[0] == course_entry.year and x[1] == course_entry.semester
        for x in allowed
    )


if __name__ == '__main__':
    # Clean-up script. Files of lectures whose `time` column is NULL are
    # always removed from disk. When exactly one command-line argument is
    # given, it is parsed into the allowed semesters; those lectures are
    # then deleted from the database together with every course (and,
    # recursively, its dependants) outside the allowed semesters.
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'

    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        # Fix: test lec.path before concatenating. The old order built
        # `prefix + lec.path` first and raised TypeError on a NULL path.
        if not lec.path:
            continue
        path = prefix + lec.path
        if os.path.exists(path):
            os.remove(path)

    semesters = []
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])
        courses = Course.select()
        with db.atomic():
            for lec in lectures:
                lec.delete_instance()
            for course in courses:
                if is_valid_semester(course, semesters):
                    continue
                course.delete_instance(recursive=True)