Exemplo n.º 1
0
    def process_item(self, item, spider):
        """Store a scraped ``DataItem`` as a ``Lecture`` record.

        When the lecture has no record yet and the item carries no inline
        content, the file behind the link is downloaded into
        ``raw_data<path>/``. A missing record is then created; an existing
        record gets its content replaced when the item carries non-empty
        content. Items of any other type are passed through untouched.

        Failures (directory creation, download, database operational errors)
        are reported on stdout and do not abort processing.

        :param item: the scraped item; each ``DataItem`` field is read as a
            sequence of strings and concatenated.
        :param spider: the spider that produced the item (unused here).
        :return: the ``item`` that was passed in.
        """
        if not isinstance(item, DataItem):
            return item

        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])

        # NOTE(review): ``course`` stays a query object (or None) and is used
        # below as the foreign-key value -- confirm peewee resolves it as
        # intended.
        course = Course.select().where(Course.code == course_code, Course.year == year, Course.semester == semester)
        if not course.exists():
            course = None
            print("Non-existing course: {}".format(course_code))

        if not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print("Could not create directory: {} due to {}".format(dir_name, e))

        lecture_query = Lecture.select().where(Lecture.course == course, Lecture.url == url)
        # Nothing below creates the record before this flag is last read, so
        # one existence check is enough.
        lecture_exists = lecture_query.exists()

        if not lecture_exists:
            # No record and no inline content: the link points at a file
            # (pdf, pptx, etc.), so download it.
            if len(content) == 0:
                filename = os.path.basename(url)
                path = dir_name + filename
                print("Saving {} => {}".format(url, path))
                try:
                    urllib.urlretrieve(url, path)
                except IOError as e:
                    print("Could not save file: {} into {}. Cause {}".format(url, path, e))

            print("Lecture record not found, creating ...")
            try:
                title = self.__get_title(url)
                with db.transaction():
                    Lecture.create(
                        course=course,
                        url=url,
                        path=path,
                        name=title,
                        content=content
                    )
            except peewee.OperationalError as e:
                print("Could not create a record for course {} lecture {} due to {}".format(course_code, url, e))
        elif len(content) > 0:
            try:
                with db.transaction():
                    # A select query has no save(); load the stored row and
                    # update that instead.
                    lecture = lecture_query.get()
                    lecture.content = content
                    lecture.save()
            except peewee.OperationalError as e:
                print(e)
        return item
Exemplo n.º 2
0
    def extractLectureTokens(self, lecture):
        """Create one ``LectureWord`` row per token of a lecture's content.

        Tokens come from ``self.extractTokens(lecture.content)`` and are
        processed in ascending order of their count. Every row is written in
        its own transaction; an ``OperationalError`` is printed and the loop
        moves on to the next token.

        :param lecture: the lecture to tokenize, or ``None``.
        :return: ``False`` when ``lecture`` is ``None``, otherwise ``True``.
        """
        if lecture is None:
            return False

        token_counts = self.extractTokens(lecture.content)
        # Each entry is a (word, count) pair; order by the count.
        ordered_entries = sorted(token_counts.items(), key=operator.itemgetter(1))

        for entry in ordered_entries:
            word, count = entry
            try:
                with db.transaction() as txn:
                    LectureWord.create(
                        lecture=lecture,
                        word=word,
                        count=count,
                        active=True,
                        weight=0
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print("Could not create a record for lecture {0}, word {1}, {2}".format(lecture.id, word, e))

            if self.debug:
                print(entry)

        return True
Exemplo n.º 3
0
    def process_item(self, item, spider):
        """Create a ``Course`` record for a scraped ``CoursesItem`` if missing.

        The year is the second "/"-separated part of the item's first link
        and the semester is the last part. A course is looked up by code,
        year and semester; only when no such row exists is a new one created.
        Items of any other type are passed through untouched.

        :param item: the scraped item.
        :param spider: the spider that produced the item (unused here).
        :return: the ``item`` that was passed in.
        """
        if not isinstance(item, CoursesItem):
            return item

        course_code = ''.join(item['code'])
        link_parts = item['link'][0].split("/")
        year = link_parts[1]
        semester = ''.join(link_parts[-1])

        # Look for an already stored course with the same identity.
        existing = Course.select().where(Course.code == course_code, Course.year == year, Course.semester == semester)
        if existing.exists():
            return item

        print("course record not found, creating")
        try:
            with db.transaction():
                Course.create(
                    code=course_code,
                    name=''.join(item['title']),
                    year=year,
                    semester=semester,
                    url=''.join(item['link']),
                    # NOTE(review): 'raw_data' is used as the *separator*
                    # here, so a single-element link yields the bare link
                    # without a 'raw_data' prefix -- confirm this is intended.
                    path='raw_data'.join(item['link'])
                )
        except peewee.OperationalError as e:
            print("Could not create a record for {} due to {}".format(course_code, e))

        return item
Exemplo n.º 4
0
    def createCourseTokens(self):
        """Aggregate lecture word counts per course into ``CourseWord`` rows.

        For every course from ``self.getCourses()`` this walks the lecture
        words of all its lectures and collects, per word, the summed
        ``count`` and the number of lecture-word rows seen for it. One
        ``CourseWord`` is then created per word (ascending by total count),
        each in its own transaction. An ``OperationalError`` is printed and
        the loop moves on to the next word.
        """
        for course in self.getCourses():
            print("{}: {}".format(course.id, course.name.encode('utf8')))
            total_counts = {}
            row_counts = {}

            for lecture in self.getLectures(course):
                for lecture_word in self.getLectureWords(lecture):
                    word = lecture_word.word
                    # dict.get replaces the deprecated dict.has_key test.
                    total_counts[word] = total_counts.get(word, 0) + lecture_word.count
                    row_counts[word] = row_counts.get(word, 0) + 1

            sorted_tokens = sorted(total_counts.items(), key=operator.itemgetter(1))
            for word, count in sorted_tokens:
                try:
                    with db.transaction() as txn:
                        CourseWord.create(
                            course=course,
                            word=word,
                            count=count,
                            active=True,
                            lectures=row_counts[word]
                        )
                        txn.commit()
                except peewee.OperationalError as e:
                    print("Could not create a record for course {0}, word {1}, {2}".format(
                        course.name.encode('utf8'), word.encode('utf8'), e))
0
    def extract_text(self):
        """Convert content-less "*docx" lectures via the pool and save them.

        Selects lectures whose content is empty and whose url matches
        ``"*docx"``, maps ``self.__convert`` over them with ``self.pool`` and
        saves every truthy result in its own transaction. An
        ``OperationalError`` raised while saving is printed and skipped.
        """
        pending = Lecture.select().where(Lecture.content == '', Lecture.url % "*docx")

        for converted in self.pool.map(self.__convert, pending):
            # Falsy results are skipped.
            if not converted:
                continue
            try:
                with db.transaction():
                    converted.save()
            except peewee.OperationalError as e:
                print(e)
Exemplo n.º 6
0
 def extract_text(self):
     """Replace stored lecture markup with its plain text.

     Applies to lectures that have content but an empty path. The content is
     parsed with BeautifulSoup, replaced by ``get_text()`` and the path is
     set to the marker ``'html2txt'`` before the row is saved. An
     ``OperationalError`` raised while saving is printed and skipped.
     """
     html_lectures = Lecture.select().where(Lecture.content != '', Lecture.path == "")
     for record in list(html_lectures):
         markup = BeautifulSoup(record.content)
         print(record.url)
         record.content = markup.get_text()
         # With a non-empty path the row no longer matches the query above.
         record.path = 'html2txt'
         try:
             with db.transaction():
                 record.save()
         except peewee.OperationalError as e:
             print(e)
Exemplo n.º 7
0
 def extract_text(self):
     """Fill in the content of content-less "*pptx" lectures from disk.

     For each lecture with empty content whose url matches ``"*pptx"``, the
     file at ``self.prefix + lecture.path`` is converted with
     ``self.__convert`` and the result saved as the lecture's content.
     Missing files are reported and skipped. An ``OperationalError`` raised
     while saving is printed and skipped.
     """
     candidates = Lecture.select().where(Lecture.content == '', Lecture.url % "*pptx")
     for record in list(candidates):
         full_path = self.prefix + record.path
         if os.path.exists(full_path):
             print(record.url)
             record.content = self.__convert(full_path)
             try:
                 with db.transaction():
                     record.save()
             except peewee.OperationalError as e:
                 print(e)
         else:
             print("File not found: {0}".format(record.path))
Exemplo n.º 8
0
    def __clean_words_table(table, removable_words):
        """Delete every row of ``table`` whose ``word`` is in ``removable_words``.

        All rows are read first and the matching ones are then removed inside
        a single transaction. An ``OperationalError`` is printed, not raised.

        NOTE(review): there is no ``self`` parameter -- presumably this is
        declared as a static method outside the visible span; confirm.

        :param table: model class to clean; its ``__name__`` is printed.
        :param removable_words: container of words whose rows are deleted.
        """
        print("Cleaning table {}".format(table.__name__))

        doomed = []
        for row in table.select():
            if row.word in removable_words:
                doomed.append(row)

        try:
            with db.transaction():
                for row in doomed:
                    row.delete_instance()
        except peewee.OperationalError as e:
            print(e)
Exemplo n.º 9
0
    def calc_tf(self):
        """Store a normalized term-frequency weight on every lecture word.

        For each lecture of the courses from ``self.getCourses(55)`` the
        weight of a word is ``0.5 + 0.5 * count / max_count``, where
        ``max_count`` is the largest count among that lecture's words. Each
        word is saved in its own transaction. An ``OperationalError`` is
        printed and skipped.
        """
        for course in self.getCourses(55):
            print(course.name)
            for lecture in self.getLectures(course):
                # Largest count in this lecture, never below zero.
                highest = max([0] + [entry.count for entry in self.getLectureWords(lecture)])

                for entry in self.getLectureWords(lecture):
                    try:
                        with db.transaction():
                            entry.weight = 0.5 + (0.5 * entry.count) / highest
                            entry.save()
                    except peewee.OperationalError as e:
                        print(e)
Exemplo n.º 10
0
    def createCorpusTokens(self):
        """Sum course word counts over the whole corpus into ``CorpusWord`` rows.

        Adds up the ``count`` of every course word from
        ``self.getCourseWords()`` per word, then creates one ``CorpusWord``
        per word (ascending by total count), each in its own transaction.
        Every (word, count) pair is printed before it is stored. An
        ``OperationalError`` is printed and the loop moves on.
        """
        totals = {}
        for course_word in self.getCourseWords():
            # dict.get replaces the deprecated dict.has_key test.
            totals[course_word.word] = totals.get(course_word.word, 0) + course_word.count

        sorted_tokens = sorted(totals.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            print(token)
            word, count = token
            try:
                with db.transaction() as txn:
                    CorpusWord.create(
                        word=word,
                        count=count,
                        active=True
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print("Could not create a record for word {}, {}".format(word, e))
Exemplo n.º 11
0
def __remove_duplicates():
    """
    Delete duplicate lecture material inside each course.

    Two lectures of the same course are duplicates when their names are
    equal once the extension (as reported by ``__resolve_extension``) is
    cut off. Lectures without a resolvable extension are ignored. Of two
    duplicates, an already seen ``.pdf`` is dropped in favour of the later
    lecture; in every other case the later lecture is dropped. All
    deletions run in one transaction at the end. An ``OperationalError`` is
    printed, not raised.
    """
    print("removing duplicates: {}".format(len(Course.select())))
    doomed = []
    for course in Course.select():
        print("Course: {}".format(course.code))
        kept_by_stem = {}
        print("Lectures: {}".format(
            len(Lecture.select().where(Lecture.course == course))))
        for lecture in Lecture.select().where(Lecture.course == course):
            extension = __resolve_extension(lecture.name)
            if not extension:
                continue

            # Lecture name with the extension stripped.
            stem = lecture.name[:-len(extension)]
            if stem not in kept_by_stem:
                kept_by_stem[stem] = lecture  # first lecture with this stem
                continue

            previous = kept_by_stem[stem]
            if previous.name.endswith('.pdf'):
                # Anything is preferred over a .pdf: swap the kept lecture.
                doomed.append(previous)
                kept_by_stem[stem] = lecture
            else:
                doomed.append(lecture)

    try:
        with db.transaction():
            for lecture in doomed:
                lecture.delete_instance()
    except peewee.OperationalError as e:
        print(e)
Exemplo n.º 12
0
def lda_over_courses(n_top_words, n_top_topic):
    """Run LDA over the word counts of all courses and store the results.

    Builds one ``{word: count}`` dict per course from its ``CourseWord``
    rows, hands them to ``perform_lda`` and then persists:

    * one ``LDALogLikelihood`` row per recorded log-likelihood value,
    * the ``n_top_words`` heaviest words of every topic (``TopicWord``),
    * the ``n_top_topic`` heaviest topics of every course (``CourseTopic``).

    Weights are stored as percentages rounded to two decimals. A summary line
    per topic and per course is printed. An ``OperationalError`` is printed
    and the affected row is skipped.

    :param n_top_words: number of top words to keep per topic.
    :param n_top_topic: number of top topics to keep per course.
    """
    courses = Course.select()
    # NOTE(review): this counts distinct course codes, while ``courses_dict``
    # gets one entry per course row -- confirm the two always agree.
    courses_size = Course.select(Course.code).distinct().count()
    courses_dict = []
    for course in courses:
        course_words = CourseWord.select().where(CourseWord.course == course)
        courses_dict.append({x.word: x.count for x in course_words})

    print("Performing LDA over all courses..")
    model, vocab = perform_lda(courses_dict, courses_size)

    for i, x in enumerate(model.loglikelihoods_):
        try:
            with db.transaction() as txn:
                LDALogLikelihood.create(
                    # NOTE(review): presumably one value is recorded every
                    # 10 iterations -- confirm against perform_lda.
                    iteration=i * 10,
                    loglikelihood=round(x, 2),
                )
                txn.commit()
        except peewee.OperationalError as e:
            print("Could not create a record for loglikelihood {}, {}".format(x, e))

    # Iterate over topic word distributions
    vocab_array = np.array(vocab)
    for i, topic_dist in enumerate(model.topic_word_):
        # Indices of the n_top_words largest weights, heaviest first.
        top_indices = np.argsort(topic_dist)[:-n_top_words - 1:-1]
        top_topic_words = vocab_array[top_indices]
        top_word_probs = topic_dist[top_indices]

        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction() as txn:
                    TopicWord.create(
                        topic=i,
                        word=top_word,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print("Could not create a record for topic {}, word {}, {}".format(i, top_word, e))

        top_word_str = ", ".join([remove_accents(x) + "(" + str(round(y * 100, 2)) + "%)"
                                  for x, y in zip(top_topic_words, top_word_probs)])

        print('Topic {}: {}'.format(i, top_word_str))

    # Document-topic distributions
    doc_topic = model.doc_topic_

    for i in range(courses_size):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        course = courses[i]

        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction() as txn:
                    CourseTopic.create(
                        course=course,
                        topic=top_topic,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                # Report the topic that failed, not the course index.
                print("Could not create a record for course {0}, topic {1}, {2}"
                      .format(remove_accents(course.name), top_topic, e))

        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)" for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(remove_accents(course.name), n_top_topic, doc_topic_str))
0
def lda_for_course_material(course, n_top_words, n_top_topic):
    """Run LDA over the lectures of one course and store the results.

    Builds one ``{word: count}`` dict per lecture of ``course`` from its
    ``LectureWord`` rows and returns early when the course has no lectures.
    Otherwise the dicts are handed to ``perform_lda`` and the function
    persists:

    * the ``n_top_words`` heaviest words of every topic
      (``LectureTopicWord``),
    * the ``n_top_topic`` heaviest topics of every lecture
      (``LectureTopic``).

    Weights are stored as percentages rounded to two decimals. A summary line
    per topic and per lecture is printed. An ``OperationalError`` is printed
    and the affected row is skipped.

    :param course: the course whose lectures are analysed.
    :param n_top_words: number of top words to keep per topic.
    :param n_top_topic: number of top topics to keep per lecture.
    """
    lectures = Lecture.select().where(Lecture.course == course)
    lectures_size = Lecture.select().where(Lecture.course == course).count()
    lecture_dict = []
    for lecture in lectures:
        lecture_words = LectureWord.select().where(LectureWord.lecture == lecture)
        lecture_dict.append({x.word: x.count for x in lecture_words})

    if not lecture_dict:
        return

    model, vocab = perform_lda(lecture_dict, lectures_size)

    vocab_array = np.array(vocab)
    for i, topic_dist in enumerate(model.topic_word_):
        # Indices of the n_top_words largest weights, heaviest first.
        top_indices = np.argsort(topic_dist)[:-n_top_words - 1:-1]
        top_topic_words = vocab_array[top_indices]
        top_word_probs = topic_dist[top_indices]

        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction() as txn:
                    LectureTopicWord.create(
                        course=course,
                        topic=i,
                        word=top_word,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print("Could not create a record for topic {}, word {}, {}".format(i, top_word, e))

        # Scale to a percentage first, then round, so the printed value
        # matches the weight stored above.
        top_word_str = ", ".join([remove_accents(x) + "(" + str(round(y * 100, 2)) + "%)"
                                  for x, y in zip(top_topic_words, top_word_probs)])

        print('Topic {}: {}'.format(i, top_word_str))

    # Document-topic distributions
    doc_topic = model.doc_topic_

    for i in range(lectures_size):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        lecture = lectures[i]

        # Title for the summary line: last "/"-separated part of the path.
        # (The original author marked this derivation for replacement.)
        title = remove_accents(lecture.path.split("/")[-1])

        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction() as txn:
                    LectureTopic.create(
                        lecture=lecture,
                        topic=top_topic,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                # Report the topic that failed, not the lecture index.
                print("Could not create a record for lecture {0}, topic {1}, {2}"
                      .format(remove_accents(lecture.name), top_topic, e))

        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)" for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(title, n_top_topic, doc_topic_str))