예제 #1
0
    def extract_all_lectures_tokens(self):
        print "extract_all_lectures_tokens"
        print "lectures total: {}".format(len(Lecture.select()))
        # Tokenize and clean each lecture separately
        # result_data = [x for x in (self.pool.map(self.__extract_lecture_tokens, Lecture.select())) if x]
        result_data = []
        i = 1
        for x in Lecture.select().order_by(Lecture.id):
            y = self.__extract_lecture_tokens(x)
            if y:
                print "index: {}".format(i)
                result_data.append(y)
                i += 1
        print "result_data {}".format(result_data)

        #Create acronym dictionary and replace acronyms with definitions
        self.__create_acronym_dict(result_data)
        result_data = self.pool.map(self.__replace_acronyms, result_data)

        # Perform co-occurrence over entire word corpus, filter by course code limit
        docs = [(y[0].course.code, y[2]) for y in result_data]
        self.co_occurring_words = self.co_occ.find_co_occurring_words(
            docs, self.acronyms)
        print "Co-occurring words:", self.co_occurring_words, "; total count:", len(
            self.co_occurring_words)
        # Re-count co-occurring words and remove 'standalone' words
        print "done"
        return self.pool.map(self.__adjust_lecture_counts, result_data)
예제 #2
0
    def process_item(self, item, spider):
        if isinstance(item, DataItem):
            url = ''.join(item['link'])
            dir_name = 'raw_data' + ''.join(item['path']) + '/'
            course_code = ''.join(item['course_code'])
            content = ''.join(item['content'])
            path = ''
            year = ''.join(item['year'])
            semester = ''.join(item['semester'])

            course = Course.select().where(Course.code == course_code, Course.year == year, Course.semester == semester)
            if not course.exists():
                course = None
                print "Non-existing course: {}".format(course_code)

            if not os.path.exists(dir_name):
                try:
                    os.makedirs(dir_name)
                except OSError as e:
                    print "Could not create directory: {} due to {}".format(dir_name, e)

            lecture = Lecture.select().where(Lecture.course == course, Lecture.url == url)
            # if no lecture record and no content, then download data (pdf, pptx, etc.) according to url
            if not lecture.exists() and len(content) == 0:
                filename = os.path.basename(url)
                path = dir_name + filename
                print "Saving {} => {}".format(url, path)
                try:
                    urllib.urlretrieve(url, path)
                except IOError as e:
                    print "Could not save file: {} into {}. Cause {}".format(url, path, e)

            if not lecture.exists():
                print "Lecture record not found, creating ..."
                try:
                    title = self.__get_title(url)
                    with db.transaction():
                        Lecture.create(
                            course=course,
                            url=url,
                            path=path,
                            name=title,
                            content=content
                        )
                except peewee.OperationalError as e:
                    print "Could not create a record for course {} lecture {} due to {}".format(course_code, url, e)
            else:
                if len(content) > 0:
                    try:
                        with db.transaction():
                            lecture.content = content
                            lecture.save()
                    except peewee.OperationalError as e:
                        print e
        return item
예제 #3
0
    def __count_ext(self):
        """
        Count file extensions among lectures that have no content yet.

        The extension is the part of the lecture name after its last dot. A
        lecture is skipped when the extension is blacklisted, not purely
        alphabetic or blank, when its name contains no dot, or when its url is
        not http(s) or has no '/' after the scheme prefix.

        :return: dict mapping extension -> count, restricted to extensions
                 seen more than once and shorter than five characters
        """
        counts = {}
        for lecture in Lecture.select().where(Lecture.content == ''):
            # Strip the scheme prefix ('https://' is 8 chars, 'http://' is 7)
            if lecture.url.startswith('https'):
                clean_url = lecture.url[8:]
            elif lecture.url.startswith('http'):
                clean_url = lecture.url[7:]
            else:
                clean_url = None

            ext = lecture.name.split('.')[-1]
            acceptable = (ext not in self.blacklist
                          and ext.isalpha()
                          and clean_url
                          and clean_url.find('/') >= 0
                          and ext.strip() != ''
                          and lecture.name.find('.') >= 0)
            if not acceptable:
                continue

            counts[ext] = counts.get(ext, 0) + 1

        frequent = {}
        for ext, seen in counts.iteritems():
            if seen > 1 and len(ext) < 5:
                frequent[ext] = seen
        return frequent
예제 #4
0
    def extract_text(self):
        lectures = Lecture.select().where(Lecture.content == '', Lecture.url % "*docx")

        result_lectures = self.pool.map(self.__convert, lectures)

        for lecture in result_lectures:
            if lecture:
                try:
                    with db.transaction():
                        lecture.save()
                except peewee.OperationalError as e:
                    print e
예제 #5
0
파일: html2txt.py 프로젝트: mvels/biseminar
 def extract_text(self):
     lectures = Lecture.select().where(Lecture.content != '', Lecture.path == "")
     for lecture in list(lectures):
         soup = BeautifulSoup(lecture.content)
         print lecture.url
         lecture.content = soup.get_text()
         lecture.path = 'html2txt'
         try:
             with db.transaction():
                 lecture.save()
         except peewee.OperationalError as e:
             print e
예제 #6
0
파일: pptx2txt.py 프로젝트: mvels/biseminar
 def extract_text(self):
     lectures = Lecture.select().where(Lecture.content == '', Lecture.url % "*pptx")
     for lecture in list(lectures):
         if not os.path.exists(self.prefix+lecture.path):
             print "File not found: {0}".format(lecture.path)
             continue
         print lecture.url
         lecture.content = self.__convert(self.prefix+lecture.path)
         try:
             with db.transaction():
                 lecture.save()
         except peewee.OperationalError as e:
             print e
예제 #7
0
def __remove_duplicates():
    """
    Removes all duplicate material within the scope of a single course.
    Currently duplicate detection works based on material name, i.e if
    the names without extensions match, we remove one of the duplicates
    (preferably the .pdf one, since extracting text from pdf is more
    prone to errors).
    """
    print "removing duplicates: {}".format(len(Course.select()))
    lectures_to_delete = []
    for course in Course.select():
        print "Course: {}".format(course.code)
        lectures = {}
        print "Lectures: {}".format(
            len(Lecture.select().where(Lecture.course == course)))
        for lecture in Lecture.select().where(Lecture.course == course):
            extension = __resolve_extension(lecture.name)
            if not extension:
                continue

            pure_name = lecture.name[:-len(
                extension)]  # Get lecture name without extension
            if pure_name in lectures:
                existing = lectures[pure_name]

                if existing.name.endswith(
                        '.pdf'):  # Prefer anything to .pdf extension
                    lectures_to_delete.append(existing)
                    lectures[pure_name] = lecture
                else:
                    lectures_to_delete.append(lecture)
            else:
                lectures[pure_name] = lecture  # Initial insert
    try:
        with db.transaction():
            for lecture in lectures_to_delete:
                lecture.delete_instance()
    except peewee.OperationalError as e:
        print e
예제 #8
0
    def __persist(self, results):
        rows = []
        for k, v in results.items():
            course = Course.select().where(Course.code == k.code,
                                           Course.year == k.year,
                                           Course.semester == k.semester)
            if not course.exists():
                print "Non-existing course in SIS data: {}".format(k)
                continue

            rows.append({
                'course': course,
                'url': '',
                'path': self.filename,
                'name': 'SISdata',
                'content': v.decode('latin-1').encode('utf-8'),
                'time': datetime.datetime.now(),
                'size': 0
            })

        with db.atomic():
            Lecture.insert_many(rows).execute()
예제 #9
0
    def extract_all_lectures_tokens(self):
        # Tokenize and clean each lecture separately
        result_data = [
            x for x in (
                self.pool.map(self.__extract_lecture_tokens, Lecture.select()))
            if x
        ]

        #Create acronym dictionary and replace acronyms with definitions
        self.__create_acronym_dict(result_data)
        result_data = self.pool.map(self.__replace_acronyms, result_data)

        # Perform co-occurrence over entire word corpus, filter by course code limit
        docs = [(y[0].course.code, y[2]) for y in result_data]
        self.co_occurring_words = self.co_occ.find_co_occurring_words(
            docs, self.acronyms)
        print "Co-occurring words:", self.co_occurring_words, "; total count:", len(
            self.co_occurring_words)
        # Re-count co-occurring words and remove 'standalone' words
        return self.pool.map(self.__adjust_lecture_counts, result_data)
예제 #10
0
    def lda_over_lectures(self):
        """
        Run one LDA modelling per course over that course's lectures.

        For every course a work item (course, its lectures, LectureWord, a
        fresh DictVectorizer plus an LDA model with as many topics as the
        course has lectures) is handed to __lda_for_course_material through
        the worker pool. The first element of each result is inserted into
        LectureTopicWord and the second into LectureTopic, in one atomic
        block.
        """
        jobs = []
        for course in Course.select():
            course_lectures = list(
                Lecture.select().where(Lecture.course == course))
            vectorizer = DictVectorizer()
            model = lda.LDA(n_topics=len(course_lectures),
                            n_iter=1000,
                            random_state=1)
            jobs.append(
                (course, course_lectures, LectureWord, [vectorizer, model]))

        res = self.pool.map(self.__lda_for_course_material, jobs)

        # Flatten the per-course row lists into one list per target table
        topic_word_rows = []
        lecture_topic_rows = []
        for outcome in res:
            topic_word_rows.extend(outcome[0])
            lecture_topic_rows.extend(outcome[1])

        with db.atomic():
            LectureTopicWord.insert_many(topic_word_rows).execute()
            LectureTopic.insert_many(lecture_topic_rows).execute()
예제 #11
0
    def process_item(self, item, spider):
        if isinstance(item, DataItem):
            url = ''.join(item['link'])
            dir_name = 'raw_data' + ''.join(item['path']) + '/'
            course_code = ''.join(item['course_code'])
            content = ''.join(item['content'])
            path = ''
            year = ''.join(item['year'])
            semester = ''.join(item['semester'])
            prefix = os.path.dirname(os.path.dirname(
                os.path.abspath(__file__))) + '/'

            course = Course.select().where(Course.code == course_code,
                                           Course.year == year,
                                           Course.semester == semester)
            if not course.exists():
                print "Non-existing course: {}".format(course_code)
                return

            if len(content) == 0 and not os.path.exists(dir_name):
                try:
                    os.makedirs(dir_name)
                except OSError as e:
                    print "Could not create directory: {} due to {}".format(
                        dir_name, e)

            lecture = Lecture.select().where(Lecture.course == course,
                                             Lecture.url == url)
            file_size = 0
            # if no lecture record and no content, then download data (pdf, pptx, etc.) according to url
            if len(content) == 0:
                try:
                    info = urllib.urlopen(url).info()
                    if 'Content-Length' in info:
                        file_size = float(info['Content-Length'])
                except Exception as e:
                    print "Failed to retrieve file size for {} due to {}".format(
                        url, e)
                if not lecture.exists():
                    path = self.__download(url, dir_name)
                else:
                    lecture_instance = lecture.first()

                    # Re-download only if the file has been updated
                    if lecture_instance.size == 0 or lecture_instance.size != file_size:
                        os.remove(prefix + lecture_instance.path)
                        self.__download(url, dir_name)
                    else:
                        content = lecture_instance.content  # No need to re-extract content later

            if not lecture.exists():
                print "Lecture record not found, creating ..."
                title = self.__get_title(url)
                with db.atomic():
                    try:
                        Lecture.create(course=course,
                                       url=url,
                                       path=path,
                                       name=title,
                                       content=content,
                                       size=file_size,
                                       time=datetime.datetime.now())
                    except peewee.OperationalError as e:
                        print "Could not create a record for course {} lecture {} due to {}".format(
                            course_code, url, e)
            else:
                with db.atomic():
                    try:
                        lecture_instance = lecture.first()
                        lecture_instance.content = content
                        lecture_instance.time = datetime.datetime.now()
                        lecture_instance.save()
                    except peewee.OperationalError as e:
                        print e
        return item
예제 #12
0
    def lda_over_all_material(self):
        """
        Perform LDA over all material without any course limitations. The topic count is 1/10 of the material count.
        """

        lectures = Lecture.select()
        lectures_dict = []
        for lecture in lectures:
            lecture_words = LectureWord.select().where(
                LectureWord.lecture == lecture)
            lectures_dict.append(
                dict([(x.word, x.count) for x in lecture_words]))

        topic_count = int(len(lectures_dict) / 10)

        print "Performing LDA over all material.."
        model, vocab = self.__perform_lda_default(lectures_dict, topic_count)

        topic_word_rows = []
        # Iterate over topic word distributions
        for i, topic_dist in enumerate(model.topic_word_):
            top_topic_words = np.array(vocab)[self.__max_values(
                topic_dist, self.n_top_words)]
            top_word_probs = topic_dist[np.argsort(
                topic_dist)][:-self.n_top_words - 1:-1]

            for top_word, top_weight in zip(top_topic_words, top_word_probs):
                row_dict = {
                    'topic': i,
                    'word': top_word,
                    'weight': round(top_weight * 100, 2)
                }
                topic_word_rows.append(row_dict)

            if self.debug:
                top_word_str = ", ".join([
                    x.encode('utf-8') + "(" + str(round(y * 100, 2)) + "%)"
                    for x, y in zip(top_topic_words, top_word_probs)
                ])
                print('Topic {}: {}'.format(i, top_word_str))

        # Document-topic distributions
        doc_topic = model.doc_topic_
        lecture_topic_rows = []
        for i in range(lectures.count()):
            top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
            topic_probs = doc_topic[i][top_topics]

            for top_topic, top_weight in zip(top_topics, topic_probs):
                rounded_weight = round(top_weight * 100, 2)
                if rounded_weight < 10:
                    continue
                row_dict = {
                    'lecture': lectures[i],
                    'topic': top_topic,
                    'weight': rounded_weight
                }
                lecture_topic_rows.append(row_dict)

            if self.debug:
                doc_topic_str = ", ".join([
                    str(x) + "(" + str(round(y * 100, 2)) + "%)"
                    for x, y in zip(top_topics, topic_probs)
                ])
                print("{} (top {} topics: {})".format(
                    lectures[i].name.encode('utf-8'), self.n_top_topic,
                    doc_topic_str))

        with db.atomic():
            self.__insert_rows(MaterialTopicWord, topic_word_rows)
            self.__insert_rows(MaterialTopic, lecture_topic_rows)
예제 #13
0
def lda_for_course_material(course, n_top_words, n_top_topic):
    lectures = Lecture.select().where(Lecture.course == course)
    lectures_size = Lecture.select().where(Lecture.course == course).count()
    lecture_dict = []
    for lecture in lectures:
        lecture_words = LectureWord.select().where(LectureWord.lecture == lecture)
        lecture_dict.append(dict([(x.word, x.count) for x in lecture_words]))

    if not lecture_dict:
        return

    model, vocab = perform_lda(lecture_dict, lectures_size)

    for i, topic_dist in enumerate(model.topic_word_):
        top_topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        top_word_probs = topic_dist[np.argsort(topic_dist)][:-n_top_words - 1:-1]

        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            try:
                with db.transaction() as txn:
                    LectureTopicWord.create(
                        course=course,
                        topic=i,
                        word=top_word,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for topic {}, word {}, {}".format(i, top_word, e)

        top_word_str = ", ".join([remove_accents(x) + "(" + str(round(y, 2) * 100) + "%)"
                                  for x, y in zip(top_topic_words, top_word_probs)])

        print('Topic {}: {}'.format(i, top_word_str))

    # Document-topic distributions
    doc_topic = model.doc_topic_

    for i in range(lectures_size):
        top_topics = np.argsort(doc_topic[i])[:-n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]


        #Substitude this
        title = remove_accents(lectures[i].path.split("/")[-1])

        for top_topic, top_weight in zip(top_topics, topic_probs):
            try:
                with db.transaction() as txn:
                    LectureTopic.create(
                        lecture=lectures[i],
                        topic=top_topic,
                        weight=round(top_weight * 100, 2)
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, topic {1}, {2}" \
                    .format(remove_accents(lectures[i].name), i, e)

        doc_topic_str = ", ".join(
            [str(x) + "(" + str(round(y * 100, 2)) + "%)" for x, y in zip(top_topics, topic_probs)])
        print("{} (top {} topics: {})".format(title, n_top_topic, doc_topic_str))
예제 #14
0
 def getLectureRecord(self, lectureId):
     """
     Look up a single Lecture by primary key.

     :param lectureId: id of the lecture to fetch
     :return: the matching Lecture, or None when the lookup raises for any
              reason (including no matching row)
     """
     try:
         return Lecture.select().where(Lecture.id == lectureId).get()
     except Exception:
         return None
예제 #15
0
 def getLectures(self, course):
     """Return every Lecture belonging to the given course as a list."""
     return list(Lecture.select().where(Lecture.course == course))
예제 #16
0
from utils.SemesterUtils import parse_semesters
import os
import sys


def is_valid_semester(course_entry, allowed):
    """
    Tell whether a course falls into one of the allowed semesters.

    :param course_entry: object exposing `year` and `semester` attributes
    :param allowed: iterable of indexable pairs, element 0 being the year
        and element 1 the semester
    :return: True if any pair matches both the year and the semester of
        course_entry, False otherwise (including for an empty `allowed`)
    """
    # Generator instead of a list: stops at the first match and builds no
    # throwaway list
    return any(
        x[0] == course_entry.year and x[1] == course_entry.semester
        for x in allowed
    )


if __name__ == '__main__':
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'

    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        path = prefix + lec.path
        if lec.path and os.path.exists(path):
            os.remove(path)

    semesters = []
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])
        courses = Course.select()
        with db.atomic():
            for lec in lectures:
                lec.delete_instance()
            for course in courses:
                if is_valid_semester(course, semesters):
                    continue