Exemplo n.º 1
0
def process_raw_document(raw_document, user):
    """Process a RawDocument instance into a Note instance.

    Delegates the work to ``convert_raw_document``. Any ordinary error raised
    by the conversion is logged with its traceback and then suppressed, so
    the caller never sees it.

    :param raw_document: the RawDocument to convert.
    :param user: forwarded to ``convert_raw_document`` as the ``user`` keyword.
    """
    try:
        convert_raw_document(raw_document, user=user)
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        logger.error(traceback.format_exc())
Exemplo n.º 2
0
    def handle(self, *args, **kwargs):
        """Import courses and notes from a directory of JSON files.

        Expects exactly one positional argument: the directory to parse. Every
        ``*.json`` file in that directory is assumed to hold the courses of a
        single department (keys ``subject``, ``departmentLink``, ``courses``).

        For each course that has ``noteLinks``, the department, professor and
        course rows are fetched or created. Each note link is then uploaded to
        Filepicker (unless a RawDocument with the same upstream link already
        exists) and the RawDocument is handed to ``convert_raw_document``.
        Notes that already have text and HTML content are skipped; incomplete
        ones are deleted and converted again.

        :raises TypeError: if the number of arguments is not one, or if the
            argument is not an existing directory.
        """
        if len(args) != 1:
            raise TypeError(
                "Expected exactly one argument, got {0}: please specify a "
                "directory to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])

        if not os.path.isdir(path):
            raise TypeError("First argument should be a directory to parse.")

        # for now, assume the school is MIT and find by its US DepEd ID.
        # TODO for later, do something more clever
        dbschool = School.objects.filter(usde_id=121415)[0]

        # for now, assume license is the default OCW license: CC-BY-NC
        # TODO for later, do something more clever.
        dblicense = License.objects.get_or_create(
            name='cc-by-nc-3.0',
            html=
            '<a rel="license" href="http://creativecommons.org/licenses/by-nc/3.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/3.0/88x31.png" /></a>'
        )[0]

        # find all *.json files in the given directory, as absolute paths
        json_files = [
            os.path.join(path, filename)
            for filename in os.listdir(path)
            if filename.lower().endswith('.json')
        ]

        # parse each json file and process it for courses and notes.
        for filename in json_files:
            # each file is assumed to contain courses for a single department
            with open(filename, 'r') as jsondata:
                # parse JSON into python
                parsed = json.load(jsondata)
            # The file is closed here: the slow network and database work
            # below does not need to keep the handle open.

            # find the department or create one.
            dept_info = {
                'name': parsed['subject'],
                'school': dbschool,
                'url': parsed['departmentLink'],
            }
            # Defer department creation only until there is a valid course.
            # keeping it at this scope is a bit like caching.
            dbdept = None

            # process courses
            for course in parsed['courses']:
                if not course.get('noteLinks'):
                    print("No Notes in course.")
                    continue

                # only create department if necessary at this time
                if dbdept is None:
                    dbdept = Department.objects.get_or_create(**dept_info)[0]

                # Assume first hit is always right. Solving the identity
                # problem by name alone will always be a fool's errand.
                dbprof = Professor.objects.get_or_create(
                    name=course['professor'])[0]

                # Extract the course info
                course_info = {
                    'name': course['courseTitle'],
                    'department': dbdept,
                }
                # Create or Find the Course object.
                dbcourse = Course.objects.get_or_create(**course_info)[0]
                dbcourse.professor.add(dbprof)
                dbcourse.instructor_name = course['professor']
                dbcourse.school = dbschool
                dbcourse.save()
                print("Course is in the database: {0}".format(dbcourse.name))

                # process notes for each course
                for note in course['noteLinks']:
                    # Check to see if the Note is already uploaded.
                    url = note['link']
                    existing_notes = Note.objects.filter(upstream_link=url)
                    # More than one Note for a link is ambiguous: skip it.
                    if len(existing_notes) > 1:
                        print("WARNING Skipping Note: Too many notes for {0}".format(
                            url))
                        continue
                    if len(existing_notes) == 1:
                        existing = existing_notes[0]
                        # should only be 1 entry, but get() errors loudly
                        # when none are found. filter is easier to work with
                        dbcontent = NoteContent.objects.filter(
                            note_id=existing.id)
                        # Truthiness checks cover both None and empty values.
                        if existing.text and len(dbcontent) and \
                           dbcontent[0].html:
                            print("Already there, moving on: {0}".format(url))
                            continue
                        # Partially completed note. Remove it and try again.
                        existing.tags.set()  # clear tags
                        for content in dbcontent:
                            content.delete()  # delete any note content
                        existing.delete()  # delete note
                        print("Found and removed incomplete note {0}.".format(
                            url))

                    # Upload URL of note to Filepicker if it is not already
                    # in RawDocument.
                    rd_test = RawDocument.objects.filter(upstream_link=url)
                    if not len(rd_test):
                        # https://developers.filepicker.io/docs/web/rest/#blob-store
                        print("Uploading link {0} to FP.".format(url))
                        ulresp = requests.post(
                            'https://www.filepicker.io/api/store/S3',
                            params={
                                'key': FILEPICKER_API_KEY,
                                'policy': Document.fp_policy,
                                'signature': Document.fp_signature,
                            },
                            data={
                                'url': url,
                            })
                        try:
                            ulresp.raise_for_status()
                        except Exception as e:
                            print("Failed to upload note: " + str(e))
                            print("Skipping.")
                            continue
                        # Filepicker returns JSON, so use that
                        uljson = ulresp.json()

                        print("Saving raw document to database.")
                        # Extract the note info
                        dbnote = RawDocument()
                        dbnote.course = dbcourse
                        dbnote.name = note['fileName']
                        dbnote.license = dblicense
                        dbnote.upstream_link = url
                        dbnote.fp_file = uljson['url']
                        dbnote.mimetype = uljson['type']
                        dbnote.is_processed = True  # hack to bypass celery
                        # Create the RawDocument object.
                        dbnote.save()
                    else:
                        # Find the right RawDocument
                        print("Already uploaded link {0} to FP.".format(url))
                        dbnote = rd_test[0]

                    # Do tags separately
                    dbnote.tags.add('mit-ocw', 'karma')

                    print("Converting document and saving note text.")
                    # NOTE(review): retries on '403' errors are unbounded; a
                    # persistent 403 keeps this loop running forever.
                    while True:
                        try:
                            convert_raw_document(dbnote)
                        except ValueError as e:
                            # only catch one specific error
                            if not str(e).startswith('PDF file could not be'):
                                # bare raise keeps the original traceback
                                raise
                            # write the link to file.
                            with open('pdferrors.log', 'a') as pdferrs:
                                pdferrs.write(url + '\n')
                            # delete the partial Note created in
                            # convert_raw_doc; the slice avoids an IndexError
                            # when no partial Note was saved.
                            partial_notes = Note.objects.filter(
                                upstream_link=url)[:1]
                            for partial in partial_notes:
                                partial.tags.set()
                                partial.delete()
                            print("This note errored, so it is removed :(")
                            break
                        except Exception as e:
                            print("Failed: " + str(e))
                            if '403' in str(e):
                                print("Trying again.")
                                continue
                            print("Aborting.")
                            break
                        else:
                            print("This note is done.")
                            break

                print("Notes for {0} are done.".format(dbcourse.name))
Exemplo n.º 3
0
    def handle(self, *args, **kwargs):
        """Import courses and notes from a directory of JSON files.

        Expects exactly one positional argument: the directory to parse. Every
        ``*.json`` file in that directory is read as one department (keys
        ``subject``, ``departmentLink``, ``courses``).

        For each course the department, professor, affiliation and course
        rows are fetched or created. Each entry in the course's ``noteLinks``
        is uploaded to Filepicker (unless a RawDocument with the same upstream
        link already exists) and the RawDocument is handed to
        ``convert_raw_document``. Notes that already have text or HTML are
        skipped; empty ones are deleted and converted again.

        :raises ArgumentError: if the number of arguments is not one, or if
            the argument is not an existing directory.
        """
        if len(args) != 1:
            raise ArgumentError(
                "Expected exactly one argument, got {0}: please specify a "
                "directory to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])

        if not os.path.isdir(path):
            raise ArgumentError("First argument should be a directory to parse.")

        # for now, assume the school is MIT and find by its US DepEd ID.
        # TODO for later, do something more clever
        dbschool = School.objects.filter(usde_id=121415)[0]

        # for now, assume license is the default OCW license: CC-BY-NC 3
        # TODO for later, do something more clever.
        # NOTE(review): the name says 3.0 but the HTML links to by-nc/4.0;
        # confirm which version is intended before changing either value.
        dblicense = License.objects.get_or_create(
          name='cc-by-nc-3.0',
          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
        )[0]

        # build Filepicker upload URL
        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)

        # find all *.json files in the given directory, as absolute paths
        json_files = [
            os.path.join(path, filename)
            for filename in os.listdir(path)
            if filename.lower().endswith('.json')
        ]

        # parse each json file and process it for courses and notes.
        for filename in json_files:
            with open(filename, 'r') as jsondata:
                # parse JSON into python
                parsed = json.load(jsondata)
            # The file is closed here: the slow network and database work
            # below does not need to keep the handle open.

            # find the department or create one.
            dept_info = {
                'name': parsed['subject'],
                'school': dbschool,
                'url': parsed['departmentLink'],
            }
            dbdept = Department.objects.get_or_create(**dept_info)[0]

            # process courses
            for course in parsed['courses']:
                # Assume first hit is always right. Solving the identity
                # problem by name alone will always be a fool's errand.
                dbprof = Professor.objects.get_or_create(name=course['professor'])[0]

                # Associate the professor with the department.
                # (no need to track the result)
                ProfessorAffiliation.objects.get_or_create(
                    professor=dbprof,
                    department=dbdept)

                # Extract the course info
                course_info = {
                  'name': course['courseTitle'],
                  'department': dbdept,
                }
                # Create or Find the Course object.
                dbcourse = Course.objects.get_or_create(**course_info)[0]
                dbcourse.professor = dbprof
                dbcourse.instructor_name = course['professor']
                dbcourse.school = dbschool
                dbcourse.save()
                print("Course is in the database: {0}".format(dbcourse.name))

                ProfessorTaught.objects.get_or_create(
                    professor=dbprof,
                    course=dbcourse)

                if not course.get('noteLinks'):
                    print("No Notes in course.")
                    continue

                # process notes for each course
                for note in course['noteLinks']:
                    # Check to see if the Note is already uploaded.
                    url = note['link']
                    existing_notes = Note.objects.filter(upstream_link=url)
                    # More than one Note for a link is ambiguous: skip it.
                    if len(existing_notes) > 1:
                        print("WARNING Skipping Note: Too many notes for {0}".format(url))
                        continue
                    if len(existing_notes) == 1:
                        existing = existing_notes[0]
                        # NOTE(review): a note counts as complete when it has
                        # text OR html; confirm that "or" (not "and") is the
                        # intended test for a finished note.
                        if existing.text or existing.html:
                            print("Already there, moving on: {0}".format(url))
                            continue
                        # Partially completed note. Remove it and try again.
                        existing.tags.set()  # clear tags
                        existing.delete()  # delete note
                        print("Found and removed incomplete note {0}.".format(url))

                    # Upload URL of note to Filepicker if it is not already
                    # in RawDocument.
                    rd_test = RawDocument.objects.filter(upstream_link=url)
                    if not len(rd_test):
                        # https://developers.inkfilepicker.com/docs/web/#inkblob-store
                        print("Uploading link {0} to FP.".format(url))
                        ulresp = requests.post(fpurl, data={
                          'url': url,
                        })
                        try:
                            ulresp.raise_for_status()
                        except Exception as e:
                            print("Failed to upload note: " + str(e))
                            print("Skipping.")
                            continue
                        # Filepicker returns JSON, so use that
                        uljson = ulresp.json()

                        print("Saving raw document to database.")
                        # Extract the note info
                        dbnote = RawDocument()
                        dbnote.course = dbcourse
                        dbnote.name = note['fileName']
                        dbnote.license = dblicense
                        dbnote.upstream_link = url
                        dbnote.fp_file = uljson['url']
                        dbnote.mimetype = uljson['type']
                        dbnote.is_processed = True  # hack to bypass celery
                        # Create the RawDocument object.
                        dbnote.save()
                    else:
                        # Find the right RawDocument
                        print("Already uploaded link {0} to FP.".format(url))
                        dbnote = rd_test[0]

                    # Do tags separately
                    dbnote.tags.add('mit-ocw', 'karma')

                    print("Converting document and saving note to S3.")
                    # NOTE(review): retries on '403' errors are unbounded; a
                    # persistent 403 keeps this loop running forever.
                    while True:
                        try:
                            convert_raw_document(dbnote)
                        except ValueError as e:
                            # only catch one specific error
                            if not str(e).startswith('PDF file could not be'):
                                # bare raise keeps the original traceback
                                raise
                            # write the link to file.
                            with open('pdferrors.log', 'a') as pdferrs:
                                pdferrs.write(url + '\n')
                            # delete the partial Note created in
                            # convert_raw_doc; the slice avoids an IndexError
                            # when no partial Note was saved.
                            partial_notes = Note.objects.filter(
                                upstream_link=url)[:1]
                            for partial in partial_notes:
                                partial.tags.set()
                                partial.delete()
                            print("This note errored, so it is removed :(")
                            break
                        except Exception as e:
                            print("Failed: " + str(e))
                            if '403' in str(e):
                                print("Trying again.")
                                continue
                            print("Aborting.")
                            break
                        else:
                            print("This note is done.")
                            break

                print("Notes for {0} are done.".format(dbcourse.name))