Exemplo n.º 1
0
def _soffice_process(tempfile_path, filename, md5_hash, process_type):
    '''Convert a temp file with LibreOffice, upload the result to S3, store a ref.

    Args:
        tempfile_path: Path of the source file handed to soffice.
        filename: Name of the source file. The converted file keeps its stem
            and takes ``process_type`` as its extension.
        md5_hash: Hash stored on the FileUpload record; also used as the name
            of the per-run LibreOffice profile directory under /tmp.
        process_type: Target format passed to ``--convert-to`` (e.g. 'pdf').

    Returns:
        The name of the converted file as saved to the bucket.

    Raises:
        HTTPExceptions.UNPROCESSABLE_ENTITY: if soffice cannot be started,
            exits with a non-zero status, or produces no output file.
    '''
    # Local import so this block does not rely on a module-level import.
    import subprocess

    #libre office requires individual environs to run multiple instances,
    #so each conversion gets its own profile directory named after the hash.
    loffice_environ_path = os.path.join('/tmp', md5_hash)
    os.makedirs(loffice_environ_path, exist_ok=True)

    name_parts = filename.split('.')
    child_name = '.'.join(name_parts[:-1]) + '.' + process_type
    extension = name_parts[-1]

    outpath = os.path.join('/tmp', child_name)

    try:
        # Argument list (no shell): file names are never parsed by a shell,
        # so spaces or metacharacters in them cannot alter the command.
        try:
            completed = subprocess.run([
                '/usr/bin/soffice',
                '-env:UserInstallation=file://%s' % loffice_environ_path,
                '--headless',
                '--convert-to', process_type,
                tempfile_path,
                '--outdir', '/tmp',
            ])
        except OSError:
            raise HTTPExceptions.UNPROCESSABLE_ENTITY

        # A failed conversion does not raise by itself; detect it explicitly.
        if completed.returncode != 0 or not os.path.exists(outpath):
            raise HTTPExceptions.UNPROCESSABLE_ENTITY

        s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)

        with open(outpath, 'rb') as saved_file:
            s3.save_to_bucket(child_name, saved_file)

        #save ref to db
        ref = FileUpload(filename=child_name,
                         md5_hash=md5_hash,
                         extension=extension,
                         is_original=False)
        ref.save()

        cleanup_temp_file(child_name)
        cleanup_temp_file(filename)
    finally:
        #remove the profile directory on success and failure alike;
        #best effort, a leftover directory is not worth failing the request.
        shutil.rmtree(loffice_environ_path, ignore_errors=True)

    return child_name
Exemplo n.º 2
0
def ocr_pdf(filename, parent_id, md5_hash, force_flag):
    '''Run ocrmypdf on a file from the OCR bucket and store the result.

    A lock file named after ``md5_hash`` in /tmp/ocr_clients marks the job as
    running; the number of lock files is used to cap concurrent OCR jobs.

    Args:
        filename: Name of the source PDF in the OCR bucket.
        parent_id: Stored as ``parent_id`` on the new OCRUpload record.
        md5_hash: Used as the lock file name for this job.
        force_flag: Truthy to pass ``--force-ocr`` to ocrmypdf.

    Raises:
        MaxProcessesExceededError: if settings.MAX_SIM_OCR_PROCESSES lock
            files already exist.
        FileInProcessError: if a lock file for ``md5_hash`` already exists.
        Exception: any error raised while downloading, running OCR, uploading
            or saving is re-raised after best-effort cleanup.
    '''
    lock_dir = '/tmp/ocr_clients'
    os.makedirs(lock_dir, exist_ok=True)

    lockfile = os.path.join(lock_dir, md5_hash)

    # Derive the output name up front so it is always defined during cleanup.
    force_flag = bool(force_flag)
    basename = '.'.join(filename.split('.')[:-1])
    if force_flag:
        processed_filename = basename + '_ocr_force.pdf'
    else:
        processed_filename = basename + '_ocr.pdf'

    #prevent too many heavy ocr processes from running at once
    current_process_count = len(os.listdir(lock_dir))
    if current_process_count >= int(settings.MAX_SIM_OCR_PROCESSES):
        raise MaxProcessesExceededError()

    #add to current process count with file
    # This happens outside the guarded section below on purpose: if the lock
    # is not ours we must not delete it or the other job's temp files.
    try:
        with open(lockfile, 'x'):
            pass
    except FileExistsError:
        raise FileInProcessError()

    try:
        input_path = os.path.join('/tmp', filename)
        output_path = os.path.join('/tmp', processed_filename)

        #download file and save
        s3 = S3(settings.AWS_OCR_BUCKET)

        file_obj = s3.download_fileobj_from_bucket(filename)
        with open(input_path, 'wb') as tmpfile:
            tmpfile.write(file_obj.getbuffer())

        # Argument list (no shell) so the file name cannot inject commands.
        cmd = ['/usr/bin/ocrmypdf']
        if force_flag:
            cmd.append('--force-ocr')
        cmd.extend([input_path, output_path])

        subprocess.check_output(cmd)

        #save to s3
        with open(output_path, 'rb') as file_:
            s3.save_to_bucket(processed_filename, file_)

            # rewind: the upload may have consumed the stream
            file_.seek(0)

            hash_ = md5(file_.read()).hexdigest()

        #record to db
        ref = OCRUpload(filename=processed_filename, md5_hash=hash_,
                        is_original=False, is_forced=force_flag,
                        parent_id=parent_id)

        ref.save()

        #remove from process count
        os.remove(lockfile)

        cleanup_temp_file(filename)
        cleanup_temp_file(processed_filename)

    except Exception:
        # Best-effort cleanup; each step is independent so one failure does
        # not skip the others, and none of them masks the original error.
        try:
            os.remove(lockfile)
        except OSError:
            pass

        for temp_name in (filename, processed_filename):
            try:
                cleanup_temp_file(temp_name)
            except Exception:
                pass

        raise
Exemplo n.º 3
0
def upload(request):
    '''Accept a PDF upload for OCR and report where it was stored.

    Only POST is accepted. The uploaded file is read from the 'pdf-file'
    form field, given a sanitized, randomized name, saved to a temp file and,
    unless a file with the same md5 hash is already known, uploaded to the
    OCR bucket and recorded as an original OCRUpload.

    Returns:
        JsonResponse with a 'file_info' dict (original and stored names, size,
        hash, temp path, whether the file already existed, and a
        'processing_error' message when the PDF already contains text), or
        HttpResponseNotAllowed for non-POST requests.

    Raises:
        HTTPExceptions.NOT_ACCEPTABLE: if no 'pdf-file' was submitted.
        SuspiciousFileOperation: if the file name is malformed or the
            extension is not 'pdf'/'PDF'.
    '''
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])

    file_ = request.FILES.get('pdf-file')

    processing_error = None

    if file_ is None:
        raise HTTPExceptions.NOT_ACCEPTABLE  #Error code 406

    filename = file_.name

    if not filename or len(filename) < 3 or '.' not in filename:
        raise SuspiciousFileOperation('improper file name')

    filename = sanitize(filename)

    # drop quotes, then collapse parentheses, commas and whitespace to '-'
    filename = filename.replace("'", '').replace('"', '')
    filename = re.sub(r"[\(,\),\s]+", "-", filename)

    temp = filename.split('.')
    basename = '.'.join(temp[:-1])
    extension = temp[-1]

    if extension not in ('pdf', 'PDF'):
        raise SuspiciousFileOperation('improper file type')

    basename = basename[:60]

    new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

    #save to /tmp
    md5_hash, tempfile_path = save_temp_file(new_filename, file_)

    #file already exists in system?
    existing_name = check_ocr_file_exists(md5_hash)

    #already_has_text?
    if check_pdf_has_text(new_filename):
        processing_error = 'This PDF already has text. Use the "Force OCR" button to overwrite text with a fresh OCR if desired. If file was OCRd on previous upload those results will be provided'

    if not existing_name:
        already_exists = False

        #upload original to S3
        s3 = S3(settings.AWS_OCR_BUCKET)

        with open(tempfile_path, 'rb') as saved_file:
            s3.save_to_bucket(new_filename, saved_file)

        ref = OCRUpload(filename=new_filename,
                        md5_hash=md5_hash,
                        is_original=True)

        ref.save()

        cleanup_temp_file(new_filename)

    else:
        already_exists = True

        # Remove the temp file we just wrote *before* switching to the
        # existing name; otherwise the fresh temp file is never cleaned up.
        cleanup_temp_file(new_filename)

        new_filename = existing_name

    data = {
        'file_info': {
            'filename': filename,
            'size': file_.size,
            'new_filename': new_filename,
            'processing_error': processing_error,
            'tempfile_path': tempfile_path,
            'already_exists': already_exists,
            'md5_hash': md5_hash
        }
    }

    return JsonResponse(data)
Exemplo n.º 4
0
def upload(request):
    '''Accept a file upload, convert it if needed, and store it in the bucket.

    Only POST is accepted. The file is read from the 'file' form field, given
    a sanitized, randomized name and saved to a temp file. If a file with the
    same md5 hash is already known its name is returned. Office documents are
    converted via ``_soffice_process`` (text/presentation formats to pdf,
    spreadsheets to csv); PDFs are checked with ``check_pdf_has_text``; the
    file is then uploaded to the annotations bucket and recorded as an
    original FileUpload.

    Returns:
        HttpResponse whose body is the stored (or already existing, or
        converted) file name, or HttpResponseNotAllowed for non-POST requests.

    Raises:
        SuspiciousFileOperation: if the file name is malformed.
        HTTPExceptions.UNPROCESSABLE_ENTITY: if conversion yields no file name.
        HTTPExceptions.NOT_ACCEPTABLE: if ``check_pdf_has_text`` is falsy for
            an uploaded PDF.
    '''
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])

    file_ = request.FILES['file']

    filename = file_.name

    if not filename or len(filename) < 3 or '.' not in filename:
        raise SuspiciousFileOperation('improper file name')

    filename = sanitize(filename)

    # drop quotes, then collapse parentheses, commas and whitespace to '-'
    filename = filename.replace("'", '').replace('"', '')
    filename = re.sub(r"[\(,\),\s]+", "-", filename)

    temp = filename.split('.')
    basename = '.'.join(temp[:-1])
    extension = temp[-1]

    basename = basename[:60]

    new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

    #save file to disk temporarily.
    #later it will be deleted after uploading to s3.
    md5_hash, tempfile_path = save_temp_file(new_filename, file_)

    # the stored name keeps the original case; comparisons use lower case
    extension = extension.lower()

    #if file (or processed child) exists, return the name
    existing_name = check_file_exists(md5_hash)

    if existing_name:
        cleanup_temp_file(new_filename)

        return HttpResponse(existing_name)

    #transform process if needed
    process_to_file_type = None

    if extension in ['doc', 'docx', 'odt', 'ott', 'rtf', 'odp', 'ppt', 'pptx']:
        process_to_file_type = 'pdf'

    if extension in ['xls', 'xlsx', 'ods']:
        process_to_file_type = 'csv'

    if process_to_file_type:
        child_name = _soffice_process(
            tempfile_path, new_filename, md5_hash, process_to_file_type)

        if child_name:
            cleanup_temp_file(child_name)

            return HttpResponse(child_name)

        # No converted file name came back: clean up the uploaded temp file
        # (there is no child file to remove) and report the failure.
        cleanup_temp_file(new_filename)
        raise HTTPExceptions.UNPROCESSABLE_ENTITY

    if extension == 'pdf':
        #check if is an image pdf or if it has text
        if not check_pdf_has_text(new_filename):
            cleanup_temp_file(new_filename)
            raise HTTPExceptions.NOT_ACCEPTABLE #Error code 406

    #upload to cloud
    s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)

    with open(tempfile_path, 'rb') as saved_file:
        s3.save_to_bucket(new_filename, saved_file)

    #save ref to db
    ref = FileUpload(filename=new_filename, md5_hash=md5_hash,
                     extension=extension, is_original=True)

    ref.save()

    cleanup_temp_file(new_filename)

    return HttpResponse(new_filename)