예제 #1
0
파일: cli.py 프로젝트: akash0675/yapot
def run():
    """Command-line entry point: convert the PDF named on the command line.

    Expects exactly one command-line argument, the PDF filename; with any
    other argument count a usage message is printed and nothing else
    happens.

    The PDF is handed to ``convert_document`` with a freshly generated
    UUID string as ``temp_dir`` and ``<temp_dir>/thumbs`` as ``thumb_dir``.
    When the conversion reports success the text is written to
    ``<pdf_filename>.txt``; otherwise an error is printed and no output
    file is created.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Welcome to yapot!")
    if len(sys.argv) != 2:
        print("Usage:\n\n\tpython cli-tool.py <pdf_filename>\n\n")
        return

    pdf_filename = sys.argv[1]

    # A unique directory name per run for the conversion's temp_dir.
    temp_dir = str(uuid.uuid4())

    success, pdf_text = convert_document(
        pdf_filename=pdf_filename,
        resolution=200,
        delete_files=False,
        page_delineation='\n--------\n',
        verbose=True,
        temp_dir=temp_dir,
        make_thumbs=True,
        thumb_size=512,
        thumb_dir='{0}/thumbs'.format(temp_dir),
    )

    if not success:
        # Do not create an output file or claim "Done." for a conversion
        # that reported failure.
        print("Error: unable to convert '%s'." % pdf_filename)
        return

    with open('%s.txt' % pdf_filename, 'w') as f:
        f.write(pdf_text)

    print("Done.")
예제 #2
0
def convert_document(filename):
    """Convert a PDF document to a single line of text using yapot.

    Calls ``yapot.convert_document`` on ``filename`` with
    ``delete_files=False``, ``temp_dir='./.tmp'`` and a newline as the
    page delineation; all other yapot options are left at their defaults.
    When ``_DEBUG`` is set the raw text is printed.

    Returns the text with surrounding whitespace stripped and every run
    of spaces, tabs, carriage returns and newlines collapsed into one
    space. Returns ``None`` when the conversion reports failure or
    produces no non-blank text.
    """
    success, pdf_text = yapot.convert_document(
        pdf_filename=filename,
        delete_files=False,
        page_delineation='\n',
        verbose=_DEBUG,
        temp_dir='./.tmp',
    )

    if _DEBUG:
        print("PDF Contents:\n\n")
        print(pdf_text)
        print("\n\n")

    if not success or not pdf_text:
        return None

    text = pdf_text.strip()
    if text == '':
        return None

    # str.replace returns a new string, so a bare text.replace('\r', '\n')
    # has no effect. Carriage returns are instead handled by the same
    # pattern that collapses spaces, tabs and newlines, which also makes
    # repeated substitution passes unnecessary.
    return re.sub(r'[ \t\r\n]+', ' ', text)
예제 #3
0
파일: cli.py 프로젝트: ryansb/yapot
def run():
    """Command-line entry point: convert the PDF named on the command line.

    Expects exactly one command-line argument, the PDF filename; with any
    other argument count a usage message is printed and nothing else
    happens.

    The PDF is handed to ``convert_document`` with the user-expanded
    filename as ``base_page_name``. When the conversion reports success
    the text is written to ``<pdf_filename>.txt``; otherwise an error is
    printed and no output file is created.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Welcome to yapot!")
    if len(sys.argv) != 2:
        print("Usage:\n\n\tpython cli-tool.py <pdf_filename>\n\n")
        return

    pdf_filename = sys.argv[1]
    # Expand a leading "~" so the page name is a usable path.
    base_page_name = os.path.expanduser(pdf_filename)

    success, pdf_text = convert_document(
        pdf_filename=pdf_filename,
        base_page_name=base_page_name,
        resolution=200,
        delete_files=True,
        page_delineation='\n--------\n',
        verbose=True,
    )

    if not success:
        # Do not create an output file or claim "Done." for a conversion
        # that reported failure.
        print("Error: unable to convert '%s'." % pdf_filename)
        return

    with open('%s.txt' % pdf_filename, 'w') as f:
        f.write(pdf_text)

    print("Done.")
예제 #4
0
파일: views.py 프로젝트: thequbit/yapot-web
def view_doc_post(request):
    """Handle a PDF upload: store it, convert it with yapot, reply in JSON.

    Reads the uploaded file object from ``request.POST['file'].file`` and
    saves it as ``<UPLOAD_FOLDER>/<uuid>.pdf``. If ``magic.from_file``
    reports the MIME type ``application/pdf`` the file is converted and
    the text is written alongside it as ``<uuid>.txt``.

    The JSON body always carries ``code`` and ``status``; a successful
    conversion adds ``doc_uid`` and ``text``. ``code`` is 200 on success,
    400 when no file was posted, 415 when the upload is not a PDF and
    500 when the conversion reports failure.
    """
    resp = {'code': 200, 'status': 'Success.'}

    if 'file' not in request.POST:
        resp['code'] = 400
        resp['status'] = "Missing file for upload."
        return Response(json.dumps(resp), content_type='application/json')

    doc_file = request.POST['file'].file
    doc_uid = '%s' % uuid.uuid4()

    # Copy the upload to disk in chunks rather than reading it whole.
    filename = '%s/%s.pdf' % (UPLOAD_FOLDER, doc_uid)
    with open(filename, 'wb') as f:
        doc_file.seek(0)
        while True:
            data = doc_file.read(2 << 16)
            if not data:
                break
            f.write(data)

    if magic.from_file(filename, mime=True) != 'application/pdf':
        resp['code'] = 415
        resp['status'] = "Invalid file type."
    else:
        result = yapot.convert_document(filename, resolution=300, pool_count=8)
        # NOTE(review): other callers unpack ``(success, text)`` from
        # yapot.convert_document; accept that shape as well as a plain
        # text return so a tuple is never written to disk or serialized.
        # Confirm against the installed yapot version.
        if isinstance(result, tuple):
            success, text = result
        else:
            success, text = True, result

        if success:
            with open('%s/%s.txt' % (UPLOAD_FOLDER, doc_uid), 'w') as f:
                f.write(text)
            resp['doc_uid'] = doc_uid
            resp['text'] = text
            resp['code'] = 200
            resp['status'] = "File uploaded successfully."
        else:
            resp['code'] = 500
            resp['status'] = "Unable to convert document."

    return Response(json.dumps(resp), content_type='application/json')
예제 #5
0
    def _convert_document(self, doc):
        """Download one scraped document, convert it with yapot, store the text.

        ``doc`` is a document record shaped like::

            {
                "parent_url": parent_url,
                "doc_url": doc_url,
                "scraper_id": "",
                "scrape_datetime": datetime.datetime.utc(),
                "converted": False,
                "convert_datetime": None,
                "local_filename": "",
                "link_text": link_text,
                "document_meta_data": {},
                "contents": "",
            }

        Only ``doc['doc_url']`` and ``doc['_id']`` are read here. The
        document is fetched with ``self._download_document``, converted
        from ``self.download_dir`` + filename, and, when yapot reports
        success, saved through ``session.update_document`` together with
        the elapsed conversion time.

        Returns True whenever the method returns normally.
        """

        def _say(message):
            # Progress output is emitted only in verbose mode.
            if self.verbose == True:
                print(message)

        _say("Unconverted document found, processing.")

        local_name = self._download_document(doc['doc_url'])
        local_path = '{0}{1}'.format(self.download_dir, local_name)

        # Time only the conversion itself.
        started_at = time.time()
        converted, pdf_text = yapot.convert_document(
            pdf_filename=local_path,
            resolution=self.resolution,
            delete_files=True,
            page_delineation='\n--------\n',
            verbose=self.verbose,
            make_thumbs=True,
            thumb_size=512,
            thumb_dir=self.download_dir,
            thumb_prefix='{0}_thumb_page_'.format(local_name),
        )
        elapsed = time.time() - started_at

        _say("Updating document ...")

        if converted == True:
            session.update_document(
                id=doc['_id'],
                contents=pdf_text,
                document_meta_data={},
                local_filename=local_name,
                convert_time=elapsed,
            )

        _say("Done updating document.")
        _say("Done processing document.")

        # NOTE(review): the result does not reflect yapot's success flag;
        # that flag only decides whether the update above is stored.
        # Confirm with callers whether a failed conversion should be
        # reported differently.
        return True
예제 #6
0
    def _convert_document(self, doc):
        """Fetch, convert and persist a single unconverted document.

        ``doc`` is expected to look like::

            {
                "parent_url": parent_url,
                "doc_url": doc_url,
                "scraper_id": "",
                "scrape_datetime": datetime.datetime.utc(),
                "converted": False,
                "convert_datetime": None,
                "local_filename": "",
                "link_text": link_text,
                "document_meta_data": {},
                "contents": "",
            }

        Of these, this method reads ``doc['doc_url']`` (to download the
        file) and ``doc['_id']`` (to address the stored record). Progress
        messages are printed when ``self.verbose`` is True.

        Returns the ``processed`` flag, which is True once every step
        has run.
        """
        if self.verbose == True:
            print("Unconverted document found, processing.")

        processed = False

        fetched_name = self._download_document(doc['doc_url'])
        fetched_path = '{0}{1}'.format(self.download_dir, fetched_name)

        # Measure how long the yapot conversion takes.
        t0 = time.time()
        conversion_ok, extracted_text = yapot.convert_document(
            pdf_filename=fetched_path,
            resolution=self.resolution,
            delete_files=True,
            page_delineation='\n--------\n',
            verbose=self.verbose,
            make_thumbs=True,
            thumb_size=512,
            thumb_dir=self.download_dir,
            thumb_prefix='{0}_thumb_page_'.format(fetched_name),
        )
        seconds_taken = time.time() - t0

        if self.verbose == True:
            print("Updating document ...")

        # Store the text only when yapot reported a successful conversion.
        if conversion_ok == True:
            session.update_document(
                id=doc['_id'],
                contents=extracted_text,
                document_meta_data={},
                local_filename=fetched_name,
                convert_time=seconds_taken,
            )

        # NOTE(review): the flag is set regardless of conversion_ok, so
        # the return value does not distinguish a failed conversion.
        # Confirm this is what callers expect.
        processed = True

        if self.verbose == True:
            print("Done updating document.")

        if self.verbose == True:
            print("Done processing document.")

        return processed
예제 #7
0
def ocr(fn):
    """Return the text yapot extracts from ``fn``, or '' if it reports failure."""
    converted, extracted = yapot.convert_document(fn)
    return extracted if converted else ''