예제 #1
0
 def runTest():
     """Build the OCR dataset under two min-text-layer settings and assemble it.

     Steps, in order:
       1. Set the minimum text layer to 8, print the value read back, then
          create and show a dataset from the PDF fixture directory.
       2. Set the minimum text layer to 0 (the test's own message calls this
          "Text layer disabled"), then create and show the dataset again.
       3. Set the minimum text layer to 10 and print the ``createMap`` result
          for the same fixture directory (given as a relative path).
       4. Run a ``DocumentAssembler`` over the dataset from step 2, reading
          the "region" column and writing "document", and show the result.

     NOTE(review): the function takes no ``self`` although its indentation
     suggests it sits inside a class -- confirm against the enclosing code.
     """
     # Pass 1: minimum text layer of 8.
     OcrHelper.setMinTextLayer(8)
     current_layer = OcrHelper.getMinTextLayer()
     print("text layer is: " + str(current_layer))

     # File URI for the PDF fixtures, resolved against the working directory.
     pdf_uri = "file:///" + os.getcwd() + "/../ocr/src/test/resources/pdfs/"

     # Both dataset builds use exactly the same keyword arguments.
     dataset_options = dict(
         spark=SparkContextForTest.spark,
         input_path=pdf_uri,
         output_col="region",
         metadata_col="metadata",
     )
     OcrHelper.createDataset(**dataset_options).show()

     # Pass 2: minimum text layer of 0.
     OcrHelper.setMinTextLayer(0)
     print("Text layer disabled")
     regions = OcrHelper.createDataset(**dataset_options)
     regions.show()

     # Map-based API, called after setting the minimum text layer to 10.
     OcrHelper.setMinTextLayer(10)
     print(OcrHelper.createMap(input_path="../ocr/src/test/resources/pdfs"))

     # Feed the "region" column of the second dataset to the assembler.
     assembler = (
         DocumentAssembler()
         .setInputCol("region")
         .setOutputCol("document")
     )
     assembled = assembler.transform(regions)
     assembled.show()
예제 #2
0
 def runTest():
     """Create an OCR dataset and map from the PDF fixtures, then assemble it.

     Creates and shows a dataset from the relative fixture directory with
     output column "region" and metadata column "metadata", prints the
     ``createMap`` result for the same directory, and finally shows the
     output of a ``DocumentAssembler`` that reads "region" and writes
     "document".

     NOTE(review): the function takes no ``self`` although its indentation
     suggests it sits inside a class -- confirm against the enclosing code.
     """
     # Relative fixture directory shared by both OcrHelper calls.
     pdf_dir = "../ocr/src/test/resources/pdfs/"

     regions = OcrHelper.createDataset(
         spark=SparkContextForTest.spark,
         input_path=pdf_dir,
         output_col="region",
         metadata_col="metadata",
     )
     regions.show()

     print(OcrHelper.createMap(input_path=pdf_dir))

     # Feed the "region" column of the dataset to the assembler.
     assembler = (
         DocumentAssembler()
         .setInputCol("region")
         .setOutputCol("document")
     )
     assembled = assembler.transform(regions)
     assembled.show()
예제 #3
0
 def runTest():
     """Build the OCR dataset under both preferred methods and assemble it.

     Steps, in order:
       1. Set the preferred method to 'text', print the value read back, then
          create and show a dataset from the PDF fixture directory.
       2. Set the preferred method to 'image' (the test's own message calls
          this "Text layer disabled"), then create and show the dataset again.
       3. Set the preferred method back to 'text' and print the ``createMap``
          result for the same fixture directory (given as a relative path).
       4. Run a ``DocumentAssembler`` over the dataset from step 2, reading
          the "text" column and writing "document", and show the result.

     NOTE(review): ``createDataset`` is called without an output column, so
     the "text" input column presumably relies on its default -- confirm.
     NOTE(review): the function takes no ``self`` although its indentation
     suggests it sits inside a class -- confirm against the enclosing code.
     """
     # Pass 1: preferred method 'text'.
     OcrHelper.setPreferredMethod('text')
     current_method = OcrHelper.getPreferredMethod()
     print("text layer is: " + str(current_method))

     # File URI for the PDF fixtures, resolved against the working directory.
     pdf_uri = "file:///" + os.getcwd() + "/../ocr/src/test/resources/pdfs/"

     # Both dataset builds use exactly the same keyword arguments.
     dataset_options = dict(
         spark=SparkContextForTest.spark,
         input_path=pdf_uri,
     )
     OcrHelper.createDataset(**dataset_options).show()

     # Pass 2: preferred method 'image'.
     OcrHelper.setPreferredMethod('image')
     print("Text layer disabled")
     pages = OcrHelper.createDataset(**dataset_options)
     pages.show()

     # Map-based API, called after switching back to 'text'.
     OcrHelper.setPreferredMethod('text')
     print(OcrHelper.createMap(input_path="../ocr/src/test/resources/pdfs"))

     # Feed the "text" column of the second dataset to the assembler.
     assembler = (
         DocumentAssembler()
         .setInputCol("text")
         .setOutputCol("document")
     )
     assembled = assembler.transform(pages)
     assembled.show()