import os import glob import sys import shutil from luigi import Parameter, BoolParameter from luiginlp.engine import Task, TargetInfo, StandardWorkflowComponent, registercomponent, InputComponent, Parallel, run, ComponentParameters, InputFormat from luiginlp.util import replaceextension, DirectoryHandler, getlog from luiginlp.modules.pdf import Pdf2images from luiginlp.modules.folia import Foliacat, FoliaHOCR log = getlog() class Tesseract(Task): """Does OCR on a TIFF image, outputs a hOCR file""" executable = 'tesseract' language = Parameter() outputdir = Parameter(default="") in_tiff = None #input slot def out_hocr(self): if self.outputdir and self.outputdir != '.': return TargetInfo(self, os.path.join(self.outputdir, os.path.basename(replaceextension(self.in_tiff().path, ('.tif','.tiff'),'.hocr')))) else: return TargetInfo(self, replaceextension(self.in_tiff().path, ('.tif','.tiff'),'.hocr')) def run(self): self.ex(self.in_tiff().path, self.out_hocr().path[:-5], #output path without hocr extension (-5), Tesseract adds it already l=self.language, c="tessedit_create_hocr=T",
import sys
import os
import unittest
import glob
import shutil
import json

import luiginlp
import luigi
from luiginlp.engine import (
    Task,
    StandardWorkflowComponent,
    PassParameters,
    InputFormat,
    InputComponent,
    InputSlot,
    Parameter,
    IntParameter,
    registercomponent,
    ParallelBatch,
)
from luiginlp.util import getlog, chunk

log = getlog()


class VoweleaterTask(Task):
    """Example task that wraps an external command-line tool.

    The tool (``sed``) receives the input text file on standard input and
    its standard output is captured in the output file. The sed expression
    deletes every ASCII vowel, upper and lower case.
    """

    # Name of the external program this task invokes.
    executable = 'sed'

    # Input slot for the plain-text file to process.
    in_txt = InputSlot()

    # Declared task parameter; not referenced by the methods of this class.
    encoding = Parameter(default='utf-8')

    def out_txt(self):
        """Return the output target derived from the ``txt`` input.

        The ``.txt`` extension is stripped from the input and
        ``.novowels.txt`` is appended in its place.
        """
        return self.outputfrominput(
            inputformat='txt',
            stripextension='.txt',
            addextension='.novowels.txt',
        )

    def run(self):
        """Run sed with the vowel-removal expression over the input file."""
        source_path = self.in_txt().path
        target_path = self.out_txt().path
        # NOTE: the double-underscore keywords must stay literal keyword
        # arguments here; identifiers of this form inside a class body are
        # subject to Python's private-name mangling, so spelling them any
        # other way (e.g. via a dict) could change what the callee receives.
        self.ex(
            e='s/[aeiouAEIOU]//g',
            __stdin_from=source_path,
            __stdout_to=target_path,
        )