def extract_image(opt, page_nr, filename):
    """Render one page of a DjVu file to a TIFF image with ``ddjvu``.

    The page size is queried through ``image_size``; the subsample factor
    handed to ddjvu is raised until ``width * height // subsample`` is at
    most ``50 * 2**20``, and never goes above 12. When the size cannot be
    obtained, the traceback is reported through ``utils.print_traceback``
    and the page is rendered without subsampling.

    Args:
        opt: options object; only ``opt.out_dir`` is read. It is prepended
            verbatim to the file name, so it must already end with a path
            separator.
        page_nr: page number, passed to ``-page=`` and used in the output
            file name.
        filename: path of the DjVu file.

    Returns:
        The path of the TIFF file (``<out_dir>page_NNNN.tif``), or None when
        ddjvu exits with a non-zero status (a message is written to stderr).
    """
    max_pixels = (1 << 20) * 50
    max_subsample = 12

    try:
        width, height = image_size(page_nr, filename)
        pixels = width * height
        subsample = 1
        # Floor division spells out the integer semantics the original
        # Python 2 test relied on. Stopping at the cap yields the same value
        # as searching without bound and clamping afterwards.
        while subsample < max_subsample and pixels // subsample > max_pixels:
            subsample += 1
    except Exception:
        # Best effort: any failure to obtain the size (including image_size
        # returning None, which fails to unpack) falls back to full size.
        utils.print_traceback("Unable to get image size, subsample=1", filename)
        subsample = 1

    if subsample != 1:
        print("subsample %s" % (subsample,))

    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    command = [
        ddjvu,
        "-format=tiff",
        "-page=%d" % page_nr,
        "-subsample=%d" % subsample,
        filename,
        tiff_name,
    ]
    ls = subprocess.Popen(command, stdout=subprocess.PIPE,
                          preexec_fn=setrlimits, close_fds=True)

    # Drain stdout before waiting so the child never blocks on a full pipe.
    text = utils.safe_read(ls.stdout)
    if text:
        print(text)
    ls.wait()

    if ls.returncode != 0:
        # Same output as the former ``print >> sys.stderr, ...`` statement,
        # written in a form that parses on both Python 2 and Python 3.
        sys.stderr.write("%s %s %s %s\n" % (
            "extract_image fail: ", ls.returncode, filename, page_nr))
        return None
    return tiff_name
def ocr(filename, out_basename, lang, config=''):
    """Run tesseract on an image and return the text of its output file.

    While tesseract runs, ``TESSDATA_PREFIX`` is set in ``os.environ`` to the
    module-level ``tesseract_data_prefix`` (when that is non-empty) and is
    removed again afterwards, even if launching or reading the process fails.

    Args:
        filename: path of the image to recognise.
        out_basename: output path without extension, passed to tesseract.
        lang: language code passed with ``-l``.
        config: extra trailing command-line argument. When empty the result
            is read from ``out_basename + ".txt"``, otherwise from
            ``out_basename + ".hocr"``.

    Returns:
        The content of the output file, or None when tesseract exits with a
        non-zero status. If tesseract fails, or succeeds without producing
        the output file, an error is logged to stderr and a placeholder
        message is written to the output file; in the latter case that
        placeholder text is what gets returned.
    """
    if tesseract_data_prefix:
        os.environ['TESSDATA_PREFIX'] = tesseract_data_prefix
    try:
        ls = subprocess.Popen(
            [tesseract_path, filename, out_basename, "-l", lang, config],
            stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds=True)
        # Drain stdout before waiting so the child never blocks on the pipe.
        text = utils.safe_read(ls.stdout)
        if text:
            # Echo without appending a newline (formerly ``print text,``).
            sys.stdout.write("%s" % (text,))
        ls.wait()
    finally:
        # Always undo the environment change, including on exceptions.
        if tesseract_data_prefix:
            os.environ.pop('TESSDATA_PREFIX', None)

    out_filename = out_basename + (".txt" if config == '' else ".hocr")

    failed = ls.returncode != 0
    if failed or not os.path.exists(out_filename):
        # Logged once; the original emitted this line twice on failure.
        sys.stderr.write("%s %s %s\n" % (
            "ocr.ocr() fail to exec tesseract:", ls.returncode, filename))
        with open(out_filename, 'w') as fd:
            fd.write('An error occurred during ocr processing: ' + filename)

    if failed:
        return None

    with open(out_filename) as fd:
        return fd.read()
def ocr(filename, out_basename, lang, config=''):
    """Run tesseract on an image and return the text of its output file.

    While tesseract runs, ``TESSDATA_PREFIX`` is set in ``os.environ`` to the
    module-level ``tesseract_data_prefix`` (when that is non-empty) and is
    removed again afterwards, even if launching or reading the process fails.

    Args:
        filename: path of the image to recognise.
        out_basename: output path without extension, passed to tesseract.
        lang: language code passed with ``-l``.
        config: extra trailing command-line argument. When empty the result
            is read from ``out_basename + ".txt"``, otherwise from
            ``out_basename + ".html"``.

    Returns:
        The content of the output file, or None when tesseract exits with a
        non-zero status (an error is logged to stderr; no placeholder file is
        written in that case). If tesseract exits with status 0 but the
        output file is missing, an error is logged, a placeholder message is
        written to the output file, and that placeholder text is returned.
    """
    if tesseract_data_prefix:
        os.environ['TESSDATA_PREFIX'] = tesseract_data_prefix
    try:
        ls = subprocess.Popen(
            [tesseract_path, filename, out_basename, "-l", lang, config],
            stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds=True)
        # Drain stdout before waiting so the child never blocks on the pipe.
        text = utils.safe_read(ls.stdout)
        if text:
            # Echo without appending a newline (formerly ``print text,``).
            sys.stdout.write("%s" % (text,))
        ls.wait()
    finally:
        # Always undo the environment change, including on exceptions.
        if tesseract_data_prefix:
            os.environ.pop('TESSDATA_PREFIX', None)

    if ls.returncode != 0:
        # Bail out before touching the output file: it may not exist, and
        # opening it used to raise IOError instead of reporting the failure.
        sys.stderr.write("%s %s %s\n" % (
            "ocr.ocr() fail to exec tesseract:", ls.returncode, filename))
        return None

    out_filename = out_basename + (".txt" if config == '' else ".html")

    if not os.path.exists(out_filename):
        # tesseract reported success but produced nothing.
        sys.stderr.write("%s %s %s\n" % (
            "ocr.ocr() fail to exec tesseract:", ls.returncode, filename))
        with open(out_filename, 'w') as fd:
            fd.write('An error occurred during ocr processing: ' + filename)

    with open(out_filename) as fd:
        return fd.read()
def get_nr_pages_djvu(filename):
    """Return the page count of a DjVu file as printed by ``djvused -e n``.

    Args:
        filename: path of the DjVu file.

    Returns:
        The page count as an int, or None when djvused exits with a non-zero
        status (a message is written to stderr).

    Raises:
        ValueError: if djvused exits with status 0 but its output cannot be
            parsed as an integer.
    """
    djvused = djvulibre_path + 'djvused'
    command = [djvused, "-e", "n", filename]
    ls = subprocess.Popen(command, stdout=subprocess.PIPE,
                          preexec_fn=setrlimits, close_fds=True)

    # Drain stdout before waiting so the child never blocks on a full pipe.
    text = utils.safe_read(ls.stdout)
    ls.wait()

    if ls.returncode != 0:
        # Same output as the former ``print >> sys.stderr, ...`` statement,
        # written in a form that parses on both Python 2 and Python 3.
        sys.stderr.write("%s %s\n" % (
            "Error: djvused fail to exec", ls.returncode))
        return None
    return int(text)
def image_size(page_nr, filename):
    """Return the pixel size of one page of a DjVu file.

    Runs ``djvused -e "select <page_nr>; size"`` and extracts the
    ``width=<n> height=<n>`` pair from its output.

    Args:
        page_nr: page number given to djvused's ``select`` command.
        filename: path of the DjVu file.

    Returns:
        A ``(width, height)`` tuple of ints, or None when djvused exits with
        a non-zero status or its output contains no ``width=... height=...``
        pair (a message is written to stderr in both cases).
    """
    djvused = djvulibre_path + 'djvused'
    command = [djvused, "-e", "select %d; size" % page_nr, filename]
    ls = subprocess.Popen(command, stdout=subprocess.PIPE,
                          preexec_fn=setrlimits, close_fds=True)

    # Drain stdout before waiting so the child never blocks on a full pipe.
    text = utils.safe_read(ls.stdout)
    ls.wait()

    if ls.returncode != 0:
        # Same output as the former ``print >> sys.stderr, ...`` statement,
        # written in a form that parses on both Python 2 and Python 3.
        sys.stderr.write("%s %s\n" % (
            "Error: djvused fail to exec", ls.returncode))
        return None

    # Raw string: same pattern, without the invalid "\d" string escape.
    match = re.search(r'width=(\d+) height=(\d+)', text)
    if match is None:
        # Report unexpected output the same way as a failed run rather than
        # crashing with AttributeError on ``None.group``.
        sys.stderr.write("Error: djvused returned no page size\n")
        return None
    return int(match.group(1)), int(match.group(2))
'''
Extract the speech from every transcript of the dataset and save each result
in the ``transcripts/`` folder, under the file name of the source transcript.
'''
import os

from utils import get_all_transcript_paths, safe_read
from extract_speech import extract_speech_string
from tqdm import tqdm


def main():
    """Write the extracted speech of every transcript to ``transcripts/``."""
    # Create the output folder up front so the first write cannot fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs('transcripts', exist_ok=True)

    # Materialise the paths so tqdm knows the total and can show progress.
    paths = list(get_all_transcript_paths())
    for path in tqdm(paths):
        transcript = extract_speech_string(safe_read(path))
        # Only the last path component is kept, so transcripts that share a
        # file name overwrite one another.
        with open(f'transcripts/{path.parts[-1]}', 'w') as f:
            f.write(transcript)


if __name__ == '__main__':
    main()