def run_extract_cmd(targetfield,
                    inputfile=None,
                    crop=False,
                    where='',
                    limit=1,
                    page=1,
                    psm=7,
                    outputpng=False,
                    printcmd=False,
                    outputcrop=False,
                    outputchop=False,
                    whiteThreshold=97,
                    validfile=None):
    """Run every extraction pass in ``st.extract_params`` for one field.

    Each entry of ``st.extract_params`` is merged with the arguments given
    here (the arguments win on key clashes) and handed to ``rp.run_cmd``.
    The first pass reads ``inputfile``; every later pass reads the invalid
    rows written by the pass before it. After each pass the pass output is
    piped through ``validate.py`` together with ``validfile`` and the result
    is written to that pass's invalid file.

    Parameters
    ----------
    targetfield : name of the field; also used to build the file names.
    inputfile : input for the first pass (``None`` is passed through to
        ``rp.run_cmd`` unchanged).
    crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
    outputchop, whiteThreshold : forwarded to ``rp.run_cmd`` on every pass.
    validfile : file given to ``validate.py``; defaults to
        ``<st.raw_data_dir><targetfield>.tsv``.
    """
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # Materialise once so the number of passes is known up front.
    passes = list(st.extract_params)
    last_pass = len(passes) - 1
    invalidfile = None

    for i, p in enumerate(passes):
        # Per-pass parameters first, explicit arguments override them.
        all_p = dict(p)
        all_p.update(cl_params)

        # Pass 0 uses the caller's input; later passes retry the rows the
        # previous pass could not validate.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile

        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        # Keep the intermediate images of the final pass. The index must be
        # compared with the number of passes, not with the size of the
        # parameter dict of the current pass.
        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s  > %s ''' %
            (all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def checkKnownFields(tokens, field):
    """Replace OCR tokens with the spelling found in a known-values file.

    ``tokens`` maps an OCR token to its resolved value, or to ``None`` when
    unresolved. For each entry the value (or the key when the value is
    ``None``) is turned into a pattern in which every ``l``, ``i`` and ``@``
    becomes ``[li]``. ``validation/validate_<field>.tsv`` is then filtered
    through one ``grep -w`` per pattern. When something survives, every
    entry of ``tokens`` is overwritten in place with the first text its
    pattern matches in that output.

    Parameters
    ----------
    tokens : dict modified in place.
    field : suffix selecting the validation file.

    Returns ``None``.
    """
    # Nothing to resolve: skip the shell pipeline entirely.
    if not tokens:
        return

    # Snapshot the keys once so patterns and keys stay aligned by position.
    keys = list(tokens.keys())

    patterns = []
    for key in keys:
        source = key if tokens[key] is None else tokens[key]
        fuzzy = source.replace('l', '@').replace('i', '@')
        patterns.append(fuzzy.replace('@', '[li]').lower())

    greps = 'cat validation/validate_%s.tsv' % field
    greps += ''.join(
        [''' | grep -w '%s' ''' % pattern for pattern in patterns])

    match = osutil.getStdoutFromCmd(greps, shell=True).strip()
    if match == '':
        return

    for key, pattern in zip(keys, patterns):
        matches = re.findall(pattern, match)
        # grep and Python use different regex dialects, so a line accepted
        # by grep may still yield nothing here; leave the token untouched.
        if matches:
            tokens[key] = matches[0]
def checkUrl(tokens, file_id):
    """Resolve unresolved URL tokens against ``validation/validate_url.tsv``.

    ``tokens`` maps an OCR token to its resolved value, or to ``None`` when
    unresolved. Every unresolved key becomes a pattern in which ``l``, ``i``
    and ``@`` are replaced by ``[li]``. awk selects the lines of the
    validation file that match ``file_id`` and all of those patterns, and
    each unresolved token is then set (in place) to the first text its
    pattern matches in the awk output.

    Parameters
    ----------
    tokens : dict modified in place; entries with a non-``None`` value are
        never touched.
    file_id : pattern that the selected lines must also match.

    Returns ``None``.
    """
    # Snapshot the keys once so patterns and keys stay aligned by position.
    keys = list(tokens.keys())

    # One Python-regex pattern per key; None marks an already resolved entry.
    patterns = []
    for key in keys:
        if tokens[key] is None:
            fuzzy = key.replace('l', '@').replace('i', '@')
            patterns.append(fuzzy.replace('@', '[li]').lower())
        else:
            patterns.append(None)

    unresolved = [pattern for pattern in patterns if pattern is not None]
    if not unresolved:
        # Nothing to look up, so there is no point in spawning awk.
        return

    # awk delimits regexes with '/', so a slash inside a token is turned
    # into a conjunction of the pieces around it. That rewrite is only
    # valid for awk and must not reach re.findall below.
    awk_tokens = '/%s/' % '/ && /'.join(
        [pattern.replace('/', '/ && /') for pattern in unresolved])
    match = osutil.getStdoutFromCmd(
        '''awk '/%s/ && %s' validation/validate_url.tsv ''' %
        (file_id, awk_tokens),
        shell=True)
    match = match.strip()

    if match == '':
        return

    for key, pattern in zip(keys, patterns):
        if pattern is None:
            continue
        matches = re.findall(pattern, match)
        # The awk test is looser than the Python pattern (pieces may match
        # separately), so an empty result is possible; keep the token as is.
        if matches:
            tokens[key] = matches[0]
def checkKnownFields(tokens, field):
    """Replace OCR tokens with the spelling found in a known-values file.

    ``tokens`` maps an OCR token to its resolved value, or to ``None`` when
    unresolved. For each entry the value (or the key when the value is
    ``None``) is turned into a pattern in which every ``l``, ``i`` and ``@``
    becomes ``[li]``. ``validation/validate_<field>.tsv`` is then filtered
    through one ``grep -w`` per pattern. When something survives, every
    entry of ``tokens`` is overwritten in place with the first text its
    pattern matches in that output.

    Parameters
    ----------
    tokens : dict modified in place.
    field : suffix selecting the validation file.

    Returns ``None``.
    """
    # Nothing to resolve: skip the shell pipeline entirely.
    if not tokens:
        return

    # Snapshot the keys once so patterns and keys stay aligned by position.
    keys = list(tokens.keys())

    patterns = []
    for key in keys:
        source = key if tokens[key] is None else tokens[key]
        fuzzy = source.replace('l', '@').replace('i', '@')
        patterns.append(fuzzy.replace('@', '[li]').lower())

    greps = 'cat validation/validate_%s.tsv' % field
    greps += ''.join(
        [''' | grep -w '%s' ''' % pattern for pattern in patterns])

    match = osutil.getStdoutFromCmd(greps, shell=True).strip()
    if match == '':
        return

    for key, pattern in zip(keys, patterns):
        matches = re.findall(pattern, match)
        # grep and Python use different regex dialects, so a line accepted
        # by grep may still yield nothing here; leave the token untouched.
        if matches:
            tokens[key] = matches[0]
def run_extract_cmd(targetfield, inputfile=None, crop=False, where='', limit=1, page=1, psm=7, outputpng=False, printcmd=False, outputcrop=False, outputchop=False, whiteThreshold=97, validfile=None):
    """Run every extraction pass in ``st.extract_params`` for one field.

    Each entry of ``st.extract_params`` is merged with the arguments given
    here (the arguments win on key clashes) and handed to ``rp.run_cmd``.
    The first pass reads ``inputfile``; every later pass reads the invalid
    rows written by the pass before it. After each pass the pass output is
    piped through ``validate.py`` together with ``validfile`` and the result
    is written to that pass's invalid file.

    Parameters
    ----------
    targetfield : name of the field; also used to build the file names.
    inputfile : input for the first pass (``None`` is passed through to
        ``rp.run_cmd`` unchanged).
    crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
    outputchop, whiteThreshold : forwarded to ``rp.run_cmd`` on every pass.
    validfile : file given to ``validate.py``; defaults to
        ``<st.raw_data_dir><targetfield>.tsv``.
    """
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # Materialise once so the number of passes is known up front.
    passes = list(st.extract_params)
    last_pass = len(passes) - 1
    invalidfile = None

    for i, p in enumerate(passes):
        # Per-pass parameters first, explicit arguments override them.
        all_p = dict(p)
        all_p.update(cl_params)

        # Pass 0 uses the caller's input; later passes retry the rows the
        # previous pass could not validate.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile

        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        # Keep the intermediate images of the final pass. The index must be
        # compared with the number of passes, not with the size of the
        # parameter dict of the current pass.
        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s  > %s ''' %
            (all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def checkCommonTokensFuzzy(tokens):
    """Fill unresolved tokens from ``validation/common_tokens.tsv``.

    For every key whose value is ``None`` a pattern is built by replacing
    each ``l``, ``i`` and ``@`` of the key with ``[li]``. The pattern is
    lower-cased and passed to ``grep -x -m 1`` on the common-tokens file.
    A non-empty (stripped) result becomes the value for that key; the dict
    is modified in place and nothing is returned.
    """
    pending = [key for key in tokens.keys() if tokens[key] is None]

    for key in pending:
        # Treat 'l' and 'i' as interchangeable.
        fuzzy = key.replace('l', '@').replace('i', '@').replace('@', '[li]')

        command = '''grep -x -m 1 '%s' validation/common_tokens.tsv ''' % fuzzy.lower()
        found = osutil.getStdoutFromCmd(command, shell=True).strip()

        if found == '':
            continue
        tokens[key] = found
def checkCommonTokensFuzzy(tokens):
    """Fill unresolved tokens from ``validation/common_tokens.tsv``.

    For every key whose value is ``None`` a pattern is built by replacing
    each ``l``, ``i`` and ``@`` of the key with ``[li]``. The pattern is
    lower-cased and passed to ``grep -x -m 1`` on the common-tokens file.
    A non-empty (stripped) result becomes the value for that key; the dict
    is modified in place and nothing is returned.
    """
    pending = [key for key in tokens.keys() if tokens[key] is None]

    for key in pending:
        # Treat 'l' and 'i' as interchangeable.
        fuzzy = key.replace('l', '@').replace('i', '@').replace('@', '[li]')

        command = '''grep -x -m 1 '%s' validation/common_tokens.tsv ''' % fuzzy.lower()
        found = osutil.getStdoutFromCmd(command, shell=True).strip()

        if found == '':
            continue
        tokens[key] = found
def checkUrl(tokens, file_id):
    """Resolve unresolved URL tokens against ``validation/validate_url.tsv``.

    ``tokens`` maps an OCR token to its resolved value, or to ``None`` when
    unresolved. Every unresolved key becomes a pattern in which ``l``, ``i``
    and ``@`` are replaced by ``[li]``. awk selects the lines of the
    validation file that match ``file_id`` and all of those patterns, and
    each unresolved token is then set (in place) to the first text its
    pattern matches in the awk output.

    Parameters
    ----------
    tokens : dict modified in place; entries with a non-``None`` value are
        never touched.
    file_id : pattern that the selected lines must also match.

    Returns ``None``.
    """
    # Snapshot the keys once so patterns and keys stay aligned by position.
    keys = list(tokens.keys())

    # One Python-regex pattern per key; None marks an already resolved entry.
    patterns = []
    for key in keys:
        if tokens[key] is None:
            fuzzy = key.replace('l', '@').replace('i', '@')
            patterns.append(fuzzy.replace('@', '[li]').lower())
        else:
            patterns.append(None)

    unresolved = [pattern for pattern in patterns if pattern is not None]
    if not unresolved:
        # Nothing to look up, so there is no point in spawning awk.
        return

    # awk delimits regexes with '/', so a slash inside a token is turned
    # into a conjunction of the pieces around it. That rewrite is only
    # valid for awk and must not reach re.findall below.
    awk_tokens = '/%s/' % '/ && /'.join(
        [pattern.replace('/', '/ && /') for pattern in unresolved])
    match = osutil.getStdoutFromCmd(
        '''awk '/%s/ && %s' validation/validate_url.tsv ''' %
        (file_id, awk_tokens),
        shell=True)
    match = match.strip()

    if match == '':
        return

    for key, pattern in zip(keys, patterns):
        if pattern is None:
            continue
        matches = re.findall(pattern, match)
        # The awk test is looser than the Python pattern (pieces may match
        # separately), so an empty result is possible; keep the token as is.
        if matches:
            tokens[key] = matches[0]
def run_cmd(resolution, targetfield, size=100, inputfile=None, outfile=None, crop=False, where='', limit=1, page=1, psm=7, rotation=None, language='engarial', outputpng=False, maxprocs=8, printcmd=False, limitIMthreads=True, median=None, sharpen=None, textcleaner=None, nudge=None, bbox=False, targettext=None, outputcrop=False, outputchop=False, threshold=False, deskew=None, cuneiform=False, whiteThreshold=None, noinfo=False, dryrun=None):
    '''Extract fields from a PDF using GNU Parallel
    -o <str>, --outfile=<str>
    -s <int>, --size=<int> 
    -r <int>, --resolution=<int>

    '''
    # NOTE(review): the docstring above is left untouched because its
    # option-style lines look like they may be parsed by a command-line
    # helper - confirm before rewording it.
    #
    # The function assembles one shell pipeline and either prints it
    # (printcmd) or runs it through osutil.getStdoutFromCmd:
    #   row source -> parallel -> gs -> convert [-> cropbox -> convert]
    #   [-> textcleaner] [-> threshold.py] -> OCR -> tr/sed [-> sed info]
    # Rows come from `inputfile` when given, otherwise from pq.py.
    # `{1}`..`{8}` are GNU Parallel column placeholders, filled per row.
    # Every `% locals()` below reads the local variables by name, so the
    # locals referenced in the templates must keep their names.

    python_bin = st.python_bin

    # Chop geometry used by the second convert call when `crop` is set.
    chop_top = 50
    chop_bottom = 20
    chop_right = 1
    chop_left = 0

    # Offsets parsed from `nudge`; they enter the ghostscript translate.
    (l, r, d, u) = (0, 0, 0, 0)

    # Optional pipeline fragments; an empty string means "not used".
    limitIM = ''
    resize_arg = ''
    median_arg = ''
    sharpen_arg = ''
    rotation_arg = ''
    deskew_arg = ''
    whiteThreshold_arg = ''
    cropbox = ''
    convert_crop = ''
    textcleaner_arg = ''
    threshold_arg = ''
    output_file = ''
    dryrun_arg = ''

    # Description of the settings, appended to every output row by the
    # final sed unless `noinfo` is set.
    extract_params = 'resolution %(resolution)s  size %(size)s  median %(median)s  sharpen %(sharpen)s  textcleaner %(textcleaner)s  nudge %(nudge)s  deskew %(deskew)s  threshold %(threshold)s  cuneiform %(cuneiform)s' % locals()

    if limitIMthreads:
        limitIM = '-limit thread 1'

    if size != 100:
        resize_arg = '-resize %i%%' % size

    if median is not None:
        median_arg = '-median %d' % median

    if sharpen is not None:
        sharpen_arg = '-sharpen %s' % sharpen

    if rotation is not None:
        # ImageMagick's option is `-rotate`; `-rotation` is not recognised.
        rotation_arg = '-rotate %i' % rotation

    if deskew is not None:
        deskew_arg = '-deskew %i' % deskew

    if whiteThreshold is not None:
        whiteThreshold_arg = '-white-threshold %i%%' % whiteThreshold

    if nudge is not None:
        # `nudge` is a bracketed, comma separated string of four integers;
        # the first and last characters are dropped before splitting.
        nudgestrs = nudge.strip()[1:-1].split(',')
        (l, r, d, u) = map(int, nudgestrs)

    if dryrun is not None:
        # Any value other than None (including False) enables --dryrun.
        dryrun_arg = '--dryrun'

    # The leading single quote opens the command string given to parallel;
    # it is closed by the quote appended to `addinfo` further down.
    ghostscript = ''''gs -q -dSAFER -sDEVICE=png16m -dFirstPage=%(page)i -dLastPage=%(page)i -g{2}x{3} -r%(resolution)s -o - -c "<</Install {-$((%(r)i-%(l)i+{4})) -$((%(u)i-%(d)i+{5})) translate}>> setpagedevice" -f "{1}" ''' % locals()

    if inputfile is None:
        pdfpaths = '''python %(python_bin)spq.py printParallelParams --where="%(where)s" --limit="%(limit)s" --resolution="%(resolution)s" --targetfield="%(targetfield)s"''' % locals()
    else:
        pdfpaths = '''cat %(inputfile)s''' % locals()

    parallel = '''| parallel %(dryrun_arg)s --no-run-if-empty --colsep '\\t' --max-procs=%(maxprocs)i --ungroup''' % locals()

    convert_resize = '''| convert %(limitIM)s %(median_arg)s %(sharpen_arg)s %(rotation_arg)s %(resize_arg)s - - ''' % locals()

    if outputpng:
        convert_resize += ''' | tee /tmp/{1/.}_{8}.png'''

    if crop:
        cropbox = '''| python -u %(python_bin)scropbox.py {6} {7}''' % locals()

        convert_crop = '''| convert %(limitIM)s -gravity North -chop 0x%(chop_top)s%% -gravity East -chop %(chop_right)ix0%% -gravity West -chop %(chop_left)ix0%% -gravity South -chop 0x%(chop_bottom)s%% -bordercolor white -border 4x4 %(deskew_arg)s %(whiteThreshold_arg)s - -''' % locals()

        if outputcrop:
            cropbox += ''' | tee /tmp/{1/.}_crop_{8}.png'''

        if outputchop:
            convert_crop += ''' | tee /tmp/{1/.}_chop_{8}.png'''

    if textcleaner is not None:
        # Values other than 0, 1 or 2 leave the textcleaner stage out.
        if textcleaner == 0:
            textcleaner_arg = '| textcleaner -T -s 1 png:- png:-'
        elif textcleaner == 1:
            textcleaner_arg = '| textcleaner -T -e normalize png:- png:-'
        elif textcleaner == 2:
            textcleaner_arg = '| textcleaner -T -s 1 -e normalize png:- png:-'

    if threshold:
        threshold_arg = '| python threshold.py gaussian 40'

    if not cuneiform:
        ocr = '''| tesseract303 - - -psm %(psm)s -lang=%(language)s {8} 2> /dev/null | python %(python_bin)snoinput.py''' % locals()
    else:
        ocr = '''| cuneiform --singlecolumn - -o /tmp/{1/.}_cuneiform.txt > /dev/null 2> /dev/null || touch /tmp/{1/.}_cuneiform.txt && cat /tmp/{1/.}_cuneiform.txt | python %(python_bin)snoinput.py''' % locals()

    remove_nl = ''' | tr -s "\\n" " " | sed "s/$/\\n/g" '''

    if bbox:
        ocr += '''| python hocrCoords.py %(targettext)s''' % locals()

    addinfo = ''
    if not noinfo:
        # Backslashes are doubled so the literal needs no invalid escapes;
        # the text handed to sed is the same as before.
        addinfo = ''' | sed -e "s|\\(.\\+\\)$|\\1\\t{8}\\t%(page)i\\t%(extract_params)s\\t{1}\\t{2}\\t{3}\\t{4}\\t{5}\\t{6}\\t{7}\\t{8}|g" ''' % locals()

    addinfo += '\''  # quote for end of parallel section

    if outfile is not None:
        output_file = '''> %(outfile)s ''' % locals()

    cmd = ' '.join([pdfpaths, parallel, ghostscript, convert_resize, cropbox,
                    convert_crop, textcleaner_arg, threshold_arg, ocr, remove_nl, addinfo, output_file])

    if printcmd:
        print(cmd)
    else:
        cmdout = osutil.getStdoutFromCmd(cmd, shell=True)
        if cmdout.strip() != '':
            print(cmdout)

    # remove cuneiform output files
    filelist = glob.glob("/tmp/*cuneiform.txt")
    for f in filelist:
        os.remove(f)
# Example no. 10
# 0
def run_cmd(resolution,
            targetfield,
            size=100,
            inputfile=None,
            outfile=None,
            crop=False,
            where='',
            limit=1,
            page=1,
            psm=7,
            rotation=None,
            language='engarial',
            outputpng=False,
            maxprocs=8,
            printcmd=False,
            limitIMthreads=True,
            median=None,
            sharpen=None,
            textcleaner=None,
            nudge=None,
            bbox=False,
            targettext=None,
            outputcrop=False,
            outputchop=False,
            threshold=False,
            deskew=None,
            cuneiform=False,
            whiteThreshold=None,
            noinfo=False,
            dryrun=None):
    '''Extract fields from a PDF using GNU Parallel
    -o <str>, --outfile=<str>
    -s <int>, --size=<int> 
    -r <int>, --resolution=<int>

    '''
    # NOTE(review): the docstring above is left untouched because its
    # option-style lines look like they may be parsed by a command-line
    # helper - confirm before rewording it.
    #
    # The function assembles one shell pipeline and either prints it
    # (printcmd) or runs it through osutil.getStdoutFromCmd:
    #   row source -> parallel -> gs -> convert [-> cropbox -> convert]
    #   [-> textcleaner] [-> threshold.py] -> OCR -> tr/sed [-> sed info]
    # Rows come from `inputfile` when given, otherwise from pq.py.
    # `{1}`..`{8}` are GNU Parallel column placeholders, filled per row.
    # Every `% locals()` below reads the local variables by name, so the
    # locals referenced in the templates must keep their names.

    python_bin = st.python_bin

    # Chop geometry used by the second convert call when `crop` is set.
    chop_top = 50
    chop_bottom = 20
    chop_right = 1
    chop_left = 0

    # Offsets parsed from `nudge`; they enter the ghostscript translate.
    (l, r, d, u) = (0, 0, 0, 0)

    # Optional pipeline fragments; an empty string means "not used".
    limitIM = ''
    resize_arg = ''
    median_arg = ''
    sharpen_arg = ''
    rotation_arg = ''
    deskew_arg = ''
    whiteThreshold_arg = ''
    cropbox = ''
    convert_crop = ''
    textcleaner_arg = ''
    threshold_arg = ''
    output_file = ''
    dryrun_arg = ''

    # Description of the settings, appended to every output row by the
    # final sed unless `noinfo` is set.
    extract_params = 'resolution %(resolution)s  size %(size)s  median %(median)s  sharpen %(sharpen)s  textcleaner %(textcleaner)s  nudge %(nudge)s  deskew %(deskew)s  threshold %(threshold)s  cuneiform %(cuneiform)s' % locals()

    if limitIMthreads:
        limitIM = '-limit thread 1'

    if size != 100:
        resize_arg = '-resize %i%%' % size

    if median is not None:
        median_arg = '-median %d' % median

    if sharpen is not None:
        sharpen_arg = '-sharpen %s' % sharpen

    if rotation is not None:
        # ImageMagick's option is `-rotate`; `-rotation` is not recognised.
        rotation_arg = '-rotate %i' % rotation

    if deskew is not None:
        deskew_arg = '-deskew %i' % deskew

    if whiteThreshold is not None:
        whiteThreshold_arg = '-white-threshold %i%%' % whiteThreshold

    if nudge is not None:
        # `nudge` is a bracketed, comma separated string of four integers;
        # the first and last characters are dropped before splitting.
        nudgestrs = nudge.strip()[1:-1].split(',')
        (l, r, d, u) = map(int, nudgestrs)

    if dryrun is not None:
        # Any value other than None (including False) enables --dryrun.
        dryrun_arg = '--dryrun'

    # The leading single quote opens the command string given to parallel;
    # it is closed by the quote appended to `addinfo` further down.
    ghostscript = ''''gs -q -dSAFER -sDEVICE=png16m -dFirstPage=%(page)i -dLastPage=%(page)i -g{2}x{3} -r%(resolution)s -o - -c "<</Install {-$((%(r)i-%(l)i+{4})) -$((%(u)i-%(d)i+{5})) translate}>> setpagedevice" -f "{1}" ''' % locals()

    if inputfile is None:
        pdfpaths = '''python %(python_bin)spq.py printParallelParams --where="%(where)s" --limit="%(limit)s" --resolution="%(resolution)s" --targetfield="%(targetfield)s"''' % locals()
    else:
        pdfpaths = '''cat %(inputfile)s''' % locals()

    parallel = '''| parallel %(dryrun_arg)s --no-run-if-empty --colsep '\\t' --max-procs=%(maxprocs)i --ungroup''' % locals()

    convert_resize = '''| convert %(limitIM)s %(median_arg)s %(sharpen_arg)s %(rotation_arg)s %(resize_arg)s - - ''' % locals()

    if outputpng:
        convert_resize += ''' | tee /tmp/{1/.}_{8}.png'''

    if crop:
        cropbox = '''| python -u %(python_bin)scropbox.py {6} {7}''' % locals()

        convert_crop = '''| convert %(limitIM)s -gravity North -chop 0x%(chop_top)s%% -gravity East -chop %(chop_right)ix0%% -gravity West -chop %(chop_left)ix0%% -gravity South -chop 0x%(chop_bottom)s%% -bordercolor white -border 4x4 %(deskew_arg)s %(whiteThreshold_arg)s - -''' % locals()

        if outputcrop:
            cropbox += ''' | tee /tmp/{1/.}_crop_{8}.png'''

        if outputchop:
            convert_crop += ''' | tee /tmp/{1/.}_chop_{8}.png'''

    if textcleaner is not None:
        # Values other than 0, 1 or 2 leave the textcleaner stage out.
        if textcleaner == 0:
            textcleaner_arg = '| textcleaner -T -s 1 png:- png:-'
        elif textcleaner == 1:
            textcleaner_arg = '| textcleaner -T -e normalize png:- png:-'
        elif textcleaner == 2:
            textcleaner_arg = '| textcleaner -T -s 1 -e normalize png:- png:-'

    if threshold:
        threshold_arg = '| python threshold.py gaussian 40'

    if not cuneiform:
        ocr = '''| tesseract303 - - -psm %(psm)s -lang=%(language)s {8} 2> /dev/null | python %(python_bin)snoinput.py''' % locals()
    else:
        ocr = '''| cuneiform --singlecolumn - -o /tmp/{1/.}_cuneiform.txt > /dev/null 2> /dev/null || touch /tmp/{1/.}_cuneiform.txt && cat /tmp/{1/.}_cuneiform.txt | python %(python_bin)snoinput.py''' % locals()

    remove_nl = ''' | tr -s "\\n" " " | sed "s/$/\\n/g" '''

    if bbox:
        ocr += '''| python hocrCoords.py %(targettext)s''' % locals()

    addinfo = ''
    if not noinfo:
        # Backslashes are doubled so the literal needs no invalid escapes;
        # the text handed to sed is the same as before.
        addinfo = ''' | sed -e "s|\\(.\\+\\)$|\\1\\t{8}\\t%(page)i\\t%(extract_params)s\\t{1}\\t{2}\\t{3}\\t{4}\\t{5}\\t{6}\\t{7}\\t{8}|g" ''' % locals()

    addinfo += '\''  # quote for end of parallel section

    if outfile is not None:
        output_file = '''> %(outfile)s ''' % locals()

    cmd = ' '.join([
        pdfpaths, parallel, ghostscript, convert_resize, cropbox, convert_crop,
        textcleaner_arg, threshold_arg, ocr, remove_nl, addinfo, output_file
    ])

    if printcmd:
        print(cmd)
    else:
        cmdout = osutil.getStdoutFromCmd(cmd, shell=True)
        if cmdout.strip() != '':
            print(cmdout)

    # remove cuneiform output files
    filelist = glob.glob("/tmp/*cuneiform.txt")
    for f in filelist:
        os.remove(f)