def run_extract_cmd(targetfield, inputfile=None, crop=False, where='', limit=1,
                    page=1, psm=7, outputpng=False, printcmd=False,
                    outputcrop=False, outputchop=False, whiteThreshold=97,
                    validfile=None):
    """Run every extraction pass in ``st.extract_params`` for one target field.

    For each parameter set in ``st.extract_params`` the function:

    1. merges the per-pass parameters with the options given to this call
       (the options given to this call win on key conflicts),
    2. calls ``rp.run_cmd`` with the merged parameters, writing to
       ``<st.to_validate_dir><targetfield>_<i>.tsv``,
    3. pipes that file through ``validate.py``, redirecting the script's
       stdout to ``<st.invalid_data_dir><targetfield>_<i>.tsv``.

    The first pass reads ``inputfile``; every later pass reads the file the
    previous pass's validation step wrote. On the last pass the
    ``outputpng``, ``outputcrop`` and ``outputchop`` options are forced on.

    Args:
        targetfield: Field name; forwarded to ``rp.run_cmd`` and used to
            build the intermediate file names.
        inputfile: Input for the first pass only.
        crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
        outputchop, whiteThreshold: Forwarded unchanged to ``rp.run_cmd``.
        validfile: Path handed to ``validate.py`` as its argument. Defaults
            to ``<st.raw_data_dir><targetfield>.tsv``.
    """
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # The last pass is determined by the number of parameter sets, not by the
    # number of keys inside one parameter set.
    # NOTE(review): assumes st.extract_params is a sized sequence of dicts.
    last_pass = len(st.extract_params) - 1

    invalidfile = None
    for i, p in enumerate(st.extract_params):
        # Build the merged dict without concatenating items(), which only
        # works when items() returns a list (Python 2).
        all_p = dict(p.items())
        all_p.update(cl_params)

        # Chain the passes: pass 0 reads the caller's input, later passes
        # read what the previous validation step wrote.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile
        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        # NOTE(review): the command is assembled by string interpolation and
        # run through a shell; paths containing spaces or shell
        # metacharacters would break it.
        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s > %s ''' % (
                all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def run_extract_cmd(targetfield, inputfile=None, crop=False, where='', limit=1,
                    page=1, psm=7, outputpng=False, printcmd=False,
                    outputcrop=False, outputchop=False, whiteThreshold=97,
                    validfile=None):
    """Run every extraction pass in ``st.extract_params`` for one target field.

    For each parameter set in ``st.extract_params`` the function:

    1. merges the per-pass parameters with the options given to this call
       (the options given to this call win on key conflicts),
    2. calls ``rp.run_cmd`` with the merged parameters, writing to
       ``<st.to_validate_dir><targetfield>_<i>.tsv``,
    3. pipes that file through ``validate.py``, redirecting the script's
       stdout to ``<st.invalid_data_dir><targetfield>_<i>.tsv``.

    The first pass reads ``inputfile``; every later pass reads the file the
    previous pass's validation step wrote. On the last pass the
    ``outputpng``, ``outputcrop`` and ``outputchop`` options are forced on.

    Args:
        targetfield: Field name; forwarded to ``rp.run_cmd`` and used to
            build the intermediate file names.
        inputfile: Input for the first pass only.
        crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
        outputchop, whiteThreshold: Forwarded unchanged to ``rp.run_cmd``.
        validfile: Path handed to ``validate.py`` as its argument. Defaults
            to ``<st.raw_data_dir><targetfield>.tsv``.
    """
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # The last pass is determined by the number of parameter sets, not by the
    # number of keys inside one parameter set.
    # NOTE(review): assumes st.extract_params is a sized sequence of dicts.
    last_pass = len(st.extract_params) - 1

    invalidfile = None
    for i, p in enumerate(st.extract_params):
        # Build the merged dict without concatenating items(), which only
        # works when items() returns a list (Python 2).
        all_p = dict(p.items())
        all_p.update(cl_params)

        # Chain the passes: pass 0 reads the caller's input, later passes
        # read what the previous validation step wrote.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile
        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        # NOTE(review): the command is assembled by string interpolation and
        # run through a shell; paths containing spaces or shell
        # metacharacters would break it.
        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s > %s ''' % (
                all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def test_invoice_cropped(self):
    """Check the text printed by ``rp.run_cmd`` for the uncropped invoice.

    Runs ``rp.run_cmd`` with ``sys.stdout`` replaced by an in-memory buffer,
    then compares the stripped, sorted output lines with the expected lines.
    """
    # NOTE(review): despite the method name, this runs with crop=False and
    # targetfield 'invoice_uncropped' -- confirm the name is intended.
    expected = sorted([
        '',
        '',
        'is: Midwest Communications & Media Attention: Accounts Payable 2015 Roundwyck Lane Powell, OH 43065',
    ])
    params = {
        'size': 54,
        'resolution': 398,
        'targetfield': 'invoice_uncropped',
        'where': "KOCO-TV_14043097411984",
        'limit': 1,
        'page': 2,
        'noinfo': True,
        'crop': False,
        'psm': 3,
    }

    with patch('sys.stdout', new=BytesIO()) as cap_stdout:
        rp.run_cmd(**params)
    out = cap_stdout.getvalue()

    # Strip before sorting so leading whitespace in the captured output
    # cannot change the order the lines are compared in.
    out_list = sorted(line.strip() for line in out.split('\n'))
    self.assertEqual(out_list, expected)
def test_invoice(self):
    """Check the text printed by ``rp.run_cmd`` for the cropped invoice.

    Runs ``rp.run_cmd`` with ``sys.stdout`` replaced by an in-memory buffer,
    then compares the stripped, sorted output lines with the expected lines.
    """
    expected = sorted([
        '06104114 - 06/23/14',
        '',
        'HIETT 4 CORP COMMISS',
        '',
        'Hiean/Corporation Commi:',
        '1196633',
        '105126114 - 06120114',
        '1196633-1',
        'W7196822',
        '',
        '',
        '',
    ])
    params = {
        'size': 54,
        'resolution': 398,
        'targetfield': 'invoice',
        'where': "KOCO-TV_14043097411984",
        'limit': 1,
        'page': 2,
        'noinfo': True,
        'crop': True,
        'psm': 7,
    }

    with patch('sys.stdout', new=BytesIO()) as cap_stdout:
        rp.run_cmd(**params)
    out = cap_stdout.getvalue()

    # Strip before sorting so leading whitespace in the captured output
    # cannot change the order the lines are compared in.
    out_list = sorted(line.strip() for line in out.split('\n'))
    self.assertEqual(out_list, expected)