def compr_binary_parse_on_files(fnamelist):
    """
    Assert that every file in fnamelist parses to the same text.

    Each file is run through parse_binary; spaces and newlines are removed
    from the result before comparing, so differences in those characters
    are ignored. At least two file names are required and every stripped
    result must be longer than 20 characters.
    """
    assert len(fnamelist) >= 2
    previous = None
    for fname in fnamelist:
        parsed = parse_binary(fname=fname)
        current = parsed.replace(' ', '').replace('\n', '')
        assert len(current) > 20
        if previous is not None:
            # compare against the file handled just before this one
            assert previous == current
        previous = current
def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Parse a file that may itself contain files, e.g. an email with
    attachments.

    uri: path or name of the file; it is only opened when txt is None.
    txt: the file content, if it has already been loaded.
    ftype: either the extension or a set definition, e.g. email.
        NOTE(review): the value returned by get_file_info_from_buffer
        currently replaces it before it is used - confirm this is intended.
    okext: passed on unchanged to the recursive calls made for attachments.

    Returns a list with one fit_into_data_mold() result per parsed file.
    For an email the message itself comes first, followed by whatever each
    attachment parses to (attachments are parsed recursively). The list is
    empty when email_whole_parse yields no parts.
    """
    if txt is None:
        with open(uri) as fobj:
            txt = fobj.read()

    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri, text=txt)
        info, mime, ftype = get_file_info_from_buffer(txt)
        # Bound before the loop so an empty parse result returns [] instead
        # of raising UnboundLocalError at the return below.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(
                    fit_into_data_mold(parseddict=parsedtxt,
                                       txt=txt,
                                       uri=auto_unicode_dang_it(uri),
                                       ftype=u'email',
                                       mime=mime,
                                       info=info))
            else:
                attchtxt = parsedtxt['body']
                if not attchtxt:
                    attchtxt = u''
                info, mime, ftype = get_file_info_from_buffer(attchtxt)
                fname = parsedtxt['filename']
                # an attachment may be a container itself, so recurse
                emlparsed.extend(
                    parse_multi_layer_file(uri=fname,
                                           txt=attchtxt,
                                           ftype=ftype,
                                           okext=okext))
        return emlparsed

    parsedtxt = parse_binary(string=txt, fname=uri)
    if not ftype:
        ftype = uri.split('.')[-1]
    if not parsedtxt:
        parseddict = {u'body': u''}
    else:
        parseddict = {u'body': parsedtxt}
    # NOTE(review): this overwrites the ftype chosen just above, so neither
    # the argument nor the extension fallback reaches fit_into_data_mold.
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict,
                               txt=txt,
                               uri=uri,
                               ftype=ftype,
                               mime=mime,
                               info=info)]
def test__parse_binary__parse_pdf_XML__test_if_run_and_len_20():
    """parse_binary runs on the XML pdf sample and yields at least 20 items."""
    pdf_path = join(TESTDIR, 'pdf/pg1661-mini_XML.pdf')
    parsed = parse_binary(fname=pdf_path)
    parsed_len = len(parsed)
    assert parsed_len >= 20
def test__parse_binary__parse_docx__test_if_run_and_len_20():
    """parse_binary runs on the docx sample and yields more than 20 items."""
    docx_path = join(TESTDIR, 'doc/pure_doc/pg1661-mini.docx')
    parsed = parse_binary(fname=docx_path)
    parsed_len = len(parsed)
    assert parsed_len > 20