Пример #1
0
def compr_binary_parse_on_files(fnamelist):
    """
    Assert that every file in fnamelist parses to the same text.

    Each file is run through parse_binary(); spaces and newlines are
    removed from the result before it is checked. Fails with an
    AssertionError when fewer than two file names are given, when a
    stripped result is not longer than 20 characters, or when a result
    differs from the one of the preceding file.
    """
    assert len(fnamelist) >= 2
    previous = None
    for position, path in enumerate(fnamelist):
        raw = parse_binary(fname=path)
        squeezed = raw.replace(' ', '').replace('\n', '')
        assert len(squeezed) > 20
        # the first file has nothing to be compared against
        if position != 0:
            assert previous == squeezed
        previous = squeezed
Пример #2
0
def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Can handle files that contain files, e.g. emails with attachments.
    Returns a list with parsed files each in a dict tree.

        uri: path/name of the file; it is only opened when txt is None.
        txt: content of the file; read from uri when not given.
        ftype: is either the extension or a set definition, e.g. email.
        okext: handed on unchanged to the recursive calls for attachments.
    """
    if txt is None:
        with open(uri) as fobj:
            txt = fobj.read()
    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri,
                                          text=txt)
        info, mime, ftype = get_file_info_from_buffer(txt)
        # Start from an empty list so that an email yielding no parts
        # returns [] instead of failing on an unbound name at the return.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(fit_into_data_mold(
                    parseddict=parsedtxt,
                    txt=txt,
                    uri=auto_unicode_dang_it(uri),
                    ftype=u'email',
                    mime=mime,
                    info=info))
            else:
                # every later part is parsed recursively, since it may
                # itself contain further files
                attchtxt = parsedtxt['body'] or u''
                info, mime, ftype = get_file_info_from_buffer(attchtxt)
                fname = parsedtxt['filename']
                emlparsed.extend(parse_multi_layer_file(uri=fname,
                                                        txt=attchtxt,
                                                        ftype=ftype,
                                                        okext=okext))
        return emlparsed
    parsedtxt = parse_binary(string=txt,
                             fname=uri)
    if not ftype:
        ftype = uri.split('.')[-1]
    # an empty parse result is stored as an empty unicode body
    parseddict = {u'body': parsedtxt or u''}
    # NOTE(review): ftype is overwritten here, so neither the ftype argument
    # nor the extension fallback above reaches fit_into_data_mold(). Kept
    # as-is to preserve behaviour -- confirm which source should win.
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict,
                               txt=txt,
                               uri=uri,
                               ftype=ftype,
                               mime=mime,
                               info=info)]
Пример #3
0
def test__parse_binary__parse_pdf_XML__test_if_run_and_len_20():
    """parse_binary() runs on the XML pdf sample and yields >= 20 items."""
    parsed = parse_binary(fname=join(TESTDIR, 'pdf/pg1661-mini_XML.pdf'))
    parsed_length = len(parsed)
    assert parsed_length >= 20
Пример #4
0
def test__parse_binary__parse_docx__test_if_run_and_len_20():
    """parse_binary() runs on the docx sample and yields more than 20 items."""
    parsed = parse_binary(fname=join(TESTDIR, 'doc/pure_doc/pg1661-mini.docx'))
    parsed_length = len(parsed)
    assert parsed_length > 20