示例#1
0
文件: do.py 项目: rjmolesa/blueflower
def do_data(ftype, data, afile):
    """Hand `data` to the blueflower module that matches `ftype`.

    A handler is imported only after its type constant has matched, so
    this call never imports the modules of the other formats. Nothing is
    done for `constants.BF_UNKNOWN` or for a type without a branch here.

    Args:
        ftype: File-type value compared against the `constants.BF_*` names.
        data: Content forwarded unchanged to the selected handler.
        afile: File identifier forwarded unchanged to the selected handler.
    """
    if ftype == constants.BF_UNKNOWN:
        return

    # Select the handler first; the single call site is at the bottom.
    if ftype == constants.BF_BZIP2:
        from blueflower.modules.bzip2 import bzip2_do_data as handler
    elif ftype == constants.BF_DOCX:
        from blueflower.modules.docx import docx_do_data as handler
    elif ftype == constants.BF_GZ:
        from blueflower.modules.gz import gz_do_data as handler
    elif ftype == constants.BF_PDF:
        from blueflower.modules.pdf import pdf_do_data as handler
    elif ftype == constants.BF_TAR:
        from blueflower.modules.tar import tar_do_data as handler
    elif ftype == constants.BF_TEXT:
        from blueflower.modules.text import text_do_data as handler
    elif ftype == constants.BF_XLSX:
        from blueflower.modules.xlsx import xlsx_do_data as handler
    elif ftype == constants.BF_ZIP:
        from blueflower.modules.zip import zip_do_data as handler
    else:
        # No branch matches this type: do nothing.
        return

    handler(data, afile)
示例#2
0
def do_data(ftype, data, afile):
    """Route `data` to the handler of the module selected by `ftype`.

    Every branch is an independent guard: it imports its handler lazily,
    calls it with `(data, afile)` and returns. `constants.BF_UNKNOWN` and
    any type without a branch fall through without doing anything.

    Args:
        ftype: File-type value compared against the `constants.BF_*` names.
        data: Content passed as-is to the handler.
        afile: File identifier passed as-is to the handler.
    """
    if ftype == constants.BF_UNKNOWN:
        return

    if ftype == constants.BF_BZIP2:
        from blueflower.modules.bzip2 import bzip2_do_data
        bzip2_do_data(data, afile)
        return

    if ftype == constants.BF_DOCX:
        from blueflower.modules.docx import docx_do_data
        docx_do_data(data, afile)
        return

    if ftype == constants.BF_GZ:
        from blueflower.modules.gz import gz_do_data
        gz_do_data(data, afile)
        return

    if ftype == constants.BF_PDF:
        from blueflower.modules.pdf import pdf_do_data
        pdf_do_data(data, afile)
        return

    if ftype == constants.BF_TAR:
        from blueflower.modules.tar import tar_do_data
        tar_do_data(data, afile)
        return

    if ftype == constants.BF_TEXT:
        from blueflower.modules.text import text_do_data
        text_do_data(data, afile)
        return

    if ftype == constants.BF_XLSX:
        from blueflower.modules.xlsx import xlsx_do_data
        xlsx_do_data(data, afile)
        return

    if ftype == constants.BF_ZIP:
        from blueflower.modules.zip import zip_do_data
        zip_do_data(data, afile)
示例#3
0
文件: do.py 项目: nmnz/blueflower
def do_data(ftype, data, afile):
    """Route `data` to the blueflower module named by the `ftype` string.

    Each branch imports its handler lazily, calls it with `(data, afile)`
    and returns. The type "other" and any string without a branch are
    ignored.

    Args:
        ftype: File-type string ("bzip2", "gz", "pdf", "tar", "text",
            "zip" or "other").
        data: Content passed as-is to the handler.
        afile: File identifier passed as-is to the handler.
    """
    if ftype == "other":
        return

    if ftype == "bzip2":
        from blueflower.modules.bzip2 import bzip2_do_data

        bzip2_do_data(data, afile)
        return

    if ftype == "gz":
        from blueflower.modules.gz import gz_do_data

        gz_do_data(data, afile)
        return

    if ftype == "pdf":
        from blueflower.modules.pdf import pdf_do_data

        pdf_do_data(data, afile)
        return

    if ftype == "tar":
        from blueflower.modules.tar import tar_do_data

        tar_do_data(data, afile)
        return

    if ftype == "text":
        from blueflower.modules.text import text_do_data

        text_do_data(data, afile)
        return

    if ftype == "zip":
        from blueflower.modules.zip import zip_do_data

        zip_do_data(data, afile)
示例#4
0
def xlsx_do_xlsx(axl, afile):
    """Flatten every row of every sheet of `axl` into text for `text_do_data`.

    The values of a row are joined with single spaces and the rows are
    joined with blank lines. If a `TypeError` is raised while the rows are
    collected, its message is logged through `log_error` and nothing is
    forwarded.

    Args:
        axl: Workbook object exposing `nsheets` and `sheet_by_index`.
        afile: File identifier passed to `log_error` / `text_do_data`.
    """
    lines = []
    try:
        for sheet_index in xrange(axl.nsheets):
            worksheet = axl.sheet_by_index(sheet_index)
            lines.extend(' '.join(worksheet.row_values(row_index))
                         for row_index in xrange(worksheet.nrows))
    except TypeError as err:
        log_error(str(err), afile)
        return

    text_do_data('\n\n'.join(lines), afile)
示例#5
0
文件: pdf.py 项目: nmnz/blueflower
def pdf_do_pdf(astream, afile):
    """Convert the PDF in `astream` to text and pass it to `text_do_data`.

    Each page yielded by `PDFPage.get_pages` is processed by a
    `PDFPageInterpreter` that writes through a `TextConverter` into an
    in-memory buffer. The converter and the buffer are closed even when
    page processing raises, so a broken PDF does not leave them open.

    Args:
        astream: PDF input handed to `PDFPage.get_pages`.
        afile: File identifier forwarded to `text_do_data`.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(astream, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before it is closed in the outer finally.
        text = retstr.getvalue()
    finally:
        retstr.close()
    text_do_data(text, afile)
示例#6
0
文件: pdf.py 项目: nmnz/blueflower
def pdf_do_pdf(astream, afile):
    """Extract the text of the PDF in `astream` and forward it to `text_do_data`.

    Pages from `PDFPage.get_pages` are fed to a `PDFPageInterpreter` whose
    `TextConverter` device writes into a `StringIO` buffer. Both the
    device and the buffer are released in `finally` clauses, so an
    exception raised while reading the PDF no longer leaves them open.

    Args:
        astream: PDF input handed to `PDFPage.get_pages`.
        afile: File identifier forwarded to `text_do_data`.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(astream, set(), maxpages=0,
                                      password='', caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # The value must be captured while the buffer is still open.
        text = retstr.getvalue()
    finally:
        retstr.close()
    text_do_data(text, afile)
示例#7
0
def docx_do_docx(azip, afile):
    """Extract paragraph text from a .docx archive and pass it to `text_do_data`.

    `word/document.xml` is read from `azip` and parsed with `XML`. For
    every paragraph element the non-empty text nodes are concatenated;
    paragraphs without text are skipped and the rest are joined with
    blank lines.

    Args:
        azip: Archive object exposing `read(name)`.
        afile: File identifier forwarded to `text_do_data`.
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    tree = XML(azip.read('word/document.xml'))

    paragraphs = []
    # Element.iter() is used instead of getiterator(): the latter was
    # removed from xml.etree.ElementTree in Python 3.9 (assumes XML is the
    # ElementTree-compatible parser imported by this module).
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    text_do_data('\n\n'.join(paragraphs), afile)
示例#8
0
文件: docx.py 项目: nmnz/blueflower
def docx_do_docx(azip, afile):
    """Collect the paragraph text of a .docx archive for `text_do_data`.

    The archive member "word/document.xml" is parsed with `XML`; each
    paragraph element contributes the concatenation of its non-empty text
    nodes. Paragraphs with no text are dropped, the others are separated
    by blank lines.

    Args:
        azip: Archive object exposing `read(name)`.
        afile: File identifier forwarded to `text_do_data`.
    """
    word_namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    par = word_namespace + "p"
    txt = word_namespace + "t"

    xml_content = azip.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    # getiterator() no longer exists on ElementTree elements since
    # Python 3.9; iter() is the supported equivalent (assumes XML comes
    # from an ElementTree-compatible parser).
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append("".join(texts))

    text = "\n\n".join(paragraphs)
    text_do_data(text, afile)
示例#9
0
def docx_do_docx(azip, afile):
    """Turn the main document of a .docx archive into text for `text_do_data`.

    `word/document.xml` is read from `azip`, parsed with `XML`, and the
    non-empty text nodes of each paragraph element are concatenated.
    Empty paragraphs are omitted; the remaining ones are joined with
    blank lines.

    Args:
        azip: Archive object exposing `read(name)`.
        afile: File identifier forwarded to `text_do_data`.
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # iter() replaces the getiterator() calls, which fail on Python 3.9+
    # where that method was removed (assumes XML is ElementTree-compatible).
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
示例#10
0
def pdf_do_pdf(astream, afile):
    """Convert the PDF in `astream` to text and pass it to `text_do_data`.

    Pages from `PDFPage.get_pages` are processed by a `PDFPageInterpreter`
    whose `TextConverter` writes into an in-memory `io.BytesIO` buffer.
    If `PDFTextExtractionNotAllowed` is raised, its message is logged
    through `log_error` and nothing is forwarded. The buffer is closed on
    every exit path, including the early return and any exception raised
    by `text_do_data`.

    Args:
        astream: PDF input handed to `PDFPage.get_pages`.
        afile: File identifier passed to `log_error` / `text_do_data`.
    """
    with io.BytesIO() as outstream:
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        device = TextConverter(rsrcmgr, outstream, codec='utf-8',
                               laparams=laparams, imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(astream, set(),
                                          maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        # Forward the text while the buffer is still open; the with-block
        # closes it afterwards.
        text_do_data(outstream.getvalue(), afile)
示例#11
0
def do_data(ftype, data, afile):
    """Hand `data` to the blueflower module named by the `ftype` string.

    The handler is imported only after its type string has matched, then
    invoked once with `(data, afile)`. The type 'other' and any string
    without a branch are ignored.

    Args:
        ftype: File-type string ('bzip2', 'gz', 'pdf', 'tar', 'text',
            'zip' or 'other').
        data: Content forwarded unchanged to the selected handler.
        afile: File identifier forwarded unchanged to the selected handler.
    """
    if ftype == 'other':
        return

    # Select the handler first; the single call site is at the bottom.
    if ftype == 'bzip2':
        from blueflower.modules.bzip2 import bzip2_do_data as handler
    elif ftype == 'gz':
        from blueflower.modules.gz import gz_do_data as handler
    elif ftype == 'pdf':
        from blueflower.modules.pdf import pdf_do_data as handler
    elif ftype == 'tar':
        from blueflower.modules.tar import tar_do_data as handler
    elif ftype == 'text':
        from blueflower.modules.text import text_do_data as handler
    elif ftype == 'zip':
        from blueflower.modules.zip import zip_do_data as handler
    else:
        # No branch matches this type: do nothing.
        return

    handler(data, afile)
示例#12
0
def pdf_do_pdf(astream, afile):
    """Extract the text of the PDF in `astream` and forward it to `text_do_data`.

    A `PDFPageInterpreter` processes every page returned by
    `PDFPage.get_pages`, writing through a `TextConverter` into an
    `io.BytesIO` buffer. When `PDFTextExtractionNotAllowed` is raised the
    message is logged with `log_error` and the function returns without
    forwarding anything. The buffer is closed in a `finally` clause so it
    is released on the error path and when `text_do_data` raises.

    Args:
        astream: PDF input handed to `PDFPage.get_pages`.
        afile: File identifier passed to `log_error` / `text_do_data`.
    """
    outstream = io.BytesIO()
    try:
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        device = TextConverter(rsrcmgr,
                               outstream,
                               codec='utf-8',
                               laparams=laparams,
                               imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(astream,
                                          set(),
                                          maxpages=0,
                                          password='',
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        text = outstream.getvalue()
        text_do_data(text, afile)
    finally:
        # Runs on normal completion, on the early return and on errors.
        outstream.close()