Example No. 1
import os
import subprocess

import camelot
from camelot.core import TableList


def main():
    # For every PDF in the current directory, find the pages that mention
    # "Revenue" or "Income", extract tables from those pages, and keep only
    # the tables whose parsing report shows at most 25% whitespace.
    for pdf in os.listdir():
        file_name, file_extension = os.path.splitext(pdf)
        if file_extension == '.pdf':
            # pdfgrep prints matching page numbers; awk keeps the page column
            # and tr joins them into the comma-separated string camelot expects.
            cmd = "pdfgrep -Pn '^(?s:(?=.*Revenue)|(?=.*Income))' " + pdf + " | awk -F\":\" '$0~\":\"{print $1}' | tr '\n' ','"
            pages = subprocess.check_output(cmd, shell=True).decode("utf-8").strip(',')
            print(pdf)
            tables = camelot.read_pdf(pdf, flavor='stream', pages=pages, edge_tol=100)
            filtered = []
            for table in tables:
                if table.parsing_report.get('whitespace', 100) <= 25:
                    filtered.append(table)
            filtered_tables = TableList(filtered)
            filtered_tables.export('test.xlsx', f='excel', compress=True)
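The snippet shells out to pdfgrep/awk/tr to build the page string. A pure-Python alternative for the same page-selection idea is sketched below; it assumes PyPDF2 is installed, and the helper name find_pages is hypothetical, not part of the original example.

# Sketch only: build the comma-separated page string without shelling out.
from PyPDF2 import PdfReader

def find_pages(path, keywords=("Revenue", "Income")):
    reader = PdfReader(path)
    hits = []
    for number, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if any(keyword in text for keyword in keywords):
            hits.append(str(number))
    # camelot expects a comma-separated page string, e.g. "3,7,12"
    return ",".join(hits)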
Example No. 2
def table_list(gp_response):
    """
    Convert Camelot GoPro's response to Camelot's TableList type object
    :param gp_response: Response received from Camelot GoPro
    :return: Camelot's TableList like object with extra

      Attributes
    ----------
    Regular Camelot's TableList object
    Pages: int
            Total number of input pages in pdf or 1 if image
    JobStatus    : str ('Success'|'Failed'|'Processing'|'Incomplete')
                Job status of finding tables
    """
    prev_page = 0
    order = 1
    if gp_response["JobStatus"].lower().startswith("succe"):
        pass
    elif gp_response["JobStatus"].lower().startswith("process"):
        print("-=- "*15)
        print(f'[Info]: Table extraction is {gp_response["JobStatus"]}')
        print("Check more info using the '__dict__' descriptor on the result object.")
        print("Use the 'JobId' from the response to check and retrieve the output once the job status becomes 'Success'.")
        print("JobId is:", gp_response["JobId"])
        print("Follow the last step in the notebook at: "
              "https://github.com/ExtractTable/camelotpro/blob/master/how%20to%20code.ipynb")
        print("-=- "*15)
    elif gp_response["JobStatus"].lower().startswith("fail"):
        print("[Info]: Table Extraction is Failed. Complete Response Below")
        for k, v in gp_response.items():
            print(f"{k}: {v}")
    elif not any([gp_response["JobStatus"].lower().startswith("succe"), gp_response.get("Tables", [])]):
        print("[Info]: Table Extraction is not completed. Status:", gp_response["JobStatus"])
        print("Check more info using '__dict__' descriptor on the result object.\n")

    for each in gp_response.get("Tables", []):
        if each["Page"] == prev_page:
            order += 1
        else:
            prev_page = each["Page"]
            order = 1
        each["Order"] = order
    tmp_tbl_list = TableList([Table(gp_table) for gp_table in gp_response.get("Tables", [])])
    tmp_tbl_list.Pages = gp_response.pop("Pages", "NA")
    # Copy every remaining response field (except the raw tables) onto the result.
    for k, v in gp_response.items():
        if k != "Tables":
            setattr(tmp_tbl_list, k, v)
    return tmp_tbl_list
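A minimal usage sketch follows. It assumes Table and TableList come from the surrounding module and that gp_response carries the JobStatus/Pages/Tables keys seen above; every literal value, and any key other than Page, is made up purely for illustration.

# Hypothetical response shape, only to show how table_list() is called.
gp_response = {
    "JobStatus": "Success",
    "Pages": 1,
    "Tables": [
        {"Page": 1},
    ],
}
result = table_list(gp_response)
print(result.Pages, result.JobStatus)  # extra attributes copied onto the TableList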
Example No. 3
def test_table_order():
    def _make_table(page, order):
        t = Table([], [])
        t.page = page
        t.order = order
        return t

    table_list = TableList([
        _make_table(2, 1),
        _make_table(1, 1),
        _make_table(3, 4),
        _make_table(1, 2)
    ])

    assert [(t.page, t.order) for t in sorted(table_list)] == [
        (1, 1),
        (1, 2),
        (2, 1),
        (3, 4),
    ]
    assert [(t.page, t.order) for t in sorted(table_list, reverse=True)] == [
        (3, 4),
        (2, 1),
        (1, 2),
        (1, 1),
    ]
Example No. 4
def extract(job_id):
    try:
        session = Session()
        # Look up the job, its extraction rule, and the uploaded file.
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        tables = []
        filepaths = json.loads(file.filepaths)
        # Run the configured parser on each page and tag every table with its page number.
        for p in pages:
            kwargs = pages[p]
            kwargs.update(rule_options)
            parser = Lattice(
                **kwargs) if flavor.lower() == 'lattice' else Stream(**kwargs)
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        # Export the extracted tables in every supported format.
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # Export JSON once more for rendering and collect the generated files.
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
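The function above depends on names defined elsewhere in its project (Session, Job, Rule, File, mkdirs). Running it in isolation would at least need imports along the following lines; the mkdirs body is an assumption about what that helper does, not the project's actual code, and the database models are not shown here.

import datetime as dt
import glob
import json
import logging
import os

from camelot.core import TableList
from camelot.parsers import Lattice, Stream

def mkdirs(path):
    # Assumed behaviour: create the directory tree if it does not exist yet.
    os.makedirs(path, exist_ok=True)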
Example No. 5
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            kwargs = pages[p]
            kwargs.update(rule_options)
            parser = (Lattice(
                **kwargs) if flavor.lower() == "lattice" else Stream(**kwargs))
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, f"{froot}.{ext}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, f"{froot}.json")
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
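Both extract() variants above expect rule.rule_options to be a JSON string containing a flavor plus a pages mapping whose values are keyword arguments for the parser. A plausible payload, with made-up page numbers and coordinates purely for illustration, is:

import json

# Hypothetical rule_options payload; the keys mirror what extract() pops and passes on.
rule_options = json.dumps({
    "flavor": "stream",
    "pages": {
        "1": {"table_areas": ["72,792,540,72"]},  # per-page kwargs for Stream/Lattice
        "2": {},
    },
})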
Example No. 6
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        parent_folder = os.path.join(conf.PDFS_FOLDER, file.file_id, '')
        docs = os.listdir(parent_folder)
        docs = sorted(
            list(
                map(lambda x: os.path.join(parent_folder, x),
                    filter(lambda x: x[0:4] == "file", docs))))

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        filepaths = json.loads(file.filepaths)
        filepaths_as_list = list(filepaths.values())

        tables = []
        i = 0
        for doc in docs:
            # Remove the page files left over from the previous document.
            for f in filepaths_as_list:
                os.remove(f)
            # Split the current document into single-page PDFs with Ghostscript.
            gs_call = 'gs -q -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -dSAFER -o {}page-%d.pdf {}'.format(
                parent_folder, doc)
            gs_call = gs_call.encode().split()
            null = open(os.devnull, 'wb')
            with Ghostscript(*gs_call, stdout=null) as gs:
                pass
            null.close()
            for p in pages:
                kwargs = pages[p]
                kwargs.update(rule_options)
                parser = Lattice(
                    **kwargs) if flavor.lower() == 'lattice' else Stream(
                        **kwargs)
                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    # Offset page numbers by the pages consumed by earlier documents.
                    _t.page = int(p) + i
                tables.extend(t)
            i += len(pages)

        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
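The snippet drives Ghostscript through the ghostscript Python bindings. An equivalent way to split a PDF into single-page files, sketched here with a plain subprocess call to the gs binary (assuming it is on PATH), is:

import subprocess

def split_pdf(parent_folder, doc):
    # Same gs invocation as above, run through subprocess instead of the bindings.
    subprocess.run(
        ["gs", "-q", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-dSAFER",
         "-o", f"{parent_folder}page-%d.pdf", doc],
        check=True,
    )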
Example No. 7
def extract(job_id):  # noqa
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            if p not in filepaths:
                continue

            if flavor.lower() == "lattice":
                kwargs = pages[p]
                parser = Lattice(**kwargs)

                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    _t.page = int(p)
                tables.extend(t)

            else:
                opts = pages[p]
                areas, columns = (
                    opts.get("table_areas", None),
                    opts.get("columns", None),
                )
                if areas and columns:
                    page_order = 1
                    for area, column in zip(areas, columns):
                        bbox = ([
                            round(v, 2) for v in map(float, area.split(","))
                        ] if area else [])
                        cols = list(map(float,
                                        column.split(","))) if column else []
                        split_text = rule_options.get("split_text", False)

                        if cols and bbox:
                            abs_cols = [round(c + bbox[0], 2) for c in cols]
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = ",".join(map(str, abs_cols))
                            if len(abs_cols) > 4 and split_text:
                                pass  # split_text = False

                        elif bbox:
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = None
                            split_text = False

                        else:
                            table_region = None
                            table_area = None
                            table_columns = None

                        kwargs = dict(
                            table_regions=[table_region]
                            if table_region else None,
                            table_areas=[table_area] if table_area else None,
                            columns=[table_columns] if table_columns else None,
                            row_tol=rule_options.get("row_close_tol", 2),
                            column_tol=rule_options.get("col_close_tol", 0),
                            edge_tol=rule_options.get("edge_close_tol", 50),
                            flag_size=rule_options.get("flag_size", False),
                            split_text=split_text,
                            strip_text=rule_options.get("strip_text", ""),
                        )
                        print(f"Using Stream({kwargs!r})")
                        parser = Stream(**kwargs)
                        t = parser.extract_tables(filepaths[p])
                        print(f"Result: {t}")
                        for _t in t:
                            _t.page = int(p)
                            _t.order = page_order
                            print(
                                f"Table {_t.order}, Page {_t.page}: {_t.parsing_report}"
                            )

                            if _t.df.shape == (1, 2):
                                _t.df = _t.df.T

                            elif _t.shape == (1, 1):
                                _t.df = pd.concat(
                                    [
                                        _t.df[0],
                                        _t.df.replace(
                                            {0: {
                                                _t.df.iat[0, 0]: ""
                                            }})[0],
                                    ],
                                    axis=0,
                                    ignore_index=True,
                                )

                            if len(_t.df.shape) < 2:
                                _t.df = _t.df.to_frame()

                            if _t.df.shape[1] < 4:
                                # Drop fully empty rows; use a plain NaN because
                                # pd.np was removed in pandas 1.0+.
                                _t.df = (_t.df.replace({
                                    "": float("nan")
                                }).dropna(how="all").fillna(""))

                            print(_t.df)
                            page_order += 1
                        tables.extend(t)
                else:
                    continue

        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            for dirname, dirs, files in os.walk(datapath):
                for of in files:
                    if of.endswith(("." + f, ".zip", ".xlsx")):
                        fp = os.path.join(dirname, of)
                        os.remove(fp)

            try:
                os.removedirs(f_datapath)
            except FileNotFoundError:
                pass

        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, "{}.{}".format(froot, ext))
            print(f"Exporting as {f} to {f_datapath}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, "{}.json".format(froot))
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
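The Stream branch above turns each rule's area string into a camelot table_areas entry and shifts the column offsets by the area's left edge. A small worked example of that conversion, with made-up coordinates, is shown below.

# Made-up coordinates, only to illustrate the area/column handling above.
area = "72.0,792.0,540.0,72.0"      # x1,y1,x2,y2 in PDF points
column = "50.0,150.0,300.0"         # offsets relative to the area's left edge

bbox = [round(v, 2) for v in map(float, area.split(","))]
abs_cols = [round(c + bbox[0], 2) for c in map(float, column.split(","))]

table_area = ",".join(map(str, bbox))         # "72.0,792.0,540.0,72.0"
table_columns = ",".join(map(str, abs_cols))  # "122.0,222.0,372.0"
print(table_area, table_columns)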