Example #1
def try_parse_odt(file_name):
    """
    Tries to parse the tables in an ODT file.

    Warning: if a table fails to parse, no error is raised; that table's content is simply omitted from the result.

    :param file_name: str
    :return: Pandas Dataframe
    """
    zf = zipfile.ZipFile(file_name)

    bs = bs4.BeautifulSoup(zf.read("content.xml"), 'xml')

    tables = bs.findAll("table")

    csv_tables = [table_to_csv(tables_to_lists(table)) for table in tables]

    success = []
    for table in csv_tables:
        try:
            success.append(csv_cleaner.try_to_parse_csv(raw_text=table))
        except Exception:
            # Skip tables that fail to parse (see the warning in the docstring).
            pass

    # pd.concat raises on an empty list, so return None when nothing parsed.
    if not success:
        return None

    return pd.concat(success)
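A minimal usage sketch (not from the original source): "tables.odt" is a hypothetical file name, and the module-level imports (zipfile, bs4, pandas as pd) plus the csv_cleaner, tables_to_lists and table_to_csv helpers are assumed to be available.

# Hypothetical call; "tables.odt" stands in for any ODT file that contains tables.
df = try_parse_odt("tables.odt")

# Tables that failed to parse were silently dropped, so inspect what survived.
print(df.shape)
print(df.head())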
Example #2
def test_csvs():
    for f in files_to_test_csv:
        known = pd.read_csv(test_data_path + f, encoding="latin-1", index_col=0)
        cleaned = csv_cleaner.try_to_parse_csv(raw_data_path + f)
        assert_frame_equal(known, cleaned)
        print("CSV Test for {} Passed".format(f))
    print("All CSV passed.")
Example #3
def try_parse_doc(file_path):
    """
    Tries to parse the tables in a DOC file by first converting it to HTML.

    As with the ODT parser, tables that fail to parse are silently skipped.
    """
    file_name = os.path.split(file_path)[-1]
    htmled = HTML_PATH + file_name + ".html"

    if not os.path.exists(htmled):
        convert_to_html(file_path)

    with open(htmled, "rb") as f:
        text = f.read()
        f.close()

    bs = bs4.BeautifulSoup(text, 'html.parser')

    tables = bs.findAll("table")

    csv_tables = [table_to_csv(tables_to_lists(table)) for table in tables]

    success = []
    for table in csv_tables:
        try:
            success.append(csv_cleaner.try_to_parse_csv(raw_text=table))
        except Exception:
            # Skip tables that fail to parse.
            pass

    # pd.concat raises on an empty list, so return None when nothing parsed.
    if not success:
        return None

    return pd.concat(success)
Example #4
def try_parse_ods(file_path):
    """Tries to parse an ODS spreadsheet by converting its sheet to CSV first."""
    doc = ezodf.opendoc(file_path)

    csved = convert_odf_to_csv(get_sheet(doc.sheets))

    df = csv_cleaner.try_to_parse_csv(raw_text=csved)

    return df
Example #5
def try_parse_pdf(file_path):
    """Tries to parse the tables in a PDF file; tables that fail to parse are skipped."""
    fixed = PDFFixup.fixer.get_tables(file_path)

    csv_tables = pdf_table_to_csvs(stitch_together_tables(fixed))

    success = []
    for table in csv_tables:
        try:
            success.append(csv_cleaner.try_to_parse_csv(raw_text=table))
        except Exception:
            # Skip tables that fail to parse.
            pass

    # pd.concat raises on an empty list, so return None when nothing parsed.
    if not success:
        return None

    return pd.concat(success)
Example #6
def parse_row(row, download_to="../data/raw/"):
    """
    Main entry point for each row of the table.

    Splits, downloads, parses the documents.
    """

    row_template = utils.get_row_template(row)

    url = row_template["link"]
    link_format = url.split(".")[-1].lower()
    file_name = url.split("/")[-1]
    local_file_path = download_to + file_name

    if not os.path.exists(local_file_path):
        utils.download_file(url, local_file_path)

    if link_format == "csv":
        table_df = csv_cleaner.try_to_parse_csv(local_file_path)
    elif link_format == "pdf":
        table_df = pdf_cleaner.try_parse_pdf(local_file_path)
    elif link_format == "odt":
        table_df = odt_cleaner.try_parse_odt(local_file_path)
    elif link_format == "doc":
        table_df = doc_cleaner.try_parse_doc(local_file_path)
    elif link_format == "xlsx":
        table_df = xlxs_cleaner.try_parse_xlsx(local_file_path)
    elif link_format == "ods":
        table_df = ods_cleaner.try_parse_ods(local_file_path)
    else:
        raise Exception("Not sure how to parse {}...".format(local_file_path))

    if table_df is None:
        return None

    table_df["department"] = row_template["department"]
    table_df["period"] = row_template["period"]
    table_df["link"] = row_template["link"]

    return table_df
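A hedged sketch of how parse_row might be driven from the scraped index table. The DataFrame name index_df and the driver loop are illustrative only; the real rows just need to carry the fields that utils.get_row_template extracts (at least link, department and period).

# Hypothetical driver loop; "index_df" stands in for the scraped table of document links.
parsed = []
for _, row in index_df.iterrows():
    try:
        table_df = parse_row(row)
    except Exception as e:
        print("Failed to parse {}: {}".format(row.get("link", "?"), e))
        continue
    if table_df is not None:
        parsed.append(table_df)

all_tables = pd.concat(parsed) if parsed else None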