def get_entities(path, conn):
        if "congress" in path:
            path_util = path_tools.ReportPathUtils(path  = path)
        else:
            path_util = path_tools.BillPathUtils(path  = path)

        docid = path_util.get_db_document_id()
        print path, docid


        fields = ["entity_text", "entity_type", "entity_offset", "entity_length", "entity_inferred_name", "source", "document_id"]
        
        paragrapghs_list = text_table_tools.get_paragraphs(open(path,'r'))
        f_str = open(path,'r').read()
        tables = text_table_tools.identify_tables(paragrapghs_list)


        for table in tables:
            table_offset = table.offset
            try:
                csv_rows =[]
                column_indices = sorted(text_table_tools.get_candidate_columns(table))

                for row in table.rows:
                    (entity_text, entity_inferred_name) = get_row_entity_text_and_entity_inferred_name(row, column_indices)
                    csv_row = [entity_text[:2048], "table_row", table_offset+row.offset, row.length, entity_inferred_name[:2048], table.type + "_table", docid]
                    csv_rows.append(csv_row)

                cmd = "insert into entities (" + ", ".join(fields) + ") values (%s, %s, %s, %s, %s, %s, %s)"
                cur = conn.cursor()
                cur.executemany(cmd, csv_rows)
                conn.commit()
            except Exception as e:
                print len(clean_row_text)
                print csv_row
                logging.exception("SCREW UP")
            tables.append("".join(lines))
            lines = []
        if parsing:
            lines.append(line)
        if line.startswith("<dashTable>") or line.startswith("<dotTable>"):
            parsing = True
    return tables


labeled_folder = "/mnt/data/sunlight/tables/labeled"
out_folder = "/mnt/data/sunlight/tables/evaluate"

for doc in os.listdir(labeled_folder):
    # get table parser tables
    doc_file = os.path.join(labeled_folder, doc)
    paragraph_list = ttt.get_paragraphs(codecs.open(doc_file, "r", "utf8"))
    table_parser_tables = ttt.identify_tables(paragraph_list)

    # get labeled tables
    labeled_tables = find_labeled(doc_file)

    print "For document %s:" % doc_file
    print "Number of hand labeled tables: %s" % len(labeled_tables)
    print "Number of table parser tables: %s" % len(table_parser_tables)
    print "\n\n\n\n\n"

    # output results for manual inspection
    labeled_file = open(os.path.join(out_folder, doc + "-labeled"), "w")
    parsed_file = open(os.path.join(out_folder, doc + "-parsed"), "w")
    labeled_file.write("TABLE:\n".join(labeled_tables))
    parsed_file.write("TABLE:\n".join("".join(table.content) for table in table_parser_tables))
            parsing = False
            tables.append("".join(lines))
            lines = []
        if parsing:
            lines.append(line)
        if line.startswith("<dashTable>") or line.startswith("<dotTable>"):
            parsing = True
    return tables

# NOTE(review): an exact duplicate of the evaluation script above was
# pasted here (same folder constants, same loop body); it re-ran the whole
# comparison a second time and re-opened/overwrote the same output files
# in out_folder.  The duplicate has been removed.