def get_entities(path, conn): if "congress" in path: path_util = path_tools.ReportPathUtils(path = path) else: path_util = path_tools.BillPathUtils(path = path) docid = path_util.get_db_document_id() print path, docid fields = ["entity_text", "entity_type", "entity_offset", "entity_length", "entity_inferred_name", "source", "document_id"] paragrapghs_list = text_table_tools.get_paragraphs(open(path,'r')) f_str = open(path,'r').read() tables = text_table_tools.identify_tables(paragrapghs_list) for table in tables: table_offset = table.offset try: csv_rows =[] column_indices = sorted(text_table_tools.get_candidate_columns(table)) for row in table.rows: (entity_text, entity_inferred_name) = get_row_entity_text_and_entity_inferred_name(row, column_indices) csv_row = [entity_text[:2048], "table_row", table_offset+row.offset, row.length, entity_inferred_name[:2048], table.type + "_table", docid] csv_rows.append(csv_row) cmd = "insert into entities (" + ", ".join(fields) + ") values (%s, %s, %s, %s, %s, %s, %s)" cur = conn.cursor() cur.executemany(cmd, csv_rows) conn.commit() except Exception as e: print len(clean_row_text) print csv_row logging.exception("SCREW UP")
# NOTE(review): collapsed/truncated chunk. The leading statements
# (tables.append ... return tables) are the tail of a <dashTable>/<dotTable>
# block-parser function whose `def` header and loop head are not visible
# here, fused onto the same physical line with a standalone evaluation
# script that compares hand-labeled tables against ttt.identify_tables
# output and writes "-labeled"/"-parsed" files for manual inspection.
# The original line/indentation structure was lost in collapsing -- restore
# it before editing; the statement nesting cannot be inferred from here.
# (Python 2: print statements; presumably `ttt` is text_table_tools --
# TODO confirm against the file's imports.)
tables.append("".join(lines)) lines = [] if parsing: lines.append(line) if line.startswith("<dashTable>") or line.startswith("<dotTable>"): parsing = True return tables labeled_folder = "/mnt/data/sunlight/tables/labeled" out_folder = "/mnt/data/sunlight/tables/evaluate" for doc in os.listdir(labeled_folder): # get table parser tables doc_file = os.path.join(labeled_folder, doc) paragraph_list = ttt.get_paragraphs(codecs.open(doc_file, "r", "utf8")) table_parser_tables = ttt.identify_tables(paragraph_list) # get labeled tables labeled_tables = find_labeled(doc_file) print "For document %s:" % doc_file print "Number of hand labeled tables: %s" % len(labeled_tables) print "Number of table parser tables: %s" % len(table_parser_tables) print "\n\n\n\n\n" # output results for manual inspection labeled_file = open(os.path.join(out_folder, doc + "-labeled"), "w") parsed_file = open(os.path.join(out_folder, doc + "-parsed"), "w") labeled_file.write("TABLE:\n".join(labeled_tables)) parsed_file.write("TABLE:\n".join("".join(table.content) for table in table_parser_tables))
# NOTE(review): collapsed/truncated chunk; near-duplicate of the chunk on
# the preceding source line (extra leading `parsing = False`, single-quoted
# codecs.open arguments). As there: the head is the tail of a
# <dashTable>/<dotTable> block-parser function whose `def` header is not
# visible, fused with an evaluation script that compares hand-labeled
# tables against parser output. Statement nesting was lost in collapsing --
# restore the original structure before editing, and consider deleting one
# of the two duplicated chunks once the file is reconstructed.
parsing = False tables.append("".join(lines)) lines = [] if parsing: lines.append(line) if line.startswith("<dashTable>") or line.startswith("<dotTable>"): parsing = True return tables labeled_folder = "/mnt/data/sunlight/tables/labeled" out_folder = "/mnt/data/sunlight/tables/evaluate" for doc in os.listdir(labeled_folder): # get table parser tables doc_file = os.path.join(labeled_folder, doc) paragraph_list = ttt.get_paragraphs(codecs.open(doc_file, 'r', 'utf8')) table_parser_tables = ttt.identify_tables(paragraph_list) # get labeled tables labeled_tables = find_labeled(doc_file) print "For document %s:" % doc_file print "Number of hand labeled tables: %s" % len(labeled_tables) print "Number of table parser tables: %s" % len(table_parser_tables) print "\n\n\n\n\n" # output results for manual inspection labeled_file = open(os.path.join(out_folder, doc + "-labeled"), "w") parsed_file = open(os.path.join(out_folder, doc + "-parsed"), "w") labeled_file.write("TABLE:\n".join(labeled_tables)) parsed_file.write("TABLE:\n".join("".join(table.content) for table in table_parser_tables))