def label_doc(self, doc_path, congress, chamber, document_type, number):
    """Label every row of every table identified in one document.

    The file at ``doc_path`` is split into paragraphs with
    ``text_table_tools.get_paragraphs`` and tables are located with
    ``text_table_tools.identify_tables``. For each table, the sorted
    candidate column indices and the sponsor indices are computed once and
    then handed, together with each row, to ``self.label_row``.

    Args:
        doc_path: Path of the document; opened in text read mode.
        congress: Passed to ``self.sponsor_coder.find_sponsor_index`` and
            to ``self.label_row``.
        chamber: Passed through to ``self.label_row``.
        document_type: Passed through to ``self.label_row``.
        number: Passed through to ``self.label_row``.

    Returns:
        None. Any result is whatever ``self.label_row`` does per row.

    Raises:
        IOError: If ``doc_path`` cannot be opened.
    """
    # Parenthesised single-argument form prints the same text whether or
    # not ``print_function`` is in effect.
    print(doc_path)
    # The helpers may consume the file lazily, so the handle stays open for
    # the whole pass; ``with`` guarantees it is closed afterwards, even if
    # labelling raises.
    with open(doc_path, 'r') as doc_file:
        paragraphs_list = text_table_tools.get_paragraphs(doc_file)
        tables = text_table_tools.identify_tables(paragraphs_list)
        for table in tables:
            table_offset = table.offset
            column_indices = sorted(
                text_table_tools.get_candidate_columns(table))
            sponsor_indices = self.sponsor_coder.find_sponsor_index(
                table, congress)
            for row in table.rows:
                self.label_row(row, column_indices, table_offset, congress,
                               chamber, document_type, number,
                               sponsor_indices)
def label_doc(self, doc_path, congress, chamber, document_type, number):
    """Run row labelling over all tables found in the file at ``doc_path``.

    Paragraphs are read via ``text_table_tools.get_paragraphs`` and grouped
    into tables via ``text_table_tools.identify_tables``. Per table, the
    candidate column indices (sorted) and the sponsor indices returned by
    ``self.sponsor_coder.find_sponsor_index(table, congress)`` are computed
    once; every row of the table is then passed to ``self.label_row``
    along with those values and the table's ``offset``.

    Args:
        doc_path: Filesystem path of the document to read (text mode).
        congress: Forwarded to the sponsor lookup and to ``label_row``.
        chamber: Forwarded to ``label_row``.
        document_type: Forwarded to ``label_row``.
        number: Forwarded to ``label_row``.

    Returns:
        None.

    Raises:
        IOError: If the document cannot be opened.
    """
    # Single-argument print in parentheses behaves identically with the
    # print statement and the print function.
    print(doc_path)
    # Use a context manager so the file handle is released deterministically.
    # Processing stays inside the block because the paragraph/table helpers
    # might still be reading from the handle while rows are iterated.
    with open(doc_path, 'r') as source:
        paragraphs_list = text_table_tools.get_paragraphs(source)
        for table in text_table_tools.identify_tables(paragraphs_list):
            table_offset = table.offset
            column_indices = sorted(
                text_table_tools.get_candidate_columns(table))
            sponsor_indices = self.sponsor_coder.find_sponsor_index(
                table, congress)
            for row in table.rows:
                self.label_row(row, column_indices, table_offset, congress,
                               chamber, document_type, number,
                               sponsor_indices)
def extract_tables(document_paths):
    """Extract tables from each document and insert them into ``tables``.

    For every entry in ``document_paths`` the file is read as UTF-8, split
    into paragraphs with ``ttt.get_paragraphs`` and scanned with
    ``ttt.identify_tables``. One row per table is inserted through the
    module-level ``conn`` connection, and the insert is committed once per
    document. A failure while building or inserting a document's rows is
    reported on stdout and processing continues with the next document.

    Args:
        document_paths: Iterable of indexable pairs where item ``[0]`` is
            the file path to open and item ``[1]`` is the value stored in
            the ``document_id`` column.

    Returns:
        None.
    """
    print("begin table extraction")
    # Loop-invariant statement text, built once.
    insert_sql = (
        'INSERT INTO tables ("offset", "length", headers, title, body, '
        'content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
    )
    for path in document_paths:
        # ``with`` closes the document even when parsing or the insert
        # fails; the handle stays open until the rows are built in case the
        # ttt helpers read lazily.
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [(t.offset, t.length, ','.join(t.header),
                           ' '.join(t.title), ' '.join(t.body),
                           ' '.join(t.content), path[1])
                          for t in tables]
                cur = conn.cursor()
                cur.executemany(insert_sql, params)
                conn.commit()
            except Exception as ex:
                print("Failed to import doc %s: %s" % (path[0], ex))
                # Discard the failed transaction so later documents are not
                # rejected by a connection left in an aborted state
                # (NOTE(review): assumes a DB-API connection with
                # rollback(), e.g. psycopg2 - confirm). Guarded so a failed
                # rollback cannot turn best-effort import into a crash.
                try:
                    conn.rollback()
                except Exception as rollback_ex:
                    print("Rollback failed after doc %s: %s"
                          % (path[0], rollback_ex))
def extract_tables(document_paths):
    """Import the tables found in each document into the ``tables`` table.

    Each document is opened as UTF-8 text, broken into paragraphs by
    ``ttt.get_paragraphs`` and searched for tables by
    ``ttt.identify_tables``. Every table contributes one parameter tuple
    (offset, length, comma-joined header, and space-joined title, body and
    content, plus the document id) to a single ``executemany`` insert on
    the module-level ``conn``; the work is committed per document. Errors
    raised while preparing or inserting rows are printed and do not stop
    the remaining documents from being processed.

    Args:
        document_paths: Iterable of indexable pairs; ``[0]`` is the path of
            the file to read, ``[1]`` is written to ``document_id``.

    Returns:
        None.
    """
    print("begin table extraction")
    # The SQL text never changes between documents, so build it once.
    cmd = (
        'INSERT INTO tables ("offset", "length", headers, title, body, '
        'content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
    )
    for path in document_paths:
        # Context manager guarantees the file is closed on every exit path.
        # Row building happens while it is still open because the ttt
        # helpers may pull from the handle on demand.
        with codecs.open(path[0], 'r', 'utf8') as handle:
            paragraphs_list = ttt.get_paragraphs(handle)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [
                    (t.offset, t.length, ','.join(t.header),
                     ' '.join(t.title), ' '.join(t.body),
                     ' '.join(t.content), path[1])
                    for t in tables
                ]
                cur = conn.cursor()
                cur.executemany(cmd, params)
                conn.commit()
            except Exception as ex:
                print("Failed to import doc %s: %s" % (path[0], ex))
                # Reset the connection's transaction so one bad document
                # does not poison every subsequent insert
                # (NOTE(review): presumes conn is a transactional DB-API
                # connection such as psycopg2 - verify). A failing rollback
                # is reported, not raised, to keep the best-effort loop.
                try:
                    conn.rollback()
                except Exception as rollback_ex:
                    print("Rollback failed after doc %s: %s"
                          % (path[0], rollback_ex))