def index_all_table(company_id):
    """Rebuild the ``Table_Report`` table of one company from scratch.

    Every ``(doc_id, page_number, norm_table_id)`` row returned by
    ``sObj.slt_normresids`` becomes one ``Table_Report`` row with an empty
    classification, normalization ``'Y'``, error_accepted ``'N'`` and
    db_status ``'N'``.  All rows already in the table are deleted first.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is
            also the key into the mapping returned by ``getCN_MID()``.

    Returns:
        The string ``'done'``.
    """
    from getCompanyName_machineId import getCN_MID

    company_name, _machine_id = getCN_MID()[company_id]
    model_number = '1'
    project_id, url_id = company_id.split('_')
    norm_res_list = sObj.slt_normresids(project_id, url_id)

    db_file = os.path.join('/mnt/eMB_db/', company_name, model_number,
                           'company_report.db')
    table_name = 'Table_Report'
    column_list = [
        ('row_id', 'INTEGER PRIMARY KEY AUTOINCREMENT'),
        ('table_id', 'VARCHAR(20)'),
        ('doc_id', 'VARCHAR(20)'),
        ('classification', 'VARCHAR(256)'),
        ('normalization', 'VARCHAR(1)'),
        ('error_accepted', 'VARCHAR(1)'),
        ('db_status', 'VARCHAR(1)'),
    ]
    # row_id is auto-generated, so it is left out of the insert column list.
    column_tup = tuple(name for name, _sql_type in column_list[1:])

    # Column order here must match column_tup.
    data = [
        (str(norm_table_id), str(doc_id), '', 'Y', 'N', 'N')
        for doc_id, _page_number, norm_table_id in norm_res_list
    ]

    conn = qObj.create_connection(db_file)
    try:
        cur = conn.cursor()
        qObj.createLiteTable(conn, cur, '', table_name, column_list)
        cur.execute('delete from %s' % table_name)
        qObj.insertIntoLite(conn, cur, '', table_name, column_tup, data)
        conn.commit()
    finally:
        # Always release the connection, even when a statement above fails.
        # NOTE(review): assuming this is a sqlite3 connection and the qObj
        # helpers do not commit themselves, closing without commit discards
        # the delete, so a failed insert does not leave the table empty.
        conn.close()
    return 'done'
def insert_update_table_report(company_id, table_ids):
    """Replace the ``Table_Report`` rows of the given tables.

    For each entry of ``table_ids`` any existing row with the same
    ``table_id`` is deleted and a fresh row is inserted.  The previously
    stored classification (if any) is carried over; normalization,
    error_accepted and db_status are always reset to ``'Y'``, ``'N'`` and
    ``'N'``.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is
            also the key into the mapping returned by ``getCN_MID()``.
        table_ids: Iterable of 6-tuples
            ``(doc_id, table_id, page_no, g_id, lng, g_u)``.  Only
            ``doc_id`` and ``table_id`` are used.

    Returns:
        The string ``'done'``.
    """
    from getCompanyName_machineId import getCN_MID

    company_name, _machine_id = getCN_MID()[company_id]
    project_id, _url_id = company_id.split('_')
    # Here the report DB sits in a directory named after the project id.
    model_number = project_id
    db_file = os.path.join('/mnt/eMB_db/', company_name, model_number,
                           'company_report.db')
    # Single parenthesised argument: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('db_file %s' % (db_file,))

    table_name = 'Table_Report'
    column_list = [
        ('row_id', 'INTEGER PRIMARY KEY AUTOINCREMENT'),
        ('table_id', 'VARCHAR(20)'),
        ('doc_id', 'VARCHAR(20)'),
        ('classification', 'VARCHAR(256)'),
        ('normalization', 'VARCHAR(1)'),
        ('error_accepted', 'VARCHAR(1)'),
        ('db_status', 'VARCHAR(1)'),
    ]
    # row_id is auto-generated, so it is left out of the insert column list.
    column_tup = tuple(name for name, _sql_type in column_list[1:])

    conn = qObj.create_connection(db_file)
    try:
        cur = conn.cursor()
        qObj.createLiteTable(conn, cur, '', table_name, column_list)

        # Remember the classification already stored for each table so that
        # re-inserting a row does not lose it.
        cur.execute('select table_id, classification from %s' % table_name)
        classification_by_table = {}
        for stored_table_id, stored_classification in cur.fetchall():
            classification_by_table[str(stored_table_id)] = str(
                stored_classification)

        data = []
        ids_to_delete = []
        for doc_id, table_id, _page_no, _g_id, _lng, _g_u in table_ids:
            classification = classification_by_table.get(table_id, '')
            data.append((table_id, doc_id, classification, 'Y', 'N', 'N'))
            ids_to_delete.append(table_id)

        # Bind the ids as parameters instead of pasting them into the SQL:
        # quoting by hand breaks on ids containing quotes, and SQLite reads
        # a double-quoted token as a column name when one matches.
        # NOTE(review): assumes cur is a sqlite3 cursor ('?' placeholders).
        # Chunked so one statement stays below SQLite's bound-variable
        # limit (999 in older builds).
        chunk_size = 500
        for start in range(0, len(ids_to_delete), chunk_size):
            chunk = ids_to_delete[start:start + chunk_size]
            placeholders = ', '.join('?' * len(chunk))
            cur.execute(
                'delete from %s where table_id in (%s)'
                % (table_name, placeholders),
                chunk)

        qObj.insertIntoLite(conn, cur, '', table_name, column_tup, data)
        conn.commit()
    finally:
        # Always release the connection, even when a statement above fails.
        conn.close()
    return 'done'
def get_comp_model(company_id):
    """Return the company name registered for ``company_id``.

    The name is the first element of the two-item entry stored under
    ``company_id`` in the mapping returned by ``getCN_MID()``.
    """
    from getCompanyName_machineId import getCN_MID

    lookup = getCN_MID()
    # Two-target unpacking keeps the check that the entry has two items.
    name, _machine = lookup[company_id]
    return name
def generate(company_id):
    """Write the tab-separated bounding-box report of one company.

    For every ``(doc_id, norm_table_id)`` pair returned by
    ``sObj.slt_normresids`` the cell map is computed in parallel through
    ``generate_map_ds``.  Each cell whose xml id encodes the same page number
    as its own bounding-box record is collected, de-duplicated, and written
    to ``/var/www/html/company_bbox/<company_name>.txt``.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is
            also the key into the mapping returned by ``getCN_MID()``.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when a matching cell has no page
            coordinates in ``cobj.get_adjustment_coordinates1``'s result.
    """
    doc_page_cord_dict = cobj.get_adjustment_coordinates1(company_id)
    from getCompanyName_machineId import getCN_MID

    company_name, _machine_id = getCN_MID()[company_id]
    project_id, url_id = company_id.split('_')

    all_doc_table_to_process = []
    for doc_tup in sObj.slt_normresids(project_id, url_id):
        doc_id, _page_number, norm_table_id = map(str, doc_tup)
        all_doc_table_to_process.append((doc_id, norm_table_id))

    # Workers receive the list index, not the tuple itself, exactly as the
    # job has always been dispatched to pprocess.
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 8)

    # {(pdf_name, page_number): {sec_type: [row, ...]}}
    bbox_index = {}
    # Mirrors the rows already stored so duplicates are found without
    # rescanning the row lists.
    seen = set()
    total = len(all_doc_table_to_process)
    for cnt, (ktup, rdict, celldata) in enumerate(res, 1):
        doc_id, table_id = ktup
        xml_sec_type_dict = get_cell_mdict(celldata)
        print([ktup, cnt, '/', total])
        for xml_id, c_ar in rdict.items():
            if not xml_id.strip():
                continue
            # The page number is the last '_' field of the part before '#'.
            page_number = xml_id.split('#')[0].split('_')[-1].strip()
            r, c, txt, sec_type = xml_sec_type_dict[xml_id]
            b_ar, page_n = c_ar
            if str(page_n) != page_number:
                continue

            dk = (doc_id + '.pdf', page_number)
            rows = bbox_index.setdefault(dk, {}).setdefault(sec_type, [])
            bb = _bbox_to_text(b_ar)
            pc = doc_page_cord_dict.get(doc_id, {}).get(page_number, '')
            if not pc:
                print([doc_id, table_id, page_number, xml_id, txt, pc])
                print('page cord error')
                # Non-zero status so callers can tell this abort is an error.
                sys.exit(1)

            dd = (table_id, r, c, txt, bb, pc)
            seen_key = (dk, sec_type, dd)
            if seen_key not in seen:
                seen.add(seen_key)
                rows.append(dd)

    out_dir = '/var/www/html/company_bbox/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    _write_bbox_report(os.path.join(out_dir, company_name + '.txt'),
                       bbox_index)


def _bbox_to_text(boxes):
    """Join each box's values with ``_`` and the boxes with ``$``."""
    return '$'.join('_'.join(map(str, box)) for box in boxes)


def _write_bbox_report(path, bbox_index):
    """Write the header line and one tab-separated line per collected row."""
    header = [
        'DOC_PDF', 'TABLE_ID', 'PAGE_NUMBER', 'SECTION_TYPE', 'ROW', 'COL',
        'TXT', 'BBOX(split by $ then split by _ )', 'PAGE_CORDS'
    ]
    # The context manager closes the file even if a row fails to serialise.
    with open(path, 'w') as fout:
        fout.write('\t'.join(header) + '\n')
        for (doc_pdf, page_number), sec_dict in bbox_index.items():
            for sec_type, rows in sec_dict.items():
                for table_id, r, c, txt, bb, pc in rows:
                    fields = [doc_pdf, table_id, page_number, sec_type,
                              r, c, txt, bb, pc]
                    fout.write('\t'.join(fields) + '\n')