def start_grid_wise_multiple_doc_wise_v1_grid(self, docs, ddocs, company, table_type, project, order_doc):
    """Search every target doc in `ddocs` for the labels extracted from `docs`
    and return the per-doc top grid suggestions, ordered by `order_doc`.

    docs       -- source documents whose labels drive the redis queries
    ddocs      -- iterable of target doc ids to search against
    company    -- company key, forwarded to the saved-suggestion lookup
    table_type -- when truthy, previously saved grid suggestions are appended
    project    -- project key shaped '<name>__<db_string>' (db from suffix)
    order_doc  -- {doc_id: rank} for the final ordering (unknown ranks -> 999)
    """
    search_result = {}
    obj = get_info.pdf_cloud_db(self.config_path)
    label_result_dic = self.get_labels(docs, obj)
    # FIX: redis connection settings are loop-invariant -- parse them once
    # instead of re-reading the config for every target doc.
    si, sp, sdb = self.config.get('redis_search', 'storage').split('##')
    for ddoc in ddocs:
        ddoc_key = project + "_GRID_" + ddoc
        dredis_obj = redis_search.TAS_AutoCompleter(si, sp, sdb, ddoc_key)
        for query, cnt in label_result_dic.items():
            try:
                # Accumulates per-doc scores into search_result keyed by ddoc.
                dredis_obj.search_query_convert_docs_wise_v1(query, search_result, ddoc, 1)
            except Exception:
                # Best-effort: one malformed query must not abort the scan.
                pass
    new_results = []
    for doc_id in ddocs:
        results = search_result.get(doc_id, {})
        sorted_x = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
        # The db string is encoded in the project key as '<name>__<db_string>'.
        db_string = project.split('__')[1]
        last_three = sorted_x[0:5]  # top 5 grids by score (name is historical)
        # FIX: materialize as a list so membership tests below are explicit
        # (identical in py2; avoids the exhausted-iterator trap under py3).
        grid_avail = list(map(lambda a: a[0], last_three))
        if table_type:
            save_obj = save_mgmt1.save_mgmt("/var/www/cgi-bin/INC_Interface/pysrc_20_19_20/Config.ini")
            all_grids = save_obj.get_only_suggestion(doc_id, company, db_string, table_type)
            for gg in all_grids:
                if gg not in grid_avail:
                    last_three.append((gg, 0))  # saved suggestion, zero score
        new_results.append((int(doc_id), last_three))
    new_results.sort()  # stable tie-break before the order_doc sort
    dd = sorted(new_results, key=lambda x: order_doc.get(x[0], 999))
    import doc_stats  # local import kept from the original
    doc_stats_obj = doc_stats.stats()
    return doc_stats_obj.convert_scop_grid_format(dd, ddocs, project)
def start_grid_wise_response(self, doc_id, db_string, ProjectID, workspace_id, ddoc, page, grid):
    """Look up every 'hch' cell of one grid in the ddoc search index.

    Returns [matches, match_counts] where matches maps each escaped cell
    value to its raw search hits and match_counts maps it to len(hits);
    returns ["Data Not Found"] when the grid has no stored data.
    """
    db = get_info.pdf_cloud_db(self.config_path)
    autocompleter = redis_search.TAS_AutoCompleter('172.16.20.7', '6382', '0', "GRID_" + ddoc)
    grid_json = json.loads(db.getTableInfoSql(db_string, doc_id, page, grid))
    if not grid_json:
        return ["Data Not Found"]
    hits_by_value = {}
    for cell_key, cell in grid_json['data'].items():
        # Only header-channel ('hch') cells are searched.
        if cell.get('ldr', '') != 'hch':
            continue
        raw_value = cell.get('data', '')
        if not raw_value:
            continue
        escaped = autocompleter.StringEscape(raw_value)
        query = '@DATA:"%s"' % escaped
        try:
            hits_by_value[escaped] = autocompleter.search_query_convert_result(query)
        except Exception as e:
            print [e, query]
    counts = {k: len(v) for k, v in hits_by_value.items()}
    return [hits_by_value, counts]
def start_grid_wise(self, doc_id, db_string, ProjectID, workspace_id, ddoc, page, grid, project):
    """Score one grid's 'hch' cells against the ddoc index and return the
    top-5 (value, score) pairs, or ["Data Not Found"] when the grid is empty.
    """
    obj = get_info.pdf_cloud_db(self.config_path)
    search_result = {}
    ddoc_key = "GRID_" + ddoc
    # BUG FIX: self.config is a config object, not a callable --
    # `self.config(project, 'suser_storage')` raised TypeError. Use
    # .get(), matching the sibling *_contains method.
    ci, cp, cg = self.config.get(project, 'suser_storage').split('##')
    dredis_obj = redis_search.TAS_AutoCompleter(ci, cp, cg, ddoc_key)
    data = obj.getTableInfoSql(db_string, doc_id, page, grid)
    data = json.loads(data)
    if not data:
        return ["Data Not Found"]
    for rkeys, rdic in data['data'].items():
        section_type = rdic.get('ldr', '')
        if section_type and section_type == 'hch':
            values = rdic.get('data', '')
            if not values:
                continue
            clean_values = dredis_obj.StringEscape(values)
            query = '@DATA:"%s"' % clean_values
            try:
                get_alltext = dredis_obj.search_query_convert(query, search_result)
            except Exception as e:
                print [e, query]
    sorted_x = sorted(search_result.items(), key=operator.itemgetter(1), reverse=True)
    last_three = sorted_x[0:5]  # top five despite the name
    return last_three
def start(self, companies, project): obj = get_info.pdf_cloud_db(self.config_path) #doc_idddd = ['1189'] for companydic in companies: company = companydic #company = companydic['id'] #if company not in ['HyundaiEngineeringandConstructionCoLtd']:continue #if company not in ['BOCHongKongHoldingsLtd']:continue all_docs = obj.get_company_info_cmp(company, project) obj = get_info.pdf_cloud_db(self.config_path) for docs in all_docs: #print docs doc_id = docs['doc_id'] #print doc_id #if doc_id != "13953":continue #print doc_id #if doc_id not in doc_idddd:continue db_string = docs['db_string'] page_info = self.get_page_info(doc_id, db_string) res = [] for page, grids in page_info.items(): for grid in grids: try: data = obj.get_row_col_db_info( db_string, doc_id, page, grid, project) except: print 'grid Error', [doc_id, page, grid, project] continue if not data: continue if 'data' not in data: continue for rkeys, rdic in data['data'].items(): r, c = rkeys.split('_') section_type = rdic.get('ldr', '') if (type(section_type) == type([])): section_type = ''.join(section_type) res.append([ rdic.get('data', ''), section_type, str(doc_id), str(page), str(grid), str(rkeys), rdic.get('bbox', '') ]) vdata = self.start_indexing(res, doc_id, company, project) print[doc_id, page, vdata]
def start_descrption_wise(self, data, ddoc):
    """Score each description string in `data` against the ddoc grid index
    and return the five highest-scoring (value, score) pairs."""
    obj = get_info.pdf_cloud_db(self.config_path)  # kept: original opens a handle here
    scores = {}
    completer = redis_search.TAS_AutoCompleter('172.16.20.7', '6382', '0', "GRID_" + ddoc)
    for raw in data:
        term = completer.StringEscape(raw)
        try:
            # Accumulates match scores into `scores`.
            completer.search_query_convert('@DATA:"%s"' % term, scores)
        except Exception:
            pass  # best-effort: skip unsearchable values
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:5]
def start_grid_wise_multiple_doc_wise_contains(self, docs, ddocs, company, table_type, project):
    """For every source grid in `docs`, search its 'hch' cell values against
    each target doc in `ddocs`; return [(doc_id, top_grids), ...] sorted by
    doc id, with saved suggestions appended at score 0.
    """
    search_result = {}
    # FIX: the DB handle and redis settings were rebuilt inside the inner
    # loop on every iteration; both are invariant -- build them once.
    obj = get_info.pdf_cloud_db(self.config_path)
    si, sp, sdb = self.config.get(project, 'suser_storage').split('##')
    for ddoc in ddocs:
        dredis_obj = redis_search.TAS_AutoCompleter(si, sp, sdb, project + "_GRID_" + ddoc)
        for vjson in docs:
            db_string = vjson.get("db_string", "")
            doc_id = vjson['doc_id']
            page = vjson['pageno']
            grid = vjson['groupid']
            ProjectID = vjson['ProjectID']
            workspace_id = vjson['workspace_id']
            data = obj.getTableInfoSql(db_string, doc_id, page, grid)
            if not data:
                continue
            data = json.loads(data)
            if not data:
                return ["Data Not Found"]
            for rkeys, rdic in data['data'].items():
                section_type = rdic.get('ldr', '')
                if section_type and section_type == 'hch':
                    values = rdic.get('data', '')
                    if not values:
                        continue
                    clean_values = dredis_obj.StringEscape(values)
                    query = '@DATA:"%s"' % clean_values
                    try:
                        dredis_obj.search_query_convert_docs_wise_v1(query, search_result, ddoc)
                    except Exception:
                        pass  # best-effort: skip unsearchable values
    new_results = []
    for doc_id, results in search_result.items():
        sorted_x = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
        db_string = self.config.get(project, 'pdf_cloud_data_db')
        last_three = sorted_x[0:5]  # top five despite the name
        grid_avail = list(map(lambda a: a[0], last_three))
        # FIX: removed a dead obj.get_page_info_cval() call whose result
        # was immediately overwritten by the saved-suggestion lookup below.
        save_obj = save_mgmt1.save_mgmt("/var/www/cgi-bin/INC_Interface/pysrc_08_07_19/Config.ini")
        # NOTE(review): `ddoc` here is the last value of the loop above --
        # only clearly correct when len(ddocs) == 1; confirm intent.
        all_grids = save_obj.get_only_suggestion(ddoc, company, db_string, table_type)
        for gg in all_grids:
            if gg not in grid_avail:
                last_three.append((gg, 0))  # saved suggestion, zero score
        new_results.append((int(doc_id), last_three))
    new_results.sort()
    return new_results
def start_grid_wise_multiple_doc_wise_v1test(self,docs,ddocs,company,table_type,project,order_doc):#doc_id,db_string,ProjectID,workspace_id,ddoc,page,grid):
    # Debug/test variant of start_grid_wise_multiple_doc_wise_v1_grid:
    # same label search + ranking pipeline, but prints intermediates and
    # always returns [] (the computed `dd` is deliberately discarded).
    search_result = {}
    obj = get_info.pdf_cloud_db(self.config_path)
    # Labels extracted from the source docs drive the redis queries.
    label_result_dic = self.get_labels(docs, obj)
    print label_result_dic
    for ddoc in ddocs:
        ddoc_key = project+"_GRID_"+ddoc
        #print 'tt', ddoc_key
        si,sp,sdb = self.config.get('redis_search','storage').split('##')
        dredis_obj = redis_search.TAS_AutoCompleter(si,sp,sdb, ddoc_key)
        for query, cnt in label_result_dic.items():
            #print 'query', query
            try:
                # Accumulates per-doc scores into search_result keyed by ddoc.
                get_alltext = dredis_obj.search_query_convert_docs_wise_v1(query, search_result, ddoc, 1)
            except Exception as e:
                #print [e, query]
                pass
    new_results = []
    for doc_id,results in search_result.items():
        #print results
        sorted_x = sorted(results.items(), key=operator.itemgetter(1),reverse=True)
        db_string = self.config.get(project,'pdf_cloud_data_db') #, 'value')
        last_three = sorted_x[0:5]
        grid_avail = map(lambda a: a[0], last_three)
        #all_grids = obj.get_page_info_cval(db_string,ddoc)
        if table_type:
            save_obj = save_mgmt1.save_mgmt("/var/www/cgi-bin/INC_Interface/pysrc_08_07_19/Config.ini")
            # NOTE(review): looks up suggestions for `ddoc` (last loop value),
            # unlike the _grid variant which uses `doc_id` -- confirm.
            all_grids = save_obj.get_only_suggestion(ddoc,company,db_string,table_type)
            print ',,,,,,,,', all_grids
            #all_grids = {}
            for gg in all_grids:
                if gg not in grid_avail:
                    last_three.append((gg,0))
        new_results.append((int(doc_id),last_three))
    print new_results
    new_results.sort()
    dd = sorted(new_results,key=lambda x: order_doc.get(x[0],999))
    # NOTE(review): `dd` is computed but discarded -- this debug variant
    # returns an empty list; confirm before relying on its return value.
    return [] #dd #new_results
def start_doc_id_wise(self, db_string, doc_id, project_id, workspace_id, company):
    """Extract every cell of a single document and index it.

    Walks all pages/grids of `doc_id`, flattens each cell into
    [data, section_type, doc_id, page, grid, row_col, bbox] rows and
    hands them to self.start_indexing(). Always returns ["Done"].
    """
    db = get_info.pdf_cloud_db(self.config_path)
    rows = []
    for page, grid_list in self.get_page_info(doc_id, db_string).items():
        for grid in grid_list:
            grid_data = db.get_row_col_db_info(db_string, doc_id, page, grid)
            for cell_key, cell in grid_data['data'].items():
                r, c = cell_key.split('_')
                ldr = cell.get('ldr', '')
                if (type(ldr) == type([])):
                    ldr = ''.join(ldr)  # list-valued section types are concatenated
                rows.append([
                    cell.get('data', ''), ldr,
                    str(doc_id), str(page), str(grid),
                    str(cell_key), cell.get('bbox', '')
                ])
    vdata = self.start_indexing(rows, doc_id, company)
    return ["Done"]
def get_page_info(self, doc_id, db_string):
    """Return the page -> grids mapping for `doc_id` from `db_string`."""
    return get_info.pdf_cloud_db(self.config_path).get_page_info(db_string, doc_id)
def get_companies(self, project):
    """Fetch every company registered for `project` and index them all."""
    company_list = get_info.pdf_cloud_db(self.config_path).get_company_info(project)
    self.start(company_list, project)