import base64
import http.client
import json
import os
import pickle
import time
from itertools import combinations

import cv2
import xlsxwriter
from sklearn.ensemble import RandomForestClassifier

import dir_helper

# Project-local helpers used below (query_google_ocr, get_labels,
# get_image_info, trim_features, line_between, line_in_middle, get_tb_dt,
# get_lr_dt, record_features, get_classifier_score) are assumed to be
# defined elsewhere in this module or package.


def get_cell_label(cache_base, img_base, photo_file, box, zoom, sleep_delay):
    cache_path = cache_base + photo_file + '_' + '_'.join(
        [str(x) for x in box[:4]]) + '.json'
    if os.path.isfile(cache_path):
        # Reuse the cached OCR response for this cell if we have one.
        with open(cache_path, 'r') as cache_file:
            response = json.loads(cache_file.read())
    else:
        img = cv2.imread(img_base + photo_file)
        # Scale the box from unzoomed coordinates into the zoomed image.
        x1 = zoom * box[0]
        x2 = x1 + (zoom * box[2])
        y1 = zoom * box[1]
        y2 = y1 + (zoom * box[3])
        cell = img[y1:y2, x1:x2]
        retval, cell_buffer = cv2.imencode('.jpg', cell)
        image_content = base64.b64encode(cell_buffer).decode()
        response = query_google_ocr(image_content)
        time.sleep(sleep_delay)
        if 'responses' in response:
            dir_helper.ensure(cache_path)
            with open(cache_path, 'w') as cache_file:
                json.dump(response, cache_file)
        else:
            return ''
    return get_labels(response, combine=True)

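# A minimal usage sketch for get_cell_label. Every path, the photo name, and
# the box geometry here are hypothetical; the box is assumed to be
# (x, y, width, height) in unzoomed coordinates, since get_cell_label scales
# each value by `zoom` before cropping.
def _demo_get_cell_label():
    label = get_cell_label('cache/cells/', 'imgs/2x/', 'table_001.jpg',
                           (120, 40, 300, 25), 2, 1.0)
    print(label)
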
def get_json_data(image, base_path, zoom_level, pref, sleep_delay):
    zoom_prefix = str(zoom_level) + 'x/' if zoom_level > 1 else ''
    json_cache_file = (pref + json_cache_path + '/' + zoom_prefix + image +
                       '.json')
    if os.path.isfile(json_cache_file):
        with open(json_cache_file, 'r') as j_file:
            data = json.loads(j_file.read())
        # Reuse the cached response unless it recorded a 429 (rate-limited)
        # reply, in which case we retry the request below.
        if 'statusCode' not in data or data['statusCode'] != 429:
            return data
    with open(base_path + '/' + zoom_prefix + image, 'rb') as img_file:
        img_data = img_file.read()
    data = None
    while data is None:
        conn = None
        try:
            conn = http.client.HTTPSConnection('api.projectoxford.ai',
                                               timeout=10)
            conn.request("POST", "/vision/v1/ocr?%s" % params, img_data,
                         headers)
            response = conn.getresponse()
            data = response.read()
        except Exception as e:
            # Not every exception carries errno/strerror, so print the
            # exception itself and retry.
            print('OCR request failed, retrying: {0}'.format(e))
            data = None
        finally:
            # Close the connection whether or not the request succeeded.
            if conn is not None:
                conn.close()
                conn = None
    json_data = json.loads(
        data.decode('utf-8'))  # Need to double-check if utf-8 is correct
    dir_helper.ensure(json_cache_file)
    with open(json_cache_file, 'w') as json_file:
        json.dump(json_data, json_file)
    time.sleep(sleep_delay)
    return json_data

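# A plausible sketch of the module-level configuration get_json_data relies
# on (json_cache_path, params, headers). The header and query-parameter names
# follow the documented Project Oxford OCR request format, but the concrete
# values are assumptions and the subscription key is a placeholder.
import urllib.parse

json_cache_path = 'json_cache'  # hypothetical cache directory name
params = urllib.parse.urlencode({'language': 'unk',
                                 'detectOrientation': 'true'})
headers = {'Content-Type': 'application/octet-stream',
           'Ocp-Apim-Subscription-Key': '<subscription_key>'}
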
def setup():
    key_files = os.listdir(feature_path)
    train_labels = []
    test_labels = []
    train_set = []
    train_features = []
    test_features = []
    test_set = []
    for key_file in key_files:
        l, f = get_image_info(key_file)
        # Files go to the training set until it holds roughly 35,000 labels;
        # everything after that becomes the test set.
        if len(train_labels) < 35000:
            train_set.append(key_file)
            train_labels += l
            train_features += f
        else:
            test_set.append(key_file)
            test_labels += l
            test_features += f
    train_features = trim_features(train_features)
    test_features = trim_features(test_features)
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_features, train_labels)
    tn_count = 0
    tp_count = 0
    fn_count = 0
    fp_count = 0
    false_negs = []
    false_poss = []
    for (l, f) in zip(test_labels, test_features):
        pred = forest.predict([f])
        if l == 0:
            if pred[0] == 0:
                tn_count += 1
            else:
                fp_count += 1
                false_poss.append(f)
        else:
            if pred[0] == 1:
                tp_count += 1
            else:
                fn_count += 1
                false_negs.append(f)
    print('Accuracy: ' + str((tn_count + tp_count) * 1.0 / len(test_labels)))
    print('Recall: ' + str(tp_count * 1.0 / (tp_count + fn_count)))
    print('Precision: ' + str(tp_count * 1.0 / (tp_count + fp_count)))
    dir_helper.ensure(classifier_path)
    with open(classifier_path, 'wb') as f:
        pickle.dump(forest, f)
    dir_helper.ensure(train_set_path)
    with open(train_set_path, 'w') as f:
        f.write(' \n'.join(train_set))
    dir_helper.ensure(test_set_path)
    with open(test_set_path, 'w') as f:
        f.write(' \n'.join(test_set))
    print('Done')

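# A minimal sketch of reloading what setup() writes, assuming the same
# module-level classifier_path and train_set_path constants. The unpickled
# forest can then classify new feature vectors directly.
def _load_trained_forest():
    with open(classifier_path, 'rb') as f:
        forest = pickle.load(f)
    with open(train_set_path, 'r') as f:
        train_set = f.read().split()
    return forest, train_set
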
def output(rows, cols, boxes, xlsx_path, json_path):
    try:
        os.remove(xlsx_path)
    except OSError:
        pass
    book = None
    try:
        dir_helper.ensure(xlsx_path)
        book = xlsxwriter.Workbook(xlsx_path)
        sheet = book.add_worksheet()
        # Map each box index to the row and column indices it belongs to.
        indices = {}
        for i, row in enumerate(rows):
            for box in row[5]:
                idx = boxes.index(box)
                # Only create the entry once, so a box that spans several
                # rows keeps all of them. (This could be made more efficient
                # with a defaultdict.)
                if idx not in indices:
                    indices[idx] = {}
                if 'rows' not in indices[idx]:
                    indices[idx]['rows'] = []
                indices[idx]['rows'].append(i)
        for i, col in enumerate(cols):
            for box in col[5]:
                idx = boxes.index(box)
                # See note about efficiency above
                if 'cols' not in indices[idx]:
                    indices[idx]['cols'] = []
                indices[idx]['cols'].append(i)
        cells = [[[] for x in range(len(cols))] for y in range(len(rows))]
        for i, box in enumerate(boxes):
            sorted_rows = sorted(indices[i]['rows'])
            sorted_cols = sorted(indices[i]['cols'])
            for k, row_idx in enumerate(sorted_rows):
                for j, col_idx in enumerate(sorted_cols):
                    if k == 0 and j == 0:
                        # The top-left position holds the actual contents.
                        contents = {'type': 'cell', 'contents': box}
                    else:
                        # Every other position the box covers records a span
                        # pointing back at the main cell.
                        contents = {'type': 'span',
                                    'main_row': sorted_rows[0],
                                    'main_col': sorted_cols[0]}
                    cells[row_idx][col_idx].append(contents)
        out_arr = [['' for x in range(len(cols))] for y in range(len(rows))]
        for row_idx, row in enumerate(cells):
            for col_idx, cell in enumerate(row):
                overall = []
                for cell_info in cell:
                    if 'type' not in cell_info:
                        cell_info = {'type': 'unspecified'}
                    if cell_info['type'] == 'cell':
                        contents = ' '.join(cell_info['contents'][4])
                    elif cell_info['type'] == 'span':
                        # This can later be a special structure for the json
                        # including for the main cell, too
                        contents = ('SPAN_OF(' + str(cell_info['main_row']) +
                                    ', ' + str(cell_info['main_col']) + ')')
                    else:
                        contents = ''
                    overall.append(contents)
                display_contents = ' '.join(overall)
                sheet.write(row_idx, col_idx, display_contents)
                # Store for json output
                out_arr[row_idx][col_idx] = display_contents
        dir_helper.ensure(json_path)
        with open(json_path, 'w') as f:
            json.dump({'cells': out_arr}, f)
    finally:
        # Guard against the workbook never having been created.
        if book is not None:
            book.close()

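# A worked example of the span handling in output(). The inputs are
# hypothetical: a box is assumed to be (x, y, w, h, [label tokens]), and the
# row/column structures are assumed to keep their member boxes at index 5,
# which is how output() reads them above.
def _demo_output():
    box = (0, 0, 10, 20, ['hello', 'world'])
    rows = [[0, 0, 10, 10, None, [box]],   # the box spans both rows...
            [0, 10, 10, 10, None, [box]]]
    cols = [[0, 0, 10, 20, None, [box]]]   # ...but only one column
    output(rows, cols, [box], 'demo.xlsx', 'demo.json')
    # demo.json then holds {"cells": [["hello world"], ["SPAN_OF(0, 0)"]]}
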
def score_boxes(boxes, lines, feature_file, dist_imgs):
    box_scores = [[0.0 for box in boxes] for box in boxes]
    # Clear the feature file up front, since record_features appends.
    if should_record_features:
        dir_helper.ensure(feature_file)
        open(feature_file, 'w').close()
    with open('classifier.pkl', 'rb') as f:
        classifier = pickle.load(f)
    for comb in combinations(enumerate(boxes), 2):
        i = comb[0][0]
        j = comb[1][0]
        box_1 = comb[0][1]
        box_2 = comb[1][1]
        scores = {}
        features = {}
        # 1. Higher score the closer together they are,
        # horizontally and vertically (percentage and flat)
        horiz_dist = max(0, max(box_1[0], box_2[0]) -
                         min(box_1[0] + box_1[2], box_2[0] + box_2[2]))
        vert_dist = max(0, max(box_1[1], box_2[1]) -
                        min(box_1[1] + box_1[3], box_2[1] + box_2[3]))
        min_horiz_range = min(box_1[2], box_2[2])
        min_vert_range = min(box_1[3], box_2[3])
        dist = (horiz_dist ** 2 + vert_dist ** 2) ** 0.5
        min_range = (min_horiz_range ** 2 + min_vert_range ** 2) ** 0.5
        scores['dist_pix'] = 1.0 / (1.0 + dist)
        scores['dist_perc'] = 1.0 / (1.0 + (dist * 1.0 / min_range))
        features['dist_pix'] = dist
        features['dist_perc'] = dist * 1.0 / min_range
        # 2. Higher score if they overlap (horizontally and vertically),
        # both percentage and flat
        horiz_over = max(0, min(box_1[0] + box_1[2], box_2[0] + box_2[2]) -
                         max(box_1[0], box_2[0]))
        vert_over = max(0, min(box_1[1] + box_1[3], box_2[1] + box_2[3]) -
                        max(box_1[1], box_2[1]))
        scores['overlap_pix'] = 1.0 * horiz_over * vert_over
        scores['overlap_perc'] = (1.0 * horiz_over * vert_over /
                                  (min_horiz_range * min_vert_range))
        features['overlap_pix'] = horiz_over * vert_over
        features['overlap_perc'] = scores['overlap_perc']
        # Check for a horizontal line between the boxes
        scores['no_div_horiz_line'] = 0.0 if line_between(box_1, box_2, lines, 1) else 1.0
        features['no_div_horiz_line'] = 0 if line_between(box_1, box_2, lines, 1) else 1
        # Check for a vertical line between the boxes
        scores['no_div_vert_line'] = 0.0 if line_between(box_1, box_2, lines, 0) else 1.0
        features['no_div_vert_line'] = 0 if line_between(box_1, box_2, lines, 0) else 1
        # Additional features:
        # Ratio of areas
        box_1_area = box_1[2] * box_1[3]
        box_2_area = box_2[2] * box_2[3]
        features['ratio_of_areas_1'] = box_1_area * 1.0 / box_2_area
        features['ratio_of_areas_2'] = box_2_area * 1.0 / box_1_area
        # Area of each box
        features['box_1_area'] = box_1_area
        features['box_2_area'] = box_2_area
        # Left-to-right and right-to-left edge distances
        features['ltr_dist'] = min(abs(box_1[0] - (box_2[0] + box_2[2])),
                                   abs(box_2[0] - (box_1[0] + box_1[2])))
        # ...and top-to-bottom and bottom-to-top
        features['ttb_dist'] = min(abs(box_1[1] - (box_2[1] + box_2[3])),
                                   abs(box_2[1] - (box_1[1] + box_1[3])))
        # Line in between, ignoring overlap
        features['no_horiz_line_in_middle'] = 0 if line_in_middle(box_1, box_2, lines, 1) else 1
        features['no_vert_line_in_middle'] = 0 if line_in_middle(box_1, box_2, lines, 0) else 1
        # Distance between box centers along each axis
        features['dist_bw_vert_centers'] = abs(
            (box_1[0] + box_1[0] + box_1[2]) -
            (box_2[0] + box_2[0] + box_2[2])) / 2.0
        features['dist_bw_horiz_centers'] = abs(
            (box_1[1] + box_1[1] + box_1[3]) -
            (box_2[1] + box_2[1] + box_2[3])) / 2.0
        # Distance between left edges and between top edges
        features['dist_bw_lefts'] = abs(box_1[0] - box_2[0])
        features['dist_bw_tops'] = abs(box_1[1] - box_2[1])
        # Distance between right edges and between bottom edges
        features['dist_bw_rights'] = abs((box_1[0] + box_1[2]) - (box_2[0] + box_2[2]))
        features['dist_bw_bottoms'] = abs((box_1[1] + box_1[3]) - (box_2[1] + box_2[3]))
        # Distance transform features
        features['dist_trans_tb_scaled'] = get_tb_dt(dist_imgs[1], box_1, box_2)
        features['dist_trans_lr_scaled'] = get_lr_dt(dist_imgs[1], box_1, box_2)
        score = get_classifier_score(features, classifier)
        record_features(box_1, box_2, features, feature_file)
        box_scores[i][j] = score
        box_scores[j][i] = score
    return box_scores

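# A worked example of the distance features above, for two hypothetical
# (x, y, w, h) boxes: box_1 = (0, 0, 10, 10) and box_2 = (14, 0, 10, 10).
# horiz_dist = max(0, 14 - 10) = 4 and vert_dist = 0, so dist = 4.0 and
# min_range = sqrt(10**2 + 10**2) ~= 14.14; the resulting scores are
# dist_pix = 1 / (1 + 4) = 0.2 and dist_perc = 1 / (1 + 4 / 14.14) ~= 0.78.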