Example #1
# Standard-library and OpenCV imports this snippet relies on; dir_helper,
# query_google_ocr, and get_labels are project-local helpers.
import base64
import json
import os
import time

import cv2


def get_cell_label(cache_base, img_base, photo_file, box, zoom, sleep_delay):
    cache_path = cache_base + photo_file + '_' + '_'.join(
        [str(x) for x in box[:4]]) + '.json'

    # Serve a previously cached OCR response when one exists.
    if os.path.isfile(cache_path):
        with open(cache_path, 'r') as cache_file:
            response = json.loads(cache_file.read())
    else:
        # Scale the box coordinates up to the zoomed image and crop the cell.
        img = cv2.imread(img_base + photo_file)
        x1 = zoom * box[0]
        x2 = x1 + (zoom * box[2])
        y1 = zoom * box[1]
        y2 = y1 + (zoom * box[3])

        cell = img[y1:y2, x1:x2]

        # Encode the crop as JPEG, then base64 it for the OCR request body.
        success, cell_buffer = cv2.imencode('.jpg', cell)

        image_content = base64.b64encode(cell_buffer).decode()

        response = query_google_ocr(image_content)

        # Throttle requests to stay under the API's rate limit.
        time.sleep(sleep_delay)

        # Only cache well-formed responses; bail out on anything else.
        if 'responses' in response:
            dir_helper.ensure(cache_path)
            with open(cache_path, 'w') as cache_file:
                json.dump(response, cache_file)
        else:
            return ''

    return get_labels(response, combine=True)
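
A minimal sketch of how this might be invoked, assuming a cache/ directory, a 2x-zoomed rendering of the photo under imgs/, and a box given as (x, y, width, height) in unzoomed coordinates; the file names and values here are hypothetical:

# Hypothetical call: crop the cell at (120, 48) sized 260x32 from the 2x
# rendering of table_001.jpg, OCR it, and pause 1s after the API call.
label = get_cell_label('cache/', 'imgs/2x/', 'table_001.jpg',
                       (120, 48, 260, 32), 2, 1.0)
print(label)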
Example #2
# http.client is used for the raw HTTPS request; json_cache_path, params,
# headers, and dir_helper are module-level names defined elsewhere.
import http.client
import json
import os
import time


def get_json_data(image, base_path, zoom_level, pref, sleep_delay):
    zoom_prefix = str(zoom_level) + 'x/' if zoom_level > 1 else ''
    json_cache_file = pref + json_cache_path + '/' + zoom_prefix + image + '.json'

    # Reuse the cached response unless it recorded a 429 (rate-limit) error.
    if os.path.isfile(json_cache_file):
        with open(json_cache_file, 'r') as j_file:
            data = json.loads(j_file.read())

        if 'statusCode' not in data or data['statusCode'] != 429:
            return data

    with open(base_path + '/' + zoom_prefix + image, 'rb') as img_file:
        img_data = img_file.read()

    data = None

    # Retry until a response is read; note this loops indefinitely, and
    # without backoff, if the endpoint stays unreachable.
    while data is None:
        conn = None
        try:
            conn = http.client.HTTPSConnection('api.projectoxford.ai',
                                               timeout=10)
            conn.request("POST", "/vision/v1/ocr?%s" % params, img_data,
                         headers)
            response = conn.getresponse()
            data = response.read()
        except Exception as e:
            # A bare Exception has no errno/strerror attributes, so print
            # the exception itself.
            print("Request failed: {0}".format(e))
            data = None
        finally:
            # Close the connection on both the success and failure paths.
            if conn is not None:
                conn.close()

    # JSON over HTTP is UTF-8 by default, so this decode is correct.
    json_data = json.loads(data.decode('utf-8'))

    dir_helper.ensure(json_cache_file)
    with open(json_cache_file, 'w') as json_file:
        json.dump(json_data, json_file)

    time.sleep(sleep_delay)

    return json_data
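
A sketch of a typical call, on the assumptions that the module-level params and headers already carry a valid Project Oxford subscription key, that a 2x rendering of the image exists under the base path, and that the v1 OCR response exposes a regions list; names and values are hypothetical:

# Hypothetical call: OCR the 2x rendering of table_001.jpg, reusing the
# JSON cache when possible and sleeping 1s after any fresh request.
ocr = get_json_data('table_001.jpg', 'imgs', 2, 'out/', 1.0)
for region in ocr.get('regions', []):
    print(region['boundingBox'])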
Example #3
# os, pickle, and scikit-learn are required; feature_path, classifier_path,
# train_set_path, test_set_path, dir_helper, get_image_info, and
# trim_features are defined elsewhere in the project.
import os
import pickle

from sklearn.ensemble import RandomForestClassifier


def setup():
  key_files = os.listdir(feature_path)

  train_labels = []
  train_features = []
  train_set = []

  test_labels = []
  test_features = []
  test_set = []

  # Greedily fill the training split until it holds ~35,000 samples; the
  # remaining files become the test split.
  for key_file in key_files:
    labels, feats = get_image_info(key_file)

    if len(train_labels) < 35000:
      train_set.append(key_file)
      train_labels += labels
      train_features += feats
    else:
      test_set.append(key_file)
      test_labels += labels
      test_features += feats

  train_features = trim_features(train_features)
  test_features = trim_features(test_features)

  forest = RandomForestClassifier(n_estimators=100)
  forest.fit(train_features, train_labels)

  # Tally the confusion matrix on the held-out split.
  tn_count = 0
  tp_count = 0
  fn_count = 0
  fp_count = 0

  # Misclassified feature vectors are collected for later inspection.
  false_negs = []
  false_poss = []
  for label, feats in zip(test_labels, test_features):
    pred = forest.predict([feats])

    if label == 0:
      if pred[0] == 0:
        tn_count += 1
      else:
        fp_count += 1
        false_poss.append(feats)
    else:
      if pred[0] == 1:
        tp_count += 1
      else:
        fn_count += 1
        false_negs.append(feats)

  print('Accuracy: ' + str((tn_count + tp_count) * 1.0 / len(test_labels)))
  print('Recall: ' + str(tp_count * 1.0 / (tp_count + fn_count)))
  print('Precision: ' + str(tp_count * 1.0 / (tp_count + fp_count)))

  # Persist the model and the exact train/test split for reproducibility.
  dir_helper.ensure(classifier_path)
  with open(classifier_path, 'wb') as f:
    pickle.dump(forest, f)

  dir_helper.ensure(train_set_path)
  with open(train_set_path, 'w') as f:
    f.write(' \n'.join(train_set))

  dir_helper.ensure(test_set_path)
  with open(test_set_path, 'w') as f:
    f.write(' \n'.join(test_set))

  print('Done')
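
Once setup() has run, the pickled forest can be reloaded for scoring elsewhere in the pipeline (score_boxes in Example #5 below does exactly this with its classifier.pkl); a minimal sketch, where feature_vector stands in for one trimmed feature row:

import pickle

# Reload the forest trained by setup(); classifier_path is the same
# module-level path used above, and feature_vector is a placeholder.
with open(classifier_path, 'rb') as f:
  forest = pickle.load(f)

print(forest.predict([feature_vector])[0])  # 0 or 1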
Example #4
# json, os, and xlsxwriter are required; dir_helper is a project-local
# helper.
import json
import os

import xlsxwriter


def output(rows, cols, boxes, xlsx_path, json_path):
  # Remove any stale workbook from a previous run.
  try:
    os.remove(xlsx_path)
  except OSError:
    pass

  book = None
  try:
    dir_helper.ensure(xlsx_path)
    book = xlsxwriter.Workbook(xlsx_path)
    sheet = book.add_worksheet()

    # Map each box's index to the row and column indices it occupies.
    indices = {}

    for i, row in enumerate(rows):
      for box in row[5]:
        idx = boxes.index(box)

        # setdefault preserves earlier entries when a box spans several rows
        # (re-assigning indices[idx] = {} here would drop them); a
        # defaultdict would make this tidier still.
        indices.setdefault(idx, {}).setdefault('rows', []).append(i)

    for i, col in enumerate(cols):
      for box in col[5]:
        idx = boxes.index(box)
        indices[idx].setdefault('cols', []).append(i)

    cells = [[[] for x in range(len(cols))] for y in range(len(rows))]

    # The top-left cell of each (possibly spanning) box gets the contents;
    # every other cell it covers points back to that main cell.
    for i, box in enumerate(boxes):
      sorted_rows = sorted(indices[i]['rows'])
      sorted_cols = sorted(indices[i]['cols'])
      for k, row_idx in enumerate(sorted_rows):
        for j, col_idx in enumerate(sorted_cols):
          if k == 0 and j == 0:
            contents = {'type': 'cell', 'contents': box}
          else:
            contents = {'type': 'span', 'main_row': sorted_rows[0], 'main_col': sorted_cols[0]}

          cells[row_idx][col_idx].append(contents)

    out_arr = [['' for x in range(len(cols))] for y in range(len(rows))]

    for row_idx, row in enumerate(cells):
      for col_idx, cell in enumerate(row):
        overall = []
        for cell_info in cell:
          if 'type' not in cell_info:
            cell_info = {'type': 'unspecified'}

          if cell_info['type'] == 'cell':
            # box[4] holds the cell's OCR tokens.
            contents = ' '.join(cell_info['contents'][4])
          elif cell_info['type'] == 'span':
            # This can later be a special structure for the json,
            # including for the main cell, too.
            contents = 'SPAN_OF(' + str(cell_info['main_row']) + ', ' + str(cell_info['main_col']) + ')'
          else:
            contents = ''

          overall.append(contents)

        display_contents = ' '.join(overall)
        sheet.write(row_idx, col_idx, display_contents)

        # Store for json output.
        out_arr[row_idx][col_idx] = display_contents

    dir_helper.ensure(json_path)
    with open(json_path, 'w') as f:
      json.dump({'cells': out_arr}, f)

  finally:
    # Guard against Workbook() itself having failed.
    if book is not None:
      book.close()
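
The expected shapes can be read off the function body: a box is indexed up to position 4 (its OCR tokens), and each row or column is indexed at position 5 (its member boxes). A tiny synthetic call under those assumptions:

# One box occupying row 0 / column 0; fields 0-3 are x, y, w, h and field 4
# the OCR tokens. Rows and columns carry their member boxes in field 5.
box = (10, 10, 80, 20, ['Total'])
rows = [(0, 0, 0, 0, 0, [box])]
cols = [(0, 0, 0, 0, 0, [box])]
output(rows, cols, [box], 'out/table.xlsx', 'out/table.json')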
Example #5
# pickle and itertools are required; should_record_features, dir_helper,
# line_between, line_in_middle, get_tb_dt, get_lr_dt, get_classifier_score,
# and record_features are defined elsewhere in the project.
import pickle
from itertools import combinations


def score_boxes(boxes, lines, feature_file, dist_imgs):
  box_scores = [[0.0 for _ in boxes] for _ in boxes]

  # Truncate the feature file, since record_features appends to it below.
  if should_record_features:
    dir_helper.ensure(feature_file)

    open(feature_file, 'w').close()

  with open('classifier.pkl', 'rb') as f:
    classifier = pickle.load(f)

  for (i, box_1), (j, box_2) in combinations(enumerate(boxes), 2):
    scores = {}
    features = {}

    # 1. Higher score the closer together they are, horizontally and
    # vertically (percentage and flat). Boxes are (x, y, width, height), so
    # box[0] + box[2] is the right edge and box[1] + box[3] the bottom edge.
    horiz_dist = max(0, max(box_1[0], box_2[0]) - min(box_1[0] + box_1[2], box_2[0] + box_2[2]))
    vert_dist = max(0, max(box_1[1], box_2[1]) - min(box_1[1] + box_1[3], box_2[1] + box_2[3]))

    min_horiz_range = min(box_1[2], box_2[2])
    min_vert_range = min(box_1[3], box_2[3])

    dist = (horiz_dist ** 2 + vert_dist ** 2) ** 0.5
    min_range = (min_horiz_range ** 2 + min_vert_range ** 2) ** 0.5

    scores['dist_pix'] = 1.0 / (1.0 + dist)
    scores['dist_perc'] = 1.0 / (1.0 + (dist * 1.0 / min_range))
    features['dist_pix'] = dist
    features['dist_perc'] = dist * 1.0 / min_range

    # 2. Higher score if they overlap, horizontally and vertically,
    # both percentage and flat.
    horiz_over = max(0, min(box_1[0] + box_1[2], box_2[0] + box_2[2]) - max(box_1[0], box_2[0]))
    vert_over = max(0, min(box_1[1] + box_1[3], box_2[1] + box_2[3]) - max(box_1[1], box_2[1]))

    scores['overlap_pix'] = 1.0 * horiz_over * vert_over
    scores['overlap_perc'] = 1.0 * horiz_over * vert_over / (min_horiz_range * min_vert_range)
    features['overlap_pix'] = horiz_over * vert_over
    features['overlap_perc'] = scores['overlap_perc']

    # Check for a horizontal line between the boxes
    scores['no_div_horiz_line'] = 0.0 if line_between(box_1, box_2, lines, 1) else 1.0
    features['no_div_horiz_line'] = 0 if line_between(box_1, box_2, lines, 1) else 1

    # Check for a vertical line between the boxes
    scores['no_div_vert_line'] = 0.0 if line_between(box_1, box_2, lines, 0) else 1.0
    features['no_div_vert_line'] = 0 if line_between(box_1, box_2, lines, 0) else 1

    # Additional features:
    # Ratio of areas, in both directions
    box_1_area = box_1[2] * box_1[3]
    box_2_area = box_2[2] * box_2[3]

    features['ratio_of_areas_1'] = box_1_area * 1.0 / box_2_area
    features['ratio_of_areas_2'] = box_2_area * 1.0 / box_1_area

    # Raw areas of each box
    features['box_1_area'] = box_1_area
    features['box_2_area'] = box_2_area

    # Left-to-right and top-to-bottom edge distances (note the second term
    # of each min() has no abs(), so it goes negative when the boxes
    # overlap on that axis)
    features['ltr_dist'] = min(abs(box_1[0] - (box_2[0] + box_2[2])), box_2[0] - (box_1[0] + box_1[2]))
    features['ttb_dist'] = min(abs(box_1[1] - (box_2[1] + box_2[3])), box_2[1] - (box_1[1] + box_1[3]))

    # Lines in between, ignoring overlap
    features['no_horiz_line_in_middle'] = 0 if line_in_middle(box_1, box_2, lines, 1) else 1
    features['no_vert_line_in_middle'] = 0 if line_in_middle(box_1, box_2, lines, 0) else 1

    # Distance between centers: |(2*x1 + w1) - (2*x2 + w2)| / 2 is the gap
    # between the x-centers, and likewise for the y-centers.
    features['dist_bw_vert_centers'] = abs((box_1[0] + box_1[0] + box_1[2]) - (box_2[0] + box_2[0] + box_2[2])) / 2.0
    features['dist_bw_horiz_centers'] = abs((box_1[1] + box_1[1] + box_1[3]) - (box_2[1] + box_2[1] + box_2[3])) / 2.0

    # Distance between left edges and between tops
    features['dist_bw_lefts'] = abs(box_1[0] - box_2[0])
    features['dist_bw_tops'] = abs(box_1[1] - box_2[1])

    # Distance between right edges and between bottoms
    features['dist_bw_rights'] = abs((box_1[0] + box_1[2]) - (box_2[0] + box_2[2]))
    features['dist_bw_bottoms'] = abs((box_1[1] + box_1[3]) - (box_2[1] + box_2[3]))

    # Distance-transform features
    features['dist_trans_tb_scaled'] = get_tb_dt(dist_imgs[1], box_1, box_2)
    features['dist_trans_lr_scaled'] = get_lr_dt(dist_imgs[1], box_1, box_2)

    score = get_classifier_score(features, classifier)
    record_features(box_1, box_2, features, feature_file)

    box_scores[i][j] = score
    box_scores[j][i] = score

  return box_scores
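
The returned matrix is symmetric, with box_scores[i][j] holding the classifier's merge score for the pair (i, j). A sketch of thresholding it into merge candidates, given the boxes, lines, and dist_imgs prepared earlier in the pipeline (the 0.5 cut-off is arbitrary):

scores = score_boxes(boxes, lines, 'features.txt', dist_imgs)

# Keep the box pairs the classifier considers likely to belong together.
pairs = [(i, j) for i in range(len(boxes)) for j in range(i + 1, len(boxes))
         if scores[i][j] > 0.5]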