Exemplo n.º 1
0
def load_file(path):
    """Render the first page of a PDF and bundle it with its n-gram data.

    Args:
        path: Location of the PDF; passed to ``pdf2image.convert_from_path``
            and stored unchanged as the page's ``"filename"``.

    Returns:
        A dict with two entries: ``'image'`` (the first rendered page) and
        ``'page'`` (a dict holding ``"fields"``, ``"nGrams"``, ``"height"``,
        ``"width"`` and ``"filename"``).
    """
    first_page = pdf2image.convert_from_path(path)[0]
    dimensions = first_page.size

    ngrams = util.create_ngrams(first_page)
    for entry in ngrams:
        parses = entry["parses"]
        # Only amount and date parses are normalized; anything else is kept
        # exactly as create_ngrams produced it.
        for parse_key in ("amount", "date"):
            if parse_key in parses:
                parses[parse_key] = util.normalize(parses[parse_key],
                                                   key=parse_key)

    # No labels exist at this point, so every field gets the '0' placeholder.
    page = {
        "fields": dict.fromkeys(FIELDS, '0'),
        "nGrams": ngrams,
        "height": dimensions[1],
        "width": dimensions[0],
        "filename": path,
    }

    return {'image': first_page, 'page': page}
Exemplo n.º 2
0
def process_file(filename, out_dir, phase):
    """Convert one labelled PDF into a PNG plus a JSON training record.

    The first page of ``filename`` is rendered and saved as
    ``<out_dir>/<phase>/<stem>.png``. Its n-grams, the page size and the
    label values are written next to it as ``<stem>.json``. Labels are read
    from a JSON file that sits beside the input and shares its stem.

    Args:
        filename: Path to the input PDF.
        out_dir: Root output directory.
        phase: Sub-directory of ``out_dir`` to write into.

    Returns:
        True when both output files were written, False when anything went
        wrong (the reason is printed and the file is skipped).
    """
    try:
        # splitext instead of slicing off three characters: the stem stays
        # correct for extensions of any length (or none at all).
        stem = os.path.splitext(os.path.basename(filename))[0]
        target_dir = os.path.join(out_dir, phase)
        image_path = os.path.join(target_dir, stem + '.png')
        record_path = os.path.join(target_dir, stem + '.json')
        label_path = os.path.splitext(filename)[0] + '.json'

        page = pdf2image.convert_from_path(filename)[0]
        page.save(image_path)

        width, height = page.size

        ngrams = util.create_ngrams(page)
        for ngram in ngrams:
            parses = ngram["parses"]
            for key in ("amount", "date"):
                if key in parses:
                    parses[key] = util.normalize(parses[key], key=key)

        with open(label_path, 'r', encoding='utf-8') as fp:
            labels = simplejson.loads(fp.read())

        fields = {}
        for field in FIELDS:
            if field not in labels:
                # Unlabelled fields are recorded as empty strings.
                fields[field] = ''
            elif FIELDS[field] == FIELD_TYPES["amount"]:
                fields[field] = util.normalize(labels[field], key="amount")
            elif FIELDS[field] == FIELD_TYPES["date"]:
                fields[field] = util.normalize(labels[field], key="date")
            else:
                fields[field] = labels[field]

        data = {
            "fields": fields,
            "nGrams": ngrams,
            "height": height,
            "width": width,
            "filename": os.path.abspath(image_path),
        }

        with open(record_path, 'w', encoding='utf-8') as fp:
            fp.write(simplejson.dumps(data, indent=2))
        return True

    except Exception as exp:
        # Best-effort batch step: report the failure and let the caller
        # carry on with the remaining files.
        print("Skipping {} : {}".format(filename, exp))
        return False
Exemplo n.º 3
0
    def _prepare_data(self):
        """Turn the selected PDFs into PNG/JSON pairs for prediction.

        Reads ``data_dir``, ``data_file`` and ``prepared_data`` from
        ``self.args`` (after refreshing them through ``self._get_inputs()``).
        Previously generated ``.json``/``.png`` files under
        ``<prepared_data>/predict`` are deleted first. Then the first page of
        every PDF found under ``data_dir`` (plus ``data_file`` when it
        exists) is rendered to PNG and described by a JSON record whose
        fields are all set to the ``'0'`` placeholder. Files that fail are
        logged and skipped. The progress bar and its label are updated after
        every file.
        """
        self._get_inputs()

        source_dir = self.args["data_dir"]
        # os.path.exists('') is False, so one test covers both "nothing
        # selected" and "selected path is gone".
        # NOTE(review): this returns before data_file is looked at, so a
        # single-file selection without data_dir is rejected - confirm that
        # this is intended.
        if not source_dir or not os.path.exists(source_dir):
            messagebox.showerror("Error", "No files were selected!")
            return

        self.progressbar["value"] = 0
        self.progress_label.configure(text="Preparing Data:")

        data_dir = os.path.join(self.args["prepared_data"], 'predict')
        os.makedirs(data_dir, exist_ok=True)

        # Remove outputs of earlier runs. The pattern has to be joined with a
        # separator: "<data_dir>**/*.json" would treat "**" like "*" and also
        # hit sibling directories whose names merely start with "predict".
        for pattern in ("*.json", "*.png"):
            for stale in glob.glob(os.path.join(data_dir, "**", pattern), recursive=True):
                os.remove(os.path.abspath(stale))

        filenames = [
            os.path.abspath(f)
            for f in glob.glob(os.path.join(source_dir, "**", "*.pdf"), recursive=True)
        ]
        data_file = self.args["data_file"]
        if data_file and os.path.exists(data_file):
            filenames.append(data_file)

        self.logger.log("Total: {}".format(len(filenames)))
        self.logger.log("Preparing data for extraction...")

        total_samples = len(filenames)
        for sample_idx, filename in enumerate(tqdm(filenames), start=1):
            try:
                # splitext keeps the stem intact whatever the extension
                # length is.
                stem = os.path.splitext(os.path.basename(filename))[0]
                image_path = os.path.join(data_dir, stem + '.png')
                record_path = os.path.join(data_dir, stem + '.json')

                page = pdf2image.convert_from_path(filename)[0]
                page.save(image_path)

                width, height = page.size

                ngrams = util.create_ngrams(page)
                for ngram in ngrams:
                    parses = ngram["parses"]
                    for key in ("amount", "date"):
                        if key in parses:
                            parses[key] = util.normalize(parses[key], key=key)

                data = {
                    # No labels at prediction time: every field gets '0'.
                    "fields": {field: '0' for field in FIELDS},
                    "nGrams": ngrams,
                    "height": height,
                    "width": width,
                    "filename": os.path.abspath(image_path),
                }

                with open(record_path, 'w', encoding='utf-8') as fp:
                    fp.write(simplejson.dumps(data, indent=2))

            except Exception as exp:
                # One bad document must not stop the batch.
                self.logger.log("Skipping {} : {}".format(filename, exp))

            # Progress advances for skipped files too, so the bar always
            # reaches the end.
            self.progress_label.configure(text="Preparing data [{}/{}]:".format(sample_idx, total_samples))
            self.progressbar["value"] = (sample_idx / total_samples) * 100
            self.progressbar.update()

        self.progress_label.configure(text="Completed!")
        self.progressbar["value"] = 100
        self.progressbar.update()
        self.logger.log("Prepared data stored in '{}'".format(data_dir))
Exemplo n.º 4
0
def _normalize_ngram_parses(ngrams):
    """Normalize the "amount" and "date" parses of every n-gram in place."""
    for ngram in ngrams:
        parses = ngram["parses"]
        for key in ("amount", "date"):
            if key in parses:
                parses[key] = util.normalize(parses[key], key=key)
    return ngrams


def _collect_label_fields(labels):
    """Map every name in FIELDS to its (normalized) label, or '' if absent."""
    fields = {}
    for field in FIELDS:
        if field not in labels:
            fields[field] = ''
        elif FIELDS[field] == FIELD_TYPES["amount"]:
            fields[field] = util.normalize(labels[field], key="amount")
        elif FIELDS[field] == FIELD_TYPES["date"]:
            fields[field] = util.normalize(labels[field], key="date")
        else:
            fields[field] = labels[field]
    return fields


def _prepare_sample(filename, target_dir):
    """Write the PNG and JSON record for one input document.

    ``.png``/``.jpg`` inputs are read with OpenCV; everything else is
    rendered with pdf2image (first page, 500 dpi). Labels come from the JSON
    file that sits beside the input and shares its stem. Any failure is
    raised to the caller.
    """
    stem, ext = os.path.splitext(os.path.basename(filename))
    image_path = os.path.abspath(os.path.join(target_dir, stem + '.png'))
    record_path = os.path.join(target_dir, stem + '.json')
    label_path = os.path.splitext(filename)[0] + '.json'

    if ext.lower() in ('.png', '.jpg'):
        # Imported lazily so OpenCV is only needed when image inputs exist.
        import cv2
        page = cv2.imread(filename)
        if page is None:
            # cv2.imread reports an unreadable file by returning None.
            raise ValueError("unable to read image")
        cv2.imwrite(image_path, page)
        height, width = page.shape[0], page.shape[1]
        ngrams = util.create_ngrams(page, height, width)
    else:
        page = pdf2image.convert_from_path(filename, dpi=500)[0]
        page.save(image_path)
        width, height = page.size
        ngrams = util.create_ngrams(page)

    _normalize_ngram_parses(ngrams)

    with open(label_path, 'r', encoding='utf-8') as fp:
        labels = simplejson.loads(fp.read())

    data = {
        "fields": _collect_label_fields(labels),
        "nGrams": ngrams,
        "height": height,
        "width": width,
        "filename": image_path,
    }

    with open(record_path, 'w', encoding='utf-8') as fp:
        fp.write(simplejson.dumps(data, indent=2))


def main():
    """Command-line entry point: build train/val data from labelled documents.

    Collects every ``.pdf``, ``.jpg`` and ``.png`` under ``--data_dir``
    (recursively), assigns the first ``--val_size`` fraction of that list to
    validation and the rest to training, and writes a PNG plus JSON record
    per document into ``<out_dir>/train`` or ``<out_dir>/val``. Documents
    that cannot be processed are reported and skipped.
    """
    ap = argparse.ArgumentParser()

    ap.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="path to directory containing invoice document images")
    ap.add_argument("--out_dir",
                    type=str,
                    default='processed_data/',
                    help="path to save prepared data")
    ap.add_argument("--val_size",
                    type=float,
                    default=0.2,
                    help="validation split ration")

    args = ap.parse_args()

    os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
    os.makedirs(os.path.join(args.out_dir, 'val'), exist_ok=True)

    # Join the pattern onto the directory so "**" is a path component of its
    # own; plain concatenation only works when data_dir ends with a
    # separator.
    filenames = [
        os.path.abspath(match)
        for pattern in ("*.pdf", "*.jpg", "*.png")
        for match in glob.glob(os.path.join(args.data_dir, "**", pattern),
                               recursive=True)
    ]

    idx = int(len(filenames) * args.val_size)
    train_files = filenames[idx:]
    val_files = filenames[:idx]

    print("Total: {}".format(len(filenames)))
    print("Training: {}".format(len(train_files)))
    print("Validation: {}".format(len(val_files)))

    for phase, phase_files in (('train', train_files), ('val', val_files)):
        print("Preparing {} data...".format(phase))
        target_dir = os.path.join(args.out_dir, phase)

        for filename in tqdm(phase_files):
            try:
                _prepare_sample(filename, target_dir)
            except Exception as exp:
                # Report and continue; Ctrl-C still interrupts because only
                # Exception subclasses are caught.
                print("Skipping {} : {}".format(filename, exp))
Exemplo n.º 5
0
    def _prepare_data(self):
        """Turn the labelled PDFs in the data folder into train/val records.

        Reads ``data_dir`` and ``prepared_data`` from ``self.args`` (after
        refreshing them through ``self._get_inputs()``). The PDFs found under
        ``data_dir`` are shuffled; 20% go to ``<prepared_data>/val`` and the
        rest to ``<prepared_data>/train``. For each PDF the first page is
        saved as PNG and a JSON record with n-grams, page size and label
        values is written beside it. Labels are read from the JSON file that
        sits next to the PDF and shares its stem. Files that fail are logged
        and skipped. The progress bar and its label are updated after every
        file.
        """
        self._get_inputs()

        source_dir = self.args["data_dir"]
        # os.path.exists('') is False, so one test covers the empty value as
        # well as a missing folder.
        if not source_dir or not os.path.exists(source_dir):
            messagebox.showerror("Error", "Data folder does not exist!")
            return

        self.progressbar["value"] = 0
        self.progress_label.configure(text="Preparing Data:")

        prepared_root = self.args["prepared_data"]
        os.makedirs(os.path.join(prepared_root, 'train'), exist_ok=True)
        os.makedirs(os.path.join(prepared_root, 'val'), exist_ok=True)

        # Join the pattern onto the folder so "**" is a path component of its
        # own; plain concatenation only recurses when data_dir happens to end
        # with a separator.
        filenames = [
            os.path.abspath(f)
            for f in glob.glob(os.path.join(source_dir, "**", "*.pdf"),
                               recursive=True)
        ]
        random.shuffle(filenames)

        idx = int(len(filenames) * 0.2)
        train_files = filenames[idx:]
        val_files = filenames[:idx]

        self.logger.log("Total: {}".format(len(filenames)))
        self.logger.log("Training: {}".format(len(train_files)))
        self.logger.log("Validation: {}".format(len(val_files)))

        total_samples = len(filenames)
        sample_idx = 0
        for phase, phase_files in (('train', train_files), ('val', val_files)):
            self.logger.log("Preparing {} data...".format(phase))
            target_dir = os.path.join(prepared_root, phase)

            for filename in tqdm(phase_files):
                try:
                    # splitext keeps the stem intact whatever the extension
                    # length is.
                    stem = os.path.splitext(os.path.basename(filename))[0]
                    image_path = os.path.join(target_dir, stem + '.png')
                    record_path = os.path.join(target_dir, stem + '.json')
                    label_path = os.path.splitext(filename)[0] + '.json'

                    page = pdf2image.convert_from_path(filename)[0]
                    page.save(image_path)

                    width, height = page.size

                    ngrams = util.create_ngrams(page)
                    for ngram in ngrams:
                        parses = ngram["parses"]
                        for key in ("amount", "date"):
                            if key in parses:
                                parses[key] = util.normalize(parses[key],
                                                             key=key)

                    with open(label_path, 'r', encoding='utf-8') as fp:
                        labels = simplejson.loads(fp.read())

                    fields = {}
                    for field in FIELDS:
                        if field not in labels:
                            # Unlabelled fields are recorded as ''.
                            fields[field] = ''
                        elif FIELDS[field] == FIELD_TYPES["amount"]:
                            fields[field] = util.normalize(labels[field],
                                                           key="amount")
                        elif FIELDS[field] == FIELD_TYPES["date"]:
                            fields[field] = util.normalize(labels[field],
                                                           key="date")
                        else:
                            fields[field] = labels[field]

                    data = {
                        "fields": fields,
                        "nGrams": ngrams,
                        "height": height,
                        "width": width,
                        "filename": os.path.abspath(image_path),
                    }

                    with open(record_path, 'w', encoding='utf-8') as fp:
                        fp.write(simplejson.dumps(data, indent=2))

                except Exception as exp:
                    # One unreadable PDF or missing label file must not abort
                    # the run and leave the progress bar stuck.
                    self.logger.log("Skipping {} : {}".format(filename, exp))

                sample_idx += 1
                self.progress_label.configure(
                    text="Preparing data [{}/{}]:".format(
                        sample_idx, total_samples))
                self.progressbar["value"] = (sample_idx / total_samples) * 100
                self.progressbar.update()

        self.progress_label.configure(text="Completed!")
        self.progressbar["value"] = 100
        self.progressbar.update()
        self.logger.log("Prepared data stored in '{}'".format(
            self.args["prepared_data"]))