Example #1
def convert(input_filename, doc_id, slug):
    # Provision a temporary directory in which to handle document conversion
    document_directory = tempfile.mkdtemp(prefix=TMP_DIR)

    # Grab file from storage to tmp
    tmp_path = os.path.join(document_directory, Path(input_filename).name)
    with storage.open(input_filename, "rb") as document_file:
        with open(tmp_path, "wb") as tmp_file:
            tmp_file.write(document_file.read())

    # Run LibreOffice
    libre_office_convert(tmp_path)
    # Remove created file (early, just to free RAM that might be needed later)
    os.remove(tmp_path)

    # Put converted file back in storage
    # We expect the filename to be the same but with a pdf extension
    # (LibreOffice offers no mechanism to specify a precise name)
    output_path = str(Path(tmp_path).with_suffix(".pdf"))
    output_filename = path.doc_path(doc_id, slug)
    with storage.open(output_filename, "wb") as output_document_file:
        with open(output_path, "rb") as pdf_file:
            output_document_file.write(pdf_file.read())

    # Remove temporary directory
    shutil.rmtree(document_directory)
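
libre_office_convert is not shown in this excerpt. A minimal sketch, assuming it shells out to LibreOffice's headless converter and writes the PDF next to the input file (the real helper may differ):

import subprocess

def libre_office_convert(tmp_path):
    """Hypothetical sketch: convert tmp_path to a PDF in the same directory."""
    # LibreOffice's --convert-to pdf writes <basename>.pdf into --outdir,
    # which matches the naming assumption in the code above
    subprocess.run(
        ["soffice", "--headless", "--convert-to", "pdf",
         "--outdir", os.path.dirname(tmp_path), tmp_path],
        check=True,
    )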
Example #2
def doc_embedding(project_id, language, tfidf, features, doc_svd):
    """Calculate the doc embeddings"""
    import fasttext

    logger.info("[SIDEKICK PREPROCESS] project_id: %s - doc embeddings",
                project_id)

    # Load the embedding model for the language, falling back to the raw
    # language code; loading fails if no model file exists for the language
    language = LANGUAGES.get(language, language)
    model = fasttext.load_model(
        os.path.join(EMBEDDING_DIR, f"cc.{language}.300.bin"))
    embedding_vectors = np.array(
        [model.get_word_vector(feature) for feature in features])

    # Scale the embedding vectors by word frequency (densify the sparse
    # tf-idf matrix first)
    doc_embeddings = np.dot(tfidf.toarray(), embedding_vectors)

    # Doc vectors are just doc_svd and doc_embeddings concatenated
    doc_vectors = np.hstack((doc_svd, doc_embeddings))

    # Serialize doc vectors to file
    with storage.open(path.sidekick_document_vectors_path(project_id),
                      "wb") as vectors_file:
        np.savez_compressed(vectors_file, doc_vectors)
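
One caveat worth noting: passing the array positionally to np.savez_compressed, as above, stores it under the generic key "arr_0", so the named lookups in Example #3 would return None for it. A quick illustration of numpy's key behavior (the file name here is purely for demonstration):

import numpy as np

doc_vectors = np.zeros((2, 3))
doc_ids = np.array([1, 2])

# Positional arguments are stored under generic keys
np.savez_compressed("demo.npz", doc_vectors)
print(np.load("demo.npz").files)  # ['arr_0']

# Keyword arguments produce the named entries Example #3 expects
np.savez_compressed("demo.npz", vectors=doc_vectors, ids=doc_ids)
print(np.load("demo.npz").files)  # ['vectors', 'ids']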
Example #3
    def get_document_vectors(self):
        """Fetch the pre-processed document vectors from storage"""
        with storage.open(path.sidekick_document_vectors_path(self.project_id),
                          "rb") as vectors_file:
            doc_vector_obj = np.load(vectors_file)
            # Grab the document vector matrix and ids while the file is still
            # open -- np.load reads npz entries lazily
            return (doc_vector_obj.get("vectors"), doc_vector_obj.get("ids"))
Example #4
def doc_embedding_(project_id, _language, _tfidf, _features, doc_svd, doc_ids):
    """Simpler doc embeddings - skip word vectors and just use the doc svd"""

    logger.info("[SIDEKICK PREPROCESS] project_id: %s - doc embeddings",
                project_id)

    # Serialize doc vectors to file
    with storage.open(path.sidekick_document_vectors_path(project_id),
                      "wb") as vectors_file:
        np.savez_compressed(vectors_file, vectors=doc_svd, ids=doc_ids)
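
Unlike Example #2, the keyword arguments here produce exactly the named entries ("vectors" and "ids") that get_document_vectors in Example #3 reads back.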
Example #5
    def get_text(self):
        try:
            with storage.open(path.text_path(self.pk, self.slug),
                              "rb") as text_file:
                return text_file.read().decode("utf8")
        except ValueError as exc:
            logger.error(
                "Error getting text: Document: %d Exception: %s",
                self.pk,
                exc,
                exc_info=sys.exc_info(),
            )
            return ""
Example #6
    def get_all_page_text(self):
        try:
            with storage.open(path.json_text_path(self.pk, self.slug),
                              "rb") as json_file:
                return json.loads(json_file.read().decode("utf8"))
        except ValueError as exc:
            logger.error(
                "Error getting all page text: Document: %d Exception: %s",
                self.pk,
                exc,
                exc_info=sys.exc_info(),
            )
            return {"pages": [], "updated": None}
Example #7
    def handle(self, *args, **options):
        data_files = os.listdir(TESSERACT_DATA_DIRECTORY)
        print("UPLOADING", data_files)
        print("...")
        for data_file_path in data_files:
            with open(os.path.join(TESSERACT_DATA_DIRECTORY, data_file_path),
                      "rb") as data_file:
                with storage.open(
                        os.path.join(MINIO_DATA_DIRECTORY, data_file_path),
                        "wb") as minio_file:
                    minio_file.write(data_file.read())

        print("WROTE ALL FILES")
Example #8
def download_tmp_file(relative_path):
    """Downloads the requested data file to a tmp directory."""
    Path(TMP_DIRECTORY).mkdir(
        parents=True, exist_ok=True
    )  # Make tmp directory if it doesn't exist
    local_file_path = os.path.join(TMP_DIRECTORY, relative_path)
    if os.path.exists(local_file_path):
        # OCR language pack already downloaded
        return

    # Check if tmp directory is too big
    if local_folder_size(TMP_DIRECTORY) > TMP_SIZE_LIMIT:
        # If so, just delete all OCR data (shouldn't happen too often)
        logger.warning("[Deleting tmp OCR data]")
        files = Path(TMP_DIRECTORY).rglob("*")
        for file in files:
            # rglob also yields directories; os.remove would fail on those
            if file.is_file():
                os.remove(file)

    # Download OCR data file
    with storage.open(
        os.path.join(OCR_DATA_DIRECTORY, relative_path), "rb"
    ) as ocr_data_file, open(local_file_path, "wb") as local_file:
        local_file.write(ocr_data_file.read())
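
local_folder_size is used above but not shown. A minimal sketch, assuming it returns the total size in bytes of everything under the folder:

import os
from pathlib import Path

def local_folder_size(folder):
    """Return the combined size, in bytes, of all files under folder."""
    # Assumed behavior -- the original helper is not part of this excerpt
    return sum(f.stat().st_size for f in Path(folder).rglob("*") if f.is_file())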
Example #9
def ocr_page(doc_id, page_path, upload_text_path, access, ocr_code="eng"):
    """Internal method to run OCR on a single page.

    Returns:
        The page text.
    """
    # Download the requisite language data
    logger.info("[OCR PAGE] doc_id %s", doc_id)
    download_language_pack(ocr_code)
    download_tmp_file(PDF_FONT_FILE)

    logger.info("[OCR PAGE] download complete doc_id %s", doc_id)

    # Initialize temporary files
    # Initialize temporary files, closing the descriptors mkstemp opens
    tmp_files = {}
    for key, suffix in (("img", ".png"), ("pdf", ""), ("text", "")):
        handle, tmp_path = tempfile.mkstemp(suffix=suffix)
        os.close(handle)  # only the path is needed; avoid leaking the fd
        tmp_files[key] = tmp_path

    # Capture the page image as a temporary PNG file
    with storage.open(page_path, "rb") as image_file:
        img = Image.open(image_file).convert("RGB")
        # Resize only if image is too big (OCR computation is slow with large images)
        if img.width > DESIRED_WIDTH:
            resize = DESIRED_WIDTH / img.width
            img = img.resize(
                (DESIRED_WIDTH, round(img.height * resize)), Image.LANCZOS
            )
    img.save(tmp_files["img"], "png")

    logger.info("[OCR PAGE] image resized doc_id %s", doc_id)

    # Use Tesseract OCR to render a text-only PDF and txt file
    tess = Tesseract(ocr_code)
    text = ""
    pdf_contents = b""
    try:
        tess.create_renderer(tmp_files["pdf"], tmp_files["text"])
        tess.render(tmp_files["img"])
        tess.destroy_renderer()

        logger.info("[OCR PAGE] rendered doc_id %s", doc_id)

        # Get txt and text-only pdf file contents
        with open(tmp_files["pdf"] + ".pdf", "rb") as pdf_file:
            pdf_contents = pdf_file.read()
        with storage.open(upload_text_path, "w", access=access) as new_text_file:
            with open(tmp_files["text"] + ".txt", "r", encoding="utf-8") as text_file:
                # Store text locally to return (gets used by Redis later)
                text = text_file.read()
                # Also upload text file to s3
                new_text_file.write(text)
        logger.info("[OCR PAGE] data stored doc_id %s", doc_id)
    finally:
        logger.info("[OCR PAGE] cleanup doc_id %s", doc_id)
        os.remove(tmp_files["pdf"])
        os.remove(tmp_files["text"])
        os.remove(tmp_files["img"])

    return text, pdf_contents
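
download_language_pack is called above but not shown. A plausible sketch built on download_tmp_file from Example #8, assuming language packs are Tesseract .traineddata files stored under a tessdata/ prefix (the layout and naming are assumptions):

def download_language_pack(ocr_code):
    """Fetch the Tesseract language data for ocr_code into the tmp cache."""
    # The "tessdata" prefix and file name are assumptions, not from the source
    download_tmp_file(os.path.join("tessdata", f"{ocr_code}.traineddata"))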