Пример #1
0
def extract_images(doc):
    """Extract the images embedded in a document's PDF and record them.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, run the external ``pdfimages`` tool on it,
    writing to an ``images`` subdirectory of the document's directory
    (docs/<doc id>/images). Every file found in that directory afterwards
    is checked with ``is_interesting``: uninteresting files are deleted,
    and an Image tied to the document and its proposal is saved for each
    of the others.

    :param doc: proposal.models.Document object with a corresponding PDF
    file that has been copied to the local filesystem

    :returns: A list of proposal.model.Image objects, one per interesting
    file in the images directory. The list is empty when the document has
    no local file or when ``pdfimages`` exits with a non-zero status.
    Images whose save raised IntegrityError are still included.

    """
    # TODO: Break this into smaller subtasks
    docfile = doc.document

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s", doc.pk,
                     path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    # Passed to pdfimages as the prefix for the files it writes.
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(
        ["pdfimages", "-png", "-tiff", "-j", "-jp2", path, images_pattern])

    if status:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning("pdfimages failed with exit code %i", status)
        return []

    images = []
    # Every file in the directory is considered, including any left over
    # from an earlier run.
    for image_name in os.listdir(images_dir):
        image_path = os.path.join(images_dir, image_name)

        if not is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal, document=doc)
        image.image = image_path
        images.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    return images
Пример #2
0
def extract_images(doc_id):
    """Extract the images embedded in a stored document's PDF.

    Looks up the document (proposal.models.Document) with primary key
    ``doc_id``. If its file has been copied to the local filesystem,
    extract its images to a subdirectory of the document's directory
    (docs/<doc id>/images), delete the ones that are not 'interesting',
    and save an Image for each of the others.

    :param doc_id: primary key of a proposal.models.Document with a
    corresponding PDF file that has been copied to the local filesystem

    :returns: A list of the primary keys of the Image objects that were
    saved. The list is empty when the document has no local file.

    :raises: whatever ``Document.objects.get`` raises when no document
    has the given primary key (presumably Document.DoesNotExist)

    """
    doc = Document.objects.get(pk=doc_id)
    docfile = doc.document

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    logger.info("Extracting images to '%s'", images_dir)
    image_paths = pdf.extract_images(path, dirname=images_dir)

    images = []
    for image_name in image_paths:
        image_path = os.path.join(images_dir, image_name)

        if not is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal,
                      document=doc)
        image.image = image_path
        images.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    logger.info("Extracted %i image(s) from %s.", len(images), path)

    # An image whose save failed above never received a primary key;
    # leave it out rather than handing None to the caller.
    # NOTE(review): assumes Image uses an auto-assigned primary key.
    return [image.pk for image in images if image.pk is not None]
Пример #3
0
def extract_images(doc_id):
    """Extract the images embedded in a stored document's PDF.

    Looks up the document (proposal.models.Document) with primary key
    ``doc_id``. If its file has been copied to the local filesystem,
    extract its images to a subdirectory of the document's directory
    (docs/<doc id>/images), delete the ones that are not 'interesting',
    and save an Image for each of the others.

    :param doc_id: primary key of a proposal.models.Document with a
    corresponding PDF file that has been copied to the local filesystem

    :returns: A list of the primary keys of the Image objects that were
    saved. The list is empty when the document has no local file.

    :raises: whatever ``Document.objects.get`` raises when no document
    has the given primary key (presumably Document.DoesNotExist)

    """
    doc = Document.objects.get(pk=doc_id)
    docfile = doc.document

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s", doc.pk,
                     path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    logger.info("Extracting images to '%s'", images_dir)
    image_paths = pdf.extract_images(path, dirname=images_dir)

    images = []
    for image_name in image_paths:
        image_path = os.path.join(images_dir, image_name)

        if not is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal, document=doc)
        image.image = image_path
        images.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    logger.info("Extracted %i image(s) from %s.", len(images), path)

    # Images that failed to save have no primary key yet, so they are
    # skipped to keep None out of the result.
    # NOTE(review): assumes Image uses an auto-assigned primary key.
    return [image.pk for image in images if image.pk is not None]
Пример #4
0
def extract_content(doc, encoding="ISO-8859-9"):
    """Extract the images and the text of a document's local PDF.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, extract its images to a subdirectory of the
    document's directory (docs/<doc id>/images) and save an Image for
    each 'interesting' one. Then extract the text content to
    docs/<doc id>/text.txt and, if that succeeds, store the text path and
    the encoding on the document. Text extraction is attempted even when
    image extraction fails.

    :param doc: proposal.models.Document object with a corresponding PDF
    file that has been copied to the local filesystem
    :param encoding: output encoding name handed to ``pdftotext`` and
    recorded on the document

    :returns: None

    """
    docfile = doc.document
    logger = extract_content.get_logger()

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return

    try:
        path = docfile.path
    except Exception:
        # Fall back to the stored name when no path is available.
        # NOTE(review): presumably raised by storage backends without
        # absolute paths -- confirm which exception types occur here.
        path = docfile.name

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    # Passed to pdfimages as the prefix for the files it writes.
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(["pdfimages", "-png", "-tiff", "-j", "-jp2",
                              path, images_pattern])

    if status:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning("pdfimages failed with exit code %i", status)
    else:
        # Every file in the directory is considered, including any left
        # over from an earlier run.
        for image_name in os.listdir(images_dir):
            image_path = os.path.join(images_dir, image_name)

            if not images.is_interesting(image_path):
                # Delete 'uninteresting' images
                os.unlink(image_path)
                continue

            image = Image(proposal=doc.proposal,
                          document=doc)
            image.image = image_path

            try:
                image.save()
            except IntegrityError:
                # This can occur if the image has already been fetched
                # and associated with the Proposal.
                pass

    # Could consider storing the full extracted text of the document in
    # the database and indexing it, rather than extracting it to a file.
    text_path = os.path.join(os.path.dirname(path), "text.txt")

    # TODO: It may be practical to sniff pdfinfo, determine the PDF
    # producer used, and make a best guess at encoding based on that
    # information. We should be able to get away with using ISO-8859-9
    # for now.
    status = subprocess.call(["pdftotext", "-enc", encoding, path, text_path])

    if status:
        logger.error("Failed to extract text from %s", path)
        return

    # Do stuff with the contents of the file.
    # Possibly perform some rudimentary scraping?
    doc.fulltext = text_path
    doc.encoding = encoding
    doc.save()
Пример #5
0
def extract_images(doc):
    """Extract the images embedded in a document's PDF and record them.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, run the external ``pdfimages`` tool on it,
    writing to a subdirectory of the document's directory
    (docs/<doc id>/images). Every file found in that directory afterwards
    is checked with ``images.is_interesting``: uninteresting files are
    deleted, and an Image tied to the document and its proposal is saved
    for each of the others.

    :param doc: proposal.models.Document object with a corresponding PDF
    file that has been copied to the local filesystem

    :returns: A list of proposal.model.Image objects, one per interesting
    file in the images directory. The list is empty when the document has
    no local file or when ``pdfimages`` exits with a non-zero status.
    Images whose save raised IntegrityError are still included.

    """
    # TODO: Break this into smaller subtasks
    docfile = doc.document
    logger = extract_images.get_logger()

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    # Passed to pdfimages as the prefix for the files it writes.
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(["pdfimages", "-png", "-tiff", "-j", "-jp2",
                              path, images_pattern])

    if status:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning("pdfimages failed with exit code %i", status)
        return []

    # The result list must not be called `images`: a local of that name
    # would hide the outer `images` that provides is_interesting(), and
    # the call below would fail with AttributeError on a list.
    # NOTE(review): presumably `images` is a module imported at file level.
    extracted = []
    for image_name in os.listdir(images_dir):
        image_path = os.path.join(images_dir, image_name)

        if not images.is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal,
                      document=doc)
        image.image = image_path
        extracted.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    return extracted