Exemplo n.º 1
0
    def get_source(self, base_location, path):
        """
        Build the BinarySource that matches the prefix of ``base_location``.

        ``base_location`` is dispatched on the following forms:

        * ``zip:<path to zipfile>``
        * ``sftp:<user>@<host>/<path to dir>``
        * ``http://<host>/<path to dir>`` (``https:`` is treated the same way)
        * ``zip+sftp:<user>@<host>/<path to zipfile>``
        * ``<local dir path>`` (anything without one of the prefixes above)

        :param base_location: prefixed location string, or a local directory
        :param path: path of the binary within ``base_location``
        :return: a source object of the class selected by the prefix
        """
        if base_location.startswith('zip:'):
            # drop the "zip:" prefix to get the zip file path
            zip_path = base_location[len('zip:'):]
            return ZipFileSource(zip_path, path)

        if base_location.startswith('sftp:'):
            remote_location = os.path.join(base_location, path)
            return RemoteFileSource(
                location=remote_location,
                ssh_options={'key_filename': self.ssh_private_key}
            )

        if base_location.startswith(('http:', 'https:')):
            # make sure exactly one slash separates the base URI from the path
            if not base_location.endswith('/'):
                base_location = base_location + '/'
            return HTTPFileSource(base_location + path)

        if base_location.startswith('zip+sftp:'):
            # strip only the "zip+" part; the zip file location keeps its
            # "sftp:" prefix
            remote_zip = base_location[len('zip+'):]
            return ZipFileSource(
                zip_file=remote_zip,
                path=path,
                ssh_options={'key_filename': self.ssh_private_key}
            )

        # with no URI prefix, assume a local file path
        return LocalFileSource(localpath=os.path.join(base_location, path))
Exemplo n.º 2
0
def get_file_object(path, source=None):
    """
    Create a file object for ``path``, choosing its class by file extension.

    The extension is everything from the last ``.`` in ``path`` onward. If
    that extension is a key of ``FILE_CLASS_FOR`` the mapped class is used;
    otherwise (including when ``path`` contains no ``.`` at all) the generic
    ``File`` class is used.

    :param path: file path or name used to determine the extension
    :param source: source to build the file from; when ``None``, a
        ``LocalFileSource`` for ``path`` is created
    :return: the object returned by the selected class's ``from_source``
    """
    # str.rfind returns -1 when there is no dot; slicing with -1 would
    # wrongly treat the last character of the path as the "extension"
    dot_index = path.rfind('.')
    extension = path[dot_index:] if dot_index != -1 else ''

    cls = FILE_CLASS_FOR.get(extension, File)
    if source is None:
        source = LocalFileSource(path)
    return cls.from_source(source)
Exemplo n.º 3
0
def get_source(binary_column_value: Optional[str]) -> Optional[BinarySource]:
    """
    Returns the appropriate BinarySource implementation to use, based on the
    value in the binary column, or None if an appropriate BinarySource
    implementation cannot be determined.

    :param binary_column_value: the raw value of the binary column; may be
        None when the column has no value
    :return: an HTTPFileSource for values starting with ``http:`` or
        ``https:``, a LocalFileSource for any other string, or None when
        the value is None
    """
    # check for None before calling any string method on the value;
    # otherwise a missing value raises AttributeError instead of yielding None
    if binary_column_value is None:
        return None
    if binary_column_value.startswith(("http:", "https:")):
        return HTTPFileSource(binary_column_value)
    return LocalFileSource(binary_column_value)
Exemplo n.º 4
0
def test_local_file():
    """A LocalFileSource exposes the path it was built with as ``localpath``."""
    source = LocalFileSource('/foo/bar')
    expected_path = '/foo/bar'
    assert source.localpath == expected_path
Exemplo n.º 5
0
def test_nonexistent_local_file_source():
    """``exists()`` returns False for a path that is not present on disk."""
    # a freshly generated UUID is very unlikely to name an existing file
    missing_name = str(uuid4())
    source = LocalFileSource(missing_name)
    assert source.exists() is False
Exemplo n.º 6
0
    def create_page(self, issue_mets, page_div, issue):
        """
        Build a Page for one page element of the issue METS and attach its files.

        The page number, reel number, and reel sequence number are read from
        the descriptive metadata section referenced by the element's ``DMDID``
        attribute; reel and frame stay ``None`` when their identifiers are
        absent. Every ``METS:fptr`` child of the element becomes a file object
        that is added to the page, and finally the page's OCR is parsed.

        :param issue_mets: METS wrapper used to look up the dmdSec, file, and
            technical metadata elements by ID
        :param page_div: the element describing this page
        :param issue: the issue the page belongs to
        :return: the populated Page
        :raises DataReadException: if a referenced file element has no
            ``ADMID`` attribute
        """
        dmdsec = issue_mets.dmdsec(page_div.get('DMDID'))
        number = dmdsec.find('.//MODS:start', xmlns).text

        # reel and frame identifiers are optional
        reel_elem = dmdsec.find('.//MODS:identifier[@type="reel number"]',
                                xmlns)
        reel = None if reel_elem is None else reel_elem.text
        frame_elem = dmdsec.find(
            './/MODS:identifier[@type="reel sequence number"]', xmlns)
        frame = None if frame_elem is None else frame_elem.text

        page_title = "{0}, page {1}".format(issue.title, number)
        page = Page(issue=issue,
                    reel=reel,
                    number=number,
                    title=page_title,
                    frame=frame)

        # generate a file object for each file pointer in the XML snippet
        for fptr in page_div.findall('METS:fptr', xmlns):
            fileid = fptr.get('FILEID')
            filexml = issue_mets.file(fileid)

            if 'ADMID' not in filexml.attrib:
                raise DataReadException(
                    f'No ADMID found for {fileid}, cannot lookup technical metadata'
                )

            # index the technical metadata sections by the metadata type they
            # wrap; 'OTHER' types are keyed by their OTHERMDTYPE instead
            techmd = {}
            for admid in filexml.get('ADMID').split():
                section = issue_mets.techmd(admid)
                for mdwrap in section.findall('METS:mdWrap', xmlns):
                    mdtype = mdwrap.get('MDTYPE')
                    if mdtype == 'OTHER':
                        techmd[mdwrap.get('OTHERMDTYPE')] = section
                    else:
                        techmd[mdtype] = section

            use = filexml.get('USE')
            href = filexml.find('METS:FLocat', xmlns).get(
                '{http://www.w3.org/1999/xlink}href')
            # the file is looked up by name inside this object's directory
            localpath = os.path.join(self.dir, os.path.basename(href))
            basename = os.path.basename(localpath)
            mimetype = techmd['PREMIS'].find('.//premis:formatName',
                                             xmlns).text

            file_class = FILE_CLASS_FOR[use]
            source = LocalFileSource(localpath, mimetype=mimetype)
            file = file_class.from_source(source, title=f'{basename} ({use})')
            file.use = use
            file.basename = basename
            file.dcmitype = dcmitype.Text

            if mimetype == 'image/tiff':
                # image dimensions and resolution come from the NISOIMG section
                mix = techmd['NISOIMG']
                file.width = mix.find('.//mix:ImageWidth', xmlns).text
                file.height = mix.find('.//mix:ImageLength', xmlns).text
                x_res = int(mix.find('.//mix:XSamplingFrequency', xmlns).text)
                y_res = int(mix.find('.//mix:YSamplingFrequency', xmlns).text)
                file.resolution = (x_res, y_res)
            else:
                file.width = file.height = file.resolution = None

            page.add_file(file)

        page.parse_ocr()

        return page
Exemplo n.º 7
0
    def read_data(self):
        """
        Parse the issue and article METS files into an Issue and its members.

        The issue METS at ``self.path`` supplies the issue metadata and one
        Page per ``np:page`` div; each page is added as a member, given a
        proxy, and its text blocks are appended to the issue's annotations.
        The article METS at ``self.article_path`` supplies one Article member
        per article element. Both METS files are also attached to the issue
        as related metadata files.

        :return: the Issue, which is also stored as ``self.issue``
        :raises DataReadException: if either METS file cannot be read or
            parsed as XML, or if reading the required issue metadata raises
            an AttributeError
        """
        try:
            tree = parse(self.path)
        except OSError:
            raise DataReadException("Unable to read {0}".format(self.path))
        except XMLSyntaxError:
            raise DataReadException(
                "Unable to parse {0} as XML".format(self.path))

        issue_mets = METSResource(tree)
        root = tree.getroot()
        m = XPATHMAP['issue']

        issue = Issue(member_of=self.batch.collection)

        # required metadata: a missing dateIssued element surfaces here as an
        # AttributeError on ``.text``
        try:
            issue.title = root.get('LABEL')
            issue.date = root.find('.//MODS:dateIssued', xmlns).text
            issue.sequence_attr = ('Page', 'number')
        except AttributeError:
            raise DataReadException(
                "Missing metadata in {0}".format(self.path))

        # optional metadata: only set the attribute when the element exists
        for field in ('volume', 'issue', 'edition'):
            element = root.find(m[field])
            if element is not None:
                setattr(issue, field, element.text)

        # attach the issue-level and article-level XML files as related objects
        issue_mets_file = MetadataFile.from_source(
            LocalFileSource(self.path),
            title=f'{issue.title}, issue METS metadata')
        issue.add_related(IssueMetadata(issue_mets_file))

        article_mets_file = MetadataFile.from_source(
            LocalFileSource(self.article_path),
            title=f'{issue.title}, article METS metadata')
        issue.add_related(IssueMetadata(article_mets_file))

        # one Page member, one proxy, and a batch of annotations per page div
        page_divs = issue_mets.xpath(
            'METS:structMap//METS:div[@TYPE="np:page"]')
        for page_div in page_divs:
            page = self.create_page(issue_mets, page_div, issue)
            issue.add_member(page)

            proxy_title = f'Proxy for page {page.number} in {issue.title}'
            issue.append_proxy(page, title=proxy_title)

            # OCR text blocks become annotations on the issue
            issue.annotations.extend(page.textblocks())

        # the article METS describes the articles in this issue
        try:
            article_tree = parse(self.article_path)
        except OSError:
            raise DataReadException(
                "Unable to read {0}".format(self.article_path))
        except XMLSyntaxError:
            raise DataReadException(
                "Unable to parse {0} as XML".format(self.article_path))

        article_root = article_tree.getroot()
        for article_elem in article_root.findall(m['article']):
            article_title = article_elem.get('LABEL')
            # page numbers are the FILEID values with the 'ocrFile' text
            # removed; a set drops the duplicates
            article_pagenums = {
                int(area.get('FILEID').replace('ocrFile', ''))
                for area in article_elem.findall(m['areas'])
            }
            issue.add_member(Article(title=article_title,
                                     issue=issue,
                                     pages=sorted(article_pagenums)))

        self.issue = issue
        return issue
Exemplo n.º 8
0
    def get_items(self, lines, mapping):
        """
        Generate the objects described by ``lines`` according to ``mapping``.

        What is yielded depends on which flagged column the mapping has,
        checked in this order:

        * ``key``: one item per unique value of the key column, in first-seen
          order, with attributes taken from the first line of its group. If
          the key column's configuration has ``members`` or ``files``, that
          nested mapping is processed recursively and the results are
          attached to the item.
        * ``filename``: one file object per filename found in each line.
        * ``dirname``: the entries of each named directory are grouped by
          filename without extension; a name with two ``-``-separated parts
          yields its file objects directly, a name with three parts yields a
          Page holding its files.
        * none of the above: one item per line.

        :param lines: the rows to process; iterated more than once when a key
            column is present
        :param mapping: column configuration keyed by column name
        :return: a generator of item, file, or page objects
        """
        cls = create_class_from_mapping(mapping, self.item_rdf_type)

        key_column = get_flagged_column(mapping, 'key')
        filename_column = get_flagged_column(mapping, 'filename')
        dirname_column = get_flagged_column(mapping, 'dirname')

        if key_column is not None:
            # lines are grouped into subjects by the unique values of the
            # key column, preserving the order in which they first appear
            key_conf = mapping[key_column]
            unique_keys = OrderedDict.fromkeys(
                [row[key_column] for row in lines])
            for key in unique_keys:
                sub_lines = [row for row in lines if row[key_column] == key]
                # item attributes come from the first line of the group
                attrs = {}
                for column in mapping.keys():
                    attrs[column] = get_column_value(
                        sub_lines[0], column, mapping)
                item = cls(**attrs)
                item.path = key
                item.ordered = False
                item.sequence_attr = ('Page', 'number')

                if 'members' in key_conf:
                    # subject with member items; the nested mapping may also
                    # produce files that belong directly to the item
                    members = self.get_items(sub_lines, key_conf['members'])
                    for component in members:
                        if isinstance(component, pcdm.File):
                            item.add_file(component)
                        else:
                            item.add_member(component)
                elif 'files' in key_conf:
                    # subject with file items only
                    files = self.get_items(sub_lines, key_conf['files'])
                    for component in files:
                        item.add_file(component)

                yield item
            return

        if filename_column is not None:
            # this mapping is for file objects
            filename_conf = mapping[filename_column]
            for line in lines:
                raw_value = line.get(filename_column, None)
                if raw_value is None:
                    continue

                if filename_conf.get('multivalued', False):
                    filenames = raw_value.split(filename_conf['separator'])
                else:
                    filenames = [raw_value]

                for filename in filenames:
                    if 'host' in filename_conf:
                        source = RemoteFileSource(filename_conf['host'],
                                                  filename)
                    else:
                        # local file, resolved against this object's file path
                        source = LocalFileSource(
                            os.path.join(self.file_path, filename))

                    file_obj = pcdm.get_file_object(filename, source)
                    for column, conf in mapping.items():
                        set_value(file_obj, column, conf, line)
                    yield file_obj
            return

        if dirname_column is not None:
            # this mapping describes a directory of files that should be
            # subdivided based on filename into member objects
            dirname_conf = mapping[dirname_column]
            for line in lines:
                dirname = line.get(dirname_column, None)
                if dirname is None:
                    continue

                # group directory entries by filename without extension
                members = {}
                directory = os.path.join(self.file_path, dirname)
                for entry in os.scandir(directory):
                    stem = os.path.splitext(entry.name)[0]
                    members.setdefault(stem, []).append(entry)

                for stem, entries in members.items():
                    stem_parts = stem.split('-')
                    if len(stem_parts) == 2:
                        # top-level: yield the files themselves
                        for entry in entries:
                            yield pcdm.get_file_object(
                                entry.name, LocalFileSource(entry.path))

                    elif len(stem_parts) == 3:
                        # part
                        # TODO: this number only makes sense as part of the
                        # folder; should be the title of the proxy object
                        sequence_number = int(stem_parts[2])
                        page = pcdm.Page(number=str(sequence_number),
                                         title=f'Page {sequence_number}')
                        for entry in entries:
                            file_obj = pcdm.get_file_object(
                                entry.name, LocalFileSource(entry.path))
                            for column, conf in mapping.items():
                                set_value(file_obj, column, conf, line)
                            page.add_file(file_obj)
                        yield page
            return

        # no flagged column: each line is its own (implicit) subject
        # for an Item resource
        for line in lines:
            attrs = {}
            for column in mapping.keys():
                attrs[column] = get_column_value(line, column, mapping)
            yield cls(**attrs)