Пример #1
0
class ArchiveSearch(ArchiveTask):
    """
    Run a search with the `ia` command line tool and store the result.

    An archive.org account is required:
    https://archive.org/account/login.createaccount.php

    Running `ia configure` creates the configuration, which looks like this:

        $ cat ~/.config/ia.ini
        [s3]
        access = asudiasd77xsdlds
        secret = oasdu888s8x9a0sd

        [cookies]
        logged-in-user = <account email>
        logged-in-sig = secret

    Refs #8000.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    query = luigi.Parameter(default='collection:prelinger')

    def requires(self):
        """
        Depend on the `ia` executable; the siskin setup should install it
        automatically.
        """
        ia_tool = Executable(name='ia',
                             message='https://pypi.org/project/internetarchive/')
        return ia_tool

    def run(self):
        """
        Redirect the output of `ia search` for the configured query into the
        file returned by shellout, then move that file to the output path.
        """
        result = shellout("ia search '{query}' > {output}", query=self.query)
        found = luigi.LocalTarget(result)
        found.move(self.output().path)

    def output(self):
        """
        Local target; the path is requested with ext 'ldj' and digest=True.
        """
        destination = self.path(ext='ldj', digest=True)
        return luigi.LocalTarget(path=destination)
Пример #2
0
class DegruyterDOIList(DegruyterTask):
    """
    A list of Degruyter DOIs, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the Degruyter intermediate schema and the `jq` executable.
        """
        return {
            'input': DegruyterIntermediateSchema(date=self.date),
            'jq': Executable(name='jq',
                             message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Extract the `doi` field of each record, drop "null" values, keep the
        part starting at "10", then run `sort -u` over the result.
        """
        import os  # local import, only needed for temporary file handling

        # mkstemp hands back an open file descriptor; only the path is used
        # here (the shell redirects into it), so close the descriptor at once.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            shellout(
                """jq -r '.doi' <(unpigz -c {input}) | grep -v "null" | grep -o "10.*" 2> /dev/null > {output} """,
                input=self.input().get('input').path,
                output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted list is an intermediate artifact only.
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
Пример #3
0
Файл: nl.py Проект: zazi/siskin
class NLFetch(NLTask):
    """
    Stream the `fullrecord` field of matching documents from SOLR into a file.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    query = luigi.Parameter(
        default="collection_details:GBV_NL_EBOOK",
        description="to test: id:NLEB006936695 OR id:NLEB006936733")

    def run(self):
        """
        Dump records with solrdump, extract `fullrecord` with jq and turn the
        '#29;', '#30;' and '#31;' placeholders into control characters with
        `replace`. A second pass with sed joins lines again.

        cf. https://github.com/stedolan/jq/issues/787, "Warning: replace is deprecated and will be removed in a future version."

        There's jq -j, but replace cannot run on a single 1GB line.
        """
        solr_server = self.config.get('nl', 'solr')
        dumped = shellout(
            """solrdump -verbose -server {server} -q "{query}" -fl fullrecord | \
                          jq -r '.fullrecord' | \
                          replace '#29;' $(printf "\\x1D") '#30;' $(printf "\\x1E") '#31;' $(printf "\\x1F") > {output} """,
            server=solr_server,
            query=self.query)
        # NOTE(review): the sed script below is not a raw string, so Python
        # expands the \x1d and \x0a escapes before the shell sees them --
        # confirm sed accepts the resulting literal newline in the pattern.
        joined = shellout(
            "sed ':a;N;$!ba;s/\x1d\x0a/\x1d/g' {input} > {output}",
            input=dumped)
        result = luigi.LocalTarget(joined)
        result.move(self.output().path)

    def output(self):
        """
        Local target; the path is requested with ext "mrc" and digest=True.
        """
        destination = self.path(ext="mrc", digest=True)
        return luigi.LocalTarget(path=destination)
Пример #4
0
class CrossrefDOIList(CrossrefTask):
    """
    A list of Crossref DOIs, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the Crossref intermediate schema and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(date=self.date),
            'jq': Executable(name='jq',
                             message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Decompress the input, extract DOIs with jq and grep, then sort and
        deduplicate. Intermediate files are removed, also on failure.
        """
        # mkstemp returns an open descriptor; only the path is needed.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            # process substitution sometimes results in a broken pipe, so extract beforehand
            extracted = shellout("unpigz -c {input} > {output}",
                                 input=self.input().get('input').path)
            try:
                shellout(
                    """jq -r '.doi?' {input} | grep -o "10.*" 2> /dev/null | LC_ALL=C sort -S50% > {output} """,
                    input=extracted,
                    output=stopover)
            finally:
                os.remove(extracted)
            output = shellout("""sort -S50% -u {input} > {output} """,
                              input=stopover)
        finally:
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
Пример #5
0
class DegruyterXML(DegruyterTask):
    """
    Single file version: XML from all matching zip files, concatenated.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    group = luigi.Parameter(default='SSH', description='main subdirectory')
    ts = luigi.Parameter(default=DegruyterTask.TIMESTAMP)

    def requires(self):
        """
        Needs the list of Degruyter file paths.
        """
        return DegruyterPaths(date=self.date)

    @timed
    def run(self):
        """
        Append the XML members of every zip file that belongs to the
        configured group and timestamp to one file.
        """
        import os  # local import, only needed to close the mkstemp descriptor

        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)

        # Path fragments a file must contain to be considered.
        group_marker = '/%s/' % self.group
        archive_marker = '-%s.zip' % self.ts

        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path', )):
                if group_marker not in row.path:
                    continue
                if archive_marker not in row.path:
                    continue
                # Exit codes 1 and 9 are handed to shellout via ignoremap,
                # presumably so they do not abort the task.
                shellout(r"unzip -p {path} \*.xml 2> /dev/null >> {output}",
                         output=stopover,
                         path=row.path,
                         ignoremap={
                             1: 'OK',
                             9: 'skip corrupt file'
                         })
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Local target with ext 'xml'.
        """
        return luigi.LocalTarget(path=self.path(ext='xml'))
Пример #6
0
class IJOCFincSolr(IJOCTask):
    """
    Tag the intermediate schema with span-tag and export it with span-export
    (finc solr schema). Tag with ISIL for FID and change record type.
    """
    format = luigi.Parameter(default='solr5vu3', description='export format')
    isil = luigi.Parameter(default='DE-15-FID', description='isil FID')
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the filter configuration and the intermediate schema file.
        """
        dependencies = {}
        dependencies['config'] = AMSLFilterConfig(date=self.date)
        dependencies['file'] = IJOCIntermediateSchema(date=self.date)
        return dependencies

    def run(self):
        """
        Pipe the decompressed input through span-tag and span-export.
        """
        inputs = self.input()
        filter_config = inputs.get('config').path
        schema_file = inputs.get('file').path
        exported = shellout(
            """span-tag -c {config} <(unpigz -c {input}) | span-export -o {format} -with-fullrecord > {output}""",
            config=filter_config,
            input=schema_file,
            format=self.format)
        luigi.LocalTarget(exported).move(self.output().path)

    def output(self):
        """
        Local target with ext 'fincsolr.ndj'.
        """
        destination = self.path(ext='fincsolr.ndj')
        return luigi.LocalTarget(path=destination)
Пример #7
0
class CrossrefExport(CrossrefTask):
    """
    Tag with ISILs, then export to various formats.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(default='solr5vu3', description='export format')

    def requires(self):
        """
        Needs the intermediate schema file and the filter configuration.
        """
        return {
            'file': CrossrefIntermediateSchema(date=self.date),
            'config': AMSLFilterConfig(date=self.date),
        }

    def run(self):
        """
        Run span-tag, then span-export in the requested format. The tagged
        intermediate file is removed once the export finished or failed.
        """
        import os  # local import, only needed to drop the intermediate file

        tagged = shellout(
            "span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}",
            config=self.input().get('config').path,
            input=self.input().get('file').path)
        try:
            output = shellout(
                "span-export -o {format} <(unpigz -c {input}) | pigz -c > {output}",
                format=self.format,
                input=tagged)
        finally:
            # The tagged file is not needed after the export step.
            os.remove(tagged)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target; the extension depends on the format, with 'gz' as the
        fallback for formats not listed.
        """
        extensions = {
            'solr5vu3': 'ldj.gz',
            'formeta': 'form.gz',
        }
        return luigi.LocalTarget(path=self.path(
            ext=extensions.get(self.format, 'gz')))
Пример #8
0
class DOAJIntermediateSchema(DOAJTask):
    """
    Intermediate schema restricted to lines matching the whitelist.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(
        default="doaj-oai",
        description=
        "kind of source document, doaj-oai (defunkt: doaj, doaj-api)")

    def requires(self):
        """
        Needs the unfiltered ("dirty") intermediate schema and the whitelist.
        """
        dirty = DOAJIntermediateSchemaDirty(date=self.date, format=self.format)
        allowed = DOAJWhitelist(date=self.date)
        return {'data': dirty, 'whitelist': allowed}

    @timed
    def run(self):
        """
        Keep only lines that contain one of the fixed strings listed in the
        whitelist file (grep -F -f), then compress again.
        """
        inputs = self.input()
        filtered = shellout(
            """unpigz -c {input} | LC_ALL=C grep -Ff {whitelist} | pigz -c > {output}""",
            whitelist=inputs.get('whitelist').path,
            input=inputs.get('data').path)
        luigi.LocalTarget(filtered).move(self.output().path)

    def output(self):
        """
        Local target with ext 'ldj.gz'.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination)
Пример #9
0
class DOAJISSNList(DOAJTask):
    """
    A list of DOAJ ISSNs, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the DOAJ intermediate schema and the `jq` executable.
        """
        return {
            'input': DOAJIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='http://git.io/NYpfTw')
        }

    @timed
    def run(self):
        """
        Collect the values of `rft.issn` and `rft.eissn` in one file, then
        run `sort -u` over it.
        """
        import os  # local import, only needed for temporary file handling

        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            for field in ('rft.issn', 'rft.eissn'):
                shellout(
                    """jq -r '.["{field}"][]?' <(unpigz -c {input}) >> {output} """,
                    field=field,
                    input=self.input().get('input').path,
                    output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted list is an intermediate artifact only.
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
Пример #10
0
class MediarepIntermediateSchema(MediarepTask):
    """
    Harvest via metha and convert to a single intermediate schema file.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="https://mediarep.org/oai/request",
                          significant=False)
    prefix = luigi.Parameter(default="dim", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` and `span-import` executables.
        """
        metha = Executable(name='metha-sync',
                           message='https://github.com/miku/metha')
        span = Executable(name='span-import',
                          message='https://github.com/miku/span')
        return [metha, span]

    def run(self):
        """
        Run metha-sync first, then pipe metha-cat through span-import and
        pigz.
        """
        metha_dir = self.config.get('core', 'metha-dir')
        shellout("METHA_DIR={dir} metha-sync -format {prefix} {url}",
                 prefix=self.prefix,
                 url=self.url,
                 dir=metha_dir)
        converted = shellout(
            """METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} |
                             span-import -i mediarep-dim | pigz -c > {output}""",
            prefix=self.prefix,
            url=self.url,
            dir=self.config.get('core', 'metha-dir'))
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with ext "ldj.gz".
        """
        destination = self.path(ext="ldj.gz")
        return luigi.LocalTarget(path=destination)
Пример #11
0
class LyndaIntermediateSchema(LyndaTask):
    """
    Derive the intermediate schema from the file whose path ends in "latest".

    XXX: Workaround SOLR, refs #11477.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of Lynda file paths.
        """
        return LyndaPaths(date=self.date)

    def run(self):
        """
        Use the first path ending in "latest": extract `fullrecord`, delete
        `x.labels` and add `finc.id` as a copy of `finc.record_id`.

        Raises RuntimeError if no such path is listed.
        """
        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path', )):
                if not row.path.endswith("latest"):
                    continue
                converted = shellout(""" gunzip -c {input} |
                                      jq -rc '.fullrecord' |
                                      jq -rc 'del(.["x.labels"])' |
                                      jq -rc '. + {{"finc.id": .["finc.record_id"]}}' | gzip -c > {output} """,
                                     input=row.path)
                luigi.LocalTarget(converted).move(self.output().path)
                # Only the first match is processed.
                break
            else:
                raise RuntimeError("no latest symlink found in folder")

    def output(self):
        """
        Gzip-format local target with ext "ldj.gz".
        """
        destination = self.path(ext="ldj.gz")
        return luigi.LocalTarget(path=destination, format=Gzip)
Пример #12
0
class MediarepMARC(MediarepTask):
    """
    Harvest and convert to MARC.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="https://mediarep.org/oai/request",
                          significant=False)
    prefix = luigi.Parameter(default="oai_dc", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` executable.
        """
        # No trailing comma here: it would wrap the task in a one-tuple.
        return Executable(name='metha-sync',
                          message='https://github.com/miku/metha')

    def run(self):
        """
        Run metha-sync, dump the records with metha-cat and convert the dump
        with the 170_marcbinary.py script. The dump is removed afterwards,
        whether or not the conversion succeeded.
        """
        metha_dir = self.config.get('core', 'metha-dir')
        shellout("METHA_DIR={dir} metha-sync -format {prefix} {url}",
                 prefix=self.prefix,
                 url=self.url,
                 dir=metha_dir)
        data = shellout(
            """METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} > {output}""",
            dir=metha_dir,
            prefix=self.prefix,
            url=self.url)
        try:
            output = shellout("""python {script} {input} {output}""",
                              script=self.assets('170/170_marcbinary.py'),
                              input=data)
        finally:
            # The raw dump is only an intermediate file.
            os.remove(data)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with ext "fincmarc.mrc".
        """
        return luigi.LocalTarget(path=self.path(ext="fincmarc.mrc"))
Пример #13
0
class IMSLPConvert(IMSLPTask):
    """
    Extract and transform.

    TODO, refs #13055 -- see IMSLPDownloadNext and IMSLPConvertNext and IMSLPLegacyMapping.
    """

    date = ClosestDateParameter(default=datetime.date.today())
    debug = luigi.BoolParameter(description='do not delete temporary folder', significant=False)

    def requires(self):
        """
        Needs the downloaded archive.
        """
        return IMSLPDownload(date=self.date)

    def run(self):
        """
        Unpack the archive into a temporary directory and convert it with the
        15_marcbinary.py script. The directory is removed afterwards, also if
        a step fails, unless `debug` is set.
        """
        tempdir = tempfile.mkdtemp(prefix='siskin-')
        try:
            shellout("tar -xzf {archive} -C {tempdir}", archive=self.input().path, tempdir=tempdir)
            output = shellout("python {script} {tempdir} {output} {fieldmap}",
                              script=self.assets('15/15_marcbinary.py'),
                              tempdir=tempdir,
                              fieldmap=self.assets('15/15_fieldmap.json'))
        finally:
            # Clean up on failure as well, so no extracted data is left behind.
            if not self.debug:
                shutil.rmtree(tempdir)
            else:
                self.logger.debug("not deleting temporary folder at %s", tempdir)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with ext 'fincmarc.mrc'.
        """
        return luigi.LocalTarget(path=self.path(ext='fincmarc.mrc'))
Пример #14
0
class IMSLPConvertNext(IMSLPTask):
    """
    Convert the current data with the help of the legacy mapping.

    WIP, refs #12288, refs #13055. May merge with 15_marcbinary.py.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the legacy mapping and the downloaded data.
        """
        dependencies = {}
        dependencies['legacy-mapping'] = IMSLPLegacyMapping()
        dependencies['data'] = IMSLPDownload(date=self.date)
        return dependencies

    def run(self):
        """
        Read the JSON mapping, pass it with the data path to
        imslp_tarball_to_marc and move the result into place.
        """
        inputs = self.input()
        with inputs.get("legacy-mapping").open() as handle:
            legacy = json.load(handle)

        tarball = inputs.get("data").path
        converted = imslp_tarball_to_marc(tarball, legacy_mapping=legacy)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target in the task directory, named after the latest link with
        "tar.gz" replaced by "fincmarc.mrc".
        """
        link_name = os.path.basename(self.latest_link())
        marc_name = link_name.replace("tar.gz", "fincmarc.mrc")
        return luigi.LocalTarget(path=os.path.join(self.taskdir(), marc_name))
Пример #15
0
class SpringerCleanup(SpringerTask):
    """
    Adjust fields of the total_tpu.ldj.gz dump.

    2017-11-28: finc.mega_collection is now multi-valued; AIAccessFacet remains.
    2017-12-12: new finc.id, refs #11821, #11960, #11961.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of Springer file paths.
        """
        return SpringerPaths(date=self.date)

    def run(self):
        """
        Take the first listed path ending in "total_tpu.ldj.gz", delete
        `finc.AIRecordType` and `AIAccessFacet`, and set `finc.record_id`,
        `finc.format` and `url`.

        Raises RuntimeError if no such path is listed.
        """
        template = """
            unpigz -c {input} | jq -rc 'del(.["finc.AIRecordType"]) | del(.["AIAccessFacet"])' |
            jq -c '. + {{ "finc.record_id": .doi, "finc.format": "ElectronicArticle", "url": ["https://doi.org/" + .doi] }}' | pigz -c > {output}
        """
        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path', )):
                if row.path.endswith("total_tpu.ldj.gz"):
                    dump_path = row.path
                    break
            else:
                raise RuntimeError(
                    'FTP site does not contain total_tpu.ldj.gz')
        cleaned = shellout(template, input=dump_path)
        luigi.LocalTarget(cleaned).move(self.output().path)

    def output(self):
        """
        Gzip-format local target with ext 'ldj.gz'.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination, format=Gzip)
Пример #16
0
class CambridgeDropbox(CambridgeTask):
    """
    Mirror the dropbox content with rsync and list the mirrored files.
    In Dec '18 about 10K zips.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the `rsync` executable.
        """
        return Executable('rsync', message='https://rsync.samba.org/')

    def run(self):
        """
        Sync the configured source into the 'mirror' folder below the task
        directory, then write one path per mirrored file to the output.
        """
        mirror_dir = os.path.join(self.taskdir(), 'mirror')
        options = self.config.get('cambridge', 'rsync-options', fallback='-avzP')
        source = self.config.get('cambridge', 'scp-src')
        shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}",
                 rsync_options=options,
                 src=source,
                 target=mirror_dir)

        task_dir = self.taskdir()
        if not os.path.exists(task_dir):
            os.makedirs(task_dir)

        with self.output().open('w') as filelist:
            for path in iterfiles(mirror_dir):
                filelist.write_tsv(path)

    def output(self):
        """
        TSV-format local target with ext 'filelist'.
        """
        destination = self.path(ext='filelist')
        return luigi.LocalTarget(path=destination, format=TSV)
Пример #17
0
class DDNLPaths(DDNLTask):
    """
    Mirror the configured FTP location; the output is the mirror's file list.
    """

    date = ClosestDateParameter(default=datetime.date.today())
    max_retries = luigi.IntParameter(default=10, significant=False)
    timeout = luigi.IntParameter(default=20,
                                 significant=False,
                                 description='timeout in seconds')

    def requires(self):
        """
        Needs an FTP mirror set up from the 'ddnl' configuration section.
        """
        section = 'ddnl'
        host = self.config.get(section, 'ftp-host')
        base = self.config.get(section, 'ftp-base')
        username = self.config.get(section, 'ftp-username')
        password = self.config.get(section, 'ftp-password')
        pattern = self.config.get(section, 'ftp-pattern')
        return FTPMirror(host=host,
                         base=base,
                         username=username,
                         password=password,
                         pattern=pattern,
                         max_retries=self.max_retries,
                         timeout=self.timeout)

    def run(self):
        """
        Move the mirror's output to this task's output path.
        """
        mirrored = self.input()
        mirrored.move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        destination = self.path()
        return luigi.LocalTarget(path=destination, format=TSV)
Пример #18
0
class VKFilmFFMARC(VKFilmFFTask):
    """
    Locate the MARC XML for the closest date, decompress and clean it (remove
    "Nichtsortierzeichen" on the fly), convert to binary MARC and run the
    119_marcbinary.py script on it.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of file paths.
        """
        return VKFilmFFPaths(date=self.date)

    def run(self):
        """
        Process the first listed path that ends with the expected filename.

        Raises RuntimeError if no path matches.
        """
        with self.input().open() as handle:
            wanted = 'film_theater_marc_%s.xml.gz' % (self.closest().strftime("%Y%m%d"))
            for row in handle.iter_tsv(cols=('path', )):
                if row.path.endswith(wanted):
                    # NOTE(review): not a raw string, so Python expands the
                    # \xC2\x98 and \xC2\x9C escapes before the shell sees them;
                    # confirm this still matches the intended byte sequences.
                    cleaned = shellout("unpigz -c {file} | sed 's/\xC2\x98//g;s/\xC2\x9C//g' > {output}", file=row.path)
                    binary = shellout("yaz-marcdump -i marcxml -o marc {input} > {output}", input=cleaned)
                    converted = shellout("python {script} {input} {output}",
                                         script=self.assets("119/119_marcbinary.py"),
                                         input=binary)
                    luigi.LocalTarget(converted).move(self.output().path)
                    break
            else:
                raise RuntimeError('not found: %s' % wanted)

    def output(self):
        """
        Local target with ext 'fincmarc.mrc'.
        """
        destination = self.path(ext='fincmarc.mrc')
        return luigi.LocalTarget(path=destination)
Пример #19
0
class CrossrefRawItems(CrossrefTask):
    """
    Concatenate all harvested items.
    """
    begin = luigi.DateParameter(default=datetime.date(2006, 1, 1))
    date = ClosestDateParameter(default=datetime.date.today())
    update = luigi.Parameter(default='months',
                             description='days, weeks or months')

    def requires(self):
        """
        One CrossrefChunkItems task per pair of consecutive dates between
        `begin` and `date`.

        Raises RuntimeError if `update` is not days, weeks or months.
        """
        if self.update not in ('days', 'weeks', 'months'):
            raise RuntimeError('update can only be: days, weeks or months')
        dates = list(date_range(self.begin, self.date, 1, self.update))
        # Pair each date with its successor: (d0, d1), (d1, d2), ...
        return [
            CrossrefChunkItems(begin=begin, end=end)
            for begin, end in zip(dates, dates[1:])
        ]

    def run(self):
        """
        Append every input file to one file and move it to the output path.
        """
        import os  # local import, only needed to close the mkstemp descriptor

        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        for target in self.input():
            shellout("cat {input} >> {output}",
                     input=target.path,
                     output=stopover)
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Gzip-format local target with ext 'ldj.gz'.
        """
        return luigi.LocalTarget(path=self.path(ext='ldj.gz'), format=Gzip)
Пример #20
0
class PQDTCombine(PQDTTask):
    """
    Harvest with metha and combine all records into one compressed file.
    """

    date = ClosestDateParameter(default=datetime.date.today())
    prefix = luigi.Parameter(default="oai_dc")

    def requires(self):
        """
        Needs the `metha-sync` executable.
        """
        metha = Executable(name='metha-sync',
                           message='https://github.com/miku/metha')
        return metha

    def run(self):
        """
        Run metha-sync against the configured endpoint, then compress the
        metha-cat output.
        """
        endpoint = self.config.get('pqdt', 'oai')
        metha_dir = self.config.get('core', 'metha-dir')
        shellout("METHA_DIR={dir} metha-sync -format {prefix} {url}",
                 prefix=self.prefix,
                 url=endpoint,
                 dir=metha_dir)
        combined = shellout(
            "METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} | pigz -c > {output}",
            prefix=self.prefix,
            url=endpoint,
            dir=self.config.get('core', 'metha-dir'))
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target with ext "xml.gz".
        """
        destination = self.path(ext="xml.gz")
        return luigi.LocalTarget(path=destination)
Пример #21
0
class CrossrefCollectionsCount(CrossrefTask):
    """
    Report collections and the number of titles per collection.
    """
    begin = luigi.DateParameter(default=datetime.date(2006, 1, 1))
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the Crossref intermediate schema and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(begin=self.begin,
                                                date=self.date),
            'jq': Executable(name='jq',
                             message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Write a JSON object mapping each `finc.mega_collection` value to the
        number of times it occurs.
        """
        import os  # local import, only needed to drop the intermediate file

        sorted_names = shellout(
            """jq -rc '.["finc.mega_collection"][]?' <(unpigz -c {input}) | LC_ALL=C sort -S35% > {output}""",
            input=self.input().get('input').path)

        groups = {}  # Map collection name to its size.
        try:
            with open(sorted_names) as handle:
                # The input is sorted, so equal lines are adjacent.
                for k, g in itertools.groupby(handle):
                    # Count without building a list of all lines in the group.
                    groups[k.strip()] = sum(1 for _ in g)
        finally:
            # The sorted list is an intermediate artifact only.
            os.remove(sorted_names)

        with self.output().open('w') as output:
            json.dump(groups, output)

    def output(self):
        """
        Local target with ext 'json'.
        """
        return luigi.LocalTarget(path=self.path(ext='json'))
Пример #22
0
class DataciteCombine(DataciteTask):
    """
    Harvest with metha and dump all records into one compressed file.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="http://oai.datacite.org/oai",
                          significant=False)
    prefix = luigi.Parameter(default="oai_dc", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` executable.
        """
        metha = Executable(name='metha-sync',
                           message='https://github.com/miku/metha')
        return metha

    def run(self):
        """
        Run metha-sync, then compress the metha-cat output.
        """
        metha_dir = self.config.get('core', 'metha-dir')
        shellout("METHA_DIR={dir} metha-sync -format {prefix} {url}",
                 prefix=self.prefix,
                 url=self.url,
                 dir=metha_dir)
        combined = shellout(
            "METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} | pigz -c > {output}",
            prefix=self.prefix,
            url=self.url,
            dir=self.config.get('core', 'metha-dir'))
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target with ext "xml.gz".
        """
        destination = self.path(ext="xml.gz")
        return luigi.LocalTarget(path=destination)
Пример #23
0
class CrossrefDOIAndISSNList(CrossrefTask):
    """
    A list of Crossref DOIs with their ISSNs.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the Crossref intermediate schema and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(date=self.date),
            'jq': Executable(name='jq',
                             message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Decompress the input, then emit one sorted CSV row per record with
        the DOI followed by the `rft.issn` and `rft.eissn` values.
        """
        # mkstemp returns an open descriptor; only the path is needed.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        temp = shellout("unpigz -c {input} > {output}",
                        input=self.input().get('input').path)
        try:
            output = shellout(
                """jq -r '[.doi?, .["rft.issn"][]?, .["rft.eissn"][]?] | @csv' {input} | LC_ALL=C sort -S50% > {output} """,
                input=temp,
                output=stopover)
        finally:
            # Drop the decompressed copy even if jq or sort fails.
            os.remove(temp)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with ext 'csv'.
        """
        return luigi.LocalTarget(path=self.path(ext='csv'))
Пример #24
0
class GeniosIntermediateSchema(GeniosTask):
    """
    Intermediate schema for one kind of Genios data. May be incomplete, since
    the database mapping is derived from dozens of XLS sheets and manual
    guesses.

    Related: "Neue Quellen bzw. Austausch", Mon, Dec 5, 2016 at 12:23 PM, ba54ea7d396a41a2a1281f51bba5d33f
    See also: #9534.
    """
    kind = luigi.Parameter(default='fachzeitschriften',
                           description='or: ebooks, literaturnachweise_...')
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the latest Genios file of the given kind.
        """
        return GeniosLatest(kind=self.kind, date=self.date)

    def run(self):
        """
        Create the task directory if it is missing, then run span-import over
        the decompressed input and compress the result.
        """
        task_dir = self.taskdir()
        if not os.path.exists(task_dir):
            os.makedirs(task_dir)
        source = self.input().path
        converted = shellout(
            "span-import -i genios <(unpigz -c {input}) | pigz -c >> {output}",
            input=source)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with ext 'ldj.gz'.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination)
Пример #25
0
class DegruyterISSNList(DegruyterTask):
    """
    List of ISSNs, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the Degruyter intermediate schema.
        """
        return DegruyterIntermediateSchema(date=self.date)

    @timed
    def run(self):
        """
        Collect the values of `rft.issn` and `rft.eissn` in one file, then
        run `sort -u` over it.
        """
        import os  # local import, only needed for temporary file handling

        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            for field in ('rft.issn', 'rft.eissn'):
                shellout(
                    """jq -r '.["{field}"][]?' <(unpigz -c {input}) 2> /dev/null >> {output} """,
                    field=field,
                    input=self.input().path,
                    output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted list is an intermediate artifact only.
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
Пример #26
0
class GeniosISSNList(GeniosTask):
    """
    A list of Genios ISSNs, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the combined Genios intermediate schema and the `jq` executable.
        """
        return {
            'input': GeniosCombinedIntermediateSchema(date=self.date),
            'jq': Executable(name='jq',
                             message='https://github.com/stedolan/jq')
        }

    def run(self):
        """
        Collect the values of `rft.issn` and `rft.eissn` in one file, then
        run `sort -u` over it.
        """
        import os  # local import, only needed for temporary file handling

        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            for field in ('rft.issn', 'rft.eissn'):
                shellout(
                    """jq -c -r '.["{field}"][]?' <(unpigz -c {input}) >> {output} """,
                    field=field,
                    input=self.input().get('input').path,
                    output=stopover)
            output = shellout("""LC_ALL=C sort -S35% -u {input} > {output} """,
                              input=stopover)
        finally:
            # The unsorted list is an intermediate artifact only.
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target in TSV format.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
Пример #27
0
class DegruyterPaths(DegruyterTask):
    """
    File list of the Degruyter FTP mirror.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs an FTP mirror set up from the 'degruyter' configuration
        section; the exclude glob falls back to an empty string.
        """
        section = 'degruyter'
        return FTPMirror(host=self.config.get(section, 'ftp-host'),
                         username=self.config.get(section, 'ftp-username'),
                         password=self.config.get(section, 'ftp-password'),
                         base=self.config.get(section, 'ftp-path'),
                         pattern=self.config.get(section, 'ftp-pattern'),
                         exclude_glob=self.config.get(section,
                                                      'ftp-exclude-glob',
                                                      fallback=''))

    @timed
    def run(self):
        """
        Move the mirror's output to this task's output path.
        """
        mirrored = self.input()
        mirrored.move(self.output().path)

    def output(self):
        """
        TSV-format local target with ext "filelist".
        """
        destination = self.path(ext="filelist")
        return luigi.LocalTarget(path=destination, format=TSV)
Пример #28
0
class MarburgCombine(MarburgTask):
    """
    Harvest a given set with metha and write it into a single file.

    NLM format has been discontinued as of 2018-01-01, refs #5486. Using datacite.
    """

    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(default='datacite')
    set = luigi.Parameter(default='issn:2196-4270')

    def run(self):
        """
        Run metha-sync for the set and format, then redirect the metha-cat
        output into a file and move it to the output path.
        """
        # Both commands share the same set, format and endpoint.
        harvest = {
            'set': self.set,
            'format': self.format,
            'endpoint': "http://archiv.ub.uni-marburg.de/ubfind/OAI/Server",
        }
        shellout("metha-sync -set {set} -format {format} {endpoint}", **harvest)
        combined = shellout(
            "metha-cat -set {set} -format {format} {endpoint} > {output}",
            **harvest)
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target; the path is requested with ext "xml" and digest=True.
        """
        destination = self.path(ext="xml", digest=True)
        return luigi.LocalTarget(destination)
Пример #29
0
class B3KatDownload(B3KatTask):
    """
    Download snapshot. Output is a single (large) MARC binary file. Typically
    the downloads are provided in May and November.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of download links.
        """
        return B3KatLinks(date=self.date)

    def run(self):
        """
        Download every listed URL, convert it from MARC XML to binary MARC
        and append the result to one file.
        """
        # mkstemp returns an open descriptor; the shell appends via the path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('url', )):
                downloaded = shellout("""curl -sL --fail "{url}" > {output} """, url=row.url)
                output = shellout("""yaz-marcdump -i marcxml -o marc "{input}" >> {stopover}""",
                                  input=downloaded,
                                  stopover=stopover)
                # Remove each file on its own, so a failure on the first does
                # not leave the second behind; errors are logged, not raised.
                for path in (downloaded, output):
                    try:
                        os.remove(path)
                    except OSError as err:
                        self.logger.error(err)

        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Local target with ext 'mrc'.
        """
        return luigi.LocalTarget(path=self.path(ext='mrc'))
Пример #30
0
class ArchiveMARC(ArchiveTask):
    """
    Convert search metadata to MARC. The collections are hard-coded,
    currently.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        One ArchiveSearchMetadata task per hard-coded collection query.
        """
        collections = (
            'prelinger',
            'classic_cartoons',
            'feature_films',
            'more_animation',
            'vintage_cartoons',
        )
        tasks = []
        for name in collections:
            tasks.append(ArchiveSearchMetadata(date=self.date, query='collection:' + name))
        return tasks

    def run(self):
        """
        Pass all input paths, separated by spaces, to the 153_marcbinary.py
        script and move the result to the output path.
        """
        joined_paths = ' '.join(target.path for target in self.input())
        converted = shellout("python {script} {output} {inputs}",
                             script=self.assets("153/153_marcbinary.py"),
                             inputs=joined_paths)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with ext 'fincmarc.mrc'.
        """
        destination = self.path(ext='fincmarc.mrc')
        return luigi.LocalTarget(path=destination)