def run(self):
    """
    Concatenate all input files into a single output file.

    Every input target is appended, in iteration order, to one temporary
    file, which is then moved to the output path.
    """
    # mkstemp would hand back an open file descriptor that has to be closed
    # by the caller. NamedTemporaryFile closes its handle when the `with`
    # block ends, and delete=False keeps the (empty) file for `cat` to
    # append to.
    with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as handle:
        tmpfile = handle.name
    for target in self.input():
        shellout("cat {input} >> {output}", input=target.path, output=tmpfile)
    luigi.File(tmpfile).move(self.output().path)
def run(self):
    """
    Write the path of one crossref file to the output.

    Only the first `*.ldj.gz` file found below the `crossref` directory is
    listed, so it is faster. To use more files, drop the `head -1`.
    """
    crossref_dir = os.path.join(self.inputdir(), 'crossref')
    command = "find {directory} -name '*.ldj.gz' | head -1 > {output}"
    listing = shellout(command, directory=crossref_dir)
    luigi.File(listing).move(self.output().path)
def run(self):
    """
    TODO: Concatenate all input files.

    As it stands, an empty temporary file is moved to the output path.
    """
    # NamedTemporaryFile closes its handle when the `with` block ends
    # (mkstemp would leave an open file descriptor behind); delete=False
    # keeps the file on disk so it can be filled and moved afterwards.
    with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as handle:
        tmpfile = handle.name
    # TODO: loop over inputs and run `cat`
    luigi.File(tmpfile).move(self.output().path)
def run(self):
    """
    Write one (id, term) TSV row per subject term found in the dump.

    `marctotsv` is asked for field 001 and 653.a of the `dump` input. The
    second column is split on "|" and each piece again on "--"; every
    resulting term is stripped of surrounding whitespace and written next
    to the record id.
    """
    extracted = shellout('marctotsv -k -s "|" {input} 001 653.a > {output}',
                         input=self.input().get('dump').path)
    source = luigi.File(extracted, format=TSV)
    with source.open() as reader, self.output().open('w') as writer:
        for row in reader.iter_tsv(cols=('id', 'terms')):
            for subfield in row.terms.split('|'):
                for term in subfield.split('--'):
                    writer.write_tsv(row.id, term.strip())
def run(self):
    """
    TODO: For each file, we want to run a jq command.

    As it stands, the input is opened but not read, and an empty temporary
    file is moved to the output path.
    """
    # NamedTemporaryFile closes its handle when the `with` block ends
    # (mkstemp would leave an open file descriptor behind); delete=False
    # keeps the file on disk so it can be filled and moved afterwards.
    with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as tmp:
        temp = tmp.name
    with self.input().open() as handle:
        # TODO: insert code here
        pass
    luigi.File(temp).move(self.output().path)
def run(self):
    """
    Extract the items of every listed file into one compressed file.

    The input is read as a list of file paths, one per line. For each path,
    `jq` emits `.message.items[]` of the decompressed file; the result is
    compressed and appended to a temporary file, which is finally moved to
    the output path.

    The command uses `<(...)` process substitution, so it needs a shell
    that supports it (e.g. bash).
    """
    # NamedTemporaryFile closes its handle when the `with` block ends
    # (mkstemp would leave an open file descriptor behind); delete=False
    # keeps the file on disk for the pipeline to append to.
    with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as tmp:
        temp = tmp.name
    with self.input().open() as handle:
        for line in handle:
            path = line.strip()
            if not path:
                # A blank line would run the pipeline without an input file.
                continue
            print('processing: %s' % path)
            shellout(
                "jq -r -c '.message.items[]' <(unpigz -c {input}) | pigz -c >> {output}",
                input=path, output=temp)
    luigi.File(temp).move(self.output().path)
def run(self):
    """
    Fetch `self.filepath` from `self.host` with lftp.

    Username, password and file path are shell-quoted before they are
    substituted into the command; the host is passed through as is. The
    downloaded file is moved to the output path.
    """
    template = """lftp -u {username},{password} -e "set net:max-retries 5; set net:timeout 10; get -c {filepath} -o {output}; exit" {host}"""
    params = {
        'host': self.host,
        'username': pipes.quote(self.username),
        'password': pipes.quote(self.password),
        'filepath': pipes.quote(self.filepath),
    }
    downloaded = shellout(template, **params)
    luigi.File(downloaded).move(self.output().path)
def shellout(template, **kwargs):
    """
    Takes a shell command template and executes it. The template must use
    the new (2.6+) format mini language. `kwargs` must contain any defined
    placeholder, only `output` is optional: if it is missing, a temporary
    target is created and its path is substituted for `{output}`.

    Runs of spaces in the formatted command are collapsed into a single
    space, unless `preserve_spaces=True` is passed.

    Returns a `luigi.File` for the file the command was told to write to:
    the temporary target, or the caller-supplied `output`.

    Raises RuntimeError on nonzero exit codes.

    Simple template:

        wc -l < {input} > {output}

    Quoted curly braces:

        ps ax|awk '{{print $1}}' > {output}

    Usage with luigi:

        ...
        tmp = shellout('wc -l < {input} > {output}', input=self.input().fn)
        tmp.move(self.output().fn)
        ....
    """
    preserve_spaces = kwargs.get('preserve_spaces', False)

    # Only fall back to a temporary target when the caller did not name an
    # output; remember which case applies so the right target is returned.
    stopover = None
    if 'output' not in kwargs:
        stopover = luigi.File(is_tmp=True)
        kwargs['output'] = stopover.fn

    command = template.format(**kwargs)
    if not preserve_spaces:
        command = re.sub(' +', ' ', command)

    # With shell=True the command is handed to the shell as one string.
    code = subprocess.call(command, shell=True)
    if code != 0:
        raise RuntimeError('%s exitcode: %s' % (command, code))

    if stopover is not None:
        return stopover
    return luigi.File(kwargs['output'])
def run(self):
    """
    Fetch `GND.rdf.gz` from datendienst.dnb.de with wget.

    The request parameters are URL-encoded into the query string; the
    downloaded file is moved to the output path.
    """
    query = urllib.urlencode({
        'cmd': 'fetch',
        'userID': 'opendata',
        'pass': '******',
        'mabheft': 'GND.rdf.gz'
    })
    url = "http://{server}{path}?{params}".format(
        server="datendienst.dnb.de", path="/cgi-bin/mabit.pl", params=query)
    downloaded = shellout("""wget --retry-connrefused "{url}" -O {output}""",
                          url=url)
    luigi.File(downloaded).move(self.output().fn)
def run(self):
    """
    Load the input into a `gnd` table, one row per record.

    The input is split into groups of consecutive non-blank lines. If the
    first line of a group carries an `rdf:about="http://d-nb.info/gnd/..."`
    attribute, the captured id and the stripped lines of the group (joined
    with newlines) are inserted as a row. Other groups are skipped. The
    database is built at a temporary path and moved to the output path.
    """
    stopover = random_tmp_path()
    pattern = re.compile(
        """rdf:about="http://d-nb.info/gnd/([0-9X-]+)">""")

    with dbopen(stopover) as cursor:
        cursor.execute("""CREATE TABLE gnd (id text PRIMARY KEY, content blob)""")
        cursor.execute("""CREATE INDEX IF NOT EXISTS idx_gnd_id ON gnd (id)""")

        with self.input().open() as handle:
            # Whitespace-only lines separate the records.
            for is_blank, group in itertools.groupby(handle, key=str.isspace):
                if is_blank:
                    continue
                record = [line.strip() for line in group]
                found = pattern.search(record[0])
                if found is None:
                    continue
                cursor.execute("INSERT INTO gnd VALUES (?, ?)",
                               (found.group(1), '\n'.join(record)))

    luigi.File(path=stopover).move(self.output().fn)
def run(self):
    """
    Load (id, successor) pairs from the input into a `successor` table.

    Each input line is split on whitespace into exactly two values. Pairs
    whose two values are equal are skipped. Indexes on both columns are
    created after all rows have been inserted. The database is built at a
    temporary path and moved to the output path.
    """
    stopover = random_tmp_path()
    with self.input().open() as handle, dbopen(stopover) as cursor:
        cursor.execute(
            """CREATE TABLE IF NOT EXISTS successor (id text, successor text, PRIMARY KEY (id, successor))""")
        for line in handle:
            ident, successor = line.strip().split()
            if ident != successor:
                cursor.execute("INSERT INTO successor VALUES (?, ?)",
                               (ident, successor))
        for statement in (
                """CREATE INDEX IF NOT EXISTS idx_successor_id ON successor (id)""",
                """CREATE INDEX IF NOT EXISTS idx_successor_successor ON successor (successor)"""):
            cursor.execute(statement)
    luigi.File(stopover).move(self.output().fn)
def run(self):
    """
    Copy the fixture file to the output path, so we have some output.
    """
    fixture = luigi.File(path=self.fixture)
    fixture.copy(self.output().path)
def run(self):
    """
    Download `self.url` with a quiet wget and move it to the output path.
    """
    command = 'wget -q "{url}" -O {output}'
    downloaded = shellout(command, url=self.url)
    luigi.File(downloaded).move(self.output().path)
def run(self):
    """
    TODO: convert input to intermediate schema via span-import.

    Until the conversion is implemented there is no converted file to move
    to the output path, so the task fails with NotImplementedError.
    """
    output = None  # TODO: set this to the path of the converted file
    if output is None:
        # Fail with a clear message rather than on an undefined name.
        raise NotImplementedError(
            'TODO: convert input to intermediate schema via span-import')
    luigi.File(output).move(self.output().path)
def run(self):
    """
    Download the VIAF links file (2013-10-14) with wget.

    The downloaded file is moved to the output path.
    """
    command = """wget --retry-connrefused {url} -O {output}"""
    downloaded = shellout(
        command, url="http://viaf.org/viaf/data/viaf-20131014-links.txt.gz")
    luigi.File(downloaded).move(self.output().fn)
def run(self):
    """
    Decompress the gzipped input and move the result to the output path.
    """
    source = self.input().fn
    decompressed = shellout("gunzip -c {input} > {output}", input=source)
    luigi.File(decompressed).move(self.output().fn)
def run(self):
    """
    Download `catalog.marc.bz2` with wget and decompress it with bunzip2.

    The decompressed catalog is moved to the output path.
    """
    url = "http://gutenberg.readingroo.ms/cache/generated/feeds/catalog.marc.bz2"
    compressed = shellout('wget -q "{url}" -O {output}', url=url)
    catalog = shellout('bunzip2 {input} -c > {output}', input=compressed)
    luigi.File(catalog).move(self.output().path)
def run(self):
    """
    Run `span-import` (crossref input format) over the decompressed input.

    The result is compressed with pigz and moved to the output path. The
    command uses `<(...)` process substitution, so it needs a shell that
    supports it (e.g. bash).
    """
    command = "span-import -w 2 -i crossref <(unpigz -c {input}) | pigz -c > {output}"
    converted = shellout(command, input=self.input().path)
    luigi.File(converted).move(self.output().path)
def run(self):
    """
    Run `span-export` over the decompressed input.

    The result is compressed with pigz and moved to the output path. The
    command uses `<(...)` process substitution, so it needs a shell that
    supports it (e.g. bash).
    """
    command = "span-export <(unpigz -c {input}) | pigz -c > {output}"
    exported = shellout(command, input=self.input().path)
    luigi.File(exported).move(self.output().path)
def run(self):
    """
    Count the lines of the input with `wc -l`.

    The count is written to a temporary file, which is moved to the
    output path.
    """
    source = self.input().fn
    counted = shellout("wc -l < {input} > {output}", input=source)
    luigi.File(counted).move(self.output().fn)
def run(self):
    """
    Simulate touch.

    Opens the output path for writing and writes nothing to it.
    """
    # The `with` block guarantees the handle is closed right away instead
    # of leaving that to the garbage collector.
    with luigi.File(path=self.output().path).open('w'):
        pass
def run(self):
    """
    Run `pagerank` on the `data` input and move its output to the
    output path.
    """
    data = self.input().get('data').fn
    ranked = shellout("pagerank {input} > {output}", input=data)
    luigi.File(ranked).move(self.output().fn)
def run(self):
    """
    Run `span-tag` with the `config` input over the decompressed `input`.

    The result is compressed with pigz and moved to the output path. The
    command uses `<(...)` process substitution, so it needs a shell that
    supports it (e.g. bash).
    """
    command = 'span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}'
    tagged = shellout(command,
                      config=self.input().get('config').path,
                      input=self.input().get('input').path)
    luigi.File(tagged).move(self.output().path)
def run(self):
    """
    Count how often each value occurs, most frequent first.

    The first tab-separated column of the input is dropped; the remaining
    part of each line is counted with `sort | uniq -c` and the counts are
    sorted numerically in descending order.
    """
    command = "cut -f 2- {input}| sort | uniq -c | sort -nr > {output}"
    counted = shellout(command, input=self.input().path)
    luigi.File(counted).move(self.output().path)