def run(self):
    """Copy the MARC regions of the wanted ids, date by date, into the output.

    Column 2 of the input file is reduced to its distinct values, each of
    which is parsed as a ``YYYY-MM-DD`` date. For every date the
    ``SWBOpenDataMarc`` and ``SWBOpenDataSeekMapDB`` tasks are built with a
    local scheduler; the ids belonging to that date are then looked up in
    the seekmap database in batches of ``self.limit`` and the returned
    ``(offset, length)`` regions are copied from the MARC file to the output.

    Raises:
        ValueError: if ``self.limit`` is zero, or a date is not made of
            integers separated by ``-``.
    """
    datefile = shellout("cut -f2 {input} | LANG=C sort | LANG=C uniq > {output}",
                        input=self.input().path)
    with open(datefile) as handle:
        dates = [line.strip() for line in handle]

    batch_size = self.limit
    with self.output().open('w') as output:
        for date in dates:
            dateobj = datetime.date(*map(int, date.split('-')))
            marc = SWBOpenDataMarc(date=dateobj)
            sdb = SWBOpenDataSeekMapDB(date=dateobj)
            luigi.build([marc, sdb], local_scheduler=True)
            with open(marc.output().path) as handle:
                with sqlite3db(sdb.output().path) as cursor:
                    # NOTE(review): `df` is not defined inside this method;
                    # presumably a DataFrame with `id` and `date` columns
                    # from an enclosing scope -- confirm.
                    idset = df[df.date == date].id.values.tolist()
                    # Walk over explicit slices of the id list. Stopping at
                    # the first batch without matches would silently skip
                    # all later batches, and would also require a trailing
                    # query with an empty IN () list to terminate.
                    for start in range(0, len(idset), batch_size):
                        batch = idset[start:start + batch_size]
                        cursor.execute(""" SELECT offset, length FROM seekmap WHERE id IN (%s)""" % (
                            ','.join("'%s'" % ident for ident in batch)))
                        rows = cursor.fetchall()
                        if rows:
                            copyregions(handle, output, rows)
def run(self):
    """Copy the seekmap regions of all surface ids from 'file' to the output.

    Each row of the 'surface' input (columns ``id`` and ``date``) is looked
    up by id in the 'seekmap' database. The fetched rows are collected in
    input order and handed to ``copyregions`` in a single call.
    """
    lookup = "SELECT offset, length FROM seekmap where id = ?"
    with self.input().get('surface').open() as surface:
        with self.output().open('w') as sink:
            with self.input().get('file').open() as source:
                with sqlite3db(self.input().get('seekmap').path) as cur:

                    def region_for(record):
                        # The result of fetchone() is passed on unchanged.
                        cur.execute(lookup, (record.id,))
                        return cur.fetchone()

                    spans = [region_for(record)
                             for record in surface.iter_tsv(cols=('id', 'date'))]
                    copyregions(source, sink, spans)
def test_copyregions(self):
    """copyregions copies exactly the requested (offset, length) slice.

    Writes ``0123456789`` to a temporary source file, copies the region
    ``(3, 2)`` into a second temporary file and expects it to contain ``34``.
    """
    import os  # local import: only needed for temp file cleanup

    created = []
    try:
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            created.append(handle.name)
            handle.write('0123456789\n')
        with open(handle.name) as src:
            with tempfile.NamedTemporaryFile(delete=False) as dst:
                created.append(dst.name)
                copyregions(src, dst, [(3, 2)])
        with open(dst.name) as handle:
            # assertEquals is a deprecated alias of assertEqual.
            self.assertEqual("34", handle.read())
    finally:
        # delete=False leaves the files behind, so remove them explicitly.
        for path in created:
            if os.path.exists(path):
                os.remove(path)
def run(self):
    """Append, for every date in the input, the listed regions to the output.

    The distinct values of column 3 of the input are read as ``YYYY-MM-DD``
    dates. For each one the ``NEPCombine`` task of that date is built with a
    local scheduler, the input lines matching the date are cut down to
    columns 4 and 5 (read as offset and length), and those regions are
    copied from the task's output into this task's output.
    """

    def as_regions(rows):
        # Lazily turn (offset, length) string pairs into integer pairs.
        for offset, length in rows:
            yield (int(offset), int(length))

    datefile = shellout("cut -f3 {input} | LANG=C sort | LANG=C uniq > {output}",
                        input=self.input().path)
    with open(datefile) as listing:
        days = [string.strip(line) for line in listing.readlines()]

    with self.output().open('w') as sink:
        for day in days:
            parts = [int(v) for v in day.split('-')]
            combine = NEPCombine(date=datetime.date(*parts))
            luigi.build([combine], local_scheduler=True)
            with combine.output().open() as blob:
                regionfile = shellout(
                    """LANG=C grep "{date}" "{input}" | cut -f 4,5 > {output}""",
                    date=str(day), input=self.input().path)
                with luigi.File(regionfile, format=TSV).open() as table:
                    copyregions(blob, sink, as_regions(table.iter_tsv()))
def run(self):
    """Extract the zipped MARC records, drop one record, convert to UTF-8.

    Steps:
      1. Concatenate all ``*.mrc`` members of the input zip into one file.
      2. Run ``marcmap`` on it, filter out the line starting with
         ``ebr10661760`` and keep columns 2 and 3 as ``offset:length``.
      3. Copy the remaining regions into a temporary file.
      4. Run ``yaz-marcdump`` on the temporary file and move its result to
         this task's output path.

    Raises:
        ValueError: if a marcmap line does not yield exactly two integers.
    """
    import os  # local import: needed for descriptor and temp file handling

    combined = shellout(r"unzip -p {input} \*.mrc > {output}", input=self.input().path)

    # there is a broken record inside!
    mmap = shellout("""marcmap {input} | LANG=C grep -v ^ebr10661760 \
                    | awk '{{print $2":"$3}}' > {output} """, input=combined)

    # prepare seekmap
    seekmap = []
    with open(mmap) as handle:
        for line in handle:
            offset, length = map(int, line.strip().split(':'))
            seekmap.append((offset, length))

    # create the filtered file
    fd, tmp = tempfile.mkstemp(prefix='gluish-')
    try:
        # mkstemp returns an already open descriptor; wrap it so that it is
        # closed instead of leaked. The data is copied by byte offsets, so
        # both sides are opened in binary mode.
        with os.fdopen(fd, 'wb') as output:
            with open(combined, 'rb') as handle:
                copyregions(handle, output, seekmap)
        converted = shellout("""yaz-marcdump -f marc8s -t utf8 -o marc -l 9=97 {input} > {output}""", input=tmp)
    finally:
        # The filtered intermediate file is not needed after conversion.
        os.remove(tmp)
    luigi.File(converted).move(self.output().path)