def iter_tsv(input_stream, cols=None, encoding='utf-8'):
    """
    Lazily split every line of a tab-separated stream into its fields.

    Each line read from ``input_stream`` is decoded with ``encoding``,
    stripped of trailing newline characters and split on tabs.

    Without ``cols`` every line is yielded as a plain tuple. If a tuple is
    given in ``cols``, its elements are used as field names of a namedtuple
    called ``Record``. Columns named ``x``, ``X``, ``0`` or ``None`` are
    treated as ignored: they get a throwaway name produced by
    ``random_string(length=5)``.

    Example (ignore the first four columns of a five column TSV):

    ::

        def run(self):
            with self.input().open() as handle:
                for row in handle.iter_tsv(cols=('X', 'X', 'X', 'X', 'iln')):
                    print(row.iln)
    """
    if not cols:
        # No column names: plain tuples are enough.
        for raw in input_stream:
            yield tuple(raw.decode(encoding).rstrip('\n').split('\t'))
        return

    # Swap every "ignore" marker for a generated field name.
    names = []
    for name in cols:
        if name in ('x', 'X', 0, None):
            name = random_string(length=5)
        names.append(name)

    Record = collections.namedtuple('Record', names)
    for raw in input_stream:
        fields = raw.decode(encoding).rstrip('\n').split('\t')
        yield Record._make(fields)
def random_tmp_path(prefix='gluish'):
    """
    Build a random path below the system's temporary directory.

    Only the path string is computed; nothing gets touched or created. The
    basename has the form ``<prefix>-<random_string()>``.

    Deprecated (a ``DeprecationWarning`` is issued). Just use:

        tempfile.mktemp(prefix='gluish-')

    instead.
    """
    warnings.warn("deprecated", DeprecationWarning)
    tmp_root = tempfile.gettempdir()
    basename = '%s-%s' % (prefix, random_string())
    return os.path.join(tmp_root, basename)
def test_create_dir(self):
    """
    Building a ``Directory`` task creates the target directory; a fresh task
    for the same path then reports complete and building it again leaves
    the directory in place.
    """
    target = os.path.join(tempfile.gettempdir(), random_string())
    task = Directory(path=target)
    luigi.build([task], local_scheduler=True)
    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(task.output().path, target)
    self.assertTrue(os.path.isdir(task.output().path))

    # task must be idempotent
    task = Directory(path=target)
    self.assertTrue(task.complete())
    luigi.build([task], local_scheduler=True)
    self.assertEqual(task.output().path, target)
    self.assertTrue(os.path.isdir(task.output().path))
class DailyIndex(FrontpageTask, luigi.WrapperTask):
    """
    Wrapper around the per-newspaper index downloads, so they can be
    parallelized.
    """
    date = luigi.DateParameter(default=daily())
    # The default is computed once, when the class body is executed.
    # NOTE(review): presumably this gives each process a fresh parameter
    # value so the wrapper is scheduled again -- confirm.
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ Lazily produce one IndexPage per URL in NEWSPAPERS. """
        return (IndexPage(url=address, date=self.date)
                for address in NEWSPAPERS)

    def output(self):
        """ A wrapper has no output of its own: hand back the inputs. """
        return self.input()
class FTPMirror(CommonTask):
    """
    A generic FTP directory sync. Requires lftp (http://lftp.yar.ru/).

    The output of this task is a single file, that contains the paths
    to all the mirrored files.
    """
    host = luigi.Parameter()
    username = luigi.Parameter(default='anonymous')
    password = luigi.Parameter(default='')
    pattern = luigi.Parameter(default='*', description="e.g. '*leip_*.zip'")
    base = luigi.Parameter(default='.')
    # Default is evaluated once, when the class body is executed.
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ The lftp executable must be available. """
        return Executable(name='lftp', message='http://lftp.yar.ru/')

    def run(self):
        """
        Mirror the remote directory and write the list of mirrored files.

        The indicator is always recreated, while the subdir for a given
        (host, username, base, pattern) is just synced: its name is the
        SHA-1 of those four values, so the same directory is reused.
        """
        base = os.path.dirname(self.output().path)
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host, username=self.username, base=self.base,
            pattern=self.pattern)).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        # NOTE(review): the password is passed on the lftp command line.
        command = """lftp -u {username},{password} -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0 --only-newer -I {pattern} {base} {target}; exit" {host}"""

        # Every interpolated value is shell-quoted. The host used to be
        # the only one passed through verbatim.
        shellout(command,
                 host=pipes.quote(self.host),
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s", path)
                output.write_tsv(path)

    def output(self):
        """ A TSV file listing the mirrored paths. """
        return luigi.LocalTarget(path=self.path(digest=True), format=TSV)
class FTPFileCopyTaskWithWrongUsername(TestTask):
    """
    Move the file provided by an FTPFile requirement to this task's own
    output path.

    The indicator default comes from ``random_string()`` and is meant to
    make this task run on each test run.
    """
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ The remote PDF, requested through FTPFile. """
        remote = dict(host='ftp.cs.brown.edu',
                      username='******',
                      password='******',
                      filepath='/pub/techreports/00/cs00-07.pdf')
        return FTPFile(**remote)

    def run(self):
        """ Relocate the input to the output path. """
        source = self.input()
        source.move(self.output().path)

    def output(self):
        """ Local target with a ``pdf`` extension. """
        location = self.path(ext='pdf')
        return luigi.LocalTarget(path=location)
class MirrorTask(TestTask):
    """
    Move the output of an FTPMirror requirement to this task's own output
    path.

    The indicator default comes from ``random_string()`` and is meant to
    make this task run on each test run.
    """
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ Mirror of the matching PDFs below /pub/techreports/00. """
        mirror = dict(host='ftp.cs.brown.edu',
                      username='******',
                      password='******',
                      pattern='*02*pdf',
                      base='/pub/techreports/00')
        return FTPMirror(**mirror)

    def run(self):
        """ Relocate the input to the output path. """
        source = self.input()
        source.move(self.output().path)

    def output(self):
        """ Local target at this task's default path. """
        location = self.path()
        return luigi.LocalTarget(path=location)
def run(self):
    """
    Split ``self.filename`` into pieces with ``split -l`` and write the
    paths of the resulting files to the output, one per row.

    The number of lines per piece is ``(line_count + chunks) / chunks``
    truncated to an int, which is at least 1 for a positive chunk count.
    """
    # Count lines inside a context manager so the file handle is closed
    # instead of being left to the garbage collector.
    with open(self.filename) as handle:
        line_count = sum(1 for _ in handle)
    lines = int((line_count + self.chunks) / self.chunks)

    taskdir = os.path.dirname(self.output().fn)
    if not os.path.exists(taskdir):
        os.makedirs(taskdir)

    # A random prefix lets us tell this run's pieces apart from anything
    # else that already lives in the task directory.
    prefix = random_string()
    shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
             taskdir=taskdir, lines=lines, input=self.filename,
             prefix=prefix)

    with self.output().open('w') as output:
        for path in sorted(iterfiles(taskdir)):
            if os.path.basename(path).startswith(prefix):
                output.write_tsv(path)
def run(self):
    """
    Split the input into files of ``self.lines`` lines each, move the
    pieces into the task directory and write their final paths to the
    output, one per row.

    ``split`` runs inside the system temp directory; the pieces are
    recognized afterwards by a random filename prefix.
    """
    prefix = "{0}-".format(random_string())
    tmpdir = tempfile.gettempdir()

    # The return value of shellout is not needed here; the previous
    # assignment to ``output`` was dead and shadowed by the ``with`` below.
    shellout(
        "cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
        lines=self.lines,
        tmp=tmpdir,
        input=self.input().path,
        prefix=prefix,
    )

    target = self.taskdir()
    if not os.path.exists(target):
        os.makedirs(target)

    with self.output().open("w") as output:
        for path in iterfiles(tmpdir):
            filename = os.path.basename(path)
            if filename.startswith(prefix):
                dst = os.path.join(target, filename)
                shutil.move(path, dst)
                output.write_tsv(dst)
def test_random_string(self):
    """ Test random string length: 16 by default, else as requested. """
    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(16, len(random_string()))
    self.assertEqual(10, len(random_string(length=10)))
def _download_with_retries(download, url, filename, max_retries, delay):
    """
    Call ``download`` for ``url`` until it succeeds or ``max_retries``
    attempts have failed with a RuntimeError.

    After a success, sleep for ``delay`` seconds. After a failure, remove
    the (possibly partial) ``filename`` and try again. Raises RuntimeError
    once all attempts are used up.
    """
    for attempt in range(1, max_retries + 1):
        try:
            download(url=url, filename=filename, timeout=30)
        except RuntimeError:
            logger.info("Retry #%s on %s", attempt, url)
            unlink(filename)
        else:
            time.sleep(delay)
            return
    raise RuntimeError("Max retries (%s) exceeded: %s" % (max_retries, url))


def oai_harvest(url=None, collection=None, begin=None, end=None,
                prefix='oai_dc', verb='ListRecords', max_retries=8,
                directory=None, ext='xml', download=download, delay=0):
    """
    Harvest OAI for `url`. Will download all files into `directory`.
    Optionally add a delay between requests. Every response is stored under
    a random filename with extension `ext`.

    argument    OAI name
    --------------------
    begin       from
    collection  set
    end         until
    prefix      metadataPrefix
    verb        verb

    Raises RuntimeError if `url` or `directory` is missing, if `directory`
    does not exist, or if a request still fails after `max_retries`
    attempts.
    """
    if url is None:
        raise RuntimeError('A URL must be given.')
    if directory is None:
        raise RuntimeError('A directory must be given.')
    if not os.path.exists(directory):
        raise RuntimeError('Directory does not exist: %s' % directory)

    params = {'from': begin, 'until': end,
              'metadataPrefix': prefix,
              'set': collection,
              'verb': verb}
    # Only send the arguments that were actually supplied.
    params = {k: v for k, v in params.items() if v}

    # first request with all params
    full_url = '%s?%s' % (url, urllib.urlencode(params))
    path = os.path.join(directory, '%s.%s' % (random_string(length=16), ext))
    _download_with_retries(download, full_url, path, max_retries, delay)

    # any subsequent request uses 'resumptiontoken'
    while True:
        with open(path) as handle:
            soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
        token = soup.find('resumptiontoken')
        # OAI-PMH marks the last page of a list with an *empty*
        # resumptionToken element, so an empty token ends the harvest just
        # like a missing one.
        if token is None or not token.text.strip():
            break

        # subsequent requests are done with resumptiontoken only ...
        params = {'resumptionToken': token.text, 'verb': verb}
        full_url = '%s?%s' % (url, urllib.urlencode(params))
        path = os.path.join(directory, "%s.%s" % (
            random_string(length=16), ext))
        # Same timeout and retry policy as the first request.
        _download_with_retries(download, full_url, path, max_retries, delay)