from os.path import splitext  # used by master_get below

# Pipeline and the step classes (S3Download, MergeSort, ...) are assumed to be
# provided by this project's pipeline library; its import is omitted here as
# in the original listing.


def kway_merge(type, name, bucket, index, tag, input, output):
    # Lambda-side phase 2 of the sort: merge the pre-sorted pieces produced by
    # quick_sort into one sorted NDJSON object on S3. The `type` and `input`
    # parameters shadow builtins but mirror the Lambda payload fields.
    pipeline = Pipeline(
        name=name,
        steps=[
            Deserialize(),
            # The input item is a whitespace-separated list of S3 keys; fan it
            # out into one work item per key.
            OneToMany(transform=lambda item: item.split()),
            MergeSort(
                piecesize=16 * 1024 * 1024,
                key=lambda row: row.key,
                steps=lambda item: [
                    S3Download(),
                    NDJsonIndex(extract=lambda row: int(row[tag])),
                ]),
            MinMax(key='sorting:markers'),
            NDJsonFlush(),
            S3Upload(
                bucket=bucket,
                key=lambda metadata: f'{output}.out/{index:04}',
                chunksize=128 * 1024 * 1024),
            # Embed the min/max markers in the final key so later stages can
            # query the key range without reading the object.
            S3Rename(
                key=lambda metadata:
                    f'{output}.out/{index:04}'
                    f'?{metadata.get("sorting:markers").queryable()}'),
            Serialize(),
        ])
    return pipeline.start(input)
def worker_ftp(name, host, directory, bucket, input, output):
    # ECS worker: download one dump file from the FTP mirror and stream it
    # into S3 in 128 MiB multipart chunks.
    pipeline = Pipeline(
        name=name,
        steps=[
            FtpDownload(host=host, directory=directory),
            S3Upload(bucket=bucket, key=output, chunksize=128 * 1024 * 1024),
        ])
    pipeline.start(input=input)
def driver(cluster, task, securityGroup, vpcSubnet):
    # Entry point: launch a single ECS task with TYPE=master, which in turn
    # fans out the per-file master and worker tasks.
    pipeline = Pipeline(
        name='driver',
        steps=[
            EcsTask(
                cluster=cluster,
                task=task,
                securityGroup=securityGroup,
                vpcSubnet=vpcSubnet,
                environment=lambda value: [
                    {'name': 'TYPE', 'value': 'master'},
                ]),
        ])
    pipeline.start(input=None)
def worker_json(name, rowtag, bucket, input, output):
    # ECS worker: convert one gzipped XML dump on S3 to NDJSON, emitting one
    # JSON object per <rowtag> element.
    pipeline = Pipeline(
        name=name,
        steps=[
            S3Download(),
            Ungzip(),
            XmlToJson(rowtag=rowtag),
            S3Upload(bucket=bucket, key=output, chunksize=128 * 1024 * 1024),
        ])
    pipeline.start(input=S3Object(bucket=bucket, key=input))
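# `split_name` is referenced by master_sort and master_get below but is not
# defined in this listing. Judging from how it is used (deriving a flat S3
# object name from a dump file entry), a minimal sketch might be the
# following; the exact behavior is an assumption.
def split_name(value):
    # Hypothetical helper: keep only the basename of the file path, e.g.
    # 'enwiki/latest/pages.xml.gz' -> 'pages.xml.gz'.
    return value.rsplit('/', 1)[-1]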
def master_sort(filename, tag, bucket, cluster, task, securityGroup, vpcSubnet):
    # Master step: if the sorted output does not exist yet, launch a
    # worker-sort ECS task for this file.
    pipeline = Pipeline(
        name=filename,
        steps=[
            Conditional(
                # inverse=True: run the steps only when the key does NOT exist.
                inverse=True,
                condition=S3KeyExists(
                    bucket=bucket,
                    key=lambda value: f'sort/{split_name(value)}'),
                steps=[
                    EcsTask(
                        cluster=cluster,
                        task=task,
                        securityGroup=securityGroup,
                        vpcSubnet=vpcSubnet,
                        environment=lambda value: [
                            {'name': 'TYPE', 'value': 'worker-sort'},
                            {'name': 'NAME', 'value': value},
                            {'name': 'TAG', 'value': tag},
                            {'name': 'BUCKET', 'value': bucket},
                            {'name': 'INPUT', 'value': f'json/{split_name(value)}'},
                            {'name': 'OUTPUT', 'value': f'sort/{split_name(value)}'},
                        ]),
                ]),
        ])
    pipeline.start(input=filename)
def quick_sort(type, name, bucket, index, tag, input, output):
    # Lambda-side phase 1 of the sort: pull one chunk of NDJSON from S3,
    # sort it in memory, and write it back with key-range markers embedded
    # in the object name.
    pipeline = Pipeline(
        name=name,
        steps=[
            Deserialize(),
            # Measure the NDJSON input (streamed via its own S3Download)
            # before processing it for real.
            NDJsonMeasure(steps=lambda: [S3Download()]),
            S3Download(),
            NDJsonChunk(chunksize=1024 * 1024),
            NDJsonIndex(extract=lambda row: int(row[tag])),
            QuickSort(key=lambda row: row.key),
            # Record 16 key markers so the merge phase can group pieces by
            # key range.
            DataMarker(key='sorting:markers', count=16),
            NDJsonFlush(),
            S3Upload(
                bucket=bucket,
                key=lambda metadata:
                    f'{output}.tmp/{index:04}'
                    f'?{metadata.get("sorting:markers").queryable()}',
                chunksize=128 * 1024 * 1024),
            Serialize(),
        ])
    return pipeline.start(input)
def worker_sort(name, tag, bucket, input, output):
    # ECS worker: orchestrate the two Lambda phases. Split the input into
    # 512 MiB chunks, quick-sort each chunk in parallel, group the sorted
    # pieces by key range, then k-way merge each group in parallel.
    pipeline = Pipeline(
        name=name,
        steps=[
            S3Chunk(chunksize=512 * 1024 * 1024),
            ForEachItemParallel(threads=16, steps=lambda index, metadata: [
                Serialize(),
                Lambda('wikipedia-run', lambda item: {
                    "type": "quick-sort",
                    "name": f'{name}-{index}',
                    "bucket": bucket,
                    "index": index,
                    "tag": tag,
                    "input": item,
                    "output": output,
                }),
                OneToMany(),
                Deserialize(),
            ]),
            MergeGroup(),
            ForEachItemParallel(threads=32, steps=lambda index, metadata: [
                Serialize(),
                Lambda('wikipedia-run', lambda item: {
                    "type": "kway-merge",
                    "name": f'{name}-{index}',
                    "bucket": bucket,
                    "index": index,
                    "tag": tag,
                    "input": item,
                    "output": output,
                }),
                OneToMany(),
                Deserialize(),
            ]),
            # Cleanup of the intermediate .tmp objects, currently disabled:
            # Singleton(value=S3Prefix(bucket=bucket, prefix=f'{output}.tmp/')),
            # S3List(),
            # S3Delete(),
            WaitAll(),
            DictDebug(),
        ])
    pipeline.start(input=S3Object(bucket=bucket, key=input))
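# The 'wikipedia-run' Lambda invoked above is not shown in this listing.
# Given the payloads worker_sort builds, its handler plausibly dispatches on
# the "type" field to quick_sort or kway_merge; a minimal sketch (the handler
# name and wiring are assumptions):
def lambda_handler(event, context):
    # Hypothetical dispatcher. The payload fields match the two function
    # signatures exactly (including `type`), so the event can be splatted
    # straight through as keyword arguments.
    handlers = {'quick-sort': quick_sort, 'kway-merge': kway_merge}
    return handlers[event['type']](**event)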
def master_get(filename, rowtag, bucket, cluster, task, securityGroup,
               vpcSubnet, ftpQueue, jsonQueue):
    # Master step: for one dump file, (1) fetch it from FTP into raw/ if it
    # is missing, then (2) convert it to NDJSON under json/ if that is
    # missing. Token queues throttle how many FTP and JSON workers run at
    # once.
    pipeline = Pipeline(
        name=filename,
        steps=[
            Conditional(
                inverse=True,
                condition=S3KeyExists(
                    bucket=bucket,
                    key=lambda value: f'raw/{split_name(value)}'),
                steps=[
                    AcquireToken(queue=ftpQueue),
                    EcsTask(
                        cluster=cluster,
                        task=task,
                        securityGroup=securityGroup,
                        vpcSubnet=vpcSubnet,
                        environment=lambda token: [
                            {'name': 'TYPE', 'value': 'worker-ftp'},
                            {'name': 'NAME', 'value': token.value},
                            {'name': 'BUCKET', 'value': bucket},
                            {'name': 'INPUT', 'value': token.value},
                            {'name': 'OUTPUT', 'value': f'raw/{split_name(token.value)}'},
                            {'name': 'HOST', 'value': token.item['Host']},
                            {'name': 'DIRECTORY', 'value': token.item['Directory']},
                        ]),
                    ReleaseToken(queue=ftpQueue),
                ]),
            Conditional(
                inverse=True,
                condition=S3KeyExists(
                    bucket=bucket,
                    # Strip the double extension (e.g. .xml.gz) to build the
                    # .json key.
                    key=lambda value: f'json/{split_name(splitext(splitext(value)[0])[0])}.json'),
                steps=[
                    AcquireToken(queue=jsonQueue),
                    EcsTask(
                        cluster=cluster,
                        task=task,
                        securityGroup=securityGroup,
                        vpcSubnet=vpcSubnet,
                        environment=lambda token: [
                            {'name': 'TYPE', 'value': 'worker-json'},
                            {'name': 'NAME', 'value': token.value},
                            {'name': 'ROWTAG', 'value': rowtag},
                            {'name': 'BUCKET', 'value': bucket},
                            {'name': 'INPUT', 'value': f'raw/{split_name(token.value)}'},
                            {'name': 'OUTPUT', 'value': f'json/{split_name(splitext(splitext(token.value)[0])[0])}.json'},
                        ]),
                    ReleaseToken(queue=jsonQueue),
                ]),
        ])
    pipeline.start(input=filename)
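# The container image behind the EcsTask steps needs an entrypoint that
# dispatches on the TYPE environment variable set above; it is not part of
# this listing. A minimal sketch, assuming only the variable names visible in
# the EcsTask steps:
import os

def main():
    kind = os.environ['TYPE']
    if kind == 'worker-ftp':
        worker_ftp(name=os.environ['NAME'], host=os.environ['HOST'],
                   directory=os.environ['DIRECTORY'], bucket=os.environ['BUCKET'],
                   input=os.environ['INPUT'], output=os.environ['OUTPUT'])
    elif kind == 'worker-json':
        worker_json(name=os.environ['NAME'], rowtag=os.environ['ROWTAG'],
                    bucket=os.environ['BUCKET'], input=os.environ['INPUT'],
                    output=os.environ['OUTPUT'])
    elif kind == 'worker-sort':
        worker_sort(name=os.environ['NAME'], tag=os.environ['TAG'],
                    bucket=os.environ['BUCKET'], input=os.environ['INPUT'],
                    output=os.environ['OUTPUT'])
    # TYPE == 'master' would drive the master_get/master_sort flow; its
    # wiring (file list, token queues) is not shown in this listing.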