Example #1
def kway_merge(type, name, bucket, index, tag, input, output):
    """Merge pre-sorted NDJSON pieces from S3 into a single sorted output object (k-way merge)."""
    pipeline = Pipeline(
        name=name,
        steps=[
            Deserialize(),
            OneToMany(transform=lambda item: item.split()),
            MergeSort(piecesize=16 * 1024 * 1024,
                      key=lambda row: row.key,
                      steps=lambda item: [
                          S3Download(),
                          NDJsonIndex(extract=lambda row: int(row[tag])),
                      ]),
            MinMax(key='sorting:markers'),
            NDJsonFlush(),
            S3Upload(bucket=bucket,
                     key=lambda metadata: f'{output}.out/{index:04}',
                     chunksize=128 * 1024 * 1024),
            S3Rename(
                key=lambda metadata:
                f'{output}.out/{index:04}?{metadata.get("sorting:markers").queryable()}'
            ),
            Serialize(),
        ])

    return pipeline.start(input)
Example #2
def worker_ftp(name, host, directory, bucket, input, output):
    """Copy a single file from an FTP server into S3."""
    pipeline = Pipeline(name=name, steps=[
        FtpDownload(host=host, directory=directory),
        S3Upload(bucket=bucket, key=output, chunksize=128*1024*1024)
    ])

    pipeline.start(input=input)
Example #3
def driver(cluster, task, securityGroup, vpcSubnet):
    """Launch the master ECS task (TYPE=master)."""
    pipeline = Pipeline(name='driver', steps=[
        EcsTask(cluster=cluster, task=task, securityGroup=securityGroup, vpcSubnet=vpcSubnet, environment=lambda value: [
            { 'name': 'TYPE', 'value': 'master' }
        ])
    ])

    pipeline.start(input=None)
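
Example #3 only sets TYPE=master on the ECS task; the other examples in this listing set TYPE to worker-ftp, worker-json or worker-sort. The container entrypoint that reads those variables and dispatches to the functions shown here is not part of the listing. A minimal sketch, assuming the environment variable names used by the EcsTask steps map directly onto the worker functions' arguments (the main() wrapper itself is hypothetical):

import os

def main():
    # Hypothetical dispatch sketch; only the variable names come from the EcsTask
    # environments shown in these examples, the rest is an assumption.
    env = os.environ
    kind = env['TYPE']
    if kind == 'worker-ftp':
        worker_ftp(env['NAME'], env['HOST'], env['DIRECTORY'],
                   env['BUCKET'], env['INPUT'], env['OUTPUT'])
    elif kind == 'worker-json':
        worker_json(env['NAME'], env['ROWTAG'], env['BUCKET'],
                    env['INPUT'], env['OUTPUT'])
    elif kind == 'worker-sort':
        worker_sort(env['NAME'], env['TAG'], env['BUCKET'],
                    env['INPUT'], env['OUTPUT'])
    # A 'master' branch would also need the cluster/task/queue parameters,
    # which are not part of the environment shown in these examples.
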
Example #4
def worker_json(name, rowtag, bucket, input, output):
    """Convert a gzipped XML object in S3 to JSON and upload the result back to S3."""
    pipeline = Pipeline(name=name, steps=[
        S3Download(),
        Ungzip(),
        XmlToJson(rowtag=rowtag),
        S3Upload(bucket=bucket, key=output, chunksize=128*1024*1024)
    ])

    pipeline.start(input=S3Object(bucket=bucket, key=input))
Example #5
def master_sort(filename, tag, bucket, cluster, task, securityGroup, vpcSubnet):
    """Launch a worker-sort ECS task for the file unless its sorted output already exists in S3."""
    pipeline = Pipeline(name=filename, steps=[
        Conditional(
            inverse=True,
            condition=S3KeyExists(bucket=bucket, key=lambda value: f'sort/{split_name(value)}'),
            steps=[
                EcsTask(cluster=cluster, task=task, securityGroup=securityGroup, vpcSubnet=vpcSubnet, environment=lambda value: [
                    { 'name': 'TYPE', 'value': 'worker-sort' },
                    { 'name': 'NAME', 'value': value },
                    { 'name': 'TAG', 'value': tag },
                    { 'name': 'BUCKET', 'value': bucket },
                    { 'name': 'INPUT', 'value': f'json/{split_name(value)}' },
                    { 'name': 'OUTPUT', 'value': f'sort/{split_name(value)}' },
                ]),
            ]
        ), 
    ])

    pipeline.start(input=filename)
Example #6
def quick_sort(type, name, bucket, index, tag, input, output):
    """Quick-sort a single chunk in memory and upload it as an indexed temporary piece in S3."""
    pipeline = Pipeline(
        name=name,
        steps=[
            Deserialize(),
            NDJsonMeasure(steps=lambda: [S3Download()]),
            S3Download(),
            NDJsonChunk(chunksize=1024 * 1024),
            NDJsonIndex(extract=lambda row: int(row[tag])),
            QuickSort(key=lambda row: row.key),
            DataMarker(key='sorting:markers', count=16),
            NDJsonFlush(),
            S3Upload(
                bucket=bucket,
                key=lambda metadata:
                f'{output}.tmp/{index:04}?{metadata.get("sorting:markers").queryable()}',
                chunksize=128 * 1024 * 1024),
            Serialize(),
        ])

    return pipeline.start(input)
Example #7
def worker_sort(name, tag, bucket, input, output):
    """Split an S3 object into chunks, quick-sort them in parallel, then k-way merge the sorted pieces."""
    pipeline = Pipeline(name=name, steps=[
        # Split the input object into 512 MiB byte ranges.
        S3Chunk(chunksize=512*1024*1024),
        # Quick-sort each range in its own Lambda invocation (see Example #6).
        ForEachItemParallel(threads=16, steps=lambda index, metadata: [
            Serialize(),
            Lambda('wikipedia-run', lambda item: {
                "type": "quick-sort",
                "name": f'{name}-{index}',
                "bucket": bucket,
                "index": index,
                "tag": tag,
                "input": item,
                "output": output
            }),
            OneToMany(),
            Deserialize(),
        ]),
        # Group the sorted temporary pieces into batches for merging.
        MergeGroup(),
        # K-way merge each batch in its own Lambda invocation (see Example #1).
        ForEachItemParallel(threads=32, steps=lambda index, metadata: [
            Serialize(),
            Lambda('wikipedia-run', lambda item: {
                "type": "kway-merge",
                "name": f'{name}-{index}',
                "bucket": bucket,
                "index": index,
                "tag": tag,
                "input": item,
                "output": output
            }),
            OneToMany(),
            Deserialize(),
        ]),
        #Singleton(value=S3Prefix(bucket=bucket, prefix=f'{output}.tmp/')),
        #S3List(),
        #S3Delete(),
        WaitAll(),
        DictDebug(),
    ])

    pipeline.start(input=S3Object(bucket=bucket, key=input))
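
Example #7 fans out to a Lambda function named 'wikipedia-run' whose payload carries a "type" of either "quick-sort" or "kway-merge", and whose remaining keys match the parameters of quick_sort (Example #6) and kway_merge (Example #1). The handler itself is not part of this listing; a minimal sketch, assuming the payload keys map one-to-one onto those arguments:

def handler(event, context):
    # Hypothetical sketch of the 'wikipedia-run' Lambda handler; only the payload
    # shape is taken from worker_sort above, the handler itself is an assumption.
    args = (event['type'], event['name'], event['bucket'], event['index'],
            event['tag'], event['input'], event['output'])
    if event['type'] == 'quick-sort':
        return quick_sort(*args)
    if event['type'] == 'kway-merge':
        return kway_merge(*args)
    raise ValueError(f"unknown task type: {event['type']}")
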
Example #8
def master_get(filename, rowtag, bucket, cluster, task, securityGroup, vpcSubnet, ftpQueue, jsonQueue):
    """Fetch a dump over FTP and convert it to JSON, skipping any stage whose S3 output already exists."""
    pipeline = Pipeline(name=filename, steps=[
        # Stage 1: fetch the raw dump over FTP unless raw/<name> already exists in S3.
        Conditional(
            inverse=True,
            condition=S3KeyExists(bucket=bucket, key=lambda value: f'raw/{split_name(value)}'),
            steps=[
                AcquireToken(queue=ftpQueue),
                EcsTask(cluster=cluster, task=task, securityGroup=securityGroup, vpcSubnet=vpcSubnet, environment=lambda token: [
                    { 'name': 'TYPE', 'value': 'worker-ftp' },
                    { 'name': 'NAME', 'value': token.value },
                    { 'name': 'BUCKET', 'value': bucket },
                    { 'name': 'INPUT', 'value': token.value },
                    { 'name': 'OUTPUT', 'value': f'raw/{split_name(token.value)}' },
                    { 'name': 'HOST', 'value': token.item['Host'] },
                    { 'name': 'DIRECTORY', 'value': token.item['Directory'] },
                ]),
                ReleaseToken(queue=ftpQueue),
            ]
        ), 
        # Stage 2: convert the raw dump to JSON unless json/<name>.json already exists.
        Conditional(
            inverse=True,
            condition=S3KeyExists(bucket=bucket, key=lambda value: f'json/{split_name(splitext(splitext(value)[0])[0])}.json'),
            steps=[
                AcquireToken(queue=jsonQueue),
                EcsTask(cluster=cluster, task=task, securityGroup=securityGroup, vpcSubnet=vpcSubnet, environment=lambda token: [
                    { 'name': 'TYPE', 'value': 'worker-json' },
                    { 'name': 'NAME', 'value': token.value },
                    { 'name': 'ROWTAG', 'value': rowtag },
                    { 'name': 'BUCKET', 'value': bucket },
                    { 'name': 'INPUT', 'value': f'raw/{split_name(token.value)}' },
                    { 'name': 'OUTPUT', 'value': f'json/{split_name(splitext(splitext(token.value)[0])[0])}.json' },
                ]),
                ReleaseToken(queue=jsonQueue),
            ]
        )
    ])

    pipeline.start(input=filename)
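
Both master_sort (Example #5) and master_get (Example #8) build their S3 keys with a split_name helper that does not appear in this listing (master_get also relies on splitext from os.path). A minimal sketch of split_name, purely as an assumption, treating the incoming value as a POSIX-style path whose last component is the file name:

import posixpath

def split_name(value):
    # Hypothetical sketch; the real split_name is not shown in these examples.
    return posixpath.basename(value)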