Example #1
def aggregate_collection(
    request: HttpRequest,
    collection_id: int,
) -> HttpResponse:
    """
    Value count computations could be also moved into a celery task that
    would prepare the answer for the user and bring it to him later
    (via email or on page with results).
    """
    collection = get_object_or_404(StarWarsCollection, id=collection_id)
    table = etl.fromcsv(collection.filepath)
    aggregate_keys, parameters_settings = parse_parameters(
        request.GET.get('current_parameters', '0000001001'),
    )
    if len(aggregate_keys) == 1:
        # aggregate() does not work correctly if a one-element list is passed
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:  # show no table if every option is disabled
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    return render(
        request,
        'main/collection_aggregate.html',
        {
            'collection': collection,
            'parameters_settings': parameters_settings,
            'headers': etl.header(table),
            'data': etl.data(table),
        },
    )
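As the docstring suggests, the value counts could be computed in a Celery task instead of the view. A minimal sketch of that idea, assuming a configured Celery app and a hypothetical notify_user helper (neither is part of the original project):

import petl as etl
from celery import shared_task


@shared_task
def aggregate_collection_task(filepath, aggregate_keys, user_email):
    # the heavy value-count computation runs outside the request/response cycle
    table = etl.fromcsv(filepath)
    if len(aggregate_keys) == 1:
        # same single-key workaround as in the view
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    # hypothetical helper: e-mail the result or store it for a results page
    notify_user(user_email, header=etl.header(table), rows=list(etl.data(table)))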
Example #2
def test_empty():
    actual = (
        etl.empty()
        .addcolumn('foo', ['a', 'b', 'c'])
        .addcolumn('bar', [1, 2, 2])
    )
    expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    ieq(expect, actual)
    # a second pass verifies the table can be iterated more than once
    ieq(expect, actual)
Example #3
def save_characters_to_file(generated_file_path, characters_pages):
    etl.setheader(
        etl.empty(),
        settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS,
    ).tocsv(generated_file_path)
    logger.info('Created file: %s', generated_file_path)
    for characters_page in characters_pages:
        etl.appendcsv(
            characters_page,
            generated_file_path,
            write_header=False,
        )
        logger.info('Added data to file: %s', generated_file_path)
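A minimal usage sketch, assuming settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS is ('name', 'height') and each page is a petl table with the same header (the field names here are illustrative only):

import petl as etl

# illustrative pages; the real header fields come from
# settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS
pages = [
    etl.wrap([('name', 'height'), ('Luke Skywalker', '172')]),
    etl.wrap([('name', 'height'), ('C-3PO', '167')]),
]
save_characters_to_file('characters.csv', pages)
# characters.csv: header row first, then the rows of each page appended in order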
Example #4
# records()
###############

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
d = etl.records(table)
d
list(d)

# rowgroupby()
##############

import petl as etl
table1 = [['foo', 'bar', 'baz'], ['a', 1, True], ['b', 3, True], ['b', 2]]
# group entire rows
for key, group in etl.rowgroupby(table1, 'foo'):
    print(key, list(group))

# group specific values
for key, group in etl.rowgroupby(table1, 'foo', 'bar'):
    print(key, list(group))

# empty()
#########

import petl as etl
table = (etl.empty().addcolumn('foo', ['A', 'B']).addcolumn('bar', [1, 2]))
table
Example #5
import itertools
import os

import allel
import h5py
import pandas
import petl as etl
import pyfasta
import zarr


def init(release_dir, load_geneset=False, geneset_attributes=None):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : bool
        If True, load the geneset into memory.
    geneset_attributes : dict-like
        Attributes to load.

    """

    # reference sequence
    ####################

    global genome_agamp3, genome_agamp4, genome_dir
    genome_dir = os.path.join(release_dir, 'genome')
    genome_agamp3_dir = os.path.join(genome_dir, 'agamP3')
    genome_agamp3_fn = os.path.join(
        genome_agamp3_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_agamp3_fn):
        genome_agamp3 = pyfasta.Fasta(genome_agamp3_fn,
                                      key_fn=lambda v: v.split()[0])
    genome_agamp4_dir = os.path.join(genome_dir, 'agamP4')
    genome_agamp4_fn = os.path.join(
        genome_agamp4_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa')
    if os.path.exists(genome_agamp4_fn):
        genome_agamp4 = pyfasta.Fasta(genome_agamp4_fn,
                                      key_fn=lambda v: v.split()[0])

    # genome annotations
    ####################

    global geneset_agamp44_fn, geneset_agamp44, geneset_dir
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp44_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.sorted.gff3.gz')
    if load_geneset:
        geneset_agamp44 = allel.FeatureTable.from_gff3(
            geneset_agamp44_fn, attributes=geneset_attributes)

    # variant callsets
    ##################

    global callset, callset_pass, callset_pass_biallelic, variation_dir, \
        callset_snpeff_agamp42
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'all',
                                 'ag1000g.phase2.ar1.h5')
    callset_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'lite',
                                      'ag1000g.phase2.ar1.lite.h5')
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'all',
                                   'ag1000g.phase2.ar1')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')
    elif os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')
    elif os.path.exists(callset_lite_h5_fn):
        callset = h5py.File(callset_lite_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'pass',
                                      'ag1000g.phase2.ar1.pass.h5')
    callset_pass_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                           'lite',
                                           'ag1000g.phase2.ar1.pass.lite.h5')
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'pass',
                                        'ag1000g.phase2.ar1.pass')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')
    elif os.path.exists(callset_pass_lite_h5_fn):
        callset_pass = h5py.File(callset_pass_lite_h5_fn, mode='r')

    # main callset, PASS biallelic variants only
    callset_pass_biallelic_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic.h5')
    callset_pass_biallelic_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.biallelic.lite.h5')
    callset_pass_biallelic_zarr_fn = os.path.join(
        variation_dir, 'main', 'zarr', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_biallelic_zarr_fn):
        callset_pass_biallelic = zarr.open_group(
            callset_pass_biallelic_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_biallelic_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_h5_fn,
                                           mode='r')
    elif os.path.exists(callset_pass_biallelic_lite_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_lite_h5_fn,
                                           mode='r')

    # SNPEFF annotations
    callset_snpeff_agamp42_h5_fn_template = os.path.join(
        variation_dir, 'main', 'hdf5', 'all_snpeff',
        'ag1000g.phase2.ar1.snpeff.AgamP4.2.{chrom}.h5')
    # work around broken link file
    callset_snpeff_agamp42 = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_snpeff_agamp42_h5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_snpeff_agamp42[chrom] = h5py.File(fn, mode='r')[chrom]

    # accessibility
    ###############

    global accessibility, accessibility_dir
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global tbl_samples, lkp_samples, sample_ids, df_samples, samples_dir
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.meta.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (
            etl
            .fromtsv(samples_fn)
            .convert(('year', 'n_sequences'), int)
            .convert(('mean_coverage',), float)
        )
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t', index_col='ox_code')

    # extras
    ########

    global allele_counts
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')

    # haplotypes
    ############

    global haplotypes_dir, callset_phased, tbl_haplotypes, df_haplotypes, lkp_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # no HDF5 link file, load up as dict for now
    callset_phased_hdf5_fn_template = os.path.join(
        haplotypes_dir, 'main', 'hdf5',
        'ag1000g.phase2.ar1.haplotypes.{chrom}.h5')
    callset_phased = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_phased_hdf5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_phased[chrom] = h5py.File(fn, mode='r')[chrom]

    # no haplotypes file, create here for now
    # TODO source this from file Nick has created
    if '3R' in callset_phased:
        phased_samples = callset_phased['3R']['samples'][:].astype('U')
        haplotype_labels = list(
            itertools.chain(*[[s + 'a', s + 'b'] for s in phased_samples]))
        tbl_haplotypes = (
            etl
            .empty()
            .addcolumn('label', haplotype_labels)
            .addrownumbers(start=0)
            .rename('row', 'index')
            .addfield('ox_code', lambda row: row.label[:-1])
            .hashleftjoin(tbl_samples, key='ox_code')
            .addfield('label_aug',
                      lambda row: '%s [%s, %s, %s, %s]' % (
                          row.label, row.country, row.location,
                          row.m_s, row.sex))
        )
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = tbl_haplotypes.todataframe(index='index')
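The zarr > hdf5 > hdf5 (lite) preference above is spelled out three times. A small helper could factor out the fallback; this is a sketch, not part of the original module:

def _open_callset(zarr_fn, h5_fn, lite_h5_fn):
    # preference: zarr > hdf5 > hdf5 (lite); returns None if no file exists
    if os.path.exists(zarr_fn):
        return zarr.open_group(zarr_fn, mode='r')
    for fn in (h5_fn, lite_h5_fn):
        if os.path.exists(fn):
            return h5py.File(fn, mode='r')
    return None

Each callset block would then reduce to a single call, e.g. callset = _open_callset(callset_zarr_fn, callset_h5_fn, callset_lite_h5_fn).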
Example #6
import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
d = etl.records(table)
d
list(d)


# rowgroupby()
##############

import petl as etl

table1 = [["foo", "bar", "baz"], ["a", 1, True], ["b", 3, True], ["b", 2]]
# group entire rows
for key, group in etl.rowgroupby(table1, "foo"):
    print(key, list(group))

# group specific values
for key, group in etl.rowgroupby(table1, "foo", "bar"):
    print(key, list(group))


# empty()
#########

import petl as etl

table = etl.empty().addcolumn("foo", ["A", "B"]).addcolumn("bar", [1, 2])
table
Example #7
File: xml2csv.py  Project: hsinkai/crowa
import argparse
from string import Template

import petl
import yaml

parser = argparse.ArgumentParser(
    description='Parse an xml file to csv via an yaml config.')
parser.add_argument('-f', metavar='xml', help='path of input xml file')
parser.add_argument('-t', metavar='csv', help='path of output csv file')
parser.add_argument('-e',
                    action='store_true',
                    help='add quotes to header to fit SQL pattern')
parser.add_argument('config', help='path of config yaml file')
args = parser.parse_args()

info = yaml.safe_load(open(args.config))
xml_file = args.f or info['xml']
csv_file = args.t or info['csv']
table = petl.empty()

# substitute namespace placeholders into the keys definition
if 'namespace' in info:
    keys = eval(Template(str(info['keys'])).substitute(**info['namespace']))
else:
    keys = info['keys']

# collect data from each key
for key in keys:
    table = table.cat(petl.fromxml(xml_file, key['anchor'], key['select']))

if 'pks' in info:
    table = table.mergeduplicates(
        info['pks'] if len(info['pks']) > 1 else info['pks'][0])

if 'orderBy' in info:
    table = table.sort(info['orderBy'])

if 'skip' in info:
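The snippet is truncated before the 'skip' branch is used. For reference, this is roughly the dict that yaml.safe_load must produce for the script; the field names are inferred from the lookups above, and all values are illustrative only:

info = {
    'xml': 'input.xml',    # used when -f is not given on the command line
    'csv': 'output.csv',   # used when -t is not given
    'namespace': {'ns': '{http://example.com/schema}'},   # optional
    'keys': [
        # each entry feeds petl.fromxml(xml_file, key['anchor'], key['select'])
        {'anchor': '${ns}row', 'select': '${ns}cell'},
    ],
    'pks': ['id'],         # optional: merge rows duplicated on these keys
    'orderBy': 'id',       # optional: sort the combined table
}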
Example #8
def dicts2table(dicts):
    """transform dicts into a ``petl.util.base.Table``"""
    return petl.wrap(petl.fromdicts(dicts)) if dicts else petl.empty()
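A quick usage example of the guard behaviour:

dicts2table([])            # falsy input -> petl.empty(), a table with no rows
tbl = dicts2table([{'foo': 'a', 'bar': 1}, {'foo': 'b', 'bar': 2}])
tbl.nrows()                # -> 2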
Example #9
def _get_eval_modes(self):
    return etl.empty()
Example #10
def extract(self, task, job_config):
    return petl.empty()

def _get_artifacts(self):
    # TODO: double check that this should be empty
    # TODO: see if there is any point in keeping teacher adv
    return etl.empty()

def _get_tasks(self):
    return etl.empty()

def _get_items(self):
    return etl.empty()
Example #14
def _get_artifacts(self):
    return etl.empty()
Example #15
def _get_answers(self):
    comments = (
        etl
        .fromcsv(f'{self._dirc}/assessment_result.csv', delimiter=',')
        .listoflists()
    )
    # `comments` is loaded but never used; the method currently returns an empty table
    return etl.empty()