Example #1
def filter_by_json(identifiers, indices, json_file, invert):
    """Filter included set using json file."""
    data = file_io.load_yaml(json_file)
    id_set = set(data['identifiers'])
    if not invert:
        indices = [i for i in indices if identifiers[i] in id_set]
    else:
        indices = [i for i in indices if identifiers[i] not in id_set]
    return indices
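
A minimal, hypothetical call (the file name and contents are assumptions; the JSON layout with a top-level "identifiers" list is inferred from the data['identifiers'] access above):

identifiers = ['contig_1', 'contig_2', 'contig_3']
indices = [0, 1, 2]
# keep.json is assumed to contain: {"identifiers": ["contig_1", "contig_3"]}
kept = filter_by_json(identifiers, indices, 'keep.json', invert=False)
# kept == [0, 2]; with invert=True the result would be [1]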
Example #2
File: fetch.py  Project: epaule/blobtools2
def fetch_metadata(path_to_dataset, **kwargs):
    """
    Load Metadata from file.

    fetch_metadata('tests/files/dataset')
    """
    dataset_id = path_to_dataset.split("/").pop()
    new_meta = {}
    meta = None
    if not os.path.exists(path_to_dataset):
        os.makedirs(path_to_dataset)
    if kwargs.get("--meta"):
        new_meta = file_io.load_yaml(kwargs["--meta"])
        if (kwargs["--bed"] or kwargs["--fasta"]) and kwargs["--replace"]:
            files = glob.glob("%s/*" % kwargs["DIRECTORY"])
            for file in files:
                os.remove(file)
    try:
        meta = kwargs["meta"]
    except KeyError:
        try:
            meta = file_io.load_yaml("%s/meta.json" % path_to_dataset)
        except ValueError:
            pass
    if meta is None:
        meta = {}
    if "id" not in meta:
        meta["id"] = dataset_id
        meta["name"] = dataset_id
    for key, value in new_meta.items():
        if isinstance(value, dict):
            try:
                meta[key].update({k: v for k, v in value.items()})
            except KeyError:
                meta[key] = value
        elif isinstance(value, list):
            meta[key] += value
        else:
            meta[key] = value
    return Metadata(dataset_id, **meta)
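
The merge loop at the end folds new_meta into meta: nested dicts are updated key by key, lists are extended, and scalar values are overwritten. A standalone illustration of those rules (example data only):

meta = {'id': 'ds1', 'taxon': {'name': 'Genus species'}, 'fields': [1]}
new_meta = {'taxon': {'taxid': 9606}, 'fields': [2], 'name': 'renamed'}
# after the loop:
# meta == {'id': 'ds1', 'taxon': {'name': 'Genus species', 'taxid': 9606},
#          'fields': [1, 2], 'name': 'renamed'}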
Example #3
File: cov.py  Project: abdo3a/blobtools2
def parse_json_cov(json_file, **kwargs):
    """Parse coverage from JSON cov file."""
    parts = json_file.split('=')
    base_name = parts[1]
    data = load_yaml(parts[0])
    covs = []
    if 'values' in data:
        for value in data['values']:
            covs.append(float("%.4f" % value))
    if base_name.endswith('_read_cov'):
        type = 'read_cov'
        parent = 'read_coverage'
        datatype = 'float'
        clamp = 1
    elif base_name.endswith('_cov'):
        type = 'cov'
        parent = 'base_coverage'
        datatype = 'integer'
        clamp = 0.01
    else:
        return None
    field_id = base_name
    fields = {}
    fields["%s_id" % type] = field_id
    fields["%s_range" % type] = [
        min(covs + [kwargs["%s_range" % type][0]]),
        max(covs + [kwargs["%s_range" % type][1]])
    ]
    if kwargs['meta'].has_field(field_id):
        file_name = kwargs['meta'].field_meta(field_id)['file']
    else:
        file_name = json_file
    fields[type] = Variable(field_id,
                            values=covs,
                            meta={'field_id': field_id, 'file': file_name},
                            parents=[
                                'children',
                                {
                                    'id': parent,
                                    'datatype': 'integer',
                                    'clamp': clamp if fields["%s_range" % type][0] == 0 else False,
                                    'range': fields["%s_range" % type]
                                },
                                'children'
                            ])
    return fields
Example #4
File: cov.py  Project: epaule/blobtools2
def parse_json_cov(json_file, **kwargs):
    """Parse coverage from JSON cov file."""
    parts = json_file.split("=")
    base_name = parts[1]
    data = load_yaml(parts[0])
    covs = []
    if "values" in data:
        for value in data["values"]:
            covs.append(float("%.4f" % value))
    if base_name.endswith("_read_cov"):
        type = "read_cov"
        parent = "read_coverage"
        datatype = "float"
        clamp = 1
    elif base_name.endswith("_cov"):
        type = "cov"
        parent = "base_coverage"
        datatype = "integer"
        clamp = 0.01
    else:
        return None
    field_id = base_name
    fields = {}
    fields["%s_id" % type] = field_id
    fields["%s_range" % type] = [
        min(covs + [kwargs["%s_range" % type][0]]),
        max(covs + [kwargs["%s_range" % type][1]]),
    ]
    if kwargs["meta"].has_field(field_id):
        file_name = kwargs["meta"].field_meta(field_id)["file"]
    else:
        file_name = json_file
    fields[type] = Variable(
        field_id,
        values=covs,
        meta={
            "field_id": field_id,
            "file": file_name
        },
        parents=[
            "children",
            {
                "id": parent,
                "datatype": "integer",
                "clamp": clamp if fields["%s_range" % type][0] == 0 else False,
                "range": fields["%s_range" % type],
            },
            "children",
        ],
    )
    return fields
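
In both versions the json_file argument packs the data path and the field name into a single 'path=name' string. A hedged sketch of a call (the file name, the starting range and the meta object are assumptions, not documented API):

import math

# 'assembly_cov.json' is assumed to hold {"values": [...]}; 'assembly_cov'
# becomes the field id and its '_cov' suffix selects the base-coverage branch.
fields = parse_json_cov('assembly_cov.json=assembly_cov',
                        cov_range=[math.inf, -math.inf],  # initial range the new values are merged into
                        meta=dataset_meta)  # assumed object exposing has_field()/field_meta()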
Example #5
File: fetch.py  Project: epaule/blobtools2
def fetch_taxdump(path_to_taxdump):
    """Load Taxdump from file."""
    json_file = "%s/taxdump.json" % path_to_taxdump
    if not Path(json_file).exists():
        print("Parsing taxdump")
    else:
        print("Loading parsed taxdump")
    data = file_io.load_yaml(json_file)
    if data is None:
        taxdump = Taxdump(path_to_taxdump)
        file_io.write_file(json_file, taxdump.values_to_dict())
    else:
        taxdump = Taxdump(path_to_taxdump, **data)
    return taxdump
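
A hedged usage note: on the first call the taxdump in the given directory is parsed and cached as taxdump.json alongside it; later calls load the cached JSON instead. The path below is illustrative only.

taxdump = fetch_taxdump('tests/files/taxdump')  # writes tests/files/taxdump/taxdump.json on the first run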
Example #6
def fetch_metadata(path_to_dataset, **kwargs):
    """
    Load Metadata from file.

    fetch_metadata('tests/files/dataset')
    """
    dataset_id = path_to_dataset.split('/').pop()
    new_meta = {}
    if not os.path.exists(path_to_dataset):
        os.makedirs(path_to_dataset)
    if kwargs.get('--meta'):
        new_meta = file_io.load_yaml(kwargs['--meta'])
        if kwargs['--replace']:
            files = glob.glob("%s/*" % kwargs['DIRECTORY'])
            for file in files:
                os.remove(file)
    try:
        meta = kwargs['meta']
    except KeyError:
        meta = file_io.load_yaml("%s/meta.json" % path_to_dataset)
    if not meta:
        meta = {}
    if 'id' not in meta:
        meta['id'] = dataset_id
        meta['name'] = dataset_id
    for key, value in new_meta.items():
        if isinstance(value, dict):
            try:
                meta[key].update({k: v for k, v in value.items()})
            except KeyError:
                meta[key] = value
        elif isinstance(value, list):
            meta[key] += value
        else:
            meta[key] = value
    return Metadata(dataset_id, **meta)
Example #7
File: fetch.py  Project: epaule/blobtools2
def fetch_field(path_to_dataset, field_id, meta=None):
    """
    Load fields from file.

    fetch_field('tests/files/dataset', 'identifiers', meta)
    """
    field_meta = meta.field_meta(field_id)
    try:
        data = file_io.load_yaml("%s/%s.json" % (path_to_dataset, field_id))
        if data is not None:
            data.update({"meta": field_meta})
        field = TYPES[field_meta["type"]](field_id, **data)
    except TypeError:
        field = False
    except KeyError:
        field = False
    return field
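
As the docstring shows, the field is read from <path_to_dataset>/<field_id>.json and instantiated via the class registered in TYPES for its declared type; if the file is missing or the type is unknown the function returns False rather than raising, so callers should check for that:

field = fetch_field('tests/files/dataset', 'identifiers', meta)
if field is False:
    pass  # hypothetical handling of a missing or unparsable field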
Example #8
def parse(file, **kwargs):
    """Parse all synonym files."""
    blob_db = file_io.load_yaml(file)
    kwargs['meta'].assembly.update({'file': blob_db['assembly_f']})
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        kwargs['meta'].assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)
    kwargs['meta'].assembly.update({'span': sum(values['lengths'])})
    parsed.append(
        Variable('gc',
                 meta={
                     'preload': True,
                     'scale': 'scaleLinear',
                     'field_id': 'gc',
                     'name': 'GC',
                     'datatype': 'float',
                     'range': [min(values['gcs']),
                               max(values['gcs'])]
                 },
                 values=values['gcs'],
                 parents=[]))
    _min = min(values['lengths'])
    parsed.append(
        Variable('length',
                 meta={
                     'field_id': 'length',
                     'preload': True,
                     'scale': 'scaleLog',
                     'name': 'Length',
                     'clamp': 100 if _min == 0 else False,
                     'datatype': 'integer',
                     'range': [_min, max(values['lengths'])]
                 },
                 parents=[],
                 values=values['lengths']))
    parsed.append(
        Variable('ncount',
                 meta={
                     'field_id': 'ncount',
                     'scale': 'scaleLinear',
                     'name': 'N count',
                     'datatype': 'integer',
                     'range':
                     [min(values['n_counts']),
                      max(values['n_counts'])]
                 },
                 values=values['n_counts'],
                 parents=[]))
    if 'z' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'z': 'length'})
    if 'x' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'x': 'gc'})
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, cov_meta in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(blob_db['covLibs'][cov_lib]['f'])
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        cov_range = [min(covs + [cov_range[0]]), max(covs + [cov_range[1]])]
        read_cov_range = [
            min(read_covs + [read_cov_range[0]]),
            max(read_covs + [read_cov_range[1]])
        ]
        if 'y' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'y': "%s_cov" % cov_file_name})
        parsed.append(
            Variable("%s_cov" % cov_file_name,
                     values=covs,
                     meta={
                         'field_id': "%s_cov" % cov_file_name,
                         'file': cov_meta['f']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'base_coverage',
                             'clamp': 1 if cov_range[0] == 0 else False,
                             'range': cov_range
                         }, 'children'
                     ]))
        parsed.append(
            Variable("%s_read_cov" % cov_file_name,
                     values=read_covs,
                     meta={
                         'field_id': "%s_read_cov" % cov_file_name,
                         'file': cov_meta['f'],
                         'reads_mapped': cov_meta['reads_mapped'],
                         'reads_unmapped': cov_meta['reads_unmapped']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'read_coverage',
                             'datatype': 'integer',
                             'clamp': 1 if read_cov_range[0] == 0 else False,
                             'range': read_cov_range
                         }, 'children'
                     ]))
    ranks = blob_db['dict_of_blobs'][identifiers.values[0]]['taxonomy'][
        blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:
        if 'cat' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'cat': "%s_phylum" % tax_rule})
        hit_list = hits_from_blob_db(blob_db, tax_rule)
        parsed.append(
            MultiArray("%s_hits" % tax_rule,
                       values=hit_list,
                       meta={
                           'field_id': "%s_hits" % tax_rule,
                           'type': 'multiarray',
                           'datatype': 'mixed',
                           'preload': False,
                           'active': False,
                           'files':
                           [m['f'] for x, m in blob_db['hitLibs'].items()]
                       },
                       parents=hits.parent() +
                       ['children', {
                           'id': tax_rule
                       }, 'children'],
                       category_slot=None,
                       headers=['taxid', 'score']))
        for rank in ranks:
            field_id = "%s_%s" % (tax_rule, rank)
            parsed.append(
                Category(field_id,
                         values=values[field_id],
                         meta={'field_id': field_id},
                         parents=hits.parent() +
                         ['children', {
                             'id': tax_rule
                         }, 'children']))
            parents = hits.parent() + [
                'children', {
                    'id': tax_rule
                }, 'children', {
                    'id': field_id
                }, 'data'
            ]
            field_id = "%s_%s_cindex" % (tax_rule, rank)
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLinear',
                             'field_id': field_id,
                             'datatype': 'integer',
                             'range': [min(values[field_id]),
                                       max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))
            field_id = "%s_%s_score" % (tax_rule, rank)
            _min = min(values[field_id])
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLog',
                             'field_id': field_id,
                             'clamp': 1 if _min == 0 else False,
                             'datatype': 'float',
                             'range': [_min, max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))

    return parsed
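
For orientation, the keys this parser reads from the loaded blob_db are assembly_f, order_of_blobs, dict_of_blobs, covLibs, hitLibs and taxrules. A hedged sketch of a call (the file name and kwargs wiring are assumptions based on the accesses above):

parsed_fields = parse('blobDB.json',
                      meta=dataset_meta,                   # assumed Metadata object with dict-like .assembly and .plot
                      dependencies={'identifiers': None})  # a falsy value triggers creation of the identifiers field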