예제 #1
0
def index_assembly(obj_data, ws_info, obj_data_v1):
    """
    Yield the search-index document for an assembly workspace object.

    Currently handles the following workspace types:
         KBaseGenomeAnnotations.Assembly-6.0

    Args:
        obj_data: dict with an 'info' sequence (index 0 is used as the object
            id, index 6 as the workspace id) and a 'data' dict holding the
            assembly itself.
        ws_info: not used by this function.
        obj_data_v1: not used by this function.

    Yields:
        A single 'index' action dict targeting _ASSEMBLY_INDEX_NAME whose
        'doc' summarizes the assembly.
    """
    info = obj_data['info']
    data = obj_data['data']
    workspace_id = info[6]
    object_id = info[0]

    mean_contig_length = None
    percent_complete_contigs = None
    percent_circle_contigs = None

    contigs = data.get('contigs')
    if contigs:
        contig_records = list(contigs.values())
        # Contigs that do not store a (non-zero) length are left out of the mean.
        lengths = [
            contig['length'] for contig in contig_records
            if contig.get('length')
        ]
        # For the flag fields only *missing* values are skipped. Filtering on
        # truthiness (as was done before) discards every 0/False flag, so the
        # mean could never be anything but 1.
        # NOTE(review): assumes the flags are numeric/bool (0/1) - confirm
        # against the Assembly type spec.
        complete_flags = [
            contig['is_complete'] for contig in contig_records
            if contig.get('is_complete') is not None
        ]
        circular_flags = [
            contig['is_circ'] for contig in contig_records
            if contig.get('is_circ') is not None
        ]
        # Guard empty samples explicitly so the result is None regardless of
        # how `mean` treats an empty sequence.
        mean_contig_length = mean(lengths) if lengths else None
        percent_complete_contigs = mean(complete_flags) if complete_flags else None
        percent_circle_contigs = mean(circular_flags) if circular_flags else None

    yield {
        '_action': 'index',
        'doc': {
            "assembly_name": data.get("name", None),
            "mean_contig_length": mean_contig_length,
            "percent_complete_contigs": percent_complete_contigs,
            "percent_circle_contigs": percent_circle_contigs,
            "assembly_id": data.get('assembly_id', None),
            "gc_content": data.get('gc_content', None),
            "size": data.get('dna_size', None),
            "num_contigs": data.get('num_contigs', None),
            "taxon_ref": data.get('taxon_ref', None),
            "external_origination_date":
                data.get('external_source_origination_date', None),
            "external_source_id": data.get('external_source_id', None),
            "external_source": data.get('external_source', None),
        },
        'index': _ASSEMBLY_INDEX_NAME,
        'id': f"{_NAMESPACE}::{workspace_id}:{object_id}",
    }
예제 #2
0
def _index_ama(features_file_gz_path, data, ama_id, ver_ama_id, tmp_dir, conf):
    """
    Yield index actions for an annotated metagenome assembly (AMA).

    Two object-level documents are always yielded: one with id `ama_id` in
    conf['index_name'] and one with id `ver_ama_id` in
    conf['ver_index_name']. Unless config()['skip_features'] is truthy, one
    document per feature from the gzipped features file follows, written to
    conf['ver_features_index_name'].

    Args:
        features_file_gz_path: path to a gzip-compressed JSON file that
            decodes to an iterable of feature dicts.
        data: the AMA object data dict.
        ama_id: document id for the unversioned AMA document.
        ver_ama_id: document id for the versioned AMA document; also the
            prefix of every feature document id.
        tmp_dir: directory in which the features file is temporarily unzipped.
        conf: dict providing 'index_name', 'ver_index_name' and
            'ver_features_index_name'.

    Yields:
        'index' action dicts with '_action', 'doc', 'index' and 'id' keys.
    """
    publications = data.get('publications', [])
    publication_titles = [pub[2] for pub in publications]
    publication_authors = [pub[5] for pub in publications]
    ama_index = {
        '_action': 'index',
        'doc': {
            'size': data.get('dna_size'),
            'source_id': data.get('source_id'),
            'source': data.get('source'),
            'gc_content': data.get('gc_content'),
            'warnings': data.get('warnings'),
            'num_contigs': data.get('num_contigs'),
            'mean_contig_length': mean(data.get('contig_lengths', [])),
            'external_source_origination_date': data.get('external_source_origination_date'),
            'original_source_file_name': data.get('original_source_file_name'),
            'environment': data.get('environment'),
            'num_features': data.get('num_features'),
            'publication_authors': publication_authors,
            'publication_titles': publication_titles,
            'molecule_type': data.get('molecule_type'),
            'assembly_ref': data.get('assembly_ref'),
            'notes': data.get('notes'),
            # not sure what to do with the following fields.
            # list<Ontology_event> ontology_events;
            # mapping<string, mapping<string, string>> ontologies_present;
        },
        'index': conf['index_name'],
        'id': ama_id
    }
    yield ama_index
    # Shallow copy: the versioned action shares the same 'doc' dict and only
    # differs in its id and target index.
    ver_ama_index = dict(ama_index)
    ver_ama_index['id'] = ver_ama_id
    ver_ama_index['index'] = conf['ver_index_name']
    yield ver_ama_index

    if config()['skip_features']:
        # Indexing of AMA features is turned off in the env
        return

    # Unzip the features file into tmp_dir and load it. The unzipped copy is
    # removed as soon as it has been parsed (or if anything fails), so it is
    # not left behind when an error occurs or the consumer stops iterating
    # before the last feature.
    features_file_path = os.path.join(tmp_dir, ver_ama_id.replace(':', "_") + ".json")
    try:
        with gzip.open(features_file_gz_path, "rb") as f_in:
            with open(features_file_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        with open(features_file_path) as f:
            features = json.load(f)
    finally:
        if os.path.exists(features_file_path):
            os.remove(features_file_path)

    for feat in features:
        id_ = feat.get('id')
        ver_feat_id = ver_ama_id + f"::ama_ft::{id_}"

        # Each location entry is unpacked as (contig_id, start, strand, stop);
        # transpose the list of entries into four parallel lists.
        location = feat.get('location')
        if location:
            contig_ids, starts, strands, stops = (list(column) for column in zip(*location))
        else:
            contig_ids, starts, strands, stops = None, None, None, None

        # NOTE: per-feature 'aliases' and a per-feature GC content computed
        # from 'dna_sequence' are currently not indexed.
        yield {
            '_action': 'index',
            'doc': {
                'id': id_,
                'type': feat.get('type'),
                'size': feat.get('dna_sequence_length'),
                'starts': starts,
                'strands': strands,
                'stops': stops,
                'contig_ids': contig_ids,
                'functions': feat.get('functions'),
                'functional_descriptions': feat.get('functional_descriptions'),
                'warnings': feat.get('warnings'),
                'parent_gene': feat.get('parent_gene'),
                'inference_data': feat.get('inference_data'),
                'dna_sequence': feat.get('dna_sequence'),
                # Parent ids below
                'parent_id': ver_ama_id,
                'annotated_metagenome_assembly_size': data.get('dna_size'),
                'annotated_metagenome_assembly_num_features': data.get('num_features'),
                'annotated_metagenome_assembly_num_contigs': data.get('num_contigs'),
                'annotated_metagenome_assembly_gc_content': data.get('gc_content')
            },
            'index': conf['ver_features_index_name'],
            'id': ver_feat_id,
        }
예제 #3
0
def index_genome(obj_data, ws_info, obj_data_v1):
    """
    Yield index actions for a genome object and, optionally, its features.

    Currently indexes following workspace types:
        ci:              KBaseGenomes.Genome-13.0+
        narrative(prod): KBaseGenomes.Genome-8.1+

    The first action is the genome document itself. Unless
    config()['skip_features'] is truthy, one document is then yielded for
    every entry of the genome's 'features', 'non_coding_features', 'cdss'
    and 'mrnas' lists.

    Args:
        obj_data: dict with an 'info' sequence (index 0 is used as the object
            id, index 4 as the version, index 6 as the workspace id) and a
            non-empty 'data' dict holding the genome.
        ws_info: not used by this function.
        obj_data_v1: not used by this function.

    Yields:
        'index' action dicts with '_action', 'doc', 'index' and 'id' keys.

    Raises:
        Exception: if obj_data has no (or empty) 'data'. Because this is a
            generator, the error surfaces on first iteration.
    """
    info = obj_data['info']
    if not obj_data.get('data'):
        raise Exception("no data in object")
    data = obj_data['data']
    workspace_id = info[6]
    object_id = info[0]
    version = info[4]

    # Turn the "a/b/c" style reference into "a:b:c"; older objects carry the
    # reference under 'contigset_ref' instead of 'assembly_ref'.
    assembly_ref = data.get('assembly_ref', data.get('contigset_ref', "")).replace('/', ':')
    publications = data.get('publications', [])
    publication_titles = [pub[2] for pub in publications]
    publication_authors = [pub[5] for pub in publications]
    genome_scientific_name = data.get('scientific_name', None)
    genome_id = f"{_NAMESPACE}::{workspace_id}:{object_id}"
    gene_count = len(data.get('features', []))

    yield {
        '_action': 'index',
        'doc': {
            'genome_id': data.get('id', None),
            'scientific_name': genome_scientific_name,
            'publication_titles': publication_titles,
            'publication_authors': publication_authors,
            'size': data.get('dna_size', None),
            'num_contigs': data.get('num_contigs', None),
            'genome_type': data.get('genome_type', None),
            'gc_content': data.get('gc_content', None),
            'taxonomy': data.get('taxonomy', None),
            'mean_contig_length': mean(data.get('contig_lengths', [])),
            'external_origination_date':
                data.get('external_source_origination_date', None),
            'original_source_file_name':
                data.get('original_source_file_name', None),
            'cds_count': len(data.get('cdss', [])),
            'feature_count': gene_count,
            'mrna_count': len(data.get('mrnas', [])),
            'non_coding_feature_count':
                len(data.get('non_coding_features', [])),
            'assembly_ref': assembly_ref,
            'source_id': data.get('source_id', []),
            'feature_counts': data.get('feature_counts', None),
            'source': data.get('source', None),
            'warnings': data.get('warnings', None)
        },
        'index': _GENOME_INDEX_NAME,
        'id': genome_id
    }

    if config()['skip_features']:
        # Indexing of genome features is turned off in the env
        return

    # Genome-level fields copied onto every feature document. They do not
    # change between features, so build them once instead of per feature.
    genome_summary = {
        'genome_scientific_name': genome_scientific_name,
        'genome_taxonomy': data.get('taxonomy'),
        'genome_source': data.get('source'),
        'genome_source_id': data.get('source_id'),
        'genome_size': data.get('dna_size'),
        'genome_num_contigs': data.get('num_contigs'),
        'genome_feature_count': gene_count,
        'genome_gc_content': data.get('gc_content'),
    }

    # (default feature type, key of the feature list in the genome data)
    feature_sources = [
        ('gene', 'features'),
        ('non_coding_feature', 'non_coding_features'),
        ('CDS', 'cdss'),
        ('mrna', 'mrnas'),
    ]
    for feat_type, field in feature_sources:
        for feat in data.get(field, []):
            # Each location entry is unpacked as
            # (contig_id, start, strand, stop); transpose the entries into
            # four parallel lists.
            location = feat.get('location')
            if location:
                contig_ids, starts, strands, stops = (
                    list(column) for column in zip(*location))
            else:
                contig_ids, starts, strands, stops = None, None, None, None
            feature_id = feat.get('id', "")
            yield {
                '_action': 'index',
                'doc': {
                    'id': feature_id,
                    # The feature's own 'type' wins over the list default.
                    'feature_type': feat.get('type', feat_type),
                    'functions': feat.get('functions'),
                    'contig_ids': contig_ids,
                    'sequence_length': feat.get('dna_sequence_length', None),
                    'obj_type_name':
                        "GenomeFeature",  # hack to get ui for features to work.
                    'assembly_ref': assembly_ref,
                    'starts': starts,
                    'strands': strands,
                    'stops': stops,
                    'aliases': feat.get('aliases', None),
                    # Parent data from the Genome
                    'parent_id': genome_id,
                    'genome_version': int(version),
                    **genome_summary,
                },
                'index': _GENOME_FEATURE_INDEX_NAME,
                'id': genome_id + f'::ft::{feature_id}'
            }