Example #1
# Standard-library imports used by this class; the KBase-specific imports
# (DataFileUtil, GenomeInterface, and the get_start/get_end location helpers)
# are assumed to come from the surrounding GenomeFileUtil module.
import csv
import json
import os
import time
import traceback
import urllib.parse
from collections import defaultdict


class GenomeToGFF:
    """
    typedef structure {
        string genome_ref;
        list <string> ref_path_to_genome;
        int is_gtf;
    } GenomeToGFFParams;

    /* from_cache is 1 if the file already exists and was just returned, 0 if
    the file was generated during this call. */
    typedef structure {
        File file_path;
        boolean from_cache;
    } GenomeToGFFResult;

    funcdef genome_to_gff(GenomeToGFFParams params)
                returns (GenomeToGFFResult result) authentication required;
    """

    def __init__(self, sdk_config):
        self.cfg = sdk_config
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(sdk_config)
        self.child_dict = {}
        self.transcript_counter = defaultdict(int)

    def export(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) get genome info
        data, info = self.gi.get_one_genome({'objects': [{"ref": params['genome_ref']}]})

        # 3) make sure the type is valid
        ws_type_name = info[2].split('.')[1].split('-')[0]
        if ws_type_name not in ('Genome', 'AnnotatedMetagenomeAssembly'):
            raise ValueError('Object is not a Genome or an AnnotatedMetagenomeAssembly, '
                             'it is a: ' + str(info[2]))

        is_gtf = params.get('is_gtf', 0)

        target_dir = params.get('target_dir')
        if not target_dir:
            target_dir = os.path.join(self.cfg.sharedFolder, "gff_" + str(int(time.time() * 1000)))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        is_metagenome = 'AnnotatedMetagenomeAssembly' in info[2]

        if is_metagenome:
            # if the type is metagenome, get from shock
            result = self.get_gff_handle(data, target_dir)
        else:
            # 4) Build the GFF/GTF file and return it
            result = self.build_gff_file(data, target_dir, info[1], is_gtf == 1, is_metagenome)
        if result is None:
            raise ValueError('Unable to generate file.  Something went wrong')
        result['from_cache'] = int(is_metagenome)
        return result

    def get_gff_handle(self, data, output_dir):
        """Get the gff file directly from the 'gff_handle_ref' field in the object"""
        if not data.get('gff_handle_ref'):
            return None

        print('pulling cached GFF file from Shock: '+str(data['gff_handle_ref']))
        file_ret = self.dfu.shock_to_file(
            {'handle_id': data['gff_handle_ref'],
             'file_path': output_dir,
             'unpack': 'unpack'})
        return {'file_path': file_ret['file_path']}

    def build_gff_file(self, genome_data, output_dir, output_filename, is_gtf, is_metagenome):
        def feature_sort(feat):
            order = ('gene', 'mRNA', 'CDS')
            if feat.get('children'):
                priority = 0
            elif feat['type'] not in order:
                priority = len(order)
            else:
                priority = order.index(feat['type'])
            return get_start(self.get_common_location(
                feat['location'])), priority

        gff_header = ['seqname', 'source', 'type', 'start', 'end', 'score',
                      'strand', 'frame', 'attribute']

        # create the file
        file_ext = ".gtf" if is_gtf else ".gff"
        out_file_path = os.path.join(output_dir, output_filename + file_ext)
        print('Creating file: ' + str(out_file_path))

        if is_metagenome:
            json_file_path = os.path.join(output_dir, output_filename + '_features.json')

            json_res = self.dfu.shock_to_file({
                'handle_id': genome_data['features_handle_ref'],
                'file_path': json_file_path
            })
            with open(json_res['file_path']) as json_fid:
                features = json.load(json_fid)

            features_by_contig = defaultdict(list)
            for feature in features:
                if 'type' not in feature:
                    feature['type'] = 'gene'
                elif feature['type'] in ('CDS', 'mRNA'):
                    if feature.get('parent_gene'):
                        self.child_dict[feature['id']] = feature
                features_by_contig[feature['location'][0][0]].append(feature)

        else:
            """There is two ways of printing, if a feature has a parent_gene, it
            will be printed breadth first when it's parent parent gene is printed.
            if not, it needs to be added to the features_by_contig to be printed"""
            # sort every feature in the feat_arrays into a dict by contig
            features_by_contig = defaultdict(list)
            for feature in genome_data['features'] + genome_data.get(
                    'non_coding_features', []):
                # type is not present in new gene array
                if 'type' not in feature:
                    feature['type'] = 'gene'
                features_by_contig[feature['location'][0][0]].append(feature)

            for mrna in genome_data.get('mrnas', []):
                mrna['type'] = 'mRNA'
                if mrna.get('parent_gene'):
                    self.child_dict[mrna['id']] = mrna
                else:
                    features_by_contig[mrna['location'][0][0]].append(mrna)

            for cds in genome_data.get('cdss', []):
                cds['type'] = 'CDS'
                if cds.get('parent_gene') or cds.get('parent_mrna'):
                    self.child_dict[cds['id']] = cds
                else:
                    features_by_contig[cds['location'][0][0]].append(cds)

        with open(out_file_path, 'w') as file_handle:
            writer = csv.DictWriter(file_handle, gff_header, delimiter="\t",
                                    escapechar='\\', quotechar="'")
            for contig in genome_data.get('contig_ids', features_by_contig.keys()):
                file_handle.write("##sequence-region {}\n".format(contig))
                features_by_contig[contig].sort(key=feature_sort)
                for feature in features_by_contig[contig]:
                    writer.writerows(self.make_feature_group(feature, is_gtf))

        return {'file_path': out_file_path}

    def make_feature_group(self, feature, is_gtf):
        # RNA types make exons if they have compound locations
        if feature['type'] in {'RNA', 'mRNA', 'tRNA', 'rRNA', 'misc_RNA', 'transcript'}:
            loc = self.get_common_location(feature['location'])
            lines = [self.make_feature(loc, feature, is_gtf)]
            for i, loc in enumerate(feature['location']):
                exon = {'id': "{}_exon_{}".format(feature['id'], i + 1),
                        'parent_gene': feature.get('parent_gene', ""),
                        'parent_mrna': feature['id']}
                lines.append(self.make_feature(loc, exon, is_gtf))
        # other types duplicate the feature
        else:
            lines = [self.make_feature(loc, feature, is_gtf)
                     for loc in feature['location']]

        # if this is a gene with mRNAs, make the mRNAs (and their subfeatures)
        if feature.get('mrnas', False):
            for mrna_id in feature['mrnas']:
                lines += self.make_feature_group(self.child_dict[mrna_id], is_gtf)
        # if a gene has no mRNAs but does have CDSs, make them here
        elif feature.get('cdss', False):
            for cds_id in feature['cdss']:
                lines += self.make_feature_group(self.child_dict[cds_id], is_gtf)
        # if this is an mRNA with a child CDS, make it here
        elif feature.get('cds', False):
            lines += self.make_feature_group(self.child_dict[feature['cds']], is_gtf)

        return lines

    def make_feature(self, location, in_feature, is_gtf):
        """Make a single feature line for the file"""
        try:
            out_feature = {
                'seqname': location[0],
                'source': 'KBase',
                'type': in_feature.get('type', 'exon'),
                'start': str(get_start(location)),
                'end': str(get_end(location)),
                'score': '.',
                'strand': location[2],
                'frame': '0',
            }
            if is_gtf:
                out_feature['attribute'] = self.gen_gtf_attr(in_feature)
            else:
                out_feature['attribute'] = self.gen_gff_attr(in_feature)
        except Exception as e:
            traceback.print_exc()
            raise Exception(f'Unable to parse {in_feature}: {e}')
        return out_feature

    @staticmethod
    def gen_gtf_attr(feature):
        """Makes the attribute line for a feature in gtf style"""
        if feature.get('type') == 'gene':
            return f'gene_id "{feature["id"]}"; transcript_id ""'

        if "parent" in feature:
            feature['parent_gene'] = feature['parent']

        return (f'gene_id "{feature.get("parent_gene", feature["id"])}"; '
                f'transcript_id "{feature.get("parent_mrna", feature["id"])}"')

    @staticmethod
    def gen_gff_attr(feature):
        """Makes the attribute line for a feature in gff style"""
        def _one_attr(k, val):
            return f'{k}={urllib.parse.quote(val, " /:")}'
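        # Example output (illustrative feature; only truthy fields are emitted):
        #   {'id': 'gene_1', 'functions': ['kinase']} -> 'ID=gene_1; product=kinase'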

        # attributes whose value could be 0 are skipped by the truthiness
        # checks below; supporting them would need a refactor
        for key in ('parent_gene', 'parent_mrna'):
            if key in feature:
                feature['parent'] = feature[key]
        attr_keys = (('id', 'ID'), ('parent', 'Parent'), ('note', 'note'))
        attrs = [_one_attr(pair[1], feature[pair[0]])
                 for pair in attr_keys if feature.get(pair[0])]
        attrs.extend([_one_attr('db_xref', '{}:{}'.format(*x))
                     for x in feature.get('db_xrefs', [])])
        attrs.extend([_one_attr(pair[0], pair[1])
                      for pair in feature.get('aliases', [''])
                      if isinstance(pair, list)])
        if feature.get('functional_descriptions'):
            attrs.append(_one_attr('function', ";".join(
                feature['functional_descriptions'])))
        if feature.get('functions'):
            attrs.append(_one_attr('product', ";".join(feature['functions'])))
        elif feature.get('function'):
            attrs.append(_one_attr('product', feature['function']))
        for ont in feature.get('ontology_terms', []):
            attrs.extend([_one_attr(ont.lower(), x)
                          for x in feature['ontology_terms'][ont]])

        if 'inference_data' in feature:
            attrs.extend([_one_attr(
                'inference', ":".join([x[y] for y in ('category', 'type', 'evidence') if x[y]]))
                for x in feature['inference_data']])
        if 'trans_splicing' in feature.get('flags', []):
            attrs.append(_one_attr("exception", "trans-splicing"))
        return "; ".join(attrs)

    @staticmethod
    def get_common_location(location_array):
        """Merges a compound location array into an overall location"""
        contig = location_array[0][0]
        strand = location_array[0][2]
        min_pos = min([get_start(loc) for loc in location_array])
        max_pos = max([get_end(loc) for loc in location_array])
        common_length = max_pos - min_pos + 1
        common_start = min_pos if strand == '+' else max_pos
        return [contig, common_start, strand, common_length]

    @staticmethod
    def validate_params(params):
        if 'genome_ref' not in params:
            raise ValueError('required "genome_ref" field was not defined')
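

# Usage sketch (not part of the original module): a minimal, hypothetical
# driver for GenomeToGFF.export. The callback URL, shared folder, and
# workspace ref below are illustrative placeholders; in practice the
# GenomeFileUtil SDK service constructs this class with its own validated
# configuration, so this only runs where the KBase callback service and
# workspace are reachable.
if __name__ == '__main__':
    from types import SimpleNamespace

    cfg = SimpleNamespace(callbackURL='http://localhost:5000',   # placeholder
                          sharedFolder='/kb/module/work/tmp')    # placeholder
    exporter = GenomeToGFF(cfg)
    result = exporter.export(ctx={}, params={'genome_ref': '12345/6/7',  # placeholder ref
                                             'is_gtf': 1})
    print(result['file_path'], result['from_cache'])
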
Example #2
class GenomeToGenbank(object):
    def __init__(self, sdk_config):
        self.cfg = sdk_config
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(sdk_config)

    def validate_params(self, params):
        if 'genome_ref' not in params:
            raise ValueError('required "genome_ref" field was not defined')

    def export(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) get genome info
        data, info = self.gi.get_one_genome(
            {'objects': [{
                "ref": params['genome_ref']
            }]})

        # 3) make sure the type is valid
        if info[2].split(".")[1].split('-')[0] != 'Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(info[2]))

        # 4) build the genbank file and return it
        log('not cached, building file...')
        result = self.build_genbank_file(data,
                                         "KBase_derived_" + info[1] + ".gbff",
                                         params['genome_ref'])
        if result is None:
            raise ValueError('Unable to generate file.  Something went wrong')
        result['from_cache'] = 0
        return result

    def export_original_genbank(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) get genome genbank handle reference
        data, info = self.gi.get_one_genome(
            {'objects': [{
                "ref": params['genome_ref']
            }]})

        # 3) make sure the type is valid
        if info[2].split(".")[1].split('-')[0] != 'Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(info[2]))

        # 4) if the genbank handle is there, get it and return
        log('checking if genbank file is cached...')
        result = self.get_genbank_handle(data)
        return result

    def get_genbank_handle(self, data):
        if 'genbank_handle_ref' not in data:
            return None
        if data['genbank_handle_ref'] is None:
            return None

        log('pulling cached genbank file from Shock: ' +
            str(data['genbank_handle_ref']))
        file = self.dfu.shock_to_file({
            'handle_id': data['genbank_handle_ref'],
            'file_path': self.cfg.sharedFolder,
            'unpack': 'unpack'
        })
        return {'genbank_file': {'file_path': file['file_path']}}

    def build_genbank_file(self, genome_data, output_filename, genome_ref):
        g = GenomeFile(self.cfg, genome_data, genome_ref)
        file_path = self.cfg.sharedFolder + "/" + output_filename
        g.write_genbank_file(file_path)

        return {'genbank_file': {'file_path': file_path}}
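

# Usage sketch (not part of the original module): a hypothetical call into
# GenomeToGenbank, mirroring the GenomeToGFF driver in Example #1. The config
# object and workspace ref are placeholders. Note that export() always
# rebuilds the GenBank file from the Genome object, while
# export_original_genbank() only returns the cached file when a
# 'genbank_handle_ref' is present on the object.
if __name__ == '__main__':
    from types import SimpleNamespace

    cfg = SimpleNamespace(callbackURL='http://localhost:5000',   # placeholder
                          sharedFolder='/kb/module/work/tmp')    # placeholder
    exporter = GenomeToGenbank(cfg)
    result = exporter.export(ctx={}, params={'genome_ref': '12345/6/7'})  # placeholder ref
    print(result['genbank_file']['file_path'], result['from_cache'])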