class metadata_m(DictSerializable, db_2.Model):
    """
    Key/value table used to store information about the other tables.
    """
    __tablename__ = "metadata"

    # Lookup key; also the table's primary key.
    key = db_2.Column(db_2.String(50), primary_key=True, index=True)
    # String payload stored under the key.
    value = db_2.Column(db_2.String)
class homologs_m(DictSerializable, db_2.Model):
    """
    Homolog table: each row ties a gene (through a foreign key onto
    wormbase_gene_summary.gene_id) to a homolog gene, together with the
    homolog's species, taxon id (when available) and the source of the call.
    """
    __tablename__ = "homologs"

    id = db_2.Column(db_2.Integer, primary_key=True)
    gene_id = db_2.Column(db_2.ForeignKey('wormbase_gene_summary.gene_id'),
                          nullable=False,
                          index=True)
    gene_name = db_2.Column(db_2.String(40), index=True)
    homolog_species = db_2.Column(db_2.String(50), index=True)
    homolog_taxon_id = db_2.Column(db_2.Integer, index=True, nullable=True)  # If available
    homolog_gene = db_2.Column(db_2.String(50), index=True)
    homolog_source = db_2.Column(db_2.String(40))

    # Joined-eager link to the gene summary row; the summary side gets a
    # `homologs` backref.
    gene_summary = db_2.relationship("wormbase_gene_summary_m",
                                     backref='homologs',
                                     lazy='joined')

    def unnest(self):
        """
        Used with the gene API - flattens the related wormbase gene summary
        into this homolog.

        The summary's attributes are copied straight into this instance's
        ``__dict__`` (overwriting same-named attributes such as ``id``) and
        ``gene_summary`` is then set to None, so the object mutates in place.

        Returns:
            self, after flattening.
        """
        summary = self.gene_summary
        if summary is not None:
            # Copy the summary's attributes but leave SQLAlchemy's private
            # per-instance state alone: taking the summary's
            # `_sa_instance_state` would make this homolog carry another
            # object's ORM bookkeeping.
            self.__dict__.update(
                (key, value)
                for key, value in summary.__dict__.items()
                if key != '_sa_instance_state'
            )
        self.__dict__['gene_summary'] = None
        return self

    def __repr__(self):
        return f"homolog: {self.gene_name} -- {self.homolog_gene}"
class wormbase_gene_summary_m(DictSerializable, db_2.Model):
    """
    Condensed counterpart of the wormbase_gene_m model. Built for
    convenience: it only describes genes (no exons/introns/etc.).
    """
    __tablename__ = "wormbase_gene_summary"

    id = db_2.Column(db_2.Integer, primary_key=True)
    chrom = db_2.Column(db_2.String(7), index=True)
    chrom_num = db_2.Column(db_2.Integer(), index=True)
    start = db_2.Column(db_2.Integer(), index=True)
    end = db_2.Column(db_2.Integer(), index=True)
    locus = db_2.Column(db_2.String(30), index=True)
    gene_id = db_2.Column(db_2.String(25), index=True)
    gene_id_type = db_2.Column(db_2.String(15), index=False)
    sequence_name = db_2.Column(db_2.String(30), index=True)
    biotype = db_2.Column(db_2.String(30), nullable=True)

    # First non-NULL value among locus, sequence_name and gene_id.
    gene_symbol = db_2.column_property(
        func.coalesce(locus, sequence_name, gene_id))
    # "chrom:start-end" string assembled by the database's printf().
    # NOTE(review): printf() is a SQLite SQL function - confirm the backend.
    interval = db_2.column_property(
        func.printf("%s:%s-%s", chrom, start, end))

    arm_or_center = db_2.Column(db_2.String(12), index=True)

    @classmethod
    def resolve_gene_id(cls, query):
        """
        Look up the gene ID for a gene identifier.

        query - value compared against the locus and sequence_name columns
        output - the gene_id of the first matching row, or None if no row
                 matches

        Example:
        wormbase_gene_summary_m.resolve_gene_id('pot-2') --> WBGene00010195
        """
        matches_query = or_(cls.locus == query, cls.sequence_name == query)
        hit = cls.query.filter(matches_query).first()
        return hit.gene_id if hit else None
class wormbase_gene_m(DictSerializable, db_2.Model):
    """
    Feature-level gene table: each row is one feature (see the `feature`
    column) with optional transcript / exon / protein identifiers, tied to
    its gene through a foreign key onto wormbase_gene_summary.gene_id.
    """
    __tablename__ = 'wormbase_gene'

    id = db_2.Column(db_2.Integer, primary_key=True)
    chrom = db_2.Column(db_2.String(20), index=True)
    chrom_num = db_2.Column(db_2.Integer(), index=True)  # For sorting purposes
    start = db_2.Column(db_2.Integer(), index=True)
    end = db_2.Column(db_2.Integer(), index=True)
    feature = db_2.Column(db_2.String(30), index=True)
    strand = db_2.Column(db_2.String(1))
    frame = db_2.Column(db_2.Integer(), nullable=True)
    gene_id = db_2.Column(db_2.ForeignKey('wormbase_gene_summary.gene_id'),
                          nullable=False)
    gene_biotype = db_2.Column(db_2.String(30), nullable=True)
    locus = db_2.Column(db_2.String(30), index=True)
    transcript_id = db_2.Column(db_2.String(30), nullable=True, index=True)
    transcript_biotype = db_2.Column(db_2.String(), index=True)
    exon_id = db_2.Column(db_2.String(30), nullable=True, index=True)
    exon_number = db_2.Column(db_2.Integer(), nullable=True)
    protein_id = db_2.Column(db_2.String(30), nullable=True, index=True)
    arm_or_center = db_2.Column(db_2.String(12), index=True)

    # Link to the gene summary row; the summary side gets a
    # `gene_components` backref.
    gene_summary = db_2.relationship("wormbase_gene_summary_m",
                                     backref='gene_components')

    def __repr__(self):
        # The chromosome column on this model is `chrom`; no `seqname`
        # attribute is declared, so referencing it raised AttributeError.
        return f"{self.gene_id}:{self.feature} [{self.chrom}:{self.start}-{self.end}]"
class strain_m(DictSerializable, db_2.Model):
    """
    Strain table: one row per strain (the primary key), with its isotype,
    data release, and isolation / sampling details.
    """
    __tablename__ = "strain"

    strain = db_2.Column(db_2.String(25), primary_key=True)
    reference_strain = db_2.Column(db_2.Boolean(), index=True)
    sequenced = db_2.Column(db_2.Boolean(), index=True, nullable=True)
    isotype = db_2.Column(db_2.String(25), index=True, nullable=True)
    previous_names = db_2.Column(db_2.String(100), nullable=True)
    source_lab = db_2.Column(db_2.String(), nullable=True)
    release = db_2.Column(db_2.Integer(), nullable=False, index=True)
    latitude = db_2.Column(db_2.Float(), nullable=True)
    longitude = db_2.Column(db_2.Float(), nullable=True)
    elevation = db_2.Column(db_2.Float(), nullable=True)
    landscape = db_2.Column(db_2.String(), nullable=True)
    substrate = db_2.Column(db_2.String(), nullable=True)
    photo = db_2.Column(db_2.String(), nullable=True)
    isolated_by = db_2.Column(db_2.String(), nullable=True)
    sampled_by = db_2.Column(db_2.String(), nullable=True)
    isolation_date = db_2.Column(db_2.Date(), nullable=True)
    isolation_date_comment = db_2.Column(db_2.String(), nullable=True)
    notes = db_2.Column(db_2.String(), nullable=True)
    sets = db_2.Column(db_2.String(), nullable=True)
    c_label = db_2.Column(db_2.String(), nullable=True)
    s_label = db_2.Column(db_2.String(), nullable=True)
    substrate_temp = db_2.Column(db_2.Float())
    ambient_temp = db_2.Column(db_2.Float())
    substrate_moisture = db_2.Column(db_2.Float())
    ambient_humidity = db_2.Column(db_2.Float())

    def __repr__(self):
        return self.strain

    def to_json(self):
        """
        Return the instance's attributes whose names do not start with an
        underscore, as a plain dict.
        """
        return {
            k: v
            for k, v in self.__dict__.items()
            if not k.startswith("_")
        }

    def list_sets(self):
        """
        Return the comma-separated `sets` value as a list; an empty list
        when `sets` is empty or None.
        """
        return self.sets.split(",") if self.sets else []

    def bam_url(self):
        """
        Return bam / bam_index url set
        """
        # Adjacent literals concatenate to the same single-line markup the
        # original triple-quoted string held; it is stripped before being
        # wrapped in Markup. The isotype is interpolated before Markup sees
        # it, so it is not HTML-escaped.
        html = (
            f' <a href="{URLS.BAM_URL_PREFIX}/{self.isotype}.bam"> BAM </a> /'
            f' <a href="{URLS.BAM_URL_PREFIX}/{self.isotype}.bam.bai"> bai </a> '
        )
        url_set = Markup(html.strip())
        return url_set

    @staticmethod
    def _with_today_total(counts, column, total):
        """
        Return `counts` with one extra row dated today whose `column`
        value is `total`.
        """
        today = np.datetime64(datetime.datetime.today().strftime("%Y-%m-%d"))
        closing_row = pd.DataFrame([{'isolation_date': today, column: total}])
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported equivalent and also works on older versions.
        return pd.concat([counts, closing_row], ignore_index=True)

    @classmethod
    def cum_sum_strain_isotype(cls):
        """
        Create a time-series dataset of strains and isotypes collected
        over time.

        Reads the whole strain table through `db_2.engine`; takes no
        arguments.

        Returns:
            A DataFrame with columns `isolation_date`, `isotype` and
            `strain`: cumulative counts by first isolation date, plus a
            closing row dated today that carries the overall totals.
        """
        df = pd.read_sql_table(cls.__tablename__, db_2.engine)

        cumulative_isotype = df[['isotype', 'isolation_date']] \
            .sort_values(['isolation_date'], axis=0) \
            .drop_duplicates(['isotype']) \
            .groupby(['isolation_date'], as_index=True) \
            .count() \
            .cumsum() \
            .reset_index()
        # nunique() ignores missing isotypes (the column is nullable), which
        # matches count() above; len(unique()) counted NaN/None as one more
        # isotype.
        cumulative_isotype = cls._with_today_total(
            cumulative_isotype, 'isotype', df['isotype'].nunique())

        cumulative_strain = df[['strain', 'isolation_date']] \
            .sort_values(['isolation_date'], axis=0) \
            .drop_duplicates(['strain']) \
            .dropna(how='any') \
            .groupby(['isolation_date']) \
            .count() \
            .cumsum() \
            .reset_index()
        cumulative_strain = cls._with_today_total(
            cumulative_strain, 'strain', df['strain'].nunique())

        # Left join on the date index: isotype dates drive the rows.
        df = cumulative_isotype.set_index('isolation_date') \
            .join(cumulative_strain.set_index('isolation_date')) \
            .reset_index()
        return df

    @classmethod
    def release_summary(cls, release):
        """
        Returns isotype and strain count for a data release.

        Args:
            release - the data release; rows with `release` less than or
                      equal to this value are counted

        Returns:
            dict with keys `strain_count`, `strain_count_sequenced` and
            `isotype_count`.
        """
        released = cls.query.filter(cls.release <= release)
        counts = {
            'strain_count': released.count(),
            # `== True` is deliberate: it builds a SQL expression.
            'strain_count_sequenced': released.filter(
                cls.sequenced == True).count(),  # noqa: E712
            # count() over the grouped query, as in the original code.
            'isotype_count': released.group_by(cls.isotype).count()
        }
        return counts

    def as_dict(self):
        """
        Return a dict mapping every table column name to this row's value.
        """
        return {c.name: getattr(self, c.name) for c in self.__table__.columns}