# NOTE: the classes below normally live in separate modules of the CoNekT
# project; the imports are consolidated here for readability. Application-
# specific names (db, SQL_COLLATION, Sequence, GeneFamily, GeneFamilyMethod,
# ClusterGOEnrichment, ClusterCladeEnrichment, CoexpressionClusterSimilarity,
# SequenceGOAssociation, SequenceFamilyAssociation, SequenceInterproAssociation,
# sequence_go, sequence_interpro, ConditionTissue, CrossSpeciesExpressionProfile,
# OBOParser, GOParser, InterproParser, InterproDomainParser, get_clade,
# hypergeo_sf, fdr_correction, jaccard and benchmark) are imported from the
# project's own modules and omitted here.
import contextlib
import json
import sys
from collections import defaultdict
from datetime import datetime
from math import log, log2
from statistics import mean

import newick
from flask import Markup, flash, url_for
from markdown import markdown
from sqlalchemy import and_
from sqlalchemy.orm import joinedload, load_only, undefer
from sqlalchemy.sql import join
from werkzeug.security import check_password_hash, generate_password_hash
from yattag import Doc, indent


class User(db.Model):
    __tablename__ = 'users'
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(50), unique=True, index=True)
    first_name = db.Column(db.String(50))
    last_name = db.Column(db.String(50))
    password_hash = db.Column(db.Text)
    email = db.Column(db.Text)
    reset_key = db.Column(db.Text)
    is_admin = db.Column(db.SmallInteger)
    is_banned = db.Column(db.SmallInteger)
    wants_newsletter = db.Column(db.SmallInteger)
    registered = db.Column(db.DateTime)

    def __init__(self, username, password, email,
                 reset_key='', is_admin=False, is_banned=False,
                 registered=None):
        self.username = username
        self.password_hash = generate_password_hash(password)
        self.email = email
        self.reset_key = reset_key
        self.is_admin = is_admin
        self.is_banned = is_banned
        # Evaluate the timestamp at call time; a datetime.now() default
        # argument would be evaluated only once, at class definition.
        self.registered = registered if registered is not None \
            else datetime.now().replace(microsecond=0)
        self.wants_newsletter = False

    def __repr__(self):
        return '<User %d>' % self.id

    def check_password(self, password):
        return check_password_hash(self.password_hash, password)

    @property
    def is_administrator(self):
        return self.is_admin

    @property
    def is_authenticated(self):
        return True

    @property
    def is_active(self):
        return True

    @property
    def is_anonymous(self):
        return False

    def get_id(self):
        return str(self.id)

    @staticmethod
    def get(user_id):
        return User.query.get(user_id)
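
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# The properties above (is_authenticated, is_active, is_anonymous, get_id)
# satisfy Flask-Login's user interface, so the model can back a user_loader
# callback; `login_manager` would be wired to the app elsewhere via
# login_manager.init_app(app).
from flask_login import LoginManager  # assumed dependency

login_manager = LoginManager()


@login_manager.user_loader
def load_user(user_id):
    # Flask-Login passes the id stored by get_id() back as a string.
    return User.get(int(user_id))
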


class News(db.Model):
    __tablename__ = 'news'
    id = db.Column(db.Integer, primary_key=True)
    message = db.Column(db.Text(collation=SQL_COLLATION))
    posted = db.Column(db.DateTime)
    posted_by = db.Column(db.String(100))

    @property
    def message_markup(self):
        return Markup(markdown(self.message))

    @property
    def posted_formatted(self):
        return self.posted.strftime("%Y-%m-%d %H:%M")


class ExpressionSpecificity(db.Model):
    __tablename__ = 'expression_specificity'
    id = db.Column(db.Integer, primary_key=True)
    profile_id = db.Column(db.Integer,
                           db.ForeignKey('expression_profiles.id', ondelete='CASCADE'),
                           index=True)
    condition = db.Column(db.String(255), index=True)
    score = db.Column(db.Float, index=True)
    entropy = db.Column(db.Float, index=True)
    tau = db.Column(db.Float, index=True)
    method_id = db.Column(db.Integer,
                          db.ForeignKey('expression_specificity_method.id', ondelete='CASCADE'),
                          index=True)


class SequenceCoexpressionClusterAssociation(db.Model):
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    probe = db.Column(db.String(50), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    sequence = db.relationship('Sequence',
                               backref=db.backref('coexpression_cluster_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')
    coexpression_cluster = db.relationship('CoexpressionCluster',
                                           backref=db.backref('sequence_associations',
                                                              lazy='dynamic',
                                                              passive_deletes=True),
                                           lazy='joined')


class Species(db.Model):
    __tablename__ = 'species'
    id = db.Column(db.Integer, primary_key=True)
    code = db.Column(db.String(50, collation=SQL_COLLATION), unique=True)
    name = db.Column(db.String(200, collation=SQL_COLLATION))
    data_type = db.Column(db.Enum('genome', 'transcriptome', name='data_type'))
    color = db.Column(db.String(7), default="#C7C7C7")
    highlight = db.Column(db.String(7), default="#DEDEDE")
    sequence_count = db.Column(db.Integer)
    network_count = db.Column(db.Integer)
    profile_count = db.Column(db.Integer)
    description = db.Column(db.Text)

    sequences = db.relationship('Sequence', backref='species', lazy='dynamic',
                                cascade="all, delete-orphan", passive_deletes=True)
    networks = db.relationship('ExpressionNetworkMethod', backref='species', lazy='dynamic',
                               cascade="all, delete-orphan", passive_deletes=True)
    profiles = db.relationship('ExpressionProfile', backref='species', lazy='dynamic',
                               cascade="all, delete-orphan", passive_deletes=True)
    expression_specificities = db.relationship('ExpressionSpecificityMethod', backref='species',
                                               lazy='dynamic',
                                               cascade="all, delete-orphan",
                                               passive_deletes=True)
    condition_tissues = db.relationship('ConditionTissue', backref='species', lazy='dynamic',
                                        cascade="all, delete-orphan", passive_deletes=True)

    def __init__(self, code, name, data_type='genome', color="#C7C7C7",
                 highlight="#DEDEDE", description=None):
        self.code = code
        self.name = name
        self.data_type = data_type
        self.color = color
        self.highlight = highlight
        self.sequence_count = 0
        self.profile_count = 0
        self.network_count = 0
        self.description = description

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @property
    def has_interpro(self):
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_interpro import SequenceInterproAssociation

        domain = SequenceInterproAssociation.query.join(
            Sequence, Sequence.id == SequenceInterproAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return domain is not None

    @property
    def has_go(self):
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_go import SequenceGOAssociation

        go = SequenceGOAssociation.query.join(
            Sequence, Sequence.id == SequenceGOAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return go is not None

    @staticmethod
    def add(code, name, data_type='genome', color="#C7C7C7", highlight="#DEDEDE",
            description=None):
        new_species = Species(code, name, data_type=data_type, color=color,
                              highlight=highlight, description=description)

        species = Species.query.filter_by(code=code).first()

        # species is not in the DB yet, add it
        if species is None:
            try:
                db.session.add(new_species)
                db.session.commit()
            except Exception as e:
                # rollback belongs to the session, not the db object
                db.session.rollback()
                print(e)
            return new_species.id
        else:
            return species.id

    @staticmethod
    def update_counts():
        """
        To avoid long counts, the number of sequences, profiles and networks
        can be precalculated and stored in the database using this function.
        """
        species = Species.query.all()

        for s in species:
            s.sequence_count = s.sequences.count()
            s.profile_count = s.profiles.count()
            s.network_count = s.networks.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
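
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Registering a species and refreshing the cached counts; assumes an active
# application context (e.g. a Flask CLI command or `with app.app_context():`).
def example_register_species():
    species_id = Species.add('ath', 'Arabidopsis thaliana',
                             data_type='genome', description='Example entry')
    Species.update_counts()  # precompute sequence/profile/network counts
    return species_id
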


class CoexpressionCluster(db.Model):
    __tablename__ = 'coexpression_clusters'
    id = db.Column(db.Integer, primary_key=True)
    method_id = db.Column(
        db.Integer,
        db.ForeignKey('coexpression_clustering_methods.id', ondelete='CASCADE'))
    name = db.Column(db.String(50), index=True)

    # Other properties
    # sequences defined in Sequence
    # sequence_associations defined in SequenceCoexpressionClusterAssociation
    # go_enrichment defined in ClusterGOEnrichment
    # clade_enrichment defined in ClusterCladeEnrichment

    @staticmethod
    def get_cluster(cluster_id):
        """
        Returns the network for a whole cluster (reporting edges only between
        members of the cluster!)

        :param cluster_id: internal ID of the cluster
        :return: network for the selected cluster (dict with nodes and edges)
        """
        cluster = CoexpressionCluster.query.get(cluster_id)
        probes = [member.probe for member in cluster.sequence_associations.all()]

        network = cluster.method.network_method.probes.\
            options(joinedload('sequence').load_only('name')).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        nodes = []
        edges = []
        existing_edges = []

        for node in network:
            nodes.append({"id": node.probe,
                          "name": node.probe,
                          "gene_id": int(node.sequence_id) if node.sequence_id is not None else None,
                          "gene_name": node.sequence.name if node.sequence_id is not None else node.probe,
                          "depth": 0})

            links = json.loads(node.network)

            for link in links:
                # only add links that are in the cluster!
                if link["probe_name"] in probes \
                        and [node.probe, link["probe_name"]] not in existing_edges:
                    edges.append({"source": node.probe,
                                  "target": link["probe_name"],
                                  "profile_comparison": url_for(
                                      'expression_profile.expression_profile_compare_probes',
                                      probe_a=node.probe,
                                      probe_b=link["probe_name"],
                                      species_id=node.method.species.id),
                                  "depth": 0,
                                  "link_score": link["link_score"],
                                  "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                                  "hrr": link["hrr"] if "hrr" in link.keys() else None,
                                  "edge_type": cluster.method.network_method.edge_type})
                    existing_edges.append([node.probe, link["probe_name"]])
                    existing_edges.append([link["probe_name"], node.probe])

        return {"nodes": nodes, "edges": edges}

    def __calculate_enrichment(self):
        """
        Initial implementation to calculate GO enrichment for a single cluster
        """
        gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        sequences = self.sequences.options(load_only("id")).all()

        associations = SequenceGOAssociation.query\
            .filter(SequenceGOAssociation.sequence_id.in_([s.id for s in sequences]))\
            .filter(SequenceGOAssociation.predicted == 0)\
            .options(load_only("sequence_id", "go_id"))\
            .group_by(SequenceGOAssociation.sequence_id, SequenceGOAssociation.go_id)

        go_data = {}

        for a in associations:
            if a.go_id not in go_data.keys():
                go_data[a.go_id] = {}
                go_data[a.go_id]["total_count"] = json.loads(a.go.species_counts)[str(species_id)]
                go_data[a.go_id]["cluster_count"] = 1
            else:
                go_data[a.go_id]["cluster_count"] += 1

        p_values = []
        for go_id in go_data:
            p_values.append(hypergeo_sf(go_data[go_id]['cluster_count'],
                                        len(sequences),
                                        go_data[go_id]['total_count'],
                                        gene_count))

        corrected_p_values = fdr_correction(p_values)

        for i, go_id in enumerate(go_data):
            enrichment = ClusterGOEnrichment()
            enrichment.cluster_id = self.id
            enrichment.go_id = go_id

            enrichment.cluster_count = go_data[go_id]['cluster_count']
            enrichment.cluster_size = len(sequences)
            enrichment.go_count = go_data[go_id]['total_count']
            enrichment.go_size = gene_count

            enrichment.enrichment = log2((go_data[go_id]['cluster_count'] / len(sequences)) /
                                         (go_data[go_id]['total_count'] / gene_count))
            enrichment.p_value = p_values[i]
            enrichment.corrected_p_value = corrected_p_values[i]

            db.session.add(enrichment)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def calculate_enrichment(empty=True):
        """
        Static method to calculate the enrichment for all clusters in the
        database

        :param empty: empty table cluster_go_enrichment first
        """
        # If required, empty the table first
        if empty:
            try:
                db.session.query(ClusterGOEnrichment).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        clusters = CoexpressionCluster.query.all()

        for i, cluster in enumerate(clusters):
            # print(i, "\t cluster: ", cluster.method_id, cluster.name)
            cluster.__calculate_enrichment()

    def __calculate_clade_enrichment(self, background, gf_method_id):
        """
        Calculates the clade enrichment for a co-expression cluster (i.e. if
        genes which originated in a certain clade are overrepresented). A
        background is required (how many genes there are per clade in the
        organism) and the gene family method those clades are based on.
        Calculations will be immediately committed to the DB.

        :param background: dict with background
        :param gf_method_id: internal ID of gene family method
        """
        species_gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        cluster_clade_count = defaultdict(lambda: 0)
        cluster_gene_count = self.sequences.count()

        try:
            sequences = self.sequences.\
                join(SequenceFamilyAssociation,
                     Sequence.id == SequenceFamilyAssociation.sequence_id).\
                join(GeneFamily,
                     SequenceFamilyAssociation.gene_family_id == GeneFamily.id).\
                add_columns(Sequence.name,
                            Sequence.species_id,
                            SequenceFamilyAssociation.gene_family_id,
                            GeneFamily.method_id,
                            GeneFamily.clade_id).\
                filter(GeneFamily.method_id == gf_method_id).all()
        except Exception as e:
            print(e, file=sys.stderr)

        for s in sequences:
            cluster_clade_count[s.clade_id] += 1

        enrichment_scores = []

        for clade_id, count in cluster_clade_count.items():
            try:
                background_count = background[species_id][clade_id]
                p_value = hypergeo_sf(count, cluster_gene_count,
                                      background_count, species_gene_count)
                enrichment = log2((count / cluster_gene_count) /
                                  (background_count / species_gene_count))
                enrichment_scores.append({'clade_count': background_count,
                                          'clade_size': species_gene_count,
                                          'cluster_count': count,
                                          'cluster_size': cluster_gene_count,
                                          'p_value': p_value,
                                          'enrichment': enrichment,
                                          'clade_id': clade_id,
                                          'cluster_id': self.id})
            except Exception as e:
                print(e, file=sys.stderr)

        corrected_p_values = fdr_correction([es['p_value'] for es in enrichment_scores])

        commit_required = False
        for es, corrected_p_value in zip(enrichment_scores, corrected_p_values):
            if es['p_value'] < 0.05 and es['enrichment'] > 0:
                commit_required = True
                cluster_clade_enrichment = ClusterCladeEnrichment()
                cluster_clade_enrichment.p_value = es['p_value']
                cluster_clade_enrichment.corrected_p_value = corrected_p_value
                cluster_clade_enrichment.enrichment = es['enrichment']
                cluster_clade_enrichment.clade_id = es['clade_id']
                cluster_clade_enrichment.cluster_id = es['cluster_id']
                cluster_clade_enrichment.gene_family_method_id = gf_method_id
                cluster_clade_enrichment.clade_count = es['clade_count']
                cluster_clade_enrichment.clade_size = es['clade_size']
                cluster_clade_enrichment.cluster_count = es['cluster_count']
                cluster_clade_enrichment.cluster_size = es['cluster_size']
                db.session.add(cluster_clade_enrichment)

        if commit_required:
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

    @staticmethod
    def calculate_clade_enrichment(gene_family_method_id, empty=True):
        """
        Calculates clade enrichment for co-expression clusters

        :param gene_family_method_id: gene family method to use to determine clades
        :param empty: when true, removes clade enrichments for the current gf_method
        """
        if empty:
            try:
                print("Removing existing enrichment")
                db.session.query(ClusterCladeEnrichment).\
                    filter(ClusterCladeEnrichment.gene_family_method_id ==
                           gene_family_method_id).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        print("Calculating background...", end='')
        gf_method = GeneFamilyMethod.query.get(gene_family_method_id)
        counts = gf_method.get_clade_distribution()
        print(' Done!')

        # calculate enrichment
        print("Calculating enrichment")
        clusters = CoexpressionCluster.query.all()

        for i, cluster in enumerate(clusters):
            print(i, "\t cluster: ", cluster.method_id, cluster.name)
            cluster.__calculate_clade_enrichment(counts, gene_family_method_id)

        print(" Done!")

    @staticmethod
    def delete_enrichment():
        """
        Removes all GO enrichment data from the database

        :return:
        """
        try:
            db.session.query(ClusterGOEnrichment).delete()
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
        """
        This function will calculate ALL similarities between clusters in the
        database. Results will be added to the DB.

        :param gene_family_method_id: Internal ID of the gene family method to
        use to calculate the scores (default = 1)
        :param percentile_pass: percentile based cutoff (default = 0.95)
        """
        # sqlalchemy to fetch cluster associations
        fields = [SequenceCoexpressionClusterAssociation.__table__.c.sequence_id,
                  SequenceCoexpressionClusterAssociation.__table__.c.coexpression_cluster_id]
        # use isnot(None); a plain `is not None` on a column evaluates to a
        # Python bool instead of a SQL expression
        condition = SequenceCoexpressionClusterAssociation.__table__.c.sequence_id.isnot(None)
        cluster_associations = db.engine.execute(
            db.select(fields).where(condition)).fetchall()

        # sqlalchemy to fetch sequence family associations
        fields = [SequenceFamilyAssociation.__table__.c.sequence_id,
                  SequenceFamilyAssociation.__table__.c.gene_family_id,
                  GeneFamily.__table__.c.method_id]
        condition = GeneFamily.__table__.c.method_id == gene_family_method_id
        table = join(SequenceFamilyAssociation.__table__, GeneFamily.__table__,
                     SequenceFamilyAssociation.__table__.c.gene_family_id ==
                     GeneFamily.__table__.c.id)
        sequence_families = db.engine.execute(
            db.select(fields).select_from(table).where(condition)).fetchall()

        # convert sqlalchemy results into dictionaries
        sequence_to_family = {seq_id: fam_id
                              for seq_id, fam_id, method_id in sequence_families}

        cluster_to_sequences = {}
        cluster_to_families = {}

        for seq_id, cluster_id in cluster_associations:
            if cluster_id not in cluster_to_sequences.keys():
                cluster_to_sequences[cluster_id] = []
            cluster_to_sequences[cluster_id].append(seq_id)

        for cluster_id, sequences in cluster_to_sequences.items():
            families = list(set([sequence_to_family[s]
                                 for s in sequences
                                 if s in sequence_to_family.keys()]))
            if len(families) > 0:
                cluster_to_families[cluster_id] = families

        keys = list(cluster_to_families.keys())

        data = []

        for i in range(len(keys) - 1):
            for j in range(i + 1, len(keys)):
                current_keys = [keys[x] for x in [i, j]]
                current_families = [cluster_to_families[k] for k in current_keys]

                if len(current_families[0]) > 4 and len(current_families[1]) > 4:
                    # renamed from `j` to avoid shadowing the loop variable
                    jaccard_index = jaccard(current_families[0], current_families[1])
                    data.append([current_keys[0], current_keys[1], jaccard_index])

        ordered_j = sorted([a[2] for a in data])
        if len(ordered_j) > 0:
            percentile_cutoff = ordered_j[int(len(ordered_j) * percentile_pass)]

            database = [{'source_id': d[0],
                         'target_id': d[1],
                         'gene_family_method_id': gene_family_method_id,
                         'jaccard_index': d[2],
                         'p_value': 0,
                         'corrected_p_value': 0}
                        for d in data if d[2] >= percentile_cutoff]

            db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(), database)
        else:
            print("No similar clusters found!")

    @property
    def profiles(self):
        """
        Returns a list with all expression profiles of cluster members

        :return: list of all profiles
        """
        sequence_subquery = self.sequences.subquery()

        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            join(sequence_subquery,
                 ExpressionProfile.sequence_id == sequence_subquery.c.id).all()

        return profiles

    @property
    def interpro_stats(self):
        """
        Get InterPro statistics for the current cluster

        :return: InterPro statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return Interpro.sequence_stats(sequence_ids)

    @property
    def go_stats(self):
        """
        Get GO statistics for the current cluster

        :return: GO statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GO.sequence_stats(sequence_ids)

    @property
    def family_stats(self):
        """
        Get gene family statistics for the current cluster

        :return: gene family statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GeneFamily.sequence_stats(sequence_ids)
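
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Typical build-pipeline calls for the enrichment routines above, run inside
# an application context after clusters have been loaded; the method id `1`
# is a placeholder.
def example_cluster_enrichment():
    CoexpressionCluster.calculate_enrichment(empty=True)           # GO enrichment
    CoexpressionCluster.calculate_clade_enrichment(1, empty=True)  # clades, gf method 1
    CoexpressionCluster.calculate_similarities(gene_family_method_id=1)
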


class Tree(db.Model):
    __tablename__ = 'trees'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    data_newick = db.Column(db.Text)
    data_phyloxml = db.Column(db.Text)

    gf_id = db.Column(db.Integer,
                      db.ForeignKey('gene_families.id', ondelete='CASCADE'),
                      index=True)
    method_id = db.Column(db.Integer,
                          db.ForeignKey('tree_methods.id', ondelete='CASCADE'),
                          index=True)

    @property
    def ascii_art(self):
        """
        Returns an ascii representation of the tree. Useful for quick
        visualizations.

        :return: string with ascii representation of the tree
        """
        tree = newick.loads(self.data_newick)[0]

        return tree.ascii_art()

    @staticmethod
    def __yattag_node(node, tag, text, line, id_to_clade, seq_to_species, seq_to_id, root=1):
        with tag('clade'):
            if root == 1:
                line('branch_length', 0.1)
            else:
                line('branch_length', node.length)

            if node.is_leaf:
                line('name', node.name)
                if node.name in seq_to_id.keys():
                    line('id', seq_to_id[node.name])
                if node.name in seq_to_species.keys():
                    with tag('taxonomy'):
                        line('code', seq_to_species[node.name])
            else:
                clade_id, duplication, dup_score = node.name.split('_')
                clade_id = int(clade_id)
                duplication = True if duplication == 'D' else False
                dup_score = float(dup_score)

                if clade_id in id_to_clade.keys():
                    with tag('taxonomy'):
                        line('code', id_to_clade[clade_id])

                if duplication:
                    line('property', str(dup_score),
                         applies_to="clade", datatype="xksd:double",
                         ref="Duplication consistency score")
                    with tag('events'):
                        line('duplications', 1)
                else:
                    with tag('events'):
                        line('speciations', 1)

            for d in node.descendants:
                Tree.__yattag_node(d, tag, text, line, id_to_clade,
                                   seq_to_species, seq_to_id, root=0)

    @property
    def phyloxml(self):
        """
        Converts the annotated newick tree (data_phyloxml) into a PhyloXML
        document, adding clade names, species information and expression data
        from the database.

        :return: PhyloXML document (string)
        """
        # Load tree with additional information
        tree = newick.loads(self.data_phyloxml)[0]

        # Load additional information from the database
        clades = Clade.query.all()
        id_to_clade = {c.id: c.name for c in clades}
        seq_to_species = {}
        seq_to_id = {}

        species = []

        for s in self.sequences.all():
            seq_to_id[s.name] = s.id
            seq_to_species[s.name] = s.species.code
            if s.species not in species:
                species.append(s.species)

        csep = CrossSpeciesExpressionProfile()
        csep_data = csep.get_data(*seq_to_id.values())

        has_heatmap = False
        heatmap_order = []
        for cd in csep_data:
            if "profile" in cd.keys() and "order" in cd["profile"].keys():
                has_heatmap = True
                heatmap_order = cd["profile"]["order"]
                break

        # Start constructing PhyloXML
        doc, tag, text, line = Doc().ttl()
        with tag('phyloxml'):
            with tag('phylogeny', rooted="True"):
                # line('name', self.label)
                # line('description', "PlaNet 2.0 PhyloXML tree")
                Tree.__yattag_node(tree, tag, text, line, id_to_clade,
                                   seq_to_species, seq_to_id)

                with tag('graphs'):
                    if has_heatmap:
                        with tag('graph', type="heatmap"):
                            line('name', 'Heatmap')
                            with tag('legend', show=1):
                                for label in heatmap_order:
                                    with tag('field'):
                                        line('name', label)
                                with tag('gradient'):
                                    line('name', 'YlGnBu')
                                    line('classes', len(heatmap_order))
                            with tag('data'):
                                for cd in csep_data:
                                    if "profile" in cd.keys() and "data" in cd["profile"].keys():
                                        with tag('values', **{'for': str(cd["sequence_id"])}):
                                            for label in heatmap_order:
                                                if cd["profile"]["data"][label] is not None:
                                                    line('value', cd["profile"]["data"][label])
                                                else:
                                                    line('value', '')

                    with tag('graph', type="binary"):
                        line('name', 'Low Expression')
                        with tag('legend', show=1):
                            with tag('field'):
                                line('name', 'Low expression')
                                line('color', '0xf03b20')
                                line('shape', 'circle')
                        with tag('data'):
                            for cd in csep_data:
                                if "low_expressed" in cd.keys():
                                    with tag('values', **{'for': str(cd["sequence_id"])}):
                                        line('value', cd["low_expressed"])

                    with tag('graph', type="multibar"):
                        line('name', 'Expression Range')
                        with tag('legend', show=1):
                            with tag('field'):
                                line('name', 'Max. Expression (TPM)')
                                line('color', '0x664977')
                        with tag('data'):
                            for cd in csep_data:
                                if "max_expression" in cd.keys():
                                    with tag('values', **{'for': str(cd["sequence_id"])}):
                                        line('value', cd["max_expression"])

            with tag('taxonomies'):
                for s in species:
                    with tag('taxonomy', code=s.code):
                        line('color', s.color.replace("#", "0x"))
                        line('name', s.name)
                        line('url', url_for('species.species_view',
                                            species_id=s.id, _external=True))

                for c in clades:
                    with tag('taxonomy', code=c.name):
                        line('color', '0x000000')
                        line('name', c.name)
                        line('url', url_for('clade.clade_view',
                                            clade_id=c.id, _external=True))

        return indent(doc.getvalue())

    @property
    def count(self):
        tree = newick.loads(self.data_newick)[0]

        return len(tree.get_leaves())

    @property
    def sequences(self):
        tree = newick.loads(self.data_newick)[0]
        sequences = [l.name for l in tree.get_leaves()]

        return Sequence.query.filter(Sequence.name.in_(sequences))

    @property
    def tree_stripped(self):
        tree = newick.loads(self.data_newick)[0]
        tree.remove_lengths()

        return newick.dumps([tree])
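
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Quick inspection of a stored tree, e.g. from a Flask shell.
def example_print_tree(tree_id):
    tree = Tree.query.get(tree_id)
    print(tree.ascii_art)        # ascii rendering of the newick tree
    print(tree.count, "leaves")  # number of leaf nodes
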


class GO(db.Model):
    __tablename__ = 'go'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    name = db.Column(db.Text)
    type = db.Column(db.Enum('biological_process', 'molecular_function',
                             'cellular_component', name='go_type'))
    description = db.Column(db.Text)
    obsolete = db.Column(db.SmallInteger)
    is_a = db.Column(db.Text)
    extended_go = db.Column(db.Text)
    species_counts = db.Column(db.Text)

    sequences = db.relationship('Sequence', secondary=sequence_go, lazy='dynamic')

    # Other properties
    #
    # sequence_associations declared in 'SequenceGOAssociation'
    # enriched_clusters declared in 'ClusterGOEnrichment'

    def __init__(self, label, name, go_type, description, obsolete, is_a, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.obsolete = obsolete
        self.is_a = is_a
        self.extended_go = extended_go
        self.species_counts = ""

    def set_all(self, label, name, go_type, description, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.extended_go = extended_go
        self.species_counts = ""

    @property
    def short_type(self):
        if self.type == 'biological_process':
            return 'BP'
        elif self.type == 'molecular_function':
            return 'MF'
        elif self.type == 'cellular_component':
            return 'CC'
        else:
            return 'UNK'

    @property
    def readable_type(self):
        if self.type == 'biological_process':
            return 'Biological process'
        elif self.type == 'molecular_function':
            return 'Molecular function'
        elif self.type == 'cellular_component':
            return 'Cellular component'
        else:
            return 'Unknown type'

    @property
    def parent_count(self):
        """
        Returns the total number of terms 'above' this term in the GO DAG
        (based on the precomputed extended_go field)

        :return: number of parental terms
        """
        return len(self.extended_go.split(';')) if self.extended_go != '' else 0

    @property
    def interpro_stats(self):
        from conekt.models.interpro import Interpro

        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    def species_occurrence(self, species_id):
        """
        Count how many genes have the current GO term in a given species

        :param species_id: internal id of the selected species
        :return: count of sequences with this term associated
        """
        count = 0
        sequences = self.sequences.all()

        for s in sequences:
            if s.species_id == species_id:
                count += 1

        return count

    @staticmethod
    def sequence_stats(sequence_ids, exclude_predicted=True):
        """
        Takes a list of sequence IDs and returns GO stats for those sequences

        :param sequence_ids: list of sequence ids
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        query = SequenceGOAssociation.query.filter(
            SequenceGOAssociation.sequence_id.in_(sequence_ids))

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences, exclude_predicted=True):
        subquery = sequences.subquery()

        query = SequenceGOAssociation.query

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.join(subquery,
                          SequenceGOAssociation.sequence_id == subquery.c.id).all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        output = {}

        for d in associations:
            if d.go_id not in output.keys():
                output[d.go_id] = {
                    'go': d.go,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.go_id]['count'] += 1
                if d.sequence_id not in output[d.go_id]['sequences']:
                    output[d.go_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.go_id]['species']:
                    output[d.go_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @staticmethod
    def update_species_counts():
        """
        Adds a phylogenetic profile to each GO label; results are stored in
        the database (species_counts field).
        """
        # link species to sequences
        sequences = db.engine.execute(
            db.select([Sequence.__table__.c.id,
                       Sequence.__table__.c.species_id])).fetchall()

        sequence_to_species = {}
        for seq_id, species_id in sequences:
            if species_id is not None:
                sequence_to_species[seq_id] = int(species_id)

        # get GO terms for all genes
        associations = db.engine.execute(
            db.select([SequenceGOAssociation.__table__.c.sequence_id,
                       SequenceGOAssociation.__table__.c.go_id], distinct=True)
            .where(SequenceGOAssociation.__table__.c.predicted == 0)).fetchall()

        count = {}
        for seq_id, go_id in associations:
            species_id = sequence_to_species[seq_id]

            if go_id not in count.keys():
                count[go_id] = {}

            if species_id not in count[go_id]:
                count[go_id][species_id] = 1
            else:
                count[go_id][species_id] += 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))

    @staticmethod
    def add_from_obo(filename, empty=True, compressed=False):
        """
        Parses GeneOntology's OBO file and adds it to the database

        :param filename: Path to the OBO file to parse
        :param compressed: load data from .gz file if true (default: False)
        :param empty: Empty the database first when true (default: True)
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(GO).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        obo_parser = OBOParser()
        obo_parser.readfile(filename, compressed=compressed)

        obo_parser.extend_go()

        for i, term in enumerate(obo_parser.terms):
            go = GO(term.id, term.name, term.namespace, term.definition,
                    term.is_obsolete, ";".join(term.is_a), ";".join(term.extended_go))

            db.session.add(go)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing
                # function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_go_from_plaza(filename):
        """
        Adds GO annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        go_parser = GOParser()

        go_parser.read_plaza_go(filename)

        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for term in terms:
                    if term["id"] in go_hash.keys():
                        current_term = go_hash[term["id"]]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": term["evidence"],
                            "source": term["source"]}
                        associations.append(association)
                    else:
                        print(term, "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(associations) > 400:
                db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                associations = []

        # Add extended GOs
        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term["id"] not in current_terms:
                        current_terms.append(term["id"])

                for term in terms:
                    if term["id"] in go_hash.keys():
                        extended_terms = go_hash[term["id"]].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms \
                                    and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(),
                                          associations)
                        associations = []

        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def add_go_from_tab(filename, species_id, source="Source not provided"):
        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id).all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        gene_go = defaultdict(list)

        with open(filename, "r") as f:
            for line in f:
                gene, term, evidence = line.strip().split('\t')

                if gene in gene_hash.keys():
                    current_sequence = gene_hash[gene]
                    if term in go_hash.keys():
                        current_term = go_hash[term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": evidence,
                            "source": source}
                        associations.append(association)

                        if term not in gene_go[gene]:
                            gene_go[gene].append(term)
                    else:
                        print(term, "not found in the database.")
                else:
                    print("Gene", gene, "not found in the database.")

                if len(associations) > 400:
                    db.engine.execute(SequenceGOAssociation.__table__.insert(),
                                      associations)
                    associations = []

        # Add extended GOs
        for gene, terms in gene_go.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term not in current_terms:
                        current_terms.append(term)

                for term in terms:
                    if term in go_hash.keys():
                        extended_terms = go_hash[term].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms \
                                    and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(),
                                          associations)
                        associations = []

        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def predict_from_network(expression_network_method_id, threshold=5,
                             source="PlaNet Prediction"):
        """
        Function to transfer GO terms from neighbors in the network. If n or
        more (based on threshold) neighbors have a GO label (excluding other
        predicted labels) the term is transferred.

        :param expression_network_method_id: Expression network to use as input
        :param threshold: number of neighboring genes that should have the
        label to allow the transfer
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(
            expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        # Get all genes that belong to the network
        probes = expression_network_method.probes.all()

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i,
                   expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # If the number of genes in the neighborhood is smaller than the
            # threshold, skip (no prediction possible). If there is no
            # sequence associated with the probe, skip as well.
            if len(sequence_ids) < threshold or probe.sequence_id is None:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(
                SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.\
                filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current
            # gene already has
            unique_associations = set([(a.sequence_id, a.go_id)
                                       for a in associations
                                       if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # Determine new terms (those that occurred at least as many times
            # as the desired threshold)
            new_terms = [{'go_id': k, 'score': v}
                         for k, v in go_counts.items() if v >= threshold]

            # Store new terms in a list that can be added to the database
            for nt in new_terms:
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': nt['go_id'],
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'score': nt['score'],
                                                   'threshold': threshold,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighbor counting'})
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(),
                              new_associations[i: i + 400])

    @staticmethod
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05,
                                        source="PlaNet Prediction"):
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(
            expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()

        # Get all GO terms and the background.
        # Important: counts are obtained from the precomputed values in the
        # species_counts field!
        go_data = db.engine.execute(
            db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(lambda: 0)

        for go_id, counts_json in go_data:
            # use != for string comparison; `is not` tests identity
            if counts_json != "":
                counts = json.loads(counts_json)
                if str(expression_network_method.species_id) in counts.keys():
                    go_background[go_id] = counts[str(expression_network_method.species_id)]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i,
                   expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(
                SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.\
                filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current
            # gene already has
            unique_associations = set([(a.sequence_id, a.go_id)
                                       for a in associations
                                       if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids),
                                      go_background[go_id], len(probes))
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            corrected_p_values = fdr_correction([a[1] for a in enriched_go])

            # push new predictions into a dict that will be added to the DB
            for corrected_p_value, (go_id, p_value) in zip(corrected_p_values, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': corrected_p_value,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'})
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(),
                              new_associations[i: i + 400])
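
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Loading the ontology and transferring labels over a co-expression network;
# 'go.obo.gz' is a placeholder path.
def example_build_go(network_method_id):
    GO.add_from_obo('go.obo.gz', empty=True, compressed=True)
    GO.update_species_counts()  # needed by the enrichment-based prediction
    GO.predict_from_network(network_method_id, threshold=5)
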


class ExpressionProfile(db.Model):
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'),
                            index=True)
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append([o,
                               str(mean(values)),
                               str(min(values)),
                               str(max(values))])
            except Exception as e:
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))

        return table

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any condition in the plot is
        higher than the desired cutoff

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance, otherwise False
        """
        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one
        using a conversion table stored in the database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed conditions if True, otherwise
        use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [k for k in profile_data['data']
                                if k in condition_to_tissue['conversion']
                                and condition_to_tissue['conversion'][k] == t]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {'order': condition_to_tissue['order'],
                'colors': condition_to_tissue['colors'],
                'data': output}

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several conditions into
        one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than
        individual values. This matches the SPM calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        output = ExpressionProfile.convert_profile(condition_to_tissue, profile_data,
                                                   use_means=use_means)

        return output

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of
        probes. It returns a dict with 'order' (the order of the experiments)
        and 'heatmap_data' (the actual data). Data is zlog-transformed.

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization
        against the highest expressed condition)
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []
        output = []

        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())
            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {}
            for o in order:
                values[o] = mean(experiments[o])

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    if row_mean == 0 or values[o] == 0:
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError as _:
                            print("Unable to calculate log()", values[o], row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({"name": name,
                           "values": values,
                           "sequence_id": profile.sequence_id,
                           "shortest_alias": profile.sequence.shortest_alias})

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found), "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles); a
        limit can be provided to avoid overly long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file, annotation_file, species_id,
                                order_color_file=None):
        """
        Function to convert a (normalized) expression matrix (LSTrAP output)
        into profiles

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order
        and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                        order.append(o)
                        colors.append(c)
                    except Exception as _:
                        pass

        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name (uppercase), value = internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine the order from the annotation if none was defined
            # (note: `order` starts as an empty list, so test for emptiness
            # rather than None)
            if len(order) == 0:
                for c in colnames:
                    if c in annotation.keys():
                        if annotation[c] not in order:
                            order.append(annotation[c])

                order.sort()

            # read each line and build the profile
            new_probes = []
            for line in fin:
                transcript, *values = line.rstrip().split()
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation.keys():
                        condition = annotation[c]
                        profile[condition].append(float(v))

                new_probe = {"species_id": species_id,
                             "probe": transcript,
                             "sequence_id": sequence_dict[transcript.upper()]
                             if transcript.upper() in sequence_dict.keys() else None,
                             "profile": json.dumps({"order": order,
                                                    "colors": colors,
                                                    "data": profile})
                             }

                new_probes.append(new_probe)

                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
                    new_probes = []

            db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
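
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Importing an LSTrAP expression matrix; the file paths are placeholders.
def example_import_profiles(species_id):
    ExpressionProfile.add_profile_from_lstrap('matrix.tpm.txt',
                                              'annotation.txt',
                                              species_id,
                                              order_color_file='order_colors.txt')
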


class Interpro(db.Model):
    __tablename__ = 'interpro'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    description = db.Column(db.Text)
    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='SET NULL'),
                         index=True)

    sequences = db.relationship('Sequence', secondary=sequence_interpro, lazy='dynamic')

    # Other properties
    # sequence_associations = defined in SequenceInterproRelationship

    def __init__(self, label, description):
        self.label = label
        self.description = description

    @property
    def species_codes(self):
        """
        Finds all species the domain has genes from

        :return: a list of all species (codes)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = []

        for s in sequences:
            if s.species.code not in output:
                output.append(s.species.code)

        return output

    @property
    def species_counts(self):
        """
        Generates a phylogenetic profile of the domain

        :return: a dict with counts per species (codes are keys)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = {}

        for s in sequences:
            if s.species.code not in output:
                output[s.species.code] = 1
            else:
                output[s.species.code] += 1

        return output

    @staticmethod
    def sequence_stats(sequence_ids):
        """
        Takes a list of sequence IDs and returns InterPro stats for those sequences

        :param sequence_ids: list of sequence ids
        :return: dict with stats for each InterPro domain linked with any of
        the input sequences
        """
        data = SequenceInterproAssociation.query.filter(
            SequenceInterproAssociation.sequence_id.in_(sequence_ids)).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences):
        subquery = sequences.subquery()
        data = SequenceInterproAssociation.query.join(
            subquery, SequenceInterproAssociation.sequence_id == subquery.c.id).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        output = {}

        for d in associations:
            if d.interpro_id not in output.keys():
                output[d.interpro_id] = {
                    'domain': d.domain,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.interpro_id]['count'] += 1
                if d.sequence_id not in output[d.interpro_id]['sequences']:
                    output[d.interpro_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.interpro_id]['species']:
                    output[d.interpro_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @property
    def interpro_stats(self):
        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        from conekt.models.go import GO

        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    @staticmethod
    def add_from_xml(filename, empty=True):
        """
        Populates the interpro table with domains and descriptions from the
        official website's XML file

        :param filename: path to the XML file
        :param empty: If True the interpro table will be cleared before
        uploading the new domains, default = True
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(Interpro).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        interpro_parser = InterproParser()

        interpro_parser.readfile(filename)

        for i, domain in enumerate(interpro_parser.domains):
            interpro = Interpro(domain.label, domain.description)

            db.session.add(interpro)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing
                # function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_interpro_from_plaza(filename):
        """
        Adds InterPro domain annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_plaza_interpro(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.all()
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(),
                                  new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)

    @staticmethod
    def add_interpro_from_interproscan(filename, species_id):
        """
        Adds InterPro domains from InterProScan output to the database

        :param filename: Path to the annotation file
        :param species_id: internal id of the species
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_interproscan(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id)
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(),
                                  new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
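
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Populating domain definitions and per-species annotation; the file paths
# are placeholders.
def example_import_interpro(species_id):
    Interpro.add_from_xml('interpro.xml', empty=True)
    Interpro.add_interpro_from_interproscan('interproscan_output.tsv', species_id)
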


class Clade(db.Model):
    __tablename__ = 'clades'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    species = db.Column(db.Text(collation=SQL_COLLATION))
    species_count = db.Column(db.Integer)
    newick_tree = db.Column(db.Text)

    families = db.relationship('GeneFamily', backref='clade', lazy='dynamic')
    interpro = db.relationship('Interpro', backref='clade', lazy='dynamic')

    def __init__(self, name, species, tree):
        self.name = name
        self.species = json.dumps(species)
        self.species_count = len(species)
        self.newick_tree = tree

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @staticmethod
    def add_clade(name, species, tree):
        """
        Add a clade to the database

        :param name: name of the clade
        :param species: list with codes (!) of the species in the clade
        :param tree: newick tree for this clade. Will be stored in the
        database and used for visualizations
        """
        new_clade = Clade(name, species, tree)
        db.session.add(new_clade)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_clades_from_json(data):
        """
        Adds clades from a dict with clade details

        :param data: dict with clade details
        """
        # renamed the loop variable to avoid shadowing `data`
        for c, details in data.items():
            Clade.add_clade(c, details['species'], details['tree'])

    @staticmethod
    def update_clades():
        """
        Loop over all families and determine which clade they belong to.
        Results are stored in the database.
        """
        clades = Clade.query.all()
        families = GeneFamily.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for f in families:
            family_species = f.species_codes

            # skip families without members
            if len(family_species) == 0:
                f.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(family_species, clade_to_species)
            if selected_clade is None:
                f.clade_id = None
            else:
                f.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def update_clades_interpro():
        """
        Loop over all InterPro domains and determine which clade they belong
        to. Results are stored in the database.
        """
        clades = Clade.query.all()
        interpro = Interpro.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for i in interpro:
            interpro_species = i.species_codes

            # skip domains without members
            if len(interpro_species) == 0:
                i.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(interpro_species, clade_to_species)
            if selected_clade is None:
                i.clade_id = None
            else:
                i.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @property
    def newick_tree_species(self):
        """
        Returns a Newick tree with the species present in the current clade.

        :return: Newick tree (string) with species for the current clade
        """
        species = {s.code: s.name for s in Species.query.all()}

        tree = newick.loads(self.newick_tree)[0]

        for code, name in species.items():
            node = tree.get_node(code)
            if node is not None:
                node.name = name

        return newick.dumps([tree])
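
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Defining clades from a dict and assigning them to families and domains;
# the species codes and newick strings are made-up placeholders.
def example_setup_clades():
    clade_data = {
        'Arabidopsis': {'species': ['ath'], 'tree': 'ath;'},
        'Angiosperms': {'species': ['ath', 'osa'], 'tree': '(ath:0.1,osa:0.1);'},
    }
    Clade.add_clades_from_json(clade_data)
    Clade.update_clades()           # assign clades to gene families
    Clade.update_clades_interpro()  # assign clades to InterPro domains
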
class ExpressionNetwork(db.Model):
    __tablename__ = 'expression_networks'
    id = db.Column(db.Integer, primary_key=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    network = db.Column(db.Text)
    method_id = db.Column(db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'),
                          index=True)

    def __init__(self, probe, sequence_id, network, method_id):
        self.probe = probe
        self.sequence_id = sequence_id
        self.network = network
        self.method_id = method_id

    @property
    def neighbors_count(self):
        """
        Returns the number of neighbors the current gene has

        :return: int, number of neighbors
        """
        data = json.loads(self.network)

        return len(data)

    @property
    def neighbors_table(self):
        """
        Returns a tab-delimited representation of the current gene's neighbors
        """
        data = json.loads(self.network)
        output = [["Sequence", "Description", "Alias", "PCC", "HRR"]]

        # Pull in descriptions and aliases
        sequence_ids = [d["gene_id"] for d in data if "gene_id" in d.keys() and d["gene_id"] is not None]
        sequences = {s.id: s for s in Sequence.query.filter(Sequence.id.in_(sequence_ids))}

        for d in data:
            try:
                description, alias = "", ""
                if d["gene_id"] in sequences.keys():
                    description = sequences[d["gene_id"]].description
                    alias = sequences[d["gene_id"]].aliases
                    description = description if description is not None else ""
                    alias = alias if alias is not None else ""

                output.append([d["gene_name"], description, alias, str(d["link_pcc"]), str(d["hrr"])])
            except Exception as e:
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @staticmethod
    def get_neighborhood(probe, depth=0):
        """
        Get the co-expression neighborhood for a specific probe

        :param probe: internal ID of the probe
        :param depth: number of steps away from the query the network should be expanded
        :return: dict with nodes and edges
        """
        node = ExpressionNetwork.query.get(probe)
        links = json.loads(node.network)

        method_id = node.method_id
        edge_type = node.method.edge_type

        # add the initial node
        nodes = [{"id": node.probe,
                  "name": node.probe,
                  "probe_id": node.id,
                  "gene_id": int(node.sequence_id) if node.sequence_id is not None else None,
                  "gene_name": node.sequence.name if node.sequence_id is not None else node.probe,
                  "node_type": "query",
                  "depth": 0}]

        edges = []

        # lists necessary for doing deeper searches
        additional_nodes = []
        existing_edges = []
        existing_nodes = [node.probe]

        # add direct neighbors of the gene of interest
        for link in links:
            nodes.append(ExpressionNetwork.__process_link(link, depth=0))
            edges.append({"source": node.probe,
                          "target": link["probe_name"],
                          "profile_comparison":
                              url_for('expression_profile.expression_profile_compare_probes',
                                      probe_a=node.probe,
                                      probe_b=link["probe_name"],
                                      species_id=node.method.species.id),
                          "depth": 0,
                          "link_score": link["link_score"],
                          "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                          "hrr": link["hrr"] if "hrr" in link.keys() else None,
                          "edge_type": edge_type})
            additional_nodes.append(link["probe_name"])
            existing_edges.append([node.probe, link["probe_name"]])
            existing_edges.append([link["probe_name"], node.probe])
            existing_nodes.append(link["probe_name"])

        # iterate n times to add deeper links
        if len(additional_nodes) > 0:
            for i in range(1, depth + 1):
                new_nodes = ExpressionNetwork.\
                    query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                      ExpressionNetwork.method_id == method_id))
                next_nodes = []

                for new_node in new_nodes:
                    new_links = json.loads(new_node.network)

                    for link in new_links:
                        if link["probe_name"] not in existing_nodes:
                            nodes.append(ExpressionNetwork.__process_link(link, depth=depth))
                            existing_nodes.append(link["probe_name"])
                            next_nodes.append(link["probe_name"])

                        if [new_node.probe, link["probe_name"]] not in existing_edges:
                            edges.append({"source": new_node.probe,
                                          "target": link["probe_name"],
                                          "profile_comparison":
                                              url_for('expression_profile.expression_profile_compare_probes',
                                                      probe_a=new_node.probe,
                                                      probe_b=link["probe_name"],
                                                      species_id=node.method.species.id),
                                          "depth": i,
                                          "link_score": link["link_score"],
                                          "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                                          "hrr": link["hrr"] if "hrr" in link.keys() else None,
                                          "edge_type": edge_type})
                            existing_edges.append([new_node.probe, link["probe_name"]])
                            existing_edges.append([link["probe_name"], new_node.probe])

                additional_nodes = next_nodes

        # Add links between the last set of nodes added
        new_nodes = []
        if len(additional_nodes) > 0:
            new_nodes = ExpressionNetwork.query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                                            ExpressionNetwork.method_id == method_id))

        for new_node in new_nodes:
            new_links = json.loads(new_node.network)
            for link in new_links:
                if link["probe_name"] in existing_nodes:
                    if [new_node.probe, link["probe_name"]] not in existing_edges:
                        edges.append({"source": new_node.probe,
                                      "target": link["probe_name"],
                                      "profile_comparison":
                                          url_for('expression_profile.expression_profile_compare_probes',
                                                  probe_a=new_node.probe,
                                                  probe_b=link["probe_name"],
                                                  species_id=node.method.species.id),
                                      "depth": depth + 1,
                                      "link_score": link["link_score"],
                                      "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                                      "hrr": link["hrr"] if "hrr" in link.keys() else None,
                                      "edge_type": edge_type})
                        existing_edges.append([new_node.probe, link["probe_name"]])
                        existing_edges.append([link["probe_name"], new_node.probe])

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_custom_network(method_id, probes):
        """
        Return a network dict for a given set of probes/sequences. Only the selected nodes and the
        connections between them (if any) are returned.

        :param method_id: network method to extract information from
        :param probes: list of probe/sequence names
        :return: network dict
        """
        nodes = []
        edges = []

        probes = ExpressionNetwork.query.filter(ExpressionNetwork.method_id == method_id).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        # add the selected nodes
        valid_nodes = []
        for p in probes:
            node = {"id": p.probe,
                    "name": p.probe,
                    "probe_id": p.id,
                    "gene_id": int(p.sequence_id) if p.sequence_id is not None else None,
                    "gene_name": p.sequence.name if p.sequence_id is not None else p.probe,
                    "node_type": "query",
                    "depth": 0}

            valid_nodes.append(p.probe)
            nodes.append(node)

        # add edges between the selected nodes, avoiding duplicates
        existing_edges = []

        for p in probes:
            source = p.probe
            neighborhood = json.loads(p.network)
            for n in neighborhood:
                if n["probe_name"] in valid_nodes:
                    if [source, n["probe_name"]] not in existing_edges:
                        edges.append({"source": source,
                                      "target": n["probe_name"],
                                      "profile_comparison":
                                          url_for('expression_profile.expression_profile_compare_probes',
                                                  probe_a=source,
                                                  probe_b=n["probe_name"],
                                                  species_id=p.method.species.id),
                                      "depth": 0,
                                      "link_score": n["link_score"],
                                      "link_pcc": n["link_pcc"] if "link_pcc" in n.keys() else None,
                                      "hrr": n["hrr"] if "hrr" in n.keys() else None,
                                      "edge_type": p.method.edge_type})
                        existing_edges.append([source, n["probe_name"]])
                        existing_edges.append([n["probe_name"], source])

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def __process_link(linked_probe, depth):
        """
        Internal function that processes a linked probe (from the ExpressionNetwork.network field) into
        a data entry compatible with cytoscape.js

        :param linked_probe: dict with information from the ExpressionNetwork.network field
        :return: dict formatted for use as a node with cytoscape.js
        """
        if linked_probe["gene_id"] is not None:
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": linked_probe["gene_id"],
                    "gene_name": linked_probe["gene_name"],
                    "node_type": "linked",
                    "depth": depth}
        else:
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": None,
                    "gene_name": linked_probe["probe_name"],
                    "node_type": "linked",
                    "depth": depth}

    @staticmethod
    def read_expression_network_lstrap(network_file, species_id, description, score_type="rank",
                                       pcc_cutoff=0.7, limit=30, enable_second_level=False):
        """
        Reads a network from disk, generated using LSTrAP, determines HRR scores for each pair and stores
        the result in the database.

        :param network_file: path to input file
        :param species_id: species the data is from
        :param description: description to add to the db for this network
        :param score_type: which scores are used, default = "rank"
        :param pcc_cutoff: PCC threshold, pairs with a score below this will be ignored
        :param limit: HRR score threshold, pairs with a score above this will be ignored
        :param enable_second_level: include the second-level neighborhood in the database (only to be
        used for sparse networks)
        :return: internal ID of the new network
        """
        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name in uppercase, value = internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        # Add network method first
        network_method = ExpressionNetworkMethod(species_id, description, score_type)
        network_method.hrr_cutoff = limit
        network_method.pcc_cutoff = pcc_cutoff
        network_method.enable_second_level = enable_second_level

        db.session.add(network_method)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

        network = {}
        scores = defaultdict(lambda: defaultdict(lambda: None))  # Score for non-existing pairs will be None

        with open(network_file) as fin:
            for linenr, line in enumerate(fin):
                try:
                    query, hits = line.strip().split(' ')
                    query = query.replace(':', '')
                except ValueError:
                    print("Error parsing line %d: \"%s\"" % (linenr, line))
                    # skip this line and continue
                    continue

                network[query] = {
                    "probe": query,
                    "sequence_id": sequence_dict[query.upper()] if query.upper() in sequence_dict.keys() else None,
                    "linked_probes": [],
                    "total_count": 0,
                    "method_id": network_method.id
                }

                for i, h in enumerate(hits.split('\t')):
                    try:
                        name, value = h.split('(')
                        value = float(value.replace(')', ''))
                        if value > pcc_cutoff:
                            network[query]["total_count"] += 1
                            if i < limit:
                                link = {"probe_name": name,
                                        "gene_name": name,
                                        "gene_id": sequence_dict[name.upper()] if name.upper() in sequence_dict.keys() else None,
                                        "link_score": i,
                                        "link_pcc": value}
                                network[query]["linked_probes"].append(link)
                                scores[query][name] = i
                    except ValueError:
                        print("Error parsing hit %d on line %d, skipping ... (%s)" % (i, linenr, str(h)),
                              file=sys.stderr)

        # determine the Highest Reciprocal Rank (HRR) for each pair
        hr_ranks = defaultdict(lambda: defaultdict(int))

        for query, targets in scores.items():
            for target, score in targets.items():
                if None in [score, scores[target][query]]:
                    hr_ranks[query][target] = None
                else:
                    # as scores start from 0 and ranks from 1, increase the HRR by one
                    hr_ranks[query][target] = max(score, scores[target][query]) + 1

        # Dump dicts into network strings, which will be loaded into the database
        for query in network.keys():
            for i, l in enumerate(network[query]["linked_probes"]):
                network[query]["linked_probes"][i]["hrr"] = hr_ranks[query][l["probe_name"]]

            # Dump links WITH HRR into a JSON string
            network[query]["network"] = json.dumps([n for n in network[query]["linked_probes"]
                                                    if n['hrr'] is not None])

        # add nodes in sets of 400 to avoid sending too much in a single query
        new_nodes = []
        for _, n in network.items():
            new_nodes.append(n)
            if len(new_nodes) > 400:
                db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)
                new_nodes = []

        # add the last set of nodes (guard against an empty insert)
        if len(new_nodes) > 0:
            db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)

        return network_method.id
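# A usage sketch for the loader and neighborhood query above. The file path, species id
# and parameters are hypothetical placeholders; get_neighborhood calls url_for, so this
# has to run inside a request/application context.
def _example_network_workflow():
    method_id = ExpressionNetwork.read_expression_network_lstrap(
        './data/athaliana.lstrap.txt',
        species_id=1,
        description='LSTrAP co-expression network (PCC > 0.7, HRR <= 30)',
        pcc_cutoff=0.7,
        limit=30)

    # pick any node of the freshly loaded network and expand one step around it
    node = ExpressionNetwork.query.filter_by(method_id=method_id).first()
    if node is not None:
        graph = ExpressionNetwork.get_neighborhood(node.id, depth=1)
        print(len(graph["nodes"]), "nodes and", len(graph["edges"]), "edges")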
class Sequence(db.Model):
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding', 'TE', 'RNA', name='sequence_type'), default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence', lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)

    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro', secondary=sequence_interpro, lazy='dynamic')
    families = db.relationship('GeneFamily', secondary=sequence_family, lazy='dynamic')

    coexpression_clusters = db.relationship('CoexpressionCluster',
                                            secondary=sequence_coexpression_cluster,
                                            backref=db.backref('sequences', lazy='dynamic'),
                                            lazy='dynamic')

    ecc_query_associations = db.relationship('SequenceSequenceECCAssociation',
                                             primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
                                             backref=db.backref('query_sequence', lazy='joined'),
                                             lazy='dynamic')

    ecc_target_associations = db.relationship('SequenceSequenceECCAssociation',
                                              primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
                                              backref=db.backref('target_sequence', lazy='joined'),
                                              lazy='dynamic')

    clade_associations_one = db.relationship('SequenceSequenceCladeAssociation',
                                             primaryjoin="SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
                                             backref=db.backref('sequence_one', lazy='joined'),
                                             lazy='dynamic')

    clade_associations_two = db.relationship('SequenceSequenceCladeAssociation',
                                             primaryjoin="SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
                                             backref=db.backref('sequence_two', lazy='joined'),
                                             lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self, species_id, name, coding_sequence, type='protein_coding',
                 is_chloroplast=False, is_mitochondrial=False, description=None):
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Translates the coding sequence into the amino acid sequence. Translation starts at the first
        start codon and stops after adding a stop codon (indicated by '*').

        :return: the amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the xrefs table

        :return: comma-separated string with aliases or None
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(t) if len(t) > 0 else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with the shortest alias or None (in case no aliases exist)
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return min(t, key=len) if len(t) > 0 else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with platform 'display') if available, otherwise the
        sequence's own name

        :return: display name
        """
        t = [x.name for x in self.xrefs if x.platform == 'display']

        return t[0] if len(t) > 0 else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks for the shortest alias, otherwise returns the
        name. To be used in e.g. graphs.

        :return: string with the best name to show in graphs, ...
        """
        # compare by value, not identity
        if self.display_name != self.name:
            return self.display_name
        elif self.shortest_alias is not None:
            return self.shortest_alias
        else:
            return self.name

    @property
    def readable_type(self):
        """
        Converts the type field to a readable string

        :return: string with a readable version of the sequence type
        """
        conversion = {'protein_coding': 'protein coding',
                      'TE': 'transposable element',
                      'RNA': 'RNA'}

        if self.type in conversion.keys():
            return conversion[self.type]
        else:
            return 'other'

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (the key), and add them to the db
        for name, sequence in sorted(fasta_data.sequences.items(), key=operator.itemgetter(0)):
            new_sequence = {"species_id": species_id,
                            "name": name,
                            "description": None,
                            "coding_sequence": sequence,
                            "type": "protein_coding",
                            "is_mitochondrial": False,
                            "is_chloroplast": False}

            new_sequences.append(new_sequence)

            # add 400 sequences at a time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences (guard against an empty insert)
        if len(new_sequences) > 0:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences.keys())

    @staticmethod
    def add_descriptions(filename, species_id):
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {}
        for s in sequences:
            seq_dict[s.name] = s

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line), file=sys.stderr)
                    # skip unparseable lines; otherwise 'name' would retain the previous
                    # line's value and the wrong sequence would be updated
                    continue

                if name in seq_dict.keys():
                    seq_dict[name].description = description

                # commit in batches to keep transactions small
                if i % 400 == 0:
                    db.session.commit()

        db.session.commit()

    @staticmethod
    def export_cds(filename):
        # undefer coding_sequence to fetch all sequences in a single query
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        # undefer coding_sequence to fetch all sequences in a single query
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
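# A sketch of a typical import/export round trip with the helpers above. The paths and
# species id are placeholders; an active application context is assumed.
def _example_sequence_roundtrip():
    # FASTA headers become sequence names; descriptions come from a two-column,
    # tab-delimited file (name<TAB>description)
    count = Sequence.add_from_fasta('./data/athaliana.cds.fasta', species_id=1)
    print("added", count, "sequences")

    Sequence.add_descriptions('./data/athaliana.descriptions.tsv', species_id=1)

    # write translated sequences to a FASTA file
    Sequence.export_protein('./export/athaliana.proteins.fasta')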