예제 #1
0
class User(db.Model):
    """
    Application user account: login credentials, role flags and
    registration metadata. Implements the property/method interface
    expected by flask-login (is_authenticated, is_active, is_anonymous,
    get_id).
    """
    __tablename__ = 'users'
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(50), unique=True, index=True)
    first_name = db.Column(db.String(50))
    last_name = db.Column(db.String(50))
    password_hash = db.Column(db.Text)
    email = db.Column(db.Text)
    reset_key = db.Column(db.Text)
    is_admin = db.Column(db.SmallInteger)
    is_banned = db.Column(db.SmallInteger)
    wants_newsletter = db.Column(db.SmallInteger)
    registered = db.Column(db.DateTime)

    def __init__(self,
                 username,
                 password,
                 email,
                 reset_key='',
                 is_admin=False,
                 is_banned=False,
                 registered=None):
        """
        Creates a new user; the plaintext password is hashed immediately
        and only the hash is stored.

        :param username: unique login name
        :param password: plaintext password (stored hashed)
        :param email: contact e-mail address
        :param reset_key: password-reset token (empty when none pending)
        :param is_admin: administrator flag
        :param is_banned: banned flag
        :param registered: registration timestamp; defaults to the current
            time truncated to whole seconds, evaluated at call time
        """
        self.username = username
        self.password_hash = generate_password_hash(password)
        self.email = email
        self.reset_key = reset_key
        self.is_admin = is_admin
        self.is_banned = is_banned
        # BUG FIX: the original used datetime.now() as a default argument,
        # which is evaluated ONCE at import time — every user created in the
        # same process got the identical timestamp. Use a None sentinel and
        # compute the timestamp per call instead.
        if registered is None:
            registered = datetime.now().replace(microsecond=0)
        self.registered = registered
        self.wants_newsletter = False

    def __repr__(self):
        return '<User %d>' % self.id

    def check_password(self, password):
        """Return True if *password* matches the stored hash."""
        return check_password_hash(self.password_hash, password)

    @property
    def is_administrator(self):
        return self.is_admin

    @property
    def is_authenticated(self):
        # persisted users are always considered authenticated
        return True

    @property
    def is_active(self):
        return True

    @property
    def is_anonymous(self):
        return False

    def get_id(self):
        # flask-login expects the ID as a string
        return str(self.id)

    @staticmethod
    def get(user_id):
        """Fetch a user by primary key (None when not found)."""
        return User.query.get(user_id)
예제 #2
0
class News(db.Model):
    """A news item (markdown text) shown to site visitors."""
    __tablename__ = 'news'
    id = db.Column(db.Integer, primary_key=True)
    message = db.Column(db.Text(collation=SQL_COLLATION))
    posted = db.Column(db.DateTime)
    posted_by = db.Column(db.String(100))

    @property
    def message_markup(self):
        """The markdown message rendered to HTML, wrapped as Markup."""
        rendered_html = markdown(self.message)
        return Markup(rendered_html)

    @property
    def posted_formatted(self):
        """Posting timestamp formatted as 'YYYY-MM-DD HH:MM'."""
        return "{:%Y-%m-%d %H:%M}".format(self.posted)
예제 #3
0
class ExpressionSpecificity(db.Model):
    """
    Specificity statistics (score, entropy and tau) for one expression
    profile under a single condition, as computed by a given specificity
    method. Rows are removed automatically when the referenced profile or
    method is deleted (ondelete CASCADE).
    """
    __tablename__ = 'expression_specificity'

    id = db.Column(db.Integer, primary_key=True)
    # profile the specificity values were computed for
    profile_id = db.Column(db.Integer,
                           db.ForeignKey('expression_profiles.id',
                                         ondelete='CASCADE'),
                           index=True)
    # condition (e.g. tissue/treatment label) the values apply to
    condition = db.Column(db.String(255), index=True)
    # specificity statistics; all indexed for range queries
    score = db.Column(db.Float, index=True)
    entropy = db.Column(db.Float, index=True)
    tau = db.Column(db.Float, index=True)
    # method used to compute the values
    method_id = db.Column(db.Integer,
                          db.ForeignKey('expression_specificity_method.id',
                                        ondelete='CASCADE'),
                          index=True)
예제 #4
0
class SequenceCoexpressionClusterAssociation(db.Model):
    """
    Association table linking sequences (via their probe) to co-expression
    clusters. Provides joined-loaded relationships to both ends plus dynamic
    backrefs on Sequence and CoexpressionCluster.
    """
    __tablename__ = 'sequence_coexpression_cluster'
    # extend_existing allows redefinition if the table object already exists
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    # probe identifier on the expression platform
    probe = db.Column(db.String(50), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(
        db.Integer,
        db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    # eagerly loaded sequence; adds Sequence.coexpression_cluster_associations
    sequence = db.relationship('Sequence',
                               backref=db.backref(
                                   'coexpression_cluster_associations',
                                   lazy='dynamic',
                                   passive_deletes=True),
                               lazy='joined')
    # eagerly loaded cluster; adds CoexpressionCluster.sequence_associations
    coexpression_cluster = db.relationship('CoexpressionCluster',
                                           backref=db.backref(
                                               'sequence_associations',
                                               lazy='dynamic',
                                               passive_deletes=True),
                                           lazy='joined')
예제 #5
0
class Species(db.Model):
    """
    A species in the database, with display styling (colors), cached
    counts of sequences/profiles/networks and relationships to all
    species-scoped data.
    """
    __tablename__ = 'species'
    id = db.Column(db.Integer, primary_key=True)
    code = db.Column(db.String(50, collation=SQL_COLLATION), unique=True)
    name = db.Column(db.String(200, collation=SQL_COLLATION))
    data_type = db.Column(db.Enum('genome', 'transcriptome', name='data_type'))
    color = db.Column(db.String(7), default="#C7C7C7")
    highlight = db.Column(db.String(7), default="#DEDEDE")
    sequence_count = db.Column(db.Integer)
    network_count = db.Column(db.Integer)
    profile_count = db.Column(db.Integer)
    description = db.Column(db.Text)

    sequences = db.relationship('Sequence',
                                backref='species',
                                lazy='dynamic',
                                cascade="all, delete-orphan",
                                passive_deletes=True)
    networks = db.relationship('ExpressionNetworkMethod',
                               backref='species',
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)
    profiles = db.relationship('ExpressionProfile',
                               backref='species',
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)
    expression_specificities = db.relationship('ExpressionSpecificityMethod',
                                               backref='species',
                                               lazy='dynamic',
                                               cascade="all, delete-orphan",
                                               passive_deletes=True)
    condition_tissues = db.relationship('ConditionTissue',
                                        backref='species',
                                        lazy='dynamic',
                                        cascade="all, delete-orphan",
                                        passive_deletes=True)

    def __init__(self,
                 code,
                 name,
                 data_type='genome',
                 color="#C7C7C7",
                 highlight="#DEDEDE",
                 description=None):
        """
        Creates a new species; all cached counts start at zero.

        :param code: short, unique species code
        :param name: full species name
        :param data_type: 'genome' or 'transcriptome'
        :param color: base color (hex) used in visualisations
        :param highlight: highlight color (hex) used in visualisations
        :param description: optional free-text description
        """
        self.code = code
        self.name = name
        self.data_type = data_type
        self.color = color
        self.highlight = highlight
        self.sequence_count = 0
        self.profile_count = 0
        self.network_count = 0
        self.description = description

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @property
    def has_interpro(self):
        """True when at least one sequence of this species has an InterPro domain."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_interpro import SequenceInterproAssociation

        domain = SequenceInterproAssociation.query.join(
            Sequence,
            Sequence.id == SequenceInterproAssociation.sequence_id).filter(
                Sequence.species_id == self.id).first()

        return domain is not None

    @property
    def has_go(self):
        """True when at least one sequence of this species has a GO annotation."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_go import SequenceGOAssociation

        go = SequenceGOAssociation.query.join(
            Sequence, Sequence.id == SequenceGOAssociation.sequence_id).filter(
                Sequence.species_id == self.id).first()

        return go is not None

    @staticmethod
    def add(code,
            name,
            data_type='genome',
            color="#C7C7C7",
            highlight="#DEDEDE",
            description=None):
        """
        Adds the species to the database unless a species with the same
        code already exists.

        :return: internal ID of the existing or newly created species
        """
        species = Species.query.filter_by(code=code).first()

        # species is already in the DB, return the existing ID
        if species is not None:
            return species.id

        new_species = Species(code,
                              name,
                              data_type=data_type,
                              color=color,
                              highlight=highlight,
                              description=description)

        try:
            db.session.add(new_species)
            db.session.commit()
        except Exception as e:
            # BUG FIX: the original had a bare except calling db.rollback(),
            # which does not exist (AttributeError); roll back the session.
            db.session.rollback()
            print(e)

        return new_species.id

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of sequences, profiles and networks can be precalculated and stored in the
        database using this function.
        """
        species = Species.query.all()

        for s in species:
            s.sequence_count = s.sequences.count()
            s.profile_count = s.profiles.count()
            s.network_count = s.networks.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
예제 #6
0
class CoexpressionCluster(db.Model):
    """
    A co-expression cluster produced by a clustering method. Provides the
    cluster sub-network plus GO/clade enrichment and cluster-similarity
    calculations.
    """
    __tablename__ = 'coexpression_clusters'
    id = db.Column(db.Integer, primary_key=True)
    method_id = db.Column(
        db.Integer,
        db.ForeignKey('coexpression_clustering_methods.id',
                      ondelete='CASCADE'))
    name = db.Column(db.String(50), index=True)

    # Other properties
    # sequences defined in Sequence
    # sequence_associations defined in SequenceCoexpressionClusterAssociation'
    # go_enrichment defined in ClusterGOEnrichment
    # clade_enrichment defined in ClusterCladeEnrichment

    @staticmethod
    def get_cluster(cluster_id):
        """
        Returns the network for a whole cluster (reporting edges only between members of the cluster !)

        :param cluster_id: internal ID of the cluster
        :return: network for the selected cluster (dict with nodes and edges)
        """
        cluster = CoexpressionCluster.query.get(cluster_id)

        probes = [member.probe
                  for member in cluster.sequence_associations.all()]
        # set for O(1) membership tests in the inner loop below
        probe_set = set(probes)

        network = cluster.method.network_method.probes.\
            options(joinedload('sequence').load_only('name')).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        nodes = []
        edges = []

        # Track edges in both orientations (tuples in a set) so each
        # undirected link is emitted once. The original used a list of
        # 2-element lists, making every membership test O(#edges).
        existing_edges = set()

        for node in network:
            nodes.append({
                "id": node.probe,
                "name": node.probe,
                "gene_id": int(node.sequence_id)
                if node.sequence_id is not None else None,
                "gene_name": node.sequence.name
                if node.sequence_id is not None else node.probe,
                "depth": 0
            })

            links = json.loads(node.network)

            for link in links:
                # only add links that are in the cluster !
                if link["probe_name"] in probe_set and \
                        (node.probe, link["probe_name"]) not in existing_edges:
                    edges.append({
                        "source": node.probe,
                        "target": link["probe_name"],
                        "profile_comparison": url_for(
                            'expression_profile.expression_profile_compare_probes',
                            probe_a=node.probe,
                            probe_b=link["probe_name"],
                            species_id=node.method.species.id),
                        "depth": 0,
                        "link_score": link["link_score"],
                        "link_pcc": link.get("link_pcc"),
                        "hrr": link.get("hrr"),
                        "edge_type": cluster.method.network_method.edge_type
                    })
                    existing_edges.add((node.probe, link["probe_name"]))
                    existing_edges.add((link["probe_name"], node.probe))

        return {"nodes": nodes, "edges": edges}

    def __calculate_enrichment(self):
        """
        Initial implementation to calculate GO enrichment for a single cluster.

        Uses the hypergeometric survival function per GO term and applies an
        FDR correction; results are committed to the DB.
        """
        gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        sequences = self.sequences.options(load_only("id")).all()

        # distinct (sequence, GO) pairs with non-predicted annotations
        associations = SequenceGOAssociation.query\
            .filter(SequenceGOAssociation.sequence_id.in_([s.id for s in sequences]))\
            .filter(SequenceGOAssociation.predicted == 0)\
            .options(load_only("sequence_id", "go_id"))\
            .group_by(SequenceGOAssociation.sequence_id, SequenceGOAssociation.go_id)

        go_data = {}

        for a in associations:
            if a.go_id not in go_data.keys():
                go_data[a.go_id] = {
                    # sequences of this species annotated with the term
                    "total_count": json.loads(
                        a.go.species_counts)[str(species_id)],
                    # cluster members annotated with the term
                    "cluster_count": 1
                }
            else:
                go_data[a.go_id]["cluster_count"] += 1

        p_values = []
        for go_id in go_data:
            p_values.append(
                hypergeo_sf(go_data[go_id]['cluster_count'], len(sequences),
                            go_data[go_id]['total_count'], gene_count))

        corrected_p_values = fdr_correction(p_values)

        # dicts iterate in insertion order, so index i lines up with the
        # p_values computed in the loop above
        for i, go_id in enumerate(go_data):
            enrichment = ClusterGOEnrichment()
            enrichment.cluster_id = self.id
            enrichment.go_id = go_id

            enrichment.cluster_count = go_data[go_id]['cluster_count']
            enrichment.cluster_size = len(sequences)
            enrichment.go_count = go_data[go_id]['total_count']
            enrichment.go_size = gene_count

            enrichment.enrichment = log2(
                (go_data[go_id]['cluster_count'] / len(sequences)) /
                (go_data[go_id]['total_count'] / gene_count))
            enrichment.p_value = p_values[i]
            enrichment.corrected_p_value = corrected_p_values[i]

            db.session.add(enrichment)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def calculate_enrichment(empty=True):
        """
        Static method to calculate the enrichment for all cluster in the database

        :param empty: empty table cluster_go_enrichment first
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(ClusterGOEnrichment).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        # BUG FIX: this loop used to live in the try/except's else-clause,
        # so calling calculate_enrichment(empty=False) silently computed
        # nothing. The enrichment is now calculated unconditionally.
        clusters = CoexpressionCluster.query.all()

        for i, cluster in enumerate(clusters):
            # print(i, "\t cluster: ", cluster.method_id, cluster.name)
            cluster.__calculate_enrichment()

    def __calculate_clade_enrichment(self, background, gf_method_id):
        """
        Calculates the clade enrichment for a co-expression cluster (i.e. if genes which originated in a certain clade
        are overrepresented). A background is required (how many genes there are per clade in the organism) and the
        gene family method those clades are based on.

        Calculations will be immediately committed to the DB.

        :param background: dict with background
        :param gf_method_id: internal ID of gene family method
        """
        species_gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        cluster_clade_count = defaultdict(lambda: 0)

        cluster_gene_count = self.sequences.count()

        # BUG FIX: initialise so a failed query below doesn't leave the name
        # undefined (the original raised NameError in the following loop).
        sequences = []

        try:
            sequences = self.sequences.\
                join(SequenceFamilyAssociation, Sequence.id == SequenceFamilyAssociation.sequence_id).\
                join(GeneFamily, SequenceFamilyAssociation.gene_family_id == GeneFamily.id).\
                add_columns(Sequence.name,
                            Sequence.species_id,
                            SequenceFamilyAssociation.gene_family_id,
                            GeneFamily.method_id,
                            GeneFamily.clade_id).\
                filter(GeneFamily.method_id == gf_method_id).all()
        except Exception as e:
            print(e, file=sys.stderr)

        for s in sequences:
            cluster_clade_count[s.clade_id] += 1

        enrichment_scores = []

        for clade_id, count in cluster_clade_count.items():
            try:
                background_count = background[species_id][clade_id]
                p_value = hypergeo_sf(count, cluster_gene_count,
                                      background_count, species_gene_count)
                enrichment = log2((count / cluster_gene_count) /
                                  (background_count / species_gene_count))

                enrichment_scores.append({
                    'clade_count': background_count,
                    'clade_size': species_gene_count,
                    'cluster_count': count,
                    'cluster_size': cluster_gene_count,
                    'p_value': p_value,
                    'enrichment': enrichment,
                    'clade_id': clade_id,
                    'cluster_id': self.id
                })

            except Exception as e:
                print(e, file=sys.stderr)

        corrected_p_values = fdr_correction(
            [es['p_value'] for es in enrichment_scores])

        commit_required = False
        for es, corrected_p_value in zip(enrichment_scores,
                                         corrected_p_values):
            # only store significant (p < 0.05) over-representation
            if es['p_value'] < 0.05 and es['enrichment'] > 0:
                commit_required = True
                cluster_clade_enrichment = ClusterCladeEnrichment()
                cluster_clade_enrichment.p_value = es['p_value']
                cluster_clade_enrichment.corrected_p_value = corrected_p_value
                cluster_clade_enrichment.enrichment = es['enrichment']
                cluster_clade_enrichment.clade_id = es['clade_id']
                cluster_clade_enrichment.cluster_id = es['cluster_id']
                cluster_clade_enrichment.gene_family_method_id = gf_method_id
                cluster_clade_enrichment.clade_count = es['clade_count']
                cluster_clade_enrichment.clade_size = es['clade_size']
                cluster_clade_enrichment.cluster_count = es['cluster_count']
                cluster_clade_enrichment.cluster_size = es['cluster_size']

                db.session.add(cluster_clade_enrichment)

        if commit_required:
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

    @staticmethod
    def calculate_clade_enrichment(gene_family_method_id, empty=True):
        """
        Calculates clade enrichment for co-expression clusters

        :param gene_family_method_id: gene family method to use to determine clades
        :param empty: when true, removes clade enrichments for the current gf_method
        """
        if empty:
            try:
                print("Removing Existing Enrichment")
                db.session.query(ClusterCladeEnrichment).\
                    filter(ClusterCladeEnrichment.gene_family_method_id == gene_family_method_id).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        # BUG FIX: the originals used sep='' which is a no-op with a single
        # argument; end='' was intended (so " Done!" continues the line)
        print("Calculating background...", end='')
        gf_method = GeneFamilyMethod.query.get(gene_family_method_id)
        counts = gf_method.get_clade_distribution()
        print(' Done!')

        # calculate enrichment
        print("Calculate enrichment", end='')

        clusters = CoexpressionCluster.query.all()

        for i, cluster in enumerate(clusters):
            print(i, "\t cluster: ", cluster.method_id, cluster.name)
            cluster.__calculate_clade_enrichment(counts, gene_family_method_id)

        print(" Done!")

    @staticmethod
    def delete_enrichment():
        """
        Removes all GO enrichment data from the database

        :return:
        """
        try:
            db.session.query(ClusterGOEnrichment).delete()
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
        """
        This function will calculate ALL similarities between clusters in the database. Results will be added to the
        DB

        :param gene_family_method_id: Internal ID of gene family method to use to calculate the scores (default = 1)
        :param percentile_pass: percentile based cutoff (default = 0.95)
        """

        # sqlalchemy to fetch cluster associations
        fields = [
            SequenceCoexpressionClusterAssociation.__table__.c.sequence_id,
            SequenceCoexpressionClusterAssociation.__table__.c.
            coexpression_cluster_id
        ]
        # BUG FIX: the original wrote "...c.sequence_id is not None", which
        # evaluates to the Python boolean True instead of building a SQL
        # expression; isnot(None) produces the intended IS NOT NULL filter.
        condition = SequenceCoexpressionClusterAssociation.\
            __table__.c.sequence_id.isnot(None)
        cluster_associations = db.engine.execute(
            db.select(fields).where(condition)).fetchall()

        # sqlalchemy to fetch sequence family associations
        fields = [
            SequenceFamilyAssociation.__table__.c.sequence_id,
            SequenceFamilyAssociation.__table__.c.gene_family_id,
            GeneFamily.__table__.c.method_id
        ]
        condition = GeneFamily.__table__.c.method_id == gene_family_method_id
        table = join(
            SequenceFamilyAssociation.__table__, GeneFamily.__table__,
            SequenceFamilyAssociation.__table__.c.gene_family_id ==
            GeneFamily.__table__.c.id)
        sequence_families = db.engine.execute(
            db.select(fields).select_from(table).where(condition)).fetchall()

        # convert sqlachemy results into dictionary
        sequence_to_family = {
            seq_id: fam_id
            for seq_id, fam_id, method_id in sequence_families
        }

        cluster_to_sequences = {}
        cluster_to_families = {}

        for seq_id, cluster_id in cluster_associations:
            cluster_to_sequences.setdefault(cluster_id, []).append(seq_id)

        for cluster_id, sequences in cluster_to_sequences.items():
            families = list({
                sequence_to_family[s]
                for s in sequences if s in sequence_to_family
            })
            if len(families) > 0:
                cluster_to_families[cluster_id] = families

        keys = list(cluster_to_families.keys())

        data = []

        # pairwise comparison of clusters; only pairs where both clusters
        # contain at least five families are scored
        for i in range(len(keys) - 1):
            for j in range(i + 1, len(keys)):
                current_keys = [keys[x] for x in [i, j]]
                current_families = [
                    cluster_to_families[k] for k in current_keys
                ]

                if len(current_families[0]) > 4 and len(
                        current_families[1]) > 4:
                    # renamed from 'j' to avoid shadowing the loop index
                    jaccard_index = jaccard(current_families[0],
                                            current_families[1])
                    data.append(
                        [current_keys[0], current_keys[1], jaccard_index])

        ordered_j = sorted([a[2] for a in data])
        if len(ordered_j) > 0:
            # clamp the index so percentile_pass == 1.0 cannot overflow
            cutoff_index = min(int(len(ordered_j) * percentile_pass),
                               len(ordered_j) - 1)
            percentile_cutoff = ordered_j[cutoff_index]

            database = [{
                'source_id': d[0],
                'target_id': d[1],
                'gene_family_method_id': gene_family_method_id,
                'jaccard_index': d[2],
                'p_value': 0,
                'corrected_p_value': 0
            } for d in data if d[2] >= percentile_cutoff]

            db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(),
                              database)
        else:
            print("No similar clusters found!")

    @property
    def profiles(self):
        """
        Returns a list with all expression profiles of cluster members
        :return: list of all profiles
        """

        sequence_subquery = self.sequences.subquery()

        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            join(sequence_subquery, ExpressionProfile.sequence_id == sequence_subquery.c.id).all()

        return profiles

    @property
    def interpro_stats(self):
        """
        Get InterPro statistics for the current cluster

        :return: Interpro statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return Interpro.sequence_stats(sequence_ids)

    @property
    def go_stats(self):
        """
        Get GO statistics for the current cluster

        :return: GO statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GO.sequence_stats(sequence_ids)

    @property
    def family_stats(self):
        """
        Get gene family statistics for the current cluster

        :return: gene family statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GeneFamily.sequence_stats(sequence_ids)
예제 #7
0
class Tree(db.Model):
    __tablename__ = 'trees'
    id = db.Column(db.Integer, primary_key=True)

    label = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    data_newick = db.Column(db.Text)
    data_phyloxml = db.Column(db.Text)

    gf_id = db.Column(db.Integer,
                      db.ForeignKey('gene_families.id', ondelete='CASCADE'),
                      index=True)
    method_id = db.Column(db.Integer,
                          db.ForeignKey('tree_methods.id', ondelete='CASCADE'),
                          index=True)

    @property
    def ascii_art(self):
        """
        Returns an ascii representation of the tree. Useful for quick visualizations

        :return: string with ascii representation of the tree
        """
        tree = newick.loads(self.data_newick)[0]

        return tree.ascii_art()

    @staticmethod
    def __yattag_node(node,
                      tag,
                      text,
                      line,
                      id_to_clade,
                      seq_to_species,
                      seq_to_id,
                      root=1):
        with tag('clade'):
            if root == 1:
                line('branch_length', 0.1)
            else:
                line('branch_length', node.length)
            if node.is_leaf:
                line('name', node.name)
                if node.name in seq_to_id.keys():
                    line('id', seq_to_id[node.name])
                if node.name in seq_to_species.keys():
                    with tag('taxonomy'):
                        line('code', seq_to_species[node.name])
            else:
                clade_id, duplication, dup_score = node.name.split('_')

                clade_id = int(clade_id)
                duplication = True if duplication == 'D' else False
                dup_score = float(dup_score)

                if clade_id in id_to_clade.keys():
                    with tag('taxonomy'):
                        line('code', id_to_clade[clade_id])

                if duplication:
                    line('property',
                         str(dup_score),
                         applies_to="clade",
                         datatype="xksd:double",
                         ref="Duplication consistency score")
                    with tag('events'):
                        line('duplications', 1)
                else:
                    with tag('events'):
                        line('speciations', 1)

                for d in node.descendants:
                    Tree.__yattag_node(d,
                                       tag,
                                       text,
                                       line,
                                       id_to_clade,
                                       seq_to_species,
                                       seq_to_id,
                                       root=0)

    @property
    def phyloxml(self):
        """
        data_phyloXML to phyloXML conversion

        :return:
        """
        # Load Tree with addition information
        tree = newick.loads(self.data_phyloxml)[0]

        # Load Additional information from the database
        clades = Clade.query.all()
        id_to_clade = {c.id: c.name for c in clades}
        seq_to_species = {}
        seq_to_id = {}
        species = []

        for s in self.sequences.all():
            seq_to_id[s.name] = s.id
            seq_to_species[s.name] = s.species.code
            if s.species not in species:
                species.append(s.species)

        csep = CrossSpeciesExpressionProfile()
        csep_data = csep.get_data(*seq_to_id.values())

        has_heatmap = False
        heatmap_order = []
        for cd in csep_data:
            if "profile" in cd.keys() and "order" in cd["profile"].keys():
                has_heatmap = True
                heatmap_order = cd["profile"]["order"]
                break

        # Start constructing PhyloXML
        doc, tag, text, line = Doc().ttl()
        with tag('phyloxml'):
            with tag('phylogeny', rooted="True"):
                # line('name', self.label)
                # line('description', "PlaNet 2.0 PhyloXML tree")
                Tree.__yattag_node(tree, tag, text, line, id_to_clade,
                                   seq_to_species, seq_to_id)

            with tag('graphs'):
                if has_heatmap:
                    with tag('graph', type="heatmap"):
                        line('name', 'Heatmap')
                        with tag('legend', show=1):
                            for label in heatmap_order:
                                with tag('field'):
                                    line('name', label)
                            with tag('gradient'):
                                line('name', 'YlGnBu')
                                line('classes', len(heatmap_order))
                        with tag('data'):
                            for cd in csep_data:
                                if "profile" in cd.keys(
                                ) and "data" in cd["profile"].keys():
                                    with tag('values',
                                             **{'for':
                                                str(cd["sequence_id"])}):
                                        for label in heatmap_order:
                                            if cd["profile"]["data"][
                                                    label] is not None:
                                                line(
                                                    'value', cd["profile"]
                                                    ["data"][label])
                                            else:
                                                line('value', '')

                with tag('graph', type="binary"):
                    line('name', 'Low Expression')
                    with tag('legend', show=1):
                        with tag('field'):
                            line('name', 'Low expression')
                            line('color', '0xf03b20')
                            line('shape', 'circle')

                    with tag('data'):
                        for cd in csep_data:
                            if "low_expressed" in cd.keys():
                                with tag('values',
                                         **{'for': str(cd["sequence_id"])}):
                                    line('value', cd["low_expressed"])

                with tag('graph', type="multibar"):
                    line('name', 'Expression Range')
                    with tag('legend', show=1):
                        with tag('field'):
                            line('name', 'Max. Expression (TPM)')
                            line('color', '0x664977')

                    with tag('data'):
                        for cd in csep_data:
                            if "max_expression" in cd.keys():
                                with tag('values',
                                         **{'for': str(cd["sequence_id"])}):
                                    line('value', cd["max_expression"])

            with tag('taxonomies'):
                for s in species:
                    with tag('taxonomy', code=s.code):
                        line('color', s.color.replace("#", "0x"))
                        line('name', s.name)
                        line(
                            'url',
                            url_for('species.species_view',
                                    species_id=s.id,
                                    _external=True))

                for c in clades:
                    with tag('taxonomy', code=c.name):
                        line('color', '0x000000')
                        line('name', c.name)
                        line(
                            'url',
                            url_for('clade.clade_view',
                                    clade_id=c.id,
                                    _external=True))

        return indent(doc.getvalue())

    @property
    def count(self):
        tree = newick.loads(self.data_newick)[0]
        return len(tree.get_leaves())

    @property
    def sequences(self):
        tree = newick.loads(self.data_newick)[0]
        sequences = [l.name for l in tree.get_leaves()]

        return Sequence.query.filter(Sequence.name.in_(sequences))

    @property
    def tree_stripped(self):
        tree = newick.loads(self.data_newick)[0]
        tree.remove_lengths()

        return newick.dumps([tree])
예제 #8
0
class GO(db.Model):
    """
    Gene Ontology term, including a precomputed DAG expansion (extended_go)
    and a JSON phylo-profile of per-species sequence counts (species_counts).
    """
    __tablename__ = 'go'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    name = db.Column(db.Text)
    type = db.Column(db.Enum('biological_process', 'molecular_function', 'cellular_component', name='go_type'))
    description = db.Column(db.Text)
    obsolete = db.Column(db.SmallInteger)
    is_a = db.Column(db.Text)
    extended_go = db.Column(db.Text)
    species_counts = db.Column(db.Text)

    sequences = db.relationship('Sequence', secondary=sequence_go, lazy='dynamic')

    # Other properties
    #
    # sequence_associations declared in 'SequenceGOAssociation'
    # enriched_clusters declared in 'ClusterGOEnrichment'

    def __init__(self, label, name, go_type, description, obsolete, is_a, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.obsolete = obsolete
        self.is_a = is_a
        self.extended_go = extended_go
        self.species_counts = ""

    def set_all(self, label, name, go_type, description, extended_go):
        """
        Update all core fields of an existing term (resets species_counts).

        :param label: GO label (e.g. 'GO:0008150')
        :param name: short name of the term
        :param go_type: one of 'biological_process', 'molecular_function', 'cellular_component'
        :param description: full definition of the term
        :param extended_go: semicolon-delimited labels of all parent terms
        """
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.extended_go = extended_go
        self.species_counts = ""

    @property
    def short_type(self):
        """Two-letter abbreviation of the GO namespace ('BP', 'MF', 'CC' or 'UNK')."""
        if self.type == 'biological_process':
            return 'BP'
        elif self.type == 'molecular_function':
            return 'MF'
        elif self.type == 'cellular_component':
            return 'CC'
        else:
            return 'UNK'

    @property
    def readable_type(self):
        """Human-readable name of the GO namespace."""
        if self.type == 'biological_process':
            return 'Biological process'
        elif self.type == 'molecular_function':
            return 'Molecular function'
        elif self.type == 'cellular_component':
            return 'Cellular component'
        else:
            return 'Unknown type'

    @property
    def parent_count(self):
        """
        Returns total number of terms 'above' this term in the DAG
        :return:
        """
        return len(self.extended_go.split(';')) if self.extended_go != '' else 0

    @property
    def interpro_stats(self):
        """InterPro domain stats for the sequences carrying this term."""
        from conekt.models.interpro import Interpro

        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        """GO term stats for the sequences carrying this term."""
        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        """Gene family stats for the sequences carrying this term."""
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    def species_occurrence(self, species_id):
        """
        count how many genes have the current GO term in a given species

        :param species_id: internal id of the selected species
        :return: count of sequences with this term associated
        """
        count = 0
        sequences = self.sequences.all()

        for s in sequences:
            if s.species_id == species_id:
                count += 1

        return count

    @staticmethod
    def sequence_stats(sequence_ids, exclude_predicted=True):
        """
        Takes a list of sequence IDs and returns GO stats for those sequences

        :param sequence_ids: list of sequence ids
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        query = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids))

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences, exclude_predicted=True):
        """
        Same as sequence_stats but takes a sequence query instead of a list of IDs.

        :param sequences: query yielding Sequence rows
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        subquery = sequences.subquery()

        query = SequenceGOAssociation.query

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.join(subquery, SequenceGOAssociation.sequence_id == subquery.c.id).all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        """
        Fold a list of SequenceGOAssociation rows into per-GO-term stats.

        :param associations: iterable of SequenceGOAssociation objects
        :return: dict keyed by go_id with counts plus unique sequence/species lists
        """
        output = {}
        for d in associations:
            if d.go_id not in output.keys():
                output[d.go_id] = {
                    'go': d.go,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.go_id]['count'] += 1
                if d.sequence_id not in output[d.go_id]['sequences']:
                    output[d.go_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.go_id]['species']:
                    output[d.go_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @staticmethod
    def update_species_counts():
        """
        Adds phylo-profile to each go-label, results are stored in the database
        (as JSON in the species_counts column). Only non-predicted associations
        are counted.
        """
        # link species to sequences
        sequences = db.engine.execute(db.select([Sequence.__table__.c.id, Sequence.__table__.c.species_id])).fetchall()

        sequence_to_species = {}
        for seq_id, species_id in sequences:
            if species_id is not None:
                sequence_to_species[seq_id] = int(species_id)

        # get go for all genes
        associations = db.engine.execute(
            db.select([SequenceGOAssociation.__table__.c.sequence_id,
                       SequenceGOAssociation.__table__.c.go_id], distinct=True)\
            .where(SequenceGOAssociation.__table__.c.predicted == 0))\
            .fetchall()

        count = {}
        for seq_id, go_id in associations:
            # sequences without a species assignment were excluded above; skip
            # their associations instead of raising a KeyError
            species_id = sequence_to_species.get(seq_id)
            if species_id is None:
                continue

            if go_id not in count.keys():
                count[go_id] = {}

            if species_id not in count[go_id]:
                count[go_id][species_id] = 1
            else:
                count[go_id][species_id] += 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))

    @staticmethod
    def add_from_obo(filename, empty=True, compressed=False):
        """
        Parses GeneOntology's OBO file and adds it to the database

        :param filename: Path to the OBO file to parse
        :param compressed: load data from .gz file if true (default: False)
        :param empty: Empty the database first when true (default: True)
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(GO).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        obo_parser = OBOParser()
        obo_parser.readfile(filename, compressed=compressed)

        obo_parser.extend_go()

        for i, term in enumerate(obo_parser.terms):
            go = GO(term.id, term.name, term.namespace, term.definition, term.is_obsolete, ";".join(term.is_a),
                    ";".join(term.extended_go))

            db.session.add(go)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_go_from_plaza(filename):
        """
        Adds GO annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        go_parser = GOParser()

        go_parser.read_plaza_go(filename)

        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for term in terms:
                    if term["id"] in go_hash.keys():
                        current_term = go_hash[term["id"]]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": term["evidence"],
                            "source": term["source"]}
                        associations.append(association)
                    else:
                        print(term, "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # flush to the database in batches to keep memory usage bounded
            if len(associations) > 400:
                db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                associations = []

        # Add extended GOs (parental terms implied by the DAG, marked 'Extended')
        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term["id"] not in current_terms:
                        current_terms.append(term["id"])

                for term in terms:
                    if term["id"] in go_hash.keys():
                        extended_terms = go_hash[term["id"]].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def add_go_from_tab(filename, species_id, source="Source not provided"):
        """
        Adds GO annotation from a tab-delimited file (gene, GO label, evidence)
        for a single species, including DAG-extended parental terms.

        :param filename: path to the tab-delimited annotation file
        :param species_id: internal id of the species the genes belong to
        :param source: value for the source field of the new associations
        """
        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id).all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        gene_go = defaultdict(list)

        with open(filename, "r") as f:
            for line in f:
                gene, term, evidence = line.strip().split('\t')
                if gene in gene_hash.keys():
                    current_sequence = gene_hash[gene]
                    if term in go_hash.keys():
                        current_term = go_hash[term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": evidence,
                            "source": source}
                        associations.append(association)

                        if term not in gene_go[gene]:
                            gene_go[gene].append(term)

                    else:
                        print(term, "not found in the database.")
                else:
                    print("Gene", gene, "not found in the database.")

                # flush to the database in batches to keep memory usage bounded
                if len(associations) > 400:
                    db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                    associations = []

        # Add extended GOs (parental terms implied by the DAG, marked 'Extended')
        for gene, terms in gene_go.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term not in current_terms:
                        current_terms.append(term)

                for term in terms:
                    if term in go_hash.keys():
                        extended_terms = go_hash[term].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def predict_from_network(expression_network_method_id, threshold=5, source="PlaNet Prediction"):
        """
        Function to transfer GO terms from neighbors in the network. If n or more (based on threshold) neighbors have a
        GO label (excluding other predicted labels) the term is transferred.

        :param expression_network_method_id: Expression network as input
        :param threshold: number of neighboring genes that should have the label to allow transfer
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        # Get all genes that belong to the network
        probes = expression_network_method.probes.all()

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # If the number of genes in the neighborhood is smaller than the threshold skip (no prediction possible)
            # If there is no sequence associated with the probe skip as well
            if len(sequence_ids) < threshold or probe.sequence_id is None:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # Determine new terms (that occurred equal or more times than the desired threshold)
            new_terms = [{
                'go_id': k,
                'score': v
            } for k, v in go_counts.items() if v >= threshold]

            # Store new terms in a list that can be added to the database
            for nt in new_terms:
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': nt['go_id'],
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'score': nt['score'],
                                                   'threshold': threshold,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighbor counting'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])

    @staticmethod
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
        """
        Transfers GO terms that are significantly enriched (hypergeometric test
        with FDR correction) in a gene's network neighborhood.

        :param expression_network_method_id: Expression network as input
        :param cutoff: p-value cutoff for the enrichment test
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()

        # Get all GO terms and get background
        # Important, counts are obtained from precomputed counts in the species_counts field !!
        go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(lambda: 0)

        for go_id, counts_json in go_data:
            # skip empty/missing profiles; 'is not ""' was an identity check
            # that let empty strings through to json.loads
            if counts_json:
                counts = json.loads(counts_json)
                if str(expression_network_method.species_id) in counts.keys():
                    go_background[go_id] = counts[str(expression_network_method.species_id)]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])
            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], len(probes))
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            corrected_p_values = fdr_correction([a[1] for a in enriched_go])

            # push new prediction in a dict that will be added to the DB
            for corrected_p, (go_id, p_value) in zip(corrected_p_values, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': corrected_p,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
예제 #9
0
class ExpressionProfile(db.Model):
    """
    Expression profile of one sequence/probe in one species; the profile
    itself is stored as a JSON blob (order, colors, per-condition values).
    """
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'),
                            index=True)
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append(
                    [o,
                     str(mean(values)),
                     str(min(values)),
                     str(max(values))])
            except Exception as e:
                # skip conditions missing from the data, keep the rest of the table
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))
        return table

    @property
    def low_abundance(self):
        """
        Checks if the mean expression value in any condition in the plot is
        higher than the cutoff (10).

        Note: this is a property, so it cannot accept arguments; the previous
        'cutoff' parameter could never be passed by callers and was removed.

        :return: True in case of low abundance otherwise False
        """
        cutoff = 10  # mean expression below this in every condition = low abundance

        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [
                k for k in profile_data['data']
                if k in condition_to_tissue['conversion']
                and condition_to_tissue['conversion'][k] == t
            ]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            # tissues with no matching conditions get a zero placeholder
            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {
            'order': condition_to_tissue['order'],
            'colors': condition_to_tissue['colors'],
            'data': output
        }

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several condition into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. The matches the spm
        calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        output = ExpressionProfile.convert_profile(condition_to_tissue,
                                                   profile_data,
                                                   use_means=use_means)

        return output

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order'
        the order of the experiments and 'heatmap' another dict with the actual data. Data is zlog transformed

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: when zlog is disabled, return raw means instead of normalizing against the row maximum
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []

        output = []

        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            # a requested identifier may match either the probe or the sequence name
            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {}

            for o in order:
                values[o] = mean(experiments[o])

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    if row_mean == 0 or values[o] == 0:
                        # log2 ratio undefined for zero values; mark as missing
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError:
                            print("Unable to calculate log()", values[o],
                                  row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({
                "name": name,
                "values": values,
                "sequence_id": profile.sequence_id,
                "shortest_alias": profile.sequence.shortest_alias
            })

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found),
                  "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles), a limit can be provided to avoid overly
        long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file,
                                annotation_file,
                                species_id,
                                order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into a profile

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                        order.append(o)
                        colors.append(c)
                    except Exception:
                        # skip malformed lines in the order/color file
                        pass

        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine order when the order/color file did not define it.
            # (order is initialized to [], so the previous 'is None' test could
            # never be true and the fallback never ran)
            if not order:
                order = []

                for c in colnames:
                    if c in annotation.keys():
                        if annotation[c] not in order:
                            order.append(annotation[c])

                order.sort()

            # read each line and build profile
            new_probes = []
            for line in fin:
                transcript, *values = line.rstrip().split()
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation.keys():
                        condition = annotation[c]
                        profile[condition].append(float(v))

                new_probe = {
                    "species_id":
                    species_id,
                    "probe":
                    transcript,
                    "sequence_id":
                    sequence_dict[transcript.upper()]
                    if transcript.upper() in sequence_dict.keys() else None,
                    "profile":
                    json.dumps({
                        "order": order,
                        "colors": colors,
                        "data": profile
                    })
                }

                new_probes.append(new_probe)

                # flush to the database in batches to keep memory usage bounded
                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(),
                                      new_probes)
                    new_probes = []

            db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
예제 #10
0
class Interpro(db.Model):
    """InterPro domain, with its sequence associations and per-species statistics."""
    __tablename__ = 'interpro'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    description = db.Column(db.Text)

    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='SET NULL'), index=True)

    sequences = db.relationship('Sequence', secondary=sequence_interpro, lazy='dynamic')

    # Other properties
    # sequence_associations = defined in SequenceInterproRelationship

    def __init__(self, label, description):
        self.label = label
        self.description = description

    @property
    def species_codes(self):
        """
        Finds all species the domain has genes from

        :return: a list of all species (codes)
        """

        sequences = self.sequences.options(joinedload('species')).all()

        output = []

        for s in sequences:
            if s.species.code not in output:
                output.append(s.species.code)

        return output

    @property
    def species_counts(self):
        """
        Generates a phylogenetic profile of the domain

        :return: a dict with counts per species (codes are keys)
        """

        sequences = self.sequences.options(joinedload('species')).all()

        output = {}

        for s in sequences:
            if s.species.code not in output:
                output[s.species.code] = 1
            else:
                output[s.species.code] += 1

        return output

    @staticmethod
    def sequence_stats(sequence_ids):
        """
        Takes a list of sequence IDs and returns InterPro stats for those sequences

        :param sequence_ids: list of sequence ids
        :return: dict with for each InterPro domain linked with any of the input sequences stats
        """
        data = SequenceInterproAssociation.query.filter(SequenceInterproAssociation.sequence_id.in_(sequence_ids)).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences):
        """
        InterPro stats for a set of sequences, provided as a query (avoids materializing IDs first)

        :param sequences: query yielding Sequence rows
        :return: dict with stats per InterPro domain
        """
        subquery = sequences.subquery()
        data = SequenceInterproAssociation.query.join(subquery, SequenceInterproAssociation.sequence_id == subquery.c.id).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        """
        Shared aggregation: counts sequences and species per InterPro domain

        :param associations: list of SequenceInterproAssociation objects
        :return: dict keyed by interpro_id with domain, counts, sequence and species lists
        """
        output = {}

        for d in associations:
            if d.interpro_id not in output.keys():
                output[d.interpro_id] = {
                    'domain': d.domain,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.interpro_id]['count'] += 1
                if d.sequence_id not in output[d.interpro_id]['sequences']:
                    output[d.interpro_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.interpro_id]['species']:
                    output[d.interpro_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @property
    def interpro_stats(self):
        """InterPro stats for the sequences associated with this domain."""
        # NOTE: previously this also materialized all sequence IDs into an unused
        # list; that dead query has been removed.
        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        """GO stats for the sequences associated with this domain."""
        from conekt.models.go import GO

        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        """Gene family stats for the sequences associated with this domain."""
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    @staticmethod
    def add_from_xml(filename, empty=True):
        """
        Populates interpro table with domains and descriptions from the official website's XML file

        :param filename: path to XML file
        :param empty: If True the interpro table will be cleared before uploading the new domains, default = True
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(Interpro).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        interpro_parser = InterproParser()

        interpro_parser.readfile(filename)

        for i, domain in enumerate(interpro_parser.domains):
            interpro = Interpro(domain.label, domain.description)

            db.session.add(interpro)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def __add_domains(interpro_parser, sequences):
        """
        Shared bulk loader for sequence-domain associations. Matches parsed
        annotation against known sequences/domains and inserts the pairs in
        batches of ~400 to keep individual queries small.

        :param interpro_parser: InterproDomainParser that already read a file
        :param sequences: iterable of Sequence objects to match genes against
        """
        gene_hash = {sequence.name: sequence for sequence in sequences}
        domain_hash = {domain.label: domain for domain in Interpro.query.all()}

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domains.append({"sequence_id": current_sequence.id,
                                            "interpro_id": current_domain.id,
                                            "start": domain["start"],
                                            "stop": domain["stop"]})
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        # flush the remaining (< 400) associations
        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)

    @staticmethod
    def add_interpro_from_plaza(filename):
        """
        Adds InterPro domain annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_plaza_interpro(filename)

        Interpro.__add_domains(interpro_parser, Sequence.query.all())

    @staticmethod
    def add_interpro_from_interproscan(filename, species_id):
        """
        Adds InterPro domain annotation from InterProScan output

        :param filename: Path to the annotation file
        :param species_id: species the sequences belong to (only those are matched)
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_interproscan(filename)

        Interpro.__add_domains(interpro_parser,
                               Sequence.query.filter_by(species_id=species_id))
예제 #11
0
class Clade(db.Model):
    """Phylogenetic clade: a named set of species (stored as JSON) with a newick tree."""
    __tablename__ = 'clades'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION),
                     unique=True,
                     index=True)
    species = db.Column(db.Text(collation=SQL_COLLATION))
    species_count = db.Column(db.Integer)
    newick_tree = db.Column(db.Text)

    families = db.relationship('GeneFamily', backref='clade', lazy='dynamic')
    interpro = db.relationship('Interpro', backref='clade', lazy='dynamic')

    def __init__(self, name, species, tree):
        self.name = name
        self.species = json.dumps(species)
        self.species_count = len(species)
        self.newick_tree = tree

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @staticmethod
    def add_clade(name, species, tree):
        """
        Add a clade to the database

        :param name: name of the clade
        :param species: list with codes (!) of the species in the clade
        :param tree: newick tree for this clade. Will be stored in the database and used for visualizations
        """
        new_clade = Clade(name, species, tree)
        db.session.add(new_clade)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_clades_from_json(data):
        """
        Adds clades from a dict with clade details

        :param data: dict mapping clade name -> {'species': [...], 'tree': newick string}
        """
        # loop variable renamed: it previously rebound the 'data' parameter itself
        for name, details in data.items():
            Clade.add_clade(name, details['species'], details['tree'])

    @staticmethod
    def __assign_clades(items):
        """
        Shared helper: for each item, find the smallest clade containing all its
        species and store its id in item.clade_id, then commit.

        :param items: iterable of objects exposing species_codes and a clade_id column
                      (GeneFamily or Interpro)
        """
        clades = Clade.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for item in items:
            item_species = item.species_codes

            # skip items without members
            if len(item_species) == 0:
                item.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(item_species, clade_to_species)
            if selected_clade is None:
                item.clade_id = None
            else:
                item.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def update_clades():
        """
        Loop over all families and determine what clade they belong to. Results are stored in the database
        """
        Clade.__assign_clades(GeneFamily.query.all())

    @staticmethod
    def update_clades_interpro():
        """
        Loop over all InterPro domains and determine what clade they belong to
        """
        Clade.__assign_clades(Interpro.query.all())

    @property
    def newick_tree_species(self):
        """
        Returns a Newick tree with the species present in the current clade.

        :return: Newick tree (string) with species for the current clade
        """
        species = {s.code: s.name for s in Species.query.all()}

        tree = newick.loads(self.newick_tree)[0]

        # replace species codes in the tree by their full names
        for code, name in species.items():
            node = tree.get_node(code)
            if node is not None:
                node.name = name

        return newick.dumps([tree])
예제 #12
0
class ExpressionNetwork(db.Model):
    """Coexpression neighborhood for one probe/sequence.

    The neighborhood is stored denormalized as a JSON list (column *network*);
    each entry holds the linked probe/gene names plus link scores.
    """
    __tablename__ = 'expression_networks'
    id = db.Column(db.Integer, primary_key=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    network = db.Column(db.Text)
    method_id = db.Column(db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'), index=True)

    def __init__(self, probe, sequence_id, network, method_id):
        self.probe = probe
        self.sequence_id = sequence_id
        self.network = network
        self.method_id = method_id

    @property
    def neighbors_count(self):
        """
        Returns the number of neighbors the current gene has

        :return: int, number of neighbors
        """
        return len(json.loads(self.network))

    @property
    def neighbors_table(self):
        """
        Returns a tab delimited representation of the current gene's neighbors

        :return: string, one tab-separated row per neighbor plus a header row
        """
        data = json.loads(self.network)
        output = [["Sequence", "Description", "Alias", "PCC", "hrr"]]

        # Pull in descriptions and aliases for all neighbors with one query
        sequence_ids = [d["gene_id"] for d in data if "gene_id" in d.keys() and d["gene_id"] is not None]
        sequences = {s.id: s for s in Sequence.query.filter(Sequence.id.in_(sequence_ids))}

        for d in data:
            try:
                description, alias = "", ""

                if d["gene_id"] in sequences.keys():
                    description = sequences[d["gene_id"]].description
                    alias = sequences[d["gene_id"]].aliases
                    description = description if description is not None else ""
                    alias = alias if alias is not None else ""

                output.append([d["gene_name"], description, alias, str(d["link_pcc"]), str(d["hrr"])])
            except Exception as e:
                # best effort: a malformed entry is skipped, not fatal
                print(e)

        return '\n'.join(['\t'.join(row) for row in output])

    @staticmethod
    def __process_edge(source, link, depth, species_id, edge_type):
        """
        Internal helper that converts one stored link into an edge dict compatible
        with cytoscape.js (shared by get_neighborhood and get_custom_network).

        :param source: probe name of the source node
        :param link: dict from the ExpressionNetwork.network field
        :param depth: depth (steps from the query) to tag the edge with
        :param species_id: species id, used to build the profile-comparison url
        :param edge_type: edge type of the network's method
        :return: dict describing the edge
        """
        return {"source": source,
                "target": link["probe_name"],
                "profile_comparison":
                    url_for('expression_profile.expression_profile_compare_probes',
                            probe_a=source,
                            probe_b=link["probe_name"],
                            species_id=species_id),
                "depth": depth,
                "link_score": link["link_score"],
                "link_pcc": link.get("link_pcc"),
                "hrr": link.get("hrr"),
                "edge_type": edge_type}

    @staticmethod
    def get_neighborhood(probe, depth=0):
        """
        Get the coexpression neighborhood for a specific probe

        :param probe: internal ID of the probe
        :param depth: how many steps away from the query you wish to expand the network
        :return: dict with nodes and edges
        """
        node = ExpressionNetwork.query.get(probe)
        links = json.loads(node.network)

        method_id = node.method_id
        edge_type = node.method.edge_type
        species_id = node.method.species.id

        # add the initial node
        nodes = [{"id": node.probe,
                  "name": node.probe,
                  "probe_id": node.id,
                  "gene_id": int(node.sequence_id) if node.sequence_id is not None else None,
                  "gene_name": node.sequence.name if node.sequence_id is not None else node.probe,
                  "node_type": "query",
                  "depth": 0}]
        edges = []

        # bookkeeping for deeper searches; sets give O(1) membership tests
        # (both orientations of each edge are registered)
        additional_nodes = []
        existing_edges = set()
        existing_nodes = {node.probe}

        # add direct neighbors of the gene of interest
        for link in links:
            nodes.append(ExpressionNetwork.__process_link(link, depth=0))
            edges.append(ExpressionNetwork.__process_edge(node.probe, link, 0,
                                                          species_id, edge_type))
            additional_nodes.append(link["probe_name"])
            existing_edges.add((node.probe, link["probe_name"]))
            existing_edges.add((link["probe_name"], node.probe))
            existing_nodes.add(link["probe_name"])

        # iterate n times to add deeper links
        if len(additional_nodes) > 0:
            for i in range(1, depth+1):
                new_nodes = ExpressionNetwork.\
                    query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                      ExpressionNetwork.method_id == method_id))
                next_nodes = []

                for new_node in new_nodes:
                    new_links = json.loads(new_node.network)

                    for link in new_links:
                        if link["probe_name"] not in existing_nodes:
                            # NOTE(review): deeper nodes are tagged with the requested
                            # max depth, not the current iteration i — confirm intended
                            nodes.append(ExpressionNetwork.__process_link(link, depth=depth))
                            existing_nodes.add(link["probe_name"])
                            next_nodes.append(link["probe_name"])

                        if (new_node.probe, link["probe_name"]) not in existing_edges:
                            edges.append(ExpressionNetwork.__process_edge(new_node.probe, link, i,
                                                                          species_id, edge_type))
                            existing_edges.add((new_node.probe, link["probe_name"]))
                            existing_edges.add((link["probe_name"], new_node.probe))

                additional_nodes = next_nodes

        # Add links between the last set of nodes added
        new_nodes = []
        if len(additional_nodes) > 0:
            new_nodes = ExpressionNetwork.query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                                            ExpressionNetwork.method_id == method_id))

        for new_node in new_nodes:
            new_links = json.loads(new_node.network)
            for link in new_links:
                if link["probe_name"] in existing_nodes:
                    if (new_node.probe, link["probe_name"]) not in existing_edges:
                        edges.append(ExpressionNetwork.__process_edge(new_node.probe, link, depth+1,
                                                                      species_id, edge_type))
                        existing_edges.add((new_node.probe, link["probe_name"]))
                        existing_edges.add((link["probe_name"], new_node.probe))

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_custom_network(method_id, probes):
        """
        Return a network dict for a certain set of probes/sequences. Only returns the selected nodes and connections
        between them (if any)

        :param method_id: network method to extract information from
        :param probes: list of probe/sequence names
        :return: network dict
        """
        nodes = []
        edges = []

        probes = ExpressionNetwork.query.filter(ExpressionNetwork.method_id == method_id).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        valid_nodes = []

        for p in probes:
            node = {"id": p.probe,
                    "name": p.probe,
                    "probe_id": p.id,
                    "gene_id": int(p.sequence_id) if p.sequence_id is not None else None,
                    "gene_name": p.sequence.name if p.sequence_id is not None else p.probe,
                    "node_type": "query",
                    "depth": 0}

            valid_nodes.append(p.probe)
            nodes.append(node)

        existing_edges = set()

        # only keep edges between two selected probes; register both orientations
        for p in probes:
            source = p.probe
            neighborhood = json.loads(p.network)
            for n in neighborhood:
                if n["probe_name"] in valid_nodes:
                    if (source, n["probe_name"]) not in existing_edges:
                        edges.append(ExpressionNetwork.__process_edge(source, n, 0,
                                                                      p.method.species.id,
                                                                      p.method.edge_type))
                        existing_edges.add((source, n["probe_name"]))
                        existing_edges.add((n["probe_name"], source))

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def __process_link(linked_probe, depth):
        """
        Internal function that processes a linked probe (from the ExpressionNetwork.network field) to a data entry
        compatible with cytoscape.js

        :param linked_probe: hash with information from ExpressionNetwork.network field
        :param depth: depth (steps from the query) to tag the node with
        :return: a hash formatted for use as a node with cytoscape.js
        """
        if linked_probe["gene_id"] is not None:
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": linked_probe["gene_id"],
                    "gene_name": linked_probe["gene_name"],
                    "node_type": "linked",
                    "depth": depth}
        else:
            # no associated gene: fall back to the probe name
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": None,
                    "gene_name": linked_probe["probe_name"],
                    "node_type": "linked",
                    "depth": depth}

    @staticmethod
    def read_expression_network_lstrap(network_file, species_id, description, score_type="rank",
                                       pcc_cutoff=0.7, limit=30, enable_second_level=False):
        """
        Reads a network from disk, generated using LSTrAP, determining hrr scores for each pair and store things in the
        DB.

        :param network_file: path to input file
        :param species_id: species the data is from
        :param description: description to add to the db for this network
        :param score_type: which scores are used, default = "rank"
        :param pcc_cutoff: pcc threshold, pairs with a score below this will be ignored
        :param limit: hrr score threshold, pairs with a score above this will be ignored
        :param enable_second_level: include second level neighborhood in the database (only to be used for sparse networks)
        :return: internal ID of the new network
        """
        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        # Add network method first
        network_method = ExpressionNetworkMethod(species_id, description, score_type)
        network_method.hrr_cutoff = limit
        network_method.pcc_cutoff = pcc_cutoff
        network_method.enable_second_level = enable_second_level

        db.session.add(network_method)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

        network = {}
        scores = defaultdict(lambda: defaultdict(lambda: None))     # Score for non-existing pairs will be None

        with open(network_file) as fin:
            for linenr, line in enumerate(fin):
                try:
                    query, hits = line.strip().split(' ')
                    query = query.replace(':', '')
                except ValueError:
                    print("Error parsing line %d: \"%s\"" % (linenr, line))
                    # skip this line and continue
                    continue

                network[query] = {
                    "probe": query,
                    "sequence_id": sequence_dict[query.upper()] if query.upper() in sequence_dict.keys() else None,
                    "linked_probes": [],
                    "total_count": 0,
                    "method_id": network_method.id
                }

                for i, h in enumerate(hits.split('\t')):
                    try:
                        name, value = h.split('(')
                        value = float(value.replace(')', ''))
                        if value > pcc_cutoff:
                            network[query]["total_count"] += 1
                            if i < limit:
                                link = {"probe_name": name,
                                        "gene_name": name,
                                        "gene_id": sequence_dict[name.upper()] if name.upper() in sequence_dict.keys() else None,
                                        "link_score": i,
                                        "link_pcc": value}
                                network[query]["linked_probes"].append(link)
                                scores[query][name] = i
                    except ValueError:
                        # fixed: the old message reported the hit index as a line number
                        print("Error parsing hit %d on line %d, skipping ... (%s)" % (i, linenr, str(h)),
                              file=sys.stderr)

        # HRR: highest reciprocal rank of each pair; None when not reciprocal
        hr_ranks = defaultdict(lambda: defaultdict(int))

        for query, targets in scores.items():
            for target, score in targets.items():
                if None in [score, scores[target][query]]:
                    hr_ranks[query][target] = None
                else:
                    # As scores start from 0 and ranks one, increase the hrr by one
                    hr_ranks[query][target] = max(score, scores[target][query]) + 1

        # Dump dicts into network string, which will be loaded into the database
        for query in network.keys():

            for i, l in enumerate(network[query]["linked_probes"]):
                network[query]["linked_probes"][i]["hrr"] = hr_ranks[query][l["probe_name"]]

            # Dump links WITH HRR into json string
            network[query]["network"] = json.dumps([n for n in network[query]["linked_probes"] if n['hrr'] is not None])

        # add nodes in sets of 400 to avoid sending too much in a single query
        new_nodes = []
        for _, n in network.items():
            new_nodes.append(n)
            if len(new_nodes) > 400:
                db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)
                new_nodes = []

        db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)

        return network_method.id
예제 #13
0
class Sequence(db.Model):
    """
    Table with one row per sequence: its name, coding sequence, type and
    organelle flags, plus relationships to expression data, annotation and
    cross-references.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    # Owning species; rows are removed at the DB level when the species is
    # deleted (ondelete='CASCADE').
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    # Deferred: the (potentially large) coding sequence is only fetched from
    # the database when accessed or explicitly undefer()-ed (see export_cds).
    coding_sequence = db.deferred(db.Column(db.Text))
    # Closed set of sequence types; see readable_type for display strings.
    type = db.Column(db.Enum('protein_coding',
                             'TE',
                             'RNA',
                             name='sequence_type'),
                     default='protein_coding')
    # SmallInteger columns used as boolean flags for organellar sequences.
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    # One-to-many links to expression data; delete-orphan + passive_deletes
    # lets the database cascade remove dependent rows with the sequence.
    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence',
                                                             lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)
    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    # Many-to-many annotation links through association tables.
    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro',
                                       secondary=sequence_interpro,
                                       lazy='dynamic')
    families = db.relationship('GeneFamily',
                               secondary=sequence_family,
                               lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    # ECC associations are self-referential on sequences, so each direction
    # (query vs. target) needs its own relationship with an explicit join.
    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    # Same pattern for clade associations (sequence_one vs. sequence_two).
    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    # External references (aliases, display names, ...); eagerly loaded as
    # several properties below (aliases, display_name) iterate over them.
    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self,
                 species_id,
                 name,
                 coding_sequence,
                 type='protein_coding',
                 is_chloroplast=False,
                 is_mitochondrial=False,
                 description=None):
        """
        Creates a new sequence for a given species.

        :param species_id: id of the species this sequence belongs to
        :param name: name of the sequence
        :param coding_sequence: nucleotide (coding) sequence
        :param type: sequence type ('protein_coding', 'TE' or 'RNA')
        :param is_chloroplast: flag for chloroplast-encoded sequences
        :param is_mitochondrial: flag for mitochondrion-encoded sequences
        :param description: optional free-text description
        """
        self.species_id = species_id
        self.name = name
        self.coding_sequence = coding_sequence
        self.description = description
        self.type = type
        self.is_mitochondrial = is_mitochondrial
        self.is_chloroplast = is_chloroplast

    @property
    def protein_sequence(self):
        """
        Translates the coding sequence into an amino acid sequence. Per the
        translate helper, translation starts at the first start codon and
        ends after a stop codon (indicated by '*').

        :return: amino acid sequence derived from the coding sequence
        """
        cds = self.coding_sequence
        return translate(cds)

    @property
    def aliases(self):
        """
        Human readable, comma-separated string of the aliases (xref entries
        with platform 'token') stored for this sequence.

        :return: comma-separated string with aliases, or None when none exist
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        # Idiomatic truthiness test replaces the len(...) > 0 comparison.
        return ", ".join(tokens) if tokens else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias (xref entry with platform 'token').

        :return: string with the shortest alias, or None when no aliases exist
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        # Idiomatic truthiness test replaces the len(...) > 0 comparison.
        return min(tokens, key=len) if tokens else None

    @property
    def display_name(self):
        """
        Returns a name to display, taken from xrefs with platform 'display'
        when available, otherwise the sequence's own name.

        :return: display name
        """
        names = [x.name for x in self.xrefs if x.platform == 'display']

        # Idiomatic truthiness test replaces the len(...) > 0 comparison.
        return names[0] if names else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias,
        otherwise returns name. To be used in e.g. graphs.

        :return: string with best name to show in graphs, ...
        """
        # Bug fix: compare string values with != rather than 'is not'.
        # Identity comparison on strings is unreliable: a display name equal
        # to (but a different object than) self.name would pass the 'is not'
        # check and the display name would never fall through to the aliases.
        if self.display_name != self.name:
            return self.display_name
        elif self.shortest_alias is not None:
            return self.shortest_alias
        else:
            return self.name

    @property
    def readable_type(self):
        """
        Converts the sequence type to a human readable string.

        :return: string with readable version of the sequence type, 'other'
            for types without a known conversion
        """
        conversion = {
            'protein_coding': 'protein coding',
            'TE': 'transposable element',
            'RNA': 'RNA'
        }

        # dict.get with a default replaces the 'in conversion.keys()' test
        # followed by an if/else lookup.
        return conversion.get(self.type, 'other')

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Reads a fasta file and bulk-inserts its sequences into the database
        for one species.

        :param filename: path to the fasta file
        :param species_id: internal id of the species the sequences belong to
        :param compressed: set to True when the file is gzip-compressed
        :return: number of sequences read from the fasta file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(),
                                     key=operator.itemgetter(0)):
            new_sequences.append({
                "species_id": species_id,
                "name": name,
                "description": None,
                "coding_sequence": sequence,
                "type": "protein_coding",
                "is_mitochondrial": False,
                "is_chloroplast": False
            })

            # insert in batches of ~400, sending more in a single statement
            # can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # Add the last batch; the guard avoids executing an executemany with
        # an empty parameter list (empty fasta file or a size that is an
        # exact multiple of the batch), which errors on some engines.
        if new_sequences:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences)

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Reads a tab-delimited file (sequence name <TAB> description) and
        stores the descriptions on the matching sequences of one species.
        Malformed lines are reported to stderr and skipped.

        :param filename: path to the tab-delimited input file
        :param species_id: internal id of the species to annotate
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {s.name: s for s in sequences}

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    # Bug fix: skip the malformed line explicitly. The
                    # original try/finally fell through to the assignment
                    # with 'name'/'description' either unbound (NameError on
                    # a malformed first line) or stale from the previous
                    # iteration, silently attaching the wrong description.
                    print("Cannot parse line %d: \"%s\"" % (i, line),
                          file=sys.stderr)
                    continue

                if name in seq_dict:
                    seq_dict[name].description = description

                # commit periodically to keep the session small
                if i % 400 == 0:
                    db.session.commit()

            db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes all coding sequences to a fasta file.

        :param filename: path of the fasta file to write
        """
        # undefer forces the deferred coding_sequence column to be loaded
        records = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for record in records:
                f_out.write(">%s\n%s\n" % (record.name, record.coding_sequence))

    @staticmethod
    def export_protein(filename):
        """
        Writes all translated (protein) sequences to a fasta file.

        :param filename: path of the fasta file to write
        """
        # undefer forces the deferred coding_sequence column to be loaded,
        # which protein_sequence translates on the fly
        records = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for record in records:
                f_out.write(">%s\n%s\n" % (record.name, record.protein_sequence))