Пример #1
0
def clear_all_bioinformatics_data():
    """Reset all bioinformatics state held in the database and on disk.

    Sets ``bioinformatics_status`` to ``'Idle'`` on every EnzymeType, clears
    ``blast`` and ``alignments_made`` on every Sequence, drops the UniRef50,
    SSN_record, UniRef90, Alignment and SeqSimNet collections, and leaves the
    ``ssn`` and ``all_by_all_blast`` analysis directories empty.

    Returns:
        The ``jsonify`` response wrapping a ``result`` dict with ``status``,
        ``msg`` and ``issues`` keys.
    """
    for enz in EnzymeType.objects():
        enz.bioinformatics_status = 'Idle'
        enz.save()

    for seq in Sequence.objects():
        seq.blast = None
        seq.alignments_made = None
        seq.save()

    UniRef50.drop_collection()
    SSN_record.drop_collection()

    UniRef90.drop_collection()
    Alignment.drop_collection()
    SeqSimNet.drop_collection()

    analysis_data = Path(__file__).parents[3] / 'analysis' / 'analysis_data'
    for dir_name in ('ssn', 'all_by_all_blast'):
        data_dir = analysis_data / dir_name
        # The collections are already dropped at this point, so a missing
        # directory must not abort the reset: only remove what exists, then
        # make sure an empty directory is in place.
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True, exist_ok=True)

    print('ALL BIOINFORMATICS DATA DELETED')

    result = {'status': 'success',
              'msg': "Done",
              'issues': []}

    return jsonify(result=result)
Пример #2
0
def task_check_ssn_status():
    """Tidy up SSN records and queue SSN jobs when the work queues are empty.

    First deletes all but the first SSN_record found for each enzyme type.
    Then, only if the blast, process-blasts and alignment queues hold no jobs,
    enqueues ``ssn_tasks.task_expand_ssn`` on the alignment queue for:

    * SSN records that are not 'Complete' while their enzyme type is
      'Complete' and has at least one UniRef50 entry;
    * 'Complete' enzyme types without any SSN record that have at least one
      UniRef50 entry or non-ignored sequence with a non-empty sequence.

    If any queue still has jobs, the queue lengths are printed instead.
    """
    # Keep a single SSN record per enzyme type.
    for enzyme_type in EnzymeType.objects():
        records = list(SSN_record.objects(enzyme_type=enzyme_type))
        if len(records) > 1:
            print(
                f'Warning - multiple ssn records for {enzyme_type} - deleting extras'
            )
            for duplicate in records[1:]:
                duplicate.delete()

    pending_jobs = (len(current_app.blast_queue.jobs)
                    + len(current_app.process_blasts_queue.jobs)
                    + len(current_app.alignment_queue.jobs))
    if pending_jobs != 0:
        print(f"Length blast queue = {len(current_app.blast_queue.jobs)}")
        print(
            f"Length process blast queue = {len(current_app.process_blasts_queue.jobs)}"
        )
        print(
            f"Length alignment queue = {len(current_app.alignment_queue.jobs)}"
        )
        return

    print('Checking ssn status')

    # Existing SSN records that lag behind a completed enzyme type.
    for ssn_r in SSN_record.objects().select_related():
        if ssn_r.status == 'Complete':
            continue
        if ssn_r.enzyme_type.bioinformatics_status != 'Complete':
            continue
        if len(UniRef50.objects(enzyme_type=ssn_r.enzyme_type)) == 0:
            continue

        enzyme_type = ssn_r.enzyme_type.enzyme_type
        current_app.alignment_queue.enqueue(
            ssn_tasks.task_expand_ssn,
            enzyme_type,
            job_id=f"{enzyme_type}_expand_ssn")
        print(f'Queued SSN job for {enzyme_type}')

    # Completed enzyme types that have no SSN record at all.
    for enz_type_obj in EnzymeType.objects():
        if enz_type_obj.bioinformatics_status != 'Complete':
            continue
        if enz_type_obj in SSN_record.objects().distinct('enzyme_type'):
            continue

        unirefs = UniRef50.objects(enzyme_type=enz_type_obj)
        candidate_seqs = Sequence.objects(
            db.Q(enzyme_type=enz_type_obj.enzyme_type)
            & db.Q(bioinformatics_ignore__ne=True))
        biocatdb_seqs = [
            seq for seq in list(candidate_seqs)
            if seq.sequence != '' and seq.sequence is not None
        ]
        if len(unirefs) + len(biocatdb_seqs) == 0:
            continue

        print(
            f"No SSN for {enz_type_obj.enzyme_type}, but blasts are complete and sequences present..  creating SSN."
        )
        current_app.alignment_queue.enqueue(
            ssn_tasks.task_expand_ssn,
            enz_type_obj.enzyme_type,
            job_id=f"{enz_type_obj.enzyme_type}_expand_ssn")
Пример #3
0
    def nodes_not_present(self, only_biocatdb=False, max_num=None):
        """Return the sequence objects of this enzyme type that are not nodes in the SSN.

        Args:
            only_biocatdb: When ``True`` only Sequence documents are
                considered; otherwise UniRef50 documents of the enzyme type
                are included as well.
            max_num: If not ``None``, the returned list is truncated to at
                most this many entries.

        Returns:
            A list of sequence objects whose ``enzyme_name`` is not a node of
            ``self.graph`` and whose ``sequence`` is longer than 12 characters.
        """
        t0 = time.time()
        sequences = Sequence.objects(
            db.Q(enzyme_type=self.enzyme_type) & db.Q(sequence__ne="")
            & db.Q(sequence__ne=None) & db.Q(sequence_unavailable__ne=True))
        seq_objects = list(sequences)
        if only_biocatdb is not True:
            unirefs = UniRef50.objects(enzyme_type=self.enzyme_type_obj)
            seq_objects += list(unirefs)

        # Snapshot the node names once: rebuilding a list of every node for
        # each candidate made this check quadratic in the size of the network.
        existing_nodes = set(self.graph.nodes)
        not_in_nodes = [
            seq_obj for seq_obj in seq_objects
            if seq_obj.enzyme_name not in existing_nodes
            and seq_obj.sequence is not None
            and len(seq_obj.sequence) > 12
        ]

        # Return only up to the maximum number of sequences
        if max_num is not None and len(not_in_nodes) > max_num:
            not_in_nodes = not_in_nodes[0:max_num]

        t1 = time.time()
        self.log(
            f"Identified {len(not_in_nodes)} {self.enzyme_type} proteins which need adding, in {round(t1 - t0, 1)} seconds"
        )
        return not_in_nodes
Пример #4
0
def full_uniref_check(enzyme_type_obj):
    """Check every UniRef50 entry of an enzyme type and queue it for update.

    Each entry is loaded through ``UniRef_Parser``. Entries for which
    ``check_id_match`` returns a false value are deleted, after clearing
    ``blast`` on every sequence listed in their ``result_of_blasts_for``.
    Afterwards the enzyme type, and its SSN record if one exists, are marked
    ``'Queued for update'``.

    Args:
        enzyme_type_obj: The EnzymeType document whose UniRef50 entries are
            checked.
    """
    unirefs = UniRef50.objects(enzyme_type=enzyme_type_obj).select_related()
    for ur in unirefs:
        print(f'Checking {ur.enzyme_name}..')
        ref_parser = UniRef_Parser()
        ref_parser.load_xml(ur.enzyme_name)
        # NOTE(review): presumably throttles requests made by load_xml - confirm.
        time.sleep(0.2)

        # Explicit comparison kept on purpose: a ``None`` result must not be
        # treated as a mismatch and trigger a deletion.
        if ref_parser.check_id_match(ur.enzyme_name) == False:  # noqa: E712
            print(
                f"{ur.enzyme_name} doesnt match cluster id online, deleting.."
            )
            for seq in ur.result_of_blasts_for:
                seq.blast = None
                seq.save()
            ur.delete()

    # One query for the first record, instead of loading the whole result set
    # to test its length and then querying again for element 0.
    ssn_record = SSN_record.objects(enzyme_type=enzyme_type_obj).first()
    if ssn_record is not None:
        ssn_record.status = 'Queued for update'
        ssn_record.save()

    enzyme_type_obj.bioinformatics_status = 'Queued for update'
    enzyme_type_obj.save()

    print(f"Full UniRef50 update complete for {enzyme_type_obj.enzyme_type}")
Пример #5
0
    def _find_uniref_metadata(self):
        """Build a mapping of UniRef50 ``enzyme_name`` to its JSON-decoded document.

        The ``id``, ``enzyme_type``, ``sequence`` and ``result_of_blasts_for``
        fields are excluded from the query.
        """
        uniref_docs = UniRef50.objects(enzyme_type=self.enzyme_type_obj).exclude(
            'id', 'enzyme_type', 'sequence', "result_of_blasts_for")

        return {
            doc.enzyme_name: json.loads(doc.to_json())
            for doc in uniref_docs
        }
Пример #6
0
    def _make_db_fasta(self):
        """Write a FASTA file with every sequence of this enzyme type.

        The file is ``{self.directory}/{self.enz_type_dir_name}.fasta``. It
        holds the available Sequence entries followed by the UniRef50 entries,
        each as a ``>name`` header line and the sequence with newlines removed.
        """
        db_sequences = Sequence.objects(
            db.Q(enzyme_type=self.enzyme_type) & db.Q(sequence__ne="")
            & db.Q(sequence__ne=None) & db.Q(sequence_unavailable__ne=True))
        uniref_sequences = UniRef50.objects(
            db.Q(enzyme_type=self.enzyme_type_obj))

        fasta_path = f"{self.directory}/{self.enz_type_dir_name}.fasta"
        with open(fasta_path, 'w') as fasta:
            for record in [*db_sequences, *uniref_sequences]:
                header = record.enzyme_name
                residues = record.sequence.replace('\n', '')

                fasta.write(f'>{header}\n')
                fasta.write(f"{residues}\n")
Пример #7
0
    def _add_uniref(self, alignment, identifier, sequence, enzyme_type_obj,
                    seq_seed):
        """Create and save a UniRef50 document for a blast hit.

        Name, taxonomy and taxonomy id are parsed from ``alignment.title``.
        ``seq_seed`` becomes the only entry of ``result_of_blasts_for``. Any
        exception raised while building or saving the document is logged via
        ``self.log`` and not re-raised.
        """
        try:
            header = alignment.title
            fields = {
                'enzyme_name': identifier,
                'protein_name': self._get_name_from_header(header),
                'tax': self._get_tax_from_header(header),
                'tax_id': self._get_tax_id_from_header(header),
                'sequence': sequence,
                'enzyme_type': enzyme_type_obj,
                'result_of_blasts_for': [seq_seed],
                'blast_round': self.blast_round,
            }
            UniRef50(**fields).save()
        except Exception as e:
            self.log(e)
Пример #8
0
def clear_empty_ssns():
    """Delete SSN records whose enzyme type has no sequences to build a network from.

    A record is removed when its enzyme type has neither UniRef50 entries nor
    Sequence entries with a non-empty, available sequence.

    Returns:
        The ``jsonify`` response wrapping a ``result`` dict with ``status``,
        ``msg`` and ``issues`` keys.
    """
    ssn_records = SSN_record.objects().select_related()

    for ssn_r in ssn_records:
        enzyme_type_obj = ssn_r.enzyme_type
        unirefs = UniRef50.objects(enzyme_type=enzyme_type_obj)
        biocat_seqs = Sequence.objects(
            db.Q(enzyme_type=enzyme_type_obj.enzyme_type)
            & db.Q(sequence__ne="") & db.Q(sequence__ne=None)
            & db.Q(sequence_unavailable__ne=True))

        # count() asks the database for the number of matches; len() would
        # load every matching document only to count them.
        if unirefs.count() == 0 and biocat_seqs.count() == 0:
            ssn_r.delete()

    result = {'status': 'success', 'msg': 'Empty SSNs removed', 'issues': []}
    return jsonify(result=result)
Пример #9
0
    def remove_nonexisting_seqs(self):
        """Remove graph nodes whose name is no longer present in the database.

        A node is kept when it equals the ``enzyme_name`` of a Sequence or
        UniRef50 document of this enzyme type. Every removed node is logged,
        followed by a summary with the count and elapsed time.
        """
        t0 = time.time()
        sequences = Sequence.objects(
            enzyme_type=self.enzyme_type).distinct('enzyme_name')
        unirefs = UniRef50.objects(
            enzyme_type=self.enzyme_type_obj).distinct('enzyme_name')
        # A set gives constant-time membership tests; the previous list was
        # scanned once per node.
        protein_names = set(sequences) | set(unirefs)
        count = 0
        # Iterate over a copy because nodes are removed inside the loop.
        for node in list(self.graph.nodes):
            if node not in protein_names:
                self.log(f"Node: {node} not in the database - removing")
                self.graph.remove_node(node)
                count += 1

        t1 = time.time()
        self.log(
            f"Identified {count} sequences which were in SSN but not in database, in {round(t1 - t0, 1)} seconds"
        )
Пример #10
0
def load_uniref_data():
    """Return summary information about one UniRef50 cluster as JSON.

    Reads ``name`` and ``enzyme_type`` from the request form, looks up the
    matching UniRef50 document, loads the cluster through ``UniRef_Parser``
    and, unless the stripped name starts with ``'UP'``, loads Pfam data
    through ``UniProt_Parser``.

    Returns:
        The ``jsonify`` response wrapping a ``result`` dict with the
        representative sequence name, organism and UniProt id, the cluster id,
        member counts and the Pfam data.
    """
    name = request.form['name']
    enzyme_type = request.form['enzyme_type']
    enzyme_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]

    matches = UniRef50.objects(
        db.Q(enzyme_type=enzyme_type_obj) & db.Q(enzyme_name=name))
    representative = matches[0]
    protein_name = representative.protein_name
    organism = representative.tax

    uniprot_id = retrieve_uniref_info.strip_uniref_name(name)
    if uniprot_id[:2] == 'UP':
        uniprot_id = ""

    cluster_parser = retrieve_uniref_info.UniRef_Parser()
    cluster_parser.load_xml(name)
    uni90, uni100, uniprot = cluster_parser.get_uniref_members()
    cluster_id = cluster_parser.get_cluster_name()

    pfams = []
    if uniprot_id != "":
        protein_parser = retrieve_uniref_info.UniProt_Parser()
        protein_parser.load_xml(uniprot_id)
        pfams = protein_parser.get_pfams()

    return jsonify(result={
        'rep_seq_name': protein_name,
        'rep_seq_organism': organism,
        'rep_seq_uniprot_id': uniprot_id,
        'cluster_id': cluster_id,
        'num_uni90': len(uni90),
        'num_uni100': len(uni100),
        'num_uniprot': len(uniprot.keys()),
        'pfam_object': pfams,
    })
Пример #11
0
def bioinformatics_admin_page():
    """Render the bioinformatics admin page with per-enzyme-type statistics.

    For every enzyme type, ordered by name, the template receives:

    * ``biostat``: its ``bioinformatics_status``;
    * ``ssn``: the status of its first SSN record, or the string ``'None'``;
    * ``enzyme_numbers``: counts of Sequence (``biocatdb``) and UniRef50
      (``uniref``) documents;
    * ``blasted_enz_types``: the rounded percentage of its sequences whose
      ``blast`` is set, or ``0`` when none are.

    ``num_jobs`` is the count of the started-job registry of the blast queue.
    """
    enzyme_types = EnzymeType.objects().order_by('enzyme_type')

    biostat = {}
    ssn = {}
    enzyme_numbers = {}
    enz_type_dict = {}
    # A single pass per enzyme type: sequences are fetched once and reused for
    # both the total and the blasted percentage.
    for enz_type_obj in enzyme_types:
        enz_type = enz_type_obj.enzyme_type
        biostat[enz_type] = enz_type_obj.bioinformatics_status

        ssn_record = SSN_record.objects(enzyme_type=enz_type_obj).first()
        ssn[enz_type] = ssn_record.status if ssn_record is not None else 'None'

        seqs = list(Sequence.objects(enzyme_type=enz_type))
        enzyme_numbers[enz_type] = {
            'biocatdb': len(seqs),
            # count() avoids loading every UniRef50 document just to count.
            'uniref': UniRef50.objects(enzyme_type=enz_type_obj).count(),
        }

        num_blasted = sum(1 for seq in seqs if seq.blast is not None)
        if num_blasted != 0:
            enz_type_dict[enz_type] = round((num_blasted/len(seqs))*100, 0)
        else:
            enz_type_dict[enz_type] = 0

    registry = StartedJobRegistry(queue=current_app.blast_queue)
    num_jobs = registry.count

    return render_template('bioinformatics/bioinformatics_admin.html',
                           blasted_enz_types=enz_type_dict,
                           biostat=biostat,
                           ssn=ssn,
                           num_jobs=num_jobs,
                           enzyme_numbers=enzyme_numbers)
Пример #12
0
def ssn_object():
    """Return the SSN status and precalculated score choices for an enzyme type.

    Reads ``enzyme_type`` from the request form and uses the first matching
    EnzymeType and SSN_record documents.

    Returns:
        The ``jsonify`` response wrapping a ``result`` dict with the SSN
        ``status``, the Sequence and UniRef50 counts, and a ``precalculated``
        mapping of alignment score to a descriptive label.
    """
    enzyme_type = request.form['enzyme_type']
    enzyme_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]
    ssn_obj = SSN_record.objects(enzyme_type=enzyme_type_obj)[0]

    num_biocatdb = Sequence.objects(enzyme_type=enzyme_type).count()
    num_uniref = UniRef50.objects(enzyme_type=enzyme_type_obj).count()

    precalc_choices = {}
    for score, clusters in ssn_obj.num_at_alignment_score.items():
        idt = ssn_obj.identity_at_alignment_score[score]
        precalc_choices[score] = f"{score}, {clusters} clusters, avg identity {idt[0]} ± {idt[1]}"

    return jsonify(result={
        'status': ssn_obj.status,
        'num_biocatdb': num_biocatdb,
        'num_uniref': num_uniref,
        'precalculated': precalc_choices,
    })
Пример #13
0
    def parse(self, output, seq_obj):
        """Record the alignments of a blast record against the UniRef50 collection.

        For each alignment that passes ``_alignment_filters``: if a UniRef50
        document with the hit's identifier already exists for the enzyme type,
        ``seq_obj`` is added to it via ``_add_result_of_blasts_for``.
        Otherwise the sequence is fetched and, if it passes
        ``_sequence_filters``, a new UniRef50 document is added.

        Args:
            output: Blast record exposing an ``alignments`` iterable.
            seq_obj: The sequence object that was blasted.
        """
        query_length = len(seq_obj.sequence)
        enzyme_type_obj = EnzymeType.objects(
            enzyme_type=seq_obj.enzyme_type)[0]

        for alignment in output.alignments:
            identifier = alignment.hit_id.replace(self.identifier_head, '')
            if not self._alignment_filters(alignment, query_length):
                continue

            existing = UniRef50.objects(
                db.Q(enzyme_name=identifier)
                & db.Q(enzyme_type=enzyme_type_obj)).select_related()
            if len(existing) != 0:
                # Already known: just record this sequence as another source.
                self._add_result_of_blasts_for(seq_obj, existing[0])
                continue

            protein_sequence = self._get_sequence(identifier)
            if self._sequence_filters(protein_sequence, query_length):
                self.log(f"Adding sequence for {identifier}")
                self._add_uniref(alignment, identifier, protein_sequence,
                                 enzyme_type_obj, seq_obj)
Пример #14
0
def check_random_uniref(num_to_check=25):
    """Spot-check random UniRef50 entries of every enzyme type.

    For each enzyme type with UniRef50 entries, up to ``num_to_check``
    randomly chosen entries are loaded through ``UniRef_Parser``. On the first
    entry for which ``check_id_match`` returns a false value,
    ``full_uniref_check`` is run for that enzyme type.

    Args:
        num_to_check: Maximum number of random entries checked per enzyme type.
    """
    for enzyme_type in EnzymeType.objects():

        unirefs = UniRef50.objects(enzyme_type=enzyme_type)
        if len(unirefs) == 0:
            continue

        all_match = True
        for _ in range(num_to_check):
            rand_uniref = random.choice(unirefs)
            name = rand_uniref.enzyme_name
            ref_parser = UniRef_Parser()
            ref_parser.load_xml(name)
            # NOTE(review): presumably throttles requests made by load_xml - confirm.
            time.sleep(0.2)

            # Explicit comparison kept so a ``None`` result is not treated as
            # a mismatch.
            if ref_parser.check_id_match(name) == False:  # noqa: E712
                all_match = False
                # One mismatch already triggers the full check below, so
                # further random checks would only repeat work.
                break

        if not all_match:
            print(
                f'Identified mismatches with online uniref entries..  full uniref check for {enzyme_type.enzyme_type}'
            )
            full_uniref_check(enzyme_type)

    print('Uniref checks complete ')
Пример #15
0
 def _get_sequence_object(enzyme_name):
     """Return the first document named ``enzyme_name``.

     Names containing ``'UniRef50'`` are looked up in UniRef50, all others in
     Sequence.
     """
     model = UniRef50 if 'UniRef50' in enzyme_name else Sequence
     return model.objects(enzyme_name=enzyme_name)[0]
Пример #16
0
def task_check_uniref_has_blast_source():
    """Delete every UniRef50 document whose ``result_of_blasts_for`` list is empty."""
    print('Checking for unirefs with no blast source..')
    for orphan in UniRef50.objects(result_of_blasts_for__size=0):
        print(f"Deleting {orphan.enzyme_name}")
        orphan.delete()