Пример #1
0
def main():
    """ Command line entry point: per-position class probabilities for every
    sequence of a fasta file.

    The parameters come from get_params(); the attributes read here are
    `model` (joblib file holding a dictionary with the keys "scaler" and
    "model"), `fastafile` (input sequences) and `outputfile` (result file).

    For each sequence a ">name" line is written, followed by one line per row
    of the predicted probabilities: 1-based position, residue and the value of
    the first probability column.

    The function always terminates the interpreter with exit status 0.
    """
    params = get_params()

    from pyHCA.core.ioHCA import read_multifasta_it

    # rank of the twenty standard residues starts at 1, "X" gets rank 0
    standard_residues = "ACDEFGHIKLMNPQRSTVWY"
    AA_sorted = {
        letter: rank for rank, letter in enumerate(standard_residues, start=1)
    }
    AA_sorted.setdefault("X", 0)

    compute_features = compute_features3  # all_compute_features[params.method]

    # the joblib file bundles the fitted scaler and the trained classifier
    bundle = joblib.load(params.model)
    rbs_scaler, trained_clf = bundle["scaler"], bundle["model"]

    with open(params.outputfile, "w") as outf:
        for prot, record in read_multifasta_it(params.fastafile):
            seq = str(record.seq).upper()
            domains, clusters = prepare_sequence(seq)
            raw_features = compute_features(seq, domains, clusters, AA_sorted)
            probas = trained_clf.predict_proba(rbs_scaler.transform(raw_features))

            outf.write(">{}\n".format(prot))
            for pos in range(len(probas)):
                line = "{} {} {}\n".format(pos + 1, seq[pos], probas[pos, 0])
                outf.write(line)
    sys.exit(0)
Пример #2
0
def domain_sequence(inputf, domainf, outputf, verbose=False):
    """ Write the sub-sequence of every annotated domain in fasta format

    Parameters
    ----------
    inputf : string
        path of the fasta file holding the full sequences
    domainf : string
        path of the domain annotation, read with the "seghca" format
    outputf : string
        path of the fasta file to create
    verbose : bool
        accepted for interface compatibility, not used here

    Each domain produces a header ">name-index start-stop" (index counted from
    0 within the protein, start shifted by one) followed by the sequence slice
    [start: stop]. Proteins without annotation produce no output.
    """
    # domain annotation, looked up by protein name
    annotation = read_annotation(domainf, "seghca")
    with open(outputf, "w") as outf:
        for prot, record in read_multifasta_it(inputf):
            residues = str(record.seq)
            protein_domains = annotation.get(prot, [])
            for index, entry in enumerate(protein_domains):
                begin, end = entry[:2]
                fragment = residues[begin: end]
                outf.write(">{}-{} {}-{}\n{}\n".format(prot, index, begin+1, end, fragment))
Пример #3
0
def _iter_named_records(dseq):
    """ Return an iterator of (name, record) pairs for `dseq`.

    A dictionary is iterated through its items; any other value is handed to
    read_multifasta_it as the path of a fasta file.
    """
    if isinstance(dseq, dict):
        return iter(dseq.items())
    return read_multifasta_it(dseq)


def _write_position_scores(outf, annotations, length, method):
    """ Write one "score<TAB>flag" line for each of the `length` positions.

    With method "domain" the flag is 1 for positions covered by one of
    annotations["domain"] (slice start: stop) and 0 elsewhere; with any other
    method the flag is the literal text NaN.
    """
    if method == "domain":
        posdomains = np.zeros(length, dtype=np.uint8)
        for domannot in annotations["domain"]:
            posdomains[domannot.start: domannot.stop] = 1
        for i in range(length):
            outf.write("{:.5f}\t{}\n".format(annotations["scores"][i], posdomains[i]))
    else:
        for i in range(length):
            outf.write("{:.5f}\tNaN\n".format(annotations["scores"][i]))


def _scores(output, dseq, seq_type="aminoacid", t=0.1, method="domain", verbose=False, dist=16):
    """ Write the per-position scores of every sequence to a file. Two methods
    are available: 'domain' and 'cluster'

    Parameters
    ----------
    output : string
        path of the file to write
    dseq : dictionary or string
        the biological sequences, either a dictionary (keys are string, values
        are sequence records) or the path of a fasta file read with
        read_multifasta_it
    seq_type: string
        "aminoacid" to score the sequences directly, any other value to score
        the frames returned by six_frames for each sequence
    t : float
        parameter controlling the domain creation based on cluster density,
        forwarded to _annotation_aminoacids
    method : string
        the method used, forwarded to _annotation_aminoacids
        domain: the second output column flags the positions inside a domain
        otherwise: the second output column is NaN
    verbose: bool
        forwarded to _annotation_aminoacids
    dist: int
        forwarded to _annotation_aminoacids

    Return:
    -------
    None, the results are written to `output`: a ">name length" line for each
    sequence (or frame) followed by one "score<TAB>flag" line per position.
    """
    with open(output, "w") as outf:
        if seq_type == "aminoacid":
            for prot, sequence in _iter_named_records(dseq):
                annotations = _annotation_aminoacids(sequence, t=t, method=method,
                                                    verbose=verbose, dist=dist)
                outf.write(">{} {}\n".format(prot, len(sequence)))
                _write_position_scores(outf, annotations, len(sequence), method)
        else:
            cnt, nb_dot = 0, 0
            for name, sequence in _iter_named_records(dseq):
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                    if nb_dot == 80:
                        nb_dot = 0
                        sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(name, direction, frame+1, start+1)

                    annotations = _annotation_aminoacids(protseq, t=t, method=method,
                                                        verbose=verbose, dist=dist)
                    # frames for which nothing was computed are skipped entirely
                    if annotations:
                        outf.write(">{} {}\n".format(new_name, len(protseq)))
                        _write_position_scores(outf, annotations, len(protseq), method)
            sys.stdout.write("\n")
Пример #4
0
def _annotation(output, inputf, seq_type="aminoacid", t=0.1, method="domain", verbose=False):
    """ The main annotation function. Two methods are available: 'domain' and
    'cluster'

    Parameters
    ----------
    output : string
        path of the file to write
    inputf: string
        path of the input file, read with read_multifasta_it
    seq_type: string
        "aminoacid" to annotate the sequences directly, any other value to
        annotate the frames returned by six_frames for each sequence
    t : float
        parameter controlling the domain creation based on cluster density,
        forwarded to _annotation_aminoacids
    method : string
        the method used, forwarded to _annotation_aminoacids
    verbose: bool
        forwarded to read_multifasta_it and _annotation_aminoacids

    Return:
    -------
    None, the results are written to `output`: a ">name length" line for each
    sequence (or frame), then one line per domain annotation and one line per
    cluster annotation.
    """
    with open(output, "w") as outf:
        if seq_type == "aminoacid":
            for prot, sequence in read_multifasta_it(inputf, verbose):
                annotations = _annotation_aminoacids(sequence, t=t, method=method, verbose=verbose)
                outf.write(">{} {}\n".format(prot, len(sequence)))
                for domannot in annotations["domain"]:
                    outf.write("{}\n".format(str(domannot)))
                for clustannot in annotations["cluster"]:
                    outf.write("{}\n".format(str(clustannot)))
        else:
            cnt, nb_dot = 0, 0
            for name, sequence in read_multifasta_it(inputf, verbose):
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                    if nb_dot == 80:
                        nb_dot = 0
                        sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(name, direction, frame+1, start+1)

                    annotations = _annotation_aminoacids(protseq, t=t, method=method, verbose=verbose)

                    # the header is written for every frame, including frames
                    # without any domain or cluster
                    outf.write(">{} {}\n".format(new_name, len(protseq)))
                    for domannot in annotations["domain"]:
                        outf.write("{}\n".format(str(domannot)))
                    for clustannot in annotations["cluster"]:
                        outf.write("{}\n".format(str(clustannot)))
            sys.stdout.write("\n")
Пример #5
0
# Raw string: the warning banner ends its lines with a backslash, which a
# regular string literal would treat as a line continuation and swallow
# together with the following newline.
_SEGMENTATION_HEADER = r"""# pyHCA v0.1 segmentation results
# 
# Format:
# 
# >'protein_id' 'protein_length' 'hca_score computed on the whole sequence'
# domain 'domain_start' 'domain_stop' 'hca_score' 'hca_pvalue' (if -m domain is used)
# cluster 'cluster_start' 'cluster_stop' 'cluster_pattern'
# 
# The hca_score and associated p-value provide a way to measure the foldability
# of a protein, i.e how similar is the score compared to scores from disordered
# sequences.
# Low p-values correspond to scores at the tail of the distribution of scores 
# for disordered protein sequences.
# 
# /!\ Warning /!\
# 1- The score computed at the whole protein level (in the line with '>') is for 
# information only as some people found it useful.
# No p-value is associated to this score as the scores used in the distributions
# don't come from full protein sequences but domain or "disordered regions" of 
# comparable lengths.
#
# 2- similarly, scores are displayed even for HCA domain shorted than 30 amino 
# acids. 
# As the sequences of length lower than 30 amino acids where filtered out to
# compute distributions of scores, no p-values are given.
# 
# In these two cases, the scores provided must be analyzed carefully, keeping
# in mind their origin and initial purpose
# /!\ Warning /!\
#
#

"""


def _annotation(output,
                inputf,
                seq_type="aminoacid",
                t=0.1,
                method="domain",
                verbose=False):
    """ The main annotation function. Two methods are available: 'domain' and
    'cluster'

    Parameters
    ----------
    output : string
        path of the file to write
    inputf: string
        path of the input file, read with read_multifasta_it
    seq_type: string
        "aminoacid" to annotate the sequences directly, any other value to
        annotate the frames returned by six_frames for each sequence
    t : float
        parameter controlling the domain creation based on cluster density,
        forwarded to _annotation_aminoacids
    method : string
        the method used, forwarded to _annotation_aminoacids
    verbose: bool
        forwarded to read_multifasta_it and _annotation_aminoacids

    Return:
    -------
    None, the results are written to `output`: a commented description of the
    format, then for each sequence (or frame) a ">" header line followed by
    one line per domain annotation and one line per cluster annotation.
    """
    with open(output, "w") as outf:
        outf.write(_SEGMENTATION_HEADER)
        if seq_type == "aminoacid":
            for prot, sequence in read_multifasta_it(inputf, verbose):
                annotations = _annotation_aminoacids(sequence,
                                                     t=t,
                                                     method=method,
                                                     verbose=verbose)
                # statistics over the whole sequence, from its clusters
                score, pvalue = compute_disstat(0, len(sequence),
                                                annotations["cluster"])
                # NOTE(review): this line writes the p-value and then the
                # score, while the file header documents a single score
                # after the length (as the nucleotide branch does). Kept as
                # is, confirm against the readers of this format.
                outf.write(">{} {} {:.3f} {:.3f}\n".format(
                    prot, len(sequence), pvalue, score))
                for domannot in annotations["domain"]:
                    outf.write("{}\n".format(str(domannot)))
                for clustannot in annotations["cluster"]:
                    outf.write("{}\n".format(str(clustannot)))
        else:
            cnt, nb_dot = 0, 0
            for name, sequence in read_multifasta_it(inputf, verbose):
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots
                    # per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                    if nb_dot == 80:
                        nb_dot = 0
                        sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(
                        name, direction, frame + 1, start + 1)

                    annotations = _annotation_aminoacids(protseq,
                                                         t=t,
                                                         method=method,
                                                         verbose=verbose)
                    # only the score is reported at the frame level
                    score, _ = compute_disstat(0, len(protseq),
                                               annotations["cluster"])

                    # the header is written for every frame, including
                    # frames without any domain or cluster
                    outf.write(">{} {} {:.3f}\n".format(
                        new_name, len(protseq), score))
                    for domannot in annotations["domain"]:
                        outf.write("{}\n".format(str(domannot)))
                    for clustannot in annotations["cluster"]:
                        outf.write("{}\n".format(str(clustannot)))
            sys.stdout.write("\n")