Exemplo n.º 1
0
def test_too_big_hmms():
    """Check that detection with HMMs built from long MSAs yields no repeats.

    For each (MSA, sequence) pair an HMM is created from the repeat, run
    against the sequence, and the result must be an empty ``RepeatList``.
    """
    cases = (
        (TEST_RESULT_REPEAT_MSA_LONG, TEST_SEQUENCE_A),
        (TEST_RESULT_REPEAT_MSA_SUPER_LONG, TEST_SEQUENCE_SUPER_LONG_A),
    )

    for msa, raw_sequence in cases:
        source_repeat = repeat.Repeat(msa=msa)
        model = HMM.create(input_format='repeat', repeat=source_repeat)
        target = sequence.Sequence(raw_sequence)

        detected = target.detect([model])

        assert type(detected) == repeat_list.RepeatList
        assert len(detected.repeats) == 0
Exemplo n.º 2
0
def test_sequence_pickle():
    """Round-trip a ``Sequence`` through a pickle file, with and without repeats.

    The sequence is written and re-read once in its plain state, then again
    after a detected repeat list has been attached under
    ``TEST_SEQUENCE_TAG``. The temporary pickle file is removed even when an
    assertion fails, so a failing run does not leave ``test.pickle`` behind.
    """
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    test_pickle = os.path.join(path(), "test.pickle")

    try:
        # Round trip of the bare sequence.
        test_seq.write(test_pickle, 'pickle')
        test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')

        assert test_seq.seq == test_seq_new.seq

        # Attach a repeat list detected with an HMM built from a repeat.
        test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE)
        test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
        test_optimized_repeat = test_seq.detect([test_hmm])
        test_seq.set_repeatlist(test_optimized_repeat, TEST_SEQUENCE_TAG)

        assert type(test_optimized_repeat) == repeat_list.RepeatList
        assert list(test_seq.d_repeatlist.keys()) == [TEST_SEQUENCE_TAG]
        assert type(
            test_seq.d_repeatlist[TEST_SEQUENCE_TAG]) == repeat_list.RepeatList
        assert test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats

        test_retrieved_repeatlist = test_seq.get_repeatlist(TEST_SEQUENCE_TAG)
        assert test_retrieved_repeatlist == test_optimized_repeat

        # Round trip of the sequence including its repeat list.
        test_seq.write(test_pickle, 'pickle')
        test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')

        assert test_seq.d_repeatlist.keys() == test_seq_new.d_repeatlist.keys()
        assert test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats[
            0].msa == test_seq_new.d_repeatlist[TEST_SEQUENCE_TAG].repeats[0].msa
    finally:
        # Clean up regardless of the test outcome.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
Exemplo n.º 3
0
def test_detect_TRUST():
    """Run the TRUST detector on ``TEST_SEQUENCE_Q9BRR0`` and check its output.

    Expects three predicted repeats and compares the MSA of the first one
    against a stored reference alignment.
    """
    # Warning: TRUST finds these results ONLY with the BLOSUM50 substitution matrix.
    expected_first_msa = [
        'HLREDIAQIP---TCAEAGE---QEGRLQR',
        'KQKNATGGRR--HICHECGKSFAQSSGLSK',
        'HRRIHTGEKP--YECEECGKAFIGSSALVI',
        'HQRVHTGEKP--YECEECGKAFSHSSDLIK',
        'HQRTHTGEKP--YECDDCGKTFSQSCSLLE',
        'HHRIHTGEKP--YQCSMCGKAFRRSSHLLR',
        'HQRIHTGDKN--VQEPEQGEAW--KSRM--',
        'ESQLENVETPmsYKCNECERSFTQNTGLIE',
        'HQKIHTGEKP--YQCNACGKGFTRISYLVQ',
    ]

    query = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    per_sequence_results = repeat_detection_run.run_detector(
        seq_records=[query], detectors=["TRUST"])
    trust_hits = per_sequence_results[0]['TRUST']

    assert len(trust_hits) == 3
    assert trust_hits[0].msa == expected_first_msa
Exemplo n.º 4
0
def test_detect_repeats_denovo():
    """De novo detection restricted to TRUST must return a 3-repeat list."""
    query = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)

    # Only the TRUST detector is requested for this de novo run.
    detected = query.detect(denovo=True,
                            detection={"detectors": ["TRUST"]})

    assert type(detected) == repeat_list.RepeatList
    assert len(detected.repeats) == 3
Exemplo n.º 5
0
def test_serialize_repeat_list_tsv():
    """Serialising a ``RepeatList`` to TSV with ``return_string`` gives a str.

    Only the first two entries of ``TEST_REPEATS`` are used; each repeat is
    registered on the test sequence before the list is built.
    """
    repeats_under_test = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS[:2]]
    host_sequence = sequence.Sequence(TEST_SEQUENCE)
    for single_repeat in repeats_under_test:
        host_sequence.repeat_in_sequence(single_repeat)

    collection = rl.RepeatList(repeats=repeats_under_test)
    serialized = collection.write("tsv", return_string=True)

    assert type(serialized) == str
Exemplo n.º 6
0
def test_serialize_repeat_list_tsv():
    """``rl_io.serialize_repeat_list_tsv`` must return a str for a RepeatList.

    All entries of ``TEST_REPEATS`` are used; each repeat is registered on
    the test sequence before the list is built.
    """
    repeats_under_test = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    host_sequence = sequence.Sequence(TEST_SEQUENCE)
    for single_repeat in repeats_under_test:
        host_sequence.repeat_in_sequence(single_repeat)

    collection = rl.RepeatList(repeats=repeats_under_test)
    serialized = rl_io.serialize_repeat_list_tsv(collection)

    assert type(serialized) == str
Exemplo n.º 7
0
def test_detect_repeats_with_repeat():
    """Detect repeats on ``TEST_SEQUENCE`` using HMMs derived from repeats.

    For both the double and the single input MSA, exactly one repeat must be
    found and its MSA must equal the corresponding stored result.
    """
    target = sequence.Sequence(TEST_SEQUENCE)
    cases = (
        (TEST_REPEAT_MSA_DOUBLE, TEST_RESULT_REPEAT_MSA_DOUBLE),
        (TEST_REPEAT_MSA_SINGLE, TEST_RESULT_REPEAT_MSA_SINGLE),
    )

    for input_msa, expected_msa in cases:
        seed_repeat = repeat.Repeat(msa=input_msa)
        model = HMM.create(input_format='repeat', repeat=seed_repeat)

        detected = target.detect([model])

        assert type(detected) == repeat_list.RepeatList
        assert len(detected.repeats) == 1
        assert detected.repeats[0].msa == expected_msa
Exemplo n.º 8
0
def workflow(sequences_file,
             hmm_annotation_file,
             hmm_dir,
             result_file,
             result_file_serialized,
             format,
             max_time,
             time_interval=3600,
             next_time=3600,
             **kwargs):
    ''' Annotate sequences with TRs from multiple sources, test and refine annotations.

     Save the annotations in a pickle and as a serialized table.

     Args:
         sequences_file (str): Path to the sequence file; it is opened with
            ``Fasta``.
         hmm_annotation_file (str): Path to a pickle whose content is looked
            up by sequence name and yields the HMM IDs for that sequence.
         hmm_dir (str): Path to directory where all HMMs are stored as
            ``<hmm_ID>.pickle``. It is created if it does not exist.
         result_file (str): Path to the pickle result file. If it already
            exists, its content is loaded and sequences found in it are
            skipped.
         result_file_serialized (str): Path to the serialized result file.
         format (str): Format of the serialized result file. Only ``'tsv'``
            is supported.
         max_time (int or str): Run time in seconds after which results are
            checkpointed before every further sequence. The run is not
            aborted.
         time_interval (int or str): Seconds between two checkpoints.
         next_time (int or str): Elapsed seconds at which the first
            checkpoint is written.
         **kwargs: Accepted but not used.

     Raises:
        ValueError: If ``format`` is not ``'tsv'``.
        Exception: If ``sequences_file`` cannot be loaded
        Exception: If ``hmm_dir`` or the directory of ``result_file`` does
            not exist and cannot be created
        Exception: If ``hmm_annotation_file`` cannot be loaded
        Exception: If a TR cannot be serialized

    '''

    # Fail before the expensive annotation instead of after it: only the tsv
    # serialization is implemented below.
    if format != 'tsv':
        raise ValueError(
            "Unsupported serialization format: {!r} (only 'tsv' is "
            "supported)".format(format))

    start = datetime.datetime.now()
    max_time, time_interval, next_time = int(max_time), int(
        time_interval), int(next_time)

    try:
        l_sequence = Fasta(sequences_file)
    except Exception as err:
        raise Exception(
            "Cannot load putative pickle file sequences_file: {}".format(
                sequences_file)) from err

    if not os.path.isdir(hmm_dir):
        try:
            os.makedirs(hmm_dir)
        except Exception as err:
            raise Exception(
                "hmm_dir does not exists and could not be created: {}".format(
                    hmm_dir)) from err

    # NOTE: pickle.load executes arbitrary code from the file; only use
    # trusted annotation and result files.
    try:
        with open(hmm_annotation_file, 'rb') as fh:
            dHMM_annotation = pickle.load(fh)
    except Exception as err:
        raise Exception(
            "Cannot load hmm_annotation_file: {}".format(
                hmm_annotation_file)) from err

    basic_filter = CONFIG['filter']['basic']['dict']
    basic_filter_tag = CONFIG['filter']['basic']['tag']

    # Make sure the directory of the result file exists. A bare file name has
    # an empty dirname, which must not be passed to os.makedirs.
    try:
        result_dir = os.path.dirname(result_file)
        if result_dir and not os.path.isdir(result_dir):
            os.makedirs(result_dir)
    except Exception as err:
        raise Exception(
            "Could not create path to result_file directory: {}".format(
                os.path.dirname(result_file))) from err

    # Load previous results
    try:
        with open(result_file, 'rb') as fh:
            dResults = pickle.load(fh)
    except Exception:
        LOG.debug(
            "Could not load previous results file - perhaps non existant: {}".
            format(result_file))
        dResults = {}

    # Cache of loaded HMMs, shared across sequences.
    dHMM = {}
    for iS_pyfaidx in l_sequence:

        # If sequence is already included in results: continue.
        if iS_pyfaidx.name in dResults:
            continue

        # total_seconds() keeps counting beyond 24 hours, whereas the
        # ``seconds`` attribute wraps around at one day.
        elapsed_time = (datetime.datetime.now() - start).total_seconds()
        if elapsed_time > max_time or elapsed_time > next_time:
            with open(result_file, 'wb') as fh:
                pickle.dump(dResults, fh)
            next_time = next_time + time_interval

        iS = sequence.Sequence(seq=str(iS_pyfaidx), name=iS_pyfaidx.name)

        LOG.debug("Work on sequence {}".format(iS))
        # 1. annotate_de_novo()
        denovo_repeat_list = iS.detect(denovo=True,
                                       repeat={"calc_pvalue": True})
        LOG.debug(denovo_repeat_list.repeats)
        for iTR in denovo_repeat_list.repeats:
            iTR.model = None

        # 2. annotate_TRs_from_hmmer()
        if iS.name in dHMM_annotation:
            lHMM = dHMM_annotation[iS.name]
            infoNRuns = len(lHMM)
            LOG.debug(
                "{} Viterbi runs need to be performed.".format(infoNRuns))
            lHMM = set(lHMM)
            infoNHMM = len(lHMM)
            LOG.debug(
                "These derive from {} independent HMMs.".format(infoNHMM))
            # Load all HMM pickles needed for the particular sequence.
            for hmm_ID in lHMM:
                if hmm_ID not in dHMM:
                    dHMM[hmm_ID] = hmm.HMM.create(file_format="pickle",
                                                  file=os.path.join(
                                                      hmm_dir,
                                                      hmm_ID + ".pickle"))

            pfam_repeat_list = iS.detect([dHMM[hmm_ID] for hmm_ID in lHMM],
                                         repeat={"calc_pvalue": True})
            # NOTE(review): this pairing assumes detect() returns exactly one
            # repeat per HMM, in the order the HMMs were passed - confirm.
            for iTR, hmm_ID in zip(pfam_repeat_list.repeats, lHMM):
                iTR.model = hmm_ID
                iTR.TRD = "PFAM"
        else:
            # NOTE(review): the merge and intersection calls below receive
            # None in this case; presumably RepeatList tolerates that -
            # confirm.
            pfam_repeat_list = None

        # 3. merge_and_basic_filter()
        all_repeat_list = denovo_repeat_list + pfam_repeat_list
        iS.set_repeatlist(all_repeat_list, REPEAT_LIST_TAG)
        iS.set_repeatlist(denovo_repeat_list, DE_NOVO_ALL_TAG)
        iS.set_repeatlist(pfam_repeat_list, PFAM_ALL_TAG)

        rl_tmp = iS.get_repeatlist(REPEAT_LIST_TAG)
        if rl_tmp:
            for iB in basic_filter.values():
                rl_tmp = rl_tmp.filter(**iB)
        iS.set_repeatlist(rl_tmp, basic_filter_tag)
        iS.set_repeatlist(rl_tmp.intersection(iS.get_repeatlist(PFAM_ALL_TAG)),
                          PFAM_TAG)
        iS.set_repeatlist(
            rl_tmp.intersection(iS.get_repeatlist(DE_NOVO_ALL_TAG)),
            DE_NOVO_TAG)

        # 4. calculate_overlap()

        # Perform common ancestry overlap filter and keep PFAMs
        criterion_pfam_fixed = {
            "func_name": "none_overlapping_fixed_repeats",
            "rl_fixed": iS.get_repeatlist(PFAM_TAG),
            "overlap_type": "common_ancestry"
        }

        iS.d_repeatlist[DE_NOVO_TAG] = iS.get_repeatlist(DE_NOVO_TAG).filter(
            **criterion_pfam_fixed)

        # Choose only the most convincing de novo TRs
        criterion_filter_order = {
            "func_name":
            "none_overlapping",
            "overlap": ("common_ancestry", None),
            "l_criterion": [("pvalue", "phylo_gap01"),
                            ("divergence", "phylo_gap01")]
        }
        iS.d_repeatlist[DE_NOVO_TAG] = iS.get_repeatlist(DE_NOVO_TAG).filter(
            **criterion_filter_order)

        # 5. refine_denovo()
        denovo_all = iS.get_repeatlist(DE_NOVO_ALL_TAG).repeats
        denovo_selected = iS.get_repeatlist(DE_NOVO_TAG).repeats
        denovo_final = []
        # One slot per unfiltered de novo TR: None if the TR was filtered
        # out, False if the refinement found nothing, else the refined TR.
        denovo_refined = [None] * len(denovo_all)
        for i, iTR in enumerate(denovo_all):
            if iTR not in denovo_selected:
                continue
            # Create HMM from TR
            denovo_hmm = hmm.HMM.create(file_format='repeat', repeat=iTR)
            # Run HMM on sequence
            denovo_refined_rl = iS.detect(lHMM=[denovo_hmm])
            append_refined = False
            if denovo_refined_rl and denovo_refined_rl.repeats:
                iTR_refined = denovo_refined_rl.repeats[0]
                iTR_refined.TRD = iTR.TRD
                iTR_refined.model = "cpHMM"
                denovo_refined[i] = iTR_refined
                # Check whether new and old TR overlap. Check whether new TR is
                # significant. If not both, put unrefined TR into final.
                if repeat_list.two_repeats_overlap("shared_char", iTR,
                                                   iTR_refined):
                    rl_tmp = repeat_list.RepeatList([iTR_refined])
                    LOG.debug(iTR_refined.msa)
                    for iB in basic_filter.values():
                        rl_tmp = rl_tmp.filter(**iB)
                    if rl_tmp.repeats:
                        append_refined = True
            else:
                denovo_refined[i] = False
            if append_refined:
                denovo_final.append(iTR_refined)
            else:
                denovo_final.append(iTR)

        iS.set_repeatlist(repeat_list.RepeatList(denovo_refined),
                          DE_NOVO_REFINED_TAG)
        iS.set_repeatlist(repeat_list.RepeatList(denovo_final),
                          DE_NOVO_FINAL_TAG)
        iS.set_repeatlist(
            iS.get_repeatlist(DE_NOVO_FINAL_TAG) + iS.get_repeatlist(PFAM_TAG),
            FINAL_TAG)

        dResults[iS.name] = iS

    # 6.a Save results as pickle
    with open(result_file, 'wb') as fh:
        pickle.dump(dResults, fh)

    # 6.b Save serialized results (tsv; the format was validated on entry)
    header = [
        "ID", "MSA", "begin", "pvalue", "l_effective", "n", "n_effective",
        "TRD", "model"
    ]
    with open(result_file_serialized, 'w') as fh_o:
        fh_o.write("\t".join(header))

        for iS in dResults.values():
            for iTR in iS.get_repeatlist(FINAL_TAG).repeats:
                try:
                    data = [
                        str(i) for i in [
                            iS.name, " ".join(iTR.msa), iTR.begin,
                            iTR.pvalue("phylo_gap01"), iTR.l_effective,
                            iTR.n, iTR.n_effective, iTR.TRD, iTR.model
                        ]
                    ]
                except Exception as err:
                    print(iTR)
                    raise Exception(
                        "(Could not save data for the above TR.)") from err

                fh_o.write("\n" + "\t".join(data))

    print("DONE")
Exemplo n.º 9
0
def test_detect_repeats_with_hmm():
    """Smoke test: detection with an HMM read from a HMMER file must not raise.

    The detection result is not inspected.
    """
    hmmer_path = os.path.join(path(), TEST_FILE_WITH_ID)
    model = HMM.create(input_format='hmmer', file=hmmer_path)
    target = sequence.Sequence(TEST_SEQUENCE)
    target.detect([model])
Exemplo n.º 10
0
def test_initialise_sequence():
    """A freshly built ``Sequence`` exposes the input string as ``seq``."""
    assert sequence.Sequence(TEST_SEQUENCE).seq == TEST_SEQUENCE
Exemplo n.º 11
0
def find_protein_repeats(sequences_file, result_dir, pvalue_threshold=0.05, divergence_threshold=0.1, n_threshold=2.5, l_threshold=3):
    """
    Finds tandem repeats (TRs) in the protein sequences provided in 'sequences_file'.
    Filters the TRs according to the thresholds.
    Saves the TRs in the 'result_dir' each individually with the protein identifier as filename.

    Args:
        sequences_file (str): whole path to file with >=1 protein fasta-sequence
        result_dir (str): path where the individual TRAL results are supposed to be stored.
            The directory is created if it does not exist.
        pvalue_threshold (float): p-value threshold for filtering
        divergence_threshold (float): divergence threshold for filtering
        n_threshold (float): minimum repeat unit count (``n_effective``) for filtering
        l_threshold (float): maximum repeat unit length (``l_effective``) for filtering

    Raises:
        Exception: if 'result_dir' cannot be created.

    Output:
        For each protein sequence, a pickle and a tsv file with its TRs are
        stored in 'result_dir'. A summary is printed; nothing is returned.
    """
    logging.config.fileConfig(config_file("logging.ini"))
    log = logging.getLogger('root')

    # The score/model name is taken from the "repeat_list" section of the config.
    score = configuration.Configuration.instance().config["repeat_list"]["model"]

    ##########################################################################
    # From .fasta to Class Sequence sequences
    proteins = Fasta(sequences_file)

    # Create output directory once, if it does not already exist.
    try:
        os.makedirs(result_dir, exist_ok=True)
    except Exception as err:
        raise Exception(
            "Could not create path to result directory: {}".format(
                result_dir)) from err

    all_denovo_repeats = 0
    all_filtered_repeats = 0

    for record in proteins:
        # The protein identifier is the second "|"-separated field of the
        # header; headers without "|" fall back to the full record name.
        name_fields = record.name.split("|")
        seq_name = name_fields[1] if len(name_fields) > 1 else record.name

        seq = sequence.Sequence(seq=str(record), name=seq_name)

        log.debug("Work on sequence {}".format(seq_name))
        ##########################################################################
        # Getting TRs

        denovo_list = seq.detect(denovo=True)
        for TR in denovo_list.repeats:
            TR.calculate_pvalues()

        # Remember the unfiltered count for the per-sequence report and totals.
        n_denovo = len(denovo_list.repeats)
        all_denovo_repeats += n_denovo

        ##########################################################################
        # Filtering TRs: p-value, divergence, number of units, unit length

        denovo_list = denovo_list.filter("pvalue", score, pvalue_threshold)
        denovo_list = denovo_list.filter(
            "divergence", score, divergence_threshold)
        denovo_list = denovo_list.filter(
            "attribute", "n_effective", "min", n_threshold)
        denovo_list = denovo_list.filter(
            "attribute", "l_effective", "max", l_threshold)

        ##########################################################################
        # Building HMM with hmmbuild

        # De novo TRs are remastered with HMM (only possible with hmmbuild).
        denovo_hmm = [hmm.HMM.create(input_format='repeat', repeat=iTR)
                      for iTR in denovo_list.repeats]
        denovo_list_remastered = seq.detect(lHMM=denovo_hmm)

        ##########################################################################
        # Clustering

        # De novo TRs were clustered for overlap (common ancestry). Only best =
        # lowest p-Value and lowest divergence were retained.
        # NOTE(review): the clustering is applied to ``denovo_list``, so the
        # HMM-remastered list computed just above is discarded. Confirm
        # whether ``denovo_list_remastered.filter(...)`` was intended.
        denovo_list_remastered = denovo_list.filter(
            "none_overlapping", ["common_ancestry"], {"pvalue": score, "divergence": score})

        ##########################################################################
        # Save Tandem Repeats

        output_pickle_file = os.path.join(result_dir, seq_name + ".pkl")
        output_tsv_file = os.path.join(result_dir, seq_name + ".tsv")

        denovo_list_remastered.write(output_format="pickle", file=output_pickle_file)
        denovo_list_remastered.write(output_format="tsv", file=output_tsv_file)
        # TODO save as fasta

        all_filtered_repeats += len(denovo_list_remastered.repeats)
        print("\n***", seq_name, "***")
        print("denovo repeats:", n_denovo)
        print("repeats after filtering and clustering:",
              len(denovo_list_remastered.repeats))

        for final_repeat in denovo_list_remastered.repeats:
            print(final_repeat)

    print("\nThere where {} repeats found de novo.".format(all_denovo_repeats), "After filtering and clustering there where only {} repeats left.\n".format(
        all_filtered_repeats))
Exemplo n.º 12
0
from pyfaidx import Fasta

# Download the protein sequence from: https://www.uniprot.org/uniprot/P98179

##########################################################################
# From .fasta to Class Sequence sequences
# NOTE: hard-coded absolute path to a local FASTA file.
proteins = Fasta(
    "/home/matteo/polybox/MSc_ACLS/master_thesis/data/test_protein.fasta")

# Counters are initialised here but not updated anywhere in this excerpt.
all_denovo_repeats = 0
all_filtered_repeats = 0

for pyfaidx in proteins:
    # The second "|"-separated field of the record name is used as identifier;
    # a name without "|" raises IndexError here.
    seq_name = pyfaidx.name.split("|")[1]
    # name is protein identifier
    seq = sequence.Sequence(seq=str(pyfaidx), name=seq_name)

# Saving these sequences as binary files:
# with open(sequence_pkl, 'wb') as f:
#    pickle.dump(seq, f)

##########################################################################
# Getting TRs
# NOTE: `seq` is rebound on every loop iteration above, so only the last
# record of the FASTA file is analysed from here on.
denovo_list = seq.detect(denovo=True)
# TODO: reinstall tral with all phylo files!
for TR in denovo_list.repeats:
    TR.calculate_pvalues()

# Prints only the last repeat of the list; raises NameError if
# denovo_list.repeats is empty, because TR is then never bound.
print(TR)

# Saving these sequences as binary files:
Exemplo n.º 13
0
def test_detect_HHrepID():
    """Run the HHrepID detector on ``TEST_SEQUENCE_Q9BRR0`` and check its output.

    Expects a single predicted repeat whose MSA equals the stored reference
    alignment.
    """
    expected_msa = [
        '------------------IPTCAEAGEQ----',
        'EGRLQRKQKNATGGRRHICHECGKSFAQ----',
        'SSGLSKHRRIHTGEKPYECEECGKAFIG----',
        'SSALVIHQRVHTGEKPYECEECGKAFSH----',
        'SSDLIKHQRTHTGEKPYECDDCGKTFSQ----',
        'SCSLLEHHRIHTGEKPYQCSMCGKAFRR----',
        'SSHLLRHQRIHTGDKNVQEPEQGEAWKSRMES',
        '------QLENVETPMSYKCNECERSFTQ----',
        'NTGLIEHQKIHTGEKPYQCNACGKGFTR----',
        'ISYLVQHQRSHVG-------------------',
    ]

    query = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    per_sequence_results = repeat_detection_run.run_detector(
        seq_records=[query], detectors=["HHrepID"])
    hhrepid_hits = per_sequence_results[0]['HHrepID']

    assert len(hhrepid_hits) == 1
    assert hhrepid_hits[0].msa == expected_msa
Exemplo n.º 14
0
def test_detect_TREKS():
    """Run the T-REKS detector on ``TEST_SEQUENCE_Q9BRR0`` and check its output.

    Expects a single predicted repeat whose MSA equals the stored reference
    alignment.
    """
    expected_msa = [
        'C---G---KSFAQSSGLSKHRRIHTGEKPYECE-E',
        'C---G---KAFIGSSALVIHQRVHTGEKPYECE-E',
        'C---G---KAFSHSSDLIKHQRTHTGEKPYECD-D',
        'C---G---KTFSQSCSLLEHHRIHTGEKPYQCS-M',
        'C---G---KAFRRSSHLLRHQRIHTGDKNVQ-EPE',
        'Q---G---EAW--KSRME-SQ-LENVETPMSYK--',
        'C---NECERSFTQNTGLIEHQKIHTGEKPYQ----',
        'CNACG---KGFTRISYLVQHQRSHVG-KNI-LS--',
    ]

    query = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    per_sequence_results = repeat_detection_run.run_detector(
        seq_records=[query], detectors=["T-REKS"])
    treks_hits = per_sequence_results[0]['T-REKS']

    assert len(treks_hits) == 1
    assert treks_hits[0].msa == expected_msa
Exemplo n.º 15
0
def test_detect_XSTREAM():
    """Run the XSTREAM detector on ``TEST_SEQUENCE_Q9BRR0`` and check its output.

    Expects a single predicted repeat whose MSA equals the stored reference
    alignment.
    """
    expected_msa = [
        'ECGKSFAQS-SGLSK-HRRIHTGEKPYECE',
        'ECGKAFIGS-SALVI-HQRVHTGEKPYECE',
        'ECGKAFSHS-SDL-IKHQRTHTGEKPYECD',
        'DCGKTFSQSCSLLEH-H-RIHTGEKPY',
    ]

    query = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    per_sequence_results = repeat_detection_run.run_detector(
        seq_records=[query], detectors=["XSTREAM"])
    xstream_hits = per_sequence_results[0]['XSTREAM']

    assert len(xstream_hits) == 1
    assert xstream_hits[0].msa == expected_msa