예제 #1
0
def test_create_Repeat_list_from_Repeats():
    """A ``Repeat_list`` built from ``Repeat`` instances keeps them in order.

    One ``Repeat`` is created per alignment in ``TEST_REPEATS``; the resulting
    list must hold four repeats whose ``msa`` attributes match the inputs
    pairwise.
    """
    built_repeats = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    stored_repeats = rl.Repeat_list(repeats=built_repeats).repeats

    assert len(stored_repeats) == 4
    for expected_msa, stored in zip(TEST_REPEATS, stored_repeats):
        assert expected_msa == stored.msa
예제 #2
0
def test_serialize_repeat_list_tsv():
    """``rl_io.serialize_repeat_list_tsv`` returns the TSV output as a string.

    Every test repeat is first passed through ``Sequence.repeat_in_sequence``
    (presumably to anchor it on ``TEST_SEQUENCE`` -- confirm against the
    ``Sequence`` class) before the list is serialized.
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.Repeat_list(repeats=test_repeats)

    tsv = rl_io.serialize_repeat_list_tsv(test_repeat_list)

    # isinstance is the idiomatic type check (and also accepts str subclasses).
    assert isinstance(tsv, str)
예제 #3
0
def test_serialize_repeat_list_tsv():
    """``Repeat_list.write("tsv", str=True)`` returns the TSV output as a string.

    Only the first two test repeats are used. Each one is first passed through
    ``Sequence.repeat_in_sequence`` (presumably to anchor it on
    ``TEST_SEQUENCE`` -- confirm against the ``Sequence`` class).
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS[:2]]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.Repeat_list(repeats=test_repeats)

    tsv = test_repeat_list.write("tsv", str=True)

    # isinstance is the idiomatic type check (and also accepts str subclasses).
    assert isinstance(tsv, str)
예제 #4
0
def test_filter_pValue():
    """Filtering on a p-value threshold of 0.1 keeps exactly one test repeat.

    The p-values are injected by hand from ``TEST_SCORE_VALUE_LIST`` rather
    than computed at construction time (an alternative would be to build the
    repeats with ``scoreslist=["phylo_gap01"], calc_score=True,
    calc_pValue=True``).
    """
    candidates = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    for candidate, p_value in zip(candidates, TEST_SCORE_VALUE_LIST):
        candidate.dPValue = {TEST_SCORE: p_value}

    unfiltered = rl.Repeat_list(repeats=candidates)
    significant = unfiltered.filter("pValue", TEST_SCORE, 0.1)

    assert len(significant.repeats) == 1
예제 #5
0
def test_repeat_list_pickle():
    """A ``Repeat_list`` survives a pickle write/read round trip.

    The list is written to ``test.pickle`` in the directory returned by
    ``path()``, read back, and compared to the original by length and by the
    ``msa`` of the first repeat. The temporary file is removed even when
    writing, reading or an assertion fails, so a failing run does not leave
    stale artefacts behind.
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS]
    test_repeat_list = rl.Repeat_list(repeats=test_repeats)

    test_pickle = os.path.join(path(), "test.pickle")
    try:
        test_repeat_list.write('pickle', test_pickle)
        # NOTE(review): the list is read back through ``repeat.Repeat.create``
        # although a ``Repeat_list`` was written -- presumably the pickle
        # loader simply returns whatever object was stored; confirm.
        test_repeat_list_new = repeat.Repeat.create(test_pickle, 'pickle')

        assert len(test_repeat_list.repeats) == len(
            test_repeat_list_new.repeats)
        assert (test_repeat_list.repeats[0].msa ==
                test_repeat_list_new.repeats[0].msa)
    finally:
        # Always clean up; the file may not exist if ``write`` itself failed.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
예제 #6
0
def test_filter_cluster_based():
    """The ``none_overlapping`` filter keeps the first three test repeats.

    Repeats receive hand-set p-values (``TEST_SCORE_VALUE_LIST``) and begin
    coordinates (``TEST_BEGIN_LIST``). Overlap is decided on
    ``common_ancestry`` clusters, with the ``pValue`` and ``divergence``
    criteria passed as the final argument.
    """
    candidates = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    for candidate, p_value in zip(candidates, TEST_SCORE_VALUE_LIST):
        candidate.dPValue = {TEST_SCORE: p_value}
    for candidate, begin in zip(candidates, TEST_BEGIN_LIST):
        candidate.begin = begin

    unfiltered = rl.Repeat_list(repeats=candidates)
    # NOTE(review): the result of this call is discarded, so the p-value
    # filter does not feed into the overlap filter below -- confirm whether
    # that is intended.
    unfiltered.filter("pValue", TEST_SCORE, 0.1)

    ranking_criteria = [("pValue", TEST_SCORE), ("divergence", TEST_SCORE)]
    non_overlapping = unfiltered.filter(
        "none_overlapping", ("common_ancestry", None), ranking_criteria)

    assert len(non_overlapping.repeats) == 3
    for expected in candidates[:3]:
        assert expected in non_overlapping.repeats
예제 #7
0
def test_cluster():
    """Clustering the test repeats yields the expected index sets.

    After begin coordinates are set from ``TEST_BEGIN_LIST``, the repeat list
    is clustered by ``common_ancestry`` and then by ``shared_char``; for each
    strategy ``dCluster`` must contain exactly the expected clusters.
    """
    candidates = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    for candidate, begin in zip(candidates, TEST_BEGIN_LIST):
        candidate.begin = begin

    clustered = rl.Repeat_list(repeats=candidates)

    expectations = (
        ("common_ancestry", [{0}, {1, 3}, {2}]),
        ("shared_char", [{0}, {1, 2, 3}]),
    )
    for strategy, expected_clusters in expectations:
        clustered.cluster(strategy)

        # Every expected cluster is present and there are no extra ones.
        for members in expected_clusters:
            assert members in clustered.dCluster[strategy]
        assert len(clustered.dCluster[strategy]) == len(expected_clusters)
예제 #8
0
    def detect(self, lHMM=None, denovo=None, **kwargs):
        """ Detect tandem repeats on ``self.seq`` via HMMs or de novo detectors.

        Exactly one detection route is taken; a non-empty ``lHMM`` takes
        precedence over ``denovo``:

        * Sequence profile hidden Markov models: each ``HMM`` is run over
          ``self.seq`` with the Viterbi algorithm, the resulting path is
          converted into repeat units, and the units are realigned. A
          ``Repeat`` is created when both the unaligned and the realigned
          alignment have more than one unit.
        * De novo detection: ``repeat_detection_run.run_TRD`` is run on this
          sequence. Each prediction is re-instantiated as a ``Repeat`` and
          kept if its ``lD`` is positive and ``self.repeat_in_sequence``
          accepts it.

        Args:
            lHMM (list of HMM): ``HMM`` instances to search with.
            denovo (bool): If truthy (and no ``lHMM`` is given), run the
                de novo detectors.
            **kwargs: Optional settings. ``kwargs['repeat']`` is unpacked
                into every ``Repeat`` instantiation (e.g.
                ``{'calc_score': True}``); ``kwargs['detection']`` is
                unpacked into ``repeat_detection_run.run_TRD``.

        Returns:
            A ``Repeat_list`` instance, or ``None`` if ``lHMM`` is an empty
            list.

        Raises:
            Exception: If ``lHMM`` is truthy but not a list, if it contains
                an element that is not an ``HMM`` instance, or if neither
                ``lHMM`` nor ``denovo`` is provided.
        """

        # Options forwarded to every Repeat() call; empty when not supplied.
        repeat_options = kwargs.get('repeat', {})

        if lHMM:
            if not isinstance(lHMM, list):
                raise Exception('The lHMM value is not a list.')
            if not all(isinstance(model, hmm.HMM) for model in lHMM):
                raise Exception(
                    'At least one list element in the lHMM value is '
                    'not a valid instance of the HMM class.')

            detected = []
            for model in lHMM:
                # Detect TRs on self.seq with hmm using the Viterbi algorithm.
                most_likely_path = model.viterbi(self.seq)
                logging.debug("most_likely_path: {}".format(most_likely_path))
                if not most_likely_path:
                    continue

                units = hmm_viterbi.hmm_path_to_non_aligned_tandem_repeat_units(
                    self.seq, most_likely_path, model.lD)
                if len(units) <= 1:
                    continue

                # Align the repeat units; skip alignments with a single unit.
                aligned = repeat_align.realign_repeat(units)
                if len(aligned) <= 1:
                    continue

                detected.append(repeat.Repeat(aligned, **repeat_options))

            # Set begin coordinate for all repeats
            for found in detected:
                self.repeat_in_sequence(found)

            return repeat_list.Repeat_list(detected)

        if lHMM == []:
            logging.debug("lHMM == []")
            return None

        if denovo:
            detection_options = kwargs.get('detection', {})
            lPredicted_repeat = repeat_detection_run.run_TRD(
                [self], **detection_options)[0]

            # NOTE(review): this method mixes ``log`` and ``logging``;
            # kept as in the original -- confirm ``log`` is defined at
            # module level.
            log.debug("lPredicted_repeat: {}".format(lPredicted_repeat))

            accepted = []
            for detector, predictions in lPredicted_repeat.items():
                for prediction in predictions:
                    candidate = repeat.Repeat(prediction.msa,
                                              begin=prediction.begin,
                                              **repeat_options)

                    # Consider only tandem repeats whose repeat unit is
                    # predicted to be at least one character long.
                    if not candidate.lD > 0:
                        continue

                    # Remember which detector produced this repeat.
                    candidate.TRD = detector

                    # Sanity check repeat and set its begin coordinate.
                    if not self.repeat_in_sequence(candidate):
                        logging.debug(
                            "The tandem repeat is not part of the sequence. Detector: {}"
                            .format(candidate.TRD))
                        continue

                    accepted.append(candidate)

            return repeat_list.Repeat_list(accepted)

        raise Exception(
            "Either require denovo detection, or present an HMM")
예제 #9
0
def refine_denovo(sequences_file, result_file):
    ''' Refine denovo TRs.

    For every sequence, each TR of the DE_NOVO_TAG ``repeat_list`` is turned
    into an HMM which is then run on the sequence again.

    DE_NOVO_REFINED_TAG ``repeat_list``: holds one entry per TR in
    DE_NOVO_ALL_TAG:

        - the refined TR, if the HMM detected one
        - False, if the TR is in DE_NOVO_TAG but no refined TR was detected
        - None, if the TR is in DE_NOVO_ALL_TAG but not in DE_NOVO_TAG

    DE_NOVO_FINAL_TAG ``repeat_list``: holds one entry per TR in DE_NOVO_TAG.
    The refined TR is used if it

        - exists
        - overlaps with the original de novo TR ("shared_char")
        - passes all basic filters

    Otherwise, the original de novo TR is used.

    FINAL_TAG ``repeat_list``: the DE_NOVO_FINAL_TAG list plus the PFAM_TAG
    list.

    The updated list of sequences is pickled to ``result_file``.

    Args:
        sequences_file (str): Path to the pickle file containing a list of
            ``Sequence`` instances.
        result_file (str): Path to the result file.

    Raises:
        Exception: If the pickle ``sequences_file`` cannot be loaded.
    '''

    basic_filter = config['filter']['basic']['dict']

    # NOTE(security): unpickling can execute arbitrary code; only feed this
    # function files produced by a trusted pipeline step.
    try:
        with open(sequences_file, 'rb') as fh:
            lSequence = pickle.load(fh)
    except Exception as err:
        # ``except Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt/SystemExit propagate; the cause is chained to
        # keep the underlying error visible.
        raise Exception(
            "Cannot load putative pickle file sequences_file: {}".format(
                sequences_file)) from err

    for iS in lSequence:
        log.debug(iS.id)
        denovo_all = iS.dRepeat_list[DE_NOVO_ALL_TAG].repeats
        denovo_filtered = iS.dRepeat_list[DE_NOVO_TAG].repeats

        denovo_final = []
        # Entries stay None for TRs that did not pass the basic filter.
        denovo_refined = [None] * len(denovo_all)
        for i, iTR in enumerate(denovo_all):
            if iTR not in denovo_filtered:
                continue
            # Create HMM from TR
            denovo_hmm = hmm.HMM.create(format='repeat', repeat=iTR)
            # Run HMM on sequence
            denovo_refined_rl = iS.detect(lHMM=[denovo_hmm])
            append_refined = False
            if denovo_refined_rl and denovo_refined_rl.repeats:
                iTR_refined = denovo_refined_rl.repeats[0]
                iTR_refined.TRD = iTR.TRD
                iTR_refined.model = "cpHMM"
                denovo_refined[i] = iTR_refined
                # Check whether new and old TR overlap. Check whether new TR
                # is significant. If not both, put unrefined TR into final.
                if repeat_list.two_repeats_overlap("shared_char", iTR,
                                                   iTR_refined):
                    rl_tmp = repeat_list.Repeat_list([iTR_refined])
                    log.debug(iTR_refined.msa)
                    for iB in basic_filter.values():
                        rl_tmp = rl_tmp.filter(**iB)
                    if rl_tmp.repeats:
                        append_refined = True
            else:
                denovo_refined[i] = False
            if append_refined:
                denovo_final.append(iTR_refined)
            else:
                denovo_final.append(iTR)

        iS.set_repeat_list(repeat_list.Repeat_list(denovo_refined),
                           DE_NOVO_REFINED_TAG)
        iS.set_repeat_list(repeat_list.Repeat_list(denovo_final),
                           DE_NOVO_FINAL_TAG)
        iS.set_repeat_list(
            iS.dRepeat_list[DE_NOVO_FINAL_TAG] + iS.dRepeat_list[PFAM_TAG],
            FINAL_TAG)

    with open(result_file, 'wb') as fh:
        pickle.dump(lSequence, fh)

    print("DONE")
예제 #10
0
def merge_and_basic_filter(sequences_file, repeat_files, result_file,
                           **kwargs):
    ''' Merge TR annotations from several sources and perform basic filtering.

    The repeat lists found in ``repeat_files`` are merged per sequence ID and
    attached to the matching ``Sequence`` under REPEAT_LIST_TAG. Repeats with
    a ``TRD`` attribute go to DE_NOVO_ALL_TAG, all others to PFAM_ALL_TAG.
    Every configured basic filter is then applied to the merged list; the
    result is stored under the basic filter tag, and its intersections with
    the PFAM_ALL_TAG and DE_NOVO_ALL_TAG lists are stored under PFAM_TAG and
    DE_NOVO_TAG. The updated list of sequences is pickled to ``result_file``.

    Args:
        sequences_file (str): Path to the pickle file containing a list of
            ``Sequence`` instances.
        repeat_files (list of str): Paths to pickle files, each containing a
            dict that maps sequence IDs to repeat lists.
        result_file (str): Path to the result file.
        kwargs (dict): Currently unused; the filters are read from the
            configuration instead.

    Raises:
        Exception: If ``sequences_file`` or any of the pickles in
            ``repeat_files`` cannot be loaded.

    ..ToDo: Input BASIC_FILTER as PARAMETERS
    '''

    basic_filter = config['filter']['basic']['dict']
    basic_filter_tag = config['filter']['basic']['tag']

    # NOTE(security): unpickling can execute arbitrary code; only feed this
    # function files produced by a trusted pipeline step.
    try:
        with open(sequences_file, 'rb') as fh:
            lSequence = pickle.load(fh)
    except Exception as err:
        # ``except Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt/SystemExit propagate; the cause is chained.
        raise Exception(
            "Cannot load putative pickle file sequences_file: {}".format(
                sequences_file)) from err

    log.debug("Merging all repeat lists in from ``repeat_files``.")
    dRL_all = {}
    for iRLF in repeat_files:
        try:
            with open(iRLF, 'rb') as fh:
                dRL = pickle.load(fh)
        except Exception as err:
            raise Exception(
                "Cannot load putative pickle file repeat_list_file: {}".format(
                    iRLF)) from err
        if not dRL_all:
            dRL_all = dRL
        else:
            for iS_ID, iRL in dRL.items():
                if iS_ID in dRL_all:
                    dRL_all[iS_ID] += iRL
                else:
                    # A sequence ID missing from the earlier files used to
                    # raise KeyError; start its entry from this file instead.
                    dRL_all[iS_ID] = iRL

    log.debug("Append ``repeat_list`` to ``sequence``.")
    for iS in lSequence:
        iS.set_repeat_list(dRL_all[iS.id], REPEAT_LIST_TAG)
        if iS.dRepeat_list[REPEAT_LIST_TAG]:
            # Repeats carrying a ``TRD`` attribute are treated as de novo
            # predictions; all others are assigned to the PFAM list.
            denovo_repeat_list = repeat_list.Repeat_list([
                i for i in iS.dRepeat_list[REPEAT_LIST_TAG].repeats
                if hasattr(i, "TRD")
            ])
            pfam_repeat_list = repeat_list.Repeat_list([
                i for i in iS.dRepeat_list[REPEAT_LIST_TAG].repeats
                if not hasattr(i, "TRD")
            ])
        else:
            # Falsy repeat list: reuse it unchanged for both tags.
            denovo_repeat_list = iS.dRepeat_list[REPEAT_LIST_TAG]
            pfam_repeat_list = iS.dRepeat_list[REPEAT_LIST_TAG]
        iS.set_repeat_list(denovo_repeat_list, DE_NOVO_ALL_TAG)
        iS.set_repeat_list(pfam_repeat_list, PFAM_ALL_TAG)

    for iS in lSequence:
        rl_tmp = iS.dRepeat_list[REPEAT_LIST_TAG]
        if rl_tmp:
            # Apply every basic filter in turn; a falsy list is left as is.
            for iB in basic_filter.values():
                rl_tmp = rl_tmp.filter(**iB)
        iS.set_repeat_list(rl_tmp, basic_filter_tag)
        iS.set_repeat_list(rl_tmp.intersection(iS.dRepeat_list[PFAM_ALL_TAG]),
                           PFAM_TAG)
        iS.set_repeat_list(
            rl_tmp.intersection(iS.dRepeat_list[DE_NOVO_ALL_TAG]), DE_NOVO_TAG)

    with open(result_file, 'wb') as fh:
        pickle.dump(lSequence, fh)

    print("DONE")