def main():
    'The script itself'
    # set parameters
    work_dir, output, reference = set_parameters()

    # make a working temp dir
    temp_dir = NamedTemporaryDir()

    # add readgroup tag to each alignment in bam
    add_header_and_tags_bams(work_dir, temp_dir.name)

    # Prepare files to merge
    sams = get_opened_sams_from_dir(temp_dir.name)
    temp_sam = NamedTemporaryFile()

    # merge all the SAM files into one
    merge_sam(sams, temp_sam, reference)

    # convert the SAM into a temporary BAM
    temp_bam = NamedTemporaryFile(suffix='.bam')
    sam2bam(temp_sam.name, temp_bam.name)

    # finally, sort the BAM
    sort_bam_sam(temp_bam.name, output)

    # and make an index of the BAM
    call(['samtools', 'index', output], raise_on_error=True)

    temp_dir.close()
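The helpers used above (sam2bam, sort_bam_sam, merge_sam, call) come from the franklin/ngs_backbone codebase and are not defined in this snippet. As a rough illustration, sam2bam can be a thin wrapper over the samtools CLI; this is a hedged sketch assuming samtools is on the PATH and that call() wraps subprocess, not franklin's actual implementation:

def sam2bam(sam_fpath, bam_fpath):
    'A minimal sketch: convert a SAM file to BAM with samtools.'
    # -b: BAM output, -S: SAM input (needed by older samtools), -h: keep header
    call(['samtools', 'view', '-bSh', '-o', bam_fpath, sam_fpath],
         raise_on_error=True)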
    def test_blast_seq_against_bad_db(self):
        'We can blast a seq file against a database'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'

        create_project(directory=test_dir.name,
                       name=project_name)
        project_dir = join(test_dir.name, project_name)

        #some query fasta file
        query = '>seq1\nGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
        query += 'CATACCCCTGCCGAACCGCTTTTGTCA\n'
        query_fhand = NamedTemporaryFile(mode='w')
        query_fhand.write(query)
        query_fhand.flush()

        #the blast db
        blast_db_fname = 'uni'
        blast_db = join(TEST_DATA_DIR, 'blast', blast_db_fname)

        blast_program = 'blastn'
        try:
            backbone_blast_runner(query_fpath=query_fhand.name,
                                  project_dir=project_dir,
                                  blast_program=blast_program,
                                  blast_db=blast_db)
            self.fail('RuntimeError expected')
        except RuntimeError:
            pass
        test_dir.close()
    def test_blast_seq_against_db(self):
        'We can blast a seq file against a database'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'

        create_project(directory=test_dir.name,
                       name=project_name)
        project_dir = join(test_dir.name, project_name)

        #some query fasta file
        query = '>seq1\nGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
        query += 'CATACCCCTGCCGAACCGCTTTTGTCA\n'
        query_fhand = NamedTemporaryFile(mode='w')
        query_fhand.write(query)
        query_fhand.flush()

        #the blast db
        blast_db_fname = 'univec+'
        blast_db = join(TEST_DATA_DIR, 'blast', blast_db_fname)

        blast_program = 'blastn'
        backbone_blast_runner(query_fpath=query_fhand.name,
                              project_dir=project_dir,
                              blast_program=blast_program,
                              blast_db=blast_db)

        #is the blast ok?
        blast_fpath = join(project_dir,
                           BACKBONE_DIRECTORIES['blast_dir'],
                           _get_basename(query_fhand.name),
                           blast_db_fname,
                           '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                          blast_program))
        assert '<Hit_def>vec1</Hit_def>' in open(blast_fpath).read()
        test_dir.close()
Example #4
    def test_orf_annotation_analysis(self):
        "We can annotate orfs"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        matrix = os.path.join(TEST_DATA_DIR, "At.smat")
        config = {
            "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
            "General_settings": {"threads": THREADS},
        }

        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=config)
        project_dir = join(test_dir.name, project_name)
        seq = "CTACTTACTAGCTTTAGTAAATCCTTCTAACCCTCGGTAAAAAAAAAAAAGAGGCATCAAATG"
        seq += "GCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT"
        seq += "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC"
        seq += "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA"
        seq += "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA"
        seq += "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT"
        seq += "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTACTGGACCATG"
        seq += "TGGAAGCTGCCCATGTTTGGCTGCACCGAT"

        annot_input_dir = join(project_dir, "annotations", "input")
        os.makedirs(annot_input_dir)

        # create some seqs to annotate
        fasta = ">seq\n%s\n" % seq
        fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
        fhand.write(fasta)
        fhand.close()
        do_analysis(project_settings=settings_path, kind="annotate_orfs", silent=True)
        repr_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "seqs.0.pickle")
        result = open(repr_fpath).read()
        assert "orf" in result
        do_analysis(project_settings=settings_path, kind="write_annotations", silent=True)

        seq_fpath = join(project_dir, "annotations", "features", "seqs.orf_seq.fasta")
        pep_fpath = join(project_dir, "annotations", "features", "seqs.orf_pep.fasta")

        assert "ATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCT" in open(seq_fpath).read()
        assert "QASMGAPFTGLKSAAAFPVTRXTNDITTLVSNG" in open(pep_fpath).read()

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(project_dir, "annotations", "features", "stats", "seqs.txt")
        result = open(stats_fpath).read()
        expected = """Sequences with ORF: 1
Number of ORFs: 1"""
        assert expected in result

        test_dir.close()
Example #5
    def test_simple_named_temporary_dir(self):
        'It tests the named temporary dir'
        temp_dir = NamedTemporaryDir()
        dir_name = temp_dir.name
        assert os.path.exists(dir_name)
        temp_dir.close()
        assert not os.path.exists(dir_name)

        temp_dir = NamedTemporaryDir()
        dir_name = temp_dir.name
        fhand = open(os.path.join(dir_name, 'peio'), 'w')
        assert os.path.exists(fhand.name)
        assert os.path.exists(dir_name)
        del temp_dir
        assert not os.path.exists(dir_name)
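The test above pins down the semantics of NamedTemporaryDir: the directory exists while the object lives and disappears on close() or garbage collection. A minimal sketch with that behaviour, assuming nothing beyond the standard library (the real franklin class may differ):

import os
import shutil
import tempfile

class NamedTemporaryDir(object):
    'A temporary directory removed on close() or when the object is deleted.'
    def __init__(self):
        self._name = tempfile.mkdtemp()

    @property
    def name(self):
        return self._name

    def close(self):
        # remove only once; close() may run again from __del__
        if os.path.exists(self._name):
            shutil.rmtree(self._name)

    def __del__(self):
        self.close()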
    def test_create_project(self):
        'We can create a project'
        test_dir = NamedTemporaryDir()
        settings_path = create_project(directory=test_dir.name,
                                       name='backbone')

        assert settings_path == join(test_dir.name, 'backbone',
                                     BACKBONE_DIRECTORIES['config_file'])
        settings = create_configuration(settings_path)
        assert settings['General_settings']['project_name'] == 'backbone'
        project_path = join(test_dir.name, 'backbone')
        assert settings['General_settings']['project_path'] == project_path
        assert settings['Cleaning']['strip_n_percent'] == 2.0
        content = open(settings_path).read()
        assert 'strip_n_percent' in content
        test_dir.close()
Example #7
    def test_microsatellite_annotation_analysis(self):
        "We can annotate microsatellites"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        settings_path = create_project(
            directory=test_dir.name, name=project_name, configuration={"General_settings": {"threads": THREADS}}
        )
        project_dir = join(test_dir.name, project_name)
        seq = "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
        seq += "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
        seq += "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
        seq += "GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
        seq += "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
        seq += "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
        seq += "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
        seq += "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
        seq += "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
        seq += "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
        seq += "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
        seq += "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
        annot_input_dir = join(project_dir, "annotations", "input")
        os.makedirs(annot_input_dir)

        # create some seqs to annotate
        fasta = ">seq\n%s\n" % seq
        fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
        fhand.write(fasta)
        fhand.close()
        do_analysis(project_settings=settings_path, kind="annotate_microsatellites", silent=True)
        pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "seqs.0.pickle")
        result = open(pickle_fpath).read()
        assert "microsatellite" in result

        do_analysis(project_settings=settings_path, kind="write_annotations", silent=True)
        ssr_fpath = join(project_dir, "annotations", "features", "seqs.ssr")
        assert os.path.exists(ssr_fpath)
        assert "Seqname" in open(ssr_fpath).read()

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(project_dir, "annotations", "features", "stats", "seqs.txt")
        result = open(stats_fpath).read()
        expected = "Sequences with microsatellites: 1"
        assert expected in result

        test_dir.close()
Example #8
    def test_basic_functionality(self):
        'VersionedPath basic functionality'
        tempdir = NamedTemporaryDir()
        tempdir_name = tempdir.name
        fnames = ['hola.txt', 'hola.0.txt', 'hola.1.txt',
                  'adios.txt', 'adios.1.txt',
                  'foo.txt']
        for fname in fnames:
            open(os.path.join(tempdir.name, fname), 'w').close()

        path_str = os.path.join(tempdir.name, 'hola.txt')
        path = VersionedPath(path_str)
        assert str(path) == path_str

        path_str_0 = os.path.join(tempdir.name, 'hola.0.txt')
        path = VersionedPath(path_str_0)
        assert str(path) == path_str
        assert path.basename == 'hola'
        assert path.directory == tempdir.name
        assert path.extension == 'txt'

        assert path.last_version == VersionedPath(os.path.join(tempdir_name,
                                                               'hola.1.txt'))
        assert path.next_version == VersionedPath(os.path.join(tempdir_name,
                                                               'hola.2.txt'))

        fpaths = [os.path.join(tempdir_name, fname) for fname in ('hola.1.txt',
                                                                  'adios.1.txt',
                                                                  'foo.txt')]
        expected_paths = set(fpaths)
        versioned_paths = set(path.list_fpaths_versioned())
        assert versioned_paths == expected_paths

        path = VersionedPath(tempdir_name)
        versioned_paths = set(path.list_fpaths_versioned())
        assert versioned_paths == expected_paths

        tempdir.close()

        #in an empty dir
        path = VersionedPath('hola.txt')
        assert path.last_version.endswith('hola.txt')
        assert path.next_version.endswith('hola.0.txt')

        path = VersionedPath('pl_illumina.sm_rp_75_59_uc82.sfastq')
        assert path.last_version.endswith('pl_illumina.sm_rp_75_59_uc82.sfastq')
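The assertions above encode the VersionedPath naming scheme: 'hola.1.txt' is version 1 of basename 'hola', while a non-numeric penultimate chunk (as in 'pl_illumina.sm_rp_75_59_uc82.sfastq') stays part of the basename. A hedged sketch of the name-splitting step only (the helper name is made up, not franklin's API):

import re

def _split_versioned_fname(fname):
    'Split "hola.1.txt" into ("hola", 1, "txt"); version is None when absent.'
    match = re.match(r'^(?P<base>.+?)(?:\.(?P<ver>\d+))?\.(?P<ext>[^.]+)$', fname)
    base, ver, ext = match.group('base'), match.group('ver'), match.group('ext')
    return base, (int(ver) if ver is not None else None), ext

assert _split_versioned_fname('hola.1.txt') == ('hola', 1, 'txt')
assert _split_versioned_fname('hola.txt') == ('hola', None, 'txt')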
Example #9
def b2gpipe_runner(blast, annot_fpath, b2gpipe_bin, prop_fpath, dat_fpath=None, java_conf=None):
    "It runs b2gpipe"
    tempdir = NamedTemporaryDir()
    out_basename = os.path.join(tempdir.name, "out")

    cmd = java_cmd(java_conf)
    cmd.extend(["-jar", b2gpipe_bin, "-in", blast.name, "-out", out_basename, "-a", "-prop", prop_fpath])

    if dat_fpath:
        cmd.append("-d")
    logger = logging.getLogger("franklin")
    logger.info("Running blast2go: %s" % " ".join(cmd))

    call(cmd, raise_on_error=True, add_ext_dir=False)
    shutil.move(out_basename + ".annot", annot_fpath)
    if dat_fpath:
        shutil.move(out_basename + ".dat", dat_fpath)
    tempdir.close()
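b2gpipe_runner relies on a java_cmd helper to start the JVM. A plausible minimal sketch, assuming a java_conf dict with an optional 'java_memory' entry in megabytes (the real franklin helper may accept more options):

def java_cmd(java_conf):
    'Build the base java invocation; java_conf is an optional settings dict.'
    cmd = ['java']
    if java_conf and java_conf.get('java_memory'):
        # e.g. {'java_memory': 2048} -> -Xmx2048m
        cmd.append('-Xmx%dm' % java_conf['java_memory'])
    return cmd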
Example #10
    def test_cdna_intron_annotation_analysis(self):
        "We can annotate introns"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        blast_db_path = os.path.join(TEST_DATA_DIR, "blast")
        genomic_db = os.path.join(blast_db_path, "tomato_genome2+")
        config = {
            "Annotation": {"Cdna_intron_annotation": {"genomic_db": genomic_db, "genomic_seq_file": genomic_db}},
            "General_settings": {"threads": THREADS},
        }
        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=config)
        project_dir = join(test_dir.name, project_name)
        seq = "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
        seq += "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
        seq += "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
        seq += "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
        seq += "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
        seq += "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
        seq += "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
        seq += "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
        seq += "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
        seq += "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
        seq += "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
        annot_input_dir = join(project_dir, "annotations", "input")
        os.makedirs(annot_input_dir)

        # create some seqs to annotate
        fasta = ">seq\n%s\n" % seq
        fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
        fhand.write(fasta)
        fhand.close()
        do_analysis(project_settings=settings_path, kind="annotate_introns", silent=True)
        pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "seqs.0.pickle")
        assert "intron" in open(pickle_fpath).read()

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(project_dir, "annotations", "features", "stats", "seqs.txt")
        result = open(stats_fpath).read()
        expected = """Sequences with intron: 1
Number of introns: 3"""
        assert expected in result

        test_dir.close()
Example #11
    def test_description_annotation_analysis(self):
        "We can annotate with description"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        arab_blastdb = join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
        config = {
            "blast": {"arabidopsis": {"path": arab_blastdb, "species": "arabidopsis"}},
            "Annotation": {"description_annotation": {"description_databases": ["arabidopsis"]}},
            "General_settings": {"threads": THREADS},
        }

        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=config)
        project_dir = join(test_dir.name, project_name)

        # some melon file to annotate
        input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
        os.makedirs(input_dir)
        seq_ = "AGGTGTCACCGTTCACGAGGGCGACTGGGACTCCCACGGGGCCATCAAGTCCTGGAACTACA"
        seq_ += "CATGCGGTCCTCTATCTCATTCTCTATTTGTATGAATATGTGTTTATTACTAGCTAGGGTTT"
        seq_ += "CTATTAATGAAAGGTTCATGTAAATATATGAAGATGGGAAGCAAGAGGTGTTCAAGGAGAAG"
        seq_ += "AGGGAGTTAGACGACCAGAAGAT"
        seq1 = SeqWithQuality(Seq(seq_), id="CUTC021854")
        seq2 = SeqWithQuality(Seq("Atagtagcatcagatgagcatcgacttctagctagctagct"), id="CUTC021853")
        write_seqs_in_file([seq1, seq2], open(join(input_dir, "melon.st_nucl.pl_454.fasta"), "a"))

        do_analysis(project_settings=settings_path, kind="annotate_descriptions", silent=True)

        repr_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "melon.st_nucl.pl_454.0.pickle")
        result = open(repr_fpath).read()
        # print result
        assert "yet another one" in result

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(project_dir, "annotations", "features", "stats", "melon.st_nucl.pl_454.txt")
        result = open(stats_fpath).read()
        expected = """Annotation statistics
---------------------
Number of sequences: 2
Sequences with description: 1"""
        assert expected in result

        test_dir.close()
Example #12
def b2gpipe_runner(blast, annot_fpath, b2gpipe_bin, prop_fpath, dat_fpath=None,
                   java_conf=None):
    'It runs b2gpipe'
    tempdir = NamedTemporaryDir()
    out_basename = os.path.join(tempdir.name, 'out')

    cmd = java_cmd(java_conf)
    cmd.extend(['-jar', b2gpipe_bin, '-in', blast.name, '-out', out_basename,
                 '-a', '-prop', prop_fpath])

    if dat_fpath:
        cmd.append('-d')
    logger = logging.getLogger('franklin')
    logger.info('Running blast2go: %s' % ' '.join(cmd))

    call(cmd, raise_on_error=True, add_ext_dir=False)
    shutil.move(out_basename + '.annot', annot_fpath)
    if dat_fpath:
        shutil.move(out_basename + '.dat', dat_fpath)
    tempdir.close()
    def test_gmap_mapper(self):
        'It tests the gmap mapper'
        mappers_dir = join(TEST_DATA_DIR, 'mappers')
        gmap_dir = join(TEST_DATA_DIR, 'mappers', 'gmap')
        work_dir = NamedTemporaryDir()
        temp_genome = join(work_dir.name, 'genome.fa')
        os.symlink(join(mappers_dir, 'genome.fa'), temp_genome)

        reads_fpath = join(gmap_dir, 'lb_lib1.pl_sanger.sm_sam1.fa')

        out_bam_fhand = NamedTemporaryFile(suffix='.bam')
        parameters = {'threads':None, 'kmer':13}
        map_reads_with_gmap(temp_genome, reads_fpath, out_bam_fhand.name,
                            parameters)

        sam_fhand = NamedTemporaryFile(suffix='.sam')
        bam2sam(out_bam_fhand.name, sam_fhand.name, header=True)
        result = open(sam_fhand.name).read()
        assert exists(out_bam_fhand.name)
        assert '36M2I204M' in result
        assert 'SN:SL2.30ch00' in result
        assert 'seq9_rev_MOD' in result

        work_dir.close()
        out_bam_fhand.close()
        sam_fhand.close()

        work_dir = NamedTemporaryDir()
        temp_genome = join(work_dir.name, 'genome.fa')
        os.symlink(join(mappers_dir, 'genome.fa'), temp_genome)

        reads_fpath = join(gmap_dir, 'lb_lib1.pl_sanger.sm_sam1.sfastq')
        out_bam_fhand = NamedTemporaryFile(suffix='.bam')
        unmapped_fhand = StringIO.StringIO()
        parameters = {'threads':None, 'kmer':13,
                      'unmapped_fhand':unmapped_fhand}
        map_reads_with_gmap(temp_genome, reads_fpath, out_bam_fhand.name,
                            parameters)

        sam_fhand = NamedTemporaryFile(suffix='.sam')
        bam2sam(out_bam_fhand.name, sam_fhand.name, header=True)
        result = open(sam_fhand.name).read()
        assert exists(out_bam_fhand.name)
        assert '36M2I204M' in result
        assert 'SN:SL2.30ch00' in result
        assert 'seq9_rev_MOD' in result
        assert '?????????????????' in result
        work_dir.close()
        out_bam_fhand.close()
        sam_fhand.close()
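Like sam2bam above, the bam2sam helper used by this test can be sketched as a samtools wrapper; the header flag mirrors the call in the test (an assumption, not franklin's actual implementation):

def bam2sam(bam_fpath, sam_fpath, header=False):
    'A minimal sketch: convert a BAM file to SAM with samtools view.'
    cmd = ['samtools', 'view']
    if header:
        cmd.append('-h')  # include the SAM header in the output
    cmd.extend(['-o', sam_fpath, bam_fpath])
    call(cmd, raise_on_error=True)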
    def test_cleaning_analysis(self):
        'We can clean the reads'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'
        project_dir = join(test_dir.name, project_name)
        adaptors_dir = join(project_dir, 'config_data', 'adaptors')
        adaptors_path_454 = join(adaptors_dir, '454_adaptors')
        words = ['^ATGAAC', 'TTGATTTGGT']
        univec = os.path.join(TEST_DATA_DIR, 'blast', 'univec+')
        configuration = {'Cleaning': {'vector_database': univec,
                                     'adaptors_file_454': adaptors_path_454,
                                     'short_adaptors_454': words,
                                     'edge_removal': {'454_left': 3,
                                                      '454_right': 3}},
                         'General_settings': {'threads': THREADS}}

        settings_path = create_project(directory=test_dir.name,
                                       name=project_name,
                                       configuration=configuration)

        #setup the original reads
        reads_dir = join(project_dir, 'reads')
        original_reads_dir = join(reads_dir, 'raw')
        os.mkdir(reads_dir)
        os.mkdir(original_reads_dir)

        os.makedirs(adaptors_dir)
        adap_fhand = open(adaptors_path_454, 'w')
        adap_fhand.write('''>smart_5_cds_primer_1
GGTTCAAGGTTTGAGAAAGGATGGGAAG\n''')
        adap_fhand.close()

        #print original_reads_dir
        fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
        fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
        fpath_solid = join(original_reads_dir, 'pl_solid.lb_prueba.sfastq')

        open(fpath_solid, 'w').write(READS_SOLID)
        open(fpath_454, 'w').write(READS_454)
        open(fpath_ill, 'w').write(READS_ILL)

        do_analysis(project_settings=settings_path, kind='clean_reads',
                    silent=True)
        cleaned_dir = join(project_dir, 'reads', 'cleaned')
        assert exists(cleaned_dir)
        cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
        assert exists(cleaned_454)
        seqs = list(seqs_in_file(open(cleaned_454)))
        # It means that the adaptor has been removed
        seq = seqs[0].seq
        assert 'GGTTCAAGGTTTGAGAAAGGATGGGAAG' not in seq

        seq = seqs[2].seq
        # It means that the starting word has been removed
        assert seq.startswith('TTCCAAGATTCTTCCCACAT')

        # solid
        cleaned_solid = join(cleaned_dir, os.path.basename(fpath_solid))
        clean_seqs = open(cleaned_solid).read()
        assert '10_1824_570_F3' not in clean_seqs

        do_analysis(project_settings=settings_path,
                    kind='prepare_mira_assembly', silent=True)
        assembly_input = join(project_dir, 'assembly', 'input')
        assert exists(assembly_input)
        mira_in_454 = join(assembly_input, 'backbone_in.454.fasta')
        mira_in_qul = join(assembly_input, 'backbone_in.454.fasta.qual')
        assert exists(mira_in_454)
        assert exists(mira_in_qul)

        do_analysis(project_settings=settings_path, kind='mira_assembly',
                    silent=True)
        assembly_dir = join(project_dir, 'assembly')
        sorted(os.listdir(assembly_dir))
        test_dir.close()
Example #15
def test_backbone(analysis=None, analysis_dir=None):
    '''It tests the backbone infrastructure.

    If no analysis is given it will run all of them.
    If no analysis_dir is given a temporary one will be used.
    '''
    logger = logging.getLogger('franklin')
    if analysis_dir:
        analysis_fhand = None
        analysis_fpath = analysis_dir
    else:
        analysis_fhand = NamedTemporaryDir()
        analysis_fpath = analysis_fhand.name

    project_dir = analysis_fpath
    repository_dir = join(TEST_DATA_DIR, 'acceptance')
    settings_path = prepare_conf(project_dir, repository_dir)
    choice = analysis
    #choice = 'snvs'
    if choice in ('cleaning', None):
        original_reads = join(project_dir, 'reads/raw')
        if exists(original_reads):
            shutil.rmtree(original_reads)
        reads = join(project_dir, 'reads')
        if not exists(reads):
            os.mkdir(reads)
        shutil.copytree(join(repository_dir, 'cleaning'),
                        join(project_dir, 'reads/raw'))
        analyses = ['clean_reads', 'read_stats']
        run_analysis(analyses, settings_path)

    if choice in ('assembling', None):
        clean_reads_dir = join(project_dir, 'reads', 'cleaned')
        if os.path.exists(clean_reads_dir):
            shutil.rmtree(join(project_dir, 'reads'))
        os.mkdir(join(project_dir, 'reads'))
        shutil.copytree(join(repository_dir, 'assembling'),
                        join(project_dir, 'reads/cleaned'))

        analyses = ['prepare_mira_assembly', 'mira_assembly']
        run_analysis(analyses, settings_path)

    if choice in ('mapping', None):
        clean_reads_dir = join(project_dir, 'reads', 'cleaned')
        if os.path.exists(clean_reads_dir):
            shutil.rmtree(join(project_dir, 'reads'))
        os.mkdir(join(project_dir, 'reads'))
        shutil.copytree(join(repository_dir, 'assembling'),
                        join(project_dir, 'reads/cleaned'))
        if exists(join(project_dir, 'mapping')):
            shutil.rmtree(join(project_dir, 'mapping'))
        os.makedirs(join(project_dir, 'mapping', 'reference'))
        shutil.copy(join(repository_dir, 'mapping', 'reference.fasta'),
                    join(project_dir, 'mapping', 'reference',
                         'reference.fasta'))

        analyses = ['mapping', 'merge_bams', 'realign_bam']
        run_analysis(analyses, settings_path)

    if choice in ('snvs', None):
        annot_dir = join(project_dir, 'annotations')
        create_dir(annot_dir)
        annot_res = join(annot_dir, 'repr')
        os.mkdir(join(annot_dir, 'input'))
        os.mkdir(annot_res)
        shutil.copy(join(repository_dir, 'snvs', 'reference.fasta'),
                    join(annot_dir, 'input', 'reference.fasta'))

        mapping_dir = join(project_dir, 'mapping')
        create_dir(mapping_dir)
        os.mkdir(join(mapping_dir, 'reference'))
        shutil.copy(join(repository_dir, 'snvs', 'merged.bam'),
                   join(project_dir, 'mapping', 'merged.bam'))
        shutil.copy(join(repository_dir, 'snvs', 'reference.fasta'),
                   join(project_dir, 'mapping', 'reference', 'reference.fasta'))
        analyses = ['annotate_snvs', 'filter_snvs', 'annotation_stats',
                    'write_annotations']
        run_analysis(analyses, settings_path)

        stats_fpath = join(project_dir, 'annotations', 'features', 'stats',
                            'reference.txt')
        result = open(stats_fpath).read()

        #print  result
        expected = '''Sequences with SNVs: 47
SNVs found: 186
SNV types:
\tinsertion: 2
\tdeletion: 12
\tcomplex: 1
\ttransition: 45
\ttransversion: 14
\tunknown: 112
SNV locations:
\tunknown: 186'''
        assert expected in result

    if choice in ('annotation', None):
        annot_dir = join(project_dir, 'annotations')
        if exists(join(annot_dir)):
            shutil.rmtree(annot_dir)
        os.mkdir(annot_dir)
        shutil.copytree(join(repository_dir, 'annotation', 'input'),
                        join(annot_dir, 'input'))
        shutil.copytree(join(repository_dir, 'annotation', 'blast'),
                        join(annot_dir, 'blast'))

        analyses = ['annotate_orfs', 'annotate_microsatellites',
                    'annotate_gos', 'annotate_descriptions',
                    'annotate_orthologs', 'annotate_introns',
                    'annotate_prot_change',
                    'write_annotations', 'annotation_stats']
        run_analysis(analyses, settings_path)

        stats_fpath = join(project_dir, 'annotations', 'features', 'stats',
                            'tair7_cdna.st_nucl.txt')
        result = open(stats_fpath).read()
        expected = '''Number of sequences: 4
Sequences with description: 4
Sequences with ORF: 4
Number of ORFs: 4
Sequences with intron: 2
Number of introns: 3'''
        assert expected in result

    if not analysis_dir:
        analysis_fhand.close()
Example #16
    def test_mapping_color(self):
        'It tests the mapping with color space reads'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'

        blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')

        snv_filters = {'filter1':{'name':'uniq_contiguous', 'use':True,
                                  'genomic_db':blastdb_seq,
                                  'genomic_seqs_fpath':blastdb_seq},

                       'filter7':{'name':'by_kind', 'use':True,
                                  'kind':'SNP'},
                       'filter12':{'name':'ref_not_in_list', 'use':True,
                                'list_path':os.path.join(TEST_DATA_DIR, 'cos_list')},
                       'filter10':{'unique_name': 'variable_in_sm',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['hola']},
                       'filter11':{'unique_name': 'variable_in_adios',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['adios']},
                       'filter13':{'unique_name': 'variable_in_caracola',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['caracola']}, }

        configuration = {'Snvs':{'min_quality':20},
                         'Sam_processing':{'add_default_qualities':True},
                         'snv_filters':snv_filters,
                         'General_settings':{'threads':THREADS}}

        settings_path = create_project(directory=test_dir.name,
                                       name=project_name,
                                       configuration=configuration)
        project_dir = join(test_dir.name, project_name)
        #setup the original reads
        reads_dir = join(project_dir, 'reads')
        clean_reads_dir = join(reads_dir, 'cleaned')
        os.mkdir(reads_dir)
        os.mkdir(clean_reads_dir)
        shutil.copy(os.path.join(TEST_DATA_DIR, 'solid.fastq'),
               os.path.join(clean_reads_dir, 'pl_solid.lb_hola.sm_hola.sfastq'))

        #the reference
        reference_dir = join(project_dir, 'mapping/reference')
        os.makedirs(reference_dir)
        reference_fpath = join(reference_dir, 'reference.fasta')
        out = open(reference_fpath, 'w')
        for line in open(join(TEST_DATA_DIR, 'samtools_color/reference')):
            out.write(line)
        out.close()

        do_analysis(project_settings=settings_path, kind='mapping', silent=True)
        mapping_dir = join(project_dir, 'mapping')
        singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
        singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
        assert exists(join(singular_mapping_dir, 'bams',
                            'by_readgroup', 'pl_solid.lb_hola.sm_hola.bam'))
        result_dir = join(mapping_dir, 'bams')
        assert exists(result_dir)
        result_dir_by_lib = join(result_dir, 'by_readgroup')
        assert exists(result_dir_by_lib)

        do_analysis(project_settings=settings_path, kind='merge_bams',
                    silent=True)
        assert exists(join(result_dir, 'merged.0.bam'))
        assert exists(join(result_dir, 'merged.0.bam.bai'))

        #we realign the mapping using GATK
        do_analysis(project_settings=settings_path, kind='realign_bam',
                    silent=True)
        assert exists(join(result_dir, 'merged.1.bam'))

        test_dir.close()
Example #17
    def test_ortholog_annotation_analysis(self):
        "We can annotate orthologs"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"

        config = {
            "blast": {
                "arabidopsis": {"path": "/path/to/tair", "species": "arabidopsis", "kind": "nucl"},
                "arabidopsis2": {"path": "/path/to/tair2", "species": "arabidopsis2", "kind": "nucl"},
            },
            "Annotation": {"ortholog_annotation": {"ortholog_databases": ["arabidopsis", "arabidopsis2"]}},
            "General_settings": {"threads": THREADS},
        }

        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=config)
        project_dir = join(test_dir.name, project_name)

        # create blast results
        melon_tair_blastdir = join(project_dir, "annotations", "blast", "melon.st_nucl.pl_454", "tair")
        melon_tair2_blastdir = join(project_dir, "annotations", "blast", "melon.st_nucl.pl_454", "tair2")
        os.makedirs(melon_tair_blastdir)
        os.makedirs(melon_tair2_blastdir)
        tair_melon_blastdir = join(project_dir, "annotations", "blast", "tair", "melon.st_nucl.pl_454")
        tair2_melon_blastdir = join(project_dir, "annotations", "blast", "tair2", "melon.st_nucl.pl_454")
        os.makedirs(tair_melon_blastdir)
        os.makedirs(tair2_melon_blastdir)
        blast_fname = BACKBONE_BASENAMES["blast_basename"] + ".tblastx.xml"
        shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"), join(melon_tair_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"), join(melon_tair2_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"), join(tair_melon_blastdir, blast_fname))
        shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"), join(tair2_melon_blastdir, blast_fname))

        # some melon file to annotate
        input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
        os.makedirs(input_dir)
        seq1 = SeqWithQuality(Seq("A"), id="melon1")
        seq2 = SeqWithQuality(Seq("A"), id="melon2")
        write_seqs_in_file([seq1, seq2], open(join(input_dir, "melon.st_nucl.pl_454.fasta"), "a"))

        do_analysis(project_settings=settings_path, kind="annotate_orthologs", silent=True)
        pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "melon.st_nucl.pl_454.0.pickle")
        pickle = open(pickle_fpath).read()
        assert "arabidopsis-orthologs" in pickle
        assert "arabidopsis2-orthologs" in pickle

        do_analysis(project_settings=settings_path, kind="write_annotations", silent=True)

        orf_fpath = join(project_dir, "annotations", "features", "melon.st_nucl.pl_454.orthologs")
        assert os.path.exists(orf_fpath)
        assert "tair1" in open(orf_fpath).read()

        orf_fpath = join(project_dir, "annotations", "features", "melon.st_nucl.pl_454.orf")
        assert not os.path.exists(orf_fpath)

        do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True)
        stats_fpath = join(project_dir, "annotations", "features", "stats", "melon.st_nucl.pl_454.txt")
        result = open(stats_fpath).read()
        expected = """Orthologs
_________
Sequences with arabidopsis orthologs: 2
Number of arabidopsis orthologs: 2
Sequences with arabidopsis2 orthologs: 2
Number of arabidopsis2 orthologs: 2"""

        assert expected in result

        test_dir.close()
Example #18
    def test_protein_change_annotation_analysis(self):
        "We can annotate protein changes"
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        matrix = os.path.join(TEST_DATA_DIR, "At.smat")
        configuration = {
            "Snvs": {"min_quality": 20},
            "Sam_processing": {"add_default_qualities": True},
            "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
            "General_settings": {"threads": THREADS},
        }

        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=configuration)
        project_dir = join(test_dir.name, project_name)
        # setup the original reads
        reads_dir = join(project_dir, "reads")
        clean_reads_dir = join(reads_dir, "cleaned")
        os.mkdir(reads_dir)
        os.mkdir(clean_reads_dir)

        solexa = "@seq1\n"
        solexa += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq2\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq14\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq15\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq12\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq13\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa = "@seq16\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

        solexa2 = "@seq18\n"
        solexa2 += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq19\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq20\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq21\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq22\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq23\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq24\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq25\n"
        solexa2 += "ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

        sanger = ">seq3\n"
        sanger += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
        sanger += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
        sanger += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
        sanger += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
        sanger += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
        sanger += ">seq4\n"
        sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sanger += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        sanger += ">seq5\n"
        sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sanger += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"

        sange2 = ">seq6\n"
        sange2 += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
        sange2 += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
        sange2 += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
        sange2 += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
        sange2 += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
        sange2 += ">seq7\n"
        sange2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sange2 += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sange2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sange2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sange2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        sange2 += ">seq8\n"
        sange2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sange2 += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sange2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sange2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sange2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        fpath_sanger = join(clean_reads_dir, "lb_hola1.pl_sanger.sm_hola.fasta")
        fpath_solexa = join(clean_reads_dir, "lb_hola2.pl_illumina.sm_hola.sfastq")
        open(fpath_sanger, "w").write(sanger)
        open(fpath_solexa, "w").write(solexa)

        fpath_sanger2 = join(clean_reads_dir, "lb_adios.pl_sanger.fasta")
        fpath_solexa2 = join(clean_reads_dir, "lb_adios.pl_illumina.sfastq")
        open(fpath_sanger2, "w").write(sanger2)
        open(fpath_solexa2, "w").write(solexa2)

        # the reference
        reference_dir = join(project_dir, "mapping/reference")
        os.makedirs(reference_dir)
        reference_fpath = join(reference_dir, "reference.fasta")
        out = open(reference_fpath, "w")
        for line in open(join(TEST_DATA_DIR, "blast/arabidopsis_genes")):
            out.write(line)

        do_analysis(project_settings=settings_path, kind="mapping", silent=True)
        mapping_dir = join(project_dir, "mapping")
        singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
        singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
        assert exists(join(singular_mapping_dir, "bams", "by_readgroup", "lb_hola2.pl_illumina.sm_hola.bam"))
        result_dir = join(mapping_dir, "bams")
        assert exists(result_dir)
        result_dir_by_lib = join(result_dir, "by_readgroup")
        assert exists(result_dir_by_lib)

        do_analysis(project_settings=settings_path, kind="merge_bams", silent=True)
        assert exists(join(result_dir, "merged.0.bam"))
        assert exists(join(result_dir, "merged.0.bam.bai"))

        # we realign the mapping using GATK
        do_analysis(project_settings=settings_path, kind="realign_bam", silent=True)
        assert exists(join(result_dir, "merged.1.bam"))

        annot_input_dir = join(project_dir, "annotations", "input")
        os.makedirs(annot_input_dir)
        os.symlink(reference_fpath, join(annot_input_dir, "reference.fasta"))
        do_analysis(project_settings=settings_path, kind="annotate_snvs", silent=True)

        do_analysis(project_settings=settings_path, kind="annotate_orfs", silent=True)

        do_analysis(project_settings=settings_path, kind="annotate_prot_change", silent=True)

        result_file = join(project_dir, "annotations", "db", "reference.2.pickle")

        seqs = list(seqs_in_file(open(result_file)))
        snv = seqs[2].features[0]
        assert snv.qualifiers["protein_change"]["kind"] == "substitution"
        assert snv.qualifiers["protein_change"]["location"] == "codon_1"

        test_dir.close()
Example #19
    def test_mapping_analysis(self):
        'We can map the reads'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'
        bed_fhand = NamedTemporaryFile(suffix='.bed')
        bed_fhand.write('AT1G14930.1\t200\t400\nAT1G55265.1\t100\t300\n')
        bed_fhand.flush()

        blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        snv_filters = {'filter1':{'name':'uniq_contiguous', 'use':True,
                                  'genomic_db':blastdb_seq,
                                  'genomic_seqs_fpath':blastdb_seq},

                       'filter7':{'name':'by_kind', 'use':True,
                                  'kind':'SNP'},
                       'filter12':{'name':'ref_not_in_list', 'use':True,
                                'list_path':os.path.join(TEST_DATA_DIR, 'cos_list')},
                       'filter10':{'unique_name': 'variable_in_sm',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['hola1']},
                       'filter11':{'unique_name': 'variable_in_adios',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['adios']},
                       'filter13':{'unique_name': 'variable_in_caracola',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['hola2']},
                       'filter14':{'name': 'in_segment_bed', 'use':True,
                                   'bed_fpath':bed_fhand.name,
                                   'edge_avoidance':10}}

        configuration = {'Snvs':{'min_quality':20},
                         'Sam_processing':{'add_default_qualities':True},
                         'snv_filters':snv_filters,
                         'General_settings':{'threads':THREADS},
                         'Mappers':{'keep_unmapped_reads_in_bam':False}}

        settings_path = create_project(directory=test_dir.name,
                                       name=project_name,
                                       configuration=configuration)
        project_dir = join(test_dir.name, project_name)
        #setup the original reads
        reads_dir = join(project_dir, 'reads')
        clean_reads_dir = join(reads_dir, 'cleaned')
        os.mkdir(reads_dir)
        os.mkdir(clean_reads_dir)

        solexa = '@seq1\n'
        solexa += 'TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq2\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq14\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq15\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq12\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq13\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq16\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq17\n'
        solexa += 'ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'

        sanger = '>seq3\n'
        sanger += 'GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA'
        sanger += 'GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA'
        sanger += 'TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA'
        sanger += 'TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA'
        sanger += 'AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n'
        sanger += '>seq4\n'
        sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
        sanger += 'CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
        sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
        sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
        sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC\n'
        sanger += '>seq5\n'
        sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
        sanger += 'CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
        sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
        sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
        sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'

        fpath_sanger = join(clean_reads_dir, 'lb_hola1.pl_sanger.sm_hola.fasta')
        fpath_solexa = join(clean_reads_dir,
                            'lb_hola2.pl_illumina.sm_hola.sfastq')
        open(fpath_sanger, 'w').write(sanger)
        open(fpath_solexa, 'w').write(solexa)

        fpath_sanger2 = join(clean_reads_dir, 'lb_adios.pl_sanger.fasta')
        fpath_solexa2 = join(clean_reads_dir,
                             'lb_adios.pl_illumina.sfastq')
        open(fpath_sanger2, 'w').write(sanger)
        open(fpath_solexa2, 'w').write(solexa)

        #the reference
        reference_dir = join(project_dir, 'mapping/reference')
        os.makedirs(reference_dir)
        reference_fpath = join(reference_dir, 'reference.fasta')
        out = open(reference_fpath, 'w')
        for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
            out.write(line)
        out.close()

        do_analysis(project_settings=settings_path, kind='mapping', silent=True)
        mapping_dir = join(project_dir, 'mapping')
        singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
        singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
        assert exists(join(singular_mapping_dir, 'bams',
                            'by_readgroup', 'lb_hola2.pl_illumina.sm_hola.bam'))
        result_dir = join(mapping_dir, 'bams')
        assert exists(result_dir)
        result_dir_by_lib = join(result_dir, 'by_readgroup')
        assert exists(result_dir_by_lib)
        unmapped_fpath = join(mapping_dir, 'unmapped_reads.gz')
        assert exists(unmapped_fpath)
        unmappeds = GzipFile(unmapped_fpath).read()
        assert 'seq17' in unmappeds

        do_analysis(project_settings=settings_path, kind='merge_bams',
                    silent=True)
        assert exists(join(result_dir, 'merged.0.bam'))
        assert exists(join(result_dir, 'merged.0.bam.bai'))

        #we realign the mapping using GATK
        do_analysis(project_settings=settings_path, kind='realign_bam',
                    silent=True)
        assert exists(join(result_dir, 'merged.1.bam'))

        #we calculate BAQ
        do_analysis(project_settings=settings_path, kind='calmd_bam',
                    silent=True)

        assert exists(join(result_dir, 'merged.2.bam'))
        assert exists(join(result_dir, 'merged.2.bam.bai'))

        do_analysis(project_settings=settings_path, kind='mapping_stats',
                    silent=True)
        stats_fname = join(mapping_dir,
                           BACKBONE_DIRECTORIES['mapping_stats'][1],
                           BACKBONE_BASENAMES['statistics_file'])
        result = open(stats_fname).read()
        assert 'Statistics for Coverage for platform sanger' in result

        annot_input_dir = join(project_dir, 'annotations', 'input')
        os.makedirs(annot_input_dir)
        os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
        do_analysis(project_settings=settings_path, kind='annotate_snvs',
                    silent=True)
        json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                          'reference.0.pickle')
        assert 'snv' in open(json_fpath).read()

        do_analysis(project_settings=settings_path, kind='filter_snvs',
                    silent=True)
        json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                          'reference.1.pickle')
        result = open(json_fpath).read()
        #print result
        assert 'snv' in result
        assert 'adios_sanger' in result

        do_analysis(project_settings=settings_path, kind='write_annotations',
                    silent=True)
        vcf_fpath = join(project_dir, 'annotations', 'features',
                         'reference.vcf')
        vcf = open(vcf_fpath).read()

        assert 'VLB1' in vcf
        assert 'VLB2' in vcf
        assert 'VLB3' in vcf
        assert 'AT1G14930.1' in vcf
        assert 'IS10' in vcf

        do_analysis(project_settings=settings_path, kind='mapping_stats',
                    silent=True)
        stats_dir = join(project_dir, 'mapping', 'bams', 'stats')
        assert exists(join(stats_dir, 'backbone.coverage_illumina.dat'))

        stats_fpath = join(stats_dir, BACKBONE_BASENAMES['statistics_file'])
        result = open(stats_fpath).read()
        expected = '''average: 0.4542
variance: 1.3050
total sequence length: 3941'''
        assert expected in result

        test_dir.close()