def main():
    'The script itself'
    # set parameters
    work_dir, output, reference = set_parameters()
    # make a working tempdir
    temp_dir = NamedTemporaryDir()
    # add a readgroup tag to each alignment in the bams
    add_header_and_tags_bams(work_dir, temp_dir.name)
    # prepare the files to merge
    sams = get_opened_sams_from_dir(temp_dir.name)
    temp_sam = NamedTemporaryFile()
    # merge all the sams into one
    merge_sam(sams, temp_sam, reference)
    # convert the sam into a (temporary) bam
    temp_bam = NamedTemporaryFile(suffix='.bam')
    sam2bam(temp_sam.name, temp_bam.name)
    # finally we need to sort the bam
    sort_bam_sam(temp_bam.name, output)
    # and make an index of the bam
    call(['samtools', 'index', output], raise_on_error=True)
    temp_dir.close()
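# A script module like this presumably ends with the usual entry-point guard;
# the snippet does not show it, so this line is an assumption:
if __name__ == '__main__':
    main()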
def test_blast_seq_against_bad_db(self):
    'We can blast a seq file against a database'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    create_project(directory=test_dir.name, name=project_name)
    project_dir = join(test_dir.name, project_name)

    # some query fasta file
    query = '>seq1\nGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
    query += 'CATACCCCTGCCGAACCGCTTTTGTCA\n'
    query_fhand = NamedTemporaryFile(mode='w')
    query_fhand.write(query)
    query_fhand.flush()

    # the blast db
    blast_db_fname = 'uni'
    blast_db = join(TEST_DATA_DIR, 'blast', blast_db_fname)
    blast_program = 'blastn'
    try:
        backbone_blast_runner(query_fpath=query_fhand.name,
                              project_dir=project_dir,
                              blast_program=blast_program,
                              blast_db=blast_db)
        self.fail('RuntimeError expected')
    except RuntimeError:
        pass
    test_dir.close()
def test_blast_seq_against_db():
    'We can blast a seq file against a database'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    create_project(directory=test_dir.name, name=project_name)
    project_dir = join(test_dir.name, project_name)

    # some query fasta file
    query = '>seq1\nGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
    query += 'CATACCCCTGCCGAACCGCTTTTGTCA\n'
    query_fhand = NamedTemporaryFile(mode='w')
    query_fhand.write(query)
    query_fhand.flush()

    # the blast db
    blast_db_fname = 'univec+'
    blast_db = join(TEST_DATA_DIR, 'blast', blast_db_fname)
    blast_program = 'blastn'
    backbone_blast_runner(query_fpath=query_fhand.name,
                          project_dir=project_dir,
                          blast_program=blast_program,
                          blast_db=blast_db)

    # is the blast ok?
    blast_fpath = join(project_dir,
                       BACKBONE_DIRECTORIES['blast_dir'],
                       _get_basename(query_fhand.name),
                       blast_db_fname,
                       '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                      blast_program))
    assert '<Hit_def>vec1</Hit_def>' in open(blast_fpath).read()
    test_dir.close()
def test_gmap_mapper():
    'It tests the gmap mapper'
    mappers_dir = join(TEST_DATA_DIR, 'mappers')
    gmap_dir = join(TEST_DATA_DIR, 'mappers', 'gmap')
    work_dir = NamedTemporaryDir()
    temp_genome = join(work_dir.name, 'genome.fa')
    os.symlink(join(mappers_dir, 'genome.fa'), temp_genome)
    reads_fpath = join(gmap_dir, 'lb_lib1.pl_sanger.sm_sam1.fa')
    out_bam_fhand = NamedTemporaryFile(suffix='.bam')
    parameters = {'threads': None, 'kmer': 13}
    map_reads_with_gmap(temp_genome, reads_fpath, out_bam_fhand.name,
                        parameters)
    sam_fhand = NamedTemporaryFile(suffix='.sam')
    bam2sam(out_bam_fhand.name, sam_fhand.name, header=True)
    result = open(sam_fhand.name).read()
    assert exists(out_bam_fhand.name)
    assert '36M2I204M' in result
    assert 'SN:SL2.30ch00' in result
    assert 'seq9_rev_MOD' in result
    work_dir.close()
    out_bam_fhand.close()
    sam_fhand.close()

    # the same mapping, now from a fastq input and collecting unmapped reads
    work_dir = NamedTemporaryDir()
    temp_genome = join(work_dir.name, 'genome.fa')
    os.symlink(join(mappers_dir, 'genome.fa'), temp_genome)
    reads_fpath = join(gmap_dir, 'lb_lib1.pl_sanger.sm_sam1.sfastq')
    out_bam_fhand = NamedTemporaryFile(suffix='.bam')
    unmapped_fhand = StringIO.StringIO()
    parameters = {'threads': None, 'kmer': 13,
                  'unmapped_fhand': unmapped_fhand}
    map_reads_with_gmap(temp_genome, reads_fpath, out_bam_fhand.name,
                        parameters)
    sam_fhand = NamedTemporaryFile(suffix='.sam')
    bam2sam(out_bam_fhand.name, sam_fhand.name, header=True)
    result = open(sam_fhand.name).read()
    assert exists(out_bam_fhand.name)
    assert '36M2I204M' in result
    assert 'SN:SL2.30ch00' in result
    assert 'seq9_rev_MOD' in result
    assert '?????????????????' in result
    work_dir.close()
    out_bam_fhand.close()
    sam_fhand.close()
def test_orf_annotation_analysis():
    "We can annotate orfs"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    matrix = os.path.join(TEST_DATA_DIR, "At.smat")
    config = {
        "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
        "General_settings": {"threads": THREADS},
    }
    settings_path = create_project(directory=test_dir.name, name=project_name,
                                   configuration=config)
    project_dir = join(test_dir.name, project_name)
    seq = "CTACTTACTAGCTTTAGTAAATCCTTCTAACCCTCGGTAAAAAAAAAAAAGAGGCATCAAATG"
    seq += "GCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT"
    seq += "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC"
    seq += "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA"
    seq += "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA"
    seq += "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT"
    seq += "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTACTGGACCATG"
    seq += "TGGAAGCTGCCCATGTTTGGCTGCACCGAT"
    annot_input_dir = join(project_dir, "annotations", "input")
    os.makedirs(annot_input_dir)
    # create some seqs to annotate
    fasta = ">seq\n%s\n" % seq
    fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
    fhand.write(fasta)
    fhand.close()

    do_analysis(project_settings=settings_path, kind="annotate_orfs",
                silent=True)
    repr_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                      "seqs.0.pickle")
    result = open(repr_fpath).read()
    assert "orf" in result

    do_analysis(project_settings=settings_path, kind="write_annotations",
                silent=True)
    seq_fpath = join(project_dir, "annotations", "features",
                     "seqs.orf_seq.fasta")
    pep_fpath = join(project_dir, "annotations", "features",
                     "seqs.orf_pep.fasta")
    assert "ATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCT" in open(seq_fpath).read()
    assert "QASMGAPFTGLKSAAAFPVTRXTNDITTLVSNG" in open(pep_fpath).read()

    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                       "seqs.txt")
    result = open(stats_fpath).read()
    expected = """Sequences with ORF: 1
Number of ORFs: 1"""
    assert expected in result
    test_dir.close()
def test_simple_named_temporary_dir():
    'It tests the temporary named dir'
    temp_dir = NamedTemporaryDir()
    dir_name = temp_dir.name
    assert os.path.exists(dir_name)
    temp_dir.close()
    assert not os.path.exists(dir_name)

    temp_dir = NamedTemporaryDir()
    dir_name = temp_dir.name
    fhand = open(os.path.join(dir_name, 'peio'), 'w')
    assert os.path.exists(fhand.name)
    assert os.path.exists(dir_name)
    del temp_dir
    assert not os.path.exists(dir_name)
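# The assertions above pin down the NamedTemporaryDir contract: .name points
# to an existing directory, and both close() and garbage collection remove it.
# A minimal sketch of a class meeting that contract follows; the real class
# ships with franklin, so this version is illustrative only.
import os
import shutil
import tempfile


class NamedTemporaryDir(object):
    'A temporary directory removed on close() or garbage collection.'

    def __init__(self):
        self._name = tempfile.mkdtemp()

    @property
    def name(self):
        return self._name

    def close(self):
        'It removes the directory if it still exists.'
        if os.path.exists(self._name):
            shutil.rmtree(self._name)

    def __del__(self):
        self.close()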
def test_create_project():
    'We can create a project'
    test_dir = NamedTemporaryDir()
    settings_path = create_project(directory=test_dir.name, name='backbone')
    assert settings_path == join(test_dir.name, 'backbone',
                                 BACKBONE_DIRECTORIES['config_file'])
    settings = create_configuration(settings_path)
    assert settings['General_settings']['project_name'] == 'backbone'
    project_path = join(test_dir.name, 'backbone')
    assert settings['General_settings']['project_path'] == project_path
    assert settings['Cleaning']['strip_n_percent'] == 2.0
    content = open(settings_path).read()
    assert 'strip_n_percent' in content
    test_dir.close()
def test_basic_functionality(self):
    'VersionedPath basic functionality'
    tempdir = NamedTemporaryDir()
    tempdir_name = tempdir.name
    fnames = ['hola.txt', 'hola.0.txt', 'hola.1.txt',
              'adios.txt', 'adios.1.txt', 'foo.txt']
    for fname in fnames:
        open(os.path.join(tempdir.name, fname), 'w')
    path_str = os.path.join(tempdir.name, 'hola.txt')
    path = VersionedPath(path_str)
    assert str(path) == path_str

    path_str_0 = os.path.join(tempdir.name, 'hola.0.txt')
    path = VersionedPath(path_str_0)
    assert str(path) == path_str
    assert path.basename == 'hola'
    assert path.directory == tempdir.name
    assert path.extension == 'txt'

    assert path.last_version == VersionedPath(os.path.join(tempdir_name,
                                                           'hola.1.txt'))
    assert path.next_version == VersionedPath(os.path.join(tempdir_name,
                                                           'hola.2.txt'))

    fpaths = [os.path.join(tempdir_name, fname)
              for fname in ('hola.1.txt', 'adios.1.txt', 'foo.txt')]
    expected_paths = set(fpaths)
    versioned_paths = set(path.list_fpaths_versioned())
    assert versioned_paths == expected_paths

    path = VersionedPath(tempdir_name)
    versioned_paths = set(path.list_fpaths_versioned())
    assert versioned_paths == expected_paths
    tempdir.close()

    # in an empty dir
    path = VersionedPath('hola.txt')
    assert path.last_version.endswith('hola.txt')
    assert path.next_version.endswith('hola.0.txt')

    path = VersionedPath('pl_illumina.sm_rp_75_59_uc82.sfastq')
    assert path.last_version.endswith('pl_illumina.sm_rp_75_59_uc82.sfastq')
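# The assertions above imply a naming convention: a purely numeric token just
# before the extension is the version ('hola.1.txt' -> version 1), while a
# non-numeric token ('sm_rp_75_59_uc82') stays part of the basename. A minimal
# sketch of that parsing rule; parse_versioned_fname is an illustrative name,
# not franklin's actual helper.
import re


def parse_versioned_fname(fname):
    'It returns (basename, version, extension); version is None if absent.'
    match = re.match(r'^(?P<base>.+?)(?:\.(?P<version>\d+))?\.(?P<ext>[^.]+)$',
                     fname)
    if match is None:
        return fname, None, None
    version = match.group('version')
    version = int(version) if version is not None else None
    return match.group('base'), version, match.group('ext')

# parse_versioned_fname('hola.1.txt') -> ('hola', 1, 'txt')
# parse_versioned_fname('pl_illumina.sm_rp_75_59_uc82.sfastq') ->
#     ('pl_illumina.sm_rp_75_59_uc82', None, 'sfastq')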
def test_microsatellite_annotation_analysis():
    "We can annotate microsatellites"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    settings_path = create_project(
        directory=test_dir.name, name=project_name,
        configuration={"General_settings": {"threads": THREADS}}
    )
    project_dir = join(test_dir.name, project_name)
    seq = "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
    seq += "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
    seq += "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
    seq += "GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
    seq += "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
    seq += "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
    seq += "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
    seq += "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
    seq += "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
    seq += "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
    seq += "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
    seq += "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
    annot_input_dir = join(project_dir, "annotations", "input")
    os.makedirs(annot_input_dir)
    # create some seqs to annotate
    fasta = ">seq\n%s\n" % seq
    fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
    fhand.write(fasta)
    fhand.close()

    do_analysis(project_settings=settings_path,
                kind="annotate_microsatellites", silent=True)
    pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                        "seqs.0.pickle")
    result = open(pickle_fpath).read()
    assert "microsatellite" in result

    do_analysis(project_settings=settings_path, kind="write_annotations",
                silent=True)
    ssr_fpath = join(project_dir, "annotations", "features", "seqs.ssr")
    assert os.path.exists(ssr_fpath)
    assert "Seqname" in open(ssr_fpath).read()

    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                       "seqs.txt")
    result = open(stats_fpath).read()
    expected = "Sequences with microsatellites: 1"
    assert expected in result
    test_dir.close()
def b2gpipe_runner(blast, annot_fpath, b2gpipe_bin, prop_fpath, dat_fpath=None,
                   java_conf=None):
    "It runs b2gpipe"
    tempdir = NamedTemporaryDir()
    out_basename = os.path.join(tempdir.name, "out")
    cmd = java_cmd(java_conf)
    cmd.extend(["-jar", b2gpipe_bin, "-in", blast.name, "-out", out_basename,
                "-a", "-prop", prop_fpath])
    if dat_fpath:
        cmd.append("-d")
    logger = logging.getLogger("franklin")
    logger.info("Running blast2go: %s" % " ".join(cmd))
    call(cmd, raise_on_error=True, add_ext_dir=False)
    shutil.move(out_basename + ".annot", annot_fpath)
    if dat_fpath:
        shutil.move(out_basename + ".dat", dat_fpath)
    tempdir.close()
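# A hypothetical call, assuming a blast XML result and a local blast2go
# installation; every path below is illustrative.
blast_fhand = open("results/blast.xml")
b2gpipe_runner(blast_fhand, annot_fpath="annotations/out.annot",
               b2gpipe_bin="/opt/blast2go/blast2go.jar",
               prop_fpath="/opt/blast2go/b2gPipe.properties",
               dat_fpath="annotations/out.dat")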
def test_cdna_intron_annotation_analysis():
    "We can annotate introns"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    blast_db_path = os.path.join(TEST_DATA_DIR, "blast")
    genomic_db = os.path.join(blast_db_path, "tomato_genome2+")
    config = {
        "Annotation": {"Cdna_intron_annotation": {
            "genomic_db": genomic_db,
            "genomic_seq_file": genomic_db}},
        "General_settings": {"threads": THREADS},
    }
    settings_path = create_project(directory=test_dir.name, name=project_name,
                                   configuration=config)
    project_dir = join(test_dir.name, project_name)
    seq = "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
    seq += "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
    seq += "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
    seq += "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
    seq += "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
    seq += "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
    seq += "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
    seq += "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
    seq += "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
    seq += "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
    seq += "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
    annot_input_dir = join(project_dir, "annotations", "input")
    os.makedirs(annot_input_dir)
    # create some seqs to annotate
    fasta = ">seq\n%s\n" % seq
    fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
    fhand.write(fasta)
    fhand.close()

    do_analysis(project_settings=settings_path, kind="annotate_introns",
                silent=True)
    pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                        "seqs.0.pickle")
    assert "intron" in open(pickle_fpath).read()

    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                       "seqs.txt")
    result = open(stats_fpath).read()
    expected = """Sequences with intron: 1
Number of introns: 3"""
    assert expected in result
    test_dir.close()
def test_description_annotation_analysis():
    "We can annotate with description"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    arab_blastdb = join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
    config = {
        "blast": {"arabidopsis": {"path": arab_blastdb,
                                  "species": "arabidopsis"}},
        "Annotation": {"description_annotation": {
            "description_databases": ["arabidopsis"]}},
        "General_settings": {"threads": THREADS},
    }
    settings_path = create_project(directory=test_dir.name, name=project_name,
                                   configuration=config)
    project_dir = join(test_dir.name, project_name)

    # some melon file to annotate
    input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
    os.makedirs(input_dir)
    seq_ = "AGGTGTCACCGTTCACGAGGGCGACTGGGACTCCCACGGGGCCATCAAGTCCTGGAACTACA"
    seq_ += "CATGCGGTCCTCTATCTCATTCTCTATTTGTATGAATATGTGTTTATTACTAGCTAGGGTTT"
    seq_ += "CTATTAATGAAAGGTTCATGTAAATATATGAAGATGGGAAGCAAGAGGTGTTCAAGGAGAAG"
    seq_ += "AGGGAGTTAGACGACCAGAAGAT"
    seq1 = SeqWithQuality(Seq(seq_), id="CUTC021854")
    seq2 = SeqWithQuality(Seq("Atagtagcatcagatgagcatcgacttctagctagctagct"),
                          id="CUTC021853")
    write_seqs_in_file([seq1, seq2],
                       open(join(input_dir, "melon.st_nucl.pl_454.fasta"),
                            "a"))

    do_analysis(project_settings=settings_path, kind="annotate_descriptions",
                silent=True)
    repr_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                      "melon.st_nucl.pl_454.0.pickle")
    result = open(repr_fpath).read()
    assert "yet another one" in result

    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                       "melon.st_nucl.pl_454.txt")
    result = open(stats_fpath).read()
    expected = """Annotation statistics
---------------------
Number of sequences: 2
Sequences with description: 1"""
    assert expected in result
    test_dir.close()
def test_cleaning_analysis():
    'We can clean the reads'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    project_dir = join(test_dir.name, project_name)
    adaptors_dir = join(project_dir, 'config_data', 'adaptors')
    adaptors_path_454 = join(adaptors_dir, '454_adaptors')
    words = ['^ATGAAC', 'TTGATTTGGT']
    univec = os.path.join(TEST_DATA_DIR, 'blast', 'univec+')
    configuration = {'Cleaning': {'vector_database': univec,
                                  'adaptors_file_454': adaptors_path_454,
                                  'short_adaptors_454': words,
                                  'edge_removal': {'454_left': 3,
                                                   '454_right': 3}},
                     'General_settings': {'threads': THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)

    # setup the original reads
    reads_dir = join(project_dir, 'reads')
    original_reads_dir = join(reads_dir, 'raw')
    os.mkdir(reads_dir)
    os.mkdir(original_reads_dir)
    os.makedirs(adaptors_dir)
    adap_fhand = open(adaptors_path_454, 'w')
    adap_fhand.write('''>smart_5_cds_primer_1
GGTTCAAGGTTTGAGAAAGGATGGGAAG\n''')
    adap_fhand.close()

    fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
    fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
    fpath_solid = join(original_reads_dir, 'pl_solid.lb_prueba.sfastq')
    open(fpath_solid, 'w').write(READS_SOLID)
    open(fpath_454, 'w').write(READS_454)
    open(fpath_ill, 'w').write(READS_ILL)

    do_analysis(project_settings=settings_path, kind='clean_reads',
                silent=True)
    cleaned_dir = join(project_dir, 'reads', 'cleaned')
    assert exists(cleaned_dir)
    cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
    assert exists(cleaned_454)

    seqs = list(seqs_in_file(open(cleaned_454)))
    # it means that the adaptor has been removed
    seq = seqs[0].seq
    assert 'GGTTCAAGGTTTGAGAAAGGATGGGAAG' not in seq
    seq = seqs[2].seq
    # it means that the starting word has been removed
    assert seq.startswith('TTCCAAGATTCTTCCCACAT')

    # solid
    cleaned_solid = join(cleaned_dir, os.path.basename(fpath_solid))
    clean_seqs = open(cleaned_solid).read()
    assert '10_1824_570_F3' not in clean_seqs

    do_analysis(project_settings=settings_path,
                kind='prepare_mira_assembly', silent=True)
    assembly_input = join(project_dir, 'assembly', 'input')
    assert exists(assembly_input)
    mira_in_454 = join(assembly_input, 'backbone_in.454.fasta')
    mira_in_qul = join(assembly_input, 'backbone_in.454.fasta.qual')
    assert exists(mira_in_454)
    assert exists(mira_in_qul)

    do_analysis(project_settings=settings_path, kind='mira_assembly',
                silent=True)
    assembly_dir = join(project_dir, 'assembly')
    sorted(os.listdir(assembly_dir))
    test_dir.close()
def test_backbone(analysis=None, analysis_dir=None):
    '''It tests the backbone infrastructure.

    If no analysis is given it will run all of them.
    If no analysis_dir is given a temporary one will be used.
    '''
    logger = logging.getLogger('franklin')
    if analysis_dir:
        analysis_fhand = None
        analysis_fpath = analysis_dir
    else:
        analysis_fhand = NamedTemporaryDir()
        analysis_fpath = analysis_fhand.name

    project_dir = analysis_fpath
    repository_dir = join(TEST_DATA_DIR, 'acceptance')
    settings_path = prepare_conf(project_dir, repository_dir)
    choice = analysis

    if choice in ('cleaning', None):
        original_reads = join(project_dir, 'reads/raw')
        if exists(original_reads):
            shutil.rmtree(original_reads)
        reads = join(project_dir, 'reads')
        if not exists(reads):
            os.mkdir(reads)
        shutil.copytree(join(repository_dir, 'cleaning'),
                        join(project_dir, 'reads/raw'))
        analyses = ['clean_reads', 'read_stats']
        run_analysis(analyses, settings_path)

    if choice in ('assembling', None):
        clean_reads_dir = join(project_dir, 'reads', 'cleaned')
        if os.path.exists(clean_reads_dir):
            shutil.rmtree(join(project_dir, 'reads'))
        os.mkdir(join(project_dir, 'reads'))
        shutil.copytree(join(repository_dir, 'assembling'),
                        join(project_dir, 'reads/cleaned'))
        analyses = ['prepare_mira_assembly', 'mira_assembly']
        run_analysis(analyses, settings_path)

    if choice in ('mapping', None):
        clean_reads_dir = join(project_dir, 'reads', 'cleaned')
        if os.path.exists(clean_reads_dir):
            shutil.rmtree(join(project_dir, 'reads'))
        os.mkdir(join(project_dir, 'reads'))
        shutil.copytree(join(repository_dir, 'assembling'),
                        join(project_dir, 'reads/cleaned'))
        if exists(join(project_dir, 'mapping')):
            shutil.rmtree(join(project_dir, 'mapping'))
        os.makedirs(join(project_dir, 'mapping', 'reference'))
        shutil.copy(join(repository_dir, 'mapping', 'reference.fasta'),
                    join(project_dir, 'mapping', 'reference',
                         'reference.fasta'))
        analyses = ['mapping', 'merge_bams', 'realign_bam']
        run_analysis(analyses, settings_path)

    if choice in ('snvs', None):
        annot_dir = join(project_dir, 'annotations')
        create_dir(annot_dir)
        annot_res = join(annot_dir, 'repr')
        os.mkdir(join(annot_dir, 'input'))
        os.mkdir(annot_res)
        shutil.copy(join(repository_dir, 'snvs', 'reference.fasta'),
                    join(annot_dir, 'input', 'reference.fasta'))
        mapping_dir = join(project_dir, 'mapping')
        create_dir(mapping_dir)
        os.mkdir(join(mapping_dir, 'reference'))
        shutil.copy(join(repository_dir, 'snvs', 'merged.bam'),
                    join(project_dir, 'mapping', 'merged.bam'))
        shutil.copy(join(repository_dir, 'snvs', 'reference.fasta'),
                    join(project_dir, 'mapping', 'reference',
                         'reference.fasta'))
        analyses = ['annotate_snvs', 'filter_snvs', 'annotation_stats',
                    'write_annotations']
        run_analysis(analyses, settings_path)

        stats_fpath = join(project_dir, 'annotations', 'features', 'stats',
                           'reference.txt')
        result = open(stats_fpath).read()
        expected = '''Sequences with SNVs: 47
SNVs found: 186
SNV types:
\tinsertion: 2
\tdeletion: 12
\tcomplex: 1
\ttransition: 45
\ttransversion: 14
\tunknown: 112
SNV locations:
\tunknown: 186'''
        assert expected in result

    if choice in ('annotation', None):
        annot_dir = join(project_dir, 'annotations')
        if exists(join(annot_dir)):
            shutil.rmtree(annot_dir)
        os.mkdir(annot_dir)
        shutil.copytree(join(repository_dir, 'annotation', 'input'),
                        join(annot_dir, 'input'))
        shutil.copytree(join(repository_dir, 'annotation', 'blast'),
                        join(annot_dir, 'blast'))
        analyses = ['annotate_orfs', 'annotate_microsatellites',
                    'annotate_gos', 'annotate_descriptions',
                    'annotate_orthologs', 'annotate_introns',
                    'annotate_prot_change', 'write_annotations',
                    'annotation_stats']
        run_analysis(analyses, settings_path)

        stats_fpath = join(project_dir, 'annotations', 'features', 'stats',
                           'tair7_cdna.st_nucl.txt')
        result = open(stats_fpath).read()
        expected = '''Number of sequences: 4
Sequences with description: 4
Sequences with ORF: 4
Number of ORFs: 4
Sequences with intron: 2
Number of introns: 3'''
        assert expected in result

    if not analysis_dir:
        analysis_fhand.close()
def test_mapping_color():
    'It tests the mapping with color space'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    snv_filters = {'filter1': {'name': 'uniq_contiguous', 'use': True,
                               'genomic_db': blastdb_seq,
                               'genomic_seqs_fpath': blastdb_seq},
                   'filter7': {'name': 'by_kind', 'use': True,
                               'kind': 'SNP'},
                   'filter12': {'name': 'ref_not_in_list', 'use': True,
                                'list_path': os.path.join(TEST_DATA_DIR,
                                                          'cos_list')},
                   'filter10': {'unique_name': 'variable_in_sm',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['hola']},
                   'filter11': {'unique_name': 'variable_in_adios',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['adios']},
                   'filter13': {'unique_name': 'variable_in_caracola',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['caracola']},
                   }
    configuration = {'Snvs': {'min_quality': 20},
                     'Sam_processing': {'add_default_qualities': True},
                     'snv_filters': snv_filters,
                     'General_settings': {'threads': THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)

    # setup the original reads
    reads_dir = join(project_dir, 'reads')
    clean_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)
    shutil.copy(os.path.join(TEST_DATA_DIR, 'solid.fastq'),
                os.path.join(clean_reads_dir,
                             'pl_solid.lb_hola.sm_hola.sfastq'))

    # the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'samtools_color/reference')):
        out.write(line)
    out.close()

    do_analysis(project_settings=settings_path, kind='mapping', silent=True)
    mapping_dir = join(project_dir, 'mapping')
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                       'pl_solid.lb_hola.sm_hola.bam'))
    result_dir = join(mapping_dir, 'bams')
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, 'by_readgroup')
    assert exists(result_dir_by_lib)

    do_analysis(project_settings=settings_path, kind='merge_bams',
                silent=True)
    assert exists(join(result_dir, 'merged.0.bam'))
    assert exists(join(result_dir, 'merged.0.bam.bai'))

    # we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind='realign_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.1.bam'))
    test_dir.close()
def test_ortholog_annotation_analysis():
    "We can annotate orthologs"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    config = {
        "blast": {
            "arabidopsis": {"path": "/path/to/tair",
                            "species": "arabidopsis", "kind": "nucl"},
            "arabidopsis2": {"path": "/path/to/tair2",
                             "species": "arabidopsis2", "kind": "nucl"},
        },
        "Annotation": {"ortholog_annotation": {
            "ortholog_databases": ["arabidopsis", "arabidopsis2"]}},
        "General_settings": {"threads": THREADS},
    }
    settings_path = create_project(directory=test_dir.name, name=project_name,
                                   configuration=config)
    project_dir = join(test_dir.name, project_name)

    # create blast results
    melon_tair_blastdir = join(project_dir, "annotations", "blast",
                               "melon.st_nucl.pl_454", "tair")
    melon_tair2_blastdir = join(project_dir, "annotations", "blast",
                                "melon.st_nucl.pl_454", "tair2")
    os.makedirs(melon_tair_blastdir)
    os.makedirs(melon_tair2_blastdir)
    tair_melon_blastdir = join(project_dir, "annotations", "blast", "tair",
                               "melon.st_nucl.pl_454")
    tair2_melon_blastdir = join(project_dir, "annotations", "blast", "tair2",
                                "melon.st_nucl.pl_454")
    os.makedirs(tair_melon_blastdir)
    os.makedirs(tair2_melon_blastdir)
    blast_fname = BACKBONE_BASENAMES["blast_basename"] + ".tblastx.xml"
    shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"),
                join(melon_tair_blastdir, blast_fname))
    shutil.copy(join(TEST_DATA_DIR, "melon_tair.xml"),
                join(melon_tair2_blastdir, blast_fname))
    shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"),
                join(tair_melon_blastdir, blast_fname))
    shutil.copy(join(TEST_DATA_DIR, "tair_melon.xml"),
                join(tair2_melon_blastdir, blast_fname))

    # some melon file to annotate
    input_dir = join(project_dir, BACKBONE_DIRECTORIES["annotation_input"])
    os.makedirs(input_dir)
    seq1 = SeqWithQuality(Seq("A"), id="melon1")
    seq2 = SeqWithQuality(Seq("A"), id="melon2")
    write_seqs_in_file([seq1, seq2],
                       open(join(input_dir, "melon.st_nucl.pl_454.fasta"),
                            "a"))

    do_analysis(project_settings=settings_path, kind="annotate_orthologs",
                silent=True)
    pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                        "melon.st_nucl.pl_454.0.pickle")
    pickle = open(pickle_fpath).read()
    assert "arabidopsis-orthologs" in pickle
    assert "arabidopsis2-orthologs" in pickle

    do_analysis(project_settings=settings_path, kind="write_annotations",
                silent=True)
    orthologs_fpath = join(project_dir, "annotations", "features",
                           "melon.st_nucl.pl_454.orthologs")
    assert os.path.exists(orthologs_fpath)
    assert "tair1" in open(orthologs_fpath).read()
    orf_fpath = join(project_dir, "annotations", "features",
                     "melon.st_nucl.pl_454.orf")
    assert not os.path.exists(orf_fpath)

    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                       "melon.st_nucl.pl_454.txt")
    result = open(stats_fpath).read()
    expected = """Orthologs
_________
Sequences with arabidopsis orthologs: 2
Number of arabidopsis orthologs: 2
Sequences with arabidopsis2 orthologs: 2
Number of arabidopsis2 orthologs: 2"""
    assert expected in result
    test_dir.close()
def test_protein_change_annotation_analysis():
    "We can annotate protein changes"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    matrix = os.path.join(TEST_DATA_DIR, "At.smat")
    configuration = {
        "Snvs": {"min_quality": 20},
        "Sam_processing": {"add_default_qualities": True},
        "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
        "General_settings": {"threads": THREADS},
    }
    settings_path = create_project(directory=test_dir.name, name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)

    # setup the original reads
    reads_dir = join(project_dir, "reads")
    clean_reads_dir = join(reads_dir, "cleaned")
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)

    solexa = "@seq1\n"
    solexa += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq2\n"
    solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq14\n"
    solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq15\n"
    solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq12\n"
    solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq13\n"
    solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa += "@seq16\n"
    solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa += "+\n"
    solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

    solexa2 = "@seq18\n"
    solexa2 += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq19\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq20\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq21\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq22\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq23\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq24\n"
    solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
    solexa2 += "@seq25\n"
    solexa2 += "ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n"
    solexa2 += "+\n"
    solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

    sanger = ">seq3\n"
    sanger += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
    sanger += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
    sanger += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
    sanger += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
    sanger += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
    sanger += ">seq4\n"
    sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
    sanger += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
    sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
    sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
    sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC\n"
    sanger += ">seq5\n"
    sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
    sanger += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
    sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
    sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
    sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"

    sanger2 = ">seq6\n"
    sanger2 += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
    sanger2 += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
    sanger2 += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
    sanger2 += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
    sanger2 += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
    sanger2 += ">seq7\n"
    sanger2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
    sanger2 += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
    sanger2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
    sanger2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
    sanger2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC\n"
    sanger2 += ">seq8\n"
    sanger2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
    sanger2 += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
    sanger2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
    sanger2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
    sanger2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"

    fpath_sanger = join(clean_reads_dir, "lb_hola1.pl_sanger.sm_hola.fasta")
    fpath_solexa = join(clean_reads_dir,
                        "lb_hola2.pl_illumina.sm_hola.sfastq")
    open(fpath_sanger, "w").write(sanger)
    open(fpath_solexa, "w").write(solexa)
    fpath_sanger2 = join(clean_reads_dir, "lb_adios.pl_sanger.fasta")
    fpath_solexa2 = join(clean_reads_dir, "lb_adios.pl_illumina.sfastq")
    open(fpath_sanger2, "w").write(sanger2)
    open(fpath_solexa2, "w").write(solexa2)

    # the reference
    reference_dir = join(project_dir, "mapping/reference")
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, "reference.fasta")
    out = open(reference_fpath, "w")
    for line in open(join(TEST_DATA_DIR, "blast/arabidopsis_genes")):
        out.write(line)
    out.close()

    do_analysis(project_settings=settings_path, kind="mapping", silent=True)
    mapping_dir = join(project_dir, "mapping")
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, "bams", "by_readgroup",
                       "lb_hola2.pl_illumina.sm_hola.bam"))
    result_dir = join(mapping_dir, "bams")
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, "by_readgroup")
    assert exists(result_dir_by_lib)

    do_analysis(project_settings=settings_path, kind="merge_bams",
                silent=True)
    assert exists(join(result_dir, "merged.0.bam"))
    assert exists(join(result_dir, "merged.0.bam.bai"))

    # we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind="realign_bam",
                silent=True)
    assert exists(join(result_dir, "merged.1.bam"))

    annot_input_dir = join(project_dir, "annotations", "input")
    os.makedirs(annot_input_dir)
    os.symlink(reference_fpath, join(annot_input_dir, "reference.fasta"))
    do_analysis(project_settings=settings_path, kind="annotate_snvs",
                silent=True)
    do_analysis(project_settings=settings_path, kind="annotate_orfs",
                silent=True)
    do_analysis(project_settings=settings_path, kind="annotate_prot_change",
                silent=True)
    result_file = join(project_dir, "annotations", "db",
                       "reference.2.pickle")
    seqs = list(seqs_in_file(open(result_file)))
    snv = seqs[2].features[0]
    assert snv.qualifiers["protein_change"]["kind"] == "substitution"
    assert snv.qualifiers["protein_change"]["location"] == "codon_1"
    test_dir.close()
def test_mapping_analysis():
    'We can map the reads'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    bed_fhand = NamedTemporaryFile(suffix='.bed')
    bed_fhand.write('AT1G14930.1\t200\t400\nAT1G55265.1\t100\t300\n')
    bed_fhand.flush()
    blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    snv_filters = {'filter1': {'name': 'uniq_contiguous', 'use': True,
                               'genomic_db': blastdb_seq,
                               'genomic_seqs_fpath': blastdb_seq},
                   'filter7': {'name': 'by_kind', 'use': True,
                               'kind': 'SNP'},
                   'filter12': {'name': 'ref_not_in_list', 'use': True,
                                'list_path': os.path.join(TEST_DATA_DIR,
                                                          'cos_list')},
                   'filter10': {'unique_name': 'variable_in_sm',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['hola1']},
                   'filter11': {'unique_name': 'variable_in_adios',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['adios']},
                   'filter13': {'unique_name': 'variable_in_caracola',
                                'name': 'is_variable', 'use': True,
                                'group_kind': 'libraries',
                                'groups': ['hola2']},
                   'filter14': {'name': 'in_segment_bed', 'use': True,
                                'bed_fpath': bed_fhand.name,
                                'edge_avoidance': 10}}
    configuration = {'Snvs': {'min_quality': 20},
                     'Sam_processing': {'add_default_qualities': True},
                     'snv_filters': snv_filters,
                     'General_settings': {'threads': THREADS},
                     'Mappers': {'keep_unmapped_reads_in_bam': False}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)

    # setup the original reads
    reads_dir = join(project_dir, 'reads')
    clean_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)

    solexa = '@seq1\n'
    solexa += 'TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq2\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq14\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq15\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq12\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq13\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq16\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq17\n'
    solexa += 'ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'

    sanger = '>seq3\n'
    sanger += 'GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA'
    sanger += 'GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA'
    sanger += 'TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA'
    sanger += 'TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA'
    sanger += 'AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n'
    sanger += '>seq4\n'
    sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
    sanger += 'CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
    sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
    sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
    sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC\n'
    sanger += '>seq5\n'
    sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
    sanger += 'CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
    sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
    sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
    sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'

    fpath_sanger = join(clean_reads_dir, 'lb_hola1.pl_sanger.sm_hola.fasta')
    fpath_solexa = join(clean_reads_dir,
                        'lb_hola2.pl_illumina.sm_hola.sfastq')
    open(fpath_sanger, 'w').write(sanger)
    open(fpath_solexa, 'w').write(solexa)
    fpath_sanger2 = join(clean_reads_dir, 'lb_adios.pl_sanger.fasta')
    fpath_solexa2 = join(clean_reads_dir, 'lb_adios.pl_illumina.sfastq')
    open(fpath_sanger2, 'w').write(sanger)
    open(fpath_solexa2, 'w').write(solexa)

    # the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
        out.write(line)
    out.close()

    do_analysis(project_settings=settings_path, kind='mapping', silent=True)
    mapping_dir = join(project_dir, 'mapping')
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                       'lb_hola2.pl_illumina.sm_hola.bam'))
    result_dir = join(mapping_dir, 'bams')
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, 'by_readgroup')
    assert exists(result_dir_by_lib)
    unmapped_fpath = join(mapping_dir, 'unmapped_reads.gz')
    assert exists(unmapped_fpath)
    unmappeds = GzipFile(unmapped_fpath).read()
    assert 'seq17' in unmappeds

    do_analysis(project_settings=settings_path, kind='merge_bams',
                silent=True)
    assert exists(join(result_dir, 'merged.0.bam'))
    assert exists(join(result_dir, 'merged.0.bam.bai'))

    # we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind='realign_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.1.bam'))

    # we calculate BAQ
    do_analysis(project_settings=settings_path, kind='calmd_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.2.bam'))
    assert exists(join(result_dir, 'merged.2.bam.bai'))

    do_analysis(project_settings=settings_path, kind='mapping_stats',
                silent=True)
    stats_fname = join(mapping_dir,
                       BACKBONE_DIRECTORIES['mapping_stats'][1],
                       BACKBONE_BASENAMES['statistics_file'])
    result = open(stats_fname).read()
    assert 'Statistics for Coverage for platform sanger' in result

    annot_input_dir = join(project_dir, 'annotations', 'input')
    os.makedirs(annot_input_dir)
    os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
    do_analysis(project_settings=settings_path, kind='annotate_snvs',
                silent=True)
    json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                      'reference.0.pickle')
    assert 'snv' in open(json_fpath).read()

    do_analysis(project_settings=settings_path, kind='filter_snvs',
                silent=True)
    json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                      'reference.1.pickle')
    result = open(json_fpath).read()
    assert 'snv' in result
    assert 'adios_sanger' in result

    do_analysis(project_settings=settings_path, kind='write_annotations',
                silent=True)
    vcf_fpath = join(project_dir, 'annotations', 'features', 'reference.vcf')
    vcf = open(vcf_fpath).read()
    assert 'VLB1' in vcf
    assert 'VLB2' in vcf
    assert 'VLB3' in vcf
    assert 'AT1G14930.1' in vcf
    assert 'IS10' in vcf

    do_analysis(project_settings=settings_path, kind='mapping_stats',
                silent=True)
    stats_dir = join(project_dir, 'mapping', 'bams', 'stats')
    assert exists(join(stats_dir, 'backbone.coverage_illumina.dat'))
    stats_fpath = join(stats_dir, BACKBONE_BASENAMES['statistics_file'])
    result = open(stats_fpath).read()
    expected = '''average: 0.4542
variance: 1.3050
total sequence length: 3941'''
    assert expected in result
    test_dir.close()