def main():
    '''The main entry point.

    Runs every requested analysis in order, logging the wall-clock time of
    each one.  On any failure the traceback is logged and dumped to a file
    under ERROR_DIR so users can attach it to a bug report; the exception is
    then re-raised.
    '''
    actions, settings_fpath = set_parameters()
    logger = logging.getLogger('franklin')
    try:
        for action in actions:
            start_time = datetime.datetime.today()
            do_analysis(project_settings=settings_fpath, kind=action)
            time_elapsed = datetime.datetime.today() - start_time
            logger.info('Time elapsed %s' % str(time_elapsed))
    except Exception as error:
        if not os.path.exists(ERROR_DIR):
            os.mkdir(ERROR_DIR)
        error_fpath = tempfile.mkstemp(suffix='.txt', dir=ERROR_DIR)[1]
        msg = 'An unexpected error happened.\n'
        msg += 'The clean_reads developers would appreciate your feedback\n'
        msg += 'Please send them the error log and take a look at it: '
        msg += error_fpath + '\n\n'
        logger.exception(str(error) + '\n' + msg)
        # Fix: cgitb.Hook accepts no 'logfpath' argument (its parameters are
        # display, logdir, context, file and format), so the previous call
        # raised TypeError inside the error handler.  Write the formatted
        # traceback to the mkstemp file through the 'file' parameter.
        hook = cgitb.Hook(display=0, format='text',
                          file=open(error_fpath, 'w'))
        hook.handle()
        raise
def test_snv_annot_without_rg():
    'It tests that we can do snv calling with a bam without rg info'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    configuration = {'Snvs': {'default_bam_platform': 'sanger'},
                     'General_settings': {'threads': THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)
    #the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
        out.write(line)
    # Fix: close the handle so the reference is flushed to disk before the
    # analysis (and the symlink below) reads it.
    out.close()
    bams_dir = join(project_dir, 'mapping', 'bams')
    os.makedirs(bams_dir)
    bam_fpath = join(bams_dir, 'merged.0.bam')
    shutil.copy(join(TEST_DATA_DIR, 'merged.0.bam'), bam_fpath)
    create_bam_index(bam_fpath)
    annot_input_dir = join(project_dir, 'annotations', 'input')
    os.makedirs(annot_input_dir)
    os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
    do_analysis(project_settings=settings_path, kind='annotate_snvs',
                silent=True)
def run_analysis(analyses, settings_path): 'It runs the analyses and removes the files if fails' for analysis in analyses: print 'Running analysis: %s' % analysis do_analysis(project_settings=settings_path, kind=analysis, silent=True) print "Test OK"
def test_read_stats_analysis2():
    # another read stats with real data
    fname = 'lb_sflp2.pl_sanger.sm_t111.sfastq'
    cleaned_src = os.path.join(TEST_DATA_DIR, 'clean_stats', 'cleaned',
                               fname)
    raw_src = os.path.join(TEST_DATA_DIR, 'clean_stats', 'raw', fname)
    tmp_dir = NamedTemporaryDir()
    proj_name = 'backbone'
    proj_dir = join(tmp_dir.name, proj_name)
    settings_fpath = create_project(directory=tmp_dir.name, name=proj_name)
    #setup the original reads
    reads_dir = join(proj_dir, 'reads')
    raw_dir = join(reads_dir, 'raw')
    cleaned_dir = join(reads_dir, 'cleaned')
    for dir_ in (reads_dir, raw_dir, cleaned_dir):
        os.mkdir(dir_)
    shutil.copy(cleaned_src, cleaned_dir)
    shutil.copy(raw_src, raw_dir)
    do_analysis(project_settings=settings_fpath, kind='read_stats',
                silent=True)
def test_remove_output_on_error():
    'We remove files when we have an error on cleaning'
    tmp_dir = NamedTemporaryDir()
    proj_name = 'backbone'
    proj_dir = join(tmp_dir.name, proj_name)
    # a bogus adaptors path — the cleaning run is expected to fail
    configuration = {'Cleaning': {'adaptors_file_454': 'AKHSGDASD'},
                     'General_settings': {'threads': THREADS}}
    settings_fpath = create_project(directory=tmp_dir.name, name=proj_name,
                                    configuration=configuration)
    #setup the original reads
    reads_dir = join(proj_dir, 'reads')
    raw_dir = join(reads_dir, 'raw')
    os.mkdir(reads_dir)
    os.mkdir(raw_dir)
    #fake solid reads
    try:
        do_analysis(project_settings=settings_fpath, kind='clean_reads',
                    silent=True)
    except KeyError:
        pass
    # after the failure, no partial cleaned output may be left behind
    out_fpath = join(reads_dir, 'cleaned', 'pl_454.lb_b.sfastq')
    assert not exists(out_fpath)
def test_cleaning_analysis_lucy():
    'We can clean the reads'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    # Fix: dropped the unused 'univec' blast-db local; this test runs with
    # the vector database disabled.
    configuration = {'Cleaning':{'vector_database':None}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)
    #setup the original reads
    reads_dir = join(project_dir, 'reads')
    original_reads_dir = join(reads_dir, 'raw')
    os.mkdir(reads_dir)
    os.mkdir(original_reads_dir)
    os.makedirs(join(project_dir, 'config_data', 'lucy'))
    lucy_settings = join(project_dir, 'config_data', 'lucy', 'lucy.conf')
    luc_c = open(lucy_settings, 'w')
    luc_c.write(repr({'ps':{'vector_file':'tmp' , 'splice_file':'tmp'}}))
    # Fix: close (not just flush) the handle so the settings file is
    # released before the analysis opens it.
    luc_c.close()
    #print original_reads_dir
    fpath_noqual = join(original_reads_dir, 'pl_sanger.lb_ps.fasta')
    fpath_qual = join(original_reads_dir, 'pl_sanger.lb_andreas.sfastq')
    fpath_454 = join(original_reads_dir, 'pl_454.lb_ps.sfastq')
    fpath_ill = join(original_reads_dir, 'pl_illumina.lb_psi.sfastq')
    open(fpath_noqual, 'w').write(READS_NOQUAL)
    open(fpath_qual, 'w').write(SANGER_QUAL)
    open(fpath_454, 'w').write(READS_454)
    open(fpath_ill, 'w').write(READS_ILL)
    do_analysis(project_settings=settings_path, kind='clean_reads',
                silent=True)
    cleaned_dir = join(project_dir, 'reads', 'cleaned')
    assert exists(cleaned_dir)
    cleaned_qual = join(cleaned_dir, os.path.basename(fpath_qual))
    assert 'SEX' in open(cleaned_qual).read()
    cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
    assert exists(cleaned_454)
    cleaned_noqual = join(cleaned_dir, os.path.basename(fpath_noqual))
    clean_seqs = open(cleaned_noqual).read()
    assert clean_seqs.startswith('>FM195262.1\nGCATTCTCG')
def test_cdna_intron_annoation_analysis():
    "We can annotate introns"
    tmp_dir = NamedTemporaryDir()
    proj_name = "backbone"
    blast_db_path = os.path.join(TEST_DATA_DIR, "blast")
    genomic_db = os.path.join(blast_db_path, "tomato_genome2+")
    config = {
        "Annotation": {"Cdna_intron_annotation": {
            "genomic_db": genomic_db,
            "genomic_seq_file": genomic_db}},
        "General_settings": {"threads": THREADS},
    }
    settings_fpath = create_project(directory=tmp_dir.name, name=proj_name,
                                    configuration=config)
    proj_dir = join(tmp_dir.name, proj_name)
    seq = "".join((
        "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC",
        "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG",
        "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA",
        "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC",
        "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG",
        "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG",
        "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA",
        "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG",
        "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT",
        "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG",
        "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT",
    ))
    annot_in_dir = join(proj_dir, "annotations", "input")
    os.makedirs(annot_in_dir)
    # write the single query sequence that will be annotated
    with open(os.path.join(annot_in_dir, "seqs.fasta"), "w") as fhand:
        fhand.write(">seq\n%s\n" % seq)
    do_analysis(project_settings=settings_fpath, kind="annotate_introns",
                silent=True)
    pickle_fpath = join(proj_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                        "seqs.0.pickle")
    assert "intron" in open(pickle_fpath).read()
    do_analysis(project_settings=settings_fpath, kind="annotation_stats",
                silent=True)
    stats_fpath = join(proj_dir, "annotations", "features", "stats",
                       "seqs.txt")
    stats = open(stats_fpath).read()
    expected = """Sequences with intron: 1
Number of introns: 3"""
    assert expected in stats
    tmp_dir.close()
def test_description_annotation_analysis():
    "We can annotate with description"
    tmp_dir = NamedTemporaryDir()
    proj_name = "backbone"
    arab_blastdb = join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
    config = {
        "blast": {"arabidopsis": {"path": arab_blastdb,
                                  "species": "arabidopsis"}},
        "Annotation": {"description_annotation": {
            "description_databases": ["arabidopsis"]}},
        "General_settings": {"threads": THREADS},
    }
    settings_fpath = create_project(directory=tmp_dir.name, name=proj_name,
                                    configuration=config)
    proj_dir = join(tmp_dir.name, proj_name)
    # some melon file to annotate
    in_dir = join(proj_dir, BACKBONE_DIRECTORIES["annotation_input"])
    os.makedirs(in_dir)
    nucl = "".join((
        "AGGTGTCACCGTTCACGAGGGCGACTGGGACTCCCACGGGGCCATCAAGTCCTGGAACTACA",
        "CATGCGGTCCTCTATCTCATTCTCTATTTGTATGAATATGTGTTTATTACTAGCTAGGGTTT",
        "CTATTAATGAAAGGTTCATGTAAATATATGAAGATGGGAAGCAAGAGGTGTTCAAGGAGAAG",
        "AGGGAGTTAGACGACCAGAAGAT",
    ))
    seq1 = SeqWithQuality(Seq(nucl), id="CUTC021854")
    seq2 = SeqWithQuality(Seq("Atagtagcatcagatgagcatcgacttctagctagctagct"),
                          id="CUTC021853")
    write_seqs_in_file([seq1, seq2],
                       open(join(in_dir, "melon.st_nucl.pl_454.fasta"),
                            "a"))
    do_analysis(project_settings=settings_fpath,
                kind="annotate_descriptions", silent=True)
    repr_fpath = join(proj_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                      "melon.st_nucl.pl_454.0.pickle")
    annotated = open(repr_fpath).read()
    # print result
    assert "yet another one" in annotated
    do_analysis(project_settings=settings_fpath, kind="annotation_stats",
                silent=True)
    stats_fpath = join(proj_dir, "annotations", "features", "stats",
                       "melon.st_nucl.pl_454.txt")
    stats = open(stats_fpath).read()
    expected = """Annotation statistics
---------------------
Number of sequences: 2
Sequences with description: 1"""
    assert expected in stats
    tmp_dir.close()
def test_go_annotation_analysis(): "We can annotate gos" test_dir = NamedTemporaryDir() project_name = "backbone" nr_path = os.path.join(TEST_DATA_DIR, "blast", "arabidopsis_genes+") b2g = os.path.join(TEST_DATA_DIR, "b2gPipe.properties") b2gpipe_bin = os.path.join(guess_jar_dir("blast2go.jar"), "blast2go.jar") if not b2gpipe_bin: print "Do not run b2gppe tests, blast2go jar file not found " return config = { "blast": {"nr": {"path": nr_path, "species": "nr"}}, "Annotation": { "go_annotation": { "blast_database": "nr", "create_dat_file": True, "java_memory": 2048, "b2g_properties_file": b2g, "blast2go_path": b2gpipe_bin, } }, "General_settings": {"threads": THREADS}, } settings_path = create_project(directory=test_dir.name, name=project_name, configuration=config) project_dir = join(test_dir.name, project_name) seq = "CTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT" seq += "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC" seq += "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA" seq += "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA" seq += "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT" seq += "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGG" annot_input_dir = join(project_dir, "annotations", "input") os.makedirs(annot_input_dir) # create some seqs to annotate fasta = ">seq1\n%s\n" % seq fhand = open(os.path.join(annot_input_dir, "seqs.st_nucl.pl_454.fasta"), "w") fhand.write(fasta) fhand.close() bdir = join(project_dir, "annotations", "blast", "seqs.st_nucl.pl_454", "arabidopsis_genes+") os.makedirs(bdir) shutil.copy(join(TEST_DATA_DIR, "blastResult.xml"), join(bdir, "blast.tblastx.xml")) do_analysis(project_settings=settings_path, kind="annotate_gos", silent=True) repr_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"], "seqs.st_nucl.pl_454.0.pickle") result = open(repr_fpath).read() assert "GO:0043094" in result assert os.path.exists(os.path.join(project_dir, "annotations", 
"features", "seqs.st_nucl.pl_454.b2g.dat")) assert os.path.exists(os.path.join(project_dir, "annotations", "features", "seqs.st_nucl.pl_454.b2g.annot")) do_analysis(project_settings=settings_path, kind="annotate_gos", silent=True) do_analysis(project_settings=settings_path, kind="annotation_stats", silent=True) stats_fpath = join(project_dir, "annotations", "features", "stats", "seqs.st_nucl.pl_454.txt") result = open(stats_fpath).read() expected = """Sequences with GOs: 1 Number of GOs: 10""" assert expected in result
def test_orf_annotation_analysis():
    "We can annotate orfs"
    tmp_dir = NamedTemporaryDir()
    proj_name = "backbone"
    matrix = os.path.join(TEST_DATA_DIR, "At.smat")
    config = {
        "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
        "General_settings": {"threads": THREADS},
    }
    settings_fpath = create_project(directory=tmp_dir.name, name=proj_name,
                                    configuration=config)
    proj_dir = join(tmp_dir.name, proj_name)
    nucl = "".join((
        "CTACTTACTAGCTTTAGTAAATCCTTCTAACCCTCGGTAAAAAAAAAAAAGAGGCATCAAATG",
        "GCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT",
        "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC",
        "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA",
        "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA",
        "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT",
        "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTACTGGACCATG",
        "TGGAAGCTGCCCATGTTTGGCTGCACCGAT",
    ))
    annot_in_dir = join(proj_dir, "annotations", "input")
    os.makedirs(annot_in_dir)
    # create some seqs to annotate
    with open(os.path.join(annot_in_dir, "seqs.fasta"), "w") as fhand:
        fhand.write(">seq\n%s\n" % nucl)
    do_analysis(project_settings=settings_fpath, kind="annotate_orfs",
                silent=True)
    repr_fpath = join(proj_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                      "seqs.0.pickle")
    assert "orf" in open(repr_fpath).read()
    do_analysis(project_settings=settings_fpath, kind="write_annotations",
                silent=True)
    seq_fpath = join(proj_dir, "annotations", "features",
                     "seqs.orf_seq.fasta")
    pep_fpath = join(proj_dir, "annotations", "features",
                     "seqs.orf_pep.fasta")
    assert "ATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCT" in open(seq_fpath).read()
    assert "QASMGAPFTGLKSAAAFPVTRXTNDITTLVSNG" in open(pep_fpath).read()
    do_analysis(project_settings=settings_fpath, kind="annotation_stats",
                silent=True)
    stats_fpath = join(proj_dir, "annotations", "features", "stats",
                       "seqs.txt")
    stats = open(stats_fpath).read()
    expected = """Sequences with ORF: 1
Number of ORFs: 1"""
    assert expected in stats
    tmp_dir.close()
def test_microsatellite_annoation_analysis():
    # Docstring fixed: the old one said "introns", but this test runs the
    # microsatellite annotation.
    "We can annotate microsatellites"
    test_dir = NamedTemporaryDir()
    project_name = "backbone"
    settings_path = create_project(
        directory=test_dir.name, name=project_name,
        configuration={"General_settings": {"threads": THREADS}}
    )
    project_dir = join(test_dir.name, project_name)
    # a gene fragment with a long GA repeat spliced in, so the
    # microsatellite finder has something to detect
    seq = "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
    seq += "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
    seq += "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
    seq += "GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
    seq += "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
    seq += "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
    seq += "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
    seq += "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
    seq += "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
    seq += "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
    seq += "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
    seq += "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
    annot_input_dir = join(project_dir, "annotations", "input")
    os.makedirs(annot_input_dir)
    # create some seqs to annotate
    fasta = ">seq\n%s\n" % seq
    fhand = open(os.path.join(annot_input_dir, "seqs.fasta"), "w")
    fhand.write(fasta)
    fhand.close()
    do_analysis(project_settings=settings_path,
                kind="annotate_microsatellites", silent=True)
    # the annotation database is a pickle per input file
    pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES["annotation_dbs"],
                        "seqs.0.pickle")
    result = open(pickle_fpath).read()
    assert "microsatellite" in result
    do_analysis(project_settings=settings_path, kind="write_annotations",
                silent=True)
    # the ssr output table should exist and have a header
    ssr_fpath = join(project_dir, "annotations", "features", "seqs.ssr")
    assert os.path.exists(ssr_fpath)
    assert "Seqname" in open(ssr_fpath).read()
    do_analysis(project_settings=settings_path, kind="annotation_stats",
                silent=True)
    stats_fpath = join(project_dir, "annotations", "features", "stats",
                      "seqs.txt")
    result = open(stats_fpath).read()
    expected = "Sequences with microsatellites: 1"
    assert expected in result
    test_dir.close()
def test_mapping_color():
    'It test the mapping of the mapper with color space'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    snv_filters = {'filter1':{'name':'uniq_contiguous', 'use':True,
                              'genomic_db':blastdb_seq,
                              'genomic_seqs_fpath':blastdb_seq},
                   'filter7':{'name':'by_kind', 'use':True, 'kind':'SNP'},
                   'filter12':{'name':'ref_not_in_list', 'use':True,
                               'list_path':os.path.join(TEST_DATA_DIR,
                                                        'cos_list')},
                   'filter10':{'unique_name': 'variable_in_sm',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['hola']},
                   'filter11':{'unique_name': 'variable_in_adios',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['adios']},
                   'filter13':{'unique_name': 'variable_in_caracola',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['caracola']},
                   }
    configuration = {'Snvs':{'min_quality':20},
                     'Sam_processing':{'add_default_qualities':True},
                     'snv_filters':snv_filters,
                     'General_settings':{'threads':THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)
    #setup the original reads
    reads_dir = join(project_dir, 'reads')
    clean_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)
    shutil.copy(os.path.join(TEST_DATA_DIR, 'solid.fastq'),
                os.path.join(clean_reads_dir,
                             'pl_solid.lb_hola.sm_hola.sfastq'))
    #the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'samtools_color/reference')):
        out.write(line)
    # Fix: close the handle so the reference is flushed to disk before the
    # mapping reads it.
    out.close()
    do_analysis(project_settings=settings_path, kind='mapping', silent=True)
    mapping_dir = join(project_dir, 'mapping')
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                       'pl_solid.lb_hola.sm_hola.bam'))
    result_dir = join(mapping_dir, 'bams')
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, 'by_readgroup')
    assert exists(result_dir_by_lib)
    do_analysis(project_settings=settings_path, kind='merge_bams',
                silent=True)
    assert exists(join(result_dir, 'merged.0.bam'))
    assert exists(join(result_dir, 'merged.0.bam.bai'))
    #we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind='realign_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.1.bam'))
    test_dir.close()
def test_read_stats_analysis():
    'It test the read statistics'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    project_dir = join(test_dir.name, project_name)
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name)
    #setup the original reads
    reads_dir = join(project_dir, 'reads')
    original_reads_dir = join(reads_dir, 'raw')
    os.mkdir(reads_dir)
    os.mkdir(original_reads_dir)
    fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
    fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
    open(fpath_454, 'w').write(READS_454)
    open(fpath_ill, 'w').write(READS_ILL)
    #the cleaned reads
    # note: the illumina cleaned file uses a library (lb_no_raw) with no
    # matching raw file
    cleaned_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(cleaned_reads_dir)
    fpath_454 = join(cleaned_reads_dir, 'pl_454.lb_a.sfastq')
    fpath_ill = join(cleaned_reads_dir, 'pl_illumina.lb_no_raw.sfastq')
    open(fpath_454, 'w').write(READS_454)
    open(fpath_ill, 'w').write(READS_ILL)
    do_analysis(project_settings=settings_path, kind='read_stats',
                silent=True)
    clean_stats_dir = join(cleaned_reads_dir, 'stats')
    clean_fnames = os.listdir(clean_stats_dir)
    # every statistic should produce both a data file and an svg plot
    expected_fnames = ['pl_illumina.lb_no_raw.qual', 'pl_454.lb_a.qual',
                       'pl_illumina.lb_no_raw.length', 'pl_454.lb_a.length']
    for fname in expected_fnames:
        assert fname + '.dat' in clean_fnames
        assert fname + '.svg' in clean_fnames
    statistics_fpath = join(clean_stats_dir,
                            BACKBONE_BASENAMES['statistics_file'])
    content = open(statistics_fpath).read()
    # exact-match check of the whole statistics report
    assert content == '''statistics for pl_454.lb_a.sfastq
---------------------------------
Num sequences: 4
Total sequence length: 759
Sequence length minimum: 106
Sequence length maximum: 295
Sequence length average: 189.75
Sequence length variance: 4972.69
Sequence qualities minimum: 20
Sequence qualities maximum: 40
Sequence qualities average: 36.99
Sequence qualities variance: 8.19
statistics for pl_illumina.lb_no_raw.sfastq
-------------------------------------------
Num sequences: 6
Total sequence length: 172
Sequence length minimum: 24
Sequence length maximum: 31
Sequence length average: 28.67
Sequence length variance: 10.89
Sequence qualities minimum: 4
Sequence qualities maximum: 34
Sequence qualities average: 29.63
Sequence qualities variance: 47.80
'''
    boxplot_fpath = join(clean_stats_dir,
                         'pl_illumina.lb_no_raw' + '.qual.boxplot.dat')
    exp = 'distrib\tmean\tstd_deviation\t1st_quartile\tmedian\t3rd_qualtile'
    assert exp in open(boxplot_fpath).read()
    freq_nucl_fpath = join(clean_stats_dir, 'pl_454.lb_a.freq_position.svg')
    nucl_freq = open(freq_nucl_fpath).read()
    assert 'style="fill:#0000ff;stroke:#000000;"/>' in nucl_freq
def test_cleaning_analysis():
    'We can clean the reads'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    project_dir = join(test_dir.name, project_name)
    adaptors_dir = join(project_dir, 'config_data', 'adaptors')
    adaptors_path_454 = join(adaptors_dir, '454_adaptors')
    # short adaptor words; '^' anchors the word to the read start
    words = ['^ATGAAC', 'TTGATTTGGT']
    univec = os.path.join(TEST_DATA_DIR, 'blast', 'univec+')
    configuration = {'Cleaning': {'vector_database': univec,
                                  'adaptors_file_454': adaptors_path_454,
                                  'short_adaptors_454': words,
                                  'edge_removal': {'454_left': 3,
                                                   '454_right': 3}},
                     'General_settings': {'threads': THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    #setup the original reads
    reads_dir = join(project_dir, 'reads')
    original_reads_dir = join(reads_dir, 'raw')
    os.mkdir(reads_dir)
    os.mkdir(original_reads_dir)
    os.makedirs(adaptors_dir)
    adap_fhand = open(adaptors_path_454, 'w')
    adap_fhand.write('''>smart_5_cds_primer_1
GGTTCAAGGTTTGAGAAAGGATGGGAAG\n''')
    adap_fhand.close()
    #print original_reads_dir
    fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
    fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
    fpath_solid = join(original_reads_dir, 'pl_solid.lb_prueba.sfastq')
    open(fpath_solid, 'w').write(READS_SOLID)
    open(fpath_454, 'w').write(READS_454)
    open(fpath_ill, 'w').write(READS_ILL)
    do_analysis(project_settings=settings_path, kind='clean_reads',
                silent=True)
    cleaned_dir = join(project_dir, 'reads', 'cleaned')
    assert exists(cleaned_dir)
    cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
    assert exists(cleaned_454)
    seqs = list(seqs_in_file(open(cleaned_454)))
    # It means that the adaptor has been removed
    seq = seqs[0].seq
    assert 'GGTTCAAGGTTTGAGAAAGGATGGGAAG' not in seq
    seq = seqs[2].seq
    # It means that the starting word has been removed
    assert seq.startswith('TTCCAAGATTCTTCCCACAT')
    # solid
    cleaned_solid = join(cleaned_dir, os.path.basename(fpath_solid))
    clean_seqs = open(cleaned_solid).read()
    assert '10_1824_570_F3' not in clean_seqs
    do_analysis(project_settings=settings_path,
                kind='prepare_mira_assembly', silent=True)
    assembly_input = join(project_dir, 'assembly', 'input')
    assert exists(assembly_input)
    mira_in_454 = join(assembly_input, 'backbone_in.454.fasta')
    mira_in_qul = join(assembly_input, 'backbone_in.454.fasta.qual')
    assert exists(mira_in_454)
    assert exists(mira_in_qul)
    do_analysis(project_settings=settings_path, kind='mira_assembly',
                silent=True)
    assembly_dir = join(project_dir, 'assembly')
    # NOTE(review): the sorted() result is discarded — presumably this only
    # checks that listing the assembly dir does not raise; confirm intent.
    sorted(os.listdir(assembly_dir))
    test_dir.close()
def test_mapping_analysis():
    'We can map the reads'
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    bed_fhand = NamedTemporaryFile(suffix='.bed')
    bed_fhand.write('AT1G14930.1\t200\t400\nAT1G55265.1\t100\t300\n')
    bed_fhand.flush()
    blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    snv_filters = {'filter1':{'name':'uniq_contiguous', 'use':True,
                              'genomic_db':blastdb_seq,
                              'genomic_seqs_fpath':blastdb_seq},
                   'filter7':{'name':'by_kind', 'use':True, 'kind':'SNP'},
                   'filter12':{'name':'ref_not_in_list', 'use':True,
                               'list_path':os.path.join(TEST_DATA_DIR,
                                                        'cos_list')},
                   'filter10':{'unique_name': 'variable_in_sm',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['hola1']},
                   'filter11':{'unique_name': 'variable_in_adios',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['adios']},
                   'filter13':{'unique_name': 'variable_in_caracola',
                               'name': 'is_variable', 'use':True,
                               'group_kind':'libraries',
                               'groups':['hola2']},
                   'filter14':{'name': 'in_segment_bed', 'use':True,
                               'bed_fpath':bed_fhand.name,
                               'edge_avoidance':10}}
    configuration = {'Snvs':{'min_quality':20},
                     'Sam_processing':{'add_default_qualities':True},
                     'snv_filters':snv_filters,
                     'General_settings':{'threads':THREADS},
                     'Mappers':{'keep_unmapped_reads_in_bam':False}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)
    #setup the original reads
    reads_dir = join(project_dir, 'reads')
    clean_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)
    solexa = '@seq1\n'
    solexa += 'TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq2\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq14\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq15\n'
    solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq12\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq13\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq16\n'
    solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    solexa += '@seq17\n'
    solexa += 'ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n'
    solexa += '+\n'
    solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
    sanger = '>seq3\n'
    sanger += 'GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA'
    sanger += 'GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA'
    sanger += 'TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA'
    sanger += 'TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA'
    sanger += 'AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n'
    sanger += '>seq4\n'
    sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
    sanger += 'CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
    sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
    sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
    sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'
    sanger += '>seq5\n'
    sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
    sanger += 'CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
    sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
    sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
    sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'
    fpath_sanger = join(clean_reads_dir, 'lb_hola1.pl_sanger.sm_hola.fasta')
    fpath_solexa = join(clean_reads_dir,
                        'lb_hola2.pl_illumina.sm_hola.sfastq')
    open(fpath_sanger, 'w').write(sanger)
    open(fpath_solexa, 'w').write(solexa)
    fpath_sanger2 = join(clean_reads_dir, 'lb_adios.pl_sanger.fasta')
    fpath_solexa2 = join(clean_reads_dir, 'lb_adios.pl_illumina.sfastq')
    open(fpath_sanger2, 'w').write(sanger)
    open(fpath_solexa2, 'w').write(solexa)
    #the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
        out.write(line)
    # Fix: close the handle so the reference is flushed to disk before the
    # mapping (and the symlink below) reads it.
    out.close()
    do_analysis(project_settings=settings_path, kind='mapping', silent=True)
    mapping_dir = join(project_dir, 'mapping')
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                       'lb_hola2.pl_illumina.sm_hola.bam'))
    result_dir = join(mapping_dir, 'bams')
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, 'by_readgroup')
    assert exists(result_dir_by_lib)
    # unmapped reads are dumped to a gzip because
    # keep_unmapped_reads_in_bam is False
    unmapped_fpath = join(mapping_dir, 'unmapped_reads.gz')
    assert exists(unmapped_fpath)
    unmappeds = GzipFile(unmapped_fpath).read()
    assert 'seq17' in unmappeds
    do_analysis(project_settings=settings_path, kind='merge_bams',
                silent=True)
    assert exists(join(result_dir, 'merged.0.bam'))
    assert exists(join(result_dir, 'merged.0.bam.bai'))
    #we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind='realign_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.1.bam'))
    #we calculate BAQ
    do_analysis(project_settings=settings_path, kind='calmd_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.2.bam'))
    assert exists(join(result_dir, 'merged.2.bam.bai'))
    do_analysis(project_settings=settings_path, kind='mapping_stats',
                silent=True)
    stats_fname = join(mapping_dir,
                       BACKBONE_DIRECTORIES['mapping_stats'][1],
                       BACKBONE_BASENAMES['statistics_file'])
    result = open(stats_fname).read()
    assert 'Statistics for Coverage for platform sanger' in result
    annot_input_dir = join(project_dir, 'annotations', 'input')
    os.makedirs(annot_input_dir)
    os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
    do_analysis(project_settings=settings_path, kind='annotate_snvs',
                silent=True)
    json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                      'reference.0.pickle')
    assert 'snv' in open(json_fpath).read()
    do_analysis(project_settings=settings_path, kind='filter_snvs',
                silent=True)
    json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                      'reference.1.pickle')
    result = open(json_fpath).read()
    #print result
    assert 'snv' in result
    assert 'adios_sanger' in result
    do_analysis(project_settings=settings_path, kind='write_annotations',
                silent=True)
    vcf_fpath = join(project_dir, 'annotations', 'features',
                     'reference.vcf')
    vcf = open(vcf_fpath).read()
    assert 'VLB1' in vcf
    assert 'VLB2' in vcf
    assert 'VLB3' in vcf
    assert 'AT1G14930.1' in vcf
    assert 'IS10' in vcf
    do_analysis(project_settings=settings_path, kind='mapping_stats',
                silent=True)
    stats_dir = join(project_dir, 'mapping', 'bams', 'stats')
    assert exists(join(stats_dir, 'backbone.coverage_illumina.dat'))
    stats_fpath = join(stats_dir, BACKBONE_BASENAMES['statistics_file'])
    result = open(stats_fpath).read()
    expected = '''average: 0.4542
variance: 1.3050
total sequence length: 3941'''
    assert expected in result
    test_dir.close()
def test_ortholog_annotation_analysis():
    'It checks the ortholog annotation against two blast databases'
    temp_dir = NamedTemporaryDir()
    proj_name = 'backbone'
    blast_conf = {'arabidopsis': {'path': '/path/to/tair',
                                  'species': 'arabidopsis',
                                  'kind': 'nucl'},
                  'arabidopsis2': {'path': '/path/to/tair2',
                                   'species': 'arabidopsis2',
                                   'kind': 'nucl'}}
    annot_conf = {'ortholog_annotation':
                      {'ortholog_databases': ['arabidopsis', 'arabidopsis2']}}
    settings_path = create_project(directory=temp_dir.name,
                                   name=proj_name,
                                   configuration={'blast': blast_conf,
                                                  'Annotation': annot_conf,
                                                  'General_settings':
                                                      {'threads': THREADS}})
    project_dir = join(temp_dir.name, proj_name)

    # Fake the pre-computed blast results, in both directions, for each of
    # the two ortholog databases
    melon = 'melon.st_nucl.pl_454'
    blast_fname = BACKBONE_BASENAMES['blast_basename'] + '.tblastx.xml'
    copies = (('melon_tair.xml', (melon, 'tair')),
              ('melon_tair.xml', (melon, 'tair2')),
              ('tair_melon.xml', ('tair', melon)),
              ('tair_melon.xml', ('tair2', melon)))
    for fname, subdirs in copies:
        blast_dir = join(project_dir, 'annotations', 'blast', *subdirs)
        os.makedirs(blast_dir)
        shutil.copy(join(TEST_DATA_DIR, fname), join(blast_dir, blast_fname))

    # a couple of melon sequences to annotate
    input_dir = join(project_dir, BACKBONE_DIRECTORIES['annotation_input'])
    os.makedirs(input_dir)
    melon_seqs = [SeqWithQuality(Seq('A'), id='melon1'),
                  SeqWithQuality(Seq('A'), id='melon2')]
    write_seqs_in_file(melon_seqs,
                       open(join(input_dir, melon + '.fasta'), 'a'))

    do_analysis(project_settings=settings_path, kind='annotate_orthologs',
                silent=True)
    pickle_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                        melon + '.0.pickle')
    pickle_content = open(pickle_fpath).read()
    assert 'arabidopsis-orthologs' in pickle_content
    assert 'arabidopsis2-orthologs' in pickle_content

    do_analysis(project_settings=settings_path, kind='write_annotations',
                silent=True)
    ortholog_fpath = join(project_dir, 'annotations', 'features',
                          melon + '.orthologs')
    assert os.path.exists(ortholog_fpath)
    assert 'tair1' in open(ortholog_fpath).read()
    # only orthologs were annotated, so no orf file should have been written
    orf_fpath = join(project_dir, 'annotations', 'features', melon + '.orf')
    assert not os.path.exists(orf_fpath)

    do_analysis(project_settings=settings_path, kind='annotation_stats',
                silent=True)
    stats_fpath = join(project_dir, 'annotations', 'features', 'stats',
                       melon + '.txt')
    expected = """Orthologs
_________
Sequences with arabidopsis orthologs: 2
Number of arabidopsis orthologs: 2
Sequences with arabidopsis2 orthologs: 2
Number of arabidopsis2 orthologs: 2"""
    assert expected in open(stats_fpath).read()
    temp_dir.close()
def test_protein_change_annotation_analysis():
    '''It checks that SNVs get annotated with the protein change they cause.

    It maps sanger and illumina reads against the arabidopsis reference,
    calls SNVs, annotates orfs and finally the protein changes, and checks
    that the third reference sequence carries a codon_1 substitution.
    '''
    test_dir = NamedTemporaryDir()
    project_name = 'backbone'
    matrix = os.path.join(TEST_DATA_DIR, 'At.smat')
    configuration = {'Snvs': {'min_quality': 20},
                     'Sam_processing': {'add_default_qualities': True},
                     'Annotation': {'orf_annotation':
                                        {'estscan_matrix': matrix}},
                     'General_settings': {'threads': THREADS}}
    settings_path = create_project(directory=test_dir.name,
                                   name=project_name,
                                   configuration=configuration)
    project_dir = join(test_dir.name, project_name)

    # setup the original reads
    reads_dir = join(project_dir, 'reads')
    clean_reads_dir = join(reads_dir, 'cleaned')
    os.mkdir(reads_dir)
    os.mkdir(clean_reads_dir)

    def fastq(name, seq, qual):
        'It returns a single read in fastq format'
        return '@%s\n%s\n+\n%s\n' % (name, seq, qual)

    # The GGG vs GGA difference in the read bodies is the substitution SNV
    # the final asserts look for
    qual_a = 'IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD'
    qual_b = 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD'
    read_other = 'TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG'
    read_ggg = 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA'
    read_gga = 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA'

    # BUGFIX: the original built this string with repeated "solexa = ..."
    # assignments (not "+="), so only the last read (seq16) survived.  The
    # parallel solexa2 block below always used "+=", which confirms the
    # intent was to accumulate every read.
    solexa = fastq('seq1', read_other, qual_a)
    solexa += fastq('seq2', read_ggg, qual_b)
    solexa += fastq('seq14', read_ggg, qual_b)
    solexa += fastq('seq15', read_ggg, qual_b)
    solexa += fastq('seq12', read_gga, qual_b)
    solexa += fastq('seq13', read_gga, qual_b)
    solexa += fastq('seq16', read_gga, qual_b)

    solexa2 = fastq('seq18', read_other, qual_a)
    for name in ('seq19', 'seq20', 'seq21'):
        solexa2 += fastq(name, read_ggg, qual_b)
    for name in ('seq22', 'seq23', 'seq24'):
        solexa2 += fastq(name, read_gga, qual_b)
    solexa2 += fastq('seq25',
                     'ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC',
                     qual_b)

    # Sanger reads.  seq3/seq6, seq4/seq7 and seq5/seq8 share the same
    # sequence, so the bodies are defined once and reused.
    seq_a = ('GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA'
             'GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA'
             'TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA'
             'TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA'
             'AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n')
    seq_b = ('TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
             'CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
             'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
             'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
             'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC')
    seq_c = ('TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
             'CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
             'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
             'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
             'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC')
    # NOTE(review): seq_b and seq_c have no trailing newline before the next
    # fasta header, exactly as in the original data.  That merges seq4+seq5
    # (and seq7+seq8) headers into the sequence line -- confirm whether this
    # is deliberate test data before changing it.
    sanger = '>seq3\n' + seq_a + '>seq4\n' + seq_b + '>seq5\n' + seq_c
    # renamed from the original "sange2" typo; purely a local name
    sanger2 = '>seq6\n' + seq_a + '>seq7\n' + seq_b + '>seq8\n' + seq_c

    fpath_sanger = join(clean_reads_dir, 'lb_hola1.pl_sanger.sm_hola.fasta')
    fpath_solexa = join(clean_reads_dir,
                        'lb_hola2.pl_illumina.sm_hola.sfastq')
    open(fpath_sanger, 'w').write(sanger)
    open(fpath_solexa, 'w').write(solexa)
    fpath_sanger2 = join(clean_reads_dir, 'lb_adios.pl_sanger.fasta')
    fpath_solexa2 = join(clean_reads_dir, 'lb_adios.pl_illumina.sfastq')
    open(fpath_sanger2, 'w').write(sanger2)
    open(fpath_solexa2, 'w').write(solexa2)

    # the reference
    reference_dir = join(project_dir, 'mapping/reference')
    os.makedirs(reference_dir)
    reference_fpath = join(reference_dir, 'reference.fasta')
    out = open(reference_fpath, 'w')
    for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
        out.write(line)
    # BUGFIX: close the reference explicitly; it was left open, so the
    # mapping analysis below could see a partially flushed file
    out.close()

    do_analysis(project_settings=settings_path, kind='mapping', silent=True)
    mapping_dir = join(project_dir, 'mapping')
    singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
    singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
    assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                       'lb_hola2.pl_illumina.sm_hola.bam'))
    result_dir = join(mapping_dir, 'bams')
    assert exists(result_dir)
    result_dir_by_lib = join(result_dir, 'by_readgroup')
    assert exists(result_dir_by_lib)

    do_analysis(project_settings=settings_path, kind='merge_bams',
                silent=True)
    assert exists(join(result_dir, 'merged.0.bam'))
    assert exists(join(result_dir, 'merged.0.bam.bai'))

    # we realign the mapping using GATK
    do_analysis(project_settings=settings_path, kind='realign_bam',
                silent=True)
    assert exists(join(result_dir, 'merged.1.bam'))

    annot_input_dir = join(project_dir, 'annotations', 'input')
    os.makedirs(annot_input_dir)
    os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
    do_analysis(project_settings=settings_path, kind='annotate_snvs',
                silent=True)
    do_analysis(project_settings=settings_path, kind='annotate_orfs',
                silent=True)
    do_analysis(project_settings=settings_path, kind='annotate_prot_change',
                silent=True)

    # NOTE(review): other tests in this file locate the pickles through
    # BACKBONE_DIRECTORIES['annotation_dbs']; this hard-coded
    # 'annotations/db' path is kept as-is -- verify both point at the same
    # directory.
    result_file = join(project_dir, 'annotations', 'db',
                       'reference.2.pickle')
    seqs = list(seqs_in_file(open(result_file)))
    snv = seqs[2].features[0]
    assert snv.qualifiers['protein_change']['kind'] == 'substitution'
    assert snv.qualifiers['protein_change']['location'] == 'codon_1'
    test_dir.close()