def test_get_strains(mocker, capsys):
    """Exercise helper.get_strains against a mocked os.listdir.

    gp.fasta_suffix is patched to '.fa' and gp.chrms to the sixteen
    roman-numeral chromosome names, so each directory listing below is
    interpreted against those values.  The scenarios covered are:

    * an empty directory and a directory with no matching files, both of
      which print a "found no chromosome sequence files" message;
    * a directory whose strain is missing chromosomes, which raises
      AssertionError;
    * one strain, two strains in one directory, and two directories
      listed in a single call.
    """
    # ensure params are correct
    mocker.patch('align.align_helpers.gp.fasta_suffix', '.fa')
    mocker.patch('align.align_helpers.gp.chrms',
                 ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                  'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI'])
    mock_dir = mocker.patch('os.listdir')

    # empty directory
    mock_dir.return_value = []
    helper.get_strains(['mock'])
    assert "found no chromosome sequence files in mock" in\
        capsys.readouterr().out

    # directory containing only an unrelated file
    mock_dir.return_value = ['nothing']
    helper.get_strains(['invalid'])
    assert "found no chromosome sequence files in invalid" in\
        capsys.readouterr().out

    # a lone file that does not cover every chromosome
    mock_dir.return_value = ['missing_chr.fa']
    with pytest.raises(AssertionError) as e:
        helper.get_strains(['noStrains'])
    # The check sits after the with-block so that it actually runs, and it
    # inspects the raised exception (e.value) rather than the ExceptionInfo
    # wrapper, whose str() is not guaranteed to be the exception message.
    assert "some strains in noStrains are missing" in str(e.value)

    # one complete strain
    mock_dir.return_value = ['strain_chr{}.fa'.format(chrm)
                             for chrm in helper.gp.chrms]
    strains = helper.get_strains(['one_strain'])
    assert strains == [('strain', 'one_strain')]

    # two complete strains sharing a directory
    mock_dir.return_value = ['strain{}_chr{}.fa'.format(strain, chrm)
                             for chrm in helper.gp.chrms
                             for strain in (1, 2)]
    strains = helper.get_strains(['two/strains'])
    assert strains == [('strain1', 'two/strains'),
                       ('strain2', 'two/strains')]

    # two directories: side_effect supplies one listing per os.listdir call
    mock_dir.side_effect = [['strain{}_chr{}.fa'.format(strain, chrm)
                             for chrm in helper.gp.chrms
                             for strain in (1, 2)],
                            ['strain_chr{}.fa'.format(chrm)
                             for chrm in helper.gp.chrms]]
    strains = helper.get_strains(['two_strains', 'one_strain'])
    assert strains == [('strain', 'one_strain'),
                       ('strain1', 'two_strains'),
                       ('strain2', 'two_strains')]
def test_get_strains(mocker, capsys):
    """Check helper.get_strains using a mocked os.listdir.

    With gp.fasta_suffix patched to '.fa' and gp.chrms patched to sixteen
    roman-numeral chromosome names, the test feeds get_strains a series of
    fake directory listings and checks the printed message, the raised
    AssertionError, or the returned (strain, directory) tuples.
    """
    # ensure params are correct
    mocker.patch('align.align_helpers.gp.fasta_suffix', '.fa')
    mocker.patch('align.align_helpers.gp.chrms', [
        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
        'XII', 'XIII', 'XIV', 'XV', 'XVI'
    ])
    mock_dir = mocker.patch('os.listdir')

    # nothing in the directory at all
    mock_dir.return_value = []
    helper.get_strains(['mock'])
    assert "found no chromosome sequence files in mock" in\
        capsys.readouterr().out

    # a file that is not a chromosome sequence file
    mock_dir.return_value = ['nothing']
    helper.get_strains(['invalid'])
    assert "found no chromosome sequence files in invalid" in\
        capsys.readouterr().out

    # a single file, so the strain lacks most chromosomes
    mock_dir.return_value = ['missing_chr.fa']
    with pytest.raises(AssertionError) as e:
        helper.get_strains(['noStrains'])
    # Checked outside the with-block so the assertion is reached, and
    # against e.value because str() of the ExceptionInfo wrapper itself is
    # not guaranteed to be the exception message.
    assert "some strains in noStrains are missing" in str(e.value)

    # every chromosome present for one strain
    mock_dir.return_value = [
        'strain_chr{}.fa'.format(chrm) for chrm in helper.gp.chrms
    ]
    strains = helper.get_strains(['one_strain'])
    assert strains == [('strain', 'one_strain')]

    # every chromosome present for two strains in the same directory
    mock_dir.return_value = [
        'strain{}_chr{}.fa'.format(strain, chrm) for chrm in helper.gp.chrms
        for strain in (1, 2)
    ]
    strains = helper.get_strains(['two/strains'])
    assert strains == [('strain1', 'two/strains'),
                       ('strain2', 'two/strains')]

    # two directories; side_effect yields one listing per os.listdir call
    mock_dir.side_effect = [[
        'strain{}_chr{}.fa'.format(strain, chrm) for chrm in helper.gp.chrms
        for strain in (1, 2)
    ], ['strain_chr{}.fa'.format(chrm) for chrm in helper.gp.chrms]]
    strains = helper.get_strains(['two_strains', 'one_strain'])
    assert strains == [('strain', 'one_strain'),
                       ('strain1', 'two_strains'),
                       ('strain2', 'two_strains')]
def read_setup_args(fn):
    """Read a space-delimited setup file into a dictionary of settings.

    Every line of the file is split on single spaces: the first token is
    the setting name and the remaining tokens are its values.  When a name
    occurs on more than one line, the last line wins.

    Args:
        fn: path of the setup file.

    Returns:
        A dict with these keys:
            'references': the value tokens of the 'references' line.
            'reference_directories': dict pairing each reference with the
                token at the same position on the
                'reference_directories' line.
            'alignments_directory': first value token of the
                'alignments_directory' line.
            'strain_dirs': whatever align_helpers.get_strains returns for
                the value tokens of the 'test_strain_directories' line.

    Raises:
        KeyError: a required setting name is absent from the file.
        IndexError: the 'alignments_directory' line carries no value.
    """
    x = {}
    # The context manager closes the file even if parsing raises.
    with open(fn, 'r') as f:
        for line in f:
            # Strip only the newline; slicing off the last character would
            # truncate the final value when the file lacks a trailing
            # newline.
            tokens = line.rstrip('\n').split(' ')
            x[tokens[0]] = tokens[1:]

    d = {}
    d['references'] = x['references']
    d['reference_directories'] = dict(zip(x['references'],
                                          x['reference_directories']))
    d['alignments_directory'] = x['alignments_directory'][0]
    d['strain_dirs'] = \
        align_helpers.get_strains(x['test_strain_directories'])

    return d
def read_setup_args(fn):
    """Parse a setup file and assemble the setup dictionary.

    The file holds one setting per line, with tokens separated by single
    spaces.  The first token names the setting and the rest are its
    values; a later line with the same name replaces an earlier one.

    Args:
        fn: path of the setup file.

    Returns:
        A dict with these keys:
            'references': the value tokens of the 'references' line.
            'reference_directories': dict pairing each reference with the
                token at the same position on the
                'reference_directories' line.
            'alignments_directory': first value token of the
                'alignments_directory' line.
            'strain_dirs': whatever align_helpers.get_strains returns for
                the value tokens of the 'test_strain_directories' line.

    Raises:
        KeyError: a required setting name is absent from the file.
        IndexError: the 'alignments_directory' line carries no value.
    """
    x = {}
    # Using a with-block guarantees the handle is released on any error.
    with open(fn, 'r') as f:
        for line in f:
            # Remove just the line terminator.  Dropping the last character
            # unconditionally would corrupt the last value of a file that
            # does not end with a newline.
            tokens = line.rstrip('\n').split(' ')
            x[tokens[0]] = tokens[1:]

    d = {}
    d['references'] = x['references']
    d['reference_directories'] = \
        dict(zip(x['references'], x['reference_directories']))
    d['alignments_directory'] = x['alignments_directory'][0]
    d['strain_dirs'] = \
        align_helpers.get_strains(x['test_strain_directories'])

    return d
d = diffs_per_site(seqs[keys[i]].lower(), seqs[keys[j]].lower()) if d != 'NA': num += d den += 1 if den == 0: return 'NA' return float(num) / den # read in shared regions shared_regions, _ = \ read_table.read_table_rows('shared_introgression_nonsingleton_list.txt', '\t') # read in strain dirs information s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) strain_dirs = dict(s) # for each shared region: # - calculate fraction of sites that are polymorphic among introgressed strains # - for each introgressed strain, calculate: # - number of unique variants among introgressed strains (or all strains?) f = open('shared_introgression_nonsingleton_polymorphism.txt', 'w') f.write('region_number\tchromosome\tstart\tend\tpi\t' 'frac_poly\tnum_poly\tnum_total\tnum_strains\tstrain_list\n') for chrm in gp.chrms: chrom_seqs = {} for region_number in shared_regions.keys(): if shared_regions[region_number]['chromosome'] != chrm: continue
import os from align.align_helpers import get_strains, flatten import global_params as gp # get all non-reference strains of cerevisiae and paradoxus s = get_strains(flatten(gp.non_ref_dirs.values())) gp_dir = '../' a = [] if gp.resume_alignment: a = os.listdir(gp_dir + gp.alignments_dir) # need to add this on the start of each command because os.system() # creates a new shell instance every time cmd_string_start = 'export MUGSY_INSTALL=' + gp.mugsy_install_path + '; ' cmd_string_start += 'export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping; ' cmd_string_start += 'export PERL5LIB=$MUGSY_INSTALL/perllibs; ' ref_prefix = '_'.join(gp.alignment_ref_order) + '_' ref_dirs = [gp.ref_dir[ref] for ref in gp.alignment_ref_order] for strain, d in s: print(strain) cmd_string = cmd_string_start for chrm in [gp.chrms[-1]]: align_fn = ref_prefix + strain + '_chr' + chrm + gp.alignment_suffix # if we don't already have an alignment for this strain/chromosome, # then make one if align_fn not in a:
# strain regions_by_chrm_and_strain = dict( zip(gp.chrms, [{} for i in range(len(gp.chrms))])) fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' d, labels = read_table.read_table_rows(fn_regions, '\t') for region in d: chrm = d[region]['chromosome'] strain = d[region]['strain'] if strain not in regions_by_chrm_and_strain[chrm]: regions_by_chrm_and_strain[chrm][strain] = [] regions_by_chrm_and_strain[chrm][strain].append( (int(d[region]['start']), int(d[region]['end']))) # read in all strains strain_dirs = align_helpers.get_strains(gp.non_ref_dirs[gp.master_ref]) num_strains = len(strain_dirs) # read in genes in reference sequence into dictionary keyed by # chromosome ref_genes = {} for chrm in gp.chrms: ref_genes[chrm] = [] f = open( gp.analysis_out_dir_absolute + gp.master_ref + '_chr' + chrm + '_genes.txt', 'r') line = f.readline() while line != '': line = line[:-1].split('\t') ref_genes[chrm].append((int(line[1]), int(line[2]))) line = f.readline()