def concatenate_fastas(fns, fn_out, remove_gaps):
    """Concatenate the sequences of several FASTA files, header by header.

    The headers of the first file define which records are written and in
    what order. Every file in ``fns`` (including the first) is then read and
    each of its sequences is appended to the record with the same header.

    Args:
        fns: FASTA file names, processed in the given order.
        fn_out: path of the file to write; each record is written as the
            header line followed by the concatenated sequence line.
        remove_gaps: accepted for interface compatibility; not used here.

    Raises:
        IndexError: if ``fns`` is empty.
        KeyError: if a file contains a header absent from the first file.
    """
    strains = read_fasta.read_fasta(fns[0])[0]

    # collect the pieces per header and join once at the end instead of
    # growing a string with repeated +=
    pieces = {strain: [] for strain in strains}
    for fn in fns:
        headers, seqs = read_fasta.read_fasta(fn)
        for i, seq in enumerate(seqs):
            pieces[headers[i]].append(seq)

    # context manager guarantees the output is closed even if a write fails
    with open(fn_out, 'w') as f:
        for strain in strains:
            f.write(strain + '\n')
            f.write(''.join(pieces[strain]) + '\n')
def test_read_fasta_empty(mocker):
    """Check read_fasta on a header-only file and on a zero-length file."""
    # file that holds nothing but a bare header marker
    header_only = StringIO('>\n')
    open_mock = mocker.patch('misc.read_fasta.open',
                             return_value=header_only)

    headers, seqs = read_fasta('mocked')

    assert headers == ['>']
    assert seqs.tolist() == [[]]
    open_mock.assert_called_with('mocked', 'r')

    # zero-length file
    nothing = StringIO('')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=nothing)
    # TODO probably handle empty files better
    with pytest.raises(IndexError):
        headers, seqs = read_fasta('mocked')
def test_read_fasta_multi(mocker):
    """Check read_fasta on a file with three multi-line records."""
    contents = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
> headseq headfname.fa
actg
actg
---
> headseq2 headfname.fa
actg-
cataaa
''')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=contents)

    headers, seqs = read_fasta('mocked')

    expected_headers = [
        '> headseq headfname.fa',
        '> headseq headfname.fa',
        '> headseq2 headfname.fa',
    ]
    assert headers == expected_headers

    print(seqs)
    # each record's lines are expected to be joined into one row of characters
    expected_seqs = np.array([
        list('actg---atcg'),
        list('actgactg---'),
        list('actg-cataaa'),
    ])
    assert seqs == approx(expected_seqs)
    open_mock.assert_called_with('mocked', 'r')
def get_range_seqs(strains, chrm, start, end, tag, gp_dir='../'):
    """Extract, for every strain, the sequence matching a reference range.

    For each ``(strain, directory)`` pair the chromosome FASTA is read and
    the reference coordinates ``start``/``end`` are translated to strain
    coordinates, using the strain's site summary table or, when that file
    does not exist, the reference alignment.

    Returns:
        dict mapping strain to ``(sequence, start_strain, end_strain)``.
    """
    # TODO this shouldn't actually be dependent on tag
    range_seqs_by_strain = {}
    for strain, strain_dir in strains:
        print(strain)
        chrm_fn = strain_dir + strain + '_chr' + chrm + gp.fasta_suffix
        chrm_seq = read_fasta.read_fasta(chrm_fn)[1][0]

        coords = None
        try:
            coords, labels = read_table.read_table_columns(
                gp.analysis_out_dir_absolute + tag + '/' +
                'site_summaries/predictions_' +
                strain + '_chr' + chrm + '_site_summary.txt.gz',
                '\t')
        except FileNotFoundError:
            # for par reference which doesn't have site summary file
            align_fn = gp_dir + gp.alignments_dir + \
                '_'.join(gp.alignment_ref_order) + '_chr' + chrm + \
                '_mafft' + gp.alignment_suffix
            coords = get_inds_from_alignment(align_fn, True)

        ref_to_strain = dict(zip(coords['ps_ref'], coords['ps_strain']))

        # round inwards so the strain range never extends past the region
        first = int(math.ceil(float(ref_to_strain[str(start)])))
        last = int(math.floor(float(ref_to_strain[str(end)])))
        range_seqs_by_strain[strain] = (chrm_seq[first:last + 1],
                                        first, last)
    return range_seqs_by_strain
def test_filter_ambiguous_on_region_10805(filterer, mocker):
    """Run filter_ambiguous on region r10805 at two thresholds.

    Only warns when the data file is not next to this test module.
    """
    fa = os.path.join(os.path.split(__file__)[0], 'r10805.fa')
    if not os.path.exists(fa):
        warnings.warn('Unable to test with datafile r10805.fa')
        return

    headers, seqs = read_fasta.read_fasta(fa, gz=False)
    # the last row is dropped before filtering
    seqs = seqs[:-1]
    states = ('S288c', 'CBS432', 'N_45', 'DBVPG6304', 'UWOPS91_917_1')

    # threshold 0.1
    region = {'predicted_species': 'N_45'}
    p, _ = filterer.filter_ambiguous(region, seqs, 0.1, list(states))
    assert p is False
    assert region['alternative_states'] == (
        'CBS432,N_45,UWOPS91_917_1,DBVPG6304')
    assert region['alternative_ids'] == (
        '0.9983805668016195,0.994331983805668,'
        '0.9642857142857143,0.9618506493506493')
    assert region['alternative_P_counts'] == '145,143,128,129'

    # threshold 0.98
    region = {'predicted_species': 'N_45'}
    p, _ = filterer.filter_ambiguous(region, seqs, 0.98, list(states))
    assert p is False
    assert region['alternative_states'] == 'CBS432,N_45'
    assert region['alternative_ids'] == (
        '0.9983805668016195,0.994331983805668')
    assert region['alternative_P_counts'] == '145,143'
def get_indices(self, chromosome: str, strain: str) -> Tuple:
    '''
    Read the alignment for the given chromosome and strain and index it.

    Returns a tuple of:
    -the sequences as returned by read_fasta
    -one reference-to-alignment index per sequence
    -masked sites mapped through those indices: one entry per known
     state, followed by one for the strain
    '''
    alignment_fn = self.alignments.format(chrom=chromosome, strain=strain)
    _, sequences = read_fasta.read_fasta(alignment_fn)

    # to go from index in reference seq to index in alignment
    alignments = []
    for seq in sequences:
        alignments.append(self.index_alignment_by_reference(seq))

    strain_masked = self.read_masked_sites(chromosome, strain)

    masked_sites = []
    for ind, state in enumerate(self.known_states):
        state_masked = self.masked_sites[chromosome][state]
        masked_sites.append(alignments[ind][state_masked])
    # the strain is the last sequence of the alignment
    masked_sites.append(alignments[-1][strain_masked])

    return sequences, alignments, masked_sites
def test_read_fasta_multi(mocker):
    """Three records, each spread over several lines, are read correctly."""
    fake_file = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
> headseq headfname.fa
actg
actg
---
> headseq2 headfname.fa
actg-
cataaa
''')
    patched_open = mocker.patch('misc.read_fasta.open',
                                return_value=fake_file)

    headers, seqs = read_fasta('mocked')

    assert headers == ['> headseq headfname.fa',
                       '> headseq headfname.fa',
                       '> headseq2 headfname.fa']

    print(seqs)
    rows = [list(joined) for joined in ('actg---atcg',
                                        'actgactg---',
                                        'actg-cataaa')]
    assert seqs == approx(np.array(rows))
    patched_open.assert_called_with('mocked', 'r')
def get_orfs(fn):
    """Parse ORF names and coordinates from the headers of a FASTA file.

    Each header is searched for `` <name>_<strain>:<start>:<end>``.

    Returns:
        dict mapping ``(start, end)`` (as ints) to the ORF name; a later
        header with the same coordinates overwrites an earlier one.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    header_pattern = re.compile(
        r' (?P<name>[a-zA-Z0-9]+)_(?P<strain>[a-zA-Z0-9\.]+)'
        ':(?P<start>[0-9]+):(?P<end>[0-9]+)')

    orfs = {}
    for header in headers:
        hit = header_pattern.search(header)
        coords = (int(hit.group('start')), int(hit.group('end')))
        orfs[coords] = hit.group('name')
    return orfs
def mask(fn, masked_fn, intervals_fn):
    """Write a copy of a FASTA file with the given intervals masked.

    Only the first sequence of ``fn`` is used. Every position inside an
    interval (both ends inclusive) is replaced by ``gp.unsequenced_symbol``
    and the result is written to ``masked_fn`` with the original headers.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    # work on a list so single positions can be overwritten
    masked = list(seqs[0])

    for first, last in read_intervals(intervals_fn):
        for position in range(first, last + 1):
            masked[position] = gp.unsequenced_symbol

    write_fasta.write_fasta(headers, [''.join(masked)], masked_fn)
def fraction_strains_aligned(headers, seqs):
    """Compute how much of each original sequence is present in an alignment.

    For every aligned sequence the number of non-gap sites is compared to
    the length of the first sequence in the FASTA file named by the last
    space-separated field of its header.

    Returns:
        ``(fracs_aligned, seq_lengths)``: per-sequence fraction aligned and
        per-sequence count of non-gap sites.
    """
    nsites = len(seqs[0])
    seq_lengths = []
    fracs_aligned = []

    for i, aligned in enumerate(seqs):
        source_fn = headers[i].rsplit(' ', 1)[-1]
        ungapped = nsites - aligned.count(gp.gap_symbol)
        seq_lengths.append(ungapped)

        source_seqs = read_fasta.read_fasta(source_fn)[1]
        fracs_aligned.append(float(ungapped) / len(source_seqs[0]))

    return fracs_aligned, seq_lengths
def test_read_fasta_single(mocker):
    """A single record spread over several lines is read correctly."""
    fake_file = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
''')
    patched_open = mocker.patch('misc.read_fasta.open',
                                return_value=fake_file)

    headers, seqs = read_fasta('mocked')

    assert headers == ['> headseq headfname.fa']
    expected = np.asarray([list('actg---atcg')])
    assert seqs == approx(expected)
    patched_open.assert_called_with('mocked', 'r')
def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand,
                  tag, strain_ind_to_ref_ind):
    """Find, per strain, the ORF that best matches a query gene via blastn.

    For each ``(strain, directory)`` pair, blastn is run with ``query_fn``
    against the strain's ORF file for ``chrm``; the hits are handed to
    ``choose_best_hit`` and the chosen ORF is stored.

    Returns:
        dict mapping strain to ``(x, seq, orf_start, orf_end, orf_strand)``
        as returned by ``choose_best_hit``, or
        ``('nohit', '', -1, -1, '')`` when there is no usable hit.

    NOTE(review): this function still contains debugging code -- only
    strain 'yjm1332' is processed and ``sys.exit()`` terminates the program
    after the first strain with hits. Both are kept as found; confirm
    whether they should be removed.
    """
    # outfmt = '"6 qseqid sseqid slen qstart qend \
    #                length mismatch gapopen gaps sseq"'
    outfmt = '"6 sseqid slen evalue bitscore"'
    strain_gene_seqs = {}
    out_fn = 'blast_chr' + chrm + '.out'
    no_hit = ('nohit', '', -1, -1, '')

    for strain, d in strains:
        # NOTE(review): hard-coded strain filter, looks like a debug leftover
        if strain != 'yjm1332':
            continue
        print('-', strain)
        sys.stdout.flush()

        fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix
        # NOTE(review): the command is a shell string passed to os.system;
        # paths containing spaces or shell metacharacters will break it
        cmd_string = gp.blast_install_path + 'blastn' + \
            ' -db ' + fn + \
            ' -query ' + query_fn + \
            ' -out ' + out_fn + \
            ' -outfmt ' + outfmt
        # print(cmd_string)
        os.system(cmd_string)

        # read the tab-separated hits; the context manager closes the file,
        # which the previous open(...).readlines() never did
        with open(out_fn, 'r') as blast_out:
            hits = [line[:-1].split('\t') for line in blast_out]
        if not hits:
            strain_gene_seqs[strain] = no_hit
            continue

        # best_orf_id = hits[0][0]
        headers, seqs = read_fasta.read_fasta(fn)
        best_orf_id, x, seq, orf_start, orf_end, orf_strand = \
            choose_best_hit(hits, start, end, tag, strain, chrm,
                            headers, seqs, strain_ind_to_ref_ind[strain])
        print(hits)
        print(best_orf_id)
        print(orf_strand, strand)
        # NOTE(review): exits the whole program here, so nothing below in
        # this loop body is ever reached
        sys.exit()

        if best_orf_id is None or orf_strand != strand:
            strain_gene_seqs[strain] = no_hit
            continue
        strain_gene_seqs[strain] = (x, seq, orf_start, orf_end, orf_strand)

    os.remove(out_fn)
    return strain_gene_seqs
def test_filter_ambiguous_on_region_10817(filterer, mocker):
    """Run filter_ambiguous on region r10817 with threshold 0.98.

    Only warns when the data file is not next to this test module.
    """
    fa = os.path.join(os.path.split(__file__)[0], 'r10817.fa')
    if not os.path.exists(fa):
        warnings.warn('Unable to test with datafile r10817.fa')
        return

    headers, seqs = read_fasta.read_fasta(fa, gz=False)
    # the last row is dropped before filtering
    seqs = seqs[:-1]
    states = ['S288c', 'CBS432', 'N_45', 'DBVPG6304', 'UWOPS91_917_1']

    region = {'predicted_species': 'CBS432'}
    p, _ = filterer.filter_ambiguous(region, seqs, 0.98, states)

    assert p is False
    assert region['alternative_states'] == (
        'CBS432,N_45')
    assert region['alternative_P_counts'] == '111,110'
def get_inds_from_alignment(fn, rind, sind):
    """Map positions of one aligned sequence onto another aligned sequence.

    Reads the alignment in ``fn`` and walks its columns. For every column in
    which sequence ``rind`` has no gap, one entry is appended to the result:
    the index (as a string) of the current position in sequence ``sind``, or
    ``None`` when sequence ``sind`` has a gap in that column.

    NOTE(review): the nesting of the append under the ``rind`` check was
    reconstructed from flattened source -- confirm against the original.

    Returns:
        list with one entry per non-gap position of sequence ``rind``.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    n = len(seqs[0])
    # running indices into the ungapped sequences; -1 means "before start"
    ri = -1
    si = -1
    ps = []
    for i in range(n):
        # assume a gap in the sind sequence until shown otherwise
        s_gap = True
        if seqs[sind][i] != gp.gap_symbol:
            si += 1
            s_gap = False
        if seqs[rind][i] != gp.gap_symbol:
            # ri is advanced but never read afterwards
            ri += 1
            if s_gap:
                ps.append(None)
            else:
                ps.append(str(si))
    return ps
def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1):
    """Build paired position lists for two sequences of an alignment.

    Reads the alignment in ``fn`` and walks its columns, counting non-gap
    positions in sequences ``rind`` and ``sind``. For every column in which
    sequence ``rind`` has no gap, the current index of each sequence is
    recorded (as strings).

    When ``flip_ref`` is true, ``rind``/``sind`` are overridden with 1/0 and
    the two lists are swapped in the returned dict.

    NOTE(review): the nesting of the appends under the ``rind`` check was
    reconstructed from flattened source -- confirm against the original.

    Returns:
        dict with keys ``'ps_ref'`` and ``'ps_strain'``, each a list of
        string indices.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    n = len(seqs[0])
    # running indices into the ungapped sequences; -1 means "before start"
    ri = -1
    si = -1
    pr = []
    ps = []
    if flip_ref:
        # any rind/sind passed by the caller is ignored in this case
        rind = 1
        sind = 0
    for i in range(n):
        if seqs[sind][i] != gp.gap_symbol:
            si += 1
        if seqs[rind][i] != gp.gap_symbol:
            ri += 1
            pr.append(str(ri))
            ps.append(str(si))
    if flip_ref:
        return {'ps_ref': ps, 'ps_strain': pr}
    return {'ps_ref': pr, 'ps_strain': ps}
def get_aligned_genes(fn, strains):
    """Read aligned gene sequences for selected strains, dropping empty columns.

    The strain name is the first whitespace-separated word of each header
    after its leading character. Only strains contained in ``strains`` are
    kept, and alignment columns that are gaps in all kept sequences are
    removed.

    Args:
        fn: alignment FASTA file name.
        strains: container of strain names to keep.

    Returns:
        dict mapping strain name to its sequence; empty when no header
        matches any of ``strains``.
    """
    headers, seqs = read_fasta.read_fasta(fn)

    d = {}
    for i, header in enumerate(headers):
        strain = header[1:].split()[0]
        if strain in strains:
            d[strain] = seqs[i]

    # nothing matched: there are no columns to inspect
    if not d:
        return d

    # dict views are not subscriptable on Python 3 (d.values()[0] raises
    # TypeError), so take the first value through an iterator
    n = len(next(iter(d.values())))

    remove_columns = [
        i for i in range(n)
        if all(seq[i] == gp.gap_symbol for seq in d.values())
    ]

    # delete from the right so earlier column indices stay valid
    for i in reversed(remove_columns):
        for strain in d:
            d[strain] = d[strain][:i] + d[strain][i + 1:]
    return d
def get_ref_gene_seq(gene, gene_coords_fn, seq_fn):
    """Return the sequence, 0-based coordinates and strand of a gene.

    The coordinate table is keyed by its first column; a row is re-keyed by
    its second column unless that column is the literal ``""``. The gene's
    inclusive range is cut from the first sequence of ``seq_fn`` and
    reverse-complemented when the strand is ``'-1'``.

    Returns:
        ``(gene_seq, gene_start, gene_end, strand)``.
    """
    rows, labels = read_table.read_table_rows(gene_coords_fn, '\t',
                                              header=False, key_ind=0)

    by_name = {}
    for key in rows:
        fields = rows[key]
        name = key if fields[0] == '""' else fields[0]
        by_name[name] = fields[1:]

    record = by_name[gene]
    # table coordinates are shifted down by one before slicing
    gene_start = int(record[2]) - 1
    gene_end = int(record[3]) - 1

    chrm_seq = read_fasta.read_fasta(seq_fn)[1][0]
    gene_seq = chrm_seq[gene_start:gene_end + 1]

    strand = record[1]
    if strand == '-1':
        gene_seq = seq_functions.reverse_complement(gene_seq)

    assert gene_seq.startswith(('atg', 'ATG'))
    assert gene_start < gene_end
    return gene_seq, gene_start, gene_end, strand
import sys
from misc import read_fasta


def pad(s, n):
    """Return ``s`` stripped, then cut or space-padded to exactly ``n`` chars."""
    return s.strip()[:n].ljust(n)


# Usage: <script> input.fasta output
# Writes a first line "<number of sequences> <length of first sequence>",
# then one line per record: the header (without its first character) fixed
# to 10 characters, immediately followed by the sequence.
# NOTE(review): this looks like a PHYLIP-style layout -- confirm.
headers, seqs = read_fasta.read_fasta(sys.argv[1])

# context manager guarantees the output is flushed and closed on error
with open(sys.argv[2], 'w') as fp:
    fp.write(str(len(headers)) + ' ' + str(len(seqs[0])) + '\n')
    for i, header in enumerate(headers):
        fp.write(pad(header[1:], 10) + seqs[i] + '\n')
# ====== # write all sites, including gaps # ====== print('writing file with all sites') f_all = open(fn_all, 'w') # master reference (cerevisiae) print('*', gp.master_ref) f_all.write('>' + gp.master_ref + '\n') chrm_offset = 0 for chrm in gp.chrms: seq = read_fasta.read_fasta(gp.ref_dir[gp.master_ref] + gp.ref_fn_prefix[gp.master_ref] + '_chr' + chrm + gp.fasta_suffix)[1][0] f_all.write(seq) chrm_offsets[chrm] = chrm_offset chrm_offset += len(seq) f_all.write('\n') # other reference (paradoxus) other_ref_strain = gp.ref_fn_prefix[gp.alignment_ref_order[1]] print('*', other_ref_strain) f_all.write('>' + other_ref_strain + '\n') for chrm in gp.chrms: align_fn = gp_dir + gp.alignments_dir + \ '_'.join(gp.alignment_ref_order) + '_chr' + chrm + \ '_mafft' + gp.alignment_suffix ps = get_inds_from_alignment(align_fn, 0, 1)
open('check_paralogs_out_cer_paralog.tsv', 'r').readlines()] genes_to_analyze = list(set(genes_to_analyze)) ip = 0 for gene in genes_to_analyze: if gene not in paralogs: continue print(ip) ip += 1 chrm, ref_gene_start, ref_gene_end = gene_coords[gene] gene_headers, gene_seqs = \ read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + gene + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_seqs = dict(zip(gene_headers, gene_seqs)) cer_seq = strain_seqs['S288c'] par_seq = strain_seqs['CBS432'] paralog = paralogs[gene] gene_headers, gene_seqs = \ read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + paralog + '/' + paralog + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_paralog_seqs = dict(zip(gene_headers, gene_seqs)) cer_paralog_seq = strain_paralog_seqs['S288c'] par_paralog_seq = strain_paralog_seqs['CBS432']
import os
from convert_coordinates import (write_coordinates, convert)
import global_params as gp
from misc import read_fasta

# Write coordinate conversion files, in both directions, between the first
# and last sequence of every alignment file in the alignments directory.

gp_dir = '../'
fns = os.listdir(gp_dir + gp.alignments_dir)
fns = (fn for fn in fns if fn.endswith(gp.alignment_suffix))

for fn in fns:
    print(fn)

    # file name fields are '_'-separated: strain names, then the
    # chromosome as second-to-last field
    fields = fn.split('_')
    chrm = fields[-2]
    strain_names = fields[0:-2]
    first_name = strain_names[0]
    last_name = strain_names[-1]

    headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + fn)

    # for each index in cer reference, get index in other strain
    # (either par reference for 2-way alignment or cer strain for
    # 3-way)
    coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' +
                first_name + '_to_' + last_name +
                '_' + chrm + '.txt.gz')
    forward = convert(seqs[0], seqs[-1])
    write_coordinates(forward, coord_fn)

    # for each index in other strain, get index in cer reference
    coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' +
                last_name + '_to_' + first_name +
                '_' + chrm + '.txt.gz')
    backward = convert(seqs[-1], seqs[0])
    write_coordinates(backward, coord_fn)
'_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') ref_ind_to_strain_i_ind = [ try_int(line[:-1]) for line in f_coord.readlines() ] # current strain fasta file for current chromosome strain_fn = d_i + strain_i + '_chr' + chrm + gp.fasta_suffix print(strain_i, chrm) # get chromosome sequence for this strain relative to # reference strain (the base for this strain at each site in # the reference, based on original alignment); # gaps/unsequenced sites/etc marked as 'N' strain_i_seqs[chrm] = referize( read_fasta.read_fasta(strain_fn)[1][0].lower(), ref_ind_to_strain_i_ind) # get version of sequence where everything that doesn't fall # within gene is replaced by 'N' strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm], ref_genes[chrm]) # also get version of above sequences where introgressed sites are # replaced by 'N' strain_i_seqs_nonint[chrm] = copy.deepcopy(strain_i_seqs[chrm]) strain_i_seqs_coding_nonint[chrm] = copy.deepcopy( strain_i_seqs_coding[chrm]) if strain_i in regions_by_chrm_and_strain[chrm]: strain_i_seqs_nonint[chrm] = mark_excluded( strain_i_seqs[chrm],
# ====== # input file for ldselect is formatted so that each row is a snp and # each column is the genotype for a strain, e.g. fn = out_dir + 'ldselect_input_chr' + chrm + '.tsv' f = open(fn, 'w') snps = defaultdict(list) # loop through all the strains for strain in strains: print('-', strain) # read multiple alignment file for this strain with the master # reference (and other references which we don't care about # here) headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + '_' + strain + '_chr' + chrm + '_mafft.maf') # look at all alignment columns, keeping track of the index in # the master reference i = 0 for c in range(len(seqs[0])): # if the master reference doesn't have a gap in this # column, then store the allele that the current strain # has at this site if seqs[0][c] != gp.gap_symbol and seqs[0][c] != gp.unsequenced_symbol: snps[i].append(seqs[-1][c]) i += 1 # get reference sequence (unaligned, without gaps) # TODO correct alignment file location ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir +
sys.stdout.flush() fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' +\ 'predictions_' + strain + '_chr' + chrm + '_site_summary.txt.gz' if not os.path.exists(os.path.dirname(fn_out)): os.makedirs(os.path.dirname(fn_out)) # skip this strain x chromosome if there are no introgressed # regions for it if strain not in regions or chrm not in regions[strain]: continue # read alignment blocks for this strain and chromosome fn_align = fn_align_prefix + \ strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align) # read masked (unaligned) sequences seq_masked_fns = [header.split()[-1] for header in alignment_headers] seq_masked_fns = [ mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix for mfn in seq_masked_fns ] seqs_masked = [read_fasta.read_fasta(mfn)[1][0] for mfn in seq_masked_fns] labels = ref_labels + [strain] # mark each site as matching each reference or not ref_match_by_site = gene_predictions.get_ref_match_by_site( alignment_seqs, labels) # mark each site as in a gene or not
# - by chromosome # - in windows across genome window = 100 gp_dir = '../' nrefs = len(gp.alignment_ref_order) pair_chrm_ids = defaultdict(lambda: defaultdict(list)) for chrm in gp.chrms: print(chrm) fn = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + '_chr' + chrm + '_mafft' + gp.alignment_suffix) headers, seqs = read_fasta.read_fasta(fn) for i in range(nrefs): ref1 = gp.alignment_ref_order[i] for j in range(i + 1, nrefs): print(i, j) ref2 = gp.alignment_ref_order[j] ids = seq_functions.seq_id_windowed(seqs[i], seqs[j], window) pair_chrm_ids[(ref1, ref2)][chrm] = ids fs = open( gp.analysis_out_dir_absolute + 'ref_ids_summary_' + '_'.join(gp.alignment_ref_order) + '.txt', 'w') fs.write('pair\tchromosome\tmean\tmedian\n')
region_seqs = {} for strain in strains: print(' ', strain) ref_to_strain_coords = [ float(x[:-1]) for x in gzip.open(gp.analysis_out_dir_absolute + 'coordinates/S288c_to_' + strain + '_chr' + chrm + '.txt.gz').readlines() ] strain_start = int( max(0, math.ceil(ref_to_strain_coords[ref_start]))) strain_end = int(math.floor(ref_to_strain_coords[ref_end])) if strain not in chrom_seqs: chrom_seqs[strain] = read_fasta.read_fasta( strain_dirs[strain] + strain + '_chr' + chrm + gp.fasta_suffix)[1][0] # seq = chrom_seqs[strain][strain_start:strain_end+1] seq = [gp.gap_symbol for i in range(ref_start, ref_end + 1)] for i in range(ref_start, ref_end + 1): c = ref_to_strain_coords[i] if int(c) == c: seq[i - ref_start] = chrom_seqs[strain][int(c)] region_seqs[strain] = ''.join(seq) p, t = calculate_polymorphism(region_seqs) fp = 'NA' if t != 0: fp = float(p) / t nuc_div = calculate_nuc_div(region_seqs)
for chrm in gp.chrms: for strain, strain_dir in args['setup_args']['strain_dirs']: print(f'working on: {strain} {chrm}') ref_prefix = '_'.join(args['known_states']) fn = (f'{args["setup_args"]["alignments_directory"]}{ref_prefix}_{strain}' f'_chr{chrm}_mafft{gp.alignment_suffix}') if not os.path.exists(fn): print(fn) print(f'no alignment for {strain} {chrm}') continue headers, seqs = read_fasta.read_fasta(fn) ref_seqs = seqs[:-1] predict_seq = seqs[-1] # predict introgressed/non-introgressed tracts state_seq, probs, hmm, hmm_init, ps = \ predict.predict_introgressed(ref_seqs, predict_seq, args, train=True) state_seq_blocks = predict.convert_to_blocks(state_seq, args['states']) # output # the positions actually used in predictions
'_genes.txt' genes, _ = read_table.read_table_rows(fn, '\t', header=False, key_ind=0) for gene in genes: genes[gene] = (int(genes[gene][0]), int(genes[gene][1])) # read in cer ref -> par ref position file fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \ '_to_' + other_ref + '_chr' + chrm + '.txt.gz' master_to_other_ref_pos = [ float(line[:-1]) for line in gzip.open(fn, 'rb').readlines() ] # read in cer ref chromosome sequence fn = gp.ref_dir[gp.master_ref] + gp.ref_fn_prefix[gp.master_ref] + \ '_chr' + chrm + gp.fasta_suffix master_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref chromosome sequence fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ '_chr' + chrm + gp.fasta_suffix other_ref_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref ORFs fn = gp.ref_dir[other_ref] + 'orfs/' + other_ref + \ '_chr' + chrm + '_orfs' + gp.fasta_suffix ref_orfs = annotate_positions.get_orfs(fn) for strain in region_ids_by_chrm_strain[chrm].keys(): print('-', strain) if strain not in strain_totals:
# ====== strain_dirs = align_helpers.get_strains( align_helpers.flatten(gp.non_ref_dirs.values())) num_strains = len(strain_dirs) # ====== # loop through all strains, getting appropriate sequence # ====== # master reference and other reference seqs master_ref = gp.alignment_ref_order[0] master_fn = gp.ref_dir[master_ref] + gp.ref_fn_prefix[master_ref] + '_chr' + \ chrm + gp.fasta_suffix master_seq = read_fasta.read_fasta(master_fn)[1][0][ region_start:region_end+1].lower() other_ref = gp.alignment_ref_order[1] coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ gp.master_ref + '_to_' + other_ref + \ '_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] other_ref_fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ '_chr' + chrm + gp.fasta_suffix other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(), ref_ind_to_strain_ind)[region_start:region_end+1] # other strains seqs = {}
def main():
    """Compute per-region identity statistics for one introgression state.

    Command line: ``sys.argv[1]`` is a task index used to pick the state
    (``args['states'][task_ind]``); the remaining arguments are parsed by
    ``read_args.process_predict_args``.

    For every chromosome in ``gp.chrms`` the labeled blocks of that state
    are read, the positions file is walked strain by strain, and for every
    region of a strain the match/site counts against each known state are
    computed. Outputs:

    - a gzipped region file under ``<base_dir>/regions/`` holding, per
      region, the aligned sequences and a per-site info string
    - ``blocks_<state>_<tag>_quality.txt`` with the input columns plus the
      computed counts, sorted by region id within each chromosome
    - a pickle mapping the numeric region id to the ``tell()`` position of
      the region in the region file
    """
    args = read_args.process_predict_args(sys.argv[2:])

    task_ind = int(sys.argv[1])
    species_ind = task_ind

    species_from = args['states'][species_ind]

    base_dir = gp.analysis_out_dir_absolute + args['tag']

    regions_dir = f'{base_dir}/regions/'
    if not os.path.isdir(regions_dir):
        os.mkdir(regions_dir)

    # opened lazily on the first chromosome, once the labels are known
    quality_writer = None
    positions = gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'rt')
    # despite its name this holds a positions.tell() offset, not a line count
    line_number = 0

    region_writer = gzip.open(
        f'{regions_dir}{species_from}{gp.fasta_suffix}.gz', 'wt')
    region_index = {}

    for chrm in gp.chrms:
        # region_id strain chromosome predicted_species start end num_non_gap
        regions_chrm, labels = read_table.read_table_columns(
            f'{base_dir}/blocks_{species_from}_{args["tag"]}_labeled.txt',
            '\t',
            group_by='strain',
            chromosome=chrm
        )

        # add zero-filled output columns for every strain on this chromosome
        for strain in regions_chrm:
            n = len(regions_chrm[strain]['region_id'])

            for s in args['known_states']:
                regions_chrm[strain]['match_nongap_' + s] = [0] * n
                regions_chrm[strain]['num_sites_nongap_' + s] = [0] * n
                regions_chrm[strain]['match_hmm_' + s] = [0] * n
                regions_chrm[strain]['match_nonmask_' + s] = [0] * n
                regions_chrm[strain]['num_sites_nonmask_' + s] = [0] * n

            info_string_symbols = list('.-_npbcxNPBCX')
            for s in info_string_symbols:
                regions_chrm[strain]['count_' + s] = [0] * n

        # get masked sites for all references, not just the current
        # species_from we're considering regions from
        masked_sites_refs = {}
        for s, state in enumerate(args['known_states']):
            masked_sites_refs[s] = \
                convert_intervals_to_sites(
                    read_masked_intervals(
                        f'{gp.mask_dir}{state}'
                        f'_chr{chrm}_intervals.txt'))

        # loop through chromosomes and strains, followed by species of
        # introgression so that we only have to read each alignment in once

        # move to last read chromosome
        positions.seek(line_number)
        line = positions.readline()
        while line != '':
            line = line.split('\t')

            # a line of another chromosome ends this chromosome's pass; the
            # saved offset lets the next pass re-read that line
            current_chrm = line[1]
            if current_chrm != chrm:
                break

            strain = line[0]
            if strain not in regions_chrm:
                # record current position in case need to re read line
                line_number = positions.tell()
                line = positions.readline()
                continue

            print(strain, chrm)

            # indices of alignment columns used by HMM
            ps = np.array([int(x) for x in line[2:]])

            headers, seqs = read_fasta.read_fasta(
                args['setup_args']['alignments_directory'] + \
                '_'.join(args['known_states'])
                + f'_{strain}_chr{chrm}_mafft{gp.alignment_suffix}')

            # to go from index in reference seq to index in alignment
            ind_align = []
            for seq in seqs:
                ind_align.append(index_alignment_by_reference(seq))

            masked_sites = convert_intervals_to_sites(
                read_masked_intervals(
                    f'{gp.mask_dir}{strain}_chr{chrm}_intervals.txt'))

            masked_sites_ind_align = []
            for s in range(len(args['known_states'])):
                masked_sites_ind_align.append(
                    ind_align[s][masked_sites_refs[s]])

            # add in sequence of query strain
            masked_sites_ind_align.append(
                ind_align[-1][masked_sites])

            # convert position indices from indices in master reference to
            # indices in alignment
            ps_ind_align = ind_align[0][ps]

            # loop through all regions for the specified chromosome and the
            # current strain
            for i in range(len(regions_chrm[strain]['region_id'])):
                r_id = regions_chrm[strain]['region_id'][i]
                start = regions_chrm[strain]['start'][i]
                end = regions_chrm[strain]['end'][i]

                # calculate:
                # - identity with each reference
                # - fraction of region that is gapped/masked

                # index of start and end of region in aligned sequences
                slice_start = ind_align[0][int(start)]
                slice_end = ind_align[0][int(end)]
                assert slice_start in ps_ind_align, \
                    f'{slice_start} {start} {r_id}'
                assert slice_end in ps_ind_align, \
                    f'{slice_end} {end} {r_id}'

                # the query strain is the last sequence of the alignment
                seqx = seqs[-1][slice_start:slice_end + 1]
                len_seqx = slice_end - slice_start + 1
                len_states = len(args['known_states'])

                # . = all match
                # - = gap in one or more sequences
                # p = matches predicted reference

                # per-site flags: the *_any_* entries are combined over all
                # known states, the others have one column per known state
                info = {'gap_any_flag': np.zeros((len_seqx), bool),
                        'mask_any_flag': np.zeros((len_seqx), bool),
                        'unseq_any_flag': np.zeros((len_seqx), bool),
                        'hmm_flag': np.zeros((len_seqx), bool),
                        'gap_flag': np.zeros((len_seqx, len_states), bool),
                        'mask_flag': np.zeros((len_seqx, len_states), bool),
                        'unseq_flag': np.zeros((len_seqx, len_states), bool),
                        'match_flag': np.zeros((len_seqx, len_states), bool)}

                for sj, statej in enumerate(args['known_states']):
                    seqj = seqs[sj][slice_start:slice_end+1]

                    # only alignment columns used by HMM (polymorphic, no
                    # gaps in any strain)
                    total_match_hmm, total_sites_hmm, infoj = \
                        seq_id_hmm(seqj, seqx, slice_start, ps_ind_align)

                    # NOTE(review): 'num_sites_hmm' is not initialised in
                    # this function; presumably it is a column of the
                    # labeled blocks file -- confirm
                    if statej == species_from \
                            or species_ind >= len(args['known_states']):
                        regions_chrm[strain]['num_sites_hmm'][i] = \
                            total_sites_hmm

                    # only write once, the first index
                    if sj == 0:
                        info['hmm_flag'] = infoj['hmm_flag']

                    info['gap_any_flag'] = np.logical_or(
                        info['gap_any_flag'], infoj['gap_flag'])
                    info['unseq_any_flag'] = np.logical_or(
                        info['unseq_any_flag'], infoj['unseq_flag'])
                    info['gap_flag'][:, sj] = infoj['gap_flag']
                    info['unseq_flag'][:, sj] = infoj['unseq_flag']
                    info['match_flag'][:, sj] = infoj['match']

                    regions_chrm[strain][f'match_hmm_{statej}'][i] = \
                        total_match_hmm

                    # all alignment columns, excluding ones with gaps in
                    # these two sequences
                    total_match_nongap, total_sites_nongap = \
                        seq_functions.seq_id(seqj, seqx)

                    regions_chrm[strain][f'match_nongap_{statej}'][i] =\
                        total_match_nongap
                    regions_chrm[strain][f'num_sites_nongap_{statej}'][i] =\
                        total_sites_nongap

                    # all alignment columns, excluding ones with gaps or
                    # masked bases or unsequenced in *these two sequences*
                    total_match_nonmask, total_sites_nonmask, infoj = \
                        seq_id_unmasked(seqj, seqx, slice_start,
                                        masked_sites_ind_align[sj],
                                        masked_sites_ind_align[-1])

                    info['mask_any_flag'] = np.logical_or(
                        info['mask_any_flag'], infoj['mask_flag'])
                    info['mask_flag'][:, sj] = infoj['mask_flag']

                    regions_chrm[strain][f'match_nonmask_{statej}'][i] = \
                        total_match_nonmask
                    regions_chrm[strain][f'num_sites_nonmask_{statej}'][i] = \
                        total_sites_nonmask

                # remember where this region starts in the region file; the
                # key is the region id without its first character
                region_index[int(r_id[1:])] = region_writer.tell()
                region_writer.write(f'#{r_id}\n')
                names = args['known_states'] + [strain]
                for sj in range(len(names)):
                    # write sequence to region alignment file, along with
                    # start and end coordinates
                    startj = bisect.bisect_left(ind_align[sj], slice_start)
                    endj = bisect.bisect_left(ind_align[sj], slice_end)

                    region_writer.write(f'> {names[sj]} {startj} {endj}\n')
                    region_writer.write(
                        ''.join(seqs[sj][slice_start:slice_end+1]) + '\n')

                # also write string with info about each site
                info_string = make_info_string(info, 0, species_ind)
                region_writer.write('> info\n')
                region_writer.write(info_string + '\n')

                # TODO this can be made faster with numpy
                # and keep track of each symbol count
                for sym in info_string_symbols:
                    regions_chrm[strain]['count_' + sym][i] = \
                        info_string.count(sym)

            # record current position in case need to re read line
            line_number = positions.tell()
            line = positions.readline()
            sys.stdout.flush()

        # extend the input labels with the computed columns, in output order
        labels += ['match_nongap_' + x for x in args['known_states']]
        labels += ['num_sites_nongap_' + x for x in args['known_states']]
        labels += ['match_hmm_' + x for x in args['known_states']]
        labels += ['match_nonmask_' + x for x in args['known_states']]
        labels += ['num_sites_nonmask_' + x for x in args['known_states']]
        labels += ['count_' + x for x in info_string_symbols]

        assert labels[0] == 'region_id', 'Unexpected labeled format'

        # write on first execution
        if quality_writer is None:
            quality_writer = open(f'{base_dir}/blocks_{species_from}'
                                  f'_{args["tag"]}_quality.txt', 'w')

            quality_writer.write('\t'.join(labels) + '\n')

        # reorganize output as list of tuples ordered by label
        output = []
        strains = list(regions_chrm.keys())
        for strain in strains:
            # pop to limit memory usage
            d = regions_chrm.pop(strain)
            output += list(zip(*[d[l] for l in labels]))

        # sort by region id (index 0, remove r)
        for entry in sorted(output, key=lambda e: int(e[0][1:])):
            quality_writer.write('\t'.join([str(e) for e in entry]) + '\n')

    # NOTE(review): quality_writer is still None here if gp.chrms is empty,
    # in which case close() fails -- confirm chrms is never empty
    quality_writer.close()
    region_writer.close()
    with open(f'{regions_dir}{species_from}.pkl', 'wb') as index:
        pickle.dump(region_index, index)
# get all non-reference strains of cerevisiae and paradoxus s = get_strains(flatten(gp.non_ref_dirs.values())) strain, d = s[int(sys.argv[1])] gp_dir = '../' fn_start = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + '_' + strain + '_chr') for chrm in gp.chrms: print(chrm) sys.stdout.flush() if not os.path.isfile(fn_start + chrm + '_mafft.maf'): continue headers, seqs = read_fasta.read_fasta(fn_start + chrm + '_mafft.maf') a = dict(zip(headers, seqs)) f_out = open(fn_start + chrm + '_mafft.stats', 'w') # number of sites where n,...,3,2,1 genomes aligned num_strains_by_site = num_strains_aligned_by_site(seqs) f_out.write('# histogram of number of strains ' 'aligned across all alignment columns\n') for n in range(len(num_strains_by_site)): f_out.write(str(n) + ',' + str(num_strains_by_site[n]) + '\n') f_out.write('\n') # fraction of genomes aligned (should all be 1) fracs_aligned, seq_lengths = fraction_strains_aligned(headers, seqs) for frac in fracs_aligned:
def maf_id(fn, ref1='S288c', ref2='CBS432'):
    """Compare the third sequence of an alignment with the first two.

    ``ref1`` and ``ref2`` are accepted but not used; the sequences are
    picked purely by position in the file.

    Returns:
        ``(id1, id2, den1, den2)``, where ``(id1, den1)`` is the result of
        ``seq_id`` against the first sequence and ``(id2, den2)`` against
        the second.
    """
    _, seqs = read_fasta.read_fasta(fn)
    first_result = seq_id(seqs[2], seqs[0])
    id1, den1 = first_result
    second_result = seq_id(seqs[2], seqs[1])
    id2, den2 = second_result
    return id1, id2, den1, den2
def get_range_seq(start, end, seq_fn):
    """Return positions ``start`` to ``end`` (inclusive) of a FASTA sequence.

    Only the first sequence in ``seq_fn`` is used.
    """
    _headers_and_seqs = read_fasta.read_fasta(seq_fn)
    first_seq = _headers_and_seqs[1][0]
    return first_seq[start:end + 1]