def run_msa(gene_dict, rs_data, problems):
    """Compare RefSeq sequences to the main UniProt sequence for each gene.

    For every gene symbol, the main UniProt sequence is looked up via the
    HGNC and UniProt clients. Each RefSeq sequence listed for the gene is
    compared against it. If at least one RefSeq sequence differs, a FASTA
    file containing the UniProt sequence and all found RefSeq sequences is
    written to ``aln/in/<gene>.fasta`` and Clustal Omega is run to produce
    ``aln/out/<gene>.fasta``.

    Parameters
    ----------
    gene_dict : dict
        Maps each gene symbol to an iterable of RefSeq IDs.
    rs_data : dict
        Maps a RefSeq ID to a two-element ``(fasta_header, sequence)`` entry.
    problems : set
        Collection (anything with ``add``) that receives a
        ``(rs_id, 'no sequence in Refseq')`` tuple for every RefSeq ID that
        has no entry in `rs_data`. It is modified in place.

    Returns
    -------
    dict
        Maps each RefSeq ID that had a sequence to a tuple
        ``(gene_sym, matches_uniprot, alignment_file)``, where
        `alignment_file` is None if the sequence is identical to the
        UniProt sequence and the path of the alignment output otherwise.
    """
    aln_data = {}
    num_genes = len(gene_dict)
    for counter, (gene_sym, rs_ids) in enumerate(gene_dict.items(), 1):
        print("%s: %d of %d genes" % (gene_sym, counter, num_genes))
        # Get the main Uniprot sequence from the gene symbol
        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        up_sequence = uniprot_client.get_sequence(up_id_main)
        fasta_lines = ['>%s\n' % gene_sym, '%s\n' % up_sequence]
        # The filenames to use if we do an alignment
        in_file = 'aln/in/%s.fasta' % gene_sym
        out_file = 'aln/out/%s.fasta' % gene_sym
        # Now, iterate over the Refseq IDs and get the sequences
        seq_ids = []
        # Stays True only while every sequence found so far is identical to
        # the Uniprot sequence; decides whether an alignment is needed.
        all_match = True
        for rs_id in rs_ids:
            seq_info = rs_data.get(rs_id)
            if not seq_info:
                problems.add((rs_id, 'no sequence in Refseq'))
                continue
            seq_ids.append(rs_id)
            _, sequence = seq_info
            fasta_lines.append('>%s\n%s\n' % (rs_id, sequence))
            if sequence == up_sequence:
                aln_data[rs_id] = (gene_sym, True, None)
            else:
                all_match = False
                aln_data[rs_id] = (gene_sym, False, out_file)
        # Nothing to compare for this gene
        if not seq_ids:
            continue
        # Matching entries never reference out_file, so the alignment can be
        # skipped whenever every sequence matches, not only when there is
        # exactly one of them.
        if all_match:
            print("\tAll sequences match, no alignment needed.")
            continue
        # Write the fasta file
        with open(in_file, 'wt') as f:
            f.writelines(fasta_lines)
        # Run the sequence alignment
        print("\tRunning sequence alignment.")
        ret_code = subprocess.call(['./clustal-omega-1.2.3-macosx',
                                    '-i', in_file, '-o', out_file,
                                    '--force'])
        if ret_code != 0:
            # Report the failure rather than silently ignoring it; keep
            # going so that the remaining genes are still processed.
            print("\tWarning: alignment for %s exited with status %d"
                  % (gene_sym, ret_code))
    return aln_data
def test_get_sequence():
    """Check the sequence fetched for ID 'P00533' for length and type."""
    sequence = uniprot_client.get_sequence('P00533')
    # The fetched sequence must be longer than 1000 characters
    assert 1000 < len(sequence)
    # The result must also pass the unicode_strs helper check
    assert unicode_strs(sequence)
for site_tuple in sites: s = site_tuple[1] split_id = s.split('-') if len(split_id) == 1 or split_id[1] == '1': continue else: iso_sites.append(site_tuple) iso_only = [] pos_valid_in_ref = [] mappable_in_ref = [] for site in iso_sites: site_id, up_id, res, pos, motif = site ref_iso = up_id.split('-')[0] ref_seq = uniprot_client.get_sequence(ref_iso) assert len(motif) == 13 ref_motif_start = ref_seq.find(motif) if ref_motif_start == -1: iso_only.append(site) else: ref_pos = str(ref_motif_start + 6) if ref_pos == pos: pos_valid_in_ref.append(site) else: mappable_in_ref.append(site) """ with open('ms_data/tubulin_sorted_site_list.txt', 'rt') as f: for line in f.readlines(): gene_name, respos = line.strip().split('_') res = respos[0]
def test_get_sequence():
    """Fetch the sequence for 'P00533' and check it exceeds 1000 characters."""
    min_length = 1000
    sequence = uniprot_client.get_sequence('P00533')
    assert len(sequence) > min_length