예제 #1
0
def run_msa(gene_dict, rs_data, problems):
    """Compare RefSeq sequences to Uniprot and align genes that differ.

    For every gene symbol the Uniprot sequence (looked up through the HGNC
    ID) is compared against each RefSeq sequence listed for that gene. When
    at least one RefSeq sequence differs from the Uniprot sequence, all
    sequences for the gene are written to ``aln/in/<gene>.fasta`` and the
    Clustal Omega binary is run to produce ``aln/out/<gene>.fasta``. When
    every RefSeq sequence is identical to the Uniprot sequence no files are
    written and no alignment is run.

    Parameters
    ----------
    gene_dict : dict
        Maps a gene symbol to an iterable of RefSeq IDs.
    rs_data : dict
        Maps a RefSeq ID to a ``(fasta_header, sequence)`` pair.
    problems : set
        Updated in place: ``(rs_id, 'no sequence in Refseq')`` is added for
        every RefSeq ID that has no (non-empty) entry in `rs_data`.

    Returns
    -------
    dict
        Maps every RefSeq ID that had a sequence to a tuple
        ``(gene_sym, matches_uniprot, alignment_file)``. `alignment_file`
        is None when the sequence is identical to the Uniprot sequence and
        the path of the alignment output file otherwise.
    """
    aln_data = {}
    num_genes = len(gene_dict)
    for counter, (gene_sym, rs_ids) in enumerate(gene_dict.items(), 1):
        print("%s: %d of %d genes" % (gene_sym, counter, num_genes))
        # Get the main Uniprot sequence from the gene symbol
        # NOTE(review): assumes all three lookups succeed -- confirm what the
        # clients return for an unknown gene symbol.
        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        up_sequence = uniprot_client.get_sequence(up_id_main)
        fasta_lines = ['>%s\n' % gene_sym, '%s\n' % up_sequence]

        # The filenames to use if we do an alignment
        in_file = 'aln/in/%s.fasta' % gene_sym
        out_file = 'aln/out/%s.fasta' % gene_sym

        # Iterate over the Refseq IDs, tracking how many sequences were found
        # and whether every one of them equals the Uniprot sequence.
        num_found = 0
        all_match = True
        for rs_id in rs_ids:
            seq_info = rs_data.get(rs_id)
            if not seq_info:
                problems.add((rs_id, 'no sequence in Refseq'))
                continue
            num_found += 1
            _, sequence = seq_info
            fasta_lines.append('>%s\n%s\n' % (rs_id, sequence))
            is_match = sequence == up_sequence
            all_match = all_match and is_match
            aln_data[rs_id] = (gene_sym, is_match,
                               None if is_match else out_file)

        if not num_found:
            continue

        # An alignment is only useful if at least one sequence differs; this
        # is decided from the explicit flag rather than from whatever value
        # the loop variable happened to hold last.
        if all_match:
            print("\tAll sequences match, no alignment needed.")
            continue

        # Write the fasta file
        with open(in_file, 'wt') as f:
            f.writelines(fasta_lines)
        # Run the sequence alignment
        print("\tRunning sequence alignment.")
        ret_code = subprocess.call(['./clustal-omega-1.2.3-macosx',
                                    '-i', in_file, '-o', out_file,
                                    '--force'])
        if ret_code != 0:
            # Best effort: report the failure but keep processing genes.
            print("\tSequence alignment exited with status %d." % ret_code)
    return aln_data
예제 #2
0
def test_get_sequence():
    """Check the sequence returned by the Uniprot client for P00533."""
    sequence = uniprot_client.get_sequence('P00533')
    # The sequence must be longer than 1000 characters.
    min_length = 1000
    assert len(sequence) > min_length
    # unicode_strs must give a truthy result for the returned value.
    assert unicode_strs(sequence)
예제 #3
0
# Collect the sites whose ID (second tuple element) carries a '-' suffix
# other than '1'; IDs without a suffix or ending in '-1' are skipped.
for site_tuple in sites:
    s = site_tuple[1]
    split_id = s.split('-')
    if len(split_id) != 1 and split_id[1] != '1':
        iso_sites.append(site_tuple)

# Sites whose motif is not found in the reference sequence at all
iso_only = []
# Sites whose motif is found and whose position agrees with `pos`
pos_valid_in_ref = []
# Sites whose motif is found, but at a different position than `pos`
mappable_in_ref = []

for site in iso_sites:
    site_id, up_id, res, pos, motif = site
    # The part of the ID before the first '-' is used for the lookup
    ref_iso = up_id.split('-')[0]
    ref_seq = uniprot_client.get_sequence(ref_iso)
    assert len(motif) == 13
    ref_motif_start = ref_seq.find(motif)
    if ref_motif_start == -1:
        iso_only.append(site)
        continue
    # NOTE(review): str.find returns a 0-based index, so start + 6 is the
    # 0-based index of the middle residue of the 13-mer. Confirm that `pos`
    # uses the same convention (a 1-based position would need + 7).
    ref_pos = str(ref_motif_start + 6)
    (pos_valid_in_ref if ref_pos == pos else mappable_in_ref).append(site)
"""
with open('ms_data/tubulin_sorted_site_list.txt', 'rt') as f:
    for line in f.readlines():
        gene_name, respos = line.strip().split('_')
        res = respos[0]
예제 #4
0
def test_get_sequence():
    """Fetch the P00533 sequence and verify its length and string type."""
    fetched = uniprot_client.get_sequence('P00533')
    seq_length = len(fetched)
    # Expect more than 1000 characters in the returned sequence.
    assert seq_length > 1000
    # The unicode_strs helper must accept the returned value.
    assert unicode_strs(fetched)
예제 #5
0
def test_get_sequence():
    """Check that the sequence fetched for P00533 has over 1000 characters."""
    sequence = uniprot_client.get_sequence('P00533')
    seq_length = len(sequence)
    assert seq_length > 1000