def test_haplotype_3070():
    """Round-trip haplotype 3070: sites -> sequence -> sites must match."""
    expected = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T')
    observed = seq2sites(sites2seq(expected, region=range(16000, 16570)))
    # Show both sides so a failing run exposes the difference.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_4827():
    """Round-trip haplotype 4827: sites -> sequence -> sites must match."""
    expected = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T')
    observed = seq2sites(sites2seq(expected, region=range(16000, 16570)))
    # Show both sides so a failing run exposes the difference.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_2236():
    """Round-trip haplotype 2236: sites -> sequence -> sites must match."""
    expected = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C')
    observed = seq2sites(sites2seq(expected, region=range(16000, 16570)))
    # Show both sides so a failing run exposes the difference.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_2911():
    """Round-trip haplotype 2911: sites -> sequence -> sites must match."""
    expected = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C')
    observed = seq2sites(sites2seq(expected, region=range(16000, 16570)))
    # Show both sides so a failing run exposes the difference.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
# IDs that are left out of the processed output.
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']

hids = []
seqs = []
for entry in data:
    # The ID is the first whitespace-delimited token of the entry name.
    first_token = entry['name'].split()[0]
    if first_token in drop:
        continue
    hids.append(first_token)
    seqs.append(entry['sequence'])

## Validate
# Each sequence must be reproduced by converting it to sites and back.
passed_validation = True
for i, (hid, seq) in enumerate(zip(hids, seqs)):
    rebuilt = translate(sites2seq(seq2sites(seq), region), None, '-')
    if seq != rebuilt:
        passed_validation = False
        print('%s %s' % (i, hid))

if passed_validation:
    with open('processed.csv', 'w') as f:
        for origid, seq in zip(hids, seqs):
            mysites = ' '.join(str(x) for x in seq2sites(seq))
            # New ID = prefix looked up by the first four characters of the
            # original ID + the zero-padded number that follows them.
            prefix = metadata.ix[origid[:4], 'NewPrefix']
            num = origid[4:].split('_')[0].zfill(3)
            f.write('%s,%s,%s\n' % (prefix + num, origid, mysites))
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
seq_range = metadata.ix[0, 'SeqRange']
region = range2region(seq_range)

# Read every FASTA entry up front, then release the file.
ff = fasta('krings1999.fasta', 'r')
data = ff.readentries()
ff.close()

hids = [entry['name'] for entry in data]
sites = [seq2sites(entry['sequence']) for entry in data]

# Some of the sequences are short. Many are only missing a base or two
# from the beginning or the end - those are kept.
# A few are missing large chunks of the end - those are dropped.
ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178]
skip = [8, 123, 124]

# validate
passed_validation = True
for i in range(len(sites)):
    # Entries listed in ``ok`` or ``skip`` are exempt from the check.
    if i not in ok and i not in skip:
        seq1 = data[i]['sequence'].upper()
if e['sequence'].count('n') > 10: print 'skipping isolate %s' % name[3] else: hids.append(name[3]) seqs.append(e['sequence']) if name[3] == '63': pops.append('Yoruban') else: for f in e['features']: if f[0] == 'source': for f2 in f[1]: if isinstance(f2, tuple): if f2[0] == 'note': pops.append(f2[1]) if 'complete' in e['definition'] or name[3] in ['3', '65']: sites.append(seq2sites(e['sequence'])) else: found = False for f in e['features']: if f[0] == 'misc_feature' and f[1][1][1] == 'segment 1': c = int(f[1][0].split('..')[1]) - 1 s = e['sequence'] sites.append(seq2sites(s[:c]) + seq2sites(s[c:])) found = True if not found: print 'problem with isolate %s' % name[3] # Vigilant GenBank data have variable sequence lengths # normalize all sites to specified range in the metadata file mysites = []
hids1 = [] groups1 = [] seqs1 = [] sites1 = [] for e in entries1: hid = getisolate(e) if hid is not None: hids1.append(hid) n = getnote(e) if n is not None: parts = n.split() groups1.append(parts[-1]) seqs1.append(e['sequence']) sites1.append(seq2sites(e['sequence'])) entries2 = read_genbank('hvr2.gb', what='filename') hids2 = [] groups2 = [] seqs2 = [] sites2 = [] for e in entries2: hid = getisolate(e) if hid is not None: hids2.append(hid) n = getnote(e) if n is not None: parts = n.split()
# Index each entry's sequence by the first whitespace-delimited token of
# its name, separately for the two input sets.
for e in data1:
    k = e['name'].split()[0]
    hvr1[k] = e['sequence']
for e in data2:
    k = e['name'].split()[0]
    hvr2[k] = e['sequence']

# retaining only those with both hvr1 and hvr2
k1 = set(hvr1.keys())
k2 = set(hvr2.keys())
hids = list(k1 & k2)

# Sites for one individual are the HVR1 sites followed by the HVR2 sites.
sites = {}
for k in hids:
    sites[k] = seq2sites(hvr1[k]) + seq2sites(hvr2[k])

## Validate
# Rebuild both regions from the combined sites; BOTH must reproduce the
# source sequences.
passed_validation = True
for i in range(len(sites)):
    hid = hids[i]
    seq1 = translate(sites2seq(sites[hid], region1), None, '-')
    seq2 = translate(sites2seq(sites[hid], region2), None, '-')
    # Parentheses are required: ``not a and b`` parses as ``(not a) and b``,
    # which would miss an HVR2 mismatch and any case where both differ.
    if not (seq1 == hvr1[hid] and seq2 == hvr2[hid]):
        passed_validation = False
        print('%s %s' % (i, hids[i]))

if passed_validation:
    counter = {}
def process_seq2sites(form):
    """Process data submitted in seq2sites form.

    Takes the submission from the form's ``query`` field, or from the
    uploaded ``file`` when one was supplied, treats it as FASTA when it
    starts with ``>`` and as a single sequence otherwise, and converts each
    sequence to sites with ``seq2sites``.

    A submission that is empty once surrounding whitespace is removed is
    answered with a redirect to the URL named ``seq2sites``.

    NOTE(review): the part of the function visible here ends after the
    template context ``c`` is built; what is returned afterwards is not
    shown.
    """
    # submission validation and error reporting
    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    upload = form.cleaned_data['file']
    if upload is not None:
        if upload.multiple_chunks():
            # TODO: large uploads are not rejected yet; the original code
            # only left a placeholder here ("error - return with error").
            pass
        content = upload.read()

    # clear off surrounding whitespace; strip() returns a new string, so the
    # result must be assigned back for the emptiness and '>' checks below
    content = content.strip()

    # make sure something was submitted
    if not content:
        return HttpResponseRedirect(reverse('seq2sites'))

    # determine format
    if content.startswith('>'):
        format = 'fasta'
    else:
        format = 'single_seq'

    # pull names and sequence out of submitted content
    names = []
    seqs = []
    if format == 'fasta':
        try:
            fnames = []
            fseqs = []
            for entry in fasta(content, 's'):
                fnames.append(entry['name'])
                fseqs.append(RE_NON_IUPAC.sub('', entry['sequence'].upper()))
            # publish the parsed entries only after the whole input parsed
            names = fnames
            seqs = fseqs
        except Exception:
            valid = False
            problems.append('There was an error in the FASTA format')
    else:
        names = ['']
        seqs = [RE_NON_IUPAC.sub('', content.upper())]

    # enforce limits for multisequence submissions
    if format == 'fasta' and len(seqs) > MAX_SEQS:
        valid = False
        problems.append('Too many sequences submitted; current maximum allowed is %d' % MAX_SEQS)

    if valid:
        result_lines = []
        sites_by_line = []
        for seq in seqs:
            # A failure on one sequence becomes that sequence's result line
            # instead of aborting the whole submission.
            try:
                sites = seq2sites(seq)
                sites_by_line.append(sites)
                result_lines.append(sites2str(sites))
            except Exception as e:
                result_lines.append('There was an error: %s' % e)
        results = [Result(x, y) for x, y in zip(names, result_lines)]
        c = Context({'results': results})
s = "%s%s" % m.groups() elif ".1" in s: s = s + "C" elif ".2" in s: h2f.append(s[:-1] + "1C") s = s + "C" h2f.append(s) hvr2[i] = " ".join(h2f) ## Validate passed_validation = True for i in range(len(freq)): curr_sites = str2sites(hvr1[i], add16k=True) seq = translate(sites2seq(curr_sites, region1), None, "-") mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region1), None, "-") if not seq == myseq: passed_validation = False print i, "hvr1" curr_sites = str2sites(hvr2[i]) seq = translate(sites2seq(curr_sites, region2), None, "-") mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region2), None, "-") if not seq == myseq: passed_validation = False print i, "hvr2"
ff = fasta('knight2003.fasta', 'r') data = ff.readentries() ff.close() hids = [] seqs = [] sites = [] for e in data: hids.append(e['name'].split()[0]) seqs.append(e['sequence']) m = re.search(r'GATCACA', e['sequence']) cut = m.start() seq1 = e['sequence'][:cut] seq2 = e['sequence'][cut:] sites.append(seq2sites(seq1) + seq2sites(seq2)) ## Validate passed_validation = True for i in range(len(sites)): seq = translate(sites2seq(sites[i], region1), None, '-') + translate(sites2seq(sites[i], region2), None, '-') if not seq == seqs[i]: passed_validation = False print i, hids[i] if passed_validation: counter = {} for k in metadata.index: counter[k] = 0 with open('processed.csv', 'w') as f:
sites.append(" ".join(parts[3:])) ## Validate passed_validation = True # there are sites in the source table that are not actual variant sites # sequence 9 (index 8) has 263A as a variant # sequence 12 (index 11) has 16223C as a variant not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")] for i in range(len(sites)): curr_sites = sites[i] curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys] cseq1 = sites2seq(curr_sites, region1) cseq2 = sites2seq(curr_sites, region2) mysites1 = seq2sites(cseq1) mysites2 = seq2sites(cseq2) mysites = mysites1 + mysites2 if not mysites == curr_polys: passed_validation = False print iids[i] if passed_validation: counters = [1] * 2 with open("processed.csv", "w") as f: for i in range(len(sites)): curr_sites = str2sites(sites[i]) mysites = [x for x in curr_sites if x not in not_polys] mysites = " ".join([str(x) for x in mysites]) for j in range(2): if counts[i, j] > 0:
def new_query(self, seq, label='Query'):
    """Build a ``MotifQuery`` for a sequence.

    Args:
        seq: sequence passed to ``seq2sites``; the resulting sites become
            the query's ``defining_polymorphisms``.
        label: label given to the query (default ``'Query'``).

    Returns:
        The constructed ``MotifQuery``.
    """
    # The two empty dicts the original assigned here were never read.
    return MotifQuery(defining_polymorphisms=seq2sites(seq), label=label)
# Read every FASTA entry up front, then release the file.
ff = fasta('non_2011.fasta', 'r')
data = ff.readentries()
ff.close()

hids = []
seqs = []
sites = []
# four sequences are shorter than all the rest, will drop them
for entry in data:
    raw = entry['sequence']
    if len(raw) <= 350:
        continue
    hids.append(entry['name'].split()[0])
    seqs.append(raw)
    sites.append(seq2sites(raw))

## Validate
passed_validation = True
for i, hid in enumerate(hids):
    # The sequence range is looked up by the first two characters of the ID.
    key = hid[:2]
    region = range2region(metadata.ix[key, 'SeqRange'])
    seq = translate(sites2seq(sites[i], region), None, '-')
    if seq != seqs[i]:
        passed_validation = False
        print('%s %s' % (i, hid))

if passed_validation:
    counter = {}
sites = []
with open('defilippo2010.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # skip past header
    for row in reader:
        # column 0: ID, column 1: group, column 3: sites string
        hids.append(row[0])
        groups.append(row[1])
        sites.append(str2sites(row[3]))

## Validate
passed_validation = True
for i, expected_sites in enumerate(sites):
    seq = sites2seq(expected_sites, region)
    mysites = seq2sites(seq)
    if not mysites == expected_sites:
        # Different site lists are tolerated as long as they rebuild the
        # same sequence.
        # NOTE(review): ``seq`` is compared as returned by sites2seq while
        # ``myseq`` has '-' removed - confirm this asymmetry is intended.
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print('%s %s' % (i, hids[i]))

# One zeroed counter per metadata row.
counter = {}
for k in metadata.index:
    counter[k] = 0

if passed_validation:
    with open('processed.csv', 'w') as f:
        for i in range(len(groups)):
            key = groups[i]
            counter[key] += 1
for filename in ["barbieri2013a.fasta", "barbieri2013b.fasta", "barbieri2013c.fasta", "barbieri2014.fasta"]: ff = fasta(filename, "r") data = ff.readentries() ff.close() for e in data: if e["sequence"].count("N") < 20: hid = e["name"].split()[4].upper() if hid in groups.index: hids.append(hid) seqs.append(e["sequence"]) sites = [] for s in seqs: sites.append(seq2sites(s, ambig_cutoff=20)) ## Validate passed_validation = True for i in range(len(sites)): hid = hids[i] seq = translate(sites2seq(sites[i], region), None, "-") if not seq == seqs[i]: # some sequences have N in position 308 and this doesn't parse properly in my converter # not going to worry about it because will always drop the variants in this region if not seq[:305] == seqs[i][:305] and seq[-16260:] == seqs[i][-16260:]: passed_validation = False print i, hids[i] if passed_validation: