def test_haplotype_3070():
    """Round trip: sites -> sequence -> sites must give back the input sites."""
    expected = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)
    # Show both site lists so a failing run can be diagnosed from the output.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_4827():
    """Round trip: sites -> sequence -> sites must give back the input sites."""
    expected = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)
    # Show both site lists so a failing run can be diagnosed from the output.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_2236():
    """Round trip: sites -> sequence -> sites must give back the input sites."""
    expected = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)
    # Show both site lists so a failing run can be diagnosed from the output.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_2911():
    """Round trip: sites -> sequence -> sites must give back the input sites."""
    expected = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)
    # Show both site lists so a failing run can be diagnosed from the output.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
Пример #5
0
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']

hids = []
seqs = []

for e in data:
	name_parts = e['name'].split()
	if name_parts[0] not in drop:
		hids.append(name_parts[0])
		seqs.append(e['sequence'])

## Validate
passed_validation = True

for i in range(len(seqs)):
	mysites = seq2sites(seqs[i])
	myseq = translate(sites2seq(mysites, region), None, '-')
	if not seqs[i] == myseq:
		passed_validation = False
		print i, hids[i]

if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(seqs)):
			mysites = ' '.join([str(x) for x in seq2sites(seqs[i])])
			origid = hids[i]
			prefix = metadata.ix[origid[:4],'NewPrefix']
			num = origid[4:].split('_')[0].zfill(3)
			newid = prefix + num
			f.write('%s,%s,%s\n' % (newid, origid, mysites))
Пример #6
0
from utils import *

## load metadata
# metadata.csv is read with its first column as the index; the region is
# built from the 'SeqRange' value selected with .ix[0, ...].
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0, 'SeqRange'])

# Read all entries from the FASTA file, then close the handle.
ff = fasta('krings1999.fasta', 'r')
data = ff.readentries()
ff.close()

hids = []
sites = []

# Keep the full entry name as the id and convert each sequence to sites.
for entry in data:
	hids.append(entry['name'])
	sites.append(seq2sites(entry['sequence']))

# some of the sequences are short. Many are just missing a base or two
# from the beginning or the end - will keep those
# a few are missing large chunks of the end, so will drop those
# (both lists hold positions in `data`)
ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178]
skip = [8, 123, 124]

# validate
passed_validation = True

# Entries listed in `ok` or `skip` are not checked here; every other entry
# starts from its upper-cased source sequence.
for i in range(len(sites)):
	if i in ok or i in skip:
		pass
	else:
		seq1 = data[i]['sequence'].upper()
Пример #7
0
	# Entries whose sequence contains more than 10 'n' characters are skipped.
	if e['sequence'].count('n') > 10:
		print 'skipping isolate %s' % name[3]
	else:
		hids.append(name[3])
		seqs.append(e['sequence'])
		# Isolate '63' is labelled 'Yoruban' directly; every other isolate
		# takes its population from the 'note' entries of its 'source' feature.
		# NOTE(review): nothing is appended to pops when no such note exists,
		# and one value is appended per matching note, so pops can fall out of
		# step with hids/seqs - confirm each record has exactly one note.
		if name[3] == '63':
			pops.append('Yoruban')
		else:
			for f in e['features']:
				if f[0] == 'source':
					for f2 in f[1]:
						if isinstance(f2, tuple):
							if f2[0] == 'note':
								pops.append(f2[1])
		# Records whose definition contains 'complete', plus isolates '3' and
		# '65', are converted to sites in one piece.
		if 'complete' in e['definition'] or name[3] in ['3', '65']:
			sites.append(seq2sites(e['sequence']))
		else:
			# Otherwise look for the misc_feature tagged 'segment 1', cut the
			# sequence at (number after '..' in f[1][0]) - 1, and convert the
			# two pieces separately before concatenating their site lists.
			# NOTE(review): hids/seqs were already appended above; when no
			# 'segment 1' feature is found, sites gets no entry for this
			# isolate and the lists no longer line up - confirm this is handled.
			found = False
			for f in e['features']:
				if f[0] == 'misc_feature' and f[1][1][1] == 'segment 1':
					c = int(f[1][0].split('..')[1]) - 1
					s = e['sequence']
					sites.append(seq2sites(s[:c]) + seq2sites(s[c:]))
					found = True
			if not found:
				print 'problem with isolate %s' % name[3]

# Vigilant GenBank data have variable sequence lengths
# normalize all sites to specified range in the metadata file
mysites = []
Пример #8
0
# Parallel lists for the first set of entries (entries1).
hids1 = []
groups1 = []
seqs1 = []
sites1 = []

# NOTE(review): hids1 and groups1 only grow when getisolate()/getnote()
# return a value, while seqs1 and sites1 grow for every entry - if either
# helper can return None the lists stop lining up by position; confirm.
for e in entries1:
	hid = getisolate(e)
	if hid is not None:
		hids1.append(hid)
	n = getnote(e)
	if n is not None:
		# the group is the last whitespace-separated token of the note
		parts = n.split()
		groups1.append(parts[-1])
	seqs1.append(e['sequence'])
	sites1.append(seq2sites(e['sequence']))

entries2 = read_genbank('hvr2.gb', what='filename')

# Same bookkeeping for the entries read from hvr2.gb.
hids2 = []
groups2 = []
seqs2 = []
sites2 = []

for e in entries2:
	hid = getisolate(e)
	if hid is not None:
		hids2.append(hid)
	n = getnote(e)
	if n is not None:
		parts = n.split()
Пример #9
0
for e in data1:
	k = e['name'].split()[0]
	hvr1[k] = e['sequence']

for e in data2:
	k = e['name'].split()[0]
	hvr2[k] = e['sequence']

# retaining only those with both hvr1 and hvr2
k1 = set(hvr1.keys())
k2 = set(hvr2.keys())
hids = list(k1 & k2)

sites = {}
for k in hids:
	sites[k] = seq2sites(hvr1[k])
	sites[k] = sites[k] + seq2sites(hvr2[k])

## Validate
passed_validation = True

for i in range(len(sites)):
	hid = hids[i]
	seq1 = translate(sites2seq(sites[hid], region1), None, '-')
	seq2 = translate(sites2seq(sites[hid], region2), None, '-')
	if not seq1 == hvr1[hid] and seq2 == hvr2[hid]:
		passed_validation = False
		print i, hids[i]

if passed_validation:
	counter = {}
Пример #10
0
def process_seq2sites(form):
    """Process data submitted in seq2sites form.

    The submission is read from the uploaded file when one is present,
    otherwise from the ``query`` field.  Content that starts with ``>`` is
    parsed as FASTA; anything else is treated as a single sequence.  Each
    sequence is upper-cased and stripped of everything ``RE_NON_IUPAC``
    matches before being handed to ``seq2sites``.

    An empty or whitespace-only submission redirects back to the
    ``seq2sites`` view.
    """

    # submission validation and error reporting
    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    upload = form.cleaned_data['file']
    if upload is not None:
        if upload.multiple_chunks():
            # TODO(review): the original comment says this case should
            # return with an error, but it is not handled yet and the file
            # is read anyway.
            pass
        content = upload.read()

    # clear off any surrounding whitespace; strip() returns a new string,
    # so the result has to be assigned back for the emptiness and format
    # checks below to see it
    content = content.strip()

    # make sure something was submitted
    if len(content) == 0:
        valid = False
        return HttpResponseRedirect(reverse('seq2sites'))

    # determine format
    format = None
    if content.startswith('>'):
        format = 'fasta'
    else:
        format = 'single_seq'

    # pull names and sequence out of submitted content
    names = []
    seqs = []
    if format == 'fasta':
        try:
            # collect into temporaries so a parse failure part-way through
            # does not leave half of the entries in names/seqs
            fnames = []
            fseqs = []
            for entry in fasta(content, 's'):
                fnames.append(entry['name'])
                fseqs.append(RE_NON_IUPAC.sub('', entry['sequence'].upper()))
            names = fnames
            seqs = fseqs
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not reported as a FASTA format problem
            valid = False
            problems.append('There was an error in the FASTA format')
    else:
        names = ['']
        seqs = [RE_NON_IUPAC.sub('', content.upper())]

    # enforce limits for multisequence submissions
    if format == 'fasta':
        if len(seqs) > MAX_SEQS:
            valid = False
            problems.append('Too many sequences submitted; current maximum allowed is %d' % MAX_SEQS)

    if valid:
        result_lines = []
        sites_by_line = []
        for seq in seqs:
            # a failing sequence is reported on its own result line instead
            # of aborting the whole submission
            try:
                sites = seq2sites(seq)
                sites_by_line.append(sites)
                result_lines.append(sites2str(sites))
            except Exception as e:
                result_lines.append('There was an error: %s' % e)

        results = [Result(x, y) for x, y in zip(names, result_lines)]

        c = Context({'results': results})
Пример #11
0
            s = "%s%s" % m.groups()
        elif ".1" in s:
            # a ".1" site gets "C" appended
            s = s + "C"
        elif ".2" in s:
            # a ".2" site first adds the corresponding ".1C" site
            # (last character replaced by "1C"), then gets "C" appended
            h2f.append(s[:-1] + "1C")
            s = s + "C"
        h2f.append(s)
    # store the rewritten sites back as one space-separated string
    hvr2[i] = " ".join(h2f)

## Validate
# For each row and each region: sites -> sequence -> sites.  Differing site
# lists are tolerated as long as the recovered sites rebuild exactly the
# same sequence; only a sequence mismatch fails validation.
passed_validation = True

for i in range(len(freq)):
    # hvr1 strings are parsed with add16k=True, hvr2 strings are not
    curr_sites = str2sites(hvr1[i], add16k=True)
    seq = translate(sites2seq(curr_sites, region1), None, "-")
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        myseq = translate(sites2seq(mysites, region1), None, "-")
        if not seq == myseq:
            passed_validation = False
            print i, "hvr1"
    curr_sites = str2sites(hvr2[i])
    seq = translate(sites2seq(curr_sites, region2), None, "-")
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        myseq = translate(sites2seq(mysites, region2), None, "-")
        if not seq == myseq:
            passed_validation = False
            print i, "hvr2"

Пример #12
0
ff = fasta('knight2003.fasta', 'r')
data = ff.readentries()
ff.close()

hids = []
seqs = []
sites = []

for e in data:
	hids.append(e['name'].split()[0])
	seqs.append(e['sequence'])
	m = re.search(r'GATCACA', e['sequence'])
	cut = m.start()
	seq1 = e['sequence'][:cut]
	seq2 = e['sequence'][cut:]
	sites.append(seq2sites(seq1) + seq2sites(seq2))

## Validate
passed_validation = True

for i in range(len(sites)):
	seq = translate(sites2seq(sites[i], region1), None, '-') + translate(sites2seq(sites[i], region2), None, '-')
	if not seq == seqs[i]:
		passed_validation = False
		print i, hids[i]

if passed_validation:
	counter = {}
	for k in metadata.index:
		counter[k] = 0
	with open('processed.csv', 'w') as f:
Пример #13
0
    sites.append(" ".join(parts[3:]))

## Validate
passed_validation = True

# there are sites in the source table that are not actual variant sites
# sequence 9 (index 8) has 263A as a variant
# sequence 12 (index 11) has 16223C as a variant
not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")]

# Round trip per row: build the region1 and region2 sequences, convert both
# back to sites, and compare with the listed sites minus the not_polys.
for i in range(len(sites)):
    curr_sites = sites[i]
    curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys]
    # NOTE(review): curr_sites is still the space-joined string here
    # (str2sites is only applied for curr_polys above) - confirm sites2seq
    # accepts a string.
    cseq1 = sites2seq(curr_sites, region1)
    cseq2 = sites2seq(curr_sites, region2)
    mysites1 = seq2sites(cseq1)
    mysites2 = seq2sites(cseq2)
    mysites = mysites1 + mysites2
    if not mysites == curr_polys:
        passed_validation = False
        print iids[i]

if passed_validation:
    # two counters, both starting at 1
    counters = [1] * 2
    with open("processed.csv", "w") as f:
        for i in range(len(sites)):
            # parse the row's sites, drop the not_polys, and join the rest
            # back into a space-separated string
            curr_sites = str2sites(sites[i])
            mysites = [x for x in curr_sites if x not in not_polys]
            mysites = " ".join([str(x) for x in mysites])
            for j in range(2):
                if counts[i, j] > 0:
Пример #14
0
    def new_query(self, seq, label='Query'):
        """Build a MotifQuery for *seq*.

        The sequence is converted with ``seq2sites`` and the result is
        passed as the query's ``defining_polymorphisms``; *label* is
        forwarded unchanged.
        """
        # The two empty dicts that used to be created here were never read.
        return MotifQuery(defining_polymorphisms=seq2sites(seq), label=label)
Пример #15
0
ff = fasta('non_2011.fasta', 'r')
data = ff.readentries()
ff.close()

hids = []
seqs = []
sites = []

# four sequences are shorter than all the rest, will drop them

for e in data:
	if len(e['sequence']) > 350:
		hids.append(e['name'].split()[0])
		seqs.append(e['sequence'])
		sites.append(seq2sites(e['sequence']))

## Validate
passed_validation = True

for i in range(len(sites)):
	hid = hids[i]
	key = hid[:2]
	region = range2region(metadata.ix[key, 'SeqRange'])
	seq = translate(sites2seq(sites[i], region), None, '-')
	if not seq == seqs[i]:
		passed_validation = False
		print i, hids[i]

if passed_validation:
	counter = {}
Пример #16
0
sites = []

with open('defilippo2010.csv', 'rU') as f:
	reader = csv.reader(f)
	reader.next() # skip past header
	for row in reader:
		hids.append(row[0])
		groups.append(row[1])
		sites.append(str2sites(row[3]))

## Validate
passed_validation = True

for i in range(len(sites)):
	seq = sites2seq(sites[i], region)
	mysites = seq2sites(seq)
	if not mysites == sites[i]:
		myseq = translate(sites2seq(mysites, region), None, '-')
		if not seq == myseq:
			passed_validation = False
			print i, hids[i]

counter = {}
for k in metadata.index:
	counter[k] = 0
	
if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(groups)):
			key = groups[i]
			counter[key] = counter[key] + 1
Пример #17
0
for filename in ["barbieri2013a.fasta", "barbieri2013b.fasta", "barbieri2013c.fasta", "barbieri2014.fasta"]:
    ff = fasta(filename, "r")
    data = ff.readentries()
    ff.close()

    for e in data:
        if e["sequence"].count("N") < 20:
            hid = e["name"].split()[4].upper()
            if hid in groups.index:
                hids.append(hid)
                seqs.append(e["sequence"])

sites = []
for s in seqs:
    sites.append(seq2sites(s, ambig_cutoff=20))

## Validate
passed_validation = True

for i in range(len(sites)):
    hid = hids[i]
    seq = translate(sites2seq(sites[i], region), None, "-")
    if not seq == seqs[i]:
        # some sequences have N in position 308 and this doesn't parse properly in my converter
        # not going to worry about it because will always drop the variants in this region
        if not seq[:305] == seqs[i][:305] and seq[-16260:] == seqs[i][-16260:]:
            passed_validation = False
            print i, hids[i]

if passed_validation: