-
Notifications
You must be signed in to change notification settings - Fork 1
/
20100622a.py
94 lines (83 loc) · 3.26 KB
/
20100622a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Convert diploid microsatellite data to a ternary character alignment.
"""
from StringIO import StringIO
import numpy as np
from SnippetUtil import HandlingError
import Form
import FormOut
import Carbone
import hud
import Util
import iterutils
# Tag list for this script.
# NOTE(review): presumably consumed by the surrounding framework -- confirm.
g_tags = ['pca:convert']

# Example input rows: a label followed by the values for that row.
# Rows are interleaved so that a label ending in 'a' is followed
# by the matching label ending in 'b'.
# NOTE(review): '?' presumably marks a missing observation -- confirm.
g_default_rows = [
    ['DOE3-0002Aa', '94', '94', '156', '162', '89', '172'],
    ['DOE3-0002Ab', '94', '94', '156', '182', '89', '172'],
    ['DOE3-0002Ba', '91', '91', '?', '162', '89', '172'],
    ['DOE3-0002Bb', '91', '91', '?', '182', '89', '172'],
    ['DOE3-0007Aa', '89', '89', '158', '160', '93', '?'],
    ['DOE3-0007Ab', '89', '89', '160', '160', '107', '?'],
]

# The example rows rendered as tab separated text lines.
g_default_lines = ['\t'.join(row) for row in g_default_rows]
def get_form():
    """
    Build the input form for this script.
    The form has a single multi-line field named 'data' which is
    pre-filled with the default example lines.
    @return: the body of a form, as a list of form objects
    """
    default_text = '\n'.join(g_default_lines)
    data_field = Form.MultiLine(
        'data', 'diploid microsatellite data', default_text)
    return [data_field]
def get_form_out():
    """
    Describe the output of this script.
    @return: the FormOut.Report object created with the argument 'out'
    """
    report = FormOut.Report('out')
    return report
def get_response_content(fs):
    """
    Compute the response for a submitted form.
    @param fs: an object whose data attribute holds the submitted text
    @return: the result of process() on the input lines, plus a newline
    """
    raw_lines = fs.data.splitlines()
    content = process(raw_lines)
    return content + '\n'
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved haploid rows and combine them into diploid rows.
    The input alternates between a row whose label ends with 'a'
    and a row whose label ends with 'b'.
    A data matrix is built separately for the 'a' rows and for the 'b' rows
    using the unique values seen in each column over all rows,
    and the two matrices are then added entrywise.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    stripped = Util.get_stripped_lines(raw_lines)
    nlines = len(stripped)
    if nlines % 2 != 0:
        raise ValueError('expected an even number of lines')
    if nlines < 2:
        raise ValueError('expected at least two lines')
    # split each line into whitespace separated elements
    table = [line.split() for line in stripped]
    width = len(table[0])
    if width < 2:
        raise ValueError('expected at least two columns')
    if any(len(row) != width for row in table):
        raise ValueError('each row should have the same number of elements')
    # de-interleave the rows; positions 0, 2, 4, ... are the 'a' rows
    a_table = table[0::2]
    b_table = table[1::2]
    # check every 'a' label before checking any 'b' label
    a_labels = [row[0] for row in a_table]
    b_labels = [row[0] for row in b_table]
    if not all(label.endswith('a') for label in a_labels):
        raise ValueError('each odd row label should end with the letter a')
    if not all(label.endswith('b') for label in b_labels):
        raise ValueError('each even row label should end with the letter b')
    # the output labels are the 'a' labels without the trailing letter
    headers = [label[:-1] for label in a_labels]
    # get the unique elements of each column over all of the rows,
    # so that both haploid matrices share the same column layout
    data_rows = [row[1:] for row in table]
    uniques = [
        list(iterutils.unique_everseen(column))
        for column in zip(*data_rows)]
    # get the matrix for the 'a' rows and then for the 'b' rows
    haploid_matrices = []
    for half_table in (a_table, b_table):
        half_columns = zip(*[row[1:] for row in half_table])
        haploid_matrices.append(
            Carbone.get_binary_rows_helper(half_columns, uniques))
    # add the elements entrywise and return as a list of lists
    binary_rows = np.array(haploid_matrices).sum(axis=0).tolist()
    return headers, binary_rows
def process(raw_lines):
    """
    Convert raw microsatellite lines to encoded output text.
    @param raw_lines: raw input lines
    @return: the hud encoding of the headers and rows, plus a newline
    """
    labels, matrix = read_microsatellite_lines(raw_lines)
    encoded = hud.encode(labels, matrix)
    return encoded + '\n'