forked from argriffing/xgcode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
20100608a.py
100 lines (88 loc) · 3.04 KB
/
20100608a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Compute principal components given a .hud file.
The output is an R frame.
The principal components are computed according to Patterson et al.
in their work on population structure.
This is really only for diploid biallelic data.
Microsatellite data is condoned.
Using this for multi-allelic high-ploidy data is probably a hack.
"""
from StringIO import StringIO
import math
import os
import argparse
import numpy as np
from SnippetUtil import HandlingError
import Carbone
import hud
import EigUtil
import Form
import FormOut
import eigenpop
# Tags for this script; nothing in this file reads them
# (presumably consumed by the surrounding framework -- TODO confirm).
g_tags = ['pca:compute']
# Sample .hud contents: four rows, each an IC-prefixed name followed by
# 0/1 values.  get_form() uses this as the default value of the 'hud' field.
g_default_hud_string = """
IC31 1 1 0 0
IC32 1 1 1 0
IC33 1 0 1 1
IC34 0 0 1 0
""".strip()
# NOTE: the bare string literal below is evaluated and discarded, so it has
# no effect.  It holds a disabled Form.RadioGroup definition ('scaling')
# that would have offered three choices for scaling the principal components.
"""
Form.RadioGroup('scaling', 'PC scaling', [
Form.RadioItem('scale_none',
'use orthonormal eigenvectors', True),
Form.RadioItem('scale_sqrt',
'scale by the square root of the eigenvalue'),
Form.RadioItem('scale_eigenvalue',
'scale by the eigenvalue')]),
"""
def get_form():
    """
    Describe the input form for this script.
    The form has a multi-line 'hud' field, a group of input options,
    an integer 'npcs' field and a group of output options.
    @return: the body of a form, as a list of Form objects
    """
    # the .hud text itself, pre-filled with the module-level sample
    hud_field = Form.MultiLine(
            'hud', 'contents of a .hud file', g_default_hud_string)
    # options describing the nature of the input data
    input_group = Form.CheckGroup('input_options', 'input options', [
        Form.CheckItem(
            'diploid_and_biallelic',
            'the data source is really diploid and biallelic',
            True),
        ])
    # how many principal components to report
    npcs_field = Form.Integer(
            'npcs', 'find this many principal components', 3)
    # options controlling the layout of the output table
    output_group = Form.CheckGroup('output_options', 'output options', [
        Form.CheckItem(
            'add_indices',
            'add row indices for R table compatibility',
            True),
        Form.CheckItem(
            'clean_isolates',
            'force first-column elements to be IC-prefixed',
            True),
        ])
    return [hud_field, input_group, npcs_field, output_group]
def get_form_out():
    """
    Describe the output of this script.
    @return: a FormOut.RTable object constructed with the name 'out'
    """
    output_format = FormOut.RTable('out')
    return output_format
def get_response_content(fs):
    """
    Produce the response text for a submitted form.
    @param fs: user options; its 'hud' attribute holds the .hud text,
    and the object itself is passed on to process() as the options
    @return: the text returned by process() followed by one more newline
    """
    raw_hud_lines = fs.hud.splitlines()
    table_text = process(fs, raw_hud_lines)
    return table_text + '\n'
def process(args, raw_hud_lines):
    """
    Build a tab-separated R frame of principal components from .hud lines.
    @param args: user options from the web or cmdline; the attributes read
    here are clean_isolates, diploid_and_biallelic, npcs and add_indices
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form; a header line followed by
    one line per OTU, each line terminated by a newline
    @raise ValueError: if more principal components are requested
    than were computed
    """
    otu_names, decoded_rows = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        otu_names = [
                Carbone.clean_isolate_element(raw_name)
                for raw_name in otu_names]
    # get the principal components from the decoded data
    data_matrix = np.array(decoded_rows, dtype=float)
    components = eigenpop.get_scaled_eigenvectors(
            data_matrix, args.diploid_and_biallelic)
    # refuse to report more components than are available
    if len(components) < args.npcs:
        raise ValueError(
                'the number of requested principal components '
                'must be no more than the number of OTUs')
    # the header names the OTU column and then each requested component
    header_cells = ['otu'] + ['pc%d' % k for k in range(1, args.npcs + 1)]
    table_lines = ['\t'.join(header_cells)]
    # one line per OTU; components[k][row] is component k at that OTU
    for row_index, otu_name in enumerate(otu_names):
        cells = [otu_name] + [
                components[k][row_index] for k in range(args.npcs)]
        if args.add_indices:
            # leading 1-based row index; the header gets no matching label
            cells = [row_index + 1] + cells
        table_lines.append('\t'.join(str(cell) for cell in cells))
    # every line, including the last one, ends with a newline
    return ''.join(line + '\n' for line in table_lines)