# geuvadis_input.py
# Forked from Heroico/PredictDBAnalysis.
__author__ = 'Alvaro Barbeira'
import csv
class GFTF:
    """Geuvadis file table format.

    Zero-based column indices of the leading columns in a GEUVADIS table
    row. The loaders in this module treat every column positioned after
    COORD as a per-person column.
    """
    TARGET_ID = 0
    GENE_SYMBOL = 1
    CHR = 2
    COORD = 3
#
from person import Person
from person import People
def loadPeopleFromGEUVADISHeader(cls, header):
    """Build a People collection from the header row of a GEUVADIS table.

    Each header cell positioned after the GFTF.COORD column becomes the
    id of a new Person, added in column order. The function is attached
    to People as a classmethod right below; `cls` is accepted for that
    reason but is not otherwise used.

    Args:
        header: sequence of header cell texts.

    Returns:
        The populated People instance.
    """
    roster = People()
    for column, label in enumerate(header):
        if column <= GFTF.COORD:
            # Leading annotation columns do not name a person.
            continue
        entry = Person()
        entry.id = label
        roster.addPerson(entry)
    return roster


# Expose the loader as People.loadPeopleFromGEUVADISHeader(header).
People.loadPeopleFromGEUVADISHeader = classmethod(loadPeopleFromGEUVADISHeader)
#
from gene import GeneData
def loadFromGEUVADISRow(cls, row, gencode_set):
    """Build a GeneData from one data row of a GEUVADIS table.

    The TARGET_ID cell holds a versioned id; the part before the first
    "." is looked up in gencode_set.gencodes_by_ensemble_id. When found,
    the gene's name and versioned id are filled in. When not found, those
    fields are left untouched and a message describing the missing
    gencode is returned. In both cases every cell after the GFTF.COORD
    column is appended to the gene's data.

    The function is attached to GeneData as a classmethod right below;
    `cls` is accepted for that reason but is not otherwise used.

    Args:
        row: sequence of cell values for one gene.
        gencode_set: object exposing a gencodes_by_ensemble_id mapping.

    Returns:
        (gene_data, missing) where missing is None on success or a
        message string when no gencode entry matches.
    """
    record = GeneData()
    versioned_id = row[GFTF.TARGET_ID]
    base_id = versioned_id.split(".")[0]
    known = gencode_set.gencodes_by_ensemble_id

    if base_id in known:
        problem = None
        annotation = known[base_id]
        record.name = annotation.name
        record.ensemble_id_version = versioned_id
    else:
        problem = 'Need gencode data for ' + versioned_id

    # Values are collected even when the gencode lookup failed.
    for column, cell in enumerate(row):
        if column > GFTF.COORD:
            record.data.append(cell)
    return record, problem


# Expose the loader as GeneData.loadFromGEUVADISRow(row, gencode_set).
GeneData.loadFromGEUVADISRow = classmethod(loadFromGEUVADISRow)
from gene import GeneDataSets
def LoadGEUVADISFile(gencodes, data_file_name, set_name=None):
    """Load a tab-separated GEUVADIS table into a GeneDataSets.

    The first record of the file is the header and is used to set up the
    people of the data set. Every following record is converted with
    GeneData.loadFromGEUVADISRow; rows whose gencode entry is missing are
    skipped and their message is collected instead.

    Args:
        gencodes: gencode collection forwarded to
            GeneData.loadFromGEUVADISRow.
        data_file_name: path of the tab-delimited file to read.
        set_name: value stored as the resulting set's name.

    Returns:
        (gene_sets, missing_gencodes): the populated GeneDataSets and the
        list of messages for rows that could not be matched to a gencode.
    """
    import sys  # local import: only needed to choose the csv file mode

    gene_sets = GeneDataSets()
    gene_sets.name = set_name
    missing_gencodes = []

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'rb' on Python 3 makes csv.reader fail.
    if sys.version_info[0] >= 3:
        data_file = open(data_file_name, 'r', newline='')
    else:
        data_file = open(data_file_name, 'rb')

    with data_file:
        reader = csv.reader(data_file, delimiter="\t", quotechar='"')

        # Take the header as the first *record* rather than testing
        # reader.line_num, which counts physical lines and would miss a
        # header containing a quoted line break. An empty file has no header.
        header = next(reader, None)
        if header is not None:
            people = People.loadPeopleFromGEUVADISHeader(header)
            gene_sets.setUpPeople(people)

        for row in reader:
            gene_data, missing = GeneData.loadFromGEUVADISRow(row, gencodes)
            if missing is not None:
                missing_gencodes.append(missing)
                continue
            gene_sets.genes.append(gene_data)
            gene_sets.genes_by_name[gene_data.name] = gene_data
            gene_sets.genes_by_ensemble_id_version[gene_data.ensemble_id_version] = gene_data
    return gene_sets, missing_gencodes