def getAlignmentMatrix(self, alignment_id):
    """
    Gather all Sequence rows of one alignment into a single PassingData.

    Rows are fetched with ``filter_by(alignment=alignment_id)`` and ordered
    by ``Sequence.accession``.

    The returned PassingData carries:
      snp_pos_ls -- result of self.get_snp_pos_ls() called with the target,
          chromosome and start of the first row's alignment_obj; stays an
          empty list when the query returns no rows.
      accession_id_ls -- ``accession`` of every row, in query order.
      name_ls -- ``accession_obj.name`` of every row, in query order.
      data_matrix -- num.int8 array with one row per Sequence, each being
          dict_map(nt2number, row.bases).

    Progress messages are written to stderr.
    """
    sys.stderr.write("Getting alignment matrix for alignment=%s ..." % alignment_id)

    query = Sequence.query.filter_by(alignment=alignment_id)
    sequences = query.order_by(Sequence.accession).all()

    snp_pos_ls = []
    accession_id_ls = []
    name_ls = []
    encoded_rows = []
    for index, sequence in enumerate(sequences):
        if index == 0:
            # Only the first row's alignment is consulted for the positions.
            alignment = sequence.alignment_obj
            snp_pos_ls = self.get_snp_pos_ls(alignment.target,
                                             alignment.chromosome,
                                             alignment.start)
        accession_id_ls.append(sequence.accession)
        name_ls.append(sequence.accession_obj.name)
        encoded_rows.append(dict_map(nt2number, sequence.bases))

    data_matrix = num.array(encoded_rows, num.int8)
    passingdata = PassingData(snp_pos_ls=snp_pos_ls,
                              accession_id_ls=accession_id_ls,
                              name_ls=name_ls,
                              data_matrix=data_matrix)
    summary = (len(accession_id_ls), len(snp_pos_ls))
    sys.stderr.write(' %s accessions, %s bases. Done.\n' % summary)
    return passingdata
def getAlignmentMatrix(self, alignment_id):
    """
    Gather all Sequence rows of one alignment into a single PassingData.

    Rows are fetched with ``filter_by(alignment=alignment_id)`` and ordered
    by ``Sequence.accession``.

    The returned PassingData carries:
      snp_pos_ls -- one (chromosome, position, offset) tuple per column of
          the first row's alignment_obj.target; stays an empty list when the
          query returns no rows.
      accession_id_ls -- ``accession`` of every row, in query order.
      name_ls -- ``accession_obj.name`` of every row, in query order.
      data_matrix -- num.int8 array with one row per Sequence, each being
          dict_map(nt2number, row.bases).

    Position rules, applied column by column over the target:
      * nt2number value other than -1: position advances by one and the
        offset resets to 0 (the very first column takes alignment start).
      * nt2number value of -1 (treated as a deletion in the target): the
        position stays the same and the offset grows by one. The offset is
        the insertion offset relative to the column position.

    Progress messages are written to stderr.
    """
    sys.stderr.write("Getting alignment matrix for alignment=%s ..." % alignment_id)

    query = Sequence.query.filter_by(alignment=alignment_id)
    sequences = query.order_by(Sequence.accession).all()

    snp_pos_ls = []
    accession_id_ls = []
    name_ls = []
    encoded_rows = []
    for index, sequence in enumerate(sequences):
        if index == 0:
            # Only the first row's alignment is consulted for the positions.
            alignment = sequence.alignment_obj
            for column, base in enumerate(alignment.target):
                is_deletion = nt2number[base] == -1
                if column == 0:
                    if is_deletion:
                        # This probably doesn't exist in the db. It is
                        # debatable whether such an insertion belongs to the
                        # previous base or to the alignment's start base.
                        entry = (alignment.chromosome, alignment.start - 1, 1)
                    else:
                        entry = (alignment.chromosome, alignment.start, 0)
                else:
                    # One entry is appended per column, so the last entry is
                    # always the previous column's.
                    _, prev_position, prev_offset = snp_pos_ls[-1]
                    if is_deletion:
                        # Position doesn't change; offset is incremented.
                        entry = (alignment.chromosome, prev_position, prev_offset + 1)
                    else:
                        entry = (alignment.chromosome, prev_position + 1, 0)
                snp_pos_ls.append(entry)
        accession_id_ls.append(sequence.accession)
        name_ls.append(sequence.accession_obj.name)
        encoded_rows.append(dict_map(nt2number, sequence.bases))

    data_matrix = num.array(encoded_rows, num.int8)
    passingdata = PassingData(snp_pos_ls=snp_pos_ls,
                              accession_id_ls=accession_id_ls,
                              name_ls=name_ls,
                              data_matrix=data_matrix)
    summary = (len(accession_id_ls), len(snp_pos_ls))
    sys.stderr.write(' %s accessions, %s bases. Done.\n' % summary)
    return passingdata
def read_data(cls, input_fname, input_alphabet=0, turn_into_integer=1, double_header=0, delimiter="\t"):
    """
    2008-05-18
        DEPRECATED. moved to pymodule.SNP
    2008-05-12
        add delimiter
    2008-05-07
        add option double_header
    2007-03-06
        different from the one from SelectStrains.py is map(int, data_row)
    2007-05-14
        add input_alphabet
    2007-10-09
        add turn_into_integer

    Read a delimited matrix file: header line(s) first, then one row per
    strain with the strain id in column 0, a category in column 1 and the
    data cells from column 2 onwards.

    Arguments:
      input_fname -- path of the file to read.
      input_alphabet -- if true, each data row is converted with
          dict_map(nt2number, data_row) and turn_into_integer is ignored.
      turn_into_integer -- if true (and input_alphabet is false), every data
          cell is converted with int(); otherwise cells stay strings.
      double_header -- if true, the first two lines are both consumed as
          header and returned as [first_line, second_line].
      delimiter -- field delimiter handed to csv.reader.

    Returns (header, strain_acc_list, category_list, data_matrix) where
    data_matrix is a list of lists, one per data row.

    Raises StopIteration if the file has fewer lines than the requested
    header, IndexError if a non-empty row has fewer than two columns, and
    ValueError if a cell cannot be converted by int().
    """
    sys.stderr.write("Reading data ...")
    data_matrix = []
    strain_acc_list = []
    category_list = []
    # The context manager guarantees the file handle is released even when
    # a malformed row raises halfway through.
    with open(input_fname) as input_file:
        reader = csv.reader(input_file, delimiter=delimiter)
        header = next(reader)
        if double_header:
            header = [header, next(reader)]
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline at the end of the file); there is nothing to read.
                continue
            strain_acc_list.append(row[0])
            category_list.append(row[1])
            data_row = row[2:]
            no_of_snps = len(data_row)
            if input_alphabet:
                data_row = dict_map(nt2number, data_row)
                if no_of_snps != len(data_row):
                    # Report rows whose length changed during conversion.
                    # print(row) with one argument behaves the same under
                    # Python 2 and Python 3.
                    print(row)
            elif turn_into_integer:
                # A list (not a lazy map object) on every Python version.
                data_row = [int(cell) for cell in data_row]
            data_matrix.append(data_row)
    sys.stderr.write("Done.\n")
    return header, strain_acc_list, category_list, data_matrix