Exemplo n.º 1
0
	def run(self):
		"""
		2007-03-20
		2007-04-03

		Get the data matrix and display its heterozygous and coarse versions.

		With self.draw_only set, the matrix is re-read from self.output_fname.
		Otherwise the header and the strain/category lists come from
		self.input_fname, the matrix itself is obtained through
		dbSNP2data.get_data_matrix() on self.data_table (the matrix stored in
		the input file is discarded) and the result is written to
		self.output_fname. The method blocks on raw_input() at the end.
		"""
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		matrix_io = FilterStrainSNPMatrix()
		if not self.draw_only:
			connection, cursor = db_connect(self.hostname, self.dbname, self.schema)
			col_name_ls, strain_accs, categories, matrix = matrix_io.read_data(self.input_fname)

			# header[2:] is taken as the list of SNP accessions
			snp_accs = col_name_ls[2:]
			strain_id2index = self.get_id2index(cursor, self.strain_info_table, strain_accs)
			snp_id2index = self.get_id2index(cursor, self.snp_locus_table, snp_accs)

			from dbSNP2data import dbSNP2data
			fetcher = dbSNP2data(report=self.report)
			matrix = fetcher.get_data_matrix(cursor, strain_id2index, snp_id2index, nt2number, self.data_table, need_heterozygous_call=1)

			matrix_io.write_data_matrix(matrix, self.output_fname, col_name_ls, strain_accs, categories)
		else:
			col_name_ls, strain_accs, categories, matrix = matrix_io.read_data(self.output_fname)
			matrix = Numeric.array(matrix)

		hetero_matrix, coarse_matrix = self.get_heterozygous_and_coarse_data_matrix(matrix)
		self.displayDataMatrix(hetero_matrix, title='heterozygous_data_matrix, 5-10=hetero, else=0')
		self.displayDataMatrix(coarse_matrix, title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero')
		# keeps the process (and whatever displayDataMatrix opened) alive until the user hits enter
		raw_input("enter")
Exemplo n.º 2
0
    def run(self):
        """
        2007-10-11

        Read the matrix from self.input_fname, build the identity-pair graph
        and store its components and cliques.

        Steps:
          1. read header, strain list, category list and data matrix;
          2. construct_identity_pair_ls() -> construct_graph_out_of_identity_pair()
             -> expand_g_with_singleton_strain_id_ls();
          3. compute_components_and_cliques() on the resulting graph;
          4. when self.commit is set, create self.identity_table,
             self.component2clique_table and self.clique2ecotype_table, fill
             them and commit the transaction.

        When self.debug is set, execution drops into pdb after the matrix is
        read.
        """
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname)
        curs = conn.cursor()

        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
            self.input_fname)

        if self.debug:
            import pdb
            pdb.set_trace()

        identity_pair_ls = self.construct_identity_pair_ls(
            strain_acc_list, header, data_matrix)
        g = self.construct_graph_out_of_identity_pair(identity_pair_ls)
        g = self.expand_g_with_singleton_strain_id_ls(g, strain_acc_list)
        cc_id2clique_id_ls, clique_id2ecotype_id_ls = self.compute_components_and_cliques(
            g)

        if self.commit:
            self.create_identity_table(curs, self.identity_table)
            self.create_component2clique_table(curs,
                                               self.component2clique_table)
            self.create_clique2ecotype_table(curs, self.clique2ecotype_table)
            self.submit_identity_pairs(curs, g, self.identity_table)
            self.submit_cc_id2clique_id_ls(curs, cc_id2clique_id_ls,
                                           self.component2clique_table)
            self.submit_clique_id2ecotype_id_ls(curs, clique_id2ecotype_id_ls,
                                                self.clique2ecotype_table)
            # MySQLdb connections do not autocommit; without this the rows
            # submitted above are discarded on transactional tables once the
            # connection goes away.
            conn.commit()
def shuffleMatrixSNPColumn_in_chrom_position_order(input_fname, curs,
                                                   snps_table, output_fname):
    """
    Rewrite a matrix file with its SNP columns reordered to follow the
    (chromosome, position) ordering of snps_table.

    input_fname: matrix file read through FilterStrainSNPMatrix.read_data();
        header[2:] is taken as the SNP id of each data column.
    curs: database cursor used to query snps_table.
    snps_table: table providing the snpid, chromosome and position columns.
    output_fname: file the reordered matrix is written to.

    SNPs listed in snps_table but absent from the matrix are ignored.
    Raises KeyError when a SNP of the matrix is missing from snps_table.
    """
    from FilterStrainSNPMatrix import FilterStrainSNPMatrix
    import numpy

    FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
    header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
        input_fname)
    snp_acc_list = header[2:]
    snps_in_matrix = set(snp_acc_list)

    curs.execute(
        "select snpid, chromosome, position from %s order by chromosome, position"
        % (snps_table))
    snp_acc2col_index = {}
    new_snp_acc_list = []
    for snpid, chromosome, position in curs.fetchall():
        # Only SNPs that are columns of this matrix get a new column index,
        # and each only once; otherwise indices could run past the matrix
        # width (or collide) and the header would not match the columns.
        if snpid not in snps_in_matrix or snpid in snp_acc2col_index:
            continue
        snp_acc2col_index[snpid] = len(new_snp_acc_list)
        new_snp_acc_list.append(snpid)

    old_matrix = numpy.array(data_matrix)
    # numpy.int_ is the concrete type the abstract numpy.integer used to map to
    new_matrix = numpy.zeros(old_matrix.shape, numpy.int_)
    for j in range(old_matrix.shape[1]):
        col_index = snp_acc2col_index[snp_acc_list[j]]
        new_matrix[:, col_index] = old_matrix[:, j]

    header = header[:2] + new_snp_acc_list
    FilterStrainSNPMatrix_instance.write_data_matrix(new_matrix, output_fname,
                                                     header, strain_acc_list,
                                                     category_list)
Exemplo n.º 4
0
	def run(self):
		"""
		2007-10-11

		Read the matrix from self.input_fname, build the identity-pair graph
		and store its components and cliques.

		Steps:
		  1. read header, strain list, category list and data matrix;
		  2. construct_identity_pair_ls() -> construct_graph_out_of_identity_pair()
		     -> expand_g_with_singleton_strain_id_ls();
		  3. compute_components_and_cliques() on the resulting graph;
		  4. when self.commit is set, create self.identity_table,
		     self.component2clique_table and self.clique2ecotype_table, fill
		     them and commit the transaction.

		When self.debug is set, execution drops into pdb after the matrix is read.
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname)
		curs = conn.cursor()
		
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
		header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(self.input_fname)
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		identity_pair_ls = self.construct_identity_pair_ls(strain_acc_list, header, data_matrix)
		g = self.construct_graph_out_of_identity_pair(identity_pair_ls)
		g = self.expand_g_with_singleton_strain_id_ls(g, strain_acc_list)
		cc_id2clique_id_ls, clique_id2ecotype_id_ls = self.compute_components_and_cliques(g)
		
		if self.commit:
			self.create_identity_table(curs, self.identity_table)
			self.create_component2clique_table(curs, self.component2clique_table)
			self.create_clique2ecotype_table(curs, self.clique2ecotype_table)
			self.submit_identity_pairs(curs, g, self.identity_table)
			self.submit_cc_id2clique_id_ls(curs, cc_id2clique_id_ls, self.component2clique_table)
			self.submit_clique_id2ecotype_id_ls(curs, clique_id2ecotype_id_ls, self.clique2ecotype_table)
			# MySQLdb connections do not autocommit; without this the rows
			# submitted above are discarded on transactional tables once the
			# connection goes away.
			conn.commit()
def cmp192StrainsBorevitsAndNordborgData(borevitz_data_fname,
                                         nordborg_data_fname):
    """
    2007-10-09
        Compare pairwise distances between the borevitz and nordborg data of
        192 strains.

    Both files are read with FilterStrainSNPMatrix.read_data() (the nordborg
    one with turn_into_integer=0). Strains are matched through the category
    lists of the two files. For every pair of borevitz rows whose names both
    occur in the nordborg data, one distance is computed per data set.

    Returns four parallel lists: the name pairs, the borevitz distances, the
    nordborg distances and the second value returned by
    calBinaryDistanceBetTwoAlignmentVectors() for each pair.
    """
    from FilterStrainSNPMatrix import FilterStrainSNPMatrix
    reader = FilterStrainSNPMatrix()
    borevitz_header, borevitz_strain_acc_list, borevitz_category_list, borevitz_data_matrix = reader.read_data(
        borevitz_data_fname)
    nordborg_header, nordborg_strain_acc_list, nordborg_category_list, nordborg_data_matrix = reader.read_data(
        nordborg_data_fname, turn_into_integer=0)

    # nordborg name -> row index (a repeated name keeps its last row)
    accession_name2index = {}
    for row_index, accession_name in enumerate(nordborg_category_list):
        accession_name2index[accession_name] = row_index

    nativename_missing_in_nordborg_alignment_ls = [
        nativename for nativename in borevitz_category_list
        if nativename not in accession_name2index
    ]
    print('nativename_missing_in_nordborg_alignment_ls: %s' %
          (nativename_missing_in_nordborg_alignment_ls, ))

    sys.stderr.write(
        "Comparing 192 strains' data from borevitz lab and nordborg 2010 ...")
    acc_name_pair_ls = []
    borevitz_dist_ls = []
    nordborg_dist_ls = []
    no_of_valid_nordborg_pairs_ls = []
    no_of_borevits_strains = len(borevitz_strain_acc_list)
    for first in range(no_of_borevits_strains):
        for second in range(first + 1, no_of_borevits_strains):
            acc_name1 = borevitz_category_list[first]
            acc_name2 = borevitz_category_list[second]
            # skip pairs that cannot be located in the nordborg data
            if not (acc_name1 in accession_name2index
                    and acc_name2 in accession_name2index):
                continue
            borevitz_result = calBinaryDistanceBetTwoNumericVectors(
                borevitz_data_matrix[first], borevitz_data_matrix[second])
            nordborg_result = calBinaryDistanceBetTwoAlignmentVectors(
                nordborg_data_matrix[accession_name2index[acc_name1]],
                nordborg_data_matrix[accession_name2index[acc_name2]])
            borevitz_dist, no_of_valid_borevitz_pairs = borevitz_result
            nordborg_dist, no_of_valid_nordborg_pairs = nordborg_result

            borevitz_dist_ls.append(borevitz_dist)
            nordborg_dist_ls.append(nordborg_dist)
            acc_name_pair_ls.append((acc_name1, acc_name2))
            # only the nordborg count is reported
            no_of_valid_nordborg_pairs_ls.append(no_of_valid_nordborg_pairs)
    sys.stderr.write("Done.\n")
    return acc_name_pair_ls, borevitz_dist_ls, nordborg_dist_ls, no_of_valid_nordborg_pairs_ls
Exemplo n.º 6
0
	def run(self):
		"""
		Read the matrix in self.input_fname, run the cal_* chain on it and
		show a 20-bin histogram of the list returned by
		cal_selfing_generation_prob() (which is also given self.output_fname).
		"""
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		matrix_reader = FilterStrainSNPMatrix()
		header, strain_acc_list, category_list, raw_matrix = matrix_reader.read_data(self.input_fname)
		snp_matrix = Numeric.array(raw_matrix)

		# each step feeds the next one
		allele_prob_vector = self.cal_locus_allele_prob_vector(snp_matrix)
		hetero_prob_vector = self.cal_locus_heterozygous_prob_vector(allele_prob_vector)
		hetero_prob_matrix = self.cal_locus_heterozygous_prob_matrix(hetero_prob_vector, self.max_selfing_generation)
		generation_ls = self.cal_selfing_generation_prob(snp_matrix, hetero_prob_vector, strain_acc_list, category_list, hetero_prob_matrix, self.output_fname)
		
		import pylab
		pylab.clf()
		pylab.hist(generation_ls, 20)
		pylab.title("hist of selfing generations")
		pylab.show()
Exemplo n.º 7
0
    def run(self):
        """
        Read the matrix in self.input_fname, run the cal_* chain on it and
        show a 20-bin histogram of the list returned by
        cal_selfing_generation_prob() (which is also given self.output_fname).
        """
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        matrix_reader = FilterStrainSNPMatrix()
        header, strain_acc_list, category_list, raw_matrix = matrix_reader.read_data(
            self.input_fname)
        snp_matrix = Numeric.array(raw_matrix)

        # each step feeds the next one
        allele_prob_vector = self.cal_locus_allele_prob_vector(snp_matrix)
        hetero_prob_vector = self.cal_locus_heterozygous_prob_vector(
            allele_prob_vector)
        hetero_prob_matrix = self.cal_locus_heterozygous_prob_matrix(
            hetero_prob_vector, self.max_selfing_generation)
        generation_ls = self.cal_selfing_generation_prob(
            snp_matrix, hetero_prob_vector, strain_acc_list, category_list,
            hetero_prob_matrix, self.output_fname)

        import pylab
        pylab.clf()
        pylab.hist(generation_ls, 20)
        pylab.title("hist of selfing generations")
        pylab.show()
Exemplo n.º 8
0
def find_2010_accession_id_for_old_2010_x_149snp_matrix(input_fname, curs, accession_table='at.accession'):
	"""
	2007-11-05 whether the names can still be matched to entries in accession_table.

	input_fname: matrix file read through FilterStrainSNPMatrix.read_data();
		only its category list (the names to look up) is used.
	curs: database cursor used for the lookups.
	accession_table: table with id and name columns.

	Prints one "name<TAB>id" line per name (id is empty when nothing matched).
	Returns (strain_acc_accession_id_ls, strain_acc_match_failed_ls): the
	first holds [name, id] for matched names and [name] for unmatched ones,
	the second holds the unmatched names.
	"""
	from FilterStrainSNPMatrix import FilterStrainSNPMatrix
	FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
	header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(input_fname)
	# The table name cannot be a bound parameter, but the looked-up name can;
	# binding it keeps quotes inside a name from breaking the statement.
	# NOTE(review): assumes the driver uses the %s placeholder style
	# (MySQLdb, psycopg) -- confirm for the cursor passed in.
	query = "select id from %s where name=%%s" % accession_table
	strain_acc_accession_id_ls = []
	strain_acc_match_failed_ls = []
	for strain_acc in category_list:
		curs.execute(query, (strain_acc,))
		rows = curs.fetchall()
		if rows:
			# only the first match is used
			accession_id = rows[0][0]
			strain_acc_accession_id_ls.append([strain_acc, accession_id])
		else:
			accession_id = ''
			strain_acc_accession_id_ls.append([strain_acc])
			strain_acc_match_failed_ls.append(strain_acc)
		print('%s\t%s' % (strain_acc, accession_id))
	return strain_acc_accession_id_ls, strain_acc_match_failed_ls
Exemplo n.º 9
0
def shuffleMatrixSNPColumn_in_chrom_position_order(input_fname, curs, snps_table, output_fname):
	"""
	Rewrite a matrix file with its SNP columns reordered to follow the
	(chromosome, position) ordering of snps_table.

	input_fname: matrix file read through FilterStrainSNPMatrix.read_data();
		header[2:] is taken as the SNP id of each data column.
	curs: database cursor used to query snps_table.
	snps_table: table providing the snpid, chromosome and position columns.
	output_fname: file the reordered matrix is written to.

	SNPs listed in snps_table but absent from the matrix are ignored.
	Raises KeyError when a SNP of the matrix is missing from snps_table.
	"""
	from FilterStrainSNPMatrix import FilterStrainSNPMatrix
	import numpy

	FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
	header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(input_fname)
	snp_acc_list = header[2:]
	snps_in_matrix = set(snp_acc_list)

	curs.execute("select snpid, chromosome, position from %s order by chromosome, position"%(snps_table))
	snp_acc2col_index = {}
	new_snp_acc_list = []
	for snpid, chromosome, position in curs.fetchall():
		# Only SNPs that are columns of this matrix get a new column index,
		# and each only once; otherwise indices could run past the matrix
		# width (or collide) and the header would not match the columns.
		if snpid not in snps_in_matrix or snpid in snp_acc2col_index:
			continue
		snp_acc2col_index[snpid] = len(new_snp_acc_list)
		new_snp_acc_list.append(snpid)

	old_matrix = numpy.array(data_matrix)
	# numpy.int_ is the concrete type the abstract numpy.integer used to map to
	new_matrix = numpy.zeros(old_matrix.shape, numpy.int_)
	for j in range(old_matrix.shape[1]):
		col_index = snp_acc2col_index[snp_acc_list[j]]
		new_matrix[:,col_index] = old_matrix[:,j]

	header = header[:2] + new_snp_acc_list
	FilterStrainSNPMatrix_instance.write_data_matrix(new_matrix, output_fname, header, strain_acc_list, category_list)
def find_2010_accession_id_for_old_2010_x_149snp_matrix(
        input_fname, curs, accession_table='at.accession'):
    """
    2007-11-05 whether the names can still be matched to entries in accession_table.

    input_fname: matrix file read through FilterStrainSNPMatrix.read_data();
        only its category list (the names to look up) is used.
    curs: database cursor used for the lookups.
    accession_table: table with id and name columns.

    Prints one "name<TAB>id" line per name (id is empty when nothing matched).
    Returns (strain_acc_accession_id_ls, strain_acc_match_failed_ls): the
    first holds [name, id] for matched names and [name] for unmatched ones,
    the second holds the unmatched names.
    """
    from FilterStrainSNPMatrix import FilterStrainSNPMatrix
    FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
    header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
        input_fname)
    # The table name cannot be a bound parameter, but the looked-up name can;
    # binding it keeps quotes inside a name from breaking the statement.
    # NOTE(review): assumes the driver uses the %s placeholder style
    # (MySQLdb, psycopg) -- confirm for the cursor passed in.
    query = "select id from %s where name=%%s" % accession_table
    strain_acc_accession_id_ls = []
    strain_acc_match_failed_ls = []
    for strain_acc in category_list:
        curs.execute(query, (strain_acc, ))
        rows = curs.fetchall()
        if rows:
            # only the first match is used
            accession_id = rows[0][0]
            strain_acc_accession_id_ls.append([strain_acc, accession_id])
        else:
            accession_id = ''
            strain_acc_accession_id_ls.append([strain_acc])
            strain_acc_match_failed_ls.append(strain_acc)
        print('%s\t%s' % (strain_acc, accession_id))
    return strain_acc_accession_id_ls, strain_acc_match_failed_ls
Exemplo n.º 11
0
	def run(self):
		"""
		Run cleanup_one_population() on every population that has at least
		self.min_no_of_strains_per_pop strains and, when self.commit is set,
		store the selected strains and SNPs and commit.

		The full data matrix is fetched once from self.input_table; each
		population gets its own sub-matrix through create_sub_data_matrix().
		Populations are skipped when they are too small before or after the
		sub-matrix is built, or when the cleanup selects no strain or no SNP.
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user = self.user, passwd = self.passwd)
		curs = conn.cursor()
		
		from dbSNP2data import dbSNP2data
		db2data = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
		
		snp_id2index, snp_id_list, snp_id2info = db2data.get_snp_id2index_m(curs, self.input_table, self.snp_locus_table)
		strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category  = db2data.get_strain_id2index_m(curs, self.input_table, self.strain_info_table)
		#2008-06-02 stuff returned by get_strain_id2index_m is totally changed.
		# the values of strain_id2acc are used as ecotype ids here
		ecotype_id2row_index = dict((acc, strain_id2index[strain_id]) for strain_id, acc in strain_id2acc.iteritems())
		
		snp_id2acc = db2data.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
		data_matrix = db2data.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number, self.input_table, need_heterozygous_call=1)
		
		from OutputPopulation import OutputPopulation
		popid2ecotypeid_ls = OutputPopulation.get_popid2ecotypeid_ls(curs, self.population_table)
		
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		strain_snp_filter = FilterStrainSNPMatrix()
		
		from RemoveBadSNPs import RemoveBadSNPs
		bad_snp_remover = RemoveBadSNPs()
		
		popid2strain_id_snp_id_ls = {}
		for popid, ecotypeid_ls in popid2ecotypeid_ls.iteritems():
			if len(ecotypeid_ls)<self.min_no_of_strains_per_pop:
				continue
			sys.stderr.write("Population %s\n"%popid)
			sub_data_matrix, new_ecotypeid_ls = self.create_sub_data_matrix(popid, data_matrix, ecotypeid_ls, ecotype_id2row_index)
			# the population may have shrunk while the sub-matrix was built
			if len(new_ecotypeid_ls)<self.min_no_of_strains_per_pop:
				continue
			sys.stderr.write("\tPopulation %s has %s strains\n"%(popid, len(new_ecotypeid_ls)))
			strain_id_selected, snp_id_selected = self.cleanup_one_population(strain_snp_filter, bad_snp_remover, sub_data_matrix, new_ecotypeid_ls, snp_id_list, self.min_no_of_strains_per_pop, self.row_cutoff, self.col_cutoff, self.min_log_prob)
			if not (strain_id_selected and snp_id_selected):
				continue
			popid2strain_id_snp_id_ls[popid] = [strain_id_selected, snp_id_selected]
		
		if not self.commit:
			return
		self.create_popid2snpid_table(curs, self.output_table)
		self.mark_strain_id_selected(curs, popid2strain_id_snp_id_ls, self.population_table)
		self.submit_popid2snpid_list(curs, popid2strain_id_snp_id_ls, self.population_table, self.output_table)
		conn.commit()
Exemplo n.º 12
0
def cmp192StrainsBorevitsAndNordborgData(borevitz_data_fname, nordborg_data_fname):
	"""
	2007-10-09
		Compare pairwise distances between the borevitz and nordborg data of
		192 strains.

	Both files are read with FilterStrainSNPMatrix.read_data() (the nordborg
	one with turn_into_integer=0). Strains are matched through the category
	lists of the two files. For every pair of borevitz rows whose names both
	occur in the nordborg data, one distance is computed per data set.

	Returns four parallel lists: the name pairs, the borevitz distances, the
	nordborg distances and the second value returned by
	calBinaryDistanceBetTwoAlignmentVectors() for each pair.
	"""
	from FilterStrainSNPMatrix import FilterStrainSNPMatrix
	reader = FilterStrainSNPMatrix()
	borevitz_header, borevitz_strain_acc_list, borevitz_category_list, borevitz_data_matrix = reader.read_data(borevitz_data_fname)
	nordborg_header, nordborg_strain_acc_list, nordborg_category_list, nordborg_data_matrix = reader.read_data(nordborg_data_fname, turn_into_integer=0)
	
	# nordborg name -> row index (a repeated name keeps its last row)
	accession_name2index = {}
	for row_index, accession_name in enumerate(nordborg_category_list):
		accession_name2index[accession_name] = row_index
	
	nativename_missing_in_nordborg_alignment_ls = [nativename for nativename in borevitz_category_list if nativename not in accession_name2index]
	print('nativename_missing_in_nordborg_alignment_ls: %s' % (nativename_missing_in_nordborg_alignment_ls,))
	
	sys.stderr.write("Comparing 192 strains' data from borevitz lab and nordborg 2010 ...")
	acc_name_pair_ls = []
	borevitz_dist_ls = []
	nordborg_dist_ls = []
	no_of_valid_nordborg_pairs_ls = []
	no_of_borevits_strains = len(borevitz_strain_acc_list)
	for first in range(no_of_borevits_strains):
		for second in range(first+1, no_of_borevits_strains):
			acc_name1 = borevitz_category_list[first]
			acc_name2 = borevitz_category_list[second]
			# skip pairs that cannot be located in the nordborg data
			if not (acc_name1 in accession_name2index and acc_name2 in accession_name2index):
				continue
			borevitz_result = calBinaryDistanceBetTwoNumericVectors(borevitz_data_matrix[first], borevitz_data_matrix[second])
			nordborg_result = calBinaryDistanceBetTwoAlignmentVectors(nordborg_data_matrix[accession_name2index[acc_name1]], nordborg_data_matrix[accession_name2index[acc_name2]])
			borevitz_dist, no_of_valid_borevitz_pairs = borevitz_result
			nordborg_dist, no_of_valid_nordborg_pairs = nordborg_result
			
			borevitz_dist_ls.append(borevitz_dist)
			nordborg_dist_ls.append(nordborg_dist)
			acc_name_pair_ls.append((acc_name1, acc_name2))
			# only the nordborg count is reported
			no_of_valid_nordborg_pairs_ls.append(no_of_valid_nordborg_pairs)
	sys.stderr.write("Done.\n")
	return acc_name_pair_ls, borevitz_dist_ls, nordborg_dist_ls, no_of_valid_nordborg_pairs_ls
    def run(self):
        """
        2007-03-20
        2007-04-03

        Get the data matrix and display its heterozygous and coarse versions.

        With self.draw_only set, the matrix is re-read from self.output_fname.
        Otherwise the header and the strain/category lists come from
        self.input_fname, the matrix itself is obtained through
        dbSNP2data.get_data_matrix() on self.data_table (the matrix stored in
        the input file is discarded) and the result is written to
        self.output_fname. The method blocks on raw_input() at the end.
        """
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        matrix_io = FilterStrainSNPMatrix()
        if not self.draw_only:
            connection, cursor = db_connect(self.hostname, self.dbname,
                                            self.schema)
            col_name_ls, strain_accs, categories, matrix = matrix_io.read_data(
                self.input_fname)

            # header[2:] is taken as the list of SNP accessions
            snp_accs = col_name_ls[2:]
            strain_id2index = self.get_id2index(cursor, self.strain_info_table,
                                                strain_accs)
            snp_id2index = self.get_id2index(cursor, self.snp_locus_table,
                                             snp_accs)

            from dbSNP2data import dbSNP2data
            fetcher = dbSNP2data(report=self.report)
            matrix = fetcher.get_data_matrix(cursor,
                                             strain_id2index,
                                             snp_id2index,
                                             nt2number,
                                             self.data_table,
                                             need_heterozygous_call=1)

            matrix_io.write_data_matrix(matrix, self.output_fname, col_name_ls,
                                        strain_accs, categories)
        else:
            col_name_ls, strain_accs, categories, matrix = matrix_io.read_data(
                self.output_fname)
            matrix = Numeric.array(matrix)

        hetero_matrix, coarse_matrix = self.get_heterozygous_and_coarse_data_matrix(
            matrix)
        self.displayDataMatrix(
            hetero_matrix,
            title='heterozygous_data_matrix, 5-10=hetero, else=0')
        self.displayDataMatrix(
            coarse_matrix, title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero')
        # keeps the process (and whatever displayDataMatrix opened) alive
        # until the user hits enter
        raw_input("enter")
Exemplo n.º 14
0
    def run(self):
        """
        Compare the accession X SNP matrix (labelled PCR in the report)
        against the ecotype X SNP matrix (labelled sequenom) and report the
        differences.

        Steps, in order:
          1. build the SNP and accession/ecotype lookup structures from the
             database (setup_SNP_dstruc, setup_accession_ecotype_dstruc);
          2. fetch the ecotype X SNP and accession X SNP matrices and write
             them to self.sub_justin_output_fname / self.output_fname when
             those are set;
          3. print the four summary diff matrices to stdout;
          4. when self.latex_output_fname is set, write a LaTeX report made
             of a summary section, the list of real mismatches, a
             strain-wise section and a SNP-wise section. Tables are labelled
             table_dm0, table_dm1, ... in the order they are written.

        When self.debug is set, execution drops into pdb right after the
        database connection is opened.
        """
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname)
        curs = conn.cursor()
        if self.debug:
            import pdb
            pdb.set_trace()
        nt_number2diff_matrix_index = self.get_nt_number2diff_matrix_index(
            nt2number)
        SNPpos2col_index, snpid2col_index, snp_acc_ls, snp_index2snp_info_ls = self.setup_SNP_dstruc(
            curs, self.snp_locus_table)
        ecotype_id2accession_id, ecotype_id2row_index, ecotype_id2info_ls, ecotype_id_ls, accession_id2row_index, accession_id_ls, accession_id2ecotype_id_ls = self.setup_accession_ecotype_dstruc(
            curs, self.accession2ecotype_table, self.ecotype_table,
            self.calls_table)

        ecotype_X_snp_matrix, ecotype_X_snp_matrix_touched = self.get_ecotype_X_snp_matrix(
            curs, ecotype_id2row_index, snpid2col_index, self.calls_table)
        if self.sub_justin_output_fname:
            header = ['ecotype_id', 'ecotype_id'] + snp_acc_ls
            FilterStrainSNPMatrix_instance.write_data_matrix(
                ecotype_X_snp_matrix, self.sub_justin_output_fname, header,
                ecotype_id_ls, ecotype_id_ls)

        alignment_id2positions_to_be_checked_ls, alignment_id2start = self.get_alignment_id2positions_to_be_checked_ls(
            curs, self.alignment_table)
        accession_X_snp_matrix, accession_X_snp_matrix_touched, snp_index2alignment_id = self.get_accession_X_snp_matrix(
            curs, accession_id2row_index, SNPpos2col_index,
            self.sequence_table, self.alignment_table,
            alignment_id2positions_to_be_checked_ls)

        if self.output_fname:
            header = ['accession_id', 'accession_id'] + snp_acc_ls
            FilterStrainSNPMatrix_instance.write_data_matrix(
                accession_X_snp_matrix, self.output_fname, header,
                accession_id_ls, accession_id_ls)
        summary_diff_matrix_ls, diff_details_ls = self.cmp_two_matricies(
            accession_X_snp_matrix,
            accession_X_snp_matrix_touched,
            ecotype_X_snp_matrix,
            ecotype_X_snp_matrix_touched,
            nt_number2diff_matrix_index,
            ecotype_id2accession_id,
            ecotype_id2row_index,
            accession_id2row_index,
            diff_details_ls_type=2)

        # names printed above each of the four summary matrices, in order
        summary_diff_matrix_name_ls = [
            "diff_matrix_touched_accession_vs_touched_ecotype",
            "diff_matrix_touched_accession_vs_untouched_ecotype",
            "diff_matrix_untouched_accession_vs_touched_ecotype",
            "diff_matrix_untouched_accession_vs_untouched_ecotype"
        ]
        for matrix_index, matrix_name in enumerate(
                summary_diff_matrix_name_ls):
            print(matrix_name)
            print(summary_diff_matrix_ls[matrix_index])

        summary_diff_matrix_caption_ls = [
            'PCR-tried vs sequenom-tried', 'PCR-tried vs sequenom-untried',
            'PCR-untried vs sequenom-tried', 'PCR-untried vs sequenom-untried'
        ]

        if not self.latex_output_fname:
            return

        # imported once up front so every section below can rely on it
        from pymodule.latex import outputMatrixInLatexTable
        outf = open(self.latex_output_fname, 'w')
        try:
            # running table counter: each table written below gets the next
            # number, so no two tables share a label
            table_no = -1

            outf.write(
                '\\section{2010 PCR versus sequenom. summary} \\label{section_summary}\n'
            )
            for i, summary_diff_matrix in enumerate(summary_diff_matrix_ls):
                wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(
                    summary_diff_matrix)
                table_no += 1
                table_label = 'table_dm%s' % table_no
                outf.write(
                    outputMatrixInLatexTable(wrapped_diff_matrix,
                                             summary_diff_matrix_caption_ls[i],
                                             table_label))

            #output the whole diff_details_ls
            outf.write(
                '\\section{Real Mismatches between pcr and sequenom (deletion/NA excluded)} \\label{section_real_mismatch}\n'
            )
            diff_details_ls = self.beautify_snp_diff_details_ls(
                diff_details_ls, ecotype_id2info_ls, snp_index2snp_info_ls,
                alignment_id2start, snp_index2alignment_id)
            table_no += 1
            table_label = 'table_dm%s' % table_no
            caption = 'mismatches between pcr and sequenom data (deletion/NA excluded, sorted by accession id)'
            outf.write(
                outputMatrixInLatexTable(diff_details_ls,
                                         caption,
                                         table_label,
                                         header_ls=[
                                             'nativename', 'stkparent',
                                             'ecotype_id', 'duplicate',
                                             'accession_id', 'SNP',
                                             'chromosome', 'position',
                                             'alignment_id', 'alignment_start',
                                             'pcr_call', 'sequenom_call'
                                         ]))

            #Strain-wise comparison
            outf.write(
                '\\section{2010 PCR versus sequenom for each strain} \\label{section_strain_wise}\n'
            )
            accession_id_ls.sort()
            for accession_id in accession_id_ls:
                matched_ecotype_id_ls = accession_id2ecotype_id_ls[
                    accession_id]
                # the subsection is titled after the first matched ecotype
                outf.write(
                    '\\subsection{strain %s(accession id=%s)}\n' %
                    (ecotype_id2info_ls[matched_ecotype_id_ls[0]][0],
                     accession_id))
                for ecotype_id in matched_ecotype_id_ls:
                    # ecotype_id is indexed as (ecotype id, duplicate)
                    ecotype_info = ecotype_id2info_ls[ecotype_id]
                    outf.write(
                        '\\subsubsection{corresponding ecotype %s(stkparent=%s, ecotype id=%s, duplicate=%s)}\n'
                        % (ecotype_info[0], ecotype_info[1], ecotype_id[0],
                           ecotype_id[1]))
                    e_row_index = ecotype_id2row_index[ecotype_id]
                    a_row_index = accession_id2row_index[accession_id]

                    diff_matrix_ls, strain_diff_details_ls = self.cmp_two_lists(
                        accession_X_snp_matrix[a_row_index, :],
                        accession_X_snp_matrix_touched[a_row_index, :],
                        ecotype_X_snp_matrix[e_row_index, :],
                        ecotype_X_snp_matrix_touched[e_row_index, :],
                        nt_number2diff_matrix_index)
                    wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(
                        diff_matrix_ls[0])
                    table_no += 1
                    table_label = 'table_dm%s' % table_no
                    caption = 'accession id=%s vs ecotype id=%s, duplicate=%s(nativename=%s, stockparent=%s)' % (
                        accession_id, ecotype_id[0], ecotype_id[1],
                        ecotype_info[0], ecotype_info[1])
                    outf.write(
                        outputMatrixInLatexTable(wrapped_diff_matrix, caption,
                                                 table_label))

                    if strain_diff_details_ls:
                        strain_diff_details_ls = self.beautify_diff_details_ls(
                            strain_diff_details_ls, snp_index2snp_info_ls,
                            alignment_id2start, snp_index2alignment_id)
                        table_no += 1
                        table_label = 'table_dm%s' % table_no
                        caption = 'detailed difference for accession id=%s vs ecotype id=%s, duplicate=%s' % (
                            accession_id, ecotype_id[0], ecotype_id[1])
                        outf.write(
                            outputMatrixInLatexTable(
                                strain_diff_details_ls,
                                caption,
                                table_label,
                                header_ls=[
                                    'snp', 'chromosome', 'position',
                                    'alignment_id', 'alignment_start',
                                    'pcr_call', 'sequenom_call'
                                ]))

            #SNP-wise comparison
            outf.write(
                '\\section{2010 PCR versus sequenom for each SNP} \\label{section_snp_wise}\n'
            )
            for snp_column in range(accession_X_snp_matrix.shape[1]):
                snp_acc, chromosome, position = snp_index2snp_info_ls[
                    snp_column]
                alignment_id = snp_index2alignment_id[snp_column]
                alignment_start = alignment_id2start[alignment_id]
                outf.write(
                    '\\subsection{SNP %s(chrom=%s, pos=%s, alignment id=%s, alignment start=%s)}\n'
                    % (snp_acc, chromosome, position, alignment_id,
                       alignment_start))

                diff_matrix_ls, snp_diff_details_ls = self.cmp_two_matricies(
                    accession_X_snp_matrix,
                    accession_X_snp_matrix_touched,
                    ecotype_X_snp_matrix,
                    ecotype_X_snp_matrix_touched,
                    nt_number2diff_matrix_index,
                    ecotype_id2accession_id,
                    ecotype_id2row_index,
                    accession_id2row_index,
                    snp_column=snp_column,
                    diff_details_ls_type=1)
                wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(
                    diff_matrix_ls[0])
                table_no += 1
                table_label = 'table_dm%s' % table_no
                caption = 'SNP %s(chromosome=%s, position=%s, alignment id=%s, alignment start=%s)' % (
                    snp_acc, chromosome, position, alignment_id,
                    alignment_start)
                outf.write(
                    outputMatrixInLatexTable(wrapped_diff_matrix, caption,
                                             table_label))

                if snp_diff_details_ls:
                    snp_diff_details_ls = self.beautify_snp_diff_details_ls(
                        snp_diff_details_ls, ecotype_id2info_ls)
                    table_no += 1
                    table_label = 'table_dm%s' % table_no
                    caption = 'detailed difference for SNP %s' % (snp_acc, )
                    header_ls = [
                        'nativename', 'stkparent', 'ecotype_id', 'duplicate',
                        'accession_id', 'pcr_call', 'sequenom_call'
                    ]
                    outf.write(
                        outputMatrixInLatexTable(snp_diff_details_ls, caption,
                                                 table_label, header_ls))
        finally:
            # flush and release the report even if a section fails
            outf.close()
Exemplo n.º 15
0
	def run(self):
		"""
		Compare the PCR (accession) SNP calls with the sequenom (ecotype) SNP calls.

			--MySQLdb.connect(db=self.dbname, host=self.hostname)
			--get_nt_number2diff_matrix_index()
			--setup_SNP_dstruc()
			--setup_accession_ecotype_dstruc()
			--get_ecotype_X_snp_matrix()
			if self.sub_justin_output_fname:
				--FilterStrainSNPMatrix_instance.write_data_matrix()
			--get_alignment_id2positions_to_be_checked_ls()
			--get_accession_X_snp_matrix()
			if self.output_fname:
				--FilterStrainSNPMatrix_instance.write_data_matrix()
			--cmp_two_matricies()	(the four summary matrices are printed)
			if self.latex_output_fname:
				--write a LaTeX report: summary tables, the full mismatch table,
				  one subsection per strain and one subsection per SNP

		Every table in the LaTeX report is labelled 'table_dm<N>' with a distinct N,
		and the report file is closed even if writing it raises.
		"""
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
		
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname)
		curs = conn.cursor()
		if self.debug:
			import pdb
			pdb.set_trace()
		nt_number2diff_matrix_index = self.get_nt_number2diff_matrix_index(nt2number)
		SNPpos2col_index, snpid2col_index, snp_acc_ls, snp_index2snp_info_ls = self.setup_SNP_dstruc(curs, self.snp_locus_table)
		ecotype_id2accession_id, ecotype_id2row_index, ecotype_id2info_ls, ecotype_id_ls, accession_id2row_index, accession_id_ls, accession_id2ecotype_id_ls = self.setup_accession_ecotype_dstruc(curs, self.accession2ecotype_table, self.ecotype_table, self.calls_table)
		
		ecotype_X_snp_matrix, ecotype_X_snp_matrix_touched = self.get_ecotype_X_snp_matrix(curs, ecotype_id2row_index, snpid2col_index, self.calls_table)
		if self.sub_justin_output_fname:
			header = ['ecotype_id', 'ecotype_id'] + snp_acc_ls
			FilterStrainSNPMatrix_instance.write_data_matrix(ecotype_X_snp_matrix, self.sub_justin_output_fname, header, ecotype_id_ls, ecotype_id_ls)
		
		alignment_id2positions_to_be_checked_ls, alignment_id2start = self.get_alignment_id2positions_to_be_checked_ls(curs, self.alignment_table)
		accession_X_snp_matrix, accession_X_snp_matrix_touched, snp_index2alignment_id= self.get_accession_X_snp_matrix(curs, accession_id2row_index, SNPpos2col_index, self.sequence_table, self.alignment_table, alignment_id2positions_to_be_checked_ls)
		
		if self.output_fname:
			header = ['accession_id', 'accession_id'] + snp_acc_ls
			FilterStrainSNPMatrix_instance.write_data_matrix(accession_X_snp_matrix, self.output_fname, header, accession_id_ls, accession_id_ls)
		summary_diff_matrix_ls, diff_details_ls = self.cmp_two_matricies(accession_X_snp_matrix, accession_X_snp_matrix_touched, ecotype_X_snp_matrix, ecotype_X_snp_matrix_touched, nt_number2diff_matrix_index, ecotype_id2accession_id, ecotype_id2row_index, accession_id2row_index, diff_details_ls_type=2)
		
		#print each summary matrix under its name; indexing (not zip) keeps the IndexError if one is missing
		summary_diff_matrix_name_ls = ["diff_matrix_touched_accession_vs_touched_ecotype", "diff_matrix_touched_accession_vs_untouched_ecotype", "diff_matrix_untouched_accession_vs_touched_ecotype", "diff_matrix_untouched_accession_vs_untouched_ecotype"]
		for matrix_index, matrix_name in enumerate(summary_diff_matrix_name_ls):
			print(matrix_name)
			print(summary_diff_matrix_ls[matrix_index])
		
		summary_diff_matrix_caption_ls = ['PCR-tried vs sequenom-tried', 'PCR-tried vs sequenom-untried', 'PCR-untried vs sequenom-tried', 'PCR-untried vs sequenom-untried']
		
		if self.latex_output_fname:
			#imported once here instead of on every pass of the summary loop;
			#kept inside the branch so pymodule is only required when a report is requested
			from pymodule.latex import outputMatrixInLatexTable
			outf = open(self.latex_output_fname, 'w')
			try:
				outf.write('\\section{2010 PCR versus sequenom. summary} \\label{section_summary}\n')
				#table_no is always the number of the table written last; -1 means none yet
				table_no = -1
				for table_no, summary_diff_matrix in enumerate(summary_diff_matrix_ls):
					wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(summary_diff_matrix)
					table_label = 'table_dm%s'%table_no
					outf.write(outputMatrixInLatexTable(wrapped_diff_matrix, summary_diff_matrix_caption_ls[table_no], table_label))
				
				#output the whole diff_details_ls
				outf.write('\\section{Real Mismatches between pcr and sequenom (deletion/NA excluded)} \\label{section_real_mismatch}\n')
				diff_details_ls = self.beautify_snp_diff_details_ls(diff_details_ls, ecotype_id2info_ls, snp_index2snp_info_ls, alignment_id2start, snp_index2alignment_id)
				#advance the counter before use: reusing the last summary table's number
				#would define the same LaTeX label twice
				table_no += 1
				table_label = 'table_dm%s'%table_no
				caption = 'mismatches between pcr and sequenom data (deletion/NA excluded, sorted by accession id)'
				outf.write(outputMatrixInLatexTable(diff_details_ls, caption, table_label, header_ls=['nativename', 'stkparent', 'ecotype_id', 'duplicate', 'accession_id', 'SNP', 'chromosome', 'position', 'alignment_id', 'alignment_start', 'pcr_call', 'sequenom_call']))
				
				#Strain-wise comparison
				outf.write('\\section{2010 PCR versus sequenom for each strain} \\label{section_strain_wise}\n')
				accession_id_ls.sort()
				for accession_id in accession_id_ls:
					matched_ecotype_id_ls = accession_id2ecotype_id_ls[accession_id]
					#the subsection title uses the name of the first ecotype mapped to this accession
					outf.write('\\subsection{strain %s(accession id=%s)}\n'%(ecotype_id2info_ls[matched_ecotype_id_ls[0]][0], accession_id))
					a_row_index = accession_id2row_index[accession_id]
					for ecotype_id in matched_ecotype_id_ls:
						#ecotype_id is indexed as (ecotype id, duplicate); its info entry as (nativename, stkparent, ...)
						nativename = ecotype_id2info_ls[ecotype_id][0]
						stkparent = ecotype_id2info_ls[ecotype_id][1]
						outf.write('\\subsubsection{corresponding ecotype %s(stkparent=%s, ecotype id=%s, duplicate=%s)}\n'%(nativename, stkparent, ecotype_id[0], ecotype_id[1]))
						e_row_index = ecotype_id2row_index[ecotype_id]
						
						diff_matrix_ls, diff_details_ls= self.cmp_two_lists(accession_X_snp_matrix[a_row_index,:], accession_X_snp_matrix_touched[a_row_index,:], ecotype_X_snp_matrix[e_row_index,:], ecotype_X_snp_matrix_touched[e_row_index,:], nt_number2diff_matrix_index)
						wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(diff_matrix_ls[0])
						table_no += 1
						table_label = 'table_dm%s'%table_no
						caption = 'accession id=%s vs ecotype id=%s, duplicate=%s(nativename=%s, stockparent=%s)'%(accession_id, ecotype_id[0], ecotype_id[1], nativename, stkparent)
						outf.write(outputMatrixInLatexTable(wrapped_diff_matrix, caption, table_label))
						
						if diff_details_ls:
							diff_details_ls = self.beautify_diff_details_ls(diff_details_ls, snp_index2snp_info_ls, alignment_id2start, snp_index2alignment_id)
							table_no += 1
							table_label = 'table_dm%s'%table_no
							caption = 'detailed difference for accession id=%s vs ecotype id=%s, duplicate=%s'%(accession_id, ecotype_id[0], ecotype_id[1])
							outf.write(outputMatrixInLatexTable(diff_details_ls, caption, table_label, header_ls=['snp', 'chromosome', 'position', 'alignment_id', 'alignment_start', 'pcr_call', 'sequenom_call']))
				
				#SNP-wise comparison
				outf.write('\\section{2010 PCR versus sequenom for each SNP} \\label{section_snp_wise}\n')
				for snp_column in range(accession_X_snp_matrix.shape[1]):
					snp_acc, chromosome, position = snp_index2snp_info_ls[snp_column]
					alignment_id = snp_index2alignment_id[snp_column]
					alignment_start = alignment_id2start[alignment_id]
					outf.write('\\subsection{SNP %s(chrom=%s, pos=%s, alignment id=%s, alignment start=%s)}\n'%(snp_acc, chromosome, position, alignment_id, alignment_start))
					
					diff_matrix_ls, diff_details_ls = self.cmp_two_matricies(accession_X_snp_matrix, accession_X_snp_matrix_touched, ecotype_X_snp_matrix, ecotype_X_snp_matrix_touched, nt_number2diff_matrix_index, ecotype_id2accession_id, ecotype_id2row_index, accession_id2row_index, snp_column=snp_column, diff_details_ls_type=1)
					wrapped_diff_matrix = self.wrap_diff_matrix_with_row_col_names(diff_matrix_ls[0])
					table_no += 1
					table_label = 'table_dm%s'%table_no
					caption = 'SNP %s(chromosome=%s, position=%s, alignment id=%s, alignment start=%s)'%(snp_acc, chromosome, position, alignment_id, alignment_start)
					outf.write(outputMatrixInLatexTable(wrapped_diff_matrix, caption, table_label))
					
					if diff_details_ls:
						diff_details_ls = self.beautify_snp_diff_details_ls(diff_details_ls, ecotype_id2info_ls)
						table_no += 1
						table_label = 'table_dm%s'%table_no
						caption = 'detailed difference for SNP %s'%(snp_acc)
						header_ls = ['nativename', 'stkparent', 'ecotype_id', 'duplicate', 'accession_id', 'pcr_call', 'sequenom_call']
						outf.write(outputMatrixInLatexTable(diff_details_ls, caption, table_label, header_ls))
			finally:
				#flush and release the report file whether or not the report completed
				outf.close()
Exemplo n.º 16
0
	def run(self):
		"""
		2007-03-29
		2007-04-03
		2007-05-01
		Compare the 2010 SNP matrix with the matching rows of the input SNP matrix.

			--FilterStrainSNPMatrix.read_data(self.input_fname)
			if self.comparison_only:
				--FilterStrainSNPMatrix.read_data(self.output_fname)
			else:
				--db_connect()
				--get_SNPpos2index()
				--create_SNP_matrix_2010()
				--get_mapping_info_regarding_strain_acc()
				--shuffle_data_matrix_according_to_strain_acc_ls()
				--FilterStrainSNPMatrix.write_data_matrix(self.output_fname)

			--extract_sub_data_matrix()
			if self.sub_justin_output_fname:
				--FilterStrainSNPMatrix.write_data_matrix()
			--compare_two_SNP_matrix()
			if self.diff_output_fname:
				--outputDiffType()
			--print the per-tag counters and show the difference matrix with pylab
			--CmpAccession2Ecotype.cmp_two_matricies(), result printed
		"""
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		snp_matrix_io = FilterStrainSNPMatrix()
		header, src_strain_acc_list, category_list, data_matrix = snp_matrix_io.read_data(self.input_fname)
		if not self.comparison_only:
			#build the 2010 matrix from the alignment data and save it to self.output_fname
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			SNPpos2index = self.get_SNPpos2index(curs, header[2:], self.snp_locus_table)
			abbr_name_ls, SNP_matrix_2010 = self.create_SNP_matrix_2010(SNPpos2index, self.data_dir_2010)
			strain_acc_ls, strain_acc2abbr_name, strain_acc2index = self.get_mapping_info_regarding_strain_acc(curs, self.strain_info_table, self.strain_info_2010_table, abbr_name_ls)
			SNP_matrix_2010_sorted = self.shuffle_data_matrix_according_to_strain_acc_ls(SNP_matrix_2010, strain_acc_ls, strain_acc2index)
			#abbreviated names listed in the same order as strain_acc_ls
			abbr_name_ls_sorted = [strain_acc2abbr_name[acc] for acc in strain_acc_ls]
			snp_matrix_io.write_data_matrix(SNP_matrix_2010_sorted, self.output_fname, header, strain_acc_ls, abbr_name_ls_sorted)
		else:
			#reload a 2010 matrix from self.output_fname; note this replaces header too
			header, strain_acc_ls, abbr_name_ls_sorted, SNP_matrix_2010_sorted = snp_matrix_io.read_data(self.output_fname)
			SNP_matrix_2010_sorted = Numeric.array(SNP_matrix_2010_sorted)
		
		#comparison
		data_matrix = Numeric.array(data_matrix)
		sub_data_matrix = self.extract_sub_data_matrix(src_strain_acc_list, data_matrix, strain_acc_ls)
		if self.sub_justin_output_fname:
			snp_matrix_io.write_data_matrix(sub_data_matrix, self.sub_justin_output_fname, header, strain_acc_ls, abbr_name_ls_sorted)
		diff_matrix, diff_tag_dict, diff_tag2counter = self.compare_two_SNP_matrix(SNP_matrix_2010_sorted, sub_data_matrix)
		if self.diff_output_fname:
			self.outputDiffType(diff_matrix, SNP_matrix_2010_sorted, sub_data_matrix, diff_tag_dict, self.diff_type_to_be_outputted, abbr_name_ls_sorted, header[2:], self.diff_output_fname)
		
		#one 'tag(meaning):count' entry per difference tag, printed and reused as the plot title
		summary_result_ls = []
		for tag, counter in diff_tag2counter.iteritems():
			tag_fields = (tag, diff_tag_dict[tag], counter)
			summary_result_ls.append('%s(%s):%s'%tag_fields)
			print('\t%s(%s)\t%s'%tag_fields)
		
		import pylab
		pylab.clf()
		#rows are handed to imshow in reversed order
		diff_matrix_reverse = Numeric.array(list(diff_matrix)[::-1])
		pylab.imshow(diff_matrix_reverse, interpolation='nearest')
		pylab.title(' '.join(summary_result_ls))
		pylab.colorbar()
		pylab.show()
		
		#2007-11-01 run the same kind of comparison through CmpAccession2Ecotype
		from CmpAccession2Ecotype import CmpAccession2Ecotype
		accession_cmp = CmpAccession2Ecotype()
		nt_number2diff_matrix_index = accession_cmp.get_nt_number2diff_matrix_index(nt2number)
		row_index_ls = range(sub_data_matrix.shape[0])
		col_index_ls = range(sub_data_matrix.shape[1])
		#NOTE(review): one placeholder dict (row index -> column index) is passed for all three mapping arguments
		dc_placeholder = dict(zip(row_index_ls, col_index_ls))
		diff_matrix_ls = accession_cmp.cmp_two_matricies(SNP_matrix_2010_sorted, sub_data_matrix, nt_number2diff_matrix_index, dc_placeholder, dc_placeholder, dc_placeholder)
		print(diff_matrix_ls)
    def run(self):
        """
        2007-03-29
        2007-04-03
        2007-05-01
        Build or reload the 2010 SNP matrix, then compare it with the
        corresponding rows of the matrix read from self.input_fname.

            --read_data(self.input_fname)
            if self.comparison_only:
                --read_data(self.output_fname)
            else:
                --db_connect()
                --get_SNPpos2index()
                --create_SNP_matrix_2010()
                --get_mapping_info_regarding_strain_acc()
                --shuffle_data_matrix_according_to_strain_acc_ls()
                --write_data_matrix(self.output_fname)

            --extract_sub_data_matrix()
            if self.sub_justin_output_fname:
                --write_data_matrix(self.sub_justin_output_fname)
            --compare_two_SNP_matrix()
            if self.diff_output_fname:
                --outputDiffType()
            --print the tag counters, display the difference matrix
            --CmpAccession2Ecotype.cmp_two_matricies(), result printed
        """
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        matrix_file_handler = FilterStrainSNPMatrix()
        (header, src_strain_acc_list, category_list,
         data_matrix) = matrix_file_handler.read_data(self.input_fname)

        if self.comparison_only:
            # the 2010 matrix was written by an earlier run; header is
            # replaced by the one stored in that file
            (header, strain_acc_ls, abbr_name_ls_sorted,
             SNP_matrix_2010_sorted) = matrix_file_handler.read_data(
                 self.output_fname)
            SNP_matrix_2010_sorted = Numeric.array(SNP_matrix_2010_sorted)
        else:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            # extract data from alignment
            snp_acc_ls = header[2:]
            SNPpos2index = self.get_SNPpos2index(
                curs, snp_acc_ls, self.snp_locus_table)
            abbr_name_ls, SNP_matrix_2010 = self.create_SNP_matrix_2010(
                SNPpos2index, self.data_dir_2010)
            (strain_acc_ls, strain_acc2abbr_name,
             strain_acc2index) = self.get_mapping_info_regarding_strain_acc(
                 curs, self.strain_info_table, self.strain_info_2010_table,
                 abbr_name_ls)
            SNP_matrix_2010_sorted = self.shuffle_data_matrix_according_to_strain_acc_ls(
                SNP_matrix_2010, strain_acc_ls, strain_acc2index)
            # abbreviated names in the order given by strain_acc_ls
            abbr_name_ls_sorted = [
                strain_acc2abbr_name[strain_acc]
                for strain_acc in strain_acc_ls
            ]
            matrix_file_handler.write_data_matrix(
                SNP_matrix_2010_sorted, self.output_fname, header,
                strain_acc_ls, abbr_name_ls_sorted)

        # comparison
        data_matrix = Numeric.array(data_matrix)
        sub_data_matrix = self.extract_sub_data_matrix(
            src_strain_acc_list, data_matrix, strain_acc_ls)
        if self.sub_justin_output_fname:
            matrix_file_handler.write_data_matrix(
                sub_data_matrix, self.sub_justin_output_fname, header,
                strain_acc_ls, abbr_name_ls_sorted)
        (diff_matrix, diff_tag_dict,
         diff_tag2counter) = self.compare_two_SNP_matrix(
             SNP_matrix_2010_sorted, sub_data_matrix)
        if self.diff_output_fname:
            self.outputDiffType(
                diff_matrix, SNP_matrix_2010_sorted, sub_data_matrix,
                diff_tag_dict, self.diff_type_to_be_outputted,
                abbr_name_ls_sorted, header[2:], self.diff_output_fname)

        # each tag is reported on stdout and collected for the plot title
        summary_result_ls = []
        for tag, counter in diff_tag2counter.iteritems():
            tag_meaning = diff_tag_dict[tag]
            summary_result_ls.append('%s(%s):%s' % (tag, tag_meaning, counter))
            print('\t%s(%s)\t%s' % (tag, tag_meaning, counter))

        import pylab
        pylab.clf()
        # the matrix rows are passed to imshow in reversed order
        reversed_row_ls = list(diff_matrix)[::-1]
        diff_matrix_reverse = Numeric.array(reversed_row_ls)
        pylab.imshow(diff_matrix_reverse, interpolation='nearest')
        pylab.title(' '.join(summary_result_ls))
        pylab.colorbar()
        pylab.show()

        # 2007-11-01 do the same kind of comparison as CmpAccession2Ecotype.py
        from CmpAccession2Ecotype import CmpAccession2Ecotype
        cmp_helper = CmpAccession2Ecotype()
        nt_number2diff_matrix_index = cmp_helper.get_nt_number2diff_matrix_index(
            nt2number)
        no_of_rows = sub_data_matrix.shape[0]
        no_of_cols = sub_data_matrix.shape[1]
        # NOTE(review): the same row-index -> column-index dict fills all
        # three mapping arguments of cmp_two_matricies()
        dc_placeholder = dict(zip(range(no_of_rows), range(no_of_cols)))
        diff_matrix_ls = cmp_helper.cmp_two_matricies(
            SNP_matrix_2010_sorted, sub_data_matrix,
            nt_number2diff_matrix_index, dc_placeholder, dc_placeholder,
            dc_placeholder)
        print(diff_matrix_ls)
Exemplo n.º 18
0
    def run(self):
        """
        2007-04-16
        Dispatch this MPI process to its role according to its rank.

        Rank 0 is the input node, the last rank is the output node, and every
        rank in between is a computing node.

        Before mpi_synchronize():
            rank 0
                --read_data(self.input_fname), db_connect()
                --send pickled strain_acc_list to the output node
                --get_chr_start_ls()
                --send pickled chr_start_ls, then data_matrix, to each computing node
            computing node
                --receive chr_start_ls, then data_matrix, from rank 0
            output node
                --receive strain_acc_list from rank 0

        After mpi_synchronize():
            rank 0          --input_node()
            computing node  --computing_node() with self.computing_node_handler
            output node     --output_node() with self.output_node_handler,
                              writing tab-delimited rows to self.output_fname
        """
        comm = self.communicator
        node_rank = comm.rank
        output_node_rank = comm.size - 1
        # exclude the 1st and last node
        free_computing_nodes = range(1, output_node_rank)
        is_input_node = node_rank == 0
        is_computing_node = node_rank in free_computing_nodes

        if is_input_node:
            matrix_reader = FilterStrainSNPMatrix()
            header, strain_acc_list, category_list, data_matrix = matrix_reader.read_data(self.input_fname)
            snp_acc_list = header[2:]
            data_matrix = Numeric.array(data_matrix)
            no_of_strains = data_matrix.shape[0]
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema, password="******", user="******")

            # 2007-09-17 send strain_acc_list to the output_node
            # (protocol -1 = highest pickle protocol available)
            comm.send(cPickle.dumps(strain_acc_list, -1), output_node_rank, 0)

            chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list, self.snp_locus_table)

            # pickle once, then send both messages to every computing node in a fixed order
            pickled_message_ls = [cPickle.dumps(chr_start_ls, -1), cPickle.dumps(data_matrix, -1)]
            for node in free_computing_nodes:
                for pickled_message in pickled_message_ls:
                    comm.send(pickled_message, node, 0)
        elif is_computing_node:
            # messages arrive in the order rank 0 sent them
            data, source, tag = comm.receiveString(0, 0)
            chr_start_ls = cPickle.loads(data)
            data, source, tag = comm.receiveString(0, 0)
            data_matrix = cPickle.loads(data)
        else:
            data, source, tag = comm.receiveString(0, 0)
            strain_acc_list = cPickle.loads(data)

        mpi_synchronize(comm)

        if is_input_node:
            self.input_node(comm, [no_of_strains], free_computing_nodes, self.message_size, self.report)
        elif is_computing_node:
            # three different ways to pick the parent-set and the child:
            # the rotations [0, 1, 2], [1, 2, 0], [2, 0, 1]
            trio_arrangement_ls = [[(first + offset) % 3 for offset in range(3)] for first in range(3)]
            computing_node(
                comm,
                [data_matrix, chr_start_ls, trio_arrangement_ls],
                self.computing_node_handler,
                report=self.report,
            )
        else:
            writer = csv.writer(open(self.output_fname, "w"), delimiter="\t")
            output_node(comm, free_computing_nodes, [writer, strain_acc_list], self.output_node_handler, self.report)
            del writer
Exemplo n.º 19
0
    def run(self):
        """
        2007-04-16
        Run the role that matches this process' MPI rank.

            (rank==0)
                --read_data(self.input_fname)
                --db_connect()
                --send strain_acc_list to the last node
                --get_chr_start_ls()
                --send chr_start_ls and data_matrix to each computing node
            elif rank in free_computing_nodes:
                --receive chr_start_ls and data_matrix
            else (the last node):
                --receive strain_acc_list

            --mpi_synchronize()

            (rank==0)
                --input_node()
            elif rank in free_computing_nodes:
                --computing_node()
                    --computing_node_handler()
            else:
                --output_node()
                    --output_node_handler()
        """
        node_rank = self.communicator.rank
        last_node_rank = self.communicator.size - 1
        #exclude the 1st and last node
        free_computing_nodes = range(1, last_node_rank)

        if node_rank == 0:
            snp_matrix_reader = FilterStrainSNPMatrix()
            (header, strain_acc_list, category_list,
             data_matrix) = snp_matrix_reader.read_data(self.input_fname)
            snp_acc_list = header[2:]
            data_matrix = Numeric.array(data_matrix)
            no_of_strains = data_matrix.shape[0]
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema,
                                      password='******', user='******')

            #2007-09-17 send strain_acc_list to the output_node
            strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1)
            self.communicator.send(strain_acc_list_pickle, last_node_rank, 0)

            chr_start_ls = self.get_chr_start_ls(
                curs, snp_acc_list, self.snp_locus_table)

            #-1 means use the highest protocol
            chr_start_ls_pickle = cPickle.dumps(chr_start_ls, -1)
            data_matrix_pickle = cPickle.dumps(data_matrix, -1)
            #each computing node gets chr_start_ls first, data_matrix second
            for computing_rank in free_computing_nodes:
                self.communicator.send(chr_start_ls_pickle, computing_rank, 0)
                self.communicator.send(data_matrix_pickle, computing_rank, 0)
        elif node_rank in free_computing_nodes:
            #unpickle in the same order the input node sent the messages
            pickled, source, tag = self.communicator.receiveString(0, 0)
            chr_start_ls = cPickle.loads(pickled)
            pickled, source, tag = self.communicator.receiveString(0, 0)
            data_matrix = cPickle.loads(pickled)
        else:
            pickled, source, tag = self.communicator.receiveString(0, 0)
            strain_acc_list = cPickle.loads(pickled)

        mpi_synchronize(self.communicator)

        if node_rank == 0:
            parameter_list = [no_of_strains]
            self.input_node(self.communicator, parameter_list,
                            free_computing_nodes, self.message_size,
                            self.report)
        elif node_rank in free_computing_nodes:
            #three different ways to pick the parent-set and the child
            trio_arrangement_ls = [
                [0, 1, 2],
                [1, 2, 0],
                [2, 0, 1],
            ]
            parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls]
            computing_node(self.communicator, parameter_list,
                           self.computing_node_handler, report=self.report)
        else:
            output_file = open(self.output_fname, 'w')
            writer = csv.writer(output_file, delimiter='\t')
            #drop the extra local reference so that writer alone keeps the file alive, as before
            del output_file
            parameter_list = [writer, strain_acc_list]
            output_node(self.communicator, free_computing_nodes,
                        parameter_list, self.output_node_handler, self.report)
            del writer