def potatoSimulate(args):
    """Load both genotype databases and hand them to ``simulateSNPs``.

    ``args`` is a mapping read for the keys ``hdf5File``, ``hdf5accFile``,
    ``AccID``, ``numSNPs``, ``outFile`` and ``err_rate``.
    """
    log.info("loading database files")
    # Load in the same order as before: SNP database first, then the
    # accession-level database.
    snp_db = genotype.load_hdf5_genotype_data(args['hdf5File'])
    acc_db = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
    log.info("done!")

    acc_id = args['AccID']
    num_snps = args['numSNPs']
    out_file = args['outFile']
    err_rate = args['err_rate']
    simulateSNPs(snp_db, acc_db, acc_id, num_snps, out_file, err_rate)
    log.info("finished!")
def potatoCrossIdentifier(args):
    """Parse the input SNPs, load both genotype databases and run ``crossIdentifier``.

    ``args`` is a mapping read for the keys ``inFile``, ``logDebug``,
    ``hdf5File``, ``hdf5accFile``, ``binLen`` and ``outFile``.
    """
    parsed = snpmatch.parseInput(inFile=args['inFile'], logDebug=args['logDebug'])
    # The parsed genotype calls (third element) are not needed by the
    # cross identifier; only chromosome, position, weights and mean depth.
    snpCHR, snpPOS, _snpGT, snpWEI, DPmean = parsed

    log.info("loading genotype files!")
    snp_db = genotype.load_hdf5_genotype_data(args['hdf5File'])
    acc_db = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
    log.info("done!")

    log.info("running cross identifier!")
    bin_len = args['binLen']
    crossIdentifier(bin_len, snpCHR, snpPOS, snpWEI, DPmean,
                    snp_db, acc_db, args['outFile'])
    log.info("finished!")
def _weighted_allele_matches(db_snps, allele, weights):
    """Return, per accession, the weighted count of sites whose call equals ``allele``.

    ``db_snps`` is a (sites x accessions) array and ``weights`` holds one
    weight per site.
    """
    matches = np.array(db_snps == allele, dtype=int)
    return np.sum(np.multiply(matches.T, weights).T, axis=0)


def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
    """Score every database accession against the target SNP calls.

    For each chromosome listed in the database, target positions that also
    exist in the database are matched; each accession's score is the sum of
    the target weights for the genotype class the accession carries at that
    site. Results are written to ``<outFile>.scores.txt`` and
    ``<outFile>.matches.json``.

    Parameters
    ----------
    snpCHR, snpPOS, snpGT, snpWEI : array-like
        Per-SNP chromosome, position, genotype call and weights of the
        target sample. ``snpWEI`` is indexed as ``[site, column]`` with
        three columns.
    DPmean
        Passed through unchanged to ``print_out_table``.
    hdf5File, hdf5accFile : str
        Paths of the two HDF5 genotype databases.
    outFile : str or None
        Output prefix; a falsy value falls back to ``"genotyper"``.

    Returns
    -------
    tuple
        ``(ScoreList, NumInfoSites)``, one entry per database accession.
    """
    # Resolve the default prefix BEFORE its first use. Previously the
    # fallback was applied only after the scores file had been written, so
    # outFile=None raised TypeError and outFile="" produced ".scores.txt".
    if not outFile:
        outFile = "genotyper"

    NumSNPs = len(snpCHR)
    log.info("loading database files")
    GenotypeData = genotype.load_hdf5_genotype_data(hdf5File)
    # Not used below; still loaded so that a bad path fails here as before.
    GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile)
    log.info("done!")

    num_lines = len(GenotypeData.accessions)
    ScoreList = np.zeros(num_lines, dtype="float")
    NumInfoSites = np.zeros(num_lines, dtype="uint32")
    NumMatSNPs = 0
    overlappedInds = np.zeros(0, dtype=int)
    chunk_size = 1000
    # (weight column, database allele code) pairs, in the order the partial
    # scores are accumulated: column 0 <-> 0, column 1 <-> 2, column 2 <-> 1.
    weight_column_alleles = ((0, 0), (1, 2), (2, 1))

    for i in np.array(GenotypeData.chrs, dtype=int):
        perchrTarPos = np.where(snpCHR == i)[0]
        perchrtarSNPpos = snpPOS[perchrTarPos]
        log.info("Analysing chromosome %s positions", i)
        # chr_regions is indexed with i-1, i.e. chromosome ids are used as
        # 1-based indices into the region table.
        start = GenotypeData.chr_regions[i-1][0]
        end = GenotypeData.chr_regions[i-1][1]
        chrpositions = GenotypeData.positions[start:end]
        matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start
        matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0]
        matchedTarWei = snpWEI[perchrTarPos[matchedTarInd],]
        overlappedInds = np.append(overlappedInds, perchrTarPos[matchedTarInd])
        NumMatSNPs = NumMatSNPs + len(matchedAccInd)

        for j in range(0, len(matchedAccInd), chunk_size):
            t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size], :]
            chunk_weights = matchedTarWei[j:j+chunk_size]
            for column, allele in weight_column_alleles:
                ScoreList = ScoreList + _weighted_allele_matches(
                    t1001SNPs, allele, chunk_weights[:, column])
            # Informative sites = sites in the chunk minus negative (missing)
            # calls per accession. Counting `< 0` directly replaces the
            # masked-array detour and always yields a 1-D result; the old
            # single-site branch could turn NumInfoSites into a 2-D array.
            NumInfoSites = NumInfoSites + t1001SNPs.shape[0] - np.sum(t1001SNPs < 0, axis=0)
        log.info("Done analysing %s positions", NumMatSNPs)

    log.info("writing score file!")
    # Guard against an empty target set instead of raising ZeroDivisionError.
    overlap = float(NumMatSNPs) / NumSNPs if NumSNPs else 0.0
    print_out_table(outFile + '.scores.txt', GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
    print_topHits(outFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs)
    getHeterozygosity(snpGT[overlappedInds], outFile + ".matches.json")
    return (ScoreList, NumInfoSites)
def main():
    """Compute an IBS kinship matrix from an HDF5 genotype file and save it.

    Command-line arguments, in order: input genotype HDF5, scale flag
    (the string ``'1'`` enables scaling), output kinship HDF5 path and
    output kinship CSV path. A log file named ``<kinship HDF5>.log`` is
    requested from ``LoggerFactory``.
    """
    cli_args = sys.argv[1:]
    geno_in, scale_kinship, kinship_out_hdf5, kinship_out_csv = cli_args
    logger = LoggerFactory.get_logger(kinship_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, cli_args)

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    accession_ids = geno.accessions  # read here as before; not used further
    logger.info('Finished reading SNP from %s', geno_in)

    logger.info('Start calculating kinship')
    K = geno.get_ibs_kinship_matrix()
    wants_scaling = (scale_kinship == '1')
    if not wants_scaling:
        logger.info('NOT scaling')
    else:
        logger.info('Scaling')
        K = kinship.scale_k(K)

    logger.info('Saving kinship to HDF5 file %s', kinship_out_hdf5)
    kinship.save_kinship_to_file(kinship_out_hdf5, K, geno.accessions, geno.num_snps)

    logger.info('Saving kinship to CSV file %s', kinship_out_csv)
    save_kinship_in_text_format(kinship_out_csv, K, geno.accessions)
    logger.info('Done!')
def _load_genotype_(folder,genotype_id):
    """Load ``<folder>/<genotype_id>/all_chromosomes_binary.hdf5``.

    Raises ``Exception`` when that file does not exist.
    """
    genotype_dir = os.path.join(folder, str(genotype_id))
    file_name = 'all_chromosomes_%s.hdf5' % 'binary'
    candidate = os.path.join(genotype_dir, file_name)
    # Guard clause: bail out before touching the loader if nothing is there.
    if not os.path.isfile(candidate):
        raise Exception('No Genotype files in %s folder were found.' % genotype_dir)
    return genotype.load_hdf5_genotype_data(candidate)
def on_post(self, req, resp,genotype_id,chr,position):
    """Compute LD around one SNP and put the result on the request context.

    The genotype is read from
    ``<self.storage_path>/<genotype_id>/all_chromosomes_binary.hdf5``.
    The optional ``num_snps`` request parameter defaults to 250, and the
    accession list is taken from ``req.context['doc']`` (empty list when
    absent). The result is stored in ``req.context['result']`` after being
    passed through ``_replace_NaN``, and the response status is set to 200.
    """
    position = int(position)
    hdf5_path = '%s/%s/all_chromosomes_binary.hdf5' % (self.storage_path,genotype_id)
    genotypeData = genotype.load_hdf5_genotype_data(hdf5_path)
    num_snps = int(req.params.get('num_snps',250))
    accessions = req.context.get('doc',[])
    raw_ld = ld.calculate_ld_for_region(genotypeData,accessions,chr,position,num_snps=num_snps)
    #filter nan
    req.context['result'] = _replace_NaN(raw_ld)
    resp.status = falcon.HTTP_200
def main():
    """Subset a genotype file to listed accessions and filter SNPs by MAF.

    Command-line arguments, in order: input genotype HDF5, CSV file with
    accession ids, lower MAF bound, upper MAF bound and output genotype
    HDF5 path. A log file named ``<output>.log`` is requested from
    ``LoggerFactory``.
    """
    geno_in, acc_in, maf_lb, maf_ub, geno_out = sys.argv[1:]
    logger = LoggerFactory.get_logger(geno_out + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])
    maf_lb, maf_ub = float(maf_lb), float(maf_ub)

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    SNP_acc = list(geno.accessions)
    logger.info('Finished reading SNP from %s', geno_in)

    ## accession subset
    # Text mode: csv.reader needs str rows on Python 3 ('rb' yields bytes).
    # The reader yields one list per row, so the cells are flattened into a
    # set; comparing an accession id against whole rows, as before, does not
    # match plain string ids.
    with open(acc_in, 'r') as f:
        file_acc = set(cell.strip() for row in csv.reader(f) for cell in row)
    logger.info('Finished reading accession subset from %s', acc_in)

    ## get common accessions in the same order for genotype and accession subset
    acc_common = [acc for acc in SNP_acc if acc in file_acc]

    ## filtering
    logger.info(
        'Start subsetting accessions and filtering SNPs by MAF >%f and <=%f',
        maf_lb, maf_ub)
    # First-occurrence index of every accession (same result as list.index,
    # without rescanning the list for each lookup).
    first_index = {}
    for ix, acc in enumerate(SNP_acc):
        first_index.setdefault(acc, ix)
    geno.filter_accessions_ix([first_index[acc] for acc in acc_common])

    (num_snps, num_removed) = filter_maf_snps(geno, maf_lb, maf_ub)
    logger.info('Removed %d from %d SNPs', num_removed, num_snps)
    logger.info('Number of SNPs remaining %d', geno.num_snps)

    logger.info('Start writing filtered genotype file to %s', geno_out)
    geno.save_as_hdf5(geno_out)
    logger.info('Finished')
    logger.info('Done!')
type="string") inOptions.add_option("-r", "--rareAlleleFreq", dest="allelFreq", help="Allele frequency to consider as rare allele", default=0.05, type="float") #inOptions.add_option("-s", "--error_rate", dest="error", help="Maximum score which is considered to be for top hit accession", default=0.98, type="float") (options, args) = inOptions.parse_args() logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=logging.DEBUG) GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File) NumAcc = len(GenotypeData.accessions) snps = GenotypeData.get_snps_iterator(is_chunked=True, chunk_size=1000) chunk_i = 0 NumRareAllele = numpy.zeros(NumAcc) InfoPOS = numpy.zeros(NumAcc) logging.info("Starting the calculation") for snp in snps: chunk_i = chunk_i + 1 snps_array = numpy.array(snp) info_array = numpy.copy(snps_array) snps_array[snps_array == -1] = 0 info_array[info_array == 0] = 1 info_array[info_array == -1] = 0
type="float") inOptions.add_option("-i", "--input_vcf", dest="inFile", help="Input VCF file", type="string") inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string") inOptions.set_defaults(error=0.001, qual=100) (options, args) = inOptions.parse_args() inputVCFfile = open(options.inFile, 'r') GenotypeData = genotype.load_hdf5_genotype_data( '/lustre/scratch/users/rahul.pisupati/all_chromosomes_binary.hdf5') # Create a numpy array containing all the positions ScoreList = numpy.zeros(len(GenotypeData.accessions)) NumSNP = 0 CheckStatus = 0 for vcfLine in inputVCFfile.readlines()[0:]: if (vcfLine[0][0] != '#'): if (float(vcfLine.split()[5]) > options.qual and len(vcfLine.split()[3]) == 1 and len(vcfLine.split()[4]) == 1): dataSNPlist = getAllele(GenotypeData, vcfLine.split()[0].replace("Chr", ""), vcfLine.split()[1]) NumSNP += 1 if (dataSNPlist != '0'):
def crossGenotyper(args):
    """Assign each genomic window of a cross to one of its two parents.

    Inputs (read from the ``args`` mapping):
      1) ``inFile``: VCF file (possibly filtered) generated by GATK
      2) ``parents``: "Parent1xParent2" accession ids
      3) ``hdf5accFile``: SNP matrix (hdf5 file)
      4) ``binLen``: bin length (noted in the original as defaulting to 200Kbp)
      5) ``outFile``: path of the tab-separated per-window output
      plus ``logDebug``, forwarded to the input parser.

    For every window, the target calls at SNPs segregating between the
    parents are compared with parent 1, and one line is written:
    window number, sites matching parent 1, sites tested, call
    (0, 1, 0.5 or NA) and p-value (or NA).
    """
    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile=args['inFile'], logDebug=args['logDebug'])
    parents = args['parents']
    ## need to filter the SNPs present in C and M
    log.info("loading HDF5 file")
    GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
    ## die if either parents are not in the dataset
    try:
        parent_ids = parents.split("x")
        indP1 = np.where(GenotypeData_acc.accessions == parent_ids[0])[0][0]
        indP2 = np.where(GenotypeData_acc.accessions == parent_ids[1])[0][0]
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit are
        # not reported as "parents are not in the dataset".
        snpmatch.die("parents are not in the dataset")
    snpsP1 = GenotypeData_acc.snps[:, indP1]
    snpsP2 = GenotypeData_acc.snps[:, indP2]
    # identifying the segregating SNPs between the accessions
    # only selecting 0 or 1
    segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
    log.info("number of segregating snps between parents: %s", len(segSNPsind))
    (ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen'])
    log.info("number of bins: %s", len(ChrBins))

    # `with` guarantees the output file is closed even if a window fails
    # outside the guarded section below.
    with open(args['outFile'], 'w') as outfile:
        end = 0
        for i in range(len(PosBins)):
            # Running offset instead of re-summing PosBins[0:i] every window.
            start, end = end, end + PosBins[i]
            # first snp positions which are segregating and are in this window
            reqPOSind = segSNPsind[np.where((segSNPsind < end) & (segSNPsind >= start))[0]]
            reqPOS = GenotypeData_acc.positions[reqPOSind]
            perchrTarPosind = np.where(snpCHR == ChrBins[i])[0]
            perchrTarPos = snpPOS[perchrTarPosind]
            matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]]
            matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
            matchedTarGTs = snpGT[matchedTarInd]
            try:
                TarGTs = snpmatch.parseGT(matchedTarGTs)
                # Recode 2 to 4 so that it can never equal a parental 0/1
                # after the subtraction below.
                TarGTs[np.where(TarGTs == 2)[0]] = 4
                genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
                genP1no = len(np.where(genP1 == 0)[0])
                if len(genP1) > 0:
                    # NOTE(review): if `st` is scipy.stats, `binom_test` was
                    # removed in SciPy 1.12 in favour of
                    # `binomtest(...).pvalue` - confirm the pinned version.
                    pValP1 = st.binom_test(genP1no, len(genP1), 0.8, alternative="greater")
                    pValP2 = st.binom_test(len(genP1) - genP1no, len(genP1), 0.8, alternative="greater")
                    fracP1 = float(genP1no) / len(genP1)
                    if pValP1 < 0.05:
                        outfile.write("%s\t%s\t%s\t0\t%s\n" % (i + 1, genP1no, len(genP1), pValP1))
                    elif pValP2 < 0.05:
                        outfile.write("%s\t%s\t%s\t1\t%s\n" % (i + 1, genP1no, len(genP1), pValP2))
                    elif fracP1 >= 0.8 or fracP1 <= 0.2:
                        outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i + 1, genP1no, len(genP1)))
                    else:
                        outfile.write("%s\t%s\t%s\t0.5\tNA\n" % (i + 1, genP1no, len(genP1)))
                else:
                    outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i + 1, genP1no, len(genP1)))
            except Exception:
                # Best effort per window: any failure is reported as an
                # all-NA row, but interpreter-exit signals now propagate.
                outfile.write("%s\tNA\tNA\tNA\tNA\n" % (i + 1))
            if i % 10 == 0:
                log.info("progress: %s windows", i + 10)
        log.info("done!")
def geno():
    """Return the genotype loaded from ``all_chromosomes_binary.hdf5`` under ``resource_path``."""
    hdf5_path = '%s/all_chromosomes_binary.hdf5' % resource_path
    return genotype.load_hdf5_genotype_data(hdf5_path)
def main():
    """Run LMM association tests for a slice of genes.

    Command-line arguments, in order: genotype HDF5, kinship HDF5, RNA
    table (tab-separated, genes x accessions), covariate table
    (tab-separated), first gene index, end gene index (exclusive, used as a
    slice bound) and output file. Accession ids from the genotype and
    kinship files are prefixed with ``'X'`` before being compared with the
    RNA column names.
    """
    geno_hdf5, kinship_hdf5, RNA_csv, COV_file, RNA_start, RNA_end, out_file = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    logger = LoggerFactory.get_logger(out_file + '.log',
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])

    def match(a, b):
        """Index in ``b`` of each item of ``a`` (first occurrence), ``None`` if absent."""
        first_index = {}
        for ix, name in enumerate(b):
            first_index.setdefault(name, ix)
        return [first_index.get(x) for x in a]

    ## Import genotype data
    logger.info('Start reading SNP from %s', geno_hdf5)
    geno = genotype.load_hdf5_genotype_data(geno_hdf5)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', geno_hdf5)

    ## Import phenotype data
    # This message used to say "Finished" before the file had been read.
    logger.info('Start reading RNA from %s', RNA_csv)
    RNA_df = pd.read_csv(RNA_csv, sep='\t', header=0, index_col=0)  # genes x accessions
    RNA_accx = list(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_csv)

    ## get common accessions in the same order for genotype, phenotype and phenotype covariates
    logger.info('Consolidate accessions from genotype and RNA file')
    RNA_accx_set = set(RNA_accx)
    accx_common = [accx for accx in SNP_accx if accx in RNA_accx_set]
    # `.values` on a column selection replaces DataFrame.as_matrix(columns=...),
    # which was removed in pandas 1.0.
    RNA = RNA_df[accx_common].values.T  # accession x genes
    geno.filter_accessions_ix(match(accx_common, SNP_accx))
    logger.info(
        'Number of accessions: genotype file %d, RNA file %d, common %d',
        len(SNP_accx), len(RNA_accx), len(accx_common))

    logger.info('Start building SNP matrix in memory')
    # Materialise the chunks first: recent NumPy rejects a bare iterator here.
    snps = np.vstack(list(geno.get_snps_iterator(is_chunked=True)))
    snps = snps.T.astype(int)
    logger.info('Finished')

    logger.info('Start loading kinship matrix from %s', kinship_hdf5)
    load_k = kinship.load_kinship_from_file(kinship_hdf5, scaled=False)
    K0 = load_k['k'].astype(float)
    K_accx = ['X' + acc for acc in load_k['accessions']]
    K_order = match(accx_common, K_accx)
    K = K0[np.ix_(K_order, K_order)]
    logger.info('Finished')

    logger.info('Start loading covariance from %s', COV_file)
    COV_df = pd.read_csv(COV_file, sep='\t', header=0, index_col=0)
    # Rows are selected by accession label. `.loc[...].values` replaces the
    # removed `.ix[...].as_matrix()`; on current pandas a missing accession
    # raises KeyError instead of silently producing a NaN row.
    COV = COV_df.loc[accx_common].values
    logger.info('Finished')

    logger.info('Start association testing: RNA start %d, RNA end %d',
                RNA_start, RNA_end)
    run_lmm_chunk(snps, RNA, COV, RNA_start, RNA_end,
                  list(RNA_genes[RNA_start:RNA_end]), K, out_file)
def main():
    """Run per-gene LMM tests for the first genes of a hard-coded data set.

    All input and output locations are hard-coded below. SNPs are filtered
    with ``filter_maf_snps(geno, 0.01, 0.5)``; the kinship matrix is either
    recomputed (when ``'calc_kinship'`` is in ``step_list``) or loaded from
    ``kinship_maf1.hdf5`` in the output directory, then scaled. For each
    covariate file index ``k``, one CSV per gene is written to
    ``<out_dir>/gNorm_k<k>/<gene>.csv``.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    # Attach the console handler only once, so calling main() again does not
    # duplicate every log line.
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    def match(a, b):
        """Index in ``b`` of each item of ``a`` (first occurrence), ``None`` if absent."""
        first_index = {}
        for ix, name in enumerate(b):
            first_index.setdefault(name, ix)
        return [first_index.get(x) for x in a]

    # Add 'calc_kinship' to recompute and save the kinship matrix instead of
    # loading the previously saved one.
    step_list = [""]

    ## Output directory
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/marginal_test_cov_01'
    logger.info('Out dir %s', out_dir)

    ## Import genotype data
    SNP_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/SNP_matrix_imputed_hdf5/1001_SNP_MATRIX/imputed_snps_binary.hdf5'
    #SNP_file = '/gale/netapp/home/shhuang/data/1001_genomes/1001_250k_fullimputed/PYGWAS_GENOTYPES/1/all_chromosomes_binary.hdf5'
    geno = genotype.load_hdf5_genotype_data(SNP_file)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', SNP_file)

    ## Import phenotype data
    RNA_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01.txt'
    RNA_df = pd.read_csv(RNA_file, sep='\t', header=0, index_col=0)  # genes x accessions
    RNA_accx = set(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_file)

    ## get common accessions in the same order for genotype, phenotype and phenotype covariates
    accx_common = [accx for accx in SNP_accx if accx in RNA_accx]
    # `.values` on a column selection replaces DataFrame.as_matrix(columns=...),
    # which was removed in pandas 1.0.
    RNA = RNA_df[accx_common].values.T  # accession x genes

    ## filtering
    geno.filter_accessions_ix(match(accx_common, SNP_accx))
    #(num_snps,num_removed) = geno.filter_non_binary()
    filter_maf_snps(geno, 0.01, 0.5)
    # Materialise the chunks first: recent NumPy rejects a bare iterator here.
    snps = np.vstack(list(geno.get_snps_iterator(is_chunked=True)))
    snps = snps.T.astype(int)
    logger.info('Finished filtering SNPs')

    ## Kinship matrix: recompute and save, or load the saved one
    kinship_path = os.path.join(out_dir, 'kinship_maf1.hdf5')
    if 'calc_kinship' in step_list:
        K = geno.get_ibd_kinship_matrix()
        scaledK = kinship.scale_k(K).astype(float)
        kinship.save_kinship_to_file(kinship_path, K, geno.accessions,
                                     geno.num_snps)
    else:
        load_k = kinship.load_kinship_from_file(kinship_path, scaled=False)
        K = load_k['k'].astype(float)
        scaledK = kinship.scale_k(K)
    logger.info('Done with kinship file')

    logger.info('Start testing')
    # At most the first 5000 genes are tested; bounding by the gene count
    # avoids an IndexError on smaller expression tables.
    num_genes = min(5000, len(RNA_genes))
    for k in range(4, 5):
        ## Import phenotype covariates
        COV_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-gNorm_W_k%d.txt' % (k)
        COV_df = pd.read_csv(COV_file, sep='\t', header=0, index_col=0)
        # Rows are selected by accession label. `.loc[...].values` replaces
        # the removed `.ix[...].as_matrix()`; on current pandas a missing
        # accession raises KeyError instead of producing a NaN row.
        COV = COV_df.loc[accx_common].values
        logger.info('Cov file %s', COV_file)

        out_dir_k = os.path.join(out_dir, 'gNorm_k%d' % k)
        if not os.path.exists(out_dir_k):
            os.makedirs(out_dir_k)
        for i in range(0, num_genes):
            logger.debug('Gene %d', i)
            run_lmm(snps, RNA, COV, i, scaledK,
                    os.path.join(out_dir_k, '%s.csv' % RNA_genes[i]))
from pygwas.core import genotype #__________________________________________ inOptions = OptionParser() inOptions.add_option("-p", "--pos_file", dest="posFile", help="Position file removing the header from VCF", type="string") inOptions.add_option("-t", "--file_num_snps", dest="file_num_snps", help="Output from the CalculateSNPseachAcc.py script", type="string") inOptions.add_option("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file", type="string") inOptions.add_option("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file", type="string") inOptions.add_option("-o", "--output", dest="outFile", help="Output file with the probability scores", type="string") inOptions.add_option("-r", "--refScore", dest="refScore", help="Output for refined score", type="string") (options, args) = inOptions.parse_args() GenotypeData = genotype.load_hdf5_genotype_data(options.hdf5File) GenotypeData_acc = genotype.load_hdf5_genotype_data(options.hdf5accFile) # Create a numpy array containing all the positions targetSNPs = pandas.read_table(options.posFile, header=None) NumSNPs = len(targetSNPs) ScoreList = numpy.zeros(len(GenotypeData.accessions)) TotMatchedSNPind = numpy.zeros(0, dtype="uint32") NumMatSNPs = 0 for i in range(1,6): perchrtarSNPpos = targetSNPs[1][numpy.where(targetSNPs[0] == i)[0]] start = GenotypeData.chr_regions[i-1][0] end = GenotypeData.chr_regions[i-1][1] chrpositions = GenotypeData.positions[start:end] matchedSNPind = numpy.where(numpy.in1d(chrpositions, perchrtarSNPpos))[0] + start TotMatchedSNPind = numpy.append(TotMatchedSNPind, matchedSNPind)
def _genotypes_on_positions(all_positions, parent_positions, parent_gts):
    """Lay one parent's parsed genotypes out on ``all_positions``.

    ``all_positions`` must be sorted and unique and contain every entry of
    ``parent_positions``. Slots where the parent has no call are left at -1.
    """
    placed = np.repeat(-1, len(all_positions)).astype('int8')
    if len(parent_positions) > 0:
        # Every parent position is present in all_positions, so searchsorted
        # returns its exact slot regardless of the parent's own ordering.
        slots = np.searchsorted(all_positions, parent_positions)
        placed[slots] = parsers.parseGT(parent_gts)
    return placed


def crossGenotyper(args):
    """Collect both parents' genotypes and run the windowed cross genotyper.

    Inputs (read from the ``args`` mapping):
      1) ``inFile``: VCF file (possibly filtered) generated by GATK
      2) ``parents`` and ``father``: either two VCF/BED files with the
         parental genotypes, or, when ``father`` is None, ``parents`` as
         "Parent1xParent2" accession ids looked up in the HDF5 file
      3) ``hdf5accFile``: SNP matrix (hdf5 file), needed in the second case
      4) ``binLen``: bin length (noted in the original as defaulting to 200Kbp)
      5) ``outFile`` and ``logDebug``, forwarded to ``crossGenotypeWindows``
    """
    log.info("loading genotype data for parents")
    if args['father'] is not None:
        log.info("input files: %s and %s", args['parents'], args['father'])
        # Die when EITHER file is missing. The previous condition,
        # `not isfile(parents) and isfile(father)`, only fired when the first
        # file was missing and the second one existed.
        if not (os.path.isfile(args['parents']) and os.path.isfile(args['father'])):
            die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
        (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile=args['parents'], logDebug=args['logDebug'])
        (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile=args['father'], logDebug=args['logDebug'])
        commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
        commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
        commonSNPsPOS = np.zeros(0, dtype=int)
        snpsP1 = np.zeros(0, dtype='int8')
        snpsP2 = np.zeros(0, dtype='int8')
        for i in commonCHRs_ids:
            perchrP1inds = np.where(p1snpCHR == i)[0]
            perchrP2inds = np.where(p2snpCHR == i)[0]
            # Sorted, unique union of both parents' positions on this chromosome.
            perchrPositions = np.union1d(p1snpPOS[perchrP1inds], p2snpPOS[perchrP2inds])
            commonSNPsCHR = np.append(commonSNPsCHR, np.repeat(i, len(perchrPositions)))
            commonSNPsPOS = np.append(commonSNPsPOS, perchrPositions)
            # Keep snpsP1/snpsP2 aligned with commonSNPsPOS: one entry per
            # union position, -1 where that parent has no call. Previously
            # the -1-filled arrays were allocated but never used, and the
            # genotype arrays were indexed with per-chromosome-relative
            # indices, so lengths and chromosomes did not line up.
            snpsP1 = np.append(snpsP1, _genotypes_on_positions(
                perchrPositions, p1snpPOS[perchrP1inds], p1snpGT[perchrP1inds]))
            snpsP2 = np.append(snpsP2, _genotypes_on_positions(
                perchrPositions, p2snpPOS[perchrP2inds], p2snpGT[perchrP2inds]))
        log.info("done!")
    else:
        parents = args['parents']
        ## need to filter the SNPs present in C and M
        if not args['hdf5accFile']:
            snpmatch.die("needed a HDF5 genotype file and not specified")
        log.info("loading HDF5 file")
        g_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
        ## die if either parents are not in the dataset
        try:
            parent_ids = parents.split("x")
            indP1 = np.where(g_acc.accessions == parent_ids[0])[0][0]
            indP2 = np.where(g_acc.accessions == parent_ids[1])[0][0]
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # are not reported as a missing parent.
            snpmatch.die("parents are not in the dataset")
        snpsP1 = g_acc.snps[:, indP1]
        snpsP2 = g_acc.snps[:, indP2]
        commonSNPsCHR = np.array(g_acc.chromosomes)
        commonSNPsPOS = np.array(g_acc.positions)
        log.info("done!")
    log.info("running cross genotyper")
    crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2,
                         args['inFile'], args['binLen'], args['outFile'],
                         args['logDebug'])
#!/usr/bin/python import sys #These are the modules that are needed for this script # module load numpy # module use /net/gmi.oeaw.ac.at/software/shared/nordborg_common/modulefiles/ # module load pygwas import numpy from pygwas.core import genotype hdf5file = sys.argv[1] GenotypeData = genotype.load_hdf5_genotype_data(hdf5file) # Calculate the number of SNPs in all the accessions # Takes a really long time # Calculate the final probability based on the score count and total count #outfile = open("totalSNPsNum_1001genomes.txt", 'w') for i in range(0, len(GenotypeData.accessions)): # outScore = numpy.count_nonzero(GenotypeData.snps[:, i]) outScore = len(numpy.where(GenotypeData.snps[:, i] == 1)[0]) # outScore = len(numpy.where((GenotypeData.snps[:,i] == 1) | (GenotypeData.snps[:,i] == -1))[0]) print GenotypeData.accessions[i], "\t", outScore # print "Written count for", i+1, "accessions", "Accession:", GenotypeData.accessions[i], "Count:", outScore #outfile.close()
def geno():
    """Load and return the ``all_chromosomes_binary.hdf5`` genotype located in ``resource_path``."""
    genotype_file = '%s/all_chromosomes_binary.hdf5' % resource_path
    return genotype.load_hdf5_genotype_data(genotype_file)