def load_family_details(self, pheno_covar):
    """Load family data, updating the pheno_covar with family ids found.

    Reads `self.fam_details`, skipping the two leading metadata lines,
    and registers each individual that passes `DataParser.valid_indid`.

    :param pheno_covar: Phenotype/covariate object; receives one subject
        per valid individual and is frozen afterwards.
    :return: None
    """
    self.file_index = 0
    mask_components = []  # 1s indicate an individual is to be masked out
    # `with` guarantees the handle is closed (the original leaked it and
    # bound the builtins `file`/`format` as local names).
    with open(self.fam_details) as details:
        details.readline()  # header line -- intentionally discarded
        details.readline()  # format line -- intentionally discarded
        for line in details:
            words = line.strip().split()
            indid = ":".join(words[0:2])
            if DataParser.valid_indid(indid):
                mask_components.append(0)
                sex = int(words[5])
                pheno = float(words[6])
                pheno_covar.add_subject(indid, sex, pheno)
            else:
                mask_components.append(1)
    mask_components = numpy.array(mask_components)
    # Duplicate the mask into two columns -- one per allele column of the
    # genotype data this mask will later be applied to.
    self.ind_mask = numpy.zeros(len(mask_components) * 2,
                                dtype=numpy.int8).reshape(-1, 2)
    self.ind_mask[0:, 0] = mask_components
    self.ind_mask[0:, 1] = mask_components
    self.ind_count = self.ind_mask.shape[0]
    pheno_covar.freeze_subjects()
def load_fam(self, pheno_covar=None):
    """Load contents from the .fam file, updating the pheno_covar with
    family ids found.

    :param pheno_covar: Phenotype/covariate object (optional). When
        provided it receives one subject per valid individual and is
        frozen once loading completes.
    :return: None
    """
    logging.info("Loading file: %s" % (self.fam_file))
    # Column indices shift left as optional leading columns are absent.
    pheno_col = 5
    if not DataParser.has_sex:
        pheno_col -= 1
    if not DataParser.has_parents:
        pheno_col -= 2
    if not DataParser.has_fid:
        pheno_col -= 1
    sex_col = pheno_col - 1

    mask_components = []  # 1s indicate an individual is to be masked out
    # `with` closes the handle (the original left `open(...)` dangling).
    with open(self.fam_file) as fam:
        for line in fam:
            words = line.strip().split()
            if len(words) > 1:
                indid = ":".join(words[0:2])
                if DataParser.valid_indid(indid):
                    mask_components.append(0)
                    sex = None
                    pheno = None
                    if DataParser.has_sex:
                        sex = int(words[sex_col])
                    if DataParser.has_pheno:
                        pheno = float(words[pheno_col])
                    if pheno_covar is not None:
                        pheno_covar.add_subject(indid, sex, pheno)
                    # words has >= 2 entries here, so the original
                    # `if len(words) > 0` guard was always true.
                    self.families.append(words)
                else:
                    mask_components.append(1)
    # The original assigned numpy.zeros(...) to self.ind_mask and then
    # immediately overwrote it; only the final value matters.
    self.ind_mask = numpy.array(mask_components)
    self.ind_count = self.ind_mask.shape[0]
    if pheno_covar is not None:
        pheno_covar.freeze_subjects()
def load_tfam(self, pheno_covar):
    """Load the pedigree portion of the data and sort out exclusions.

    Parses `self.tfam_file`, builds the two-column individual mask used
    against transposed genotype data, then triggers genotype loading.

    :param pheno_covar: Phenotype/covariate object (may be None); when
        present it receives one subject per valid individual.
    :return: None
    """
    # Column indices shift left as optional leading columns are absent.
    pheno_col = 5
    if not DataParser.has_sex:
        pheno_col -= 1
    if not DataParser.has_parents:
        pheno_col -= 2
    if not DataParser.has_fid:
        pheno_col -= 1
    sex_col = pheno_col - 1

    mask_components = []  # 1s indicate an individual is to be masked out
    # `with` closes the handle (the original left `open(...)` dangling).
    with open(self.tfam_file) as tfam:
        for line in tfam:
            words = line.strip().split()
            if len(words) > 1:
                indid = ":".join(words[0:2])
                if DataParser.valid_indid(indid):
                    mask_components.append(0)
                    sex = None
                    pheno = None
                    if DataParser.has_sex:
                        sex = int(words[sex_col])
                    if DataParser.has_pheno:
                        pheno = float(words[pheno_col])
                    if pheno_covar is not None:
                        pheno_covar.add_subject(indid, sex, pheno)
                    # words has >= 2 entries here, so the original
                    # `if len(words) > 0` guard was always true.
                    self.families.append(words)
                else:
                    mask_components.append(1)
    mask_components = numpy.array(mask_components)
    # Duplicate the mask into two columns -- one per allele column.
    self.ind_mask = numpy.zeros(len(mask_components) * 2,
                                dtype=numpy.int8).reshape(-1, 2)
    self.ind_mask[0:, 0] = mask_components
    self.ind_mask[0:, 1] = mask_components
    self.ind_count = self.ind_mask.shape[0]
    if pheno_covar is not None:
        pheno_covar.freeze_subjects()
    self.load_genotypes()
def load_family_details(self, pheno_covar):
    """Load contents from the family-details file, updating the
    pheno_covar with family ids found.

    Individuals are added with missing sex/phenotype encodings; the
    actual phenotypes are expected to come from elsewhere.

    :param pheno_covar: Phenotype/covariate object
    :return: None
    :raises: exits via ExitIf when a duplicate ID is encountered
    """
    self.file_index = 0
    # 1s indicate an individual is to be masked out
    mask_components = []
    # Renamed from `file`, which shadowed the builtin.
    # NOTE(review): the filename is interpolated into a shell command via
    # sys_call -- a path containing shell metacharacters would break or be
    # unsafe; confirm inputs are trusted.
    filename = self.family_details
    if DataParser.compressed_pedigree:
        data, serr = sys_call('gunzip -c %s | wc -l' % (filename))
        self.line_count = int(data[0].strip().split(" ")[0])
        iddata, serr = sys_call('gunzip -c %s | cut -f 1' % (filename))
    else:
        data, serr = sys_call('wc -l %s' % (filename))
        self.line_count = int(data[0].strip().split(" ")[0])
        iddata, serr = sys_call('cat %s | cut -f 1' % (filename))

    ids_observed = set()
    for line in iddata:
        indid = line.strip().split()[0]
        # IDs arrive as FAM->IND; normalize to the FAM:IND form used
        # throughout the parsers.
        indid = ":".join(indid.split("->"))
        ExitIf("Duplicate ID found in dose file: %s" % (indid),
               indid in ids_observed)
        ids_observed.add(indid)

        if DataParser.valid_indid(indid):
            mask_components.append(0)
            # Sex/phenotype are unknown at this point; placeholders only.
            pheno_covar.add_subject(indid,
                                    PhenoCovar.missing_encoding,
                                    PhenoCovar.missing_encoding)
        else:
            mask_components.append(1)

    # Boolean mask: True marks individuals to exclude.
    self.ind_mask = numpy.array(mask_components) == 1
    self.ind_count = self.ind_mask.shape[0]
    pheno_covar.freeze_subjects()
def load_genotypes(self, pheno_covar):
    """Load all data into memory and propagate valid individuals to
    pheno_covar.

    Two passes: (1) read the pedigree rows, filtering individuals by id
    validity and per-individual missingness; (2) per SNP, orient alleles
    so the minor allele is second, compute MAF, and drop loci failing the
    MAF/missingness thresholds.

    :param pheno_covar: Phenotype/covariate object is updated with
        subject information
    :return: None
    """
    # Column layout: leading pedigree columns shrink when optional fields
    # (fid, sex, parents, pheno) are absent; liability adds one column
    # before the genotypes.
    first_genotype = 6
    pheno_col = 5
    if not DataParser.has_sex:
        first_genotype -= 1
        pheno_col -= 1
    if not DataParser.has_parents:
        first_genotype -= 2
        pheno_col -= 2
    if not DataParser.has_pheno:
        first_genotype -= 1
    if not DataParser.has_fid:
        first_genotype -= 1
        pheno_col -= 1
    if DataParser.has_liability:
        first_genotype += 1
    sex_col = pheno_col - 1

    individual_mask = []        # paired 0/1 flags, two per individual
    self.individual_mask = []   # one 0/1 flag per individual
    # NOTE(review): collected but never stored or returned in this block.
    dropped_individuals = []

    # Number of missing SNPs we can tolerate before dropping an individual.
    max_missing_for_individual = numpy.sum(
        self.snp_mask[:, 0] == 0) * DataParser.ind_miss_tol

    # Count lines via shell so we can pre-size the allele array.
    if DataParser.compressed_pedigree:
        ind_count, err = sys_call("gzip -cd %s | wc -l" %
                                  ("%s.gz" % (self.datasource)))
    else:
        ind_count, err = sys_call("wc -l %s" % (self.datasource))
    # +1 in case the final line lacks a trailing newline.
    ind_count = int(ind_count[0].split()[0]) + 1

    snp_count = numpy.sum(self.snp_mask[:, 0] == 0)

    # One byte per allele: (individual, snp, allele-pair).
    allelic_data = numpy.empty((ind_count, snp_count, 2), dtype='S1')
    valid_allele_count = 0
    if DataParser.compressed_pedigree:
        input_file = gzip.open("%s.gz" % self.datasource, 'rb')
    else:
        input_file = open(self.datasource)

    for line in input_file:
        line = line.strip()
        if len(line) > 0:
            raw_data = line.strip().split()
            # Apply the SNP mask, keeping only unmasked allele pairs.
            alleles = numpy.ma.MaskedArray(
                numpy.array(raw_data[first_genotype:]).reshape(-1, 2),
                self.snp_mask).compressed().reshape(-1, 2)

            # Convert the alleles into genotypes
            indid = ":".join(raw_data[0:2])
            if not DataParser.has_fid:
                indid = raw_data[0]

            # Ignore any subjects that are to be excluded and remove those
            # that have too much missingness
            if DataParser.valid_indid(indid):
                missing = numpy.sum(
                    alleles[:, 0] == DataParser.missing_representation)
                if missing > max_missing_for_individual:
                    individual_mask += [1, 1]
                    self.individual_mask.append(1)
                    dropped_individuals.append(indid)
                else:
                    sex = None
                    phenotype = None
                    if DataParser.has_pheno:
                        phenotype = float(raw_data[pheno_col])
                    if DataParser.has_sex:
                        sex = int(raw_data[sex_col])
                    if pheno_covar is not None:
                        pheno_covar.add_subject(indid, sex, phenotype)
                    individual_mask += [0, 0]
                    self.individual_mask.append(0)
                    allelic_data[valid_allele_count] = alleles
                    valid_allele_count += 1
            else:
                individual_mask += [1, 1]
                self.individual_mask.append(1)

    self.ind_count = valid_allele_count
    # Trim the over-allocated buffer down to the kept individuals.
    allelic_data = allelic_data[0:valid_allele_count]
    self.genotypes = numpy.empty((snp_count, valid_allele_count))

    max_missing_individuals = DataParser.snp_miss_tol * ind_count
    dropped_loci = []   # NOTE(review): populated but never read here
    valid_snps = 0
    valid_markers = []
    valid_rsids = []
    valid_maf = []      # NOTE(review): populated but never stored here
    valid_allele_list = []
    allele_count2s = []
    # xrange: this module targets Python 2.
    for i in xrange(0, snp_count):
        snp_geno = allelic_data[:, i]
        alleles = list(set(numpy.unique(snp_geno)) -
                       set([DataParser.missing_representation]))
        if len(alleles) > 2:
            raise TooManyAlleles(chr=self.markers[i][0],
                                 rsid=self.rsids[i],
                                 alleles=alleles)
        allele_count1 = numpy.sum(snp_geno == alleles[0])
        allele_count2 = 0
        maf = 0
        if len(alleles) > 1:
            allele_count2 = numpy.sum(snp_geno == alleles[1])
            # NOTE(review): real_allele_count2 is never used afterwards.
            real_allele_count2 = allele_count2
            # Swap so alleles[1] / allele_count2 is the minor allele.
            if allele_count2 > allele_count1:
                sorted_alleles = [alleles[1], alleles[0]]
                alleles = sorted_alleles
                allele_count = allele_count1
                allele_count1 = allele_count2
                allele_count2 = allele_count
            # float() forces true division under Python 2.
            maf = allele_count2 / float(allele_count1 + allele_count2)
            allele_count2s.append(allele_count2)
            major_allele = alleles[0]
            minor_allele = alleles[1]
            # Genotype = count of minor alleles (0/1/2); missing first
            # allele marks the whole genotype as missing.
            genotype_data = numpy.sum(snp_geno == alleles[1], axis=1)
            genotype_data[
                snp_geno[:, 0]==DataParser.missing_representation] = \
                DataParser.missing_storage
        else:
            major_allele = alleles[0]
            minor_allele = '?'
        # NOTE(review): on a monomorphic locus (len(alleles) == 1)
        # genotype_data is not assigned in this iteration, so the line
        # below reads the previous SNP's data -- or raises NameError if
        # the first SNP is monomorphic. maf == 0 means the locus is
        # dropped regardless, but the stale read looks unintended; verify.
        missing = numpy.sum(genotype_data == DataParser.missing_storage)
        if maf == 0 or maf < DataParser.min_maf or \
                maf > DataParser.max_maf or \
                max_missing_individuals < missing:
            # Record the dropped locus with the boundary bookkeeping.
            locus_details = self.markers[i]
            DataParser.boundary.dropped_snps[locus_details[0]].add(
                locus_details[1])
            dropped_loci.append("%s:%s" % (locus_details[0],
                                           locus_details[1]))
            self.invalid_loci.append(i)
        else:
            self.genotypes[valid_snps, :] = genotype_data
            valid_snps += 1
            valid_markers.append(list(self.markers[i]))
            valid_rsids.append(self.rsids[i])
            valid_allele_list.append([major_allele, minor_allele])
            valid_maf.append(maf)
    # Replace locus metadata with only the surviving SNPs.
    self.markers = valid_markers
    self.alleles = valid_allele_list
    self.rsids = valid_rsids
    self.locus_count = valid_snps
    self.genotypes = self.genotypes[0:self.locus_count, :]
    self.allele_count2s = allele_count2s