# where # - N is [0-45] # - a "SNP with Complete Trio" = SNP has all 6 alleles for a given family # IOW: histogram with # of trios on the X-axis (0 to 45) and # of SNPs on the Y-axis # # Sanity Checks: # - there should be 158 + 12 SNPs with 0 complete trios # - running this with all data sets should give same results (including both the 45-family sets and the 94-family sets) DATA_FILENAME = "Data_ALL_0s.ped" import logging logging.basicConfig(format="%(asctime)s %(levelname)s %(msg)s", level=logging.INFO) import pedparse everybody = pedparse.load_file(DATA_FILENAME) # initialize a list with buckets for 0 to N # where N is the total number of trios ... # OR #complete_trio_buckets = [0] * (len(everybody) + 1) # N is the total number of trios minus the number of known incomplete trios complete_trio_buckets = [0] * ( len(everybody) - len(pedparse.FAMILIES_WITH_NO_COMPLETE_TRIOS) + 1) # "+ 1" is for the 0-bucket try: for snp in xrange(pedparse.TOTAL_SNPS): complete_trios = 0 for fam in everybody.values(): if fam.count_alleles(snp) == 6:
#!/usr/bin/env python
"""Log per-family SNP completeness figures for a .ped data file.

The script loads families with ``pedparse.load_file`` (passing
``limit_trios=1``), asks each family for its ``do_counts()`` result and logs
the 'complete', 'empty' and 'partial' counts, each also expressed as a
percentage of ``pedparse.TOTAL_SNPS``.
"""
import logging

DATA_FILENAME = "Data_ALL_0s.ped"

# The categories read from the mapping returned by ``fam.do_counts()``.
COUNT_CATEGORIES = ("complete", "empty", "partial")

# Named %-style placeholders; filled from a single dict passed to logging.
REPORT_FORMAT = (
    "Family ID: %(family_id)s -- "
    "Complete SNP data for %(complete)7d SNPs (%(complete_pct)6.2f%%) - "
    "Empty SNP data for %(empty)7d SNPs (%(empty_pct)6.2f%%) - "
    "Partial SNP data for %(partial)7d SNPs (%(partial_pct)6.2f%%)"
)

# Use %(message)s (the message merged with its arguments) rather than
# %(msg)s (the raw, un-merged format string) so that the lazy-argument
# logging calls below are rendered correctly.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s",
                    level=logging.INFO)
logging.info("Hi there.")

# Imported after logging is configured, exactly as in the original script,
# so that any logging done while pedparse is imported (if it does any) goes
# through the configuration above.
import pedparse

everybody = pedparse.load_file(DATA_FILENAME, limit_trios=1)
logging.info("There are %d trios", len(everybody))

for fam in everybody.values():
    counts = fam.do_counts()
    # Build the report in a fresh dict instead of adding keys to the object
    # returned by do_counts(), which may be shared state owned by the family.
    report = {"family_id": fam.family_id}
    for category in COUNT_CATEGORIES:
        report[category] = counts[category]
        # 100.0 forces float division on Python 2 as well.
        report[category + "_pct"] = (
            100.0 * counts[category] / pedparse.TOTAL_SNPS)
    # A single dict argument is used by logging for named-placeholder
    # formatting.
    logging.info(REPORT_FORMAT, report)
#!/usr/bin/env python DATA_FILENAME = "Data_ALL_0s.ped" import logging logging.basicConfig(format="%(asctime)s %(levelname)s %(msg)s", level=logging.INFO) import pedparse everybody = pedparse.load_file(DATA_FILENAME) #snps_with_complete_trios = [] snps_with_no_complete_trios = [] # here, snp's are just line numbers - they have names, which we can find later in the .map file ... for snp in xrange(pedparse.TOTAL_SNPS): if snp % 500 == 0: logging.debug("Processing SNP # %d" % snp) for fam in everybody.values(): if fam.count_alleles(snp) == 6: #snps_with_complete_trios += [snp] break else: # exhausted the family for loop, didn't find any complete trios for # the current SNP .. add this one to the "no complete trios" list snps_with_no_complete_trios += [snp] logging.info("Found %d SNPs with NO complete trios" % len(snps_with_no_complete_trios)) logging.info("List of SNPs with NO complete trios: %s" % snps_with_no_complete_trios) snps_with_no_complete_trios_but_some_data = snps_with_no_complete_trios[:] for snp in snps_with_no_complete_trios:
#!/usr/bin/env python
"""Log per-family SNP completeness figures for a .ped data file.

The script loads families with ``pedparse.load_file`` (passing
``limit_trios=1``), asks each family for its ``do_counts()`` result and logs
the 'complete', 'empty' and 'partial' counts, each also expressed as a
percentage of ``pedparse.TOTAL_SNPS``.
"""
import logging

DATA_FILENAME = "Data_ALL_0s.ped"

# The categories read from the mapping returned by ``fam.do_counts()``.
COUNT_CATEGORIES = ("complete", "empty", "partial")

# Named %-style placeholders; filled from a single dict passed to logging.
REPORT_FORMAT = (
    "Family ID: %(family_id)s -- "
    "Complete SNP data for %(complete)7d SNPs (%(complete_pct)6.2f%%) - "
    "Empty SNP data for %(empty)7d SNPs (%(empty_pct)6.2f%%) - "
    "Partial SNP data for %(partial)7d SNPs (%(partial_pct)6.2f%%)"
)

# Use %(message)s (the message merged with its arguments) rather than
# %(msg)s (the raw, un-merged format string) so that the lazy-argument
# logging calls below are rendered correctly.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s",
                    level=logging.INFO)
logging.info("Hi there.")

# Imported after logging is configured, exactly as in the original script,
# so that any logging done while pedparse is imported (if it does any) goes
# through the configuration above.
import pedparse

everybody = pedparse.load_file(DATA_FILENAME, limit_trios=1)
logging.info("There are %d trios", len(everybody))

for fam in everybody.values():
    counts = fam.do_counts()
    # Build the report in a fresh dict instead of adding keys to the object
    # returned by do_counts(), which may be shared state owned by the family.
    report = {"family_id": fam.family_id}
    for category in COUNT_CATEGORIES:
        report[category] = counts[category]
        # 100.0 forces float division on Python 2 as well.
        report[category + "_pct"] = (
            100.0 * counts[category] / pedparse.TOTAL_SNPS)
    # A single dict argument is used by logging for named-placeholder
    # formatting.
    logging.info(REPORT_FORMAT, report)