def task_create_riskreport(risk_report_id, genome_id):
    """Build (or rebuild) the risk report for one genome.

    NOTE: arguments for this celery task must be JSON serializable,
    hence primary keys are passed instead of model instances.

    Args:
        risk_report_id: pk of the RiskReport to populate.
        genome_id: pk of the Genome whose genotypes are scored.
    """
    risk_report = RiskReport.objects.get(id=risk_report_id)
    genome = Genome.objects.get(id=genome_id)

    log.info('Creating riskreport ...')

    # TODO: Check for updates
    latest_date = GwasCatalogSnp.objects.aggregate(
        Max('date_downloaded'))['date_downloaded__max']

    phenotypes = GwasCatalogPhenotype.objects.all()
    log.info('#phenotypes: {}'.format(len(phenotypes)))

    population = [genome.population]
    for phenotype in phenotypes:
        # FIX: was `assert type(...) == ...`; isinstance is the idiomatic
        # check (and note asserts vanish under `python -O`).
        assert isinstance(phenotype, GwasCatalogPhenotype)

        # Only SNPs from the newest catalog snapshot, for this population.
        gwas_snps = GwasCatalogSnp.objects.filter(
            phenotype=phenotype,
            population__contains=population,
            date_downloaded=str(latest_date))

        if not gwas_snps:
            continue

        # Select only one article for one phenotype
        #
        # TODO: add conditions
        # - risk alleles are present
        # - odds ratios are present
        # - (beta-coeeff is not present)
        # - lower than minimum p-value
        evidence_article_1st = gwas_snps.exclude(
            pubmed_id__isnull=True).order_by('reliability_rank').values_list(
                'pubmed_id', flat=True).distinct().first()
        evidence_snps = gwas_snps.filter(pubmed_id=evidence_article_1st)
        evidence_snp_ids = evidence_snps.values_list('snp_id_current', flat=True)
        freqs = get_freqs(evidence_snp_ids, population=population)
        genotypes = Genotype.objects.filter(genome__id=genome.id,
                                            rs_id_current__in=evidence_snp_ids)

        phenotype_risk_report, _ = PhenotypeRiskReport.objects.get_or_create(
            risk_report=risk_report,
            phenotype=phenotype)

        # Calculate cumulative risk
        estimated_snp_risks = []

        # Genotype specific risks for each SNP
        with transaction.atomic():
            for evidence_snp in evidence_snps:
                # Risk allele and its frequency
                risk_allele_forward = evidence_snp.risk_allele_forward
                risk_allele_freq = freqs.get(
                    evidence_snp.snp_id_current, {}).get(risk_allele_forward)
                odds_ratio = evidence_snp.odds_ratio

                # My genotype: missing genotype => zygosity unknown (None)
                try:
                    genotype = ''.join(
                        genotypes.get(
                            rs_id_current=evidence_snp.snp_id_current).genotype)
                    zygosities = zyg(genotype, risk_allele_forward)
                except Genotype.DoesNotExist:
                    zygosities = None

                # Genotype specific risks: only computable when freq,
                # odds ratio and zygosity are all known.
                if None not in (risk_allele_freq, odds_ratio, zygosities):
                    genotype_specific_risks = \
                        genotype_specific_risks_relative_to_population(
                            risk_allele_freq, odds_ratio)
                    my_estimated_risk = estimated_risk(
                        genotype_specific_risks, zygosities)
                else:
                    my_estimated_risk = None

                SnpRiskReport(phenotype_risk_report=phenotype_risk_report,
                              evidence_snp=evidence_snp,
                              estimated_risk=my_estimated_risk).save()
                estimated_snp_risks.append(my_estimated_risk)

        phenotype_risk_report.estimated_risk = cumulative_risk(
            estimated_snp_risks)
        phenotype_risk_report.save()

    log.info('Done')
def handle(self, *args, **options):
    """Fetch the latest GWAS Catalog dumps and import them into the DB.

    Downloads two gzipped TSV files (catalog records and allele
    frequencies), upserts Snp frequency rows, then bulk-creates
    GwasCatalogSnp rows for the new snapshot. Raises
    GwasCatalogParseError if the snapshot date was already imported.
    """
    current_tz = timezone.get_current_timezone()

    if not os.path.exists(settings.GWASCATALOG_DIR):
        os.makedirs(settings.GWASCATALOG_DIR)

    # TODO: automatically choose latest version
    log.info('Fetching latest gwascatalog...')
    catalog_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz')
    get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz',
                    dst=catalog_path, if_not_exists=True)

    log.info('Fetching latest gwascatalog allele freq...')
    catalog_freq_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz')
    get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz',
                    dst=catalog_freq_path, if_not_exists=True)

    # - Gwas Catalog Allele Freq
    # NOTE(review): gzip.open(..., 'rb') feeding csv.DictReader implies
    # Python 2; on Python 3 this needs text mode ('rt') — confirm runtime.
    log.info('Updating snp allele freq records for gwascatalog...')
    num_created = 0
    num_updated = 0
    with transaction.atomic():
        for record in csv.DictReader(gzip.open(catalog_freq_path, 'rb'),
                                     delimiter='\t',
                                     fieldnames=['snp_id_current', 'allele', 'freq', 'populations']):
            snp, created = Snp.objects.update_or_create(
                snp_id_current=record['snp_id_current'],
                population=record['populations'],
                defaults={'allele': text2pg_array(record['allele']),
                          'freq': text2pg_array(record['freq'])}
            )
            if created:
                num_created += 1
            else:
                num_updated += 1
    log.info('updated: {} records'.format(num_updated))
    log.info('created: {} records'.format(num_created))

    # - Gwas Catalog
    log.info('Importing gwascatalog...')
    model_fields = [field for field in GwasCatalogSnp._meta.get_fields()
                    if field.name not in ('id', 'created_at')]
    model_field_names = [field.name for field in model_fields]
    model_fields_map = dict(zip(model_field_names, model_fields))
    gwascatalog_snps = []
    num_phenotype_created = 0
    for record in csv.DictReader(gzip.open(catalog_path, 'rb'), delimiter='\t'):
        data = {}

        # If date_downloaded is already imported, abort.
        date_downloaded = record['date_downloaded']
        if GwasCatalogSnp.objects.filter(date_downloaded=date_downloaded).exists():
            raise GwasCatalogParseError('Already imported date_downloaded: {}'.format(date_downloaded))

        # Import only pre-defined model fields
        for k, v in record.items():
            if k in model_field_names:
                # Set blank or null
                if v == '':
                    if type(model_fields_map[k]) in (models.fields.CharField, models.fields.TextField):
                        v = ''
                    else:
                        v = None

                # Set datetime with timezone.
                # FIX: guard against v == None (blank datetime column) —
                # parse_date(None) would raise TypeError.
                if v is not None and type(model_fields_map[k]) == models.DateTimeField:
                    v = current_tz.localize(datetime(*(parse_date(v).timetuple()[:5])))

                data[k] = v

        # Parse population
        population = get_population(record['initial_sample'])

        # Calculate reliability rank
        reliability_rank = 1.0  # TODO: get_reliability_rank()

        # FIX: initialize before the try-block. Previously, when
        # GwasCatalogParseError was raised, `unit` and
        # `risk_allele_forward` were left unbound (NameError on the first
        # record) or silently carried over from the PREVIOUS record.
        odds_ratio, beta_coeff, unit, risk_allele_forward = None, None, None, None
        try:
            # Parse and validate odds_ratio, beta_coeff
            _, unit = get_ci_and_unit(record['confidence_interval_95_percent'])
            odds_ratio, beta_coeff = get_odds_ratio_or_beta_coeff(record['odds_ratio_or_beta_coeff'], unit)

            # Validate risk_allele
            #
            # Strands of risk alleles in GWAS Catalog are not set to forward strands with respect to
            # the human reference genome b37. So we get forward strand alleles by checking consistences of
            # allele frequencies between reported risk alleles and 1000 Genomes Project alleles.
            if data['snp_id_current']:
                snp_id = int(data['snp_id_current'])
                database_freq = get_freqs([snp_id], population).get(snp_id)
                risk_allele_forward = get_database_strand_allele(record['risk_allele'],
                                                                record['risk_allele_freq_reported'],
                                                                database_freq,
                                                                freq_diff_thrs=settings.GWASCATALOG_FREQ_DIFF_THRS)
            else:
                risk_allele_forward = AMBIGOUS
            is_active = True
        except GwasCatalogParseError as e:
            # Keep the record, but flag it inactive; values stay None.
            log.error(e)
            odds_ratio, beta_coeff = None, None
            is_active = False

        # - Phenotype
        phenotype, phenotype_created = GwasCatalogPhenotype.objects.get_or_create(name=record['disease_or_trait'])
        if phenotype_created:
            num_phenotype_created += 1

        data.update({'population': population,
                     'reliability_rank': reliability_rank,
                     'odds_ratio': odds_ratio,
                     'beta_coeff': beta_coeff,
                     'beta_coeff_unit': unit,
                     'risk_allele_forward': risk_allele_forward,
                     'phenotype': phenotype,
                     'is_active': is_active})
        gwascatalog_snps.append(GwasCatalogSnp(**data))

    with transaction.atomic():
        GwasCatalogSnp.objects.bulk_create(gwascatalog_snps)

    log.info('GWAS Catalog snps processed: {} records'.format(len(gwascatalog_snps)))
    log.info('GWAS Catalog phenotypes newly created: {} records'.format(num_phenotype_created))
    log.info('Done.')
def task_create_riskreport(risk_report_id, genome_id):
    """Build (or rebuild) the risk report for one genome.

    NOTE: arguments for this celery task must be JSON serializable,
    hence primary keys are passed instead of model instances.

    Args:
        risk_report_id: pk of the RiskReport to populate.
        genome_id: pk of the Genome whose genotypes are scored.
    """
    risk_report = RiskReport.objects.get(id=risk_report_id)
    genome = Genome.objects.get(id=genome_id)

    log.info('Creating riskreport ...')

    # TODO: Check for updates
    latest_date = GwasCatalogSnp.objects.aggregate(
        Max('date_downloaded'))['date_downloaded__max']

    phenotypes = GwasCatalogPhenotype.objects.all()
    log.info('#phenotypes: {}'.format(len(phenotypes)))

    population = [genome.population]
    for phenotype in phenotypes:
        # FIX: was `assert type(...) == ...`; isinstance is the idiomatic
        # check (and note asserts vanish under `python -O`).
        assert isinstance(phenotype, GwasCatalogPhenotype)

        # Only SNPs from the newest catalog snapshot, for this population.
        gwas_snps = GwasCatalogSnp.objects.filter(
            phenotype=phenotype,
            population__contains=population,
            date_downloaded=str(latest_date))

        if not gwas_snps:
            continue

        # Select only one article for one phenotype
        #
        # TODO: add conditions
        # - risk alleles are present
        # - odds ratios are present
        # - (beta-coeeff is not present)
        # - lower than minimum p-value
        evidence_article_1st = gwas_snps.exclude(
            pubmed_id__isnull=True).order_by('reliability_rank').values_list(
                'pubmed_id', flat=True).distinct().first()
        evidence_snps = gwas_snps.filter(pubmed_id=evidence_article_1st)
        evidence_snp_ids = evidence_snps.values_list('snp_id_current', flat=True)
        freqs = get_freqs(evidence_snp_ids, population=population)
        genotypes = Genotype.objects.filter(genome__id=genome.id,
                                            rs_id_current__in=evidence_snp_ids)

        phenotype_risk_report, _ = PhenotypeRiskReport.objects.get_or_create(
            risk_report=risk_report,
            phenotype=phenotype)

        # Calculate cumulative risk
        estimated_snp_risks = []

        # Genotype specific risks for each SNP
        with transaction.atomic():
            for evidence_snp in evidence_snps:
                # Risk allele and its frequency
                risk_allele_forward = evidence_snp.risk_allele_forward
                risk_allele_freq = freqs.get(
                    evidence_snp.snp_id_current, {}).get(risk_allele_forward)
                odds_ratio = evidence_snp.odds_ratio

                # My genotype: missing genotype => zygosity unknown (None)
                try:
                    genotype = ''.join(
                        genotypes.get(
                            rs_id_current=evidence_snp.snp_id_current).genotype)
                    zygosities = zyg(genotype, risk_allele_forward)
                except Genotype.DoesNotExist:
                    zygosities = None

                # Genotype specific risks: only computable when freq,
                # odds ratio and zygosity are all known.
                if None not in (risk_allele_freq, odds_ratio, zygosities):
                    genotype_specific_risks = \
                        genotype_specific_risks_relative_to_population(
                            risk_allele_freq, odds_ratio)
                    my_estimated_risk = estimated_risk(
                        genotype_specific_risks, zygosities)
                else:
                    my_estimated_risk = None

                SnpRiskReport(phenotype_risk_report=phenotype_risk_report,
                              evidence_snp=evidence_snp,
                              estimated_risk=my_estimated_risk).save()
                estimated_snp_risks.append(my_estimated_risk)

        phenotype_risk_report.estimated_risk = cumulative_risk(
            estimated_snp_risks)
        phenotype_risk_report.save()

    log.info('Done')