def import_nyu_interpretations(**kwargs):
    """
    Imports values from the NYU interpretations TSV file to the database

    Keyword Arguments
    -----------------
    nyu_interpretations_tsv: str
        path to the tab-separated file to import; when not supplied, the
        value of ``config['nyu_interpretations_tsv']`` is used instead

    Notes
    -----
    The file is read with a header row and must provide the columns ``Gene``,
    ``VariantType``, ``TumorType``, ``TissueType``, ``Variant``,
    ``Interpretation`` and ``Citation``.

    Each row is passed to ``NYUInterpretation.objects.get_or_create``; rows
    that already exist are counted as skipped. The tumor and tissue type
    lookups are not guarded, so any exception raised by ``objects.get`` for a
    row aborts the import.
    """
    # only consult the config when the caller did not pass a path, so that an
    # explicit path still works if the config does not define the key
    if 'nyu_interpretations_tsv' in kwargs:
        nyu_interpretations_tsv = kwargs.pop('nyu_interpretations_tsv')
    else:
        nyu_interpretations_tsv = config['nyu_interpretations_tsv']

    num_created = 0
    num_skipped = 0
    with open(nyu_interpretations_tsv) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            # the type labels are normalized before being used as lookup keys
            tumor_type_instance = TumorType.objects.get(
                type=sanitize_tumor_tissue(row['TumorType']))
            tissue_type_instance = TissueType.objects.get(
                type=sanitize_tumor_tissue(row['TissueType']))
            instance, created = NYUInterpretation.objects.get_or_create(
                genes=row['Gene'],
                variant_type=row['VariantType'],
                tumor_type=tumor_type_instance,
                tissue_type=tissue_type_instance,
                variant=row['Variant'],
                interpretation=row['Interpretation'],
                citations=row['Citation'])
            if created:
                num_created += 1
            else:
                num_skipped += 1
    logger.debug(
        "Added {new} new NYU interpretations ({skipped} skipped) to the database"
        .format(new=num_created, skipped=num_skipped))
def import_nyu_tiers(**kwargs):
    """
    Imports values from the NYU tiers list to the database

    Keyword Arguments
    -----------------
    nyu_tiers_csv: str
        path to the comma-separated file to import; when not supplied, the
        value of ``config['nyu_tiers_csv']`` is used instead

    Notes
    -----
    The file is read with a header row and must provide the columns ``gene``,
    ``type``, ``tumor_type``, ``tissue_type``, ``coding``, ``protein``,
    ``tier`` and ``comment``. The ``tier`` value is converted with ``int``,
    so a non-numeric tier raises ``ValueError``.

    Each row is passed to ``NYUTier.objects.get_or_create``; rows that
    already exist are counted as skipped. The tumor and tissue type lookups
    are not guarded, so any exception raised by ``objects.get`` for a row
    aborts the import.
    """
    # only consult the config when the caller did not pass a path, so that an
    # explicit path still works if the config does not define the key
    if 'nyu_tiers_csv' in kwargs:
        nyu_tiers_csv = kwargs.pop('nyu_tiers_csv')
    else:
        nyu_tiers_csv = config['nyu_tiers_csv']

    num_created = 0
    num_skipped = 0
    with open(nyu_tiers_csv) as f:
        reader = csv.DictReader(f)
        for row in reader:
            # the type labels are normalized before being used as lookup keys
            tumor_type_instance = TumorType.objects.get(
                type=sanitize_tumor_tissue(row['tumor_type']))
            tissue_type_instance = TissueType.objects.get(
                type=sanitize_tumor_tissue(row['tissue_type']))
            instance, created = NYUTier.objects.get_or_create(
                gene=row['gene'],
                variant_type=row['type'],
                tumor_type=tumor_type_instance,
                tissue_type=tissue_type_instance,
                coding=row['coding'],
                protein=row['protein'],
                tier=int(row['tier']),
                comment=row['comment'])
            if created:
                num_created += 1
            else:
                num_skipped += 1
    # the message previously reported "tissue types", copied from another
    # importer; it now names the records this function actually creates
    logger.debug(
        "Added {new} new NYU tiers ({skipped} skipped) to the database".format(
            new=num_created, skipped=num_skipped))
def import_PMKB_get_or_create(entries):
    """
    Imports PMKB entries to the database one row at a time

    Every row gets its interpretation and then its variant stored through
    ``get_or_create``, so entries already present in the database are left
    untouched and counted as skipped.

    Parameters
    ----------
    entries:
        table of PMKB entries exposing ``iterrows()`` (presumably a pandas
        DataFrame - confirm against the caller) with the columns ``Gene``,
        ``TumorType``, ``TissueType``, ``Variant``, ``Tier``,
        ``Interpretation``, ``Citation`` and ``Source``

    Returns
    -------
    list
        the rows whose variant already existed and was therefore not created
    """
    interpretations_added = 0
    interpretations_existing = 0
    variants_added = 0
    variants_existing = 0
    skipped_rows = []

    for _, record in entries.iterrows():
        # the variant uid is the md5 of all identifying fields joined together
        uid_source = "".join((
            record['Gene'],
            record['TumorType'],
            record['TissueType'],
            record['Variant'],
            str(record['Tier']),
            record['Interpretation'],
            record['Citation'],
            str(record['Source']),
        ))
        uid = hashlib.md5(uid_source.encode('utf-8')).hexdigest()

        # foreign key instances for the variant
        tumor = TumorType.objects.get(
            type=sanitize_tumor_tissue(record['TumorType']))
        tissue = TissueType.objects.get(
            type=sanitize_tumor_tissue(record['TissueType']))

        # the interpretation has to exist before the variant can point at it
        interpretation, interpretation_is_new = (
            PMKBInterpretation.objects.get_or_create(
                interpretation=record['Interpretation'],
                citations=record['Citation'],
                source_row=record['Source'],
            ))
        if interpretation_is_new:
            interpretations_added += 1
        else:
            interpretations_existing += 1

        _variant, variant_is_new = PMKBVariant.objects.get_or_create(
            gene=record['Gene'],
            tumor_type=tumor,
            tissue_type=tissue,
            variant=record['Variant'],
            tier=record['Tier'],
            interpretation=interpretation,
            source_row=record['Source'],
            uid=uid)
        if variant_is_new:
            variants_added += 1
            continue
        # keep the rows that were already in the database for the caller
        variants_existing += 1
        skipped_rows.append(record)

    variant_total = PMKBVariant.objects.count()
    interpretation_total = PMKBInterpretation.objects.count()
    summary = (
        "Added {new_interp} new interpretations ({skip_interp} skipped) and "
        "{new_var} new variants ({skip_var} skipped) to the database. "
        "{tot_var} total variants and {tot_interp} total interpretations in the database"
    )
    logger.debug(summary.format(
        new_interp=interpretations_added,
        skip_interp=interpretations_existing,
        new_var=variants_added,
        skip_var=variants_existing,
        tot_var=variant_total,
        tot_interp=interpretation_total))
    return skipped_rows
def import_tissue_types(**kwargs):
    """
    Imports tissue types from JSON file to the database

    Keyword Arguments
    -----------------
    tissue_types_json: str
        path to the JSON file to import; when not supplied, the value of
        ``config['tissue_types_json']`` is used instead

    Notes
    -----
    The parsed JSON is iterated directly and every item is normalized with
    ``sanitize_tumor_tissue`` before being stored (presumably the file holds
    a list of tissue type names - confirm against the data file). Items that
    already exist in the database are counted as skipped.
    """
    # only consult the config when the caller did not pass a path, so that an
    # explicit path still works if the config does not define the key
    if 'tissue_types_json' in kwargs:
        tissue_types_json = kwargs.pop('tissue_types_json')
    else:
        tissue_types_json = config['tissue_types_json']

    with open(tissue_types_json) as f:
        tissue_types = json.load(f)

    num_created = 0
    num_skipped = 0
    for tissue_type in tissue_types:
        instance, created = TissueType.objects.get_or_create(
            type=sanitize_tumor_tissue(tissue_type))
        if created:
            num_created += 1
        else:
            num_skipped += 1
    logger.debug(
        "Added {new} new tissue types ({skipped} skipped) to the database".
        format(new=num_created, skipped=num_skipped))
def import_PMKB_bulk(entries):
    """
    Imports PMKB entries to the database using a single bulk insert for variants

    Works in two passes over ``entries``: the first pass stores each distinct
    interpretation with ``get_or_create`` and remembers the resulting
    instance; the second pass builds one unsaved ``PMKBVariant`` per distinct
    variant, which are then written together with ``bulk_create``.

    Parameters
    ----------
    entries:
        table of PMKB entries exposing ``iterrows()`` (presumably a pandas
        DataFrame - confirm against the caller) with the columns ``Gene``,
        ``TumorType``, ``TissueType``, ``Variant``, ``Tier``,
        ``Interpretation``, ``Citation`` and ``Source``

    Returns
    -------
    list
        the rows that repeated a variant seen earlier in ``entries`` and were
        therefore left out of the bulk insert
    """

    def interpretation_key(record):
        # the three interpretation fields concatenated identify an interpretation
        return "".join([
            str(record['Interpretation']),
            str(record['Citation']),
            str(record['Source']),
        ])

    # ~~~~~ pass 1: unique interpretations ~~~~~ #
    # tallied for parity with the row-by-row importer; not currently reported
    interpretations_added = 0
    interpretations_existing = 0

    logger.debug("Importing unique interpretations")
    # key -> {'instance': database entry, 'created': bool}; reused in pass 2
    interpretation_cache = defaultdict(OrderedDict)
    for _, record in entries.iterrows():
        key = interpretation_key(record)
        if key in interpretation_cache:
            interpretations_existing += 1
            continue
        stored, is_new = PMKBInterpretation.objects.get_or_create(
            interpretation=record['Interpretation'],
            citations=record['Citation'],
            source_row=record['Source'],
        )
        interpretation_cache[key]['instance'] = stored
        interpretation_cache[key]['created'] = is_new
        if is_new:
            interpretations_added += 1
        else:
            interpretations_existing += 1

    # ~~~~~ pass 2: unique variants ~~~~~ #
    logger.debug("Getting bulk variant entries")
    variants_queued = 0
    variants_repeated = 0
    pending_variants = []  # unsaved instances for the bulk insert
    repeated_rows = []  # rows skipped because their variant was already queued
    seen_variants = defaultdict(OrderedDict)
    for _, record in entries.iterrows():
        # key of the interpretation instance this variant points at
        cache_key = interpretation_key(record)

        # the variant uid is the md5 of all identifying fields joined together
        variant_key = "".join((
            record['Gene'],
            record['TumorType'],
            record['TissueType'],
            record['Variant'],
            str(record['Tier']),
            record['Interpretation'],
            record['Citation'],
            str(record['Source']),
        ))
        uid = hashlib.md5(variant_key.encode('utf-8')).hexdigest()

        if variant_key in seen_variants:
            repeated_rows.append(record)
            variants_repeated += 1
            continue

        seen_variants[variant_key]['row'] = record
        pending_variants.append(
            PMKBVariant(
                gene=record['Gene'],
                tumor_type=TumorType.objects.get(
                    type=sanitize_tumor_tissue(record['TumorType'])),
                tissue_type=TissueType.objects.get(
                    type=sanitize_tumor_tissue(record['TissueType'])),
                variant=record['Variant'],
                tier=record['Tier'],
                interpretation=interpretation_cache[cache_key]['instance'],
                source_row=record['Source'],
                uid=uid))
        variants_queued += 1

    # write every queued variant in one go
    logger.debug("Importing bulk variant entries ({0} total)".format(
        len(pending_variants)))
    PMKBVariant.objects.bulk_create(pending_variants)

    variant_total = PMKBVariant.objects.count()
    interpretation_total = PMKBInterpretation.objects.count()
    summary = (
        "Added {num_created} variants to the database. "
        "{tot_var} total variants and {tot_interp} total interpretations in the database"
    )
    logger.debug(summary.format(
        num_created=variants_queued,
        tot_var=variant_total,
        tot_interp=interpretation_total))
    return repeated_rows