Пример #1
0
def import_nyu_interpretations(**kwargs):
    """
    Imports NYU interpretations from a tab-separated file to the database

    For every row the matching ``TumorType`` and ``TissueType`` entries are
    looked up (values are passed through ``sanitize_tumor_tissue`` first) and
    the interpretation is stored with ``get_or_create``; rows that were not
    newly created are counted as skipped. The totals are logged at debug
    level.

    Keyword arguments:
        nyu_interpretations_tsv: path to the TSV file. When omitted,
            ``config['nyu_interpretations_tsv']`` is used. The file must have
            the columns ``Gene``, ``VariantType``, ``TumorType``,
            ``TissueType``, ``Variant``, ``Interpretation`` and ``Citation``.

    Note: the tumor/tissue lookups use ``objects.get``, so a row naming a type
    that is not in the database stops the import with that lookup's exception.
    """
    # Only consult the config when the caller did not supply a path, so an
    # explicit path works even if the config has no such entry
    if 'nyu_interpretations_tsv' in kwargs:
        nyu_interpretations_tsv = kwargs.pop('nyu_interpretations_tsv')
    else:
        nyu_interpretations_tsv = config['nyu_interpretations_tsv']

    num_created = 0
    num_skipped = 0
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires (matters for newlines inside quoted fields)
    with open(nyu_interpretations_tsv, newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            tumor_type_instance = TumorType.objects.get(
                type=sanitize_tumor_tissue(row['TumorType']))
            tissue_type_instance = TissueType.objects.get(
                type=sanitize_tumor_tissue(row['TissueType']))

            _instance, created = NYUInterpretation.objects.get_or_create(
                genes=row['Gene'],
                variant_type=row['VariantType'],
                tumor_type=tumor_type_instance,
                tissue_type=tissue_type_instance,
                variant=row['Variant'],
                interpretation=row['Interpretation'],
                citations=row['Citation'])
            if created:
                num_created += 1
            else:
                num_skipped += 1
    logger.debug(
        "Added {new} new NYU interpretations ({skipped} skipped) to the database"
        .format(new=num_created, skipped=num_skipped))
Пример #2
0
def import_nyu_tiers(**kwargs):
    """
    Imports values from the NYU tiers list to the database

    For every row the matching ``TumorType`` and ``TissueType`` entries are
    looked up (values are passed through ``sanitize_tumor_tissue`` first) and
    the tier is stored with ``get_or_create``; rows that were not newly
    created are counted as skipped. The totals are logged at debug level.

    Keyword arguments:
        nyu_tiers_csv: path to the comma-separated tiers file. When omitted,
            ``config['nyu_tiers_csv']`` is used. The file must have the
            columns ``gene``, ``type``, ``tumor_type``, ``tissue_type``,
            ``coding``, ``protein``, ``tier`` and ``comment``.

    Note: ``tier`` is converted with ``int()``, so a non-integer value raises
    ``ValueError``; the tumor/tissue lookups use ``objects.get``, so a row
    naming a type that is not in the database stops the import.
    """
    # Only consult the config when the caller did not supply a path, so an
    # explicit path works even if the config has no such entry
    if 'nyu_tiers_csv' in kwargs:
        nyu_tiers_csv = kwargs.pop('nyu_tiers_csv')
    else:
        nyu_tiers_csv = config['nyu_tiers_csv']

    num_created = 0
    num_skipped = 0
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires (matters for newlines inside quoted fields)
    with open(nyu_tiers_csv, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            tumor_type_instance = TumorType.objects.get(
                type=sanitize_tumor_tissue(row['tumor_type']))
            tissue_type_instance = TissueType.objects.get(
                type=sanitize_tumor_tissue(row['tissue_type']))
            _instance, created = NYUTier.objects.get_or_create(
                gene=row['gene'],
                variant_type=row['type'],
                tumor_type=tumor_type_instance,
                tissue_type=tissue_type_instance,
                coding=row['coding'],
                protein=row['protein'],
                tier=int(row['tier']),
                comment=row['comment'])
            if created:
                num_created += 1
            else:
                num_skipped += 1
    # Report after the file is closed; the message names what was actually
    # imported (it previously said "tissue types")
    logger.debug(
        "Added {new} new NYU tiers ({skipped} skipped) to the database".
        format(new=num_created, skipped=num_skipped))
Пример #3
0
def import_PMKB_get_or_create(entries):
    """
    Imports PMKB interpretations and variants to the database one row at a time

    Every row yielded by ``entries.iterrows()`` is stored with
    ``get_or_create``: first its interpretation, then the variant that points
    to it. Each variant gets a ``uid`` that is the MD5 hex digest of its
    concatenated fields. A summary with the created/skipped counts and the
    resulting table totals is logged at debug level.

    Parameters:
        entries: table-like object exposing ``iterrows()`` (presumably a
            pandas DataFrame) with the columns ``Gene``, ``TumorType``,
            ``TissueType``, ``Variant``, ``Tier``, ``Interpretation``,
            ``Citation`` and ``Source``. All but ``Tier`` and ``Source`` are
            joined as-is to build the uid, so they must be strings.

    Returns:
        list of the rows whose variant was not newly created.
    """
    # tallies keyed by the "created" flag: True -> created, False -> skipped
    interpretation_tally = {True: 0, False: 0}
    variant_tally = {True: 0, False: 0}
    not_created = []

    for _, entry in entries.iterrows():
        # unique key for the variant, hashed to get its uid
        uid_parts = [
            entry['Gene'], entry['TumorType'], entry['TissueType'],
            entry['Variant'], str(entry['Tier']), entry['Interpretation'],
            entry['Citation'], str(entry['Source'])
        ]
        uid = hashlib.md5("".join(uid_parts).encode('utf-8')).hexdigest()

        # foreign key targets for the variant
        tumor = TumorType.objects.get(
            type=sanitize_tumor_tissue(entry['TumorType']))
        tissue = TissueType.objects.get(
            type=sanitize_tumor_tissue(entry['TissueType']))

        # the interpretation has to exist before the variant can reference it
        interpretation, interpretation_is_new = PMKBInterpretation.objects.get_or_create(
            interpretation=entry['Interpretation'],
            citations=entry['Citation'],
            source_row=entry['Source'],
        )
        interpretation_tally[bool(interpretation_is_new)] += 1

        _variant, variant_is_new = PMKBVariant.objects.get_or_create(
            gene=entry['Gene'],
            tumor_type=tumor,
            tissue_type=tissue,
            variant=entry['Variant'],
            tier=entry['Tier'],
            interpretation=interpretation,
            source_row=entry['Source'],
            uid=uid)
        variant_tally[bool(variant_is_new)] += 1
        if not variant_is_new:
            not_created.append(entry)

    variants_in_db = PMKBVariant.objects.count()
    interpretations_in_db = PMKBInterpretation.objects.count()
    summary = (
        "Added {new_interp} new interpretations ({skip_interp} skipped) and {new_var} new variants ({skip_var} skipped) to the database. {tot_var} total variants and {tot_interp} total interpretations in the database"
    ).format(
        new_interp=interpretation_tally[True],
        skip_interp=interpretation_tally[False],
        new_var=variant_tally[True],
        skip_var=variant_tally[False],
        tot_var=variants_in_db,
        tot_interp=interpretations_in_db)
    logger.debug(summary)
    return not_created
Пример #4
0
def import_tissue_types(**kwargs):
    """
    Imports tissue types from JSON file to the database

    Every item obtained by iterating over the loaded JSON document is passed
    through ``sanitize_tumor_tissue`` and stored with ``get_or_create``;
    items that were not newly created are counted as skipped. The totals are
    logged at debug level.

    Keyword arguments:
        tissue_types_json: path to the JSON file. When omitted,
            ``config['tissue_types_json']`` is used.
    """
    # Only consult the config when the caller did not supply a path, so an
    # explicit path works even if the config has no such entry
    if 'tissue_types_json' in kwargs:
        tissue_types_json = kwargs.pop('tissue_types_json')
    else:
        tissue_types_json = config['tissue_types_json']

    with open(tissue_types_json) as f:
        tissue_types = json.load(f)

    num_created = 0
    num_skipped = 0
    for tissue_type in tissue_types:
        _instance, created = TissueType.objects.get_or_create(
            type=sanitize_tumor_tissue(tissue_type))
        if created:
            num_created += 1
        else:
            num_skipped += 1
    logger.debug(
        "Added {new} new tissue types ({skipped} skipped) to the database".
        format(new=num_created, skipped=num_skipped))
Пример #5
0
def import_PMKB_bulk(entries):
    """
    Imports PMKB interpretations and variants to the database, inserting the
    variants with a single ``bulk_create``

    The rows of ``entries`` are walked twice:

    1. every distinct interpretation (interpretation text + citation +
       source) is stored with ``get_or_create`` and its instance remembered;
    2. a ``PMKBVariant`` is built for every distinct variant, linked to the
       remembered interpretation, and all of them are inserted at once.

    Variants are de-duplicated only against other rows of ``entries``; the
    database is not consulted before ``bulk_create``.

    Parameters:
        entries: table-like object exposing ``iterrows()`` (presumably a
            pandas DataFrame) with the columns ``Gene``, ``TumorType``,
            ``TissueType``, ``Variant``, ``Tier``, ``Interpretation``,
            ``Citation`` and ``Source``. ``Gene``, ``TumorType``,
            ``TissueType``, ``Variant``, ``Interpretation`` and ``Citation``
            are joined as-is to build the variant key, so they must be
            strings.

    Returns:
        list of the rows skipped because an identical variant appeared
        earlier in ``entries``.
    """
    num_created_interpretations = 0
    num_skipped_interpretations = 0
    # import unique interpretations first;
    # the concatenated interpretation fields serve as the key under which the
    # database instance is kept for the second pass
    logger.debug("Importing unique interpretations")
    unique_interpretations = {}
    for _, row in entries.iterrows():
        interpretation_data_str = "".join([
            str(row['Interpretation']),
            str(row['Citation']),
            str(row['Source'])
        ])
        if interpretation_data_str in unique_interpretations:
            num_skipped_interpretations += 1
            continue
        instance, created = PMKBInterpretation.objects.get_or_create(
            interpretation=row['Interpretation'],
            citations=row['Citation'],
            source_row=row['Source'],
        )
        unique_interpretations[interpretation_data_str] = instance
        if created:
            num_created_interpretations += 1
        else:
            num_skipped_interpretations += 1
    logger.debug(
        "Added {new} new interpretations ({skipped} skipped) to the database"
        .format(new=num_created_interpretations,
                skipped=num_skipped_interpretations))

    logger.debug("Getting bulk variant entries")
    num_created_variants = 0
    # list of bulk entries to import
    bulk_variants = []
    # list of skipped variants
    not_created = []
    # keys of the variants already queued; only membership is needed
    seen_variants = set()
    # foreign key instances keyed by the raw column value, so each distinct
    # tumor/tissue type is fetched from the database once instead of per row
    tumor_types = {}
    tissue_types = {}
    for _, row in entries.iterrows():
        # key of the interpretation stored for this row in the first pass
        interpretation_data_str = "".join([
            str(row['Interpretation']),
            str(row['Citation']),
            str(row['Source'])
        ])

        # set unique key for each variant
        variant_str = "".join([
            row['Gene'], row['TumorType'], row['TissueType'], row['Variant'],
            str(row['Tier']), row['Interpretation'], row['Citation'],
            str(row['Source'])
        ])
        if variant_str in seen_variants:
            not_created.append(row)
            continue
        seen_variants.add(variant_str)

        tumor_name = row['TumorType']
        if tumor_name not in tumor_types:
            tumor_types[tumor_name] = TumorType.objects.get(
                type=sanitize_tumor_tissue(tumor_name))
        tissue_name = row['TissueType']
        if tissue_name not in tissue_types:
            tissue_types[tissue_name] = TissueType.objects.get(
                type=sanitize_tumor_tissue(tissue_name))

        bulk_variants.append(
            PMKBVariant(
                gene=row['Gene'],
                tumor_type=tumor_types[tumor_name],
                tissue_type=tissue_types[tissue_name],
                variant=row['Variant'],
                tier=row['Tier'],
                interpretation=unique_interpretations[interpretation_data_str],
                source_row=row['Source'],
                uid=hashlib.md5(variant_str.encode('utf-8')).hexdigest()))
        num_created_variants += 1

    # add all variants to the database
    logger.debug("Importing bulk variant entries ({0} total)".format(
        len(bulk_variants)))
    PMKBVariant.objects.bulk_create(bulk_variants)

    total_db_variants = PMKBVariant.objects.count()
    total_db_interpretations = PMKBInterpretation.objects.count()
    logger.debug(
        "Added {num_created} variants to the database. {tot_var} total variants and {tot_interp} total interpretations in the database"
        .format(num_created=num_created_variants,
                tot_var=total_db_variants,
                tot_interp=total_db_interpretations))
    return not_created