Пример #1
0
def _extract(vcf_id):
    """Extract the genotypes from a VCF and insert into the DB.

    This also fills in a few fields in the vcfs table which aren't available
    until the entire VCF has been read, e.g. the variant count.

    Returns the vcf_id, or False if an error occurred.
    """
    engine = sqlalchemy.create_engine(DATABASE_URI)
    with tables(engine, 'vcfs') as (con, vcfs_table):
        metadata = sqlalchemy.MetaData(bind=con)
        metadata.reflect()
        vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone()

        # Validate the contents of the VCF before modifying the database.
        reader, header_text, release = load_vcf(vcf['uri'])

        # Fill in VCF header text, which is now available.
        (vcfs_table.update()
         .where(vcfs_table.c.id == vcf_id)
         .values(vcf_header=header_text, vcf_release=release)
         .execute())

        insert_genotypes_with_copy(reader, engine,
                                   default_values={'vcf_id': vcf_id},
                                   temporary_dir=TEMPORARY_DIR)

        update_extant_columns(metadata, con, vcf_id)
        update_vcf_count(metadata, con, vcf_id)
    return vcf_id
Пример #2
0
def _annotate(vcf_id):
    if vcf_id == False:
        return  # An error must have occurred earlier.

    engine = sqlalchemy.create_engine(DATABASE_URI)
    # See if we have any guesses for a release version
    with tables(engine, 'vcfs') as (con, vcfs):
        vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone()
        release = vcf['vcf_release']

    if not release:
        release = config.ENSEMBL_RELEASE

    # Only runs the first time for this release.
    EnsemblRelease(release).install()

    with tables(engine, 'genotypes') as (con, genotypes):
        metadata = sqlalchemy.MetaData(bind=con)
        metadata.reflect()
        annotations = get_varcode_annotations(genotypes, vcf_id, release)

        tmp_table = Table('gene_annotations',
                          metadata,
                          Column('contig', Text, nullable=False),
                          Column('position', Integer, nullable=False),
                          Column('reference', Text, nullable=False),
                          Column('alternates', Text, nullable=False),
                          Column('gene_name', Text, nullable=True),
                          Column('transcript', Text, nullable=True),
                          Column('notation', Text, nullable=True),
                          Column('effect_type', Text, nullable=True),
                          prefixes=['TEMPORARY'])

        try:
            tmp_table.create()
            write_to_table_via_csv(tmp_table, rows=annotations, connection=con)
            # Add gene names from temp table to genotypes.
            (genotypes.update()
             .where(genotypes.c.contig == tmp_table.c.contig)
             .where(genotypes.c.position == tmp_table.c.position)
             .where(genotypes.c.reference == tmp_table.c.reference)
             .where(genotypes.c.alternates == tmp_table.c.alternates)
             .where(genotypes.c.vcf_id == vcf_id)
             .values(
                 {
                    'annotations:gene_name': tmp_table.c.gene_name,
                    'annotations:transcript': tmp_table.c.transcript,
                    'annotations:effect_notation': tmp_table.c.notation,
                    'annotations:effect_type': tmp_table.c.effect_type
                 }
             )).execute()
        finally:
            con.execute("DISCARD TEMP")

        # We've added annotations:varcode_*, so update the columns to display.
        update_extant_columns(metadata, con, vcf_id)
        return vcf_id