def _extract(vcf_id): """Extract the genotypes from a VCF and insert into the DB. This also fills in a few fields in the vcfs table which aren't available until the entire VCF has been read, e.g. the variant count. Returns the vcf_id, or False if an error occurred. """ engine = sqlalchemy.create_engine(DATABASE_URI) with tables(engine, 'vcfs') as (con, vcfs_table): metadata = sqlalchemy.MetaData(bind=con) metadata.reflect() vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone() # Validate the contents of the VCF before modifying the database. reader, header_text, release = load_vcf(vcf['uri']) # Fill in VCF header text, which is now available. (vcfs_table.update() .where(vcfs_table.c.id == vcf_id) .values(vcf_header=header_text, vcf_release=release) .execute()) insert_genotypes_with_copy(reader, engine, default_values={'vcf_id': vcf_id}, temporary_dir=TEMPORARY_DIR) update_extant_columns(metadata, con, vcf_id) update_vcf_count(metadata, con, vcf_id) return vcf_id
def _annotate(vcf_id): if vcf_id == False: return # An error must have occurred earlier. engine = sqlalchemy.create_engine(DATABASE_URI) # See if we have any guesses for a release version with tables(engine, 'vcfs') as (con, vcfs): vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone() release = vcf['vcf_release'] if not release: release = config.ENSEMBL_RELEASE # Only runs the first time for this release. EnsemblRelease(release).install() with tables(engine, 'genotypes') as (con, genotypes): metadata = sqlalchemy.MetaData(bind=con) metadata.reflect() annotations = get_varcode_annotations(genotypes, vcf_id, release) tmp_table = Table('gene_annotations', metadata, Column('contig', Text, nullable=False), Column('position', Integer, nullable=False), Column('reference', Text, nullable=False), Column('alternates', Text, nullable=False), Column('gene_name', Text, nullable=True), Column('transcript', Text, nullable=True), Column('notation', Text, nullable=True), Column('effect_type', Text, nullable=True), prefixes=['TEMPORARY']) try: tmp_table.create() write_to_table_via_csv(tmp_table, rows=annotations, connection=con) # Add gene names from temp table to genotypes. (genotypes.update() .where(genotypes.c.contig == tmp_table.c.contig) .where(genotypes.c.position == tmp_table.c.position) .where(genotypes.c.reference == tmp_table.c.reference) .where(genotypes.c.alternates == tmp_table.c.alternates) .where(genotypes.c.vcf_id == vcf_id) .values( { 'annotations:gene_name': tmp_table.c.gene_name, 'annotations:transcript': tmp_table.c.transcript, 'annotations:effect_notation': tmp_table.c.notation, 'annotations:effect_type': tmp_table.c.effect_type } )).execute() finally: con.execute("DISCARD TEMP") # We've added annotations:varcode_*, so update the columns to display. update_extant_columns(metadata, con, vcf_id) return vcf_id