def vcf_import_worker(queue, file_id, samples):
    """
    Worker loop that executes the SQL queries pushed in the shared queue.

    The loop ends when a ``None`` sentinel is read from the queue.

    :param queue: queue of SQL queries to execute; ``None`` stops the worker
    :param file_id: not used by the worker itself
    :param samples: not used by the worker itself
    """
    while True:
        query = queue.get()
        if query is None:
            break
        try:
            Model.execute(query)
        finally:
            # Always acknowledge the item: if a failing query skipped
            # task_done(), any queue.join() waiting for the batch to be
            # processed would block forever. The exception still propagates.
            queue.task_done()
def create_annotation_db(reference_id, reference_name, table_name, vcf_annotation_metadata):
    """
    Create and register the annotation table described by the VCF metadata
    (as retrieved by the prepare_vcf_parsing method).

    One SQL batch is built (CREATE TABLE, indexes, registration rows in
    annotation_database and annotation_field) and executed, then the missing
    annotation_field uids are computed.

    :param reference_id: id of the reference, stored in annotation_database
    :param reference_name: not used by this function
    :param table_name: name of the table to create (also used as database name)
    :param vcf_annotation_metadata: dict read for its 'db_type', 'columns',
        'db_pk_field', 'version', 'name' and 'description' entries
    :return: tuple (db_uid, db_map) where db_map maps each normalised column
        name to a dict with 'name', 'type' and 'name_ui'
    """
    # NOTE(review): every value is interpolated in the SQL text as is; a quote
    # in a column name or in the description would break the batch - confirm
    # that callers sanitise the metadata.
    meta = vcf_annotation_metadata
    is_transcript = meta['db_type'] == 'transcript'
    columns = meta['columns']

    # By default every annotation column is created as text. Type can be
    # changed by user via a dedicated UI
    db_map = {}
    column_defs = []
    for label in columns:
        sql_name = normalise_annotation_name(label)
        column_defs.append("{} text".format(sql_name))
        db_map[sql_name] = {'name': sql_name, 'type': 'string', 'name_ui': label}

    # Transcript databases get an extra transcript_id column in the table and
    # in its unique constraint
    create_table = (
        "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, "
        + ('transcript_id character varying(100), ' if is_transcript else '')
        + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id"
        + (',transcript_id' if is_transcript else '')
        + "));"
    )
    statements = [
        create_table.format(table_name, ', '.join(column_defs)),
        "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name),
        "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name),
    ]
    if is_transcript:
        statements.append(
            "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name))

    # Register annotation: uids are computed by the database
    uid_query = "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
        table_name, normalise_annotation_name(meta['db_pk_field']))
    db_uid, pk_uid = Model.execute(uid_query).first()

    statements.append(
        "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES ")
    # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in
    # the jointure as this condition is already done by a previous query when
    # updating working table with annotations
    statements.append(
        "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(
            db_uid,
            reference_id,
            table_name,
            meta['version'],
            meta['name'],
            meta['description'],
            30,
            meta['db_type'],
            pk_uid))

    statements.append(
        "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES ")
    statements.extend(
        "('{0}', {1}, '{2}', '{3}', 'string'),".format(
            db_uid, position, normalise_annotation_name(label), label)
        for position, label in enumerate(columns))

    # The last character (trailing comma of the last VALUES row) is dropped
    Model.execute("".join(statements)[:-1])
    Model.execute(
        "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
    return db_uid, db_map
def delete(self, project_id):
    """
    Delete the project and return its json representation.

    All its analyses are put into the trash project (id = 0).

    :param project_id: id of the project to delete
    :return: result of project.to_json(), taken before the deletion
    :raises RegovarException: code E102001 when the project is not found
    """
    project = Model.Project.from_id(project_id)
    if not project:
        raise RegovarException(code="E102001", arg=[project_id])

    # Both statements are sent in a single call
    statements = "; ".join((
        "UPDATE analysis SET project_id=0 WHERE project_id={0}",
        "DELETE FROM project WHERE id={0}",
    )).format(project.id)
    snapshot = project.to_json()
    Model.execute(statements)
    return snapshot
def init(self, headers, reference_id):
    """
    Check VCF headers and return True if SnpEff data can be imported; False otherwise.

    When importable SnpEff data are found, the internal data of the importer
    are initialised (name, reference_id, description, columns, version,
    table_name, vcf_flag, columns_definitions) and the annotation table is
    checked.

    Old style SnpEff annotations (EFF info field) are not implemented: an
    error is logged and nothing is imported. Headers without INFO section, or
    an ANN description without the 'Functional annotations:' marker, are
    reported as not importable instead of raising.

    :param headers: dict of the VCF headers; 'SnpEffVersion' and 'INFO' are read
    :param reference_id: id of the reference used to name the annotation table
    :return: True when the 'Feature_Id' column is found in the ANN description
    """
    result = False
    if 'SnpEffVersion' in headers:
        info = headers['INFO'] if 'INFO' in headers else {}
        if 'EFF' in info:
            err("TODO: Old SnpEff annotation (EFF) importation is not implemented")
        elif 'ANN' in info:
            vcf_flag = 'ANN'
            data = info[vcf_flag]['description'].split('Functional annotations:')
            # Without the marker there is no column list to parse
            if len(data) > 1:
                reference_name = Model.execute(
                    "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
                self.name = "SnpEff"
                self.reference_id = reference_id
                self.description = "SnpEff variant annotation and effect prediction tool."
                self.columns = [
                    self.normalise_annotation_name(c).title()
                    for c in data[1].strip().strip("'").split('|')
                ]
                self.version = headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0]
                self.table_name = self.normalise_annotation_name(
                    '{}_{}_{}'.format('SnpEff', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = SnpEffImporter.columns_definitions
                result = 'Feature_Id' in self.columns
                if result:
                    self.check_annotation_table()
    print("SnpEff init : ", result)
    return result
def delete(self, project_id, author_id=None):
    """
    Delete the project, log the event and return the project json.

    All its analyses are put into the trash project (id = 0).

    :param project_id: id of the project to delete
    :param author_id: id forwarded to the event log (optional)
    :return: result of project.to_json(), taken before the deletion
    :raises RegovarException: code E102001 when the project is not found
    """
    from core.core import core

    project = Model.Project.from_id(project_id)
    if not project:
        raise RegovarException(code="E102001", arg=[project_id])

    # Both statements are sent in a single call
    statements = "; ".join((
        "UPDATE analysis SET project_id=0 WHERE project_id={0}",
        "DELETE FROM project WHERE id={0}",
    )).format(project.id)
    snapshot = project.to_json()
    Model.execute(statements)

    message = "Project moved to trash: {}.".format(project.name)
    core.events.log(author_id, "info", {"project_id": project.id}, message)
    return snapshot
def create_annotation_db(reference_id, reference_name, table_name, vcf_annotation_metadata):
    """
    Create and register the annotation table described by the VCF metadata
    (as retrieved by the prepare_vcf_parsing method).

    One SQL batch is built (CREATE TABLE, indexes, registration rows in
    annotation_database and annotation_field) and executed, then the missing
    annotation_field uids are computed.

    :param reference_id: id of the reference, stored in annotation_database
    :param reference_name: not used by this function
    :param table_name: name of the table to create (also used as database name)
    :param vcf_annotation_metadata: dict read for its 'db_type', 'columns',
        'db_pk_field', 'version', 'name' and 'description' entries
    :return: tuple (db_uid, db_map) where db_map maps each normalised column
        name to a dict with 'name', 'type' and 'name_ui'
    """
    # NOTE(review): every value is interpolated in the SQL text as is; a quote
    # in a column name or in the description would break the batch - confirm
    # that callers sanitise the metadata.
    meta = vcf_annotation_metadata
    is_transcript = meta['db_type'] == 'transcript'
    columns = meta['columns']

    # By default every annotation column is created as text. Type can be
    # changed by user via a dedicated UI
    db_map = {}
    column_defs = []
    for label in columns:
        sql_name = normalise_annotation_name(label)
        column_defs.append("{} text".format(sql_name))
        db_map[sql_name] = {'name': sql_name, 'type': 'string', 'name_ui': label}

    # Transcript databases get an extra transcript_id column in the table and
    # in its unique constraint
    create_table = (
        "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, "
        + ('transcript_id character varying(100), ' if is_transcript else '')
        + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id"
        + (',transcript_id' if is_transcript else '')
        + "));"
    )
    statements = [
        create_table.format(table_name, ', '.join(column_defs)),
        "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name),
        "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name),
    ]
    if is_transcript:
        statements.append(
            "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name))

    # Register annotation: uids are computed by the database
    uid_query = "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
        table_name, normalise_annotation_name(meta['db_pk_field']))
    db_uid, pk_uid = Model.execute(uid_query).first()

    statements.append(
        "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES ")
    # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in
    # the jointure as this condition is already done by a previous query when
    # updating working table with annotations
    statements.append(
        "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(
            db_uid,
            reference_id,
            table_name,
            meta['version'],
            meta['name'],
            meta['description'],
            30,
            meta['db_type'],
            pk_uid))

    statements.append(
        "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES ")
    statements.extend(
        "('{0}', {1}, '{2}', '{3}', 'string'),".format(
            db_uid, position, normalise_annotation_name(label), label)
        for position, label in enumerate(columns))

    # The last character (trailing comma of the last VALUES row) is dropped
    Model.execute("".join(statements)[:-1])
    Model.execute(
        "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
    return db_uid, db_map
def init(self, headers, reference_id):
    """
    Check VCF headers and return True if SnpEff data can be imported; False otherwise.

    When importable SnpEff data are found, the internal data of the importer
    are initialised (name, reference_id, description, columns, version,
    table_name, vcf_flag, columns_definitions) and the annotation table is
    checked.

    Old style SnpEff annotations (EFF info field) are not implemented: an
    error is logged and nothing is imported. Headers without INFO section, or
    an ANN description without the 'Functional annotations:' marker, are
    reported as not importable instead of raising.

    :param headers: dict of the VCF headers; 'SnpEffVersion' and 'INFO' are read
    :param reference_id: id of the reference used to name the annotation table
    :return: True when the 'Feature_Id' column is found in the ANN description
    """
    result = False
    if 'SnpEffVersion' in headers:
        info = headers['INFO'] if 'INFO' in headers else {}
        if 'EFF' in info:
            err("TODO: Old SnpEff annotation (EFF) importation is not implemented")
        elif 'ANN' in info:
            vcf_flag = 'ANN'
            data = info[vcf_flag]['description'].split('Functional annotations:')
            # Without the marker there is no column list to parse
            if len(data) > 1:
                reference_name = Model.execute(
                    "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
                self.name = "SnpEff"
                self.reference_id = reference_id
                self.description = "SnpEff variant annotation and effect prediction tool."
                self.columns = [
                    self.normalise_annotation_name(c).title()
                    for c in data[1].strip().strip("'").split('|')
                ]
                self.version = headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0]
                self.table_name = self.normalise_annotation_name(
                    '{}_{}_{}'.format('SnpEff', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = SnpEffImporter.columns_definitions
                result = 'Feature_Id' in self.columns
                if result:
                    self.check_annotation_table()
    print("SnpEff init : ", result)
    return result
def init(self, headers, reference_id):
    """
    Check VCF headers and return True if VEP data can be imported; False otherwise.

    When importable VEP data are found, the internal data of the importer are
    initialised (name, reference_id, description, columns, version,
    table_name, vcf_flag, columns_definitions) and the annotation table is
    checked.

    The annotations are read from the CSQ info field, or from ANN when CSQ is
    absent. Headers without INFO section, or a description without the
    'Format:' marker, are reported as not importable instead of raising.

    :param headers: dict of the VCF headers; 'VEP' and 'INFO' are read
    :param reference_id: id of the reference used to name the annotation table
    :return: True when the 'Feature' column is found in the description
    """
    result = False
    if 'VEP' in headers:
        info = headers['INFO'] if 'INFO' in headers else {}
        vcf_flag = None
        if 'CSQ' in info:
            vcf_flag = 'CSQ'
        elif 'ANN' in info:
            vcf_flag = 'ANN'

        if vcf_flag:
            data = info[vcf_flag]['description'].split('Format:')
            # Without the marker there is no column list to parse
            if len(data) > 1:
                reference_name = Model.execute(
                    "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
                self.name = "VEP"
                self.reference_id = reference_id
                self.description = data[0].strip()
                self.columns = [
                    self.normalise_annotation_name(c).title()
                    for c in data[1].strip().split('|')
                ]
                self.version = headers['VEP'][0].split(' ')[0]
                self.table_name = self.normalise_annotation_name(
                    '{}_{}_{}'.format('VEP', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = VepImporter.columns_definitions
                result = 'Feature' in self.columns
                if result:
                    self.check_annotation_table()
    print("VEP init : ", result)
    return result
def prepare_annotation_db(reference_id, vcf_annotation_metadata):
    """
    Prepare database for import of custom annotation, and set the mapping
    between VCF info fields and DB schema.

    The annotation table is created when it is not registered yet; otherwise
    the columns found in the VCF but missing in the table are added.

    :param reference_id: id of the reference (its table_suffix names the table)
    :param vcf_annotation_metadata: dict read for 'name', 'version' and
        'columns'; updated in place with 'table', 'db_uid', 'db_pk_field_uid'
        and 'db_map'
    :return: the updated vcf_annotation_metadata
    """
    meta = vcf_annotation_metadata
    reference = Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
    table_name = normalise_annotation_name(
        '{}_{}_{}'.format(meta['name'], meta['version'], reference))

    # Get database schema (if available)
    registered = Model.execute(
        "SELECT uid FROM annotation_database WHERE name='{}'".format(table_name)).first()
    if registered is None:
        # No table in db for these annotation : create new table
        db_uid, table_cols = create_annotation_db(reference_id, reference, table_name, meta)
    else:
        # Table already exists : retrieve columns already defined
        db_uid = registered[0]
        field_rows = Model.execute(
            "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid))
        table_cols = {
            row.name: {'name': row.name, 'type': row.type, 'name_ui': row.name_ui}
            for row in field_rows
        }

    # Get diff between columns in vcf and columns in DB, and update DB schema
    missing = [col for col in meta['columns']
               if normalise_annotation_name(col) not in table_cols]
    if missing:
        # NOTE(review): ord starts at the number of VCF columns, not at the
        # number of fields already registered - confirm this cannot collide
        # with existing ord values.
        offset = len(meta['columns'])
        alterations = []
        for idx, col in enumerate(missing):
            name = normalise_annotation_name(col)
            alterations.append(
                "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col))
            table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}
        Model.execute("".join(alterations))

    # Update vcf_annotation_metadata with database mapping
    pk_row = Model.execute(
        "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".format(db_uid)).first()
    meta.update({
        'table': table_name,
        'db_uid': db_uid,
        'db_pk_field_uid': pk_row.db_pk_field_uid,
    })
    meta['db_map'] = {
        col: table_cols[normalise_annotation_name(col)] for col in meta['columns']
    }
    return meta
def import_delegate(self, file_id, vcf_reader, reference_id, db_ref_suffix, vcf_metadata, samples):
    """
    Do the "real" import of a VCF: variants, sample/variant links and annotations.

    Meant to be called by the "import_data" method in a new thread so that the
    main thread is not blocked. SQL statements are accumulated while the VCF
    rows are read and pushed by batch in self.queue, where the worker threads
    (self.workers) execute them. Once all batches are executed, the workers
    are stopped, the is_composite flags are computed per sample, the samples
    are marked as ready and the analyses waiting for them are initialised.

    :param file_id: id of the imported file (only used in notifications)
    :param vcf_reader: iterable of VCF records (chrom, pos, ref, alleles,
        samples, filter, qual and info are read on each record)
    :param reference_id: id of the reference (only used in notifications)
    :param db_ref_suffix: suffix of the variant, sample_variant and refgene tables
    :param vcf_metadata: dict read for 'count', 'header_count' and 'annotations'
    :param samples: dict sample name -> dict with at least "id" and "name"
    """
    from core.core import core

    # parsing vcf file
    records_count = vcf_metadata['count']
    records_current = 0
    vcf_line = vcf_metadata['header_count']
    table = "variant" + db_ref_suffix
    # Comma separated ids of the imported samples (used by several queries)
    sample_ids = ",".join([str(samples[sid]["id"]) for sid in samples])

    sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}]) WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
    sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, vcf_line, bin, chr, pos, ref, alt, genotype, depth, depth_alt, quality, filter) SELECT {0}, id, {1}, {2}, '{3}', {4}, '{5}', '{6}', {7}, {8}, {9}, {10}, '{11}' FROM variant" + db_ref_suffix + " WHERE bin={2} AND chr={3} AND pos={4} AND ref='{5}' AND alt='{6}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
    # TODO : do update on conflict
    sql_annot_trx = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, regovar_trx_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, regovar_trx_id) DO NOTHING; "
    # Pattern for variant level annotations; not used by this method yet
    sql_annot_var = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, {1}) SELECT id, {3},{4},{5},'{6}','{7}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id) DO NOTHING;"

    # Statement buffers: variants, sample/variant links, annotations
    sql_query1 = ""
    sql_query2 = ""
    sql_query3 = ""
    count = 0

    for row in vcf_reader:
        records_current += 1
        vcf_line += 1
        chrm = normalize_chr(str(row.chrom))

        for allele in row.alleles:
            pos, ref, alt = normalise(row.pos, row.ref, allele)
            bin = getMaxUcscBin(pos, pos + len(ref))

            # get list of sample that have this variant (chr-pos-ref-alt)
            samples_array = []
            for sn, sp in row.samples.items():
                if allele in sp.alleles:
                    samples_array.append(samples[sp.name]["id"])
            if len(samples_array) == 0:
                continue

            # save variant
            samples_array = ",".join([str(s) for s in samples_array])
            sql_query1 += sql_pattern1.format(
                table, chrm, pos, ref, alt, is_transition(ref, alt), bin, samples_array)

            # Register variant/sample associations
            for sn, sp in row.samples.items():
                gt = normalize_gt(sp)
                filters = escape_value_for_sql(json.dumps(row.filter.keys()))
                count += 1
                if allele in sp.alleles:
                    if "AD" in sp.keys():
                        # Get allelic depth if exists (AD field)
                        depth_alt = sp["AD"][sp.alleles.index(allele)]
                    elif "DP4" in sp.keys():
                        if gt == 0:
                            depth_alt = sum(sp["DP4"])
                        else:
                            depth_alt = sp["DP4"][2] + sp["DP4"][3] if alt != ref else sp["DP4"][0] + sp["DP4"][1]
                    else:
                        depth_alt = "NULL"
                    sql_query2 += sql_pattern2.format(
                        samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, gt,
                        get_info(sp, "DP"), depth_alt, row.qual, filters)
                else:
                    # save that the sample HAVE NOT this variant
                    sql_query2 += sql_pattern2.format(
                        samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, "NULL",
                        get_info(sp, "DP"), "NULL", row.qual, filters)

            # Register variant annotations
            for importer in vcf_metadata["annotations"].values():
                if importer:
                    importer_query, importer_count = importer.import_annotations(
                        sql_annot_trx, bin, chrm, pos, ref, alt, row.info)
                    sql_query3 += importer_query
                    count += importer_count

        # split big request to avoid sql out of memory transaction or too long freeze of the server
        if count >= 5000:
            progress = records_current / records_count
            count = 0
            transaction = sql_query1 + sql_query2 + sql_query3
            log("VCF import : line {} (chrm {})".format(records_current, chrm))
            log("VCF import : Execute sync query {}/{} ({}%)".format(
                records_current, records_count, round(progress * 100, 2)))

            # update sample's progress indicator
            # note : as we are updating lot of data in the database with several asynch thread
            #        so to avoid conflict with session, we update data from "manual query"
            sql = "UPDATE sample SET loading_progress={} WHERE id IN ({})".format(
                progress, sample_ids)
            Model.execute(sql)
            core.notify_all({
                "action": "import_vcf_processing",
                "data": {
                    "reference_id": reference_id,
                    "file_id": file_id,
                    "status": "loading",
                    "progress": progress,
                    "samples": [{"id": samples[sname]["id"], "name": sname} for sname in samples]
                }
            })

            log("VCF import : enqueue query")
            self.queue.put(transaction)
            # Reset query buffers
            sql_query1 = ""
            sql_query2 = ""
            sql_query3 = ""

    # Loop done, execute last pending query
    log("VCF import : Execute last async query")
    transaction = sql_query1 + sql_query2 + sql_query3
    if transaction:
        self.queue.put(transaction)

    # Waiting that all query in the queue was executed
    log("VCF parsing done. Waiting for async execution of sql queries")
    # block until all tasks are done
    self.queue.join()
    log("No more sql query to proceed")

    # stop vcf_import_thread_workers
    for i in range(VCF_IMPORT_MAX_THREAD):
        self.queue.put(None)
    for t in self.workers:
        t.join()

    # Compute composite variant by sample.
    # The genotype alternatives are parenthesised: AND binds tighter than OR,
    # so without the parentheses the genotype=3 rows of every sample would be
    # taken into account for the current sample.
    sql_pattern = "UPDATE sample_variant" + db_ref_suffix + " u SET is_composite=TRUE WHERE u.sample_id = {0} AND u.variant_id IN (SELECT DISTINCT UNNEST(sub.vids) as variant_id FROM (SELECT array_agg(v.variant_id) as vids, g.name2 FROM sample_variant" + db_ref_suffix + " v INNER JOIN refgene" + db_ref_suffix + " g ON g.chr=v.chr AND g.trxrange @> v.pos WHERE v.sample_id={0} AND (v.genotype=2 or v.genotype=3) GROUP BY name2 HAVING count(*) > 1) AS sub)"
    log("Computing is_composite fields by samples :")
    for sid in samples:
        query = sql_pattern.format(samples[sid]["id"])
        log(" - sample {}".format(samples[sid]["id"]))
        Model.execute(query)
    log("Sample import from VCF Done")

    # update sample's progress indicator
    Model.execute(
        "UPDATE sample SET status='ready', loading_progress=1 WHERE id IN ({})".format(sample_ids))
    core.notify_all({
        "action": "import_vcf_end",
        "data": {
            "reference_id": reference_id,
            "file_id": file_id,
            "msg": "Import done without error.",
            "samples": [{"id": samples[s]["id"], "name": samples[s]["name"]} for s in samples.keys()]
        }
    })

    # When import is done, check if analysis are waiting for creation and then start wt creation if all sample are ready
    # TODO
    sql = "SELECT DISTINCT(analysis_id) FROM analysis_sample WHERE sample_id IN ({})".format(sample_ids)
    for row in Model.execute(sql):
        analysis = Model.Analysis.from_id(row.analysis_id, 1)
        if analysis.status == "waiting":
            log("Auto initialisation of the analysis in witing state : {} ({})".format(
                analysis.name, analysis.id))
            core.filters.request(analysis.id, analysis.filter, analysis.fields)
def prepare_annotation_db(reference_id, vcf_annotation_metadata):
    """
    Prepare database for import of custom annotation, and set the mapping
    between VCF info fields and DB schema.

    The annotation table is created when it is not registered yet; otherwise
    the columns found in the VCF but missing in the table are added.

    :param reference_id: id of the reference (its table_suffix names the table)
    :param vcf_annotation_metadata: dict read for 'name', 'version' and
        'columns'; updated in place with 'table', 'db_uid', 'db_pk_field_uid'
        and 'db_map'
    :return: the updated vcf_annotation_metadata
    """
    meta = vcf_annotation_metadata
    reference = Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
    table_name = normalise_annotation_name(
        '{}_{}_{}'.format(meta['name'], meta['version'], reference))

    # Get database schema (if available)
    registered = Model.execute(
        "SELECT uid FROM annotation_database WHERE name='{}'".format(table_name)).first()
    if registered is None:
        # No table in db for these annotation : create new table
        db_uid, table_cols = create_annotation_db(reference_id, reference, table_name, meta)
    else:
        # Table already exists : retrieve columns already defined
        db_uid = registered[0]
        field_rows = Model.execute(
            "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid))
        table_cols = {
            row.name: {'name': row.name, 'type': row.type, 'name_ui': row.name_ui}
            for row in field_rows
        }

    # Get diff between columns in vcf and columns in DB, and update DB schema
    missing = [col for col in meta['columns']
               if normalise_annotation_name(col) not in table_cols]
    if missing:
        # NOTE(review): ord starts at the number of VCF columns, not at the
        # number of fields already registered - confirm this cannot collide
        # with existing ord values.
        offset = len(meta['columns'])
        alterations = []
        for idx, col in enumerate(missing):
            name = normalise_annotation_name(col)
            alterations.append(
                "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col))
            table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}
        Model.execute("".join(alterations))

    # Update vcf_annotation_metadata with database mapping
    pk_row = Model.execute(
        "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".format(db_uid)).first()
    meta.update({
        'table': table_name,
        'db_uid': db_uid,
        'db_pk_field_uid': pk_row.db_pk_field_uid,
    })
    meta['db_map'] = {
        col: table_cols[normalise_annotation_name(col)] for col in meta['columns']
    }
    return meta
async def import_data(self, file_id, **kargs):
    """
    Import samples, variants and annotations from the provided file.

    The header of the VCF is parsed to create one sample per VCF sample and
    to get the number of records (used for the progress indicator). The
    parsing of the records is delegated to import_delegate, started through
    run_async.

    :param file_id: id of the file to import
    :param kargs: must contain "reference_id"
    :return: {"success": True, "samples": ..., "records_count": ...} when the
        import is started; {"success": False, "error": ...} when
        prepare_vcf_parsing returns nothing; None when the VCF has no sample
    """
    from core.core import core

    vcf_file = Model.File.from_id(file_id)
    filepath = vcf_file.path
    reference_id = kargs["reference_id"]
    vcf_metadata = prepare_vcf_parsing(reference_id, filepath)
    suffix_row = Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()
    db_ref_suffix = "_" + suffix_row.table_suffix

    if not vcf_metadata:
        return {"success": False, "error": "File not supported"}

    # a tmp file have been created by prepare_vcf_parsing() method to avoid
    # pysam unsupported file format.
    filepath += ".regovar_import"
    start = datetime.datetime.now()
    # Create vcf parser
    vcf_reader = VariantFile(filepath)

    # get samples in the VCF
    samples = {}
    for sample_name in vcf_reader.header.samples:
        sample = Model.Sample.new()
        sample.name = sample_name
        sample.file_id = file_id
        sample.reference_id = reference_id
        sample.filter_description = {
            flt_id: flt.description for flt_id, flt in vcf_reader.header.filters.items()
        }
        sample.default_dbuid = []
        sample.status = "loading"
        for importer in vcf_metadata["annotations"].values():
            if importer:
                sample.default_dbuid.append(importer.db_uid)
        # TODO : is_mosaic according to the data in the vcf
        sample.save()
        # As these sample will be shared with other threads, only their json
        # representation is kept (to avoid sql session errors)
        samples[sample_name] = sample.to_json()

    if not samples:
        war("VCF files without sample cannot be imported in the database.")
        await core.notify_all_co({
            "action": "import_vcf_error",
            "data": {
                "reference_id": reference_id,
                "file_id": file_id,
                "msg": "VCF files without sample cannot be imported in the database."
            }
        })
        return

    # NOTE: the initialisation of the shared queue and of the worker threads
    # (vcf_import_worker) is disabled in this version.

    await core.notify_all_co({
        "action": "import_vcf_start",
        "data": {
            "reference_id": reference_id,
            "file_id": file_id,
            "samples": [{"id": samples[sid]["id"], "name": samples[sid]["name"]} for sid in samples]
        }
    })

    records_count = vcf_metadata["count"]
    log("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}".format(
        filepath, records_count, len(samples), reprlib.repr(list(samples)), start))
    run_async(self.import_delegate, file_id, vcf_reader, reference_id,
              db_ref_suffix, vcf_metadata, samples)
    return {"success": True, "samples": samples, "records_count": records_count}
def import_delegate(self, file_id, vcf_reader, reference_id, db_ref_suffix, vcf_metadata, samples):
    """
    Do the "real" import of a VCF: variants, sample/variant links and annotations.

    Meant to be called by the "import_data" method in a new thread so that the
    main thread is not blocked. SQL statements are accumulated while the VCF
    rows are read and executed synchronously by batch, each batch wrapped in
    its own BEGIN/COMMIT. The statements still pending at the end of the file
    are executed as a last batch. Then the samples are marked as ready and
    the analyses waiting for them are initialised.

    :param file_id: id of the imported file (only used in notifications)
    :param vcf_reader: iterable of VCF records (chrom, pos, ref, alleles,
        samples, filter, qual and info are read on each record)
    :param reference_id: id of the reference (only used in notifications)
    :param db_ref_suffix: suffix of the variant, sample_variant and refgene tables
    :param vcf_metadata: dict read for 'count', 'header_count' and 'annotations'
    :param samples: dict sample name -> dict with at least "id" and "name"
    """
    from core.core import core

    # parsing vcf file
    records_count = vcf_metadata['count']
    records_current = 0
    vcf_line = vcf_metadata['header_count']
    table = "variant" + db_ref_suffix
    # Comma separated ids of the imported samples (used by several queries)
    sample_ids = ",".join([str(samples[sid]["id"]) for sid in samples])

    sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}]) WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
    sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, vcf_line, bin, chr, pos, ref, alt, genotype, depth, depth_alt, quality, filter) SELECT {0}, id, {1}, {2}, '{3}', {4}, '{5}', '{6}', {7}, {8}, {9}, {10}, '{11}' FROM variant" + db_ref_suffix + " WHERE bin={2} AND chr={3} AND pos={4} AND ref='{5}' AND alt='{6}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
    # TODO : do update on conflict
    sql_annot_trx = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, regovar_trx_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, regovar_trx_id) DO NOTHING; "
    # Pattern for variant level annotations; not used by this method yet
    sql_annot_var = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, {1}) SELECT id, {3},{4},{5},'{6}','{7}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id) DO NOTHING;"

    # Statement buffers: variants, sample/variant links, annotations
    sql_query1 = ""
    sql_query2 = ""
    sql_query3 = ""
    count = 0

    for row in vcf_reader:
        records_current += 1
        vcf_line += 1
        chrm = normalize_chr(str(row.chrom))

        for allele in row.alleles:
            pos, ref, alt = normalise(row.pos, row.ref, allele)
            bin = getMaxUcscBin(pos, pos + len(ref))

            # get list of sample that have this variant (chr-pos-ref-alt)
            samples_array = []
            for sn, sp in row.samples.items():
                if allele in sp.alleles:
                    samples_array.append(samples[sp.name]["id"])
            if len(samples_array) == 0:
                continue

            # save variant
            samples_array = ",".join([str(s) for s in samples_array])
            sql_query1 += sql_pattern1.format(
                table, chrm, pos, ref, alt, is_transition(ref, alt), bin, samples_array)

            # Register variant/sample associations
            for sn, sp in row.samples.items():
                gt = normalize_gt(sp)
                filters = escape_value_for_sql(json.dumps(row.filter.keys()))
                count += 1
                if allele in sp.alleles:
                    if "AD" in sp.keys():
                        # Get allelic depth if exists (AD field)
                        depth_alt = sp["AD"][sp.alleles.index(allele)]
                    elif "DP4" in sp.keys():
                        if gt == 0:
                            depth_alt = sum(sp["DP4"])
                        else:
                            depth_alt = sp["DP4"][2] + sp["DP4"][3] if alt != ref else sp["DP4"][0] + sp["DP4"][1]
                    else:
                        depth_alt = "NULL"
                    sql_query2 += sql_pattern2.format(
                        samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, gt,
                        get_info(sp, "DP"), sqlc(depth_alt), sqlc(row.qual), filters)
                else:
                    # save that the sample HAVE NOT this variant
                    sql_query2 += sql_pattern2.format(
                        samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, "NULL",
                        get_info(sp, "DP"), "NULL", sqlc(row.qual), filters)

            # Register variant annotations
            for importer in vcf_metadata["annotations"].values():
                if importer:
                    importer_query, importer_count = importer.import_annotations(
                        sql_annot_trx, bin, chrm, pos, ref, alt, row.info)
                    sql_query3 += importer_query
                    count += importer_count

        # split big request to avoid sql out of memory transaction or too long freeze of the server
        if count >= 1000:
            progress = records_current / records_count
            count = 0
            transaction = "BEGIN; " + sql_query1 + sql_query2 + sql_query3 + "COMMIT; "
            log("VCF import : line {} (chrm {})".format(records_current, chrm))
            log("VCF import : Execute sync query {}/{} ({}%)".format(
                records_current, records_count, round(progress * 100, 2)))

            # update sample's progress indicator with a "manual query" (the
            # sql session is not used here)
            sql = "UPDATE sample SET loading_progress={} WHERE id IN ({})".format(
                progress, sample_ids)
            Model.execute(sql)
            core.notify_all({
                "action": "import_vcf_processing",
                "data": {
                    "reference_id": reference_id,
                    "file_id": file_id,
                    "status": "loading",
                    "progress": progress,
                    "samples": [{"id": samples[sname]["id"], "name": sname} for sname in samples]
                }
            })

            log("VCF import : execute query")
            Model.execute(transaction)
            # Reset query buffers
            sql_query1 = ""
            sql_query2 = ""
            sql_query3 = ""

    # Loop done: execute the statements accumulated since the last batch,
    # otherwise the end of the file (or the whole of a small file) would
    # never be written in the database.
    if sql_query1 or sql_query2 or sql_query3:
        log("VCF import : execute last pending query")
        Model.execute("BEGIN; " + sql_query1 + sql_query2 + sql_query3 + "COMMIT; ")

    # NOTE: the asynchronous execution through self.queue and the worker
    # threads is disabled in this version; queries are executed synchronously.

    # Compute composite variant by sample (the execution is currently
    # disabled, see the commented loop below). The genotype alternatives are
    # parenthesised because AND binds tighter than OR.
    sql_pattern = "UPDATE sample_variant" + db_ref_suffix + " u SET is_composite=TRUE WHERE u.sample_id = {0} AND u.variant_id IN (SELECT DISTINCT UNNEST(sub.vids) as variant_id FROM (SELECT array_agg(v.variant_id) as vids, g.name2 FROM sample_variant" + db_ref_suffix + " v INNER JOIN refgene" + db_ref_suffix + " g ON g.chr=v.chr AND g.trxrange @> v.pos WHERE v.sample_id={0} AND (v.genotype=2 or v.genotype=3) GROUP BY name2 HAVING count(*) > 1) AS sub)"
    log("Computing is_composite fields by samples :")
    # for sid in samples:
    #     query = sql_pattern.format(samples[sid]["id"])
    #     log(" - sample {}".format(samples[sid]["id"]))
    #     Model.execute(query)
    log("Sample import from VCF Done")

    # update sample's progress indicator
    Model.execute(
        "UPDATE sample SET status='ready', loading_progress=1 WHERE id IN ({})".format(sample_ids))
    core.notify_all({
        "action": "import_vcf_end",
        "data": {
            "reference_id": reference_id,
            "file_id": file_id,
            "msg": "Import done without error.",
            "samples": [{"id": samples[s]["id"], "name": samples[s]["name"]} for s in samples.keys()]
        }
    })

    # When import is done, check if analysis are waiting for creation and then start wt creation if all sample are ready
    # TODO
    sql = "SELECT DISTINCT(analysis_id) FROM analysis_sample WHERE sample_id IN ({})".format(sample_ids)
    for row in Model.execute(sql):
        analysis = Model.Analysis.from_id(row.analysis_id, 1)
        if analysis.status == "waiting":
            log("Auto initialisation of the analysis in witing state : {} ({})".format(
                analysis.name, analysis.id))
            core.filters.request(analysis.id, analysis.filter, analysis.fields)
version = sys.argv[2]
# Example values used during development:
#   hpopath = "/var/regovar/databases/"
#   version = "2018-03-09 09:06"
print(version)

# create path to hpo files to import (hpopath is used as a plain string prefix)
obopath, annotpath, nannotpath, diseapath, phenopath = (
    hpopath + filename
    for filename in (
        "hpo.obo",
        "hpo_annotation.txt",
        "hpo_annotation_neg.txt",
        "hpo_disease.txt",
        "hpo_phenotype.txt",
    )
)

# Clear HPO tables
print('Clear database: ', end='', flush=True)
Model.execute("DELETE FROM hpo_phenotype")
Model.execute("DELETE FROM hpo_disease")
print('Done')

# temp dict that store direct child relation between a term and all its childs
p_data, d_data = {}, {}  # phenotype oriented data, disease oriented data


# TOOLS
def escape(value):
    """
    Return value with '%%' replaced by '%' and single quotes doubled.

    Values that are not exactly of type str are returned unchanged.
    """
    if type(value) is not str:
        return value
    return value.replace('%%', '%').replace("'", "''")
async def import_data(self, file_id, **kargs):
    """
        Import samples, variants and annotations from the provided file.

        This method reads the header of the VCF to register one Sample per
        sample column (saved with the status "loading"), starts the worker
        threads that execute the queued SQL queries, and schedules
        self.import_delegate through run_async to parse the records.

        :param file_id: id of the Model.File to import.
        :param kargs: must contain "reference_id", the id of the reference
            whose table suffix is used for the variant tables.
        :return:
            - {"success": True, "samples": ..., "records_count": ...} when the
              import has been scheduled ("samples" maps each sample name to
              the json of the created Sample);
            - {"success": False, "error": ...} when the file is neither a
              .vcf nor a .vcf.gz;
            - None when the VCF declares no sample (an "import_vcf_error"
              notification is sent in that case).
        :raises KeyError: if "reference_id" is not provided.
    """
    from core.core import core

    file = Model.File.from_id(file_id)
    filepath = file.path
    reference_id = kargs["reference_id"]

    vcf_metadata = prepare_vcf_parsing(reference_id, filepath)
    db_ref_suffix = "_" + Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first().table_suffix

    if not filepath.endswith((".vcf", ".vcf.gz")):
        return {"success": False, "error": "File not supported"}

    # a tmp file have been created by prepare_vcf_parsing() method to avoid
    # pysam unsupported file format.
    filepath += ".regovar_import"
    start = datetime.datetime.now()

    # Create vcf parser
    vcf_reader = VariantFile(filepath)

    # Values shared by every sample of the file: computed once, then copied
    # into each sample so that no two samples share the same mutable object.
    filter_description = {
        name: vcf_filter.description
        for name, vcf_filter in vcf_reader.header.filters.items()
    }
    default_dbuid = [
        annotation.db_uid
        for annotation in vcf_metadata["annotations"].values()
        if annotation
    ]

    # get samples in the VCF
    samples = {}
    for sample_name in vcf_reader.header.samples:
        sample = Model.Sample.new()
        sample.name = sample_name
        sample.file_id = file_id
        sample.reference_id = reference_id
        sample.filter_description = dict(filter_description)
        sample.default_dbuid = list(default_dbuid)
        sample.status = "loading"
        # TODO : is_mosaic according to the data in the vcf
        sample.save()
        # As these sample will be shared with other threads, we only keep
        # their json representation to avoid error with the sql session
        samples[sample_name] = sample.to_json()

    if not samples:
        war("VCF files without sample cannot be imported in the database.")
        core.notify_all({
            "action": "import_vcf_error",
            "data": {
                "reference_id": reference_id,
                "file_id": file_id,
                "msg": "VCF files without sample cannot be imported in the database."
            }
        })
        return None

    # tasks queue shared by all thread
    self.queue = Queue(maxsize=0)
    # list of worker created to execute multithread tasks
    self.workers = []

    # init threading workers
    for _ in range(VCF_IMPORT_MAX_THREAD):
        worker = Thread(target=vcf_import_worker,
                        args=(self.queue, file_id, samples),
                        daemon=True)
        worker.start()
        self.workers.append(worker)

    core.notify_all({
        "action": "import_vcf_start",
        "data": {
            "reference_id": reference_id,
            "file_id": file_id,
            "samples": [{
                "id": samples[sid]["id"],
                "name": samples[sid]["name"]
            } for sid in samples]
        }
    })

    records_count = vcf_metadata["count"]
    log("Importing file {0}\n\r\trecords : {1}\n\r\tsamples : ({2}) {3}\n\r\tstart : {4}"
        .format(filepath, records_count, len(samples),
                reprlib.repr(list(samples)), start))

    # The parsing itself is delegated to import_delegate
    run_async(self.import_delegate, file_id, vcf_reader, reference_id,
              db_ref_suffix, vcf_metadata, samples)

    return {
        "success": True,
        "samples": samples,
        "records_count": records_count
    }
async def import_data(file_id, filepath, core=None, reference_id=2):
    """
        Import the variants, the sample genotypes and the VEP/SnpEff
        annotations of a VCF file into the database.

        The VCF is first rewritten as a gzip file without its "##GVCFBlock"
        header lines, then its header is parsed to detect the annotations it
        carries; the matching annotation tables are created/updated, and the
        records are finally converted into SQL queries executed by batches
        with Model.execute_aio.

        :param file_id: id of the file, used to link the samples to the file
            and in the notifications.
        :param filepath: path of the VCF; only paths ending with ".vcf" or
            ".vcf.gz" are imported (nothing is imported otherwise).
        :param core: optional object exposing notify_all(); when provided it
            receives the "import_vcf_start", "import_vcf" (progress) and
            "import_vcf_end" messages.
        :param reference_id: id of the reference whose table suffix selects
            the variant tables.
        :return: None
    """
    import os
    import datetime
    import sqlalchemy
    import subprocess
    import multiprocessing as mp
    import reprlib
    import gzip
    import shlex

    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tools
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def is_gzipped(filename):
        """
            Return True when the file starts with the gzip magic number.
            The content is checked instead of the extension because
            debug_clear_header() always rewrites the file as gzip, whatever
            its name is.
        """
        with open(filename, 'rb') as f:
            return f.read(2) == b'\x1f\x8b'

    def run_shell(command):
        """ Run a shell command and block until it is finished """
        process = subprocess.Popen(command,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        return process.communicate()[0]

    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count variant to parse in the vcf file
        """
        bashCommand = 'grep -v "^#" ' + shlex.quote(str(filename)) + ' | wc -l'
        if is_gzipped(filename):
            bashCommand = "z" + bashCommand
        return int(run_shell(bashCommand).decode('utf8'))

    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF header with pysam
            EDIT : in fact the problem to be that pysam do not support some
            kind of compression, so this command is still used to rezip the
            vcf in a supported format.
        """
        tmp_path = "/var/regovar/downloads/tmp_workaround"
        bashCommand = "grep -v '^##GVCFBlock' {} | gzip --best > {}".format(
            shlex.quote(filename), tmp_path)
        if is_gzipped(filename):
            bashCommand = "z" + bashCommand
        # The file must be fully written before it replaces the original one
        run_shell(bashCommand)
        run_shell("mv {} {} ".format(tmp_path, shlex.quote(filename)))

    def prepare_vcf_parsing(filename):
        """
            Parse vf headers and return information about which data shall be
            parsed and stored in the database.

            Returns a dict with the keys 'vcf_version', 'name', 'count',
            'size', 'type', 'samples' and 'annotations'; 'annotations' maps
            'vep' and 'snpeff' to their metadata, or to False when the
            annotation is not available in the file.
        """
        # Extract headers
        debug_clear_header(filename)
        headers = {}
        samples = []
        gzipped = is_gzipped(filename)
        _op = gzip.open if gzipped else open
        with _op(filename) as f:
            for line in f:
                if gzipped:
                    line = line.decode()
                if line.startswith('##'):
                    key, sep, value = line[2:].strip().partition('=')
                    if not sep:
                        # Not a "key=value" meta-information line
                        continue
                    if key == 'INFO':
                        # value looks like <ID=..,Number=..,Type=..,Description="..">
                        # The description is the 4th field and may itself
                        # contain commas: split only on the first three.
                        data = value[1:-1].split(',', 3)
                        info_id = data[0][3:]
                        info_type = data[2][5:]
                        info_desc = data[3][13:-1]
                        headers.setdefault('INFO', {})[info_id] = {
                            'type': info_type,
                            'description': info_desc
                        }
                    else:
                        headers.setdefault(key, []).append(value)
                elif line.startswith('#'):
                    # Column header: samples names start at the 10th column
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        info_headers = headers.get('INFO', {})

        # Check for VEP
        vep = {'vep': False}
        if 'VEP' in headers and 'CSQ' in info_headers:
            d = info_headers['CSQ']['description'].split('Format:')
            vep = {
                'vep': {
                    'version': headers['VEP'][0].split(' ')[0],
                    'flag': 'CSQ',
                    'name': 'VEP',
                    'db_type': 'transcript',
                    'db_pk_field': 'Feature',
                    'description': d[0].strip(),
                    'columns': d[1].strip().split('|'),
                }
            }
            if 'Feature' not in vep['vep']['columns']:
                vep = {'vep': False}

        # Check for SnpEff
        snpeff = {'snpeff': False}
        if 'SnpEffVersion' in headers:
            if 'ANN' in info_headers:
                # TODO
                pass
            elif 'EFF' in info_headers:
                d = info_headers['EFF']['description'].split('\'')
                snpeff = {
                    'snpeff': {
                        'version': headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0],
                        'flag': 'EFF',
                        'name': 'SnpEff',
                        'db_type': 'transcript',
                        'db_pk_field': 'Transcript_ID',
                        'columns': [c.strip() for c in d[1].strip().split('|')],
                        'description': d[0].strip(),
                    }
                }
                if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                    snpeff = {'snpeff': False}

        # Retrieve extension ("vcf", or "vcf.gz" for a compressed file)
        name_parts = os.path.split(filename)[1].split('.')
        file_type = name_parts[-1]
        if 'vcf' not in file_type and len(name_parts) > 2:
            file_type = name_parts[-2] + "." + file_type

        # Return result
        result = {
            'vcf_version': headers['fileformat'][0],
            'name': os.path.split(filename)[1],
            'count': count_vcf_row(filename),
            'size': os.path.getsize(filename),
            'type': file_type,
            'samples': samples,
            'annotations': {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result

    def normalise_annotation_name(name):
        """
            Tool to convert a name of a annotation tool/db/field/version into
            the corresponding valid name for the database
        """
        if name and name[0].isdigit():
            name = '_' + name

        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''

        return ''.join(check_char(c) for c in name)

    def create_annotation_db(reference_id, reference_name, table_name,
                             vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved
            from the VCF file with the prepare_vcf_parsing method.

            Returns the uid of the new annotation database and the mapping
            {column name: {'name', 'type', 'name_ui'}} of its fields.
        """
        is_transcript = vcf_annotation_metadata['db_type'] == 'transcript'

        # Create annotation table
        pk = 'transcript_id character varying(100), ' if is_transcript else ''
        pk2 = ',transcript_id' if is_transcript else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            # By default, create a table with only text field. Type can be
            # changed by user via a dedicated UI
            db_map[col_name] = {
                'name': col_name,
                'type': 'string',
                'name_ui': col
            }
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name)
        if is_transcript:
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                table_name,
                normalise_annotation_name(
                    vcf_annotation_metadata['db_pk_field']))).first()

        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        # We removed the condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/
        # in the jointure as this condition is already done by a previous
        # query when updating working table with annotations
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(
            db_uid,
            reference_id,
            table_name,
            vcf_annotation_metadata['version'],
            vcf_annotation_metadata['name'],
            # Free text coming from the VCF header: quotes must be escaped
            escape_value_for_sql(vcf_annotation_metadata['description']),
            30,
            vcf_annotation_metadata['db_type'],
            pk_uid)

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(
                db_uid, idx, normalise_annotation_name(f), f)
        # query[:-1] : remove the trailing comma of the last field
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )
        return db_uid, db_map

    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the
            mapping between VCF info fields and DB schema.

            The provided metadata dict is completed in place with the keys
            'table', 'db_uid', 'db_pk_field_uid' and 'db_map', and returned.
        """
        reference = Model.execute(
            "SELECT table_suffix FROM reference WHERE id={}".format(
                reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(
            vcf_annotation_metadata['flag'],
            vcf_annotation_metadata['version'], reference))

        # Get database schema (if available)
        table_cols = {}
        db_uid = Model.execute(
            "SELECT uid FROM annotation_database WHERE name='{}'".format(
                table_name)).first()

        if db_uid is None:
            # No table in db for these annotation : create new table
            db_uid, table_cols = create_annotation_db(
                reference_id, reference, table_name, vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute(
                    "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                    .format(db_uid)):
                table_cols[col.name] = {
                    'name': col.name,
                    'type': col.type,
                    'name_ui': col.name_ui
                }

        # Get diff between columns in vcf and columns in DB, and update DB schema
        diff = [
            col for col in vcf_annotation_metadata['columns']
            if normalise_annotation_name(col) not in table_cols
        ]
        if diff:
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name = normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col)
                table_cols[name] = {
                    'name': name,
                    'type': 'string',
                    'name_ui': col
                }
            # execute query
            Model.execute(query)

        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute(
            "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".
            format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({
            'table': table_name,
            'db_uid': db_uid,
            'db_pk_field_uid': db_pk_field_uid
        })
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[
                normalise_annotation_name(col)]
        return vcf_annotation_metadata

    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format
            (X=23, Y=24, M=25; None when the name cannot be converted)
        """
        chrm = chrm.upper()
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]

        if chrm == "X":
            chrm = 23
        elif chrm == "Y":
            chrm = 24
        elif chrm == "M":
            chrm = 25
        else:
            try:
                chrm = int(chrm)
            except ValueError:
                # TODO log /report error
                chrm = None
        return chrm

    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database format
              - Assuming that position in VCF are 1-based (0-based in Database)
              - triming ref and alt to get minimal alt (and update position accordingly)
            Return (None, None, None) when ref and alt are the same.
        """
        # input pos comming from VCF are 1-based.
        # to be consistent with UCSC databases we convert it into 0-based
        pos -= 1

        if ref == alt:
            return None, None, None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''

        # Trim the common prefix
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
            pos += 1

        # Trim the common suffix. Lengths are equal here, so checking ref is
        # enough; without this check two empty strings would loop forever.
        if len(ref) == len(alt):
            while len(ref) > 0 and ref[-1] == alt[-1]:
                ref = ref[0:-1]
                alt = alt[0:-1]

        return pos, ref, alt

    def normalize_gt(infos):
        """
            Normalize GT sample informatin from VCF format into Database format
              0 : homozygous ref, '1' : homozygous alt,
              '2' : heterozygous with ref, '3' : heterozygous without ref,
              -1 : unknown (no GT)
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL':
            first = gt[0]
            # Haploid calls only have one allele
            second = gt[1] if len(gt) > 1 else gt[0]
            if first == second:
                # Homozyot ref
                if first in [None, 0]:
                    return 0
                # Homozyot alt
                return '1'
            else:
                if 0 in gt:
                    # Hetero ref
                    return '2'
                else:
                    return '3'
        log("unknow : " + str(infos['GT'] if 'GT' in infos else None))
        return -1

    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if '|' in alt:
            return alt.split('|')
        else:
            return alt.split('/')

    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data
            ('NULL' when the key is missing or has no value)
        """
        if key in infos:
            if infos[key] is None:
                return 'NULL'
            return infos[key]
        return 'NULL'

    def is_transition(ref, alt):
        """
            Return true if the variant is a transition (A<->G or C<->T);
            false otherwise
        """
        tr = ref + alt
        if len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC'):
            return True
        return False

    def escape_value_for_sql(value):
        """ Escape '%' and single quotes of string values """
        if isinstance(value, str):
            value = value.replace('%', '%%')
            value = value.replace("'", "''")
        return value

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tiers code from vtools.  Bin index calculation
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    #
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by Jim Kent) to
    # take in a genomic coordinate range and return a set of genomic "bins" that your range
    # intersects.  I found a Java implementation on-line (I need to find the URL) and I
    # simply manually converted the Java code into Python code.

    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based and the end
    # coordinates are 1-based!!!!!!

    # BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681; #  (4096 + 512 + 64 + 8 + 1 + 0)

    _BINOFFSETS = (
        512 + 64 + 8 + 1,  # = 585, min val for level 0 bins (128kb binsize)
        64 + 8 + 1,        # =  73, min val for level 1 bins (1Mb binsize)
        8 + 1,             # =   9, min val for level 2 bins (8Mb binsize)
        1,                 # =   1, min val for level 3 bins (64Mb binsize)
        0)                 # =   0, only val for level 4 bin (512Mb binsize)

    #    1:   0000 0000 0000 0001    1<<0
    #    8:   0000 0000 0000 1000    1<<3
    #   64:   0000 0000 0100 0000    1<<6
    #  512:   0000 0010 0000 0000    1<<9

    _BINFIRSTSHIFT = 17  # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3    # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)

    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is 1-based.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                for bin_id in range(startBin + offset, endBin + offset):
                    bins.append(bin_id)
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        max_bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                if startBin + offset > max_bin:
                    max_bin = startBin + offset
            else:
                for bin_id in range(startBin + offset, endBin + offset):
                    if bin_id > max_bin:
                        max_bin = bin_id
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return max_bin

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Import
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def transaction_end(job_id, result):
        """
            Callback for background jobs: forget the job and notify an error
            when it ended with an exception or without result.
        """
        job_in_progress.remove(job_id)
        if result is None or isinstance(result, Exception):
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf_end',
                    'data': {
                        'file_id': file_id,
                        'msg': 'Error occured : ' + str(result)
                    }
                })

    job_in_progress = []

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix = "_" + Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping
    # between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(
                reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)

    if not filepath.endswith((".vcf", ".vcf.gz")):
        return

    start = datetime.datetime.now()

    # Create vcf parser
    vcf_reader = VariantFile(filepath)

    # get samples in the VCF
    samples = {
        i: Model.get_or_create(Model.session(), Model.Sample, name=i)[0]
        for i in list(vcf_reader.header.samples)
    }

    if not samples:
        war("VCF files without sample cannot be imported in the database.")
        if core is not None:
            core.notify_all({
                'msg': 'import_vcf_end',
                'data': {
                    'file_id': file_id,
                    'msg': "VCF files without sample cannot be imported in the database."
                }
            })
        return

    if core is not None:
        core.notify_all({
            'msg': 'import_vcf_start',
            'data': {
                'file_id': file_id,
                'samples': [{
                    'id': samples[s].id,
                    'name': samples[s].name
                } for s in samples.keys()]
            }
        })

    # Associate sample to the file
    Model.execute(
        "INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;"
        .format(','.join([
            "({0}, {1})".format(samples[sid].id, file_id) for sid in samples
        ])))

    # parsing vcf file
    records_count = vcf_metadata['count']
    records_current = 0
    table = "variant" + db_ref_suffix
    log("Importing file {0}\n\r\trecords : {1}\n\r\tsamples : ({2}) {3}\n\r\tstart : {4}"
        .format(filepath, records_count, len(samples.keys()),
                reprlib.repr([s for s in samples.keys()]), start))

    sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
    sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
    # TODO : on conflict, shall update fields with value in the VCF to
    # complete database annotation with (maybe) new fields
    sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO NOTHING;"

    # Query buffers. They are always executed in this order: the variants
    # must exist before sample_variant/annotation rows can select their id.
    variant_queries = []
    sample_variant_queries = []
    annotation_queries = []
    count = 0

    for r in vcf_reader:
        records_current += 1
        if core is not None:
            core.notify_all({
                'msg': 'import_vcf',
                'data': {
                    'file_id': file_id,
                    'progress_total': records_count,
                    'progress_current': records_current,
                    'progress_percent': round(records_current / max(1, records_count) * 100, 2)
                }
            })

        chrm = normalize_chr(str(r.chrom))
        samples_array = ','.join([str(samples[s].id) for s in r.samples])

        for sn in r.samples:
            s = r.samples.get(sn)
            if len(s.alleles) == 0:
                continue

            # Normalize the (at most two) alleles of the sample once; each
            # one keeps its own bin as it depends on the normalized pos/ref.
            sample_alleles = []
            for raw_alt in s.alleles[:2]:
                pos, ref, alt = normalize(r.pos, r.ref, raw_alt)
                if pos is None:
                    continue
                bin_id = getMaxUcscBin(pos, pos + len(ref))
                sample_alleles.append((pos, ref, alt, bin_id))
                if alt != ref:
                    variant_queries.append(sql_pattern1.format(
                        table, chrm, pos, ref, alt,
                        is_transition(ref, alt), bin_id, samples_array))
                    sample_variant_queries.append(sql_pattern2.format(
                        samples[sn].id, bin_id, chrm, pos, ref, alt,
                        normalize_gt(s), get_info(s, 'DP')))
                    count += 1

            # Import custom annotation for the variant
            for ann_name, metadata in vcf_metadata['annotations'].items():
                if not metadata or metadata['flag'] not in r.info:
                    continue
                # By transcript (r.info is a list of annotation. Inside we
                # shall find, transcript and allele information to be able
                # to save data for the current variant)
                for info in r.info[metadata['flag']]:
                    data = info.split('|')
                    q_fields = []
                    q_values = []
                    allele = ""
                    trx_pk = "NULL"
                    for col_pos, col_name in enumerate(metadata['columns']):
                        q_fields.append(metadata['db_map'][col_name]['name'])
                        val = escape_value_for_sql(data[col_pos])

                        if col_name == 'Allele':
                            allele = val.strip().strip("-")
                        if col_name == metadata['db_pk_field']:
                            trx_pk = val.strip()

                        q_values.append(
                            '\'{}\''.format(val)
                            if val != '' and val is not None else 'NULL')

                    # Only keep the annotation for the alleles it describes
                    for pos, ref, alt, bin_id in sample_alleles:
                        if alt == allele:
                            annotation_queries.append(sql_pattern3.format(
                                metadata['table'], ','.join(q_fields),
                                ','.join(q_values), bin_id, chrm, pos, ref,
                                alt, trx_pk))
                            count += 1

        # manage split big request to avoid sql out of memory transaction
        if count >= 10000:
            count = 0
            transaction = "".join(variant_queries + sample_variant_queries + annotation_queries)
            log("VCF import : Execute async query (as coroutine)")
            await Model.execute_aio(transaction)

            # Reset query buffers
            variant_queries = []
            sample_variant_queries = []
            annotation_queries = []

    # Loop done, execute last pending query (if any)
    transaction = "".join(variant_queries + sample_variant_queries + annotation_queries)
    if transaction:
        log("VCF import : Execute last async query (as coroutine)")
        await Model.execute_aio(transaction)
    log("VCF import : Done")

    if core is not None:
        core.notify_all({
            'msg': 'import_vcf_end',
            'data': {
                'file_id': file_id,
                'msg': 'Import done without error.',
                'samples': [{
                    'id': samples[s].id,
                    'name': samples[s].name
                } for s in samples.keys()]
            }
        })
def check_annotation_table(self):
    """
        Check if annotation table exists and create it according to
        information collected by the init method.

        When no annotation database is registered under self.table_name, the
        table, its indexes and its annotation_database / annotation_field
        entries are created from self.columns_definitions. In both cases
        self.columns is normalised and mapped to the registered fields.

        :return: (db_uid, columns_mapping) where columns_mapping associates
            each normalised column of the VCF to the {'name', 'type',
            'name_ui'} of the matching field, or to False when the column is
            not registered. Both values are also stored on self.
    """
    columns_mapping = {}
    db_uid = Model.execute(
        "SELECT uid FROM annotation_database WHERE name='{}'".format(
            self.table_name)).first()

    if db_uid is not None:
        db_uid = db_uid[0]
    else:
        # Create new table
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, regovar_trx_id character varying(100), {1}, CONSTRAINT {0}_ukey UNIQUE (variant_id, regovar_trx_id));"
        type_map = {
            "string": "text",
            "int": "integer",
            "float": "real",
            "bool": "boolean",
            "enum": "varchar(50)",
            "list": "varchar(250)[]"
        }
        fields = [
            "{} {}".format(col_name, type_map[definition["type"]])
            for col_name, definition in self.columns_definitions.items()
        ]
        query = pattern.format(self.table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(self.table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(self.table_name)

        # Register annotation DB
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                self.table_name, self.colums_as_pk)).first()
        query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (regovar_trx_id);".format(self.table_name)
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        q = "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} {{0}} ON {{0}}.bin={{1}}.bin AND {{0}}.chr={{1}}.chr AND {{0}}.pos={{1}}.pos AND {{0}}.ref={{1}}.ref AND {{0}}.alt={{1}}.alt');"
        query += q.format(
            db_uid,
            self.reference_id,
            self.table_name,
            self.version,
            self.name,
            # Free text: escaped like the description of the fields below
            self.escape_value_for_sql(self.description),
            30,
            'transcript',
            pk_uid)

        # Register annotation Fields
        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type, description, meta) VALUES "
        for idx, col_name in enumerate(sorted(self.columns_definitions)):
            definition = self.columns_definitions[col_name]
            if "meta" in definition:
                meta = "'" + self.escape_value_for_sql(definition["meta"]) + "'"
            else:
                meta = "NULL"
            # The field name must be the name of the table column: the uid
            # of the field is computed from it and the mapping below matches
            # the fields on it.
            query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}),".format(
                db_uid,
                idx,
                col_name,
                col_name.title(),
                definition["type"],
                self.escape_value_for_sql(definition["description"]),
                meta)
        # query[:-1] : remove the trailing comma of the last field
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )

    # # Pre-process of polyphen/sift vcf columns that are split on 2 columns in regovar db
    # self.columns = [self.normalise_annotation_name(s) for s in self.columns]
    # if "sift" in self.columns:
    #     self.columns.extend(["sift_pred", "sift_score"])
    #     self.columns.remove("sift")
    # if "polyphen" in self.columns:
    #     self.columns.extend(["polyphen_pred", "polyphen_score"])
    #     self.columns.remove("polyphen")

    # Retrieve column mapping for column in vcf
    self.columns = [self.normalise_annotation_name(s) for s in self.columns]
    for col in Model.execute(
            "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
            .format(db_uid)):
        if col.name in self.columns:
            columns_mapping[col.name] = {
                'name': col.name,
                'type': col.type,
                'name_ui': col.name_ui
            }
    # Columns of the VCF without registered field are flagged with False
    for col in self.columns:
        columns_mapping.setdefault(col, False)

    self.db_uid = db_uid
    self.columns_mapping = columns_mapping
    return db_uid, columns_mapping
def check_annotation_table(self):
    """
        Check if annotation table exists and create it according to
        information collected by the init method.

        When no annotation database is registered under self.table_name, the
        table, its indexes and its annotation_database / annotation_field
        entries are created from self.columns_definitions. In both cases
        self.columns is normalised and mapped to the registered fields.

        :return: (db_uid, columns_mapping) where columns_mapping associates
            each normalised column of the VCF to the {'name', 'type',
            'name_ui'} of the matching field, or to False when the column is
            not registered. Both values are also stored on self.
    """
    columns_mapping = {}
    db_uid = Model.execute("SELECT uid FROM annotation_database WHERE name='{}'".format(self.table_name)).first()

    if db_uid is not None:
        db_uid = db_uid[0]
    else:
        # Create new table
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, regovar_trx_id character varying(100), {1}, CONSTRAINT {0}_ukey UNIQUE (variant_id, regovar_trx_id));"
        type_map = {"string": "text", "int": "integer", "float": "real", "bool": "boolean", "enum": "varchar(50)", "list": "varchar(250)[]"}
        fields = [
            "{} {}".format(col_name, type_map[definition["type"]])
            for col_name, definition in self.columns_definitions.items()
        ]
        query = pattern.format(self.table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(self.table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(self.table_name)

        # Register annotation DB
        db_uid, pk_uid = Model.execute("SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(self.table_name, self.colums_as_pk)).first()
        query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (regovar_trx_id);".format(self.table_name)
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        q = "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} {{0}} ON {{0}}.bin={{1}}.bin AND {{0}}.chr={{1}}.chr AND {{0}}.pos={{1}}.pos AND {{0}}.ref={{1}}.ref AND {{0}}.alt={{1}}.alt');"
        query += q.format(
            db_uid,
            self.reference_id,
            self.table_name,
            self.version,
            self.name,
            # Free text: escaped like the description of the fields below
            self.escape_value_for_sql(self.description),
            30,
            'transcript',
            pk_uid)

        # Register annotation Fields
        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type, description) VALUES "
        for idx, col_name in enumerate(sorted(self.columns_definitions)):
            definition = self.columns_definitions[col_name]
            query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}'),".format(
                db_uid,
                idx,
                col_name,
                col_name.title(),
                definition["type"],
                self.escape_value_for_sql(definition["description"]))
        # query[:-1] : remove the trailing comma of the last field
        Model.execute(query[:-1])
        Model.execute("UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")

    # Retrieve column mapping for column in vcf
    self.columns = [self.normalise_annotation_name(s) for s in self.columns]
    for col in Model.execute("SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid)):
        if col.name in self.columns:
            columns_mapping[col.name] = {'name': col.name, 'type': col.type, 'name_ui': col.name_ui}
    # Columns of the VCF without registered field are flagged with False
    for col in self.columns:
        columns_mapping.setdefault(col, False)

    self.db_uid = db_uid
    self.columns_mapping = columns_mapping
    return db_uid, columns_mapping
async def import_data(file_id, filepath, core=None, reference_id=2):
    """
        Import the variants, the sample genotypes and the VEP / SnpEff annotations of a
        VCF file into the database.

        file_id      : id of the file; used for the sample_file rows and in the notifications
        filepath     : path of the VCF file. Only paths ending with ".vcf" or ".vcf.gz" are
                       parsed. WARNING: the file is rewritten in place (gzip compressed,
                       without the ##GVCFBlock header lines) before being parsed.
        core         : optional object; when provided, its notify_all method is called with
                       the 'import_vcf_start', 'import_vcf' and 'import_vcf_end' messages
        reference_id : id of the row of the reference table whose table_suffix selects the
                       variant / sample_variant tables to fill

        Returns None. SQL statements are buffered and executed by batch with
        Model.execute_aio.

        NOTE(review): every statement is built by string formatting. Free-text values are
        escaped with escape_value_for_sql, but bound parameters would be safer if
        Model.execute supports them.
    """
    import os
    import datetime
    import sqlalchemy
    import subprocess
    import multiprocessing as mp
    import reprlib
    import gzip
    import re
    import shlex
    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model

    # Temporary file used to rewrite the VCF
    TMP_WORKAROUND_PATH = "/var/regovar/downloads/tmp_workaround"

    # Patterns used to read the attributes of a "##INFO=<...>" header line
    INFO_ID_PATTERN = re.compile(r'ID=([^,>]+)')
    INFO_TYPE_PATTERN = re.compile(r'Type=([^,>]+)')
    INFO_DESC_PATTERN = re.compile(r'Description="((?:[^"\\]|\\.)*)"')



    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tools
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def is_gzipped(filename):
        """
            Return True if the file starts with the gzip magic number.
            The content is checked rather than the extension because debug_clear_header
            always writes a gzip compressed file, whatever the name of the file is.
        """
        with open(filename, 'rb') as f:
            return f.read(2) == b'\x1f\x8b'


    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count variant to parse in the vcf file
        """
        grep = "zgrep" if is_gzipped(filename) else "grep"
        # The filename is quoted: the command is run through a shell
        bashCommand = '{} -v "^#" {} | wc -l'.format(grep, shlex.quote(str(filename)))
        process = subprocess.Popen(bashCommand, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        cmd_out = process.communicate()[0]
        return int(cmd_out.decode('utf8'))


    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF header with pysam
            EDIT : in fact the problem seems to be that pysam does not support some kinds of
            compression, so this command is still used to rezip the vcf in a supported format.

            The file is rewritten in place: ##GVCFBlock header lines are dropped and the
            result is gzip compressed.
        """
        grep = "zgrep" if is_gzipped(filename) else "grep"
        bashCommand = "{} -v '^##GVCFBlock' {} | gzip --best > {}".format(grep, shlex.quote(str(filename)), TMP_WORKAROUND_PATH)
        # Wait for the end of the rewriting: moving the temporary file while gzip is still
        # running would replace the VCF by a truncated file
        process = subprocess.run(bashCommand, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if process.returncode != 0:
            war("VCF import : unable to rewrite {} (exit code {}), the file is left untouched.".format(filename, process.returncode))
            return
        subprocess.run(["mv", TMP_WORKAROUND_PATH, str(filename)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)


    def prepare_vcf_parsing(filename):
        """
            Parse vcf headers and return information about which data shall be parsed
            and stored in the database

            Returns a dict with the keys 'vcf_version', 'name', 'count', 'size', 'type',
            'samples' and 'annotations'. 'annotations' has the keys 'vep' and 'snpeff',
            each one set to False or to the metadata of the annotation found in the headers.
        """
        # Extract headers
        debug_clear_header(filename)

        headers = {}
        samples = []
        _op = gzip.open if is_gzipped(filename) else open
        with _op(filename, 'rt', encoding='utf-8') as f:
            for line in f:
                if line.startswith('##'):
                    key, separator, value = line[2:].strip().partition('=')
                    if not separator:
                        # Not a "key=value" header: nothing to collect
                        continue
                    if key == 'INFO':
                        # value looks like <ID=..,Number=..,Type=..,Description="..">
                        # Regex are used because the description itself may contain commas
                        info_id = INFO_ID_PATTERN.search(value)
                        if info_id is None:
                            continue
                        info_type = INFO_TYPE_PATTERN.search(value)
                        info_desc = INFO_DESC_PATTERN.search(value)
                        headers.setdefault('INFO', {})[info_id.group(1)] = {
                            'type' : info_type.group(1) if info_type else '',
                            'description' : info_desc.group(1) if info_desc else ''}
                    else:
                        headers.setdefault(key, []).append(value)
                elif line.startswith('#'):
                    # Column header line: sample names start at the 10th column
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        info_headers = headers.get('INFO', {})

        # Check for VEP
        vep = {'vep' : False}
        if 'VEP' in headers and 'CSQ' in info_headers:
            d = info_headers['CSQ']['description'].split('Format:')
            if len(d) > 1:
                vep = {
                    'vep' : {
                        'version' : headers['VEP'][0].split(' ')[0],
                        'flag' : 'CSQ',
                        'name' : 'VEP',
                        'db_type' : 'transcript',
                        'db_pk_field' : 'Feature',
                        'description' : d[0].strip(),
                        'columns' : d[1].strip().split('|'),
                    }
                }
                if 'Feature' not in vep['vep']['columns']:
                    vep = {'vep' : False }

        # Check for SnpEff
        snpeff = {'snpeff' : False }
        if 'SnpEffVersion' in headers:
            if 'ANN' in info_headers:
                # TODO
                pass
            elif 'EFF' in info_headers:
                d = info_headers['EFF']['description'].split('\'')
                if len(d) > 1:
                    snpeff = {
                        'snpeff' : {
                            'version' : headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0],
                            'flag' : 'EFF',
                            'name' : 'SnpEff',
                            'db_type' : 'transcript',
                            'db_pk_field' : 'Transcript_ID',
                            'columns' : [c.strip() for c in d[1].strip().split('|')],
                            'description' : d[0].strip(),
                        }
                    }
                    if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                        snpeff = {'snpeff' : False }

        # Retrieve extension ("vcf", or "vcf.gz" when the last extension is the compression one)
        name_parts = os.path.split(filename)[1].split('.')
        file_type = name_parts[-1]
        if 'vcf' not in file_type and len(name_parts) > 1:
            file_type = name_parts[-2] + "." + file_type

        # Return result
        result = {
            'vcf_version' : headers.get('fileformat', [None])[0],
            'name' : os.path.split(filename)[1],
            'count' : count_vcf_row(filename),
            'size' : os.path.getsize(filename),
            'type' : file_type,
            'samples' : samples,
            'annotations' : {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result


    def normalise_annotation_name(name):
        """
            Tool to convert a name of a annotation tool/db/field/version into the corresponding valid name for the database
        """
        if name and name[0].isdigit():
            name = '_' + name

        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''

        return ''.join(check_char(c) for c in name)


    def create_annotation_db(reference_id, reference_name, table_name, vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method

            Returns a tuple (db_uid, db_map) where db_map associates each normalised column
            name with a dict {'name', 'type', 'name_ui'}.
        """
        is_transcript_db = vcf_annotation_metadata['db_type'] == 'transcript'

        # Create annotation table
        pk = 'transcript_id character varying(100), ' if is_transcript_db else ''
        pk2 = ',transcript_id' if is_transcript_db else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = { 'name' : col_name, 'type' : 'string', 'name_ui' : col } # By default, create a table with only text field. Type can be changed by user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name)
        if is_transcript_db:
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute("SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(table_name, normalise_annotation_name(vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        # We removed the condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ from the jointure as this condition
        # is already done by a previous query when updating working table with annotations
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(
            db_uid,
            reference_id,
            table_name,
            escape_value_for_sql(vcf_annotation_metadata['version']),
            escape_value_for_sql(vcf_annotation_metadata['name']),
            escape_value_for_sql(vcf_annotation_metadata['description']),
            30,
            vcf_annotation_metadata['db_type'],
            pk_uid)

        rows = []
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            rows.append("('{0}', {1}, '{2}', '{3}', 'string')".format(db_uid, idx, normalise_annotation_name(f), escape_value_for_sql(f)))
        if rows:
            query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
            query += ','.join(rows)
        Model.execute(query)
        Model.execute("UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
        return db_uid, db_map


    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema

            vcf_annotation_metadata is updated in place with the keys 'table', 'db_uid',
            'db_pk_field_uid' and 'db_map', and is also returned.
        """
        reference = Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(vcf_annotation_metadata['flag'], vcf_annotation_metadata['version'], reference))

        # Get database schema (if available)
        table_cols = {}
        db_uid = Model.execute("SELECT uid FROM annotation_database WHERE name='{}'".format(table_name)).first()

        if db_uid is None:
            # No table in db for these annotation : create new table
            db_uid, table_cols = create_annotation_db(reference_id, reference, table_name, vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute("SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid)):
                table_cols[col.name] = {'name': col.name, 'type': col.type, 'name_ui': col.name_ui}

        # Get diff between columns in vcf and columns in DB, and update DB schema
        diff = [col for col in vcf_annotation_metadata['columns'] if normalise_annotation_name(col) not in table_cols]
        if diff:
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name = normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(table_name, name, db_uid, offset + idx, escape_value_for_sql(col))
                table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}
            # execute query
            Model.execute(query)

        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute("SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({'table': table_name, 'db_uid': db_uid, 'db_pk_field_uid': db_pk_field_uid})
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[normalise_annotation_name(col)]
        return vcf_annotation_metadata


    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format
            (X, Y and M/MT are stored as 23, 24 and 25).
            Returns None when the name cannot be converted.
        """
        chrm = chrm.upper()
        # "MT" is a usual name of the mitochondrial chromosome; it must be handled before
        # the prefixes are removed ("CHRMT" would be reduced to "T" otherwise)
        if chrm in ("MT", "CHRMT", "CHROMMT"):
            return 25
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]

        special = {"X": 23, "Y": 24, "M": 25}
        if chrm in special:
            return special[chrm]
        try:
            return int(chrm)
        except ValueError:
            # TODO log /report error
            return None


    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database format
              - Assuming that position in VCF are 1-based (0-based in Database)
              - triming ref and alt to get minimal alt (and update position accordingly)

            Returns (None, None, None) when ref and alt are equal.
        """
        # input pos comming from VCF are 1-based.
        # to be consistent with UCSC databases we convert it into 0-based
        pos -= 1

        if ref == alt:
            return None, None, None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''

        # Remove the common prefix
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
            pos += 1

        # Remove the common suffix. The emptiness test is required: two empty strings have
        # the same (empty) last character and would be trimmed forever.
        if len(ref) == len(alt):
            while len(ref) > 0 and ref[-1] == alt[-1]:
                ref = ref[0:-1]
                alt = alt[0:-1]

        return pos, ref, alt


    def normalize_gt(infos):
        """
            Normalize GT sample information from VCF format into Database format
              0 : homozygous ref (or no call), 1 : homozygous alt,
              2 : heterozygous with the ref, 3 : heterozygous without the ref,
             -1 : GT not available
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL' and len(gt) > 0:
            # All alleles identical: homozygous (also covers haploid calls)
            if all(allele == gt[0] for allele in gt):
                return 0 if gt[0] in [None, 0] else 1
            return 2 if 0 in gt else 3
        log("unknow : " + str(gt))
        return -1


    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if '|' in alt:
            return alt.split('|')
        return alt.split('/')


    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data.
            Returns the string 'NULL' when the key is missing or when its value is None.
        """
        if key in infos:
            value = infos[key]
            if value is not None:
                return value
        return 'NULL'


    def is_transition(ref, alt):
        """
            Return true if the variant is a transition (A<->G or C<->T); false otherwise
        """
        tr = ref + alt
        return len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC')


    def escape_value_for_sql(value):
        """
            Escape the % and ' characters of a string used as a SQL literal.
            Values that are not strings are returned untouched.
        """
        if isinstance(value, str):
            value = value.replace('%', '%%')
            value = value.replace("'", "''")
        return value



    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tiers code from vtools. Bin index calculation
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    #
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by Jim Kent) to
    # take in a genomic coordinate range and return a set of genomic "bins" that your range
    # intersects.  I found a Java implementation on-line (I need to find the URL) and I
    # simply manually converted the Java code into Python code.

    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based and the end
    # coordinates are 1-based!!!!!!

    # BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681; #  (4096 + 512 + 64 + 8 + 1 + 0)

    _BINOFFSETS = (
        512+64+8+1,   # = 585, min val for level 0 bins (128kb binsize)
        64+8+1,       # =  73, min val for level 1 bins (1Mb binsize)
        8+1,          # =   9, min val for level 2 bins (8Mb binsize)
        1,            # =   1, min val for level 3 bins (64Mb binsize)
        0)            # =   0, only val for level 4 bin (512Mb binsize)

    #    1:   0000 0000 0000 0001    1<<0
    #    8:   0000 0000 0000 1000    1<<3
    #   64:   0000 0000 0100 0000    1<<6
    #  512:   0000 0010 0000 0000    1<<9

    _BINFIRSTSHIFT = 17            # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3              # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)

    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is 1-based.
    #
    # NOTE(review): in both functions the inner range excludes endBin + offset. This is
    # kept as it is, because the bins already stored in the database were computed this way.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end-1) >> _BINFIRSTSHIFT
        for offset in _BINOFFSETS:
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                bins.extend(range(startBin + offset, endBin + offset))
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        max_bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end-1) >> _BINFIRSTSHIFT
        for offset in _BINOFFSETS:
            if startBin == endBin:
                if startBin + offset > max_bin:
                    max_bin = startBin + offset
            else:
                for candidate in range(startBin + offset, endBin + offset):
                    if candidate > max_bin:
                        max_bin = candidate
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return max_bin



    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Import
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def transaction_end(job_id, result):
        """
            Callback for background jobs (currently unused, see the commented calls below):
            forget the job and notify when it ended with an error.
        """
        job_in_progress.remove(job_id)
        if result is None or result is Exception or isinstance(result, Exception):
            if core is not None:
                core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : 'Error occured : ' + str(result)}})

    job_in_progress = []

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix = "_" + Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)

    if filepath.endswith((".vcf", ".vcf.gz")):
        start = datetime.datetime.now()

        # Create vcf parser
        vcf_reader = VariantFile(filepath)

        # get samples in the VCF
        samples = {i : Model.get_or_create(Model.session(), Model.Sample, name=i)[0] for i in list((vcf_reader.header.samples))}

        if len(samples) == 0:
            war("VCF files without sample cannot be imported in the database.")
            if core is not None:
                core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : "VCF files without sample cannot be imported in the database."}})
            return

        if core is not None:
            core.notify_all({'msg':'import_vcf_start', 'data' : {'file_id' : file_id, 'samples' : [ {'id' : samples[s].id, 'name' : samples[s].name} for s in samples.keys()]}})

        # Associate sample to the file
        Model.execute("INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;".format(
            ','.join(["({0}, {1})".format(samples[sid].id, file_id) for sid in samples])))

        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        records_skipped = 0
        table = "variant" + db_ref_suffix
        log ("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}".format(filepath, records_count, len(samples.keys()), reprlib.repr([s for s in samples.keys()]), start))
        # bar = Bar('\tparsing  : ', max=records_count, suffix='%(percent).1f%% - %(elapsed_td)s')

        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO NOTHING;" # TODO : on conflict, shall update fields with value in the VCF to complete database annotation with (maybe) new fields

        # Statements are collected in lists and joined when the batch is executed
        sql_query1 = []
        sql_query2 = []
        sql_query3 = []
        count = 0

        for r in vcf_reader:
            records_current += 1
            if core is not None:
                core.notify_all({'msg':'import_vcf', 'data' : {'file_id' : file_id, 'progress_total' : records_count, 'progress_current' : records_current, 'progress_percent' : round(records_current / max(1,records_count) * 100, 2)}})

            chrm = normalize_chr(str(r.chrom))
            if chrm is None:
                # Unsupported chromosome name (by example an unplaced contig): formatting None
                # into the statements would make the whole batch fail
                records_skipped += 1
                continue

            samples_array = ','.join([str(samples[s].id) for s in r.samples])

            # Variants of this record, for all samples : (pos, ref, alt) -> bin
            record_variants = {}
            for sn in r.samples:
                s = r.samples.get(sn)
                alleles = s.alleles
                if not alleles:
                    continue
                # NOTE(review): a missing allele (None) is normalized as an empty alt, like
                # the original code did. Confirm this is intended.
                sample_variants = set()
                for sample_allele in alleles:
                    pos, ref, alt = normalize(r.pos, r.ref, sample_allele)
                    if pos is None or alt == ref or (pos, ref, alt) in sample_variants:
                        continue
                    sample_variants.add((pos, ref, alt))
                    variant_bin = getMaxUcscBin(pos, pos + len(ref))
                    record_variants[(pos, ref, alt)] = variant_bin
                    sql_query1.append(sql_pattern1.format(table, chrm, pos, ref, alt, is_transition(ref, alt), variant_bin, samples_array))
                    sql_query2.append(sql_pattern2.format(samples[sn].id, variant_bin, chrm, pos, ref, alt, normalize_gt(s), get_info(s, 'DP')))
                    count += 1

            # Import custom annotation for the variant
            for ann_name, metadata in vcf_metadata['annotations'].items():
                if not metadata or not record_variants:
                    continue
                if metadata['flag'] not in r.info:
                    # This record has not been annotated
                    continue

                # By transcript (r.info is a list of annotation. Inside we shall find, transcript and allele information to be able to save data for the current variant)
                for info in r.info[metadata['flag']]:
                    data = info.split('|')
                    q_fields = []
                    q_values = []
                    allele = ""
                    trx_pk = "NULL"
                    for col_pos, col_name in enumerate(metadata['columns']):
                        q_fields.append(metadata['db_map'][col_name]['name'])
                        # Trailing fields may be missing in the annotation
                        val = escape_value_for_sql(data[col_pos]) if col_pos < len(data) else ''

                        if col_name == 'Allele':
                            allele = val.strip().strip("-")
                        if col_name == metadata['db_pk_field']:
                            trx_pk = val.strip()

                        q_values.append('\'{}\''.format(val) if val != '' and val is not None else 'NULL')

                    # The annotation is saved for every variant of the record (whatever the sample
                    # that carries it) having the annotated allele, with the bin of that variant
                    for (pos, ref, alt), variant_bin in record_variants.items():
                        if alt == allele:
                            sql_query3.append(sql_pattern3.format(metadata['table'], ','.join(q_fields), ','.join(q_values), variant_bin, chrm, pos, ref, alt, trx_pk))
                            count += 1

            # manage split big request to avoid sql out of memory transaction
            if count >= 10000:
                count = 0
                # Model.execute_async(transaction1 + transaction2 + transaction3, transaction_end)
                transaction = ''.join(sql_query1) + ''.join(sql_query2) + ''.join(sql_query3)
                log("VCF import : Execute async query (as coroutine)")
                await Model.execute_aio(transaction)
                # job_id = Model.execute_bw(transaction, transaction_end)
                # job_in_progress.append(job_id)
                # log("VCF import : Execute async query, new job_id : {}. Jobs running [{}]".format(job_id, ','.join([job_in_progress])))

                # Reset query buffers
                sql_query1 = []
                sql_query2 = []
                sql_query3 = []

        # Loop done, execute last pending query
        log("VCF import : Execute last async query (as coroutine)")
        transaction = ''.join(sql_query1) + ''.join(sql_query2) + ''.join(sql_query3)
        if transaction:
            await Model.execute_aio(transaction)
        log("VCF import : Done")
        if records_skipped > 0:
            war("VCF import : {} records skipped (unsupported chromosome name).".format(records_skipped))

        if core is not None:
            core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : 'Import done without error.', 'samples': [ {'id' : samples[s].id, 'name' : samples[s].name} for s in samples.keys()]}})