def handle(self, path, **options):
    """Load variant effects from the file at `path`, or echo the processed
    stream to stdout when the `stdout` option is set.

    When loading, the variant effect table is truncated first (and the
    transcript table too, with CASCADE, if the `transripts` option is set),
    then the stream is bulk-copied in a manually managed transaction.
    """
    database = options.get('database')
    transripts = options.get('transripts')
    stdout = options.get('stdout')

    with open(path) as fin:
        stream = EffectStream(fin, skip_existing=False)

        if stdout:
            # Dump the processed stream instead of loading it; readline()
            # returns '' at end-of-stream.
            for line in iter(stream.readline, ''):
                sys.stdout.write(line)
        else:
            cursor = connections[database].cursor()

            with transaction.commit_manually(database):
                try:
                    effect_table = VariantEffect._meta.db_table
                    cursor.execute('TRUNCATE {0}'.format(effect_table))

                    if transripts:
                        cursor.execute('TRUNCATE {0} CASCADE'.format(
                            Transcript._meta.db_table))

                    pgcopy_batch(stream, effect_table, stream.output_columns,
                                 cursor, database)
                    transaction.commit(database)
                except Exception as e:
                    # Roll back, record the traceback, and propagate.
                    transaction.rollback(database)
                    log.exception(e)
                    raise
def handle(self, path, **options):
    """Command entry point: stream processed effects to stdout, or truncate
    and bulk-load them into the variant effect table.
    """
    database = options.get('database')
    transripts = options.get('transripts')
    to_stdout = options.get('stdout')

    with open(path) as fin:
        stream = EffectStream(fin, skip_existing=False)

        if to_stdout:
            # Echo mode: copy the stream straight through until exhausted.
            line = stream.readline()
            while line != '':
                sys.stdout.write(line)
                line = stream.readline()
            return

        cursor = connections[database].cursor()

        with transaction.commit_manually(database):
            try:
                db_table = VariantEffect._meta.db_table
                cursor.execute('TRUNCATE {0}'.format(db_table))

                # Optionally clear transcripts as well; CASCADE removes
                # dependent rows.
                if transripts:
                    cursor.execute('TRUNCATE {0} CASCADE'.format(
                        Transcript._meta.db_table))

                pgcopy_batch(stream, db_table, stream.output_columns,
                             cursor, database)
                transaction.commit(database)
            except Exception as e:
                transaction.rollback(database)
                log.exception(e)
                raise
def load_effects(manifest_path, database, **kwargs):
    """Bulk-load variant effects for the VCF referenced by a manifest."""
    manifest = ManifestReader(manifest_path)
    vcf_info = manifest.section('vcf')

    # No data regarding VCF
    if 'file' not in vcf_info:
        return

    cursor = connections[database].cursor()

    # The VCF path is relative to the manifest's directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])

    with open(vcf_path) as fin:
        log.debug("opening {0} in {1} load_effects".format(vcf_path, __name__))
        stream = EffectStream(fin)
        pgcopy_batch(stream, VariantEffect._meta.db_table,
                     stream.output_columns, cursor, database)
def load_variants(manifest_path, database, **kwargs):
    "Variant loading requires only a VCF file and will never load a duplicate."
    manifest = ManifestReader(manifest_path)
    vcf_info = manifest.section('vcf')

    # No data regarding VCF
    if 'file' not in vcf_info:
        return

    # Resolve the VCF relative to the manifest's directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])
    cursor = connections[database].cursor()

    with open(vcf_path) as fin:
        log.debug("opening {0} in {1}".format(vcf_path, __name__))
        stream = VariantStream(fin)
        pgcopy_batch(stream, Variant._meta.db_table, stream.output_columns,
                     cursor, database)

    # Notify downstream listeners that variants for this manifest are loaded.
    VARIANT_CHANNEL.publish(manifest_path=manifest_path, database=database)
def load_results(manifest_path, database, **kwargs):
    """Load per-sample results from the VCF referenced by a manifest.

    Skips manifests not marked for load or with an invalid sample section,
    and skips samples whose results already exist. The load for each sample
    is retried until the state transition succeeds.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load', extra={
            'manifest_path': manifest_path,
        })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid', extra={
            'manifest_path': manifest_path,
        })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # Ignore whatever sample is listed in the manifest and scan the vcf for
    # samples.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])

    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_samples".format(vcf_path))
        reader = vcf.Reader(file_obj)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            # Abort the whole load; the manifest references an unknown sample.
            log.error('Sample does not exist', extra=sample_info)
            return

        # Is it already loaded? Let's skip for now.
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
        else:
            log.debug('about to load results for {0}'.format(vcf_sample))

            # STSError: Cannot start transition while already in one.
            successful = False
            while not successful:
                try:
                    with transition(sample, 'Sample Published',
                                    event='Loading Results'):
                        connection = connections[database]
                        cursor = connection.cursor()

                        with open(vcf_path) as fin:
                            stream = ResultStream(fin, sample_id=sample.id,
                                                  vcf_sample=vcf_sample)
                            pgcopy_batch(stream, Result._meta.db_table,
                                         stream.output_columns, cursor,
                                         database)

                        # Update result count
                        sample.count = sample.results.count()
                        sample.published = True
                        sample.save()
                        successful = True
                # Catch Exception rather than a bare except so the retry
                # loop cannot swallow KeyboardInterrupt/SystemExit.
                except Exception:
                    # log.exception records the traceback that a plain
                    # log.error discarded.
                    log.exception('STS errors')
                    time.sleep(10)

        vcf_info = manifest.section('vcf')

        # Absolute path relative to the MANIFEST directory
        vcf_path = os.path.join(os.path.dirname(manifest_path),
                                vcf_info['file'])

        # Compare expected MD5 (in manifest) to the file MD5
        if 'md5' in vcf_info:
            vcf_md5 = checks.file_md5(vcf_path)

            if vcf_md5 != vcf_info['md5']:
                log.error('VCF file MD5 does not match expected in manifest',
                          extra={'manifest_path': manifest_path})

        # Existing samples by the same name of a previous version are
        # unpublished now that it is ready to be published.
        count = Sample.objects.filter(
            name__iexact=sample.name,
            project=sample.project,
            batch=sample.batch,
            version__lt=sample.version).update(published=False)

        if count:
            log.info('{0} previous versions unpublished for {1}'
                     .format(count, sample))
def load_evs(database, **kwargs):
    """Populate the EVS table from the raw.evs staging table.

    Only variants not already present in "evs" are copied. MAF percentages
    are divided by 100, and each *_af is flipped to the alternate-allele
    frequency when the reference count is the minor one.
    """
    if not db.utils.table_exists('evs', schema='raw'):
        return

    cursor = connections[database].cursor()
    cursor.execute(db.utils.sequence_reset_sql(EVS, database))

    with transaction.commit_manually(database):
        try:
            # MAFs are divided by 100 since they are percentages to begin with
            cursor.execute('''
                SELECT variant.id, r.ea_ac_ref, r.ea_ac_alt, r.aa_ac_ref,
                    r.aa_ac_alt, r.all_ac_ref, r.all_ac_alt, r.ea_maf / 100.0,
                    r.aa_maf / 100.0, r.all_maf / 100.0, r.gts, r.ea_gtc,
                    r.aa_gtc, r.all_gtc, r.clinical_association
                FROM "variant"
                    INNER JOIN "raw"."evs" r ON ("variant".md5 = r."md5")
                    LEFT OUTER JOIN "evs"
                        ON ("evs"."variant_id" = "variant"."id")
                WHERE "evs"."id" IS NULL
            ''')

            # Note, *_af are actually minor allele frequencies by default.
            # The handler checks to see if the reference count is less than
            # the alternate and sets the AF if the reference count is less
            # than the alternate
            columns = [
                'variant_id', 'ea_ac_ref', 'ea_ac_alt', 'aa_ac_ref',
                'aa_ac_alt', 'all_ac_ref', 'all_ac_alt', 'ea_af', 'aa_af',
                'all_af', 'gts', 'ea_gtc', 'aa_gtc', 'all_gtc',
                'clinical_association',
            ]

            def compare_counts(ref, alt):
                "Compares allele counts and handles heterozygotes."
                ref = int(ref)

                # True when any comma-separated alternate count exceeds the
                # reference count.
                for _alt in alt.split(','):
                    if ref < int(_alt):
                        return True
                return False

            def handler(row):
                # Serialize one result row into a tab-delimited pgcopy line.
                record = OrderedDict(zip(columns, row))

                # All
                if compare_counts(record['all_ac_ref'], record['all_ac_alt']):
                    record['all_af'] = 1 - record['all_af']

                # European
                if compare_counts(record['ea_ac_ref'], record['ea_ac_alt']):
                    record['ea_af'] = 1 - record['ea_af']

                # African American
                if compare_counts(record['aa_ac_ref'], record['aa_ac_alt']):
                    record['aa_af'] = 1 - record['aa_af']

                # r'\N' is PostgreSQL's COPY NULL marker. The raw string is
                # required: bare '\N' only works on Python 2 by accident and
                # is a SyntaxError on Python 3.
                cleaned = [
                    str(x) if x is not None else r'\N'
                    for x in record.values()
                ]
                return '\t'.join(cleaned) + '\n'

            def streamer(cursor):
                # Stream rows in batches so the result set is never fully
                # materialized in memory.
                while True:
                    rows = cursor.fetchmany(100)

                    if not rows:
                        break

                    for row in rows:
                        yield handler(row)

            pgcopy_batch(streamer(cursor), EVS._meta.db_table,
                         columns=columns, database=database)

            # Commit on the alias the transaction was opened with; a bare
            # transaction.commit() targeted the default database instead of
            # `database`.
            transaction.commit(database)
        except Exception as e:
            transaction.rollback(database)
            # NOTE: errors are logged but not re-raised (best-effort load).
            log.exception(e)
def load_evs(database, **kwargs):
    """Copy EVS rows from the raw.evs staging table into the EVS model table.

    Variants already present in "evs" are excluded by the anti-join. MAF
    percentages are converted to fractions and flipped to alternate-allele
    frequencies where the reference allele is the minor one.
    """
    if not db.utils.table_exists('evs', schema='raw'):
        return

    cursor = connections[database].cursor()
    cursor.execute(db.utils.sequence_reset_sql(EVS, database))

    with transaction.commit_manually(database):
        try:
            # MAFs are divided by 100 since they are percentages to begin with
            cursor.execute('''
                SELECT variant.id, r.ea_ac_ref, r.ea_ac_alt, r.aa_ac_ref,
                    r.aa_ac_alt, r.all_ac_ref, r.all_ac_alt, r.ea_maf / 100.0,
                    r.aa_maf / 100.0, r.all_maf / 100.0, r.gts, r.ea_gtc,
                    r.aa_gtc, r.all_gtc, r.clinical_association
                FROM "variant"
                    INNER JOIN "raw"."evs" r ON ("variant".md5 = r."md5")
                    LEFT OUTER JOIN "evs"
                        ON ("evs"."variant_id" = "variant"."id")
                WHERE "evs"."id" IS NULL
            ''')

            # Note, *_af are actually minor allele frequencies by default.
            # The handler checks to see if the reference count is less than
            # the alternate and sets the AF if the reference count is less
            # than the alternate
            columns = ['variant_id', 'ea_ac_ref', 'ea_ac_alt', 'aa_ac_ref',
                       'aa_ac_alt', 'all_ac_ref', 'all_ac_alt', 'ea_af',
                       'aa_af', 'all_af', 'gts', 'ea_gtc', 'aa_gtc',
                       'all_gtc', 'clinical_association']

            def compare_counts(ref, alt):
                "Compares allele counts and handles heterozygotes."
                ref = int(ref)

                # Any alternate count greater than the reference means the
                # reference is the minor allele.
                for _alt in alt.split(','):
                    if ref < int(_alt):
                        return True
                return False

            def handler(row):
                # Build a tab-delimited pgcopy line from one SELECT row.
                record = OrderedDict(zip(columns, row))

                # All
                if compare_counts(record['all_ac_ref'], record['all_ac_alt']):
                    record['all_af'] = 1 - record['all_af']

                # European
                if compare_counts(record['ea_ac_ref'], record['ea_ac_alt']):
                    record['ea_af'] = 1 - record['ea_af']

                # African American
                if compare_counts(record['aa_ac_ref'], record['aa_ac_alt']):
                    record['aa_af'] = 1 - record['aa_af']

                # r'\N' is PostgreSQL's COPY NULL marker; the raw string
                # avoids the Python 3 SyntaxError that a bare '\N' causes.
                cleaned = [str(x) if x is not None else r'\N'
                           for x in record.values()]
                return '\t'.join(cleaned) + '\n'

            def streamer(cursor):
                # Yield formatted lines in batches of 100 rows.
                while True:
                    rows = cursor.fetchmany(100)

                    if not rows:
                        break

                    for row in rows:
                        yield handler(row)

            pgcopy_batch(streamer(cursor), EVS._meta.db_table,
                         columns=columns, database=database)

            # Commit/rollback must name the alias the transaction was opened
            # with; bare transaction.commit() acted on the default database.
            transaction.commit(database)
        except Exception as e:
            transaction.rollback(database)
            # NOTE: errors are logged but not re-raised (best-effort load).
            log.exception(e)
def load_results(manifest_path, database, **kwargs):
    """Load per-sample results from the VCF referenced by a manifest.

    Manifests not marked for load or with an invalid sample section are
    skipped, as are samples whose results already exist. Each sample's load
    is retried until its state transition succeeds.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load', extra={
            'manifest_path': manifest_path,
        })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid', extra={
            'manifest_path': manifest_path,
        })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # ignore whatever sample is listed in the manifest and scan the vcf for
    # samples
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])

    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_samples".format(vcf_path))
        reader = vcf.Reader(file_obj, preserve_order=False)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            # Abort the whole load; the manifest references an unknown sample.
            log.error('Sample does not exist', extra=sample_info)
            return

        # Is it already loaded? Let's skip for now.
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
        else:
            log.debug('about to load results for {0}'.format(vcf_sample))

            # STSError: Cannot start transition while already in one.
            successful = False
            while not successful:
                try:
                    with transition(sample, 'Sample Published',
                                    event='Loading Results'):
                        connection = connections[database]
                        cursor = connection.cursor()

                        with open(vcf_path) as fin:
                            stream = ResultStream(fin, sample_id=sample.id,
                                                  vcf_sample=vcf_sample)
                            pgcopy_batch(stream, Result._meta.db_table,
                                         stream.output_columns, cursor,
                                         database)

                        # Update result count
                        sample.count = sample.results.count()
                        sample.published = True
                        sample.save()
                        successful = True
                # Catch Exception, not a bare except, so the retry loop
                # cannot swallow KeyboardInterrupt/SystemExit.
                except Exception:
                    # log.exception records the traceback that a plain
                    # log.error discarded.
                    log.exception('STS errors')
                    time.sleep(10)

        vcf_info = manifest.section('vcf')

        # Absolute path relative to the MANIFEST directory
        vcf_path = os.path.join(os.path.dirname(manifest_path),
                                vcf_info['file'])

        # Compare expected MD5 (in manifest) to the file MD5
        if 'md5' in vcf_info:
            vcf_md5 = checks.file_md5(vcf_path)

            if vcf_md5 != vcf_info['md5']:
                log.error('VCF file MD5 does not match expected in manifest',
                          extra={
                              'manifest_path': manifest_path,
                          })

        # Existing samples by the same name of a previous version are
        # unpublished now that it is ready to be published.
        count = Sample.objects.filter(
            name__iexact=sample.name,
            project=sample.project,
            batch=sample.batch,
            version__lt=sample.version).update(published=False)

        if count:
            log.info('{0} previous versions unpublished for {1}'.format(
                count, sample))