示例#1
0
    def handle(self, path, **options):
        """Dump the effect stream for ``path`` to stdout or reload the table.

        Options read: ``database``, ``transripts`` and ``stdout``.

        With ``stdout`` set, every line produced by the ``EffectStream`` is
        written to ``sys.stdout`` and nothing touches the database.
        Otherwise the ``VariantEffect`` table is truncated (and the
        ``Transcript`` table too, with CASCADE, when ``transripts`` is set)
        and the stream is passed to ``pgcopy_batch`` inside a manually
        managed transaction. Any exception rolls the transaction back, is
        logged and then re-raised.
        """
        database = options.get('database')
        truncate_transcripts = options.get('transripts')
        to_stdout = options.get('stdout')

        with open(path) as source:
            effects = EffectStream(source, skip_existing=False)

            if to_stdout:
                # readline() signals exhaustion with an empty string.
                for line in iter(effects.readline, ''):
                    sys.stdout.write(line)
                return

            cursor = connections[database].cursor()

            with transaction.commit_manually(database):
                try:
                    effect_table = VariantEffect._meta.db_table
                    cursor.execute('TRUNCATE {0}'.format(effect_table))

                    if truncate_transcripts:
                        transcript_table = Transcript._meta.db_table
                        cursor.execute(
                            'TRUNCATE {0} CASCADE'.format(transcript_table))

                    columns = effects.output_columns
                    pgcopy_batch(effects, effect_table, columns, cursor,
                                 database)

                    transaction.commit(database)
                except Exception as e:
                    transaction.rollback(database)
                    log.exception(e)
                    raise
示例#2
0
    def handle(self, path, **options):
        """Dump the effect stream for ``path`` to stdout or reload the table.

        Options read: ``database``, ``transripts`` and ``stdout``.

        With ``stdout`` set, every line produced by the ``EffectStream`` is
        written to ``sys.stdout`` and nothing touches the database.
        Otherwise the ``VariantEffect`` table is truncated (and the
        ``Transcript`` table too, with CASCADE, when ``transripts`` is set)
        and the stream is passed to ``pgcopy_batch`` inside a manually
        managed transaction. Any exception rolls the transaction back, is
        logged and then re-raised.
        """
        database = options.get('database')
        truncate_transcripts = options.get('transripts')
        to_stdout = options.get('stdout')

        with open(path) as source:
            effects = EffectStream(source, skip_existing=False)

            if to_stdout:
                # readline() signals exhaustion with an empty string.
                for line in iter(effects.readline, ''):
                    sys.stdout.write(line)
                return

            cursor = connections[database].cursor()

            with transaction.commit_manually(database):
                try:
                    effect_table = VariantEffect._meta.db_table
                    cursor.execute('TRUNCATE {0}'.format(effect_table))

                    if truncate_transcripts:
                        transcript_table = Transcript._meta.db_table
                        cursor.execute(
                            'TRUNCATE {0} CASCADE'.format(transcript_table))

                    columns = effects.output_columns
                    pgcopy_batch(effects, effect_table, columns, cursor,
                                 database)

                    transaction.commit(database)
                except Exception as e:
                    transaction.rollback(database)
                    log.exception(e)
                    raise
示例#3
0
def load_effects(manifest_path, database, **kwargs):
    """Feed the effects of the manifest's VCF file to ``pgcopy_batch``.

    The ``vcf`` section of the manifest at ``manifest_path`` is consulted;
    when it has no ``file`` entry the function returns without doing
    anything. Otherwise the file, resolved relative to the manifest's
    directory, is wrapped in an ``EffectStream`` and copied into the
    ``VariantEffect`` table using a cursor on ``database``.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest names no VCF file.
    if 'file' not in vcf_section:
        return

    cursor = connections[database].cursor()

    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in {1} load_effects".format(vcf_path, __name__))
        effects = EffectStream(vcf_file)
        columns = effects.output_columns
        target_table = VariantEffect._meta.db_table
        pgcopy_batch(effects, target_table, columns, cursor, database)
示例#4
0
def load_effects(manifest_path, database, **kwargs):
    """Feed the effects of the manifest's VCF file to ``pgcopy_batch``.

    The ``vcf`` section of the manifest at ``manifest_path`` is consulted;
    when it has no ``file`` entry the function returns without doing
    anything. Otherwise the file, resolved relative to the manifest's
    directory, is wrapped in an ``EffectStream`` and copied into the
    ``VariantEffect`` table using a cursor on ``database``.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest names no VCF file.
    if 'file' not in vcf_section:
        return

    cursor = connections[database].cursor()

    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in {1} load_effects".format(vcf_path, __name__))
        effects = EffectStream(vcf_file)
        columns = effects.output_columns
        target_table = VariantEffect._meta.db_table
        pgcopy_batch(effects, target_table, columns, cursor, database)
示例#5
0
def load_variants(manifest_path, database, **kwargs):
    """Load the variants of the manifest's VCF file, then notify listeners.

    Variant loading requires only a VCF file and will never load a duplicate.

    When the manifest's ``vcf`` section has no ``file`` entry the function
    returns immediately and nothing is published. Otherwise the file,
    resolved relative to the manifest's directory, is wrapped in a
    ``VariantStream`` and handed to ``pgcopy_batch`` for the ``Variant``
    table, after which ``VARIANT_CHANNEL`` is published with the manifest
    path and database.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest names no VCF file.
    if 'file' not in vcf_section:
        return

    cursor = connections[database].cursor()

    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in {1}".format(vcf_path, __name__))
        variants = VariantStream(vcf_file)
        columns = variants.output_columns
        target_table = Variant._meta.db_table
        pgcopy_batch(variants, target_table, columns, cursor, database)

    VARIANT_CHANNEL.publish(manifest_path=manifest_path, database=database)
示例#6
0
def load_variants(manifest_path, database, **kwargs):
    """Load the variants of the manifest's VCF file, then notify listeners.

    Variant loading requires only a VCF file and will never load a duplicate.

    When the manifest's ``vcf`` section has no ``file`` entry the function
    returns immediately and nothing is published. Otherwise the file,
    resolved relative to the manifest's directory, is wrapped in a
    ``VariantStream`` and handed to ``pgcopy_batch`` for the ``Variant``
    table, after which ``VARIANT_CHANNEL`` is published with the manifest
    path and database.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest names no VCF file.
    if 'file' not in vcf_section:
        return

    cursor = connections[database].cursor()

    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in {1}".format(vcf_path, __name__))
        variants = VariantStream(vcf_file)
        columns = variants.output_columns
        target_table = Variant._meta.db_table
        pgcopy_batch(variants, target_table, columns, cursor, database)

    VARIANT_CHANNEL.publish(manifest_path=manifest_path, database=database)
示例#7
0
def load_results(manifest_path, database, **kwargs):
    """Load the per-sample results of the manifest's VCF file.

    Returns early (after logging) when the manifest is not marked for load
    or its sample section fails ``check_sample_section``.

    Sample names are taken from the VCF header. They are paired positionally
    with the comma-separated ``sample`` entry of the manifest's sample
    section when that entry exists, otherwise the VCF names are used for
    the lookup as well. Each pair is resolved to an existing ``Sample``; a
    missing sample aborts the whole call. Samples that already have results
    are skipped; the others are copied with ``pgcopy_batch`` and then marked
    as published.

    Afterwards the VCF's MD5 is compared with the manifest (a mismatch is
    only logged) and previous versions of the last processed sample are
    unpublished.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load', extra={
            'manifest_path': manifest_path,
        })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid', extra={
            'manifest_path': manifest_path,
        })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # Absolute path relative to the MANIFEST directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])

    # Ignore whatever sample is listed in the manifest and scan the vcf for
    # samples.
    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_samples".format(vcf_path))
        reader = vcf.Reader(file_obj)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    # Stays None when the VCF lists no samples, so the unpublish step below
    # can be skipped instead of failing on an unbound name.
    sample = None

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            log.error('Sample does not exist', extra=sample_info)
            return

        # Already loaded: skip for now.
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
            continue

        log.debug('about to load results for {0}'.format(vcf_sample))

        # The original note here reads "STSError: Cannot start transition
        # while already in one."; the load is retried every 10 seconds until
        # it goes through.
        # NOTE(review): any failure (not just STS errors) is retried forever.
        successful = False
        while not successful:
            try:
                with transition(sample, 'Sample Published',
                                event='Loading Results'):
                    connection = connections[database]
                    cursor = connection.cursor()

                    with open(vcf_path) as fin:
                        stream = ResultStream(fin, sample_id=sample.id,
                                              vcf_sample=vcf_sample)
                        columns = stream.output_columns
                        db_table = Result._meta.db_table
                        pgcopy_batch(stream, db_table, columns, cursor,
                                     database)

                    # Update result count
                    sample.count = sample.results.count()
                    sample.published = True
                    sample.save()
                    successful = True
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit can still stop the loop;
                # log.exception keeps the traceback of the real failure.
                log.exception('STS errors')
                time.sleep(10)

    # Compare expected MD5 (in manifest) to the file MD5
    if 'md5' in vcf_info:
        vcf_md5 = checks.file_md5(vcf_path)

        if vcf_md5 != vcf_info['md5']:
            log.error('VCF file MD5 does not match expected in manifest',
                      extra={'manifest_path': manifest_path})

    if sample is None:
        log.info('No samples found in VCF file', extra={
            'manifest_path': manifest_path,
        })
        return

    # Existing samples by the same name of a previous version are unpublished
    # now that this one is ready to be published.
    # NOTE(review): only the last sample of the loop is handled here; confirm
    # whether multi-sample VCFs should unpublish for every sample.
    count = Sample.objects.filter(
        name__iexact=sample.name, project=sample.project, batch=sample.batch,
        version__lt=sample.version).update(published=False)

    if count:
        log.info('{0} previous versions unpublished for {1}'
                 .format(count, sample))
示例#8
0
def load_evs(database, **kwargs):
    """Copy EVS rows that are not loaded yet from ``raw.evs`` into ``evs``.

    Does nothing when the ``raw.evs`` table does not exist. Otherwise the EVS
    sequence is reset and, inside a manually managed transaction on
    ``database``, every raw row that matches a variant by MD5 and has no
    ``evs`` row yet is selected, converted to a tab-separated line and passed
    to ``pgcopy_batch``. Failures roll the transaction back and are logged;
    they are not re-raised.
    """
    if not db.utils.table_exists('evs', schema='raw'):
        return

    cursor = connections[database].cursor()
    cursor.execute(db.utils.sequence_reset_sql(EVS, database))

    with transaction.commit_manually(database):
        try:
            # MAFs are divided by 100 since they are percentages to begin with
            cursor.execute('''
                SELECT variant.id, r.ea_ac_ref, r.ea_ac_alt, r.aa_ac_ref,
                    r.aa_ac_alt, r.all_ac_ref, r.all_ac_alt, r.ea_maf / 100.0,
                    r.aa_maf / 100.0, r.all_maf / 100.0, r.gts, r.ea_gtc,
                    r.aa_gtc, r.all_gtc, r.clinical_association
                FROM "variant"
                    INNER JOIN "raw"."evs" r ON ("variant".md5 = r."md5")
                    LEFT OUTER JOIN "evs"
                        ON ("evs"."variant_id" = "variant"."id")
                WHERE "evs"."id" IS NULL
            ''')

            # Target columns, in the same order as the SELECT list above.
            # Note, *_af are actually minor allele frequencies by default;
            # the handler replaces a value with ``1 - value`` when the
            # reference count is less than an alternate count.
            columns = [
                'variant_id', 'ea_ac_ref', 'ea_ac_alt', 'aa_ac_ref',
                'aa_ac_alt', 'all_ac_ref', 'all_ac_alt', 'ea_af', 'aa_af',
                'all_af', 'gts', 'ea_gtc', 'aa_gtc', 'all_gtc',
                'clinical_association'
            ]

            def compare_counts(ref, alt):
                "True if ``ref`` is below any comma-separated count in ``alt``."
                ref = int(ref)
                return any(ref < int(_alt) for _alt in alt.split(','))

            def handler(row):
                "Turn one selected row into a tab-separated COPY line."
                record = OrderedDict(zip(columns, row))

                # All, European, African American
                for prefix in ('all', 'ea', 'aa'):
                    ref_key = prefix + '_ac_ref'
                    alt_key = prefix + '_ac_alt'
                    af_key = prefix + '_af'
                    if compare_counts(record[ref_key], record[alt_key]):
                        record[af_key] = 1 - record[af_key]

                # '\\N' (backslash + N) marks NULL for COPY. The escaped
                # spelling is the same two characters on Python 2 and is
                # also valid syntax on Python 3.
                cleaned = [
                    str(x) if x is not None else '\\N'
                    for x in record.values()
                ]
                return '\t'.join(cleaned) + '\n'

            def streamer(cursor):
                "Yield formatted lines, fetching 100 rows at a time."
                while True:
                    rows = cursor.fetchmany(100)
                    if not rows:
                        break

                    for row in rows:
                        yield handler(row)

            pgcopy_batch(streamer(cursor),
                         EVS._meta.db_table,
                         columns=columns,
                         database=database)

            # Commit/rollback on the same alias that commit_manually manages;
            # without the argument they would act on the default database.
            transaction.commit(database)
        except Exception as e:
            transaction.rollback(database)
            log.exception(e)
示例#9
0
def load_evs(database, **kwargs):
    """Copy EVS rows that are not loaded yet from ``raw.evs`` into ``evs``.

    Does nothing when the ``raw.evs`` table does not exist. Otherwise the EVS
    sequence is reset and, inside a manually managed transaction on
    ``database``, every raw row that matches a variant by MD5 and has no
    ``evs`` row yet is selected, converted to a tab-separated line and passed
    to ``pgcopy_batch``. Failures roll the transaction back and are logged;
    they are not re-raised.
    """
    if not db.utils.table_exists('evs', schema='raw'):
        return

    cursor = connections[database].cursor()
    cursor.execute(db.utils.sequence_reset_sql(EVS, database))

    with transaction.commit_manually(database):
        try:
            # MAFs are divided by 100 since they are percentages to begin with
            cursor.execute('''
                SELECT variant.id, r.ea_ac_ref, r.ea_ac_alt, r.aa_ac_ref,
                    r.aa_ac_alt, r.all_ac_ref, r.all_ac_alt, r.ea_maf / 100.0,
                    r.aa_maf / 100.0, r.all_maf / 100.0, r.gts, r.ea_gtc,
                    r.aa_gtc, r.all_gtc, r.clinical_association
                FROM "variant"
                    INNER JOIN "raw"."evs" r ON ("variant".md5 = r."md5")
                    LEFT OUTER JOIN "evs"
                        ON ("evs"."variant_id" = "variant"."id")
                WHERE "evs"."id" IS NULL
            ''')

            # Target columns, in the same order as the SELECT list above.
            # Note, *_af are actually minor allele frequencies by default;
            # the handler replaces a value with ``1 - value`` when the
            # reference count is less than an alternate count.
            columns = ['variant_id', 'ea_ac_ref', 'ea_ac_alt', 'aa_ac_ref',
                       'aa_ac_alt', 'all_ac_ref', 'all_ac_alt', 'ea_af',
                       'aa_af', 'all_af', 'gts', 'ea_gtc', 'aa_gtc', 'all_gtc',
                       'clinical_association']

            def compare_counts(ref, alt):
                "True if ``ref`` is below any comma-separated count in ``alt``."
                ref = int(ref)
                return any(ref < int(_alt) for _alt in alt.split(','))

            def handler(row):
                "Turn one selected row into a tab-separated COPY line."
                record = OrderedDict(zip(columns, row))

                # All, European, African American
                for prefix in ('all', 'ea', 'aa'):
                    ref_key = prefix + '_ac_ref'
                    alt_key = prefix + '_ac_alt'
                    af_key = prefix + '_af'
                    if compare_counts(record[ref_key], record[alt_key]):
                        record[af_key] = 1 - record[af_key]

                # '\\N' (backslash + N) marks NULL for COPY. The escaped
                # spelling is the same two characters on Python 2 and is
                # also valid syntax on Python 3.
                cleaned = [str(x) if x is not None else '\\N' for x in
                           record.values()]
                return '\t'.join(cleaned) + '\n'

            def streamer(cursor):
                "Yield formatted lines, fetching 100 rows at a time."
                while True:
                    rows = cursor.fetchmany(100)
                    if not rows:
                        break

                    for row in rows:
                        yield handler(row)

            pgcopy_batch(streamer(cursor), EVS._meta.db_table, columns=columns,
                         database=database)

            # Commit/rollback on the same alias that commit_manually manages;
            # without the argument they would act on the default database.
            transaction.commit(database)
        except Exception as e:
            transaction.rollback(database)
            log.exception(e)
示例#10
0
def load_results(manifest_path, database, **kwargs):
    """Load the per-sample results of the manifest's VCF file.

    Returns early (after logging) when the manifest is not marked for load
    or its sample section fails ``check_sample_section``.

    Sample names are taken from the VCF header. They are paired positionally
    with the comma-separated ``sample`` entry of the manifest's sample
    section when that entry exists, otherwise the VCF names are used for
    the lookup as well. Each pair is resolved to an existing ``Sample``; a
    missing sample aborts the whole call. Samples that already have results
    are skipped; the others are copied with ``pgcopy_batch`` and then marked
    as published.

    Afterwards the VCF's MD5 is compared with the manifest (a mismatch is
    only logged) and previous versions of the last processed sample are
    unpublished.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load',
                 extra={
                     'manifest_path': manifest_path,
                 })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid',
                 extra={
                     'manifest_path': manifest_path,
                 })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # Absolute path relative to the MANIFEST directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])

    # Ignore whatever sample is listed in the manifest and scan the vcf for
    # samples.
    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_samples".format(vcf_path))
        reader = vcf.Reader(file_obj, preserve_order=False)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    # Stays None when the VCF lists no samples, so the unpublish step below
    # can be skipped instead of failing on an unbound name.
    sample = None

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            log.error('Sample does not exist', extra=sample_info)
            return

        # Already loaded: skip for now.
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
            continue

        log.debug('about to load results for {0}'.format(vcf_sample))

        # The original note here reads "STSError: Cannot start transition
        # while already in one."; the load is retried every 10 seconds until
        # it goes through.
        # NOTE(review): any failure (not just STS errors) is retried forever.
        successful = False
        while not successful:
            try:
                with transition(sample,
                                'Sample Published',
                                event='Loading Results'):
                    connection = connections[database]
                    cursor = connection.cursor()

                    with open(vcf_path) as fin:
                        stream = ResultStream(fin,
                                              sample_id=sample.id,
                                              vcf_sample=vcf_sample)
                        columns = stream.output_columns
                        db_table = Result._meta.db_table
                        pgcopy_batch(stream, db_table, columns, cursor,
                                     database)

                    # Update result count
                    sample.count = sample.results.count()
                    sample.published = True
                    sample.save()
                    successful = True
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit can still stop the loop;
                # log.exception keeps the traceback of the real failure.
                log.exception('STS errors')
                time.sleep(10)

    # Compare expected MD5 (in manifest) to the file MD5
    if 'md5' in vcf_info:
        vcf_md5 = checks.file_md5(vcf_path)

        if vcf_md5 != vcf_info['md5']:
            log.error('VCF file MD5 does not match expected in manifest',
                      extra={
                          'manifest_path': manifest_path,
                      })

    if sample is None:
        log.info('No samples found in VCF file',
                 extra={
                     'manifest_path': manifest_path,
                 })
        return

    # Existing samples by the same name of a previous version are unpublished
    # now that this one is ready to be published.
    # NOTE(review): only the last sample of the loop is handled here; confirm
    # whether multi-sample VCFs should unpublish for every sample.
    count = Sample.objects.filter(
        name__iexact=sample.name,
        project=sample.project,
        batch=sample.batch,
        version__lt=sample.version).update(published=False)

    if count:
        log.info('{0} previous versions unpublished for {1}'.format(
            count, sample))