예제 #1
0
파일: load.py 프로젝트: arahuja/cohorts
    def _load_single_sample_variants(self, sample_idx, file_format_funcs, variant_type, merge_type):
        """
        Load one sample's variants, merging across all of its VCF files.

        A cached result is returned when `self.load_from_cache` finds one.
        Otherwise every file named by `file_format_funcs` is loaded, the
        per-file variant sets are merged, and the merged collection is
        stored with `self.save_to_cache` before being returned.

        Parameters
        ----------
        sample_idx : int
            Index into `self.sample_ids` (and into the normal/tumor BAM ID
            lists when those are not None).
        file_format_funcs : list of callable
            Each is called as `func(sample_id, normal_bam_id, tumor_bam_id)`
            and returns a file name relative to `self.data_dir`.
        variant_type : str
            Used to build the cache file name.
        merge_type : str
            "union" or "intersection". Only checked when more than one
            file was loaded; also part of the cache file name.

        Returns
        -------
        The cached object when present, otherwise a VariantCollection.
        """
        sample_id = self.sample_ids[sample_idx]
        normal_bam_id = self.normal_bam_ids[sample_idx] if self.normal_bam_ids is not None else None
        tumor_bam_id = self.tumor_bam_ids[sample_idx] if self.tumor_bam_ids is not None else None

        cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
        cache_name = self.cache_names["variant"]
        previously_cached = self.load_from_cache(cache_name, sample_id, cached_file_name)
        if previously_cached is not None:
            return previously_cached

        # One set of variants per input file, in the order the files were given.
        per_file_variants = [
            set(varcode.load_vcf_fast(
                path.join(
                    self.data_dir,
                    make_file_name(sample_id, normal_bam_id, tumor_bam_id))).elements)
            for make_file_name in file_format_funcs
        ]

        if len(per_file_variants) == 1:
            # A single file needs no merging.
            merged_elements = per_file_variants[0]
        else:
            assert merge_type in ["union", "intersection"], "Unknown merge type: %s" % merge_type
            if merge_type == "union":
                merged_elements = set.union(*per_file_variants)
            elif merge_type == "intersection":
                merged_elements = set.intersection(*per_file_variants)
        merged_variants = VariantCollection(merged_elements)

        self.save_to_cache(merged_variants, cache_name, sample_id, cached_file_name)
        return merged_variants
예제 #2
0
 def do_test(kwargs):
     """
     Check that `load_vcf_fast` and `load_vcf` agree for the same arguments.

     Both loaders are called with `**kwargs`; the results, their lengths,
     their `elements` and their `metadata` must compare equal, and each
     result must contain more than one entry.
     """
     fast_result = load_vcf_fast(**kwargs)
     reference_result = load_vcf(**kwargs)

     eq_(fast_result, reference_result)
     eq_(len(fast_result), len(reference_result))
     for attribute in ("elements", "metadata"):
         eq_(getattr(fast_result, attribute), getattr(reference_result, attribute))

     # Guard against a trivially passing comparison of near-empty results.
     for loaded in (fast_result, reference_result):
         assert len(loaded) > 1
예제 #3
0
 def do_test(kwargs):
     """
     Check that `load_vcf_fast` and `load_vcf` agree for the same arguments.

     Both loaders are called with `**kwargs`; the results, their lengths,
     their `elements` and their `metadata` must compare equal, and each
     result must contain more than one entry.
     """
     fast_result = load_vcf_fast(**kwargs)
     reference_result = load_vcf(**kwargs)

     eq_(fast_result, reference_result)
     eq_(len(fast_result), len(reference_result))
     for attribute in ("elements", "metadata"):
         eq_(getattr(fast_result, attribute), getattr(reference_result, attribute))

     # Guard against a trivially passing comparison of near-empty results.
     for loaded in (fast_result, reference_result):
         assert len(loaded) > 1
예제 #4
0
 def loader(filename):
     """
     Load `filename` with `varcode.load_vcf_fast` and convert it to a dataframe.

     `genome`, `max_variants`, `only_passing` and `metadata_column_prefix`
     are not parameters; they are read from the enclosing scope.
     """
     load_options = dict(
         genome=genome,
         max_variants=max_variants,
         only_passing=only_passing,
         allow_extended_nucleotides=True)
     loaded = varcode.load_vcf_fast(filename, **load_options)
     # The collection's own metadata is passed alongside the collection.
     metadata = loaded.metadata
     return variants_to_dataframe(
         loaded, metadata, metadata_column_prefix=metadata_column_prefix)
예제 #5
0
 def loader(filename):
     """
     Load `filename` with `varcode.load_vcf_fast` and convert it to a dataframe.

     `genome`, `max_variants`, `only_passing` and `metadata_column_prefix`
     are not parameters; they are read from the enclosing scope.
     """
     load_options = dict(
         genome=genome,
         max_variants=max_variants,
         only_passing=only_passing,
         allow_extended_nucleotides=True)
     loaded = varcode.load_vcf_fast(filename, **load_options)
     # The collection's own metadata is passed alongside the collection.
     metadata = loaded.metadata
     return variants_to_dataframe(
         loaded, metadata, metadata_column_prefix=metadata_column_prefix)
예제 #6
0
def run():
    """
    Load the VCF given on the command line and report how long it took.

    `varcode.load_vcf` is used when the parsed `pyvcf` argument is truthy,
    `varcode.load_vcf_fast` otherwise. When the parsed `info_field`
    argument is falsy, `include_info=False` is passed to the fast loader.
    Prints the number of loaded variants, the elapsed seconds, and
    `result.to_string(limit=5)`.
    """
    args = parser.parse_args()

    # Only the fast loader receives this option.
    fast_loader_options = {} if args.info_field else {"include_info": False}

    started_at = time.time()

    if args.pyvcf:
        result = varcode.load_vcf(args.path, allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(
            args.path, allow_extended_nucleotides=True, **fast_loader_options)

    num_loaded = len(result)
    elapsed_seconds = time.time() - started_at
    print("Loaded %d variants in %0.3f sec. " % (num_loaded, elapsed_seconds))
    print(result.to_string(limit=5))
예제 #7
0
    def __init__(self,
                 ranked_variants_with_vaccine_peptides,
                 patient_info,
                 final_review,
                 reviewers,
                 args_for_report,
                 input_json_file,
                 cosmic_vcf_filename=None):
        """
        Construct a TemplateDataCreator object, from the output of the vaxrank pipeline.

        Parameters
        ----------
        ranked_variants_with_vaccine_peptides, patient_info
            Stored on the instance unchanged.
        final_review, input_json_file
            Copied into `self.template_data` unchanged.
        reviewers : str or None
            Comma-separated reviewer names; a falsy value yields an empty list.
        args_for_report : dict
            Command-line arguments by name. Must contain the keys
            'manufacturability' and 'wt_epitopes'; keys starting with
            "output" are left out of the displayed arguments.
        cosmic_vcf_filename : str, optional
            When truthy, this VCF is loaded with `load_vcf_fast` and kept
            as `self.cosmic_variant_collection`; otherwise that attribute
            is None.
        """
        self.ranked_variants_with_vaccine_peptides = ranked_variants_with_vaccine_peptides
        self.patient_info = patient_info

        # Output-related command-line args are hidden; everything else is
        # displayed, sorted by argument name.
        displayed_args = sorted(
            (name, value)
            for name, value in args_for_report.items()
            if not name.startswith("output"))

        reviewer_names = reviewers.split(',') if reviewers else []

        self.template_data = {
            'args': displayed_args,
            'reviewers': reviewer_names,
            'final_review': final_review,
            'input_json_file': input_json_file,
            # these report sections are optional
            'include_manufacturability': args_for_report['manufacturability'],
            'include_wt_epitopes': args_for_report['wt_epitopes'],
        }

        # COSMIC variants are loaded only when a VCF path was supplied.
        cosmic_variants = None
        if cosmic_vcf_filename:
            logger.info('Loading COSMIC data...')
            cosmic_variants = load_vcf_fast(
                cosmic_vcf_filename,
                allow_extended_nucleotides=True,
                include_info=False)
            logger.info('COSMIC data loaded.')
        self.cosmic_variant_collection = cosmic_variants
예제 #8
0
def run():
    """
    Load the VCF given on the command line and report how long it took.

    `varcode.load_vcf` is used when the parsed `pyvcf` argument is truthy,
    `varcode.load_vcf_fast` otherwise. When the parsed `info_field`
    argument is falsy, `include_info=False` is passed to the fast loader.
    Prints the number of loaded variants, the elapsed seconds, and
    `result.to_string(limit=5)`.
    """
    args = parser.parse_args()

    # Only the fast loader receives this option.
    fast_loader_options = {} if args.info_field else {"include_info": False}

    started_at = time.time()

    if args.pyvcf:
        result = varcode.load_vcf(args.path, allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(
            args.path, allow_extended_nucleotides=True, **fast_loader_options)

    num_loaded = len(result)
    elapsed_seconds = time.time() - started_at
    print("Loaded %d variants in %0.3f sec. " % (num_loaded, elapsed_seconds))
    print(result.to_string(limit=5))
예제 #9
0
파일: report.py 프로젝트: hammerlab/vaxrank
    def __init__(
            self,
            ranked_variants_with_vaccine_peptides,
            patient_info,
            final_review,
            reviewers,
            args_for_report,
            input_json_file,
            cosmic_vcf_filename=None):
        """
        Construct a TemplateDataCreator object, from the output of the vaxrank pipeline.

        Parameters
        ----------
        ranked_variants_with_vaccine_peptides, patient_info
            Stored on the instance unchanged.
        final_review, input_json_file
            Copied into `self.template_data` unchanged.
        reviewers : str or None
            Comma-separated reviewer names; a falsy value yields an empty list.
        args_for_report : dict
            Command-line arguments by name. Must contain the keys
            'manufacturability' and 'wt_epitopes'; keys starting with
            "output" are left out of the displayed arguments.
        cosmic_vcf_filename : str, optional
            When truthy, this VCF is loaded with `load_vcf_fast` and kept
            as `self.cosmic_variant_collection`; otherwise that attribute
            is None.
        """
        self.ranked_variants_with_vaccine_peptides = ranked_variants_with_vaccine_peptides
        self.patient_info = patient_info

        # Output-related command-line args are hidden; everything else is
        # displayed, sorted by argument name.
        displayed_args = sorted(
            (name, value)
            for name, value in args_for_report.items()
            if not name.startswith("output"))

        reviewer_names = reviewers.split(',') if reviewers else []

        self.template_data = {
            'args': displayed_args,
            'reviewers': reviewer_names,
            'final_review': final_review,
            'input_json_file': input_json_file,
            # these report sections are optional
            'include_manufacturability': args_for_report['manufacturability'],
            'include_wt_epitopes': args_for_report['wt_epitopes'],
        }

        # COSMIC variants are loaded only when a VCF path was supplied.
        cosmic_variants = None
        if cosmic_vcf_filename:
            logger.info('Loading COSMIC data...')
            cosmic_variants = load_vcf_fast(
                cosmic_vcf_filename, allow_extended_nucleotides=True, include_info=False)
            logger.info('COSMIC data loaded.')
        self.cosmic_variant_collection = cosmic_variants
예제 #10
0
def generate_vcfs(id_to_mutation_count, file_format, template_name):
    """
    Generate cropped VCFs from a template, for each sample.

    Parameters
    ----------
    id_to_mutation_count : dict
        sample ID to number of mutations we want to generate for that sample
    file_format : str
        %-style format string; `file_format % sample_id` gives each output
        file name, which is placed under the "vcfs" generated-data directory
    template_name : str
        Name of the template VCF, resolved with `data_path`

    Returns
    -------
    str
        Path to the generated VCF directory (the directory of the last
        file written)

    Raises
    ------
    ValueError
        If `id_to_mutation_count` is empty: no file is written, so there
        is no directory to return.
    AssertionError
        If a sample asks for more records than the template contains.
    """
    if not id_to_mutation_count:
        # Without this guard the final return referenced a file handle that
        # was never bound and failed with an opaque UnboundLocalError.
        raise ValueError(
            "id_to_mutation_count must contain at least one sample")

    # The template is the same for every sample: resolve and count it once
    # instead of re-loading the whole VCF per sample.
    template_path = data_path(template_name)
    num_records_in_template = len(load_vcf_fast(template_path))

    output_dir = None
    for sample_id, num_records_to_generate in id_to_mutation_count.items():
        # Validate before touching the filesystem so that a bad request
        # does not leave an empty VCF behind.
        assert num_records_in_template >= num_records_to_generate, (
            "Cannot generate more records than exist in the template: %d is less than %d" % (
                num_records_in_template, num_records_to_generate))

        file_path = generated_data_path(
            path.join("vcfs", file_format % sample_id))
        output_dir = path.dirname(file_path)
        if not path.exists(output_dir):
            makedirs(output_dir)

        # A fresh reader per sample: it is consumed as records are copied.
        vcf_reader = vcf.Reader(filename=template_path)
        with open(file_path, "w") as f:
            vcf_writer = vcf.Writer(f, vcf_reader)
            for num_written, record in enumerate(vcf_reader):
                if num_written >= num_records_to_generate:
                    break
                vcf_writer.write_record(record)

    return output_dir
예제 #11
0
def generate_vcfs(id_to_mutation_count, file_format_func, template_name):
    """
    Generate cropped VCFs from a template, for each sample.

    Parameters
    ----------
    id_to_mutation_count : dict
        sample ID to number of mutations we want to generate for that sample
    file_format_func : callable
        Called as `file_format_func(sample_id, None, None)` to get each
        output file name, which is placed under the "vcfs" generated-data
        directory
    template_name : str
        Name of the template VCF, resolved with `data_path`

    Returns
    -------
    str
        Path to the generated VCF directory (the directory of the last
        file written)

    Raises
    ------
    ValueError
        If `id_to_mutation_count` is empty: no file is written, so there
        is no directory to return.
    AssertionError
        If a sample asks for more records than the template contains.
    """
    if not id_to_mutation_count:
        # Without this guard the final return referenced a file handle that
        # was never bound and failed with an opaque UnboundLocalError.
        raise ValueError(
            "id_to_mutation_count must contain at least one sample")

    # The template is the same for every sample: resolve and count it once
    # instead of re-loading the whole VCF per sample.
    template_path = data_path(template_name)
    num_records_in_template = len(load_vcf_fast(template_path))

    output_dir = None
    for sample_id, num_records_to_generate in id_to_mutation_count.items():
        # Validate before touching the filesystem so that a bad request
        # does not leave an empty VCF behind.
        assert num_records_in_template >= num_records_to_generate, (
            "Cannot generate more records than exist in the template: %d is less than %d"
            % (num_records_in_template, num_records_to_generate))

        file_path = generated_data_path(
            path.join("vcfs", file_format_func(sample_id, None, None)))
        output_dir = path.dirname(file_path)
        if not path.exists(output_dir):
            makedirs(output_dir)

        # A fresh reader per sample: it is consumed as records are copied.
        vcf_reader = vcf.Reader(filename=template_path)
        with open(file_path, "w") as f:
            vcf_writer = vcf.Writer(f, vcf_reader)
            for num_written, record in enumerate(vcf_reader):
                if num_written >= num_records_to_generate:
                    break
                vcf_writer.write_record(record)

    return output_dir
예제 #12
0
파일: load.py 프로젝트: arahuja/cohorts
    def _load_single_sample_variants(self, sample_idx, file_format_funcs,
                                     variant_type, merge_type):
        """
        Load one sample's variants, merging across all of its VCF files.

        A cached result is returned when `self.load_from_cache` finds one.
        Otherwise every file named by `file_format_funcs` is loaded, the
        per-file variant sets are merged, and the merged collection is
        stored with `self.save_to_cache` before being returned.

        Parameters
        ----------
        sample_idx : int
            Index into `self.sample_ids` (and into the normal/tumor BAM ID
            lists when those are not None).
        file_format_funcs : list of callable
            Each is called as `func(sample_id, normal_bam_id, tumor_bam_id)`
            and returns a file name relative to `self.data_dir`.
        variant_type : str
            Used to build the cache file name.
        merge_type : str
            "union" or "intersection". Only checked when more than one
            file was loaded; also part of the cache file name.

        Returns
        -------
        The cached object when present, otherwise a VariantCollection.
        """
        sample_id = self.sample_ids[sample_idx]

        # BAM IDs are optional; each stays None when its list is not set.
        normal_bam_id = None
        if self.normal_bam_ids is not None:
            normal_bam_id = self.normal_bam_ids[sample_idx]
        tumor_bam_id = None
        if self.tumor_bam_ids is not None:
            tumor_bam_id = self.tumor_bam_ids[sample_idx]

        cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
        cache_name = self.cache_names["variant"]
        cache_hit = self.load_from_cache(
            cache_name, sample_id, cached_file_name)
        if cache_hit is not None:
            return cache_hit

        def load_variant_set(name_func):
            # Resolve one file name under the data directory and load it.
            vcf_path = path.join(
                self.data_dir,
                name_func(sample_id, normal_bam_id, tumor_bam_id))
            return set(varcode.load_vcf_fast(vcf_path).elements)

        variant_sets = [
            load_variant_set(name_func) for name_func in file_format_funcs]

        if len(variant_sets) == 1:
            # A single file needs no merging.
            merged_elements = variant_sets[0]
        else:
            assert merge_type in ["union", "intersection"
                                  ], "Unknown merge type: %s" % merge_type
            if merge_type == "union":
                merged_elements = set.union(*variant_sets)
            elif merge_type == "intersection":
                merged_elements = set.intersection(*variant_sets)
        merged_variants = VariantCollection(merged_elements)

        self.save_to_cache(
            merged_variants, cache_name, sample_id, cached_file_name)
        return merged_variants
예제 #13
0
def test_load_vcf_mouse_with_explicit_urls():
    """Both loaders return 217 variants for MOUSE_VCF with `explicit_url_genome`."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF, genome=explicit_url_genome)), 217)
예제 #14
0
def test_load_vcf_mouse_with_ensembl_release():
    """Both loaders return 217 variants for MOUSE_VCF with `ensembl_mouse_genome`."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF, genome=ensembl_mouse_genome)), 217)
예제 #15
0
def test_load_vcf_mouse_with_inferred_genome():
    """Both loaders return 217 variants for MOUSE_VCF when no genome is passed."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF)), 217)
예제 #16
0
def test_load_vcf_mouse_with_ensembl_release():
    """Both loaders return 217 variants for MOUSE_VCF with `ensembl_mouse_genome`."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF, genome=ensembl_mouse_genome)), 217)
예제 #17
0
def test_load_vcf_mouse_with_explicit_urls():
    """Both loaders return 217 variants for MOUSE_VCF with `explicit_url_genome`."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF, genome=explicit_url_genome)), 217)
예제 #18
0
def test_load_vcf_mouse_with_inferred_genome():
    """Both loaders return 217 variants for MOUSE_VCF when no genome is passed."""
    for loader in (load_vcf, load_vcf_fast):
        eq_(len(loader(MOUSE_VCF)), 217)