def find_mutation_files(
        input_files,
        combined_maf=False,
        max_peptide_length=31):
    """
    Collect all mutation files (.vcf/.maf, as listed in
    MUTATION_FILE_EXTENSIONS) from the `input_files` list of paths.

    Returns a dictionary mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt).

    The patient IDs will be each filename without its extension, unless the
    argument `combined_maf` is True. In that case, patient IDs are derived
    from the tumor barcode column (`Tumor_Sample_Barcode`) of each MAF file,
    one patient per distinct barcode.

    Every patient ID is truncated to its first three dash-separated fields.
    If the same (truncated) ID is seen more than once, the first DataFrame is
    kept and a warning is logged for each later occurrence.

    If `args.debug_patient_id` is set (`args` is read from the enclosing
    module scope), the result is narrowed to a plain dict holding only that
    patient; a KeyError is raised if that patient was not found.

    `max_peptide_length` is accepted but not used by this function.
    """
    mutation_files = OrderedDict()
    # Fix: iterate over the `input_files` parameter (the body previously
    # referenced the undefined name `input_filenames`).
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext in MUTATION_FILE_EXTENSIONS:
            if ext.endswith('maf') and combined_maf:
                maf_df = load_maf(path)
                file_patients = {}
                # Group by the bare column name so each key is the scalar
                # barcode; a one-element list yields 1-tuples on newer pandas.
                for barcode, group_df in (
                        maf_df.groupby('Tumor_Sample_Barcode')):
                    vcf_df = maf_to_vcf(group_df)
                    patient_id = get_patient_id(barcode)
                    file_patients[patient_id] = vcf_df
            else:
                patient_id = get_patient_id(base)
                vcf_df = load_variants(path)
                file_patients = {patient_id: vcf_df}
            # `.items()` works on both Python 2 and 3 (`iteritems` is
            # Python 2 only).
            for patient_id, vcf_df in file_patients.items():
                # Keep only the first three dash-separated fields of the ID.
                patient_id = "-".join(patient_id.split("-")[:3])
                if patient_id in mutation_files:
                    logging.warning(
                        "Already processed patient %s before file %s",
                        patient_id,
                        path)
                else:
                    mutation_files[patient_id] = vcf_df
    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files
def find_mutation_files(input_files,
                        combined_maf=False,
                        max_peptide_length=31):
    """
    Gather variant tables for every mutation file in `input_files`.

    Paths whose extension is not in MUTATION_FILE_EXTENSIONS are skipped.
    For the remaining files:

    - when `combined_maf` is True and the extension ends in 'maf', the MAF is
      loaded and split by its `Tumor_Sample_Barcode` column; each barcode is
      turned into a patient ID via `get_patient_id` and its rows are
      converted with `maf_to_vcf`;
    - otherwise the patient ID comes from the filename without its extension
      and the file is read with `load_variants`.

    Patient IDs are shortened to their first three dash-separated fields.
    The first DataFrame seen for an ID wins; later duplicates only produce a
    logged warning.

    Returns an OrderedDict mapping patient ID -> DataFrame of basic variant
    information (chr, pos, ref, alt). If `args.debug_patient_id` is set
    (`args` comes from the enclosing module scope), a plain dict holding just
    that patient is returned instead, and a KeyError is raised when the
    patient is missing.

    `max_peptide_length` is accepted but not used by this function.
    """
    mutation_files = OrderedDict()

    # Fix: the loop must read the `input_files` parameter; the previous body
    # referenced `input_filenames`, which is not defined in this function.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            maf_df = load_maf(path)
            file_patients = {}
            # Grouping by the column name (not a one-element list) keeps the
            # group key a scalar barcode across pandas versions.
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # `.items()` instead of the Python 2-only `.iteritems()`.
        for patient_id, vcf_df in file_patients.items():
            # Truncate the ID to its first three dash-separated fields.
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files
def load_variants(input_filename):
    """
    Read the input file into a DataFrame containing (at least) the basic
    columns of a VCF:
        - chr
        - pos
        - ref
        - alt

    The loader is chosen from the end of `input_filename`:
        - ".vcf": loaded with `load_vcf`
        - ".maf": loaded with `load_maf`, then converted by `maf_to_vcf`
        - "tab":  read as a tab-separated table with a header row, then
                  converted by `tab_to_vcf` (note: matched without a
                  leading dot)

    Raises AssertionError for any other filename.
    """
    # VCF and MAF files give us the raw mutations in genomic coordinates
    if input_filename.endswith(".vcf"):
        vcf_df = load_vcf(input_filename)
    elif input_filename.endswith(".maf"):
        maf_df = load_maf(input_filename)
        vcf_df = maf_to_vcf(maf_df)
    elif input_filename.endswith("tab"):
        tab_df = pd.read_csv(input_filename, sep='\t', header=0)
        vcf_df = tab_to_vcf(tab_df)
    else:
        # Raise explicitly instead of `assert False`: assert statements are
        # stripped under `python -O`, which would let execution fall through
        # to the return with `vcf_df` unbound. The exception type and message
        # are kept the same for existing callers.
        raise AssertionError(
            "Unrecognized file type %s" % input_filename)
    return vcf_df