lc = number_of_subseqs/max_number_of_subseqs else: lc = float('nan') return lc if __name__ == "__main__": parser = argparse.ArgumentParser(description="Calculate linguistic sequence complexity according to DOI:10.1093/bioinformatics/18.5.679", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-seq', '--sequence', type=str, help="GCTA sequences") parser.add_argument('-len', '--substring-length', type=int, help="sub-lenght up to...") args = parser.parse_args() if args.substring_length: length = args.substring_length assert length <= len(args.sequence) else: length = len(args.sequence) # This one adds up sub-strings up to a length print( seq_features.subLC(args.sequence, length) )
def _expand_alt_alleles(vcf_record):
    """Return one shallow copy of *vcf_record* per comma-separated ALT allele."""
    per_allele = []
    for alt_i in vcf_record.altbase.split(','):
        vcf_i = copy(vcf_record)
        vcf_i.altbase = alt_i
        per_allele.append(vcf_i)
    return per_allele


def _open_vcf_past_header(path, opened_handles):
    """Open *path*, register the handle for cleanup, and skip its VCF header.

    Returns ``(handle, first_line_after_header)``. The handle is appended to
    *opened_handles* before the header is read, so it is still closed by the
    caller's cleanup if header parsing fails.
    """
    handle = genome.open_textfile(path)
    opened_handles.append(handle)
    return handle, genome.skip_vcf_header(handle)


def _complexity_or_nan(sequence):
    """Return ``subLC(sequence, 20)``, or NaN when *sequence* is 20 bases or shorter."""
    if len(sequence) > 20:
        return sequencing_features.subLC(sequence, 20)
    return float('nan')


def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, bam_fn=None, truth=None,
            cosmic=None, dbsnp=None, mutect=None, varscan=None, vardict=None,
            lofreq=None, scalpel=None, strelka=None, dedup=True, min_mq=1,
            min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None):
    """Write one TSV row of features for every candidate site/variant.

    The sites to evaluate come from the first of ``is_vcf``, ``is_bed`` or
    ``is_pos`` that is set; when none is given, every position listed in the
    reference ``.fai`` index is evaluated.

    Args:
        is_vcf, is_bed, is_pos: Path of the input site list (VCF, BED, or
            tab-separated chrom/position respectively).
        bam_fn: Path handed to ``pysam.AlignmentFile``.
        truth, cosmic, dbsnp: Optional annotation VCF paths.
        mutect, varscan, vardict, lofreq, scalpel, strelka: Optional caller
            VCF paths.
        dedup: Accepted for interface compatibility; not used by this function.
        min_mq, min_bq: Forwarded to ``sequencing_features.from_bam``.
        min_caller: A row is written only when the summed caller
            classifications are ``>= min_caller``.
        ref_fa: Reference FASTA path; ``ref_fa + '.fai'`` must exist.
        p_scale: ``'phred'`` or ``'fraction'`` (case-insensitive); anything
            else disables re-scaling. Forwarded to ``rescale``.
        outfile: Path of the TSV file to write.

    Raises:
        Exception: If the input VCF is found not to be sorted according to
            the contig order of the ``.fai`` index.
    """
    # Contig order, used to walk all VCF files in lock-step.
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not: only 'phred' / 'fraction' are recognised.
    if p_scale is not None:
        p_scale = p_scale.lower()
        if p_scale not in ('phred', 'fraction'):
            p_scale = None
    if p_scale is None:
        logger.info('NO RE-SCALING')

    nan = float('nan')

    # Every handle opened below is registered here so that it is closed even
    # when processing aborts with an exception (e.g. the unsorted-input check).
    opened_handles = []

    try:
        with genome.open_textfile(mysites) as my_sites, \
                open(outfile, 'w') as outhandle:

            my_line = my_sites.readline().rstrip()

            bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
            opened_handles.append(bam)
            ref_fa = pysam.FastaFile(ref_fa)
            opened_handles.append(ref_fa)

            # Annotation files: the path arguments are rebound to the open
            # handles, so the later `if truth:` style checks keep working.
            if truth:
                truth, truth_line = _open_vcf_past_header(truth, opened_handles)
            if cosmic:
                cosmic, cosmic_line = _open_vcf_past_header(cosmic, opened_handles)
            if dbsnp:
                dbsnp, dbsnp_line = _open_vcf_past_header(dbsnp, opened_handles)

            # 6 Incorporate callers: get thru the #'s
            if mutect:
                mutect, mutect_line = _open_vcf_past_header(mutect, opened_handles)
            if varscan:
                varscan, varscan_line = _open_vcf_past_header(varscan, opened_handles)
            if vardict:
                vardict, vardict_line = _open_vcf_past_header(vardict, opened_handles)
            if lofreq:
                lofreq, lofreq_line = _open_vcf_past_header(lofreq, opened_handles)
            if scalpel:
                scalpel, scalpel_line = _open_vcf_past_header(scalpel, opened_handles)
            if strelka:
                strelka, strelka_line = _open_vcf_past_header(strelka, opened_handles)

            # Get through all the headers:
            while my_line.startswith(('#', 'track=')):
                my_line = my_sites.readline().rstrip()

            # First coordinate, for later purpose of making sure the input is
            # sorted properly
            coordinate_i = re.match(genome.pattern_chr_position, my_line)
            coordinate_i = coordinate_i.group() if coordinate_i else ''

            # Header row: the format template with its braces stripped.
            outhandle.write(
                out_header.replace('{', '').replace('}', '') + '\n')

            while my_line:

                # If VCF, get all the variants with the same coordinate into
                # a list:
                if is_vcf:
                    my_vcf = genome.Vcf_line(my_line)
                    my_coordinates = [(my_vcf.chromosome, my_vcf.position)]
                    variants_at_my_coordinate = _expand_alt_alleles(my_vcf)

                    # As long as the "coordinate" stays the same, keep
                    # reading until it's different. This also advances
                    # `my_line` for the outer loop in VCF mode.
                    while my_coordinates[0] == (my_vcf.chromosome,
                                                my_vcf.position):
                        my_line = my_sites.readline().rstrip()
                        my_vcf = genome.Vcf_line(my_line)

                        # Ensure the input VCF file is properly sorted.
                        coordinate_j = re.match(
                            genome.pattern_chr_position, my_line)
                        coordinate_j = (coordinate_j.group()
                                        if coordinate_j else '')

                        if genome.whoisbehind(coordinate_i, coordinate_j,
                                              chrom_seq) == 1:
                            raise Exception(
                                '{} does not seem to be properly sorted.'.format(
                                    mysites))

                        coordinate_i = coordinate_j

                        if my_coordinates[0] == (my_vcf.chromosome,
                                                 my_vcf.position):
                            variants_at_my_coordinate.extend(
                                _expand_alt_alleles(my_vcf))

                elif is_bed:
                    bed_item = my_line.split('\t')
                    # BED start is shifted by +1 before expansion.
                    my_coordinates = genomic_coordinates(
                        bed_item[0], int(bed_item[1]) + 1, int(bed_item[2]))

                elif is_pos:
                    pos_item = my_line.split('\t')
                    my_coordinates = genomic_coordinates(
                        pos_item[0], int(pos_item[1]), int(pos_item[1]))

                else:
                    # Whole-genome mode: each .fai line gives contig + length.
                    fai_item = my_line.split('\t')
                    my_coordinates = genomic_coordinates(
                        fai_item[0], 1, int(fai_item[1]))

                for my_coordinate in my_coordinates:

                    # If VCF, can get ref base, variant base, as well as
                    # other identifying information.
                    if is_vcf:
                        ref_bases = []
                        alt_bases = []
                        indel_lengths = []
                        all_my_identifiers = []

                        for variant_i in variants_at_my_coordinate:
                            ref_base = variant_i.refbase
                            first_alt = variant_i.altbase.split(',')[0]
                            indel_length = len(first_alt) - len(ref_base)

                            ref_bases.append(ref_base)
                            alt_bases.append(first_alt)
                            indel_lengths.append(indel_length)

                            # Extract these information if they exist in the
                            # VCF file, but they could be re-written if
                            # dbSNP/COSMIC are supplied.
                            # NOTE(review): these four values are not kept
                            # per variant, so every call at this coordinate
                            # sees those of the LAST record -- confirm
                            # whether that is intended.
                            if_dbsnp = 1 if re.search(
                                r'rs[0-9]+', variant_i.identifier) else 0
                            if_cosmic = 1 if re.search(
                                r'COS[MN][0-9]+', variant_i.identifier) else 0
                            if_common = 1 if variant_i.get_info_value(
                                'COMMON') == '1' else 0
                            num_cases = variant_i.get_info_value(
                                'CNT') if variant_i.get_info_value('CNT') else nan

                            if variant_i.identifier == '.':
                                my_identifier_i = set()
                            else:
                                my_identifier_i = set(
                                    variant_i.identifier.split(';'))

                            all_my_identifiers.append(my_identifier_i)

                    # If not, create placeholders that can be overwritten
                    # with dbSNP/COSMIC VCF files (if provided).
                    else:
                        # Just to have something to iterate
                        variants_at_my_coordinate = [None]
                        ref_base = first_alt = indel_length = None

                        # Could be re-written if dbSNP/COSMIC are supplied.
                        # If not, they will remain NaN.
                        if_dbsnp = if_cosmic = if_common = num_cases = nan

                    # Find the same coordinate in those VCF files. The first
                    # element of each returned triple is not used here.
                    if mutect:
                        _, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                            my_coordinate, mutect_line, mutect, chrom_seq)
                    if varscan:
                        _, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                            my_coordinate, varscan_line, varscan, chrom_seq)
                    if vardict:
                        _, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                            my_coordinate, vardict_line, vardict, chrom_seq)
                    if lofreq:
                        _, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                            my_coordinate, lofreq_line, lofreq, chrom_seq)
                    if scalpel:
                        _, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                            my_coordinate, scalpel_line, scalpel, chrom_seq)
                    if strelka:
                        _, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                            my_coordinate, strelka_line, strelka, chrom_seq)
                    if truth:
                        _, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                            my_coordinate, truth_line, truth, chrom_seq)
                    if dbsnp:
                        _, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                            my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                    if cosmic:
                        _, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                            my_coordinate, cosmic_line, cosmic, chrom_seq)

                    # Now, use pysam to look into the tBAM file(s), variant
                    # by variant from the input:
                    for ith_call, my_call in enumerate(
                            variants_at_my_coordinate):

                        if is_vcf:
                            # The particular line in the input VCF file:
                            variant_id = (
                                (my_call.chromosome, my_call.position),
                                my_call.refbase, my_call.altbase)

                            ref_base = ref_bases[ith_call]
                            first_alt = alt_bases[ith_call]
                            indel_length = indel_lengths[ith_call]
                            my_identifiers = all_my_identifiers[ith_call]
                        else:
                            variant_id = (
                                (my_coordinate[0], my_coordinate[1]),
                                ref_base, first_alt)
                            # Without an input VCF there are no IDs to start
                            # from. Bind a fresh set so the annotation code
                            # below can add to it (previously this name was
                            # unbound here and the first row crashed).
                            my_identifiers = set()

                        # Reset num_caller to 0 for each variant in the same
                        # coordinate
                        num_callers = 0

                        # Collect caller VCF annotations.
                        if mutect:
                            mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                                variant_id, mutect_variants)
                            num_callers += mutect_classification
                        else:
                            mutect_classification = tlod = ecnt = nan

                        if varscan:
                            varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                                variant_id, varscan_variants)
                            num_callers += varscan_classification
                        else:
                            varscan_classification = score_varscan2 = nan

                        if vardict:
                            (vardict_classification, msi, msilen, shift3,
                             t_pmean, t_pstd, t_qstd) = annotate_caller.ssVarDict(
                                variant_id, vardict_variants)
                            num_callers += vardict_classification
                        else:
                            vardict_classification = msi = msilen = shift3 = nan
                            t_pmean = t_pstd = t_qstd = nan

                        if lofreq:
                            lofreq_classification = annotate_caller.ssLoFreq(
                                variant_id, lofreq_variants)
                            num_callers += lofreq_classification
                        else:
                            lofreq_classification = nan

                        if scalpel:
                            scalpel_classification = annotate_caller.ssScalpel(
                                variant_id, scalpel_variants)
                            num_callers += scalpel_classification
                        else:
                            scalpel_classification = nan

                        if strelka:
                            strelka_classification = annotate_caller.ssStrelka(
                                variant_id, strelka_variants)
                            num_callers += strelka_classification
                        else:
                            strelka_classification = nan

                        # Only write the output if it meets this threshold.
                        # Written as `not (>=)` rather than `<` so that a NaN
                        # count is still skipped, exactly as before.
                        if not (num_callers >= min_caller):
                            continue

                        # Ground truth file
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        # dbSNP: will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        # COSMIC: will overwrite COSMIC info from input VCF
                        # file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        # INFO EXTRACTION FROM BAM FILES
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt,
                            min_mq, min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = \
                            sequencing_features.from_genome_reference(
                                ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in windows around
                        # the site; the substring calculation stops at 20-bp
                        # substrings. Window starts are clamped at 0.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            my_coordinate[1],
                            my_coordinate[1] + 81)

                        LC_spanning = _complexity_or_nan(seq_span_80bp)
                        left_LC = _complexity_or_nan(seq_left_80bp)
                        right_LC = _complexity_or_nan(seq_right_80bp)

                        # NOTE(review): min() with a NaN operand is
                        # order-dependent (NaN on the left yields NaN, NaN on
                        # the right yields the left value). Kept as-is to
                        # preserve existing output; confirm intended handling.
                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        out_line = out_header.format(
                            CHROM=my_coordinate[0],
                            POS=my_coordinate[1],
                            ID=my_identifiers,
                            REF=ref_base,
                            ALT=first_alt,
                            if_MuTect=mutect_classification,
                            if_Strelka=strelka_classification,
                            if_VarScan2=varscan_classification,
                            if_VarDict=vardict_classification,
                            if_LoFreq=lofreq_classification,
                            if_Scalpel=scalpel_classification,
                            VarScan2_Score=rescale(
                                score_varscan2, 'phred', p_scale, 1001),
                            if_dbsnp=if_dbsnp,
                            COMMON=if_common,
                            if_COSMIC=if_cosmic,
                            COSMIC_CNT=num_cases,
                            Consistent_Mates=tBamFeatures['consistent_mates'],
                            Inconsistent_Mates=tBamFeatures['inconsistent_mates'],
                            Seq_Complexity_Span=LC_spanning_phred,
                            Seq_Complexity_Adj=LC_adjacent_phred,
                            M2_TLOD=tlod,
                            M2_ECNT=ecnt,
                            MSI=msi,
                            MSILEN=msilen,
                            SHIFT3=shift3,
                            MaxHomopolymer_Length=homopolymer_length,
                            SiteHomopolymer_Length=site_homopolymer_length,
                            T_DP=tBamFeatures['dp'],
                            tBAM_REF_MQ='%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ='%g' % tBamFeatures['alt_mq'],
                            tBAM_p_MannWhitneyU_MQ='%g' % tBamFeatures['p_mannwhitneyu_mq'],
                            tBAM_REF_BQ='%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ='%g' % tBamFeatures['alt_bq'],
                            tBAM_p_MannWhitneyU_BQ='%g' % tBamFeatures['p_mannwhitneyu_bq'],
                            tBAM_REF_NM='%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM='%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff='%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant=tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant=tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant=tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant=tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET=rescale(
                                tBamFeatures['concordance_fet'],
                                'fraction', p_scale, 1001),
                            T_REF_FOR=tBamFeatures['ref_for'],
                            T_REF_REV=tBamFeatures['ref_rev'],
                            T_ALT_FOR=tBamFeatures['alt_for'],
                            T_ALT_REV=tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET=rescale(
                                tBamFeatures['strandbias_fet'],
                                'fraction', p_scale, 1001),
                            tBAM_p_MannWhitneyU_EndPos='%g' % tBamFeatures['p_mannwhitneyu_endpos'],
                            tBAM_REF_Clipped_Reads=tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads=tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET=rescale(
                                tBamFeatures['clipping_fet'],
                                'fraction', p_scale, 1001),
                            tBAM_MQ0=tBamFeatures['MQ0'],
                            tBAM_Other_Reads=tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads=tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp=tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp=tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp=tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp=tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp=tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp=tBamFeatures['alt_indel_1bp'],
                            InDel_Length=indel_length,
                            TrueVariant_or_False=judgement,
                        )

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

                # Read into the next line (VCF mode already advanced
                # `my_line` while grouping records by coordinate):
                if not is_vcf:
                    my_line = my_sites.readline().rstrip()
    finally:
        # Close everything that was opened, newest first, whether the run
        # finished or aborted.
        for handle in reversed(opened_handles):
            handle.close()