vcf_out.write( line_i + '\n' ) line_i = vcf_in.readline().rstrip() vcf_out.write( line_i + '\n' ) # This line will be #CHROM: header = line_i.split('\t') sample_index = header.index(sample) - 9 # This will be the first variant line: line_i = vcf_in.readline().rstrip() while line_i: vcf_i = genome.Vcf_line( line_i ) if vcf_i.filters == 'PASS': refMQ = float( vcf_i.get_sample_value('refMQ', sample_index) ) altMQ = float( vcf_i.get_sample_value('altMQ', sample_index) ) refBQ = float( vcf_i.get_sample_value('refBQ', sample_index) ) altBQ = float( vcf_i.get_sample_value('altBQ', sample_index) ) refNM = float( vcf_i.get_sample_value('refNM', sample_index) ) altNM = float( vcf_i.get_sample_value('altNM', sample_index) ) fetSB = float( vcf_i.get_sample_value('fetSB', sample_index) ) fetCD = float( vcf_i.get_sample_value('fetCD', sample_index) ) zMQ = float( vcf_i.get_sample_value('zMQ', sample_index) ) zBQ = float( vcf_i.get_sample_value('zBQ', sample_index) ) MQ0 = int( vcf_i.get_sample_value('MQ0', sample_index) ) VAF = float( vcf_i.get_sample_value('VAF', sample_index) )
### It will be a single file, otherwise it goes chr1, chr2, ...., or in the list of files input as ordered: for chr_i_vcf in right_files: with genome.open_textfile(chr_i_vcf) as vcf: line_i = vcf.readline().rstrip() # Skip headers from now on: while line_i.startswith('#'): line_i = vcf.readline().rstrip() # Doing the work here: while line_i: vcf_i = genome.Vcf_line(line_i) num_samples = len(vcf_i.samples) if num_samples == 1: paired = False elif num_samples == 2: paired = True elif num_samples > 2: sys.stderr.write( 'We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all' ) paired = True elif num_samples == 0:
def _append_id_tag(fields, tag):
    """Append *tag* to the ID column of a split VCF record and re-join it.

    ``fields`` is the tab-split record; its ID column (``fields[idx_id]``)
    is updated in place.  The tag is added as a new ``;``-separated entry.
    If the ID starts with the ``.`` placeholder, that placeholder and its
    separator are dropped so the result is ``tag`` rather than ``.;tag``.

    Returns the record re-joined with tabs (no trailing newline).
    """
    tagged = fields[idx_id] + ';' + tag
    # The dot must be escaped: an unescaped "." would also strip any real
    # one-character identifier, not just the "missing ID" placeholder.
    fields[idx_id] = re.sub(r'^\.;', '', tagged)
    return '\t'.join(fields)


def catch_up(line_1, line_2, file_1, file_2, output_vcf, id_1, id_2, id_12):
    """Advance two VCF streams until they sit on the same record, writing
    every record passed along the way to ``output_vcf``.

    Parameters:
        line_1, line_2: the current (already read) line of each VCF file.
        file_1, file_2: the open file handles the next lines are read from.
        output_vcf:     writable handle that receives the tagged records.
        id_1:           tag appended to the ID of records only seen in file 1.
        id_2:           tag appended to the ID of records only seen in file 2.
        id_12:          tag appended to the ID of the record found in both.

    Behaviour, as far as this function shows:
        * ``whoisbehind`` is asked which stream is lagging; ``0`` means the
          first, ``1`` the second, and ``10`` ends the loop (per the original
          comments: coordinates are equal, or both files are finished --
          confirm against ``whoisbehind``).
        * A lagging record from file 1 is written with ``id_1`` unless its
          FILTER column equals ``'PrintEmALL'``, in which case it is dropped.
        * A lagging record from file 2 is always written with ``id_2``.

    Returns:
        ``42`` when both coordinates have an empty chromosome (both inputs
        exhausted); otherwise the tuple ``(line_1, line_2)`` where ``line_1``
        is the ``id_12``-tagged line that was just written (without a
        trailing newline) and ``line_2`` is the untouched line from file 2.
    """
    vcf_1 = genome.Vcf_line(line_1)
    vcf_2 = genome.Vcf_line(line_2)

    coord_1 = [vcf_1.chromosome, vcf_1.position]
    coord_2 = [vcf_2.chromosome, vcf_2.position]

    # Trace output on stdout, kept from the original implementation.
    print(coord_1, coord_2)
    is_behind = whoisbehind(coord_1, coord_2)

    # As long as the coordinates are not the same, and both files are not
    # finished:
    while is_behind != 10:

        # 1st VCF is behind: emit its record (unless filtered) and advance it.
        if is_behind == 0:
            item_1 = line_1.rstrip('\n').split('\t')

            if item_1[idx_filter] != 'PrintEmALL':
                output_vcf.write(_append_id_tag(item_1, id_1) + '\n')

            line_1 = file_1.readline()
            vcf_1 = genome.Vcf_line(line_1)
            coord_1 = [vcf_1.chromosome, vcf_1.position]

        # 2nd VCF is behind: its record is always emitted, then advance it.
        elif is_behind == 1:
            item_2 = line_2.rstrip('\n').split('\t')
            output_vcf.write(_append_id_tag(item_2, id_2) + '\n')

            line_2 = file_2.readline()
            vcf_2 = genome.Vcf_line(line_2)
            coord_2 = [vcf_2.chromosome, vcf_2.position]

        is_behind = whoisbehind(coord_1, coord_2)

    # Both streams ran dry: hand back the sentinel callers test for.
    if coord_1[0] == coord_2[0] == '':
        return 42

    # Both streams are on the same record: write file 1's copy, tagged as
    # shared, and return the pair so the caller can continue from here.
    item_1 = line_1.rstrip('\n').split('\t')
    line_1 = _append_id_tag(item_1, id_12)
    output_vcf.write(line_1 + '\n')

    return (line_1, line_2)
while strelka_line.startswith('#'): strelka_line = strelka.readline().rstrip() # Get through all the headers: while my_line.startswith('#') or my_line.startswith('track='): my_line = my_sites.readline().rstrip() # First line: outhandle.write(out_header.replace('{', '').replace('}', '') + '\n') while my_line: # If VCF, get all the variants with the same coordinate into a list: if is_vcf: my_vcf = genome.Vcf_line(my_line) my_coordinates = [(my_vcf.chromosome, my_vcf.position)] variants_at_my_coordinate = [] alt_bases = my_vcf.altbase.split(',') for alt_i in alt_bases: vcf_i = copy(my_vcf) vcf_i.altbase = alt_i variants_at_my_coordinate.append(vcf_i) # As long as the "coordinate" stays the same, it will keep reading until it's different. while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position): my_line = my_sites.readline().rstrip() my_vcf = genome.Vcf_line(my_line)
'##FORMAT=<ID=SCORE,Number=1,Type=Float,Description="SomaticSeq Probability (either fraction or Phred)">\n' ) tumor_column = line_in.split('\t').index(tumor) tumor_idx = tumor_column - 9 (normal_column, normal_idx) = (9, 0) if tumor_idx == 1 else (None, None) # This is the #CHROM line vcfout.write(line_in + '\n') line_in = vcfin.readline().rstrip('\n') # Move COMBO and NUM_TOOLS from INFO to Tumor Sample, and move QUAL to the Tumor Sample as well while line_in: vcf_line_in = genome.Vcf_line(line_in) # New INFO new_info = [] for info_item in vcf_line_in.get_info_items(): if not (info_item.startswith('NUM_TOOLS=') or info_item.startswith(caller_string)): new_info.append(info_item) if new_info == []: new_info_line = '.' else: new_info_line = ';'.join(new_info) # FORMAT: if somaticseq_trained: