def __enter__(self):
    """Open every output path for text writing, emit headers, return self.

    NOTE(review): each *_outfile attribute is replaced in place — the path
    string is overwritten with the open handle object.
    """
    for attr in ('outfile', 'snpeff_outfile', 'ma_outfile', 'ids_outfile'):
        path = getattr(self, attr)
        setattr(self, attr, helpers.GetFileHandle(path, 'wt').handler)
    self.write_headers()
    return self
def concatenate_vcf(infiles, outfile):
    '''
    Concatenate VCF files

    :param infiles: dictionary of input VCF files to be concatenated
    :param outfile: output VCF file
    '''
    if isinstance(infiles, dict):
        # dict inputs are ordered by sorted key before concatenation
        infiles = [infiles[key] for key in sorted(infiles.keys())]
    header = None
    with helpers.GetFileHandle(outfile, 'w') as ofile:
        for ifile in infiles:
            # skip (and flag) completely empty inputs
            if os.path.getsize(ifile) == 0:
                warnings.warn('input file {} is empty'.format(ifile))
                continue
            with helpers.GetFileHandle(ifile) as reader:
                if not header:
                    # first usable file: its header becomes the output header
                    header = _get_header(reader)
                    for hline in header:
                        ofile.write(hline)
                elif _get_header(reader) != header:
                    warnings.warn(
                        'merging vcf files with mismatching headers')
                for record in reader:
                    ofile.write(record)
def write_headerless_csv(self, infile):
    """Copy the data rows of *infile* to self.filepath after checking its
    header, then write the companion yaml metadata.

    :param infile: input csv path whose first line must equal self.header_line
    :raises CsvWriterError: if the input's header line does not match
    """
    with helpers.GetFileHandle(self.filepath, 'wt') as writer, \
            helpers.GetFileHandle(infile) as reader:
        if reader.readline() != self.header_line:
            raise CsvWriterError("cannot write, wrong header")
        self.write_csv_data(reader, writer)
        self.__write_yaml()
def write_csv_with_header(self, infile, headerless_input=True):
    """Write self.filepath from *infile*, prepending self.header_line when
    the input has no header of its own, then write the yaml metadata.

    :param infile: input csv path
    :param headerless_input: if True, emit self.header_line before the data
    """
    with helpers.GetFileHandle(self.filepath, 'wt') as writer, \
            helpers.GetFileHandle(infile) as reader:
        if headerless_input:
            writer.write(self.header_line)
        self.write_csv_data(reader, writer)
        self.__write_yaml()
def concatenate_files(self, infiles):
    """Concatenate the data rows of *infiles* into self.filepath, writing
    self.header_line once up front when self.header is set, then write the
    yaml metadata.

    :param infiles: iterable of input csv paths
    """
    header = self.header_line if self.header else None
    with helpers.GetFileHandle(self.filepath, 'wt') as writer:
        if header:
            writer.write(header)
        for path in infiles:
            with helpers.GetFileHandle(path) as reader:
                self.write_csv_data(reader, writer)
        self.__write_yaml()
def update_germline_header_sample_ids(infile, outfile, sample_id):
    """Rewrite a germline VCF so its single sample column is *sample_id*.

    Copies *infile* to *outfile* unchanged except at the '#CHROM' header
    line: a '##normal_sample=<sample_id>' meta line is inserted before it
    and the final (sample) column is renamed to *sample_id*.

    :param infile: input VCF path
    :param outfile: output VCF path
    :param sample_id: replacement sample name
    :raises ValueError: if the existing sample column is neither 'normal'
        nor already equal to *sample_id*
    """
    with helpers.GetFileHandle(infile) as indata:
        with helpers.GetFileHandle(outfile, 'wt') as outdata:
            for line in indata:
                if not line.startswith('#CHROM'):
                    outdata.write(line)
                    continue
                outdata.write('##normal_sample={}\n'.format(sample_id))
                fields = line.strip().split()
                # explicit validation instead of assert: asserts are
                # stripped under `python -O` and would skip this check
                if fields[-1] not in ('normal', sample_id):
                    raise ValueError(
                        'unexpected sample column in #CHROM header: '
                        '{}'.format(fields[-1]))
                fields[-1] = sample_id
                outdata.write('\t'.join(fields) + '\n')
def generate_metadata(self):
    """Infer csv metadata from the file itself.

    :return: tuple of (header flag — always True here, since the first
        line is treated as a header; separator; dtypes dict; column list)
    """
    with helpers.GetFileHandle(self.filepath) as reader:
        first_line = reader.readline().strip()
        sep = self.__detect_sep_from_header(first_line)
        columns = first_line.split(sep)
        dtypes = self.__generate_dtypes(sep=sep)
        return True, sep, dtypes, columns
def get_vcf_header(vcf_file):
    """Return the leading '#' header lines of *vcf_file* as a list.

    Reading stops at the first non-header line.
    """
    header_lines = []
    with helpers.GetFileHandle(vcf_file) as reader:
        for line in reader:
            if not line.startswith('#'):
                break
            header_lines.append(line)
    return header_lines
def __write_yaml(self):
    """Dump this csv's metadata (header flag, separator, per-column dtypes)
    to the companion yaml file at self.yaml_file."""
    metadata = {
        'header': self.header,
        'sep': self.sep,
        'columns': [
            {'name': col, 'dtype': self.dtypes[col]} for col in self.columns
        ],
    }
    with helpers.GetFileHandle(self.yaml_file, 'wt') as f:
        yaml.safe_dump(metadata, f, default_flow_style=False)
def reheader_reads(infile, outfile):
    """Prepend a column-name header line to a headerless 8-column
    tab-separated reads table, copying all data rows through unchanged."""
    column_names = [
        'prediction_id', 'library_id', 'fragment_id', 'read_end',
        'seq', 'qual', 'comment', 'filtered',
    ]
    with helpers.GetFileHandle(infile, 'rt') as indata, \
            helpers.GetFileHandle(outfile, 'wt') as outdata:
        first_line = indata.readline()
        fields = first_line.split('\t')
        # sanity checks: input must be 8 columns and genuinely headerless
        assert len(fields) == 8
        assert fields[0] != 'prediction_id'
        outdata.write('\t'.join(column_names) + '\n')
        outdata.write(first_line)
        for line in indata:
            outdata.write(line)
def merge_mafs(maf_files, output):
    """Merge MAF files into one output, keeping a single header.

    The version line and column-name line are taken from the first input;
    every input's own header/comment lines are dropped from the data pass.
    """
    if isinstance(maf_files, dict):
        maf_files = list(maf_files.values())
    with helpers.GetFileHandle(output, 'wt') as maf_writer:
        # two-line header (version + columns) comes from the first file
        with helpers.GetFileHandle(maf_files[0]) as header_read:
            version_line = header_read.readline()
            assert version_line.startswith('#version 2.4')
            maf_writer.write(version_line)
            column_line = header_read.readline()
            assert column_line.startswith('Hugo_Symbol')
            maf_writer.write(column_line)
        # append data rows from every input, skipping header/comment lines
        for filepath in maf_files:
            with helpers.GetFileHandle(filepath, 'rt') as maf_reader:
                for line in maf_reader:
                    if line.startswith(('Hugo_Symbol', '#')):
                        continue
                    maf_writer.write(line)
def main(infile, outfile, mappability_blacklist):
    """Annotate a VCF with mappability-blacklist information and write the
    result to *outfile*.

    :param infile: input VCF path
    :param outfile: output (annotated) VCF path
    :param mappability_blacklist: path to the mappability blacklist file
    """
    blacklist = load_blacklist(mappability_blacklist)
    header = get_vcf_header(infile)
    # NOTE(review): the header update is given the blacklist *path*, not the
    # parsed blacklist — confirm update_vcf_header expects the filename
    header = update_vcf_header(header, mappability_blacklist)
    with helpers.GetFileHandle(outfile, 'wt') as vcf_writer:
        write_to_file(vcf_writer, header)
        for chunk in load_vcf_file(infile):
            annotated = annotate_vcf_data(chunk, blacklist)
            write_to_file(vcf_writer, annotated)
def load_vcf_file(vcf_file):
    """Yield the non-header records of *vcf_file* in chunks.

    Each record is a whitespace-split list of fields; a chunk is yielded
    once it exceeds 1e6 records, with any remainder yielded at the end
    (possibly an empty list).
    """
    chunk = []
    with helpers.GetFileHandle(vcf_file) as reader:
        for line in reader:
            if line.startswith('#'):
                continue
            chunk.append(line.strip().split())
            if len(chunk) > 1e6:
                yield chunk
                chunk = []
    yield chunk
def __parse_metadata(self):
    """Load header flag, separator, dtypes, and column order from the
    companion '<filepath>.yaml' metadata file.

    :return: tuple of (header flag, separator, dtypes dict, column list)
    """
    with helpers.GetFileHandle(self.filepath + '.yaml') as yamlfile:
        metadata = yaml.safe_load(yamlfile)
    dtypes = {}
    columns = []
    for coldata in metadata['columns']:
        name = coldata['name']
        dtypes[name] = coldata['dtype']
        columns.append(name)
    # older metadata files may omit 'sep'; default to comma
    return metadata['header'], metadata.get('sep', ','), dtypes, columns
def parse_roh_output(infile, outfile):
    """Convert bcftools-roh-style text output into a csv (+ yaml metadata).

    Only 'ST' (per-site state) and 'RG' (region) records are kept; both are
    mapped onto a shared column set, with NaN filling the fields that a
    record type does not carry.
    """
    nan = float('nan')
    records = []
    with helpers.GetFileHandle(infile) as indata:
        for raw in indata:
            if raw.startswith('#'):
                continue
            fields = raw.strip().split()
            if fields[0] == 'ST':
                records.append({
                    'type': fields[0], 'sample': fields[1],
                    'chromosome': fields[2], 'start': fields[3],
                    'end': nan, 'state': fields[4], 'length': nan,
                    'num_markers': nan, 'quality': fields[5],
                })
            elif fields[0] == 'RG':
                records.append({
                    'type': fields[0], 'sample': fields[1],
                    'chromosome': fields[2], 'start': fields[3],
                    'end': fields[4], 'state': nan, 'length': fields[5],
                    'num_markers': fields[6], 'quality': fields[7],
                })
    csvutils.write_dataframe_to_csv_and_yaml(
        pd.DataFrame(records), outfile, write_header=True)
def select_optimal_solution(
        chunks,
        params_files,
        segments,
        igv_segs,
        markers,
        parsed_files,
        plots,
        optimal_segment,
        optimal_igv_segs,
        optimal_param,
        optimal_marker,
        optimal_parsed,
        optimal_plot,
):
    '''
    selects the optimal cluster and ploidy combination
    from an input set of cluster/ploidy-resolved results and writes the
    corresponding set of optimal output files to the 'optimal' output paths.

    The winning combination is the one with the lowest
    'S_Dbw validity index (Both)' value in its params file.

    :params chunks: iterable of (ploidy, num_clusters) keys, one per
        candidate solution (the unpacking at the bottom fixes this shape)
    :params params_files: parameter file per chunk (source of the S_Dbw index)
    :params segments: output segments file per chunk
    :params igv_segs: IGV-format segments file per chunk
    :params markers: output markers file per chunk
    :params parsed_files: parsed output file per chunk
    :params plots: plot file per chunk
    :params optimal_segment: output path for the optimal segments file
    :params optimal_igv_segs: output path for the optimal IGV segments file
    :params optimal_param: output path for the optimal params file
    :params optimal_marker: output path for the optimal markers file
    :params optimal_parsed: output path for the optimal parsed file
    :params optimal_plot: output path for the optimal plot file
    '''
    model_select_idxs = []
    # find optimal cluster/ploidy: lowest S_Dbw validity index wins
    for chunk in chunks:
        params = params_files[chunk]
        parsed_params = get_param_file_vals(params)
        dbw_index = parsed_params['S_Dbw validity index (Both)'][0]
        model_select_idxs.append((chunk, dbw_index))
    best_model = min(model_select_idxs, key=lambda t: t[1])
    best_model = best_model[0]
    # copy the file at the optimal cluster/ploidy to the
    # optimal file output path
    csvutils.finalize_csv(segments[best_model], optimal_segment, sep='\t')
    csvutils.finalize_csv(params_files[best_model], optimal_param, sep='\t')
    csvutils.finalize_csv(markers[best_model], optimal_marker, sep='\t')
    csvutils.finalize_csv(parsed_files[best_model], optimal_parsed, sep='\t')
    shutil.copyfile(plots[best_model], optimal_plot)
    shutil.copyfile(igv_segs[best_model], optimal_igv_segs)
    # record the winning ploidy/cluster choice by appending to the
    # optimal params file
    with helpers.GetFileHandle(optimal_param, 'at') as params_output:
        ploidy, num_clusters = best_model
        params_output.write('ploidy: {}\n'.format(ploidy))
        params_output.write('num_clusters: {}\n'.format(num_clusters))
def __detect_sep_from_file(self):
    """Infer the csv separator by inspecting the file's first line."""
    with helpers.GetFileHandle(self.filepath) as reader:
        first_line = reader.readline().strip()
        return self.__detect_sep_from_header(first_line)
def split_vcf_by_chr(vcf_file, chromosome, output):
    """Write only header lines and records for *chromosome* to *output*.

    :param vcf_file: input VCF path
    :param chromosome: chromosome name to keep (exact match on the CHROM
        column)
    :param output: output VCF path
    """
    # anchor the match to the tab-terminated CHROM column: a bare
    # startswith(chromosome) would let '1' also capture chr '10'-'19'
    # (and '2' capture '20'-'22'), silently mixing chromosomes
    record_prefix = chromosome + '\t'
    with helpers.GetFileHandle(vcf_file, 'rt') as vcf_reader, \
            helpers.GetFileHandle(output, 'wt') as vcf_writer:
        for line in vcf_reader:
            if line.startswith('#') or line.startswith(record_prefix):
                vcf_writer.write(line)