def _build_coordinates(vcf_readers):
    coordinate_set = set()
    mult_alts = defaultdict(set)
    for i, vcf_reader in enumerate(vcf_readers):
        logger.info("Reading [{}] ({}/{})",
                    os.path.basename(vcf_reader.file_name),
                    i + 1,
                    len(vcf_readers))
        try:
            vcf_reader.open()
            for vcf_record in vcf_reader.vcf_records():
                coordinate_set.add(vcf_record.get_empty_record())
                ref_alt = vcf_record.ref, vcf_record.alt
                locus = vcf_record.chrom, vcf_record.pos
                mult_alts[locus].add(ref_alt)
        finally:
            vcf_reader.close()
    for vcf_record in coordinate_set:
        ref_alts_for_this_locus = mult_alts[vcf_record.chrom, vcf_record.pos]
        inferred_mult_alt = len(ref_alts_for_this_locus) > 1
        explicit_mult_alt = "," in next(iter(ref_alts_for_this_locus))[1]
        if inferred_mult_alt or explicit_mult_alt:
            vcf_record.add_info_field(_MULT_ALT_TAG)
    return sorted(coordinate_set)
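# Illustrative sketch (not part of Jacquard): how the multi-alt check in
# _build_coordinates behaves on toy data. A locus is flagged when it carries
# more than one ref/alt pair across input files ("inferred"), or when a single
# record already lists comma-separated alts ("explicit"). The loci and alleles
# below are made up for the demo.
from collections import defaultdict

def _demo_mult_alt():
    mult_alts = defaultdict(set)
    # Two files disagree at chr1:100 -> inferred multi-alt.
    mult_alts[("chr1", "100")].update([("A", "C"), ("A", "G")])
    # One file at chr2:200 already encodes two alts -> explicit multi-alt.
    mult_alts[("chr2", "200")].add(("T", "A,G"))
    # chr3:300 has a single simple alt -> not flagged.
    mult_alts[("chr3", "300")].add(("G", "T"))
    for locus, ref_alts in sorted(mult_alts.items()):
        inferred = len(ref_alts) > 1
        explicit = "," in next(iter(ref_alts))[1]
        print(locus, "flagged" if (inferred or explicit) else "not flagged")

if __name__ == "__main__":
    _demo_mult_alt()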
def _merge_records(vcf_readers,
                   coordinates,
                   filter_strategy,
                   all_sample_names,
                   format_tags_to_keep,
                   file_writer):  #pylint: disable=too-many-arguments
    row_count = 0
    next_breakpoint = 0
    buffered_readers = _create_buffered_readers(vcf_readers)
    for coordinate in coordinates:
        vcf_records = _pull_matching_records(filter_strategy,
                                             coordinate,
                                             buffered_readers)
        if vcf_records:
            merged_record = _build_merged_record(coordinate,
                                                 vcf_records,
                                                 all_sample_names,
                                                 format_tags_to_keep)
            file_writer.write(merged_record.text())
        row_count += 1
        progress = 100 * row_count / len(coordinates)
        if progress > next_breakpoint:
            logger.info("Merging: {} rows processed ({}%)",
                        row_count,
                        next_breakpoint)
            next_breakpoint = 10 * int(progress / 10) + 10
    logger.info("Merge complete: {} rows processed (100%)", row_count)
    filter_strategy.log_statistics()
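# Illustrative sketch (not part of Jacquard): the decile progress logging used
# in _merge_records, isolated so it can be run on its own. Breakpoints advance
# in 10% steps, so a long merge emits at most ~10 progress lines regardless of
# row count. total_rows is an arbitrary demo value.
def _demo_progress(total_rows=23):
    next_breakpoint = 0
    for row_count in range(1, total_rows + 1):
        progress = 100 * row_count / total_rows
        if progress > next_breakpoint:
            print("Merging: {} rows processed ({}%)".format(row_count,
                                                            next_breakpoint))
            next_breakpoint = 10 * int(progress / 10) + 10

if __name__ == "__main__":
    _demo_progress()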
def test_noExceptionOnMalformedMessage(self):
    tool = "foo"
    args = Namespace(subparser_name=tool, log_file=None, verbose=None)
    logger.initialize_logger(args)
    logger.info("bar {}/{}/{}", "1", "2")
    root_logger = logger.logging.getLogger()
    self.assertEquals(["root: INFO: Malformed log message "
                       "(IndexError: tuple index out of range)"
                       "|bar {}/{}/{}|['1', '2']"],
                      root_logger.handlers[0].buffer)
def _move_tmp_contents_to_original(args):
    temp_dir = args.temp_working_dir
    dest_dir = os.path.dirname(args.temp_working_dir)
    logger.debug("Moving files from tmp directory [{}] to output "
                 "directory [{}]",
                 temp_dir,
                 dest_dir)
    distutils.dir_util.copy_tree(temp_dir, dest_dir)
    logger.debug("Output saved to [{}]", os.path.abspath(args.original_output))
    logger.info("Output saved to [{}]", args.original_output)
    _cleanup(temp_dir)
def _write_to_tmp_file(caller, vcf_reader, tmp_writer):
    vcf_reader.open()
    tmp_writer.open()
    try:
        _write_metaheaders(caller, vcf_reader, tmp_writer)
        logger.info("Adding summary tags for [{}]", vcf_reader.file_name)
        _add_tags(caller, vcf_reader, tmp_writer)
    finally:
        vcf_reader.close()
        tmp_writer.close()
def test_noExceptionOnMalformedMessage(self):
    tool = "foo"
    args = Namespace(subparser_name=tool, log_file=None, verbose=None)
    logger.initialize_logger(args)
    logger.info("bar {}/{}/{}", "1", "2")
    root_logger = logger.logging.getLogger()
    errmsg = root_logger.handlers[0].buffer[0]
    # assertRegexpMatches takes (text, expected_regexp); the regex metachars
    # in the expected message must be escaped to match literally
    self.assertRegexpMatches(errmsg,
                             r"root: INFO: Malformed log message .*"
                             r"\|bar \{\}/\{\}/\{\}\|\['1', '2'\]")
def log_statistics(self):
    cell_msg = "{}% ({}) cells were excluded because (--include_cells={})"
    logger.info(cell_msg,
                int(round(100 * self.cells_excluded / self.cell_count, 0)),
                self.cells_excluded,
                self._args.include_cells)
    row_msg = "{}% ({}) rows were excluded because (--include_rows={})"
    logger.info(row_msg,
                int(round(100 * self.rows_excluded / self.row_count, 0)),
                self.rows_excluded,
                self._args.include_rows)
    for key, count in self.excluded_breakdown.items():
        msg = "{} cells were excluded with [{}]"
        logger.debug(msg, count, key)
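# Illustrative sketch (not part of Jacquard): the percentage arithmetic used
# in log_statistics above. Multiplying before dividing keeps the ratio
# meaningful under Python 2 integer division; the float() here is a demo
# convenience so the result is exact on both Python 2 and 3. The counts are
# made-up demo values.
def _demo_percent(excluded=7, total=32):
    pct = int(round(100 * excluded / float(total), 0))
    print("{}% ({}) cells were excluded".format(pct, excluded))  # 22% (7)

if __name__ == "__main__":
    _demo_percent()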
def _sort_readers(vcf_readers, output_path):
    unsorted_readers = _get_unsorted_readers(vcf_readers)
    sorted_readers = []
    unsorted_count = 0
    if unsorted_readers:
        sorted_dir = os.path.join(os.path.dirname(output_path), "tmp")
        if not os.path.isdir(sorted_dir):
            os.makedirs(sorted_dir)
    for reader in vcf_readers:
        if reader in unsorted_readers:
            unsorted_count += 1
            logger.info("Sorting vcf [{}] ({}/{})",
                        reader.file_name,
                        unsorted_count,
                        len(unsorted_readers))
            reader = _sort_vcf(reader, sorted_dir)
        sorted_readers.append(reader)
    return sorted_readers
def _get_unsorted_readers(vcf_readers):
    unsorted_readers = []
    for i, reader in enumerate(vcf_readers):
        logger.info("Checking sort order of [{}] ({}/{})",
                    reader.file_name,
                    i + 1,
                    len(vcf_readers))
        previous_record = None
        reader.open()
        for vcf_record in reader.vcf_records():
            if previous_record and vcf_record < previous_record:
                logger.debug("VCF file:chrom:pos [{}:{}:{}] is out of order",
                             reader.file_name,
                             vcf_record.chrom,
                             vcf_record.pos)
                unsorted_readers.append(reader)
                break
            else:
                previous_record = vcf_record
        reader.close()
    return unsorted_readers
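# Illustrative sketch (not part of Jacquard): the single-pass sortedness check
# used by _get_unsorted_readers, reduced to plain sequences. Each item is
# compared to its predecessor and the scan bails out at the first inversion,
# so a sorted input is verified in one pass and an unsorted one rejected early.
def _is_sorted(items):
    previous = None
    for item in items:
        if previous is not None and item < previous:
            return False
        previous = item
    return True

if __name__ == "__main__":
    print(_is_sorted([1, 2, 3]))  # True
    print(_is_sorted([1, 3, 2]))  # False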
def execute(args, dummy_execution_context):
    #for the moment, there is no good place to put the execution context
    input_file = os.path.abspath(args.input)
    output_file = os.path.abspath(args.output)
    col_spec = None
    if args.selected_columns_file:
        col_spec = args.selected_columns_file
    logger.debug("Expanding [{}] to [{}]", input_file, output_file)
    logger.info("Expanding [{}] to [{}]", args.input, args.original_output)
    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    file_writer = vcf.FileWriter(output_file)
    file_writer.open()
    (columns, glossary_fields) = _get_actual_columns(vcf_reader, col_spec)
    file_writer.write("#" + "\t".join(columns) + "\n")
    line_count = 0
    vcf_reader.open()
    for vcf_record in vcf_reader.vcf_records():
        row_dict = _create_row_dict(vcf_reader.split_column_header,
                                    vcf_record)
        new_line = []
        for col in columns:
            if col in row_dict:
                new_line.append(row_dict[col])
            else:
                new_line.append(".")
        file_writer.write("\t".join(new_line) + "\n")
        line_count += 1
        if line_count % 10000 == 0:
            logger.info("Expanding: {} rows processed", line_count)
    logger.info("Expand complete: {} rows processed", line_count)
    file_writer.close()
    glossary_writer = _get_glossary_writer(output_file)
    glossary_writer.open()
    _create_glossary(vcf_reader.metaheaders, glossary_fields, glossary_writer)
    glossary_writer.close()
    logger.info("Wrote glossary to [{}]",
                os.path.basename(glossary_writer.output_filepath))
    vcf_reader.close()
    logger.debug("Wrote input [{}] to output [{}]", input_file, output_file)
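# Illustrative sketch (not part of Jacquard): the row-expansion step above,
# which emits one tab-separated value per requested column and fills columns
# absent from a row with the VCF null token ".". Column and field names are
# made up for the demo.
def _expand_row(columns, row_dict):
    return "\t".join(row_dict.get(col, ".") for col in columns)

if __name__ == "__main__":
    columns = ["CHROM", "POS", "DP", "AF"]
    row = {"CHROM": "chr1", "POS": "100", "DP": "42"}
    print(_expand_row(columns, row))  # chr1  100  42  .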
def execute(args, execution_context):
    input_file = os.path.abspath(args.input)
    output = os.path.abspath(args.output)
    summary_caller = summarize_caller.SummarizeCaller()
    vcf_reader = vcf.VcfReader(vcf.FileReader(input_file))
    tmp_output_file = output + ".tmp"
    tmp_writer = vcf.FileWriter(tmp_output_file)
    _write_to_tmp_file(summary_caller, vcf_reader, tmp_writer)
    tmp_reader = vcf.VcfReader(vcf.FileReader(tmp_output_file))
    file_writer = vcf.FileWriter(output)
    logger.info("Calculating zscores")
    caller = zscore_caller.ZScoreCaller(tmp_reader)
    metaheaders = execution_context + summary_caller.get_metaheaders()
    _write_zscores(caller, metaheaders, tmp_reader, file_writer)
    os.remove(tmp_output_file)
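# Illustrative sketch (not part of Jacquard): the two-pass pattern used by the
# summarize execute above. Pass one writes an intermediate file; pass two
# reads it back to compute a value that needs the whole dataset (a mean here,
# standing in for the z-score pass); the intermediate file is removed at the
# end. The file name is a hypothetical demo path.
import os

def _two_pass(values, output_path):
    tmp_path = output_path + ".tmp"
    with open(tmp_path, "w") as tmp:          # pass 1: materialize
        for value in values:
            tmp.write("{}\n".format(value))
    with open(tmp_path) as tmp:               # pass 2: whole-dataset stats
        numbers = [float(line) for line in tmp]
    mean = sum(numbers) / len(numbers)
    with open(output_path, "w") as out:
        out.write("mean={}\n".format(mean))
    os.remove(tmp_path)                       # discard the intermediate

if __name__ == "__main__":
    _two_pass([1, 2, 3, 4], "demo_output.txt")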
def test_info(self):
    tool = "foo"
    args = Namespace(subparser_name=tool, log_file=None, verbose=None)
    logger.initialize_logger(args)
    current_time = datetime.now()
    logger.info("bar")
    root_logger = logger.logging.getLogger()
    self.assertEquals(["root: INFO: bar"], root_logger.handlers[0].buffer)
    output_lines = self.output.getvalue().rstrip().split("\n")
    self.assertEquals(len(output_lines), 1)
    log_fields = output_lines[0].split("|")
    self.assertEquals(4, len(log_fields))
    log_time_string = log_fields[0]
    log_datetime = datetime.strptime(log_time_string, "%Y-%m-%d %H:%M:%S")
    seconds_delta = (log_datetime - current_time).total_seconds()
    self.assertLess(seconds_delta, 1)
    self.assertEquals(["INFO", "foo", "bar"], log_fields[1:])
def execute(args, execution_context):
    validate_args(args)
    output_dir = os.path.abspath(args.output)
    unclaimed_readers, trans_vcf_readers = _claim_readers(args)
    _log_unclaimed_readers(unclaimed_readers)
    logger.info("Processing [{}] VCF file(s) from [{}]",
                len(trans_vcf_readers),
                args.input)
    new_tags = [_ExcludeMalformedRef(),
                _ExcludeMalformedAlt(),
                _ExcludeMissingAlt()]
    for i, trans_vcf_reader in enumerate(trans_vcf_readers):
        logger.info("Translating file {}/{} [{}]",
                    i + 1,
                    len(trans_vcf_readers),
                    trans_vcf_reader.file_name)
        new_filename = _mangle_output_filename(trans_vcf_reader.file_name)
        output_filepath = os.path.join(output_dir, new_filename)
        file_writer = FileWriter(output_filepath)
        _translate_files(trans_vcf_reader,
                         new_tags,
                         execution_context,
                         file_writer)
    logger.info("Wrote [{}] VCF file(s)", len(trans_vcf_readers))
def _dispatch(modules, arguments):
    try:
        command, args = _parse_command_line_args(modules, arguments)
        execution_context = _get_execution_context(arguments)
        logger.initialize_logger(args)
        logger.debug("Jacquard run begins")
        logger.debug("cwd|{}", os.getcwd())
        logger.debug("command|{}", " ".join(arguments))
        command_validator.preflight(command, args)
        logger.info("Jacquard begins (v{})", __version__)
        logger.info("Saving log to [{}]",
                    os.path.basename(logger.log_filename))
        logger.debug("Writing output to tmp directory [{}]",
                     args.temp_working_dir)
        command.execute(args, execution_context)
        _move_tmp_contents_to_original(args)
        if logger.WARNING_OCCURRED:
            logger.info("Done. (See warnings above)")
        else:
            logger.info("Done")
    except utils.UsageError as usage_error:
        message = "Jacquard usage problem: {}".format(str(usage_error))
        logger.debug(message)
        print(message, file=sys.stderr)
        try:
            print("See 'jacquard {} --help'.".format(args.subparser_name),
                  file=sys.stderr)
        except NameError:  #could not determine the command
            print("See 'jacquard --help'.", file=sys.stderr)
        sys.exit(1)
    except Exception as exception:  #pylint: disable=broad-except
        logger.error(str(exception))
        logger.error("Jacquard encountered an unanticipated problem. "
                     "Please review log file and contact your sysadmin "
                     "or Jacquard support for assistance.")
        logger.debug(traceback.format_exc())
        sys.exit(1)
    finally:
        try:
            _cleanup(args.temp_working_dir)
        except Exception:  #pylint: disable=broad-except
            pass  #we tried
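# Illustrative sketch (not part of Jacquard): the error-handling shape of
# _dispatch above, stripped to its skeleton. Usage problems get a terse stderr
# hint and exit code 1; unexpected exceptions are reported for support; the
# cleanup step always runs, even when it fails. run and cleanup are
# hypothetical callables standing in for command.execute and _cleanup.
from __future__ import print_function
import sys

class UsageError(Exception):
    pass

def dispatch(run, cleanup):
    try:
        run()
        print("Done")
    except UsageError as usage_error:
        print("usage problem: {}".format(usage_error), file=sys.stderr)
        sys.exit(1)
    except Exception as exception:  #pylint: disable=broad-except
        print("unanticipated problem: {}".format(exception), file=sys.stderr)
        sys.exit(1)
    finally:
        try:
            cleanup()
        except Exception:  #pylint: disable=broad-except
            pass  #we tried

if __name__ == "__main__":
    dispatch(lambda: None, lambda: None)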
def test_info_message_args(self):
    tool = "foo"
    args = Namespace(subparser_name=tool, log_file=None, verbose=None)
    logger.initialize_logger(args)
    current_time = datetime.now()
    logger.info("bar {}/{}", "1", "2")
    root_logger = logger.logging.getLogger()
    self.assertEquals(["root: INFO: bar 1/2"],
                      root_logger.handlers[0].buffer)
    output_lines = self.output.getvalue().rstrip().split("\n")
    self.assertEquals(len(output_lines), 1)
    log_fields = output_lines[0].split("|")
    self.assertEquals(4, len(log_fields))
    log_time_string = log_fields[0]
    log_datetime = datetime.strptime(log_time_string, "%Y-%m-%d %H:%M:%S")
    seconds_delta = (log_datetime - current_time).total_seconds()
    self.assertLess(seconds_delta, 1)
    self.assertEquals(["INFO", "foo", "bar 1/2"], log_fields[1:])