def link_samples(run_path, data_path):
    """ Load the data from a run folder into the BaseSpace layout. """
    shutil.rmtree(data_path, ignore_errors=True)
    makedirs(data_path)

    results_path = os.path.join(run_path, 'Results', 'basespace')
    makedirs(results_path)
    output_path = os.path.join(data_path, 'output')
    os.symlink(results_path, output_path)
    scratch_path = os.path.join(data_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes)

    fastq_files = list(glob(os.path.join(run_path,
                                         'Data',
                                         'Intensities',
                                         'BaseCalls',
                                         '*_R1_*')) or
                       glob(os.path.join(run_path, '*_R1_*')))
    source_folder = fastq_files and os.path.dirname(fastq_files[0])
    file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
    groups = find_groups(file_names,
                         os.path.join(run_path, 'SampleSheet.csv'))
    for group in groups:
        main_file, midi_file = group.names
        if main_file.startswith('Undetermined'):
            continue
        main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
        if midi_file is None:
            midi_sample = None
        else:
            midi_sample = Sample(fastq1=os.path.join(source_folder,
                                                     midi_file))
        sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '({} of {})'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)
    return run_info
def test_get_all_samples(self):
    expected_fastq_paths = ['1a_R1_001.fastq',
                            '1b_R1_001.fastq',
                            '2_R1_001.fastq']
    run_info = RunInfo(
        sample_groups=[SampleGroup(Sample(fastq1='1a_R1_001.fastq'),
                                   Sample(fastq1='1b_R1_001.fastq')),
                       SampleGroup(Sample(fastq1='2_R1_001.fastq'))])

    fastq_paths = [sample.fastq1 for sample in run_info.get_all_samples()]

    self.assertEqual(expected_fastq_paths, fastq_paths)
def hcv_sample(args):
    resolved_args = MiCallArgs(args)
    midi_args = MiCallArgs(args, map_midi=True)

    scratch_path = os.path.join(args.results_folder, "scratch")
    midi_scratch_path = os.path.join(args.results_folder, "scratch_midi")
    makedirs(scratch_path)
    shutil.rmtree(midi_scratch_path, ignore_errors=True)
    makedirs(midi_scratch_path)  # Recreate the MIDI scratch folder fresh.

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)

    main_sample = Sample(fastq1=resolved_args.fastq1,
                         fastq2=resolved_args.fastq2,
                         bad_cycles_csv=resolved_args.bad_cycles_csv,
                         scratch_path=scratch_path)
    midi_sample = Sample(fastq1=midi_args.fastq1,
                         fastq2=midi_args.fastq2,
                         bad_cycles_csv=resolved_args.bad_cycles_csv,
                         scratch_path=midi_scratch_path)
    main_and_midi = SampleGroup(main_sample, midi_sample)
    sample_groups.append(main_and_midi)

    process_run(run_info, args)
def single_sample(args):
    resolved_args = MiCallArgs(args)

    scratch_path = os.path.join(args.results_folder, "scratch")
    makedirs(scratch_path)

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)

    sample = Sample(fastq1=resolved_args.fastq1,
                    fastq2=resolved_args.fastq2,
                    bad_cycles_csv=resolved_args.bad_cycles_csv,
                    scratch_path=scratch_path)
    sample.project_code = args.project_code
    sample_group = SampleGroup(sample)
    sample_groups.append(sample_group)

    process_run(run_info, args)
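# A minimal usage sketch for single_sample(), assuming MiCallArgs reads these
# attribute names off an argparse-style namespace; the file names, folder
# names, and values below are hypothetical.
def demo_single_sample():
    import argparse

    demo_args = argparse.Namespace(
        fastq1='sample_R1_001.fastq',   # forward reads (hypothetical)
        fastq2='sample_R2_001.fastq',   # reverse reads (hypothetical)
        bad_cycles_csv=None,            # no InterOp error rates available
        results_folder='results',
        project_code=None,
        denovo=False)
    single_sample(demo_args)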
def link_samples(run_path: str,
                 output_path: str,
                 is_denovo: bool,
                 fastq1s: typing.Optional[typing.Sequence[str]] = None,
                 fastq2s: typing.Optional[typing.Sequence[str]] = None,
                 project_code: typing.Optional[str] = None):
    """ Load the data from a run folder. """
    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)
    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if not (os.path.exists(run_info_path) and os.path.exists(interop_path)):
        read_sizes = None
    else:
        read_sizes = parse_read_sizes(run_info_path)
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    if (fastq1s is not None and len(fastq1s) > 0 or
            not os.path.exists(sample_sheet_path)):
        if fastq1s is not None and len(fastq1s) > 0:
            # Forward read files are specified explicitly.
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            elif len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must "
                    "be specified.")
            forward_reverse_pairs = zip(fastq1s, fastq2s)
        else:
            # There is no sample sheet: sort the FASTQ files alphabetically
            # and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in "
                "folder {}".format(run_path))
            fastq_files = (list(glob(os.path.join(run_path, "*.fastq"))) +
                           list(glob(os.path.join(run_path, "*.fastq.gz"))))
            fastq_files.sort()
            forward_reverse_pairs = []
            for idx in range(0, len(fastq_files), 2):
                forward = fastq_files[idx]
                if idx == len(fastq_files) - 1:
                    # We have an odd number of FASTQ files; ignore this
                    # last one.
                    logger.info(
                        "File {} appears extraneous; omitting.".format(
                            forward))
                    break
                reverse = fastq_files[idx + 1]
                logger.info("Pairing files {} and {}.".format(forward,
                                                              reverse))
                forward_reverse_pairs.append((forward, reverse))
        for forward, reverse in forward_reverse_pairs:
            sample = Sample(fastq1=os.path.join(run_path, forward),
                            fastq2=os.path.join(run_path, reverse))
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))
    else:
        # A sample sheet is specified.
        fastq_files = list(glob(os.path.join(run_path,
                                             'Data',
                                             'Intensities',
                                             'BaseCalls',
                                             '*_R1_*')) or
                           glob(os.path.join(run_path, '*_R1_*')))
        source_folder = fastq_files and os.path.dirname(fastq_files[0])
        file_names = [os.path.basename(fastq_file)
                      for fastq_file in fastq_files]
        groups = find_groups(file_names, sample_sheet_path)
        for group in groups:
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder,
                                                     main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder,
                                                         midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)
    return run_info
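# A minimal usage sketch for the folder-based entry point above; the run and
# output paths are hypothetical.
def demo_link_samples():
    run_info = link_samples(run_path='/data/runs/200101_M01234',
                            output_path='/data/results',
                            is_denovo=False)
    for sample in run_info.get_all_samples():
        print(sample.rank, sample.fastq1)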
def load_samples(data_path):
    """ Load JSON file from the data path, and pull out the arguments for
    this run.

    :param str data_path: folder that contains a JSON file in the BaseSpace
        AppSession format
    :return RunInfo: details about the run and samples
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)

        arg_map = {item['Name']: item
                   for item in raw_args['Properties']['Items']}

        href_app_session = raw_args['Href']
        run = arg_map.get('Input.run-id')
        if run is None:
            run_id = interop_path = read_sizes = None
        else:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            read_sizes = ReadSizes(
                run_content['SequencingStats']['NumCyclesRead1'],
                run_content['SequencingStats']['NumCyclesRead2'],
                run_content['SequencingStats']['NumCyclesIndex1'],
                run_content['SequencingStats']['NumCyclesIndex2'])
        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']
        builder_node = arg_map.get('Input.builder')
        if builder_node is None:
            is_denovo = False
        else:
            is_denovo = builder_node['Content'] == 'denovo'
        primer_node = arg_map.get('Input.project_code')
        if primer_node is None:
            project_code = None
        else:
            project_code = primer_node['Content']
        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session,
                           is_denovo)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        for main_sample_json, midi_sample_json in zip(main_samples,
                                                      midi_samples):
            sample_group = SampleGroup(load_sample(main_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code),
                                       load_sample(midi_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code))
            sample_groups.append(sample_group)

        # Do we have run ids for all sample ids?
        if run_id is not None:
            bs = BSrequest()
            all_ids = {s.basespace_id for s in run_info.get_all_samples()}
            sample_id_set = bs.check_run_sample_ids([run_id], all_ids)
            if len(sample_id_set) != len(all_ids):
                for s in run_info.get_all_samples():
                    if s.basespace_id not in sample_id_set:
                        logger.warning(
                            'Run info not found for %s, skipping error '
                            'rate data.',
                            s)
                run_info.read_sizes = run_info.interop_path = None

        create_app_result(run_info)
    except IOError:
        if os.path.exists(json_path):
            # Copy the input file to the output dir for post-mortem analysis.
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as json_file:
                file_cont = json_file.read()
            out_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(out_path, 'w') as json_file:
                json_file.write(file_cont)
        else:
            logger.error("Error: no such file as %r.", json_path)
        raise
    return run_info
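# For reference, a skeletal AppSession.json in the shape that load_samples()
# expects. Only the keys read above are shown; all values are hypothetical.
EXAMPLE_APP_SESSION = {
    'Href': 'v1pre3/appsessions/1234',
    'Properties': {'Items': [
        {'Name': 'Input.run-id',
         'Content': {'Id': '5678',
                     'SequencingStats': {'NumCyclesRead1': 251,
                                         'NumCyclesRead2': 251,
                                         'NumCyclesIndex1': 8,
                                         'NumCyclesIndex2': 8}}},
        {'Name': 'Input.project-id', 'Content': {'Id': '9012'}},
        {'Name': 'Input.reports', 'Items': ['PR_RT', 'NS3']},
        {'Name': 'Input.builder', 'Content': 'denovo'},
        {'Name': 'Input.project_code', 'Content': 'HCV'},
        {'Name': 'Input.sample-ids.main', 'Items': []},
        {'Name': 'Input.sample-ids.midi', 'Items': []}]}}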
def collate_samples(run_info: RunInfo):
    """ Combine all the sample files into run files.

    :param run_info: details of the run and samples
    """
    filenames = ['remap_counts.csv',
                 'remap_conseq.csv',
                 'conseq_ins.csv',
                 'failed_read.csv',
                 'nuc.csv',
                 'amino.csv',
                 'coord_ins.csv',
                 'conseq.csv',
                 'conseq_all.csv',
                 'conseq_region.csv',
                 'failed_align.csv',
                 'coverage_scores.csv',
                 'g2p.csv',
                 'g2p_summary.csv',
                 'resistance.csv',
                 'mutations.csv',
                 'nuc_mutations.csv',
                 'resistance_fail.csv',
                 'resistance_consensus.csv',
                 'cascade.csv',
                 'merge_lengths.csv']
    for filename in filenames:
        out_path = run_info.output_path
        with open(os.path.join(out_path, filename), 'w') as fout:
            writer = csv.writer(fout, lineterminator=os.linesep)
            is_header_written = False
            for sample_info in run_info.get_all_samples():
                sample_name = sample_info.name
                sample_scratch_path = sample_info.scratch_path
                srcfile = os.path.join(sample_scratch_path, filename)
                try:
                    with open(srcfile, 'r') as fin:
                        reader = csv.reader(fin)
                        for i, row in enumerate(reader):
                            if i == 0:
                                if not is_header_written:
                                    row.insert(0, 'sample')
                                    writer.writerow(row)
                                    is_header_written = True
                            else:
                                row.insert(0, sample_name)
                                writer.writerow(row)
                except IOError as ex:
                    if ex.errno != errno.ENOENT:
                        raise

    resistance_reports_path = os.path.join(run_info.output_path,
                                           'resistance_reports')
    makedirs(resistance_reports_path)
    coverage_maps_path = os.path.join(run_info.output_path, 'coverage_maps')
    genome_coverage_path = os.path.join(coverage_maps_path, 'genome')
    makedirs(genome_coverage_path)
    merge_lengths_path = os.path.join(run_info.output_path, 'merge_lengths')
    makedirs(merge_lengths_path)

    for sample_info in run_info.get_all_samples():
        if os.path.exists(sample_info.coverage_maps):
            for map_file in os.listdir(sample_info.coverage_maps):
                safe_file_move(os.path.join(sample_info.coverage_maps,
                                            map_file),
                               os.path.join(coverage_maps_path, map_file))
        if os.path.exists(sample_info.contigs_svg):
            safe_file_move(sample_info.contigs_svg,
                           os.path.join(coverage_maps_path,
                                        sample_info.name + '_contigs.svg'))
        if os.path.exists(sample_info.genome_coverage_svg):
            safe_file_move(
                sample_info.genome_coverage_svg,
                os.path.join(genome_coverage_path,
                             sample_info.name + '_genome_coverage.svg'))
        if os.path.exists(sample_info.merge_lengths_svg):
            safe_file_move(
                sample_info.merge_lengths_svg,
                os.path.join(merge_lengths_path,
                             sample_info.name + '_merge_lengths.svg'))
        if os.path.exists(sample_info.resistance_pdf):
            safe_file_move(
                sample_info.resistance_pdf,
                os.path.join(resistance_reports_path,
                             sample_info.name + '_resistance.pdf'))

    try:
        # Remove the genome coverage directory, if it's empty.
        os.rmdir(genome_coverage_path)
    except OSError:
        # Guess it wasn't empty.
        pass
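# collate_samples() relies on a safe_file_move() helper that is not defined in
# this excerpt. A minimal sketch of such a helper, assuming the goal is a move
# that also works when source and destination sit on different file systems
# (hypothetical implementation, not necessarily the project's own):
import os
import shutil


def safe_file_move(src, dst):
    """ Move src to dst, falling back to copy-and-delete if rename fails. """
    try:
        os.rename(src, dst)  # Cheap move within one file system.
    except OSError:
        shutil.move(src, dst)  # Copies across file systems, then removes src.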
def load_samples(data_path):
    """ Load JSON file from the data path, and pull out the arguments for
    this run.

    :param str data_path: folder that contains a JSON file in the BaseSpace
        AppSession format
    :return RunInfo: details about the run and samples
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)

        arg_map = {item['Name']: item
                   for item in raw_args['Properties']['Items']}

        href_app_session = raw_args['Href']
        run = arg_map.get('Input.run-id')
        if run is None:
            run_id = interop_path = read_sizes = None
        else:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            read_sizes = ReadSizes(
                run_content['SequencingStats']['NumCyclesRead1'],
                run_content['SequencingStats']['NumCyclesRead2'],
                run_content['SequencingStats']['NumCyclesIndex1'],
                run_content['SequencingStats']['NumCyclesIndex2'])
        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']
        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        for main_sample_json, midi_sample_json in zip(main_samples,
                                                      midi_samples):
            sample_group = SampleGroup(load_sample(main_sample_json,
                                                   data_path,
                                                   scratch_path),
                                       load_sample(midi_sample_json,
                                                   data_path,
                                                   scratch_path))
            sample_groups.append(sample_group)

        # Do we have run ids for all sample ids?
        if run_id is not None:
            bs = BSrequest()
            all_ids = {s.basespace_id for s in run_info.get_all_samples()}
            sample_id_set = bs.check_run_sample_ids([run_id], all_ids)
            if len(sample_id_set) != len(all_ids):
                for s in run_info.get_all_samples():
                    if s.basespace_id not in sample_id_set:
                        logger.warning(
                            'Run info not found for %s, skipping error '
                            'rate data.',
                            s)
                run_info.read_sizes = run_info.interop_path = None

        create_app_result(run_info)
    except IOError:
        if os.path.exists(json_path):
            # Copy the input file to the output dir for post-mortem analysis.
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as json_file:
                file_cont = json_file.read()
            out_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(out_path, 'w') as json_file:
                json_file.write(file_cont)
        else:
            logger.error("Error: no such file as %r.", json_path)
        raise
    return run_info
def main():
    logging.basicConfig(level=logging.WARN)
    args = parse_args()
    sample_group = load_sample(args)
    sample_group.process_resistance(RunInfo([sample_group]))