Example #1
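hcv_sample processes a paired HCV run: it resolves the main and MIDI FASTQ arguments, gives each sample its own scratch directory under the results folder, wraps both in a single SampleGroup, and hands the resulting RunInfo to process_run.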
def hcv_sample(args):
    resolved_args = MiCallArgs(args)
    midi_args = MiCallArgs(args, map_midi=True)
    scratch_path = os.path.join(args.results_folder, "scratch")
    midi_scratch_path = os.path.join(args.results_folder, "scratch_midi")
    makedirs(scratch_path)
    # Only clear any stale MIDI scratch here; Sample.process() (Example #6)
    # recreates each sample's scratch directory on demand.
    shutil.rmtree(midi_scratch_path, ignore_errors=True)

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)
    main_sample = Sample(fastq1=resolved_args.fastq1,
                         fastq2=resolved_args.fastq2,
                         bad_cycles_csv=resolved_args.bad_cycles_csv,
                         scratch_path=scratch_path)
    midi_sample = Sample(fastq1=midi_args.fastq1,
                         fastq2=midi_args.fastq2,
                         bad_cycles_csv=resolved_args.bad_cycles_csv,
                         scratch_path=midi_scratch_path)
    main_and_midi = SampleGroup(main_sample, midi_sample)
    sample_groups.append(main_and_midi)

    process_run(run_info, args)
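
For orientation, a hypothetical invocation sketch follows. Only results_folder and denovo are attribute names taken from the code above; the FASTQ-related names are assumptions for illustration, since MiCallArgs resolves the actual arguments internally.

import argparse

# Hypothetical call sketch; every attribute except results_folder and
# denovo is an assumed name, not MiCall's documented interface.
args = argparse.Namespace(results_folder='results',
                          denovo=False,
                          fastq1='sample_R1.fastq.gz',     # assumed
                          fastq2='sample_R2.fastq.gz',     # assumed
                          midi_fastq1='midi_R1.fastq.gz',  # assumed
                          midi_fastq2='midi_R2.fastq.gz',  # assumed
                          bad_cycles_csv=None)             # assumed
hcv_sample(args)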
Example #2
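single_sample is the one-sample counterpart of hcv_sample: the same RunInfo and reports, but a SampleGroup built around a single Sample tagged with the project code.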
def single_sample(args):
    resolved_args = MiCallArgs(args)
    scratch_path = os.path.join(args.results_folder, "scratch")
    makedirs(scratch_path)

    sample_groups = []
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       output_path=args.results_folder,
                       scratch_path=scratch_path,
                       is_denovo=args.denovo)
    sample = Sample(fastq1=resolved_args.fastq1,
                    fastq2=resolved_args.fastq2,
                    bad_cycles_csv=resolved_args.bad_cycles_csv,
                    scratch_path=scratch_path)
    sample.project_code = args.project_code
    sample_group = SampleGroup(sample)
    sample_groups.append(sample_group)

    process_run(run_info, args)
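
Note that SampleGroup is constructed here with a single sample; Examples #1 and #3 pass a second midi_sample argument, so the MIDI slot is evidently optional.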
Example #3
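link_samples builds a RunInfo straight from a run folder. Explicitly supplied FASTQ lists take precedence; with no SampleSheet.csv it pairs all FASTQ files in the folder alphabetically; otherwise it pairs main and MIDI files via find_groups from the sample sheet. Read sizes are parsed only when both RunInfo.xml and the InterOp folder exist.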
def link_samples(
        run_path: str,
        output_path: str,
        is_denovo: bool,
        fastq1s: typing.Optional[typing.Sequence[str]] = None,
        fastq2s: typing.Optional[typing.Sequence[str]] = None,
        project_code: typing.Optional[str] = None):
    """ Load the data from a run folder. """

    shutil.rmtree(output_path, ignore_errors=True)
    makedirs(output_path)

    scratch_path = os.path.join(output_path, 'scratch')
    makedirs(scratch_path)

    sample_groups = []
    run_info_path = os.path.join(run_path, 'RunInfo.xml')
    interop_path = os.path.join(run_path, 'InterOp')
    if os.path.exists(run_info_path) and os.path.exists(interop_path):
        read_sizes = parse_read_sizes(run_info_path)
    else:
        read_sizes = None
    run_info = RunInfo(sample_groups,
                       reports=['PR_RT', 'IN', 'NS3', 'NS5a', 'NS5b'],
                       interop_path=interop_path,
                       scratch_path=scratch_path,
                       output_path=output_path,
                       read_sizes=read_sizes,
                       is_denovo=is_denovo)

    sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
    have_explicit_fastqs = fastq1s is not None and len(fastq1s) > 0
    if have_explicit_fastqs or not os.path.exists(sample_sheet_path):
        if have_explicit_fastqs:  # forward files are specified
            if fastq2s is None:
                raise ValueError("Reverse read files must also be specified.")
            elif len(fastq2s) != len(fastq1s):
                raise ValueError(
                    "The same number of forward and reverse read files must be "
                    "specified."
                )
            forward_reverse_pairs = zip(fastq1s, fastq2s)

        else:  # there is no sample sheet
            # Sort the FASTQ files alphabetically and run them in pairs.
            logger.info(
                "No sample sheet found; running on all FASTQ files in folder {}".format(
                    run_path
                )
            )
            fastq_files = (list(glob(os.path.join(run_path, "*.fastq")))
                           + list(glob(os.path.join(run_path, "*.fastq.gz"))))
            fastq_files.sort()
            forward_reverse_pairs = []
            for idx in range(0, len(fastq_files), 2):
                forward = fastq_files[idx]
                if idx == len(fastq_files) - 1:
                    # We have an odd number of FASTQ files; ignore this last one.
                    logger.info(
                        "File {} appears extraneous; omitting.".format(forward)
                    )
                    break
                reverse = fastq_files[idx + 1]
                logger.info(
                    "Pairing files {} and {}.".format(forward, reverse)
                )
                forward_reverse_pairs.append((forward, reverse))

        for forward, reverse in forward_reverse_pairs:
            sample = Sample(
                fastq1=os.path.join(run_path, forward),
                fastq2=os.path.join(run_path, reverse),
            )
            sample.project_code = project_code
            sample_groups.append(SampleGroup(sample, midi_sample=None))

    else:  # a sample sheet is specified
        fastq_files = list(glob(os.path.join(run_path,
                                             'Data',
                                             'Intensities',
                                             'BaseCalls',
                                             '*_R1_*')) or
                           glob(os.path.join(run_path,
                                             '*_R1_*')))
        source_folder = fastq_files and os.path.dirname(fastq_files[0])
        file_names = [os.path.basename(fastq_file) for fastq_file in fastq_files]
        groups = find_groups(file_names, sample_sheet_path)
        for group in groups:
            main_file, midi_file = group.names
            if main_file.startswith('Undetermined'):
                continue
            main_sample = Sample(fastq1=os.path.join(source_folder, main_file))
            main_sample.project_code = project_code
            if midi_file is None:
                midi_sample = None
            else:
                midi_sample = Sample(fastq1=os.path.join(source_folder, midi_file))
                midi_sample.project_code = project_code
            sample_groups.append(SampleGroup(main_sample, midi_sample))

    sample_count = sum(1 for _ in run_info.get_all_samples())
    for i, sample in enumerate(run_info.get_all_samples(), 1):
        sample.rank = '{} of {}'.format(i, sample_count)
        sample.bad_cycles_csv = run_info.bad_cycles_csv
        sample.scratch_path = os.path.join(scratch_path, sample.name)

    return run_info
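
The alphabetical-pairing fallback is self-contained enough to isolate. A minimal standalone sketch of that logic, stdlib only (the folder contents in the comment are made up):

import os
from glob import glob

def pair_fastqs(run_path):
    """Sort FASTQ files alphabetically and yield (forward, reverse) pairs,
    dropping a trailing unpaired file, as link_samples does above."""
    fastq_files = sorted(glob(os.path.join(run_path, '*.fastq')) +
                         glob(os.path.join(run_path, '*.fastq.gz')))
    for idx in range(0, len(fastq_files) - 1, 2):
        yield fastq_files[idx], fastq_files[idx + 1]

# Example: a folder holding s1_R1.fastq, s1_R2.fastq, s2_R1.fastq would
# yield one pair and silently drop s2_R1.fastq.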
Example #4
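load_samples drives the BaseSpace path: it parses AppSession.json, extracts the run, project, report, and sample arguments from the Properties items, and pairs main and MIDI samples into groups. When run information is missing for any sample, error-rate data is skipped; on an IOError the input JSON is copied into the logs folder for postmortem analysis before the error is re-raised.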
def load_samples(data_path):
    """ Load JSON file from the data path, and pull out the arguments for this run.

    :param str data_path: folder that contains a JSON file in the BaseSpace
    AppSession format.
    :return RunInfo: details about the run and samples
    """
    json_path = os.path.join(data_path, 'input', 'AppSession.json')
    try:
        with open(json_path, 'r') as json_file:
            raw_args = json.load(json_file)

        arg_map = {item['Name']: item
                   for item in raw_args['Properties']['Items']}

        href_app_session = raw_args['Href']
        run = arg_map.get('Input.run-id')
        if run is None:
            run_id = interop_path = read_sizes = None
        else:
            run_content = run['Content']
            run_id = run_content['Id']
            interop_path = os.path.join(data_path,
                                        'input',
                                        'runs',
                                        run_id,
                                        'InterOp')
            read_sizes = ReadSizes(
                run_content['SequencingStats']['NumCyclesRead1'],
                run_content['SequencingStats']['NumCyclesRead2'],
                run_content['SequencingStats']['NumCyclesIndex1'],
                run_content['SequencingStats']['NumCyclesIndex2'])
        project_id = arg_map['Input.project-id']['Content']['Id']
        output_path = os.path.join(data_path,
                                   'output',
                                   'appresults',
                                   project_id,
                                   'results')
        makedirs(output_path)
        reports = arg_map['Input.reports']['Items']
        builder_node = arg_map.get('Input.builder')
        if builder_node is None:
            is_denovo = False
        else:
            is_denovo = builder_node['Content'] == 'denovo'
        project_code_node = arg_map.get('Input.project_code')
        if project_code_node is None:
            project_code = None
        else:
            project_code = project_code_node['Content']

        scratch_path = os.path.join(data_path, 'scratch')
        sample_groups = []
        run_info = RunInfo(sample_groups,
                           reports,
                           interop_path,
                           scratch_path,
                           output_path,
                           read_sizes,
                           href_app_session,
                           is_denovo)
        main_samples = arg_map['Input.sample-ids.main']['Items']
        midi_samples = arg_map['Input.sample-ids.midi']['Items']
        for main_sample_json, midi_sample_json in zip(main_samples, midi_samples):
            sample_group = SampleGroup(load_sample(main_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code),
                                       load_sample(midi_sample_json,
                                                   data_path,
                                                   scratch_path,
                                                   project_code))
            sample_groups.append(sample_group)

        # Do we have run IDs for all sample IDs?
        if run_id is not None:
            bs = BSrequest()
            all_ids = {s.basespace_id for s in run_info.get_all_samples()}
            sample_id_set = bs.check_run_sample_ids(
                [run_id],
                all_ids)
            if len(sample_id_set) != len(all_ids):
                for s in run_info.get_all_samples():
                    if s.basespace_id not in sample_id_set:
                        logger.warning(
                            'Run info not found for %s, skipping error rate data.',
                            s)
                run_info.read_sizes = run_info.interop_path = None

        create_app_result(run_info)
    except IOError:
        if os.path.exists(json_path):
            # copy the input file to the output dir for postmortem analysis
            logger.error("Error occurred while parsing %r.", json_path)
            with open(json_path, 'r') as json_file:
                file_cont = json_file.read()
            out_path = os.path.join(data_path, 'logs', 'AppSession.json')
            with open(out_path, 'w') as json_file:
                json_file.write(file_cont)
        else:
            logger.error("Error: no such file as %r.", json_path)
        raise

    return run_info
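
For reference, the keys the parser reads imply an input shaped roughly like the following skeleton, expressed here as a Python dict; only the fields touched above are shown, and all values are placeholders:

# Hypothetical AppSession.json skeleton, reconstructed from the key
# accesses in load_samples; every value below is a placeholder.
app_session = {
    'Href': 'v1pre3/appsessions/12345',
    'Properties': {
        'Items': [
            {'Name': 'Input.run-id',
             'Content': {'Id': '67890',
                         'SequencingStats': {'NumCyclesRead1': 251,
                                             'NumCyclesRead2': 251,
                                             'NumCyclesIndex1': 8,
                                             'NumCyclesIndex2': 8}}},
            {'Name': 'Input.project-id', 'Content': {'Id': '11111'}},
            {'Name': 'Input.reports', 'Items': ['PR_RT', 'NS3']},
            {'Name': 'Input.builder', 'Content': 'denovo'},
            {'Name': 'Input.project_code', 'Content': 'HCV'},
            {'Name': 'Input.sample-ids.main', 'Items': []},
            {'Name': 'Input.sample-ids.midi', 'Items': []},
        ],
    },
}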
Example #5
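collate_samples merges each per-sample CSV from the scratch folders into one run-level CSV, writing the header once and prefixing every data row with the sample name, then moves the per-sample plots and resistance reports into shared output folders.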
def collate_samples(run_info: RunInfo):
    """ Combine all the sample files into run files.

    :param run_info: details of the run and samples
    """
    filenames = ['remap_counts.csv',
                 'remap_conseq.csv',
                 'conseq_ins.csv',
                 'failed_read.csv',
                 'nuc.csv',
                 'amino.csv',
                 'coord_ins.csv',
                 'conseq.csv',
                 'conseq_all.csv',
                 'conseq_region.csv',
                 'failed_align.csv',
                 'coverage_scores.csv',
                 'g2p.csv',
                 'g2p_summary.csv',
                 'resistance.csv',
                 'mutations.csv',
                 'nuc_mutations.csv',
                 'resistance_fail.csv',
                 'resistance_consensus.csv',
                 'cascade.csv',
                 'merge_lengths.csv']
    for filename in filenames:
        out_path = run_info.output_path
        with open(os.path.join(out_path, filename), 'w') as fout:
            writer = csv.writer(fout, lineterminator=os.linesep)
            is_header_written = False
            for sample_info in run_info.get_all_samples():
                sample_name = sample_info.name
                sample_scratch_path = sample_info.scratch_path
                srcfile = os.path.join(sample_scratch_path, filename)
                try:
                    with open(srcfile, 'r') as fin:
                        reader = csv.reader(fin)
                        for i, row in enumerate(reader):
                            if i == 0:
                                if not is_header_written:
                                    row.insert(0, 'sample')
                                    writer.writerow(row)
                                    is_header_written = True
                            else:
                                row.insert(0, sample_name)
                                writer.writerow(row)
                except IOError as ex:
                    if ex.errno != errno.ENOENT:
                        raise
    resistance_reports_path = os.path.join(run_info.output_path,
                                           'resistance_reports')
    makedirs(resistance_reports_path)
    coverage_maps_path = os.path.join(run_info.output_path, 'coverage_maps')
    genome_coverage_path = os.path.join(coverage_maps_path, 'genome')
    makedirs(genome_coverage_path)
    merge_lengths_path = os.path.join(run_info.output_path, 'merge_lengths')
    makedirs(merge_lengths_path)
    for sample_info in run_info.get_all_samples():
        if os.path.exists(sample_info.coverage_maps):
            for map_file in os.listdir(sample_info.coverage_maps):
                safe_file_move(os.path.join(sample_info.coverage_maps, map_file),
                               os.path.join(coverage_maps_path, map_file))
        if os.path.exists(sample_info.contigs_svg):
            safe_file_move(sample_info.contigs_svg,
                           os.path.join(coverage_maps_path,
                                        sample_info.name + '_contigs.svg'))
        if os.path.exists(sample_info.genome_coverage_svg):
            safe_file_move(sample_info.genome_coverage_svg,
                           os.path.join(genome_coverage_path,
                                        sample_info.name + '_genome_coverage.svg'))
        if os.path.exists(sample_info.merge_lengths_svg):
            safe_file_move(sample_info.merge_lengths_svg,
                           os.path.join(merge_lengths_path,
                                        sample_info.name + '_merge_lengths.svg'))
        if os.path.exists(sample_info.resistance_pdf):
            safe_file_move(sample_info.resistance_pdf,
                           os.path.join(resistance_reports_path,
                                        sample_info.name + '_resistance.pdf'))
    try:
        # Remove directory, if it's empty.
        os.rmdir(genome_coverage_path)
    except OSError:
        # Guess it wasn't empty.
        pass
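
The collation pattern above, one header plus a leading sample column on every data row, is worth isolating. A minimal standalone sketch under the same assumptions (paths and sample names are made up):

import csv
import os

def collate_csvs(sample_dirs, filename, out_path):
    """Concatenate <sample_dir>/<filename> from each sample into one CSV,
    keeping a single header row and adding a leading 'sample' column."""
    with open(out_path, 'w', newline='') as fout:
        writer = csv.writer(fout)
        is_header_written = False
        for sample_name, sample_dir in sample_dirs:
            src = os.path.join(sample_dir, filename)
            if not os.path.exists(src):
                continue  # a sample may legitimately lack this file
            with open(src, newline='') as fin:
                for i, row in enumerate(csv.reader(fin)):
                    if i == 0:
                        if not is_header_written:
                            writer.writerow(['sample'] + row)
                            is_header_written = True
                    else:
                        writer.writerow([sample_name] + row)

# collate_csvs([('s1', 'scratch/s1'), ('s2', 'scratch/s2')],
#              'nuc.csv', 'results/nuc.csv')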
Example #6
File: sample.py  Project: pastvir/MiCall
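Sample.process chains the full per-sample pipeline: trimming, G2P analysis, mapping (or de novo assembly when use_denovo is set), alignment conversion, counting, coverage plots, and a final cascade report. The stages communicate through files in the sample's scratch directory.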
    def process(self,
                pssm,
                excluded_seeds=(),
                excluded_projects=(),
                force_gzip=False,
                use_denovo=False):
        """ Process a single sample.

        :param pssm: the pssm library for running G2P analysis
        :param excluded_seeds: seeds to exclude from mapping
        :param excluded_projects: project codes to exclude from reporting
        :param bool force_gzip: treat FASTQ files as gzipped, even when they
            don't end in .gz
        :param bool use_denovo: True if de novo assembly should be used,
            instead of bowtie2 mapping against references.
        """
        logger.info('Processing %s (%r).', self, self.fastq1)
        scratch_path = self.get_scratch_path()
        makedirs(scratch_path)
        use_gzip = force_gzip or self.fastq1.endswith('.gz')

        sample_info = self.load_sample_info()

        with open(self.read_summary_csv, 'w') as read_summary:
            trim((self.fastq1, self.fastq2),
                 self.bad_cycles_csv,
                 (self.trimmed1_fastq, self.trimmed2_fastq),
                 summary_file=read_summary,
                 use_gzip=use_gzip,
                 skip=self.skip,
                 project_code=sample_info.get('project'))

        if use_denovo:
            logger.info('Running merge_for_entropy on %s.', self)
            with open(self.read_entropy_csv, 'w') as read_entropy_csv:
                merge_for_entropy(self.trimmed1_fastq,
                                  self.trimmed2_fastq,
                                  read_entropy_csv,
                                  scratch_path)

            write_merge_lengths_plot(self.read_entropy_csv,
                                     self.merge_lengths_svg)

        logger.info('Running fastq_g2p on %s.', self)
        with open(self.trimmed1_fastq) as fastq1, \
                open(self.trimmed2_fastq) as fastq2, \
                open(self.g2p_csv, 'w') as g2p_csv, \
                open(self.g2p_summary_csv, 'w') as g2p_summary_csv, \
                open(self.g2p_unmapped1_fastq, 'w') as g2p_unmapped1, \
                open(self.g2p_unmapped2_fastq, 'w') as g2p_unmapped2, \
                open(self.g2p_aligned_csv, 'w') as g2p_aligned_csv, \
                open(self.merged_contigs_csv, 'w') as merged_contigs_csv:

            fastq_g2p(pssm=pssm,
                      fastq1=fastq1,
                      fastq2=fastq2,
                      g2p_csv=g2p_csv,
                      g2p_summary_csv=g2p_summary_csv,
                      unmapped1=g2p_unmapped1,
                      unmapped2=g2p_unmapped2,
                      aligned_csv=g2p_aligned_csv,
                      min_count=DEFAULT_MIN_COUNT,
                      min_valid=MIN_VALID,
                      min_valid_percent=MIN_VALID_PERCENT,
                      merged_contigs_csv=merged_contigs_csv)

        if use_denovo:
            self.run_denovo(excluded_seeds)
        else:
            self.run_mapping(excluded_seeds)

        logger.info('Running sam2aln on %s.', self)
        with open(self.remap_csv) as remap_csv, \
                open(self.aligned_csv, 'w') as aligned_csv, \
                open(self.conseq_ins_csv, 'w') as conseq_ins_csv, \
                open(self.failed_csv, 'w') as failed_csv, \
                open(self.clipping_csv, 'w') as clipping_csv:

            sam2aln(remap_csv,
                    aligned_csv,
                    conseq_ins_csv,
                    failed_csv,
                    clipping_csv=clipping_csv)

        logger.info('Running aln2counts on %s.', self)
        if use_denovo:
            contigs_path = self.contigs_csv
        else:
            contigs_path = os.devnull
        with open(self.aligned_csv) as aligned_csv, \
                open(self.g2p_aligned_csv) as g2p_aligned_csv, \
                open(self.clipping_csv) as clipping_csv, \
                open(self.conseq_ins_csv) as conseq_ins_csv, \
                open(self.remap_conseq_csv) as remap_conseq_csv, \
                open(contigs_path) as contigs_csv, \
                open(self.nuc_csv, 'w') as nuc_csv, \
                open(self.nuc_detail_csv, 'w') as nuc_detail_csv, \
                open(self.amino_csv, 'w') as amino_csv, \
                open(self.amino_detail_csv, 'w') as amino_detail_csv, \
                open(self.coord_ins_csv, 'w') as coord_ins_csv, \
                open(self.conseq_csv, 'w') as conseq_csv, \
                open(self.conseq_region_csv, 'w') as conseq_region_csv, \
                open(self.failed_align_csv, 'w') as failed_align_csv, \
                open(self.coverage_summary_csv, 'w') as coverage_summary_csv, \
                open(self.genome_coverage_csv, 'w') as genome_coverage_csv, \
                open(self.conseq_all_csv, "w") as conseq_all_csv, \
                open(self.minimap_hits_csv, "w") as minimap_hits_csv:

            if not use_denovo:
                # The detail reports are only produced in de novo mode:
                # close and delete the freshly opened files, then pass None
                # so aln2counts skips them.
                for f in (amino_detail_csv, nuc_detail_csv):
                    f.close()
                    os.remove(f.name)
                amino_detail_csv = nuc_detail_csv = None
            aln2counts(aligned_csv,
                       nuc_csv,
                       amino_csv,
                       coord_ins_csv,
                       conseq_csv,
                       failed_align_csv,
                       coverage_summary_csv=coverage_summary_csv,
                       clipping_csv=clipping_csv,
                       conseq_ins_csv=conseq_ins_csv,
                       g2p_aligned_csv=g2p_aligned_csv,
                       remap_conseq_csv=remap_conseq_csv,
                       conseq_region_csv=conseq_region_csv,
                       amino_detail_csv=amino_detail_csv,
                       nuc_detail_csv=nuc_detail_csv,
                       genome_coverage_csv=genome_coverage_csv,
                       contigs_csv=contigs_csv,
                       conseq_all_csv=conseq_all_csv,
                       minimap_hits_csv=minimap_hits_csv)

        logger.info('Running coverage_plots on %s.', self)
        os.makedirs(self.coverage_maps)
        with open(self.amino_csv) as amino_csv, \
                open(self.coverage_scores_csv, 'w') as coverage_scores_csv:
            coverage_plot(amino_csv,
                          coverage_scores_csv,
                          coverage_maps_path=self.coverage_maps,
                          coverage_maps_prefix=self.name,
                          excluded_projects=excluded_projects)

        with open(self.genome_coverage_csv) as genome_coverage_csv, \
                open(self.minimap_hits_csv) as minimap_hits_csv:
            if not use_denovo:
                minimap_hits_csv = None
            plot_genome_coverage(genome_coverage_csv,
                                 minimap_hits_csv,
                                 self.genome_coverage_svg)

        logger.info('Running cascade_report on %s.', self)
        with open(self.g2p_summary_csv) as g2p_summary_csv, \
                open(self.remap_counts_csv) as remap_counts_csv, \
                open(self.aligned_csv) as aligned_csv, \
                open(self.cascade_csv, 'w') as cascade_csv:
            cascade_report = CascadeReport(cascade_csv)
            cascade_report.g2p_summary_csv = g2p_summary_csv
            cascade_report.remap_counts_csv = remap_counts_csv
            cascade_report.aligned_csv = aligned_csv
            cascade_report.generate()
        logger.info('Finished sample %s.', self)
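
Read as a data flow, the stages connect through CSV and FASTQ files in the scratch directory: trim produces the trimmed FASTQs; fastq_g2p consumes them and emits g2p_aligned_csv plus the unmapped reads; run_mapping or run_denovo yields remap_csv; sam2aln converts that into aligned_csv, conseq_ins_csv, failed_csv, and clipping_csv; aln2counts aggregates everything into the nucleotide, amino, and consensus tables; and the plotting and cascade steps only read files written by earlier stages.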