def test_cmd_all_ok_long_options(self, cmd_all_ok_long_options) -> None:
        # Function tests how `parse_arguments` parses correct long options.
        # `cmd_all_ok_long_options` is a fixture: a fake argv list where
        # odd indices are option flags and even indices are their values.

        # Backup argv
        buff_argv: List[str] = sys.argv
        # Set test argv
        sys.argv = cmd_all_ok_long_options

        try:
            # Parse arguments
            params: args.HighlighterParams = args.parse_arguments()
        finally:
            # Restore argv (was `argv = buff_argv`, which only bound a
            # local name and left the fake argv leaking into other tests)
            sys.argv = buff_argv
        # end try

        assert params.target_fasta_fpath == cmd_all_ok_long_options[2]
        assert params.bam_fpath == cmd_all_ok_long_options[4]
        assert params.outfpath == cmd_all_ok_long_options[6]
        assert params.suppress_zero_cov_output == True
        assert params.topology == 'circular'
        assert params.organism == cmd_all_ok_long_options[12]

        obtained_threshold_repr: str = ','.join(
            map(lambda covthr: str(covthr.get_coverage()),
                params.coverage_thresholds))
        expected_threshold_repr: str = cmd_all_ok_long_options[8]
        assert obtained_threshold_repr == expected_threshold_repr
    def test_cmd_bam_missing(self, cmd_bam_missing) -> None:
        # Function tests that `parse_arguments` exits (SystemExit) when the
        # required BAM option is missing from the command line.

        # Backup argv
        buff_argv: List[str] = sys.argv
        # Set test argv
        sys.argv = cmd_bam_missing

        try:
            # Parse arguments: must terminate with SystemExit
            with pytest.raises(SystemExit):
                args.parse_arguments()
            # end with
        finally:
            # Restore argv (was `argv = buff_argv`, which only bound a
            # local name and never restored `sys.argv`)
            sys.argv = buff_argv
        # end try
# Example #3 (0 votes)
def main():
    """Sort a training manifest with a scoring function and save the result.

    Reads the TSV manifest named by ``--manifest``, applies the scoring
    function selected by the CLI arguments, and writes
    ``sorted_manifest.tsv`` into ``--out_dir``.
    """
    # Get the arguments for the current execution.
    args = parse_arguments()
    print(args)

    # Read the training manifest tsv file.
    df = pd.read_csv(args.manifest, sep='\t')

    # Define the scoring function and apply it to the manifest.
    scoring_function = ScoringFunction().create(args)
    df = scoring_function(df)

    # Create the directory for output manifest storage.
    os.makedirs(args.out_dir, exist_ok=True)

    # Save the sorted manifest without the row index
    # (`index=False` is the documented bool form; `None` relied on
    # falsiness of a non-bool argument).
    df.to_csv(os.path.join(args.out_dir, 'sorted_manifest.tsv'),
              sep='\t',
              index=False)
# Example #4 (0 votes)
def main():
    """Split a training manifest into paced sub-datasets and save them.

    Reads the TSV manifest named by ``--manifest``, applies the pacing
    function selected by the CLI arguments, writes one ``train_<i>.tsv``
    per resulting chunk into ``--out_dir``, and optionally kicks off
    fairseq training on the chunk list.
    """
    # Get the arguments for the current execution.
    args = parse_arguments()
    print(args)

    # Read the training manifest tsv file.
    df = pd.read_csv(args.manifest, sep='\t')

    # Define the pacing function and apply it to the sorted manifest.
    pacing_function = PacingFunction().create(args)
    df_list = pacing_function(df)

    # Create the directory for output smaller datasets storage.
    # (makedirs with exist_ok avoids the check-then-create race and
    # matches the sibling scoring `main`.)
    os.makedirs(args.out_dir, exist_ok=True)

    df_paths_list = []
    for index, sub_df in enumerate(df_list):
        path = os.path.join(args.out_dir, f'train_{index}.tsv')
        # `index=False`: do not write the DataFrame row index column.
        sub_df.to_csv(path, sep='\t', index=False)
        df_paths_list.append(path)

    if args.fairseq:
        train_fairseq(args, df_paths_list)
# Example #5 (0 votes)
def main(version: str, last_update_date: str) -> None:
    """Annotate reference sequences with coverage features and write GenBank.

    Pipeline: read the reference fasta, count per-base coverages from the
    BAM file with `samtools depth`, highlight regions matching each coverage
    threshold, and append one annotated GenBank record per sequence to the
    output file.

    :param version: program version string, embedded into each feature note;
    :param last_update_date: currently unused here — NOTE(review): kept for
        interface compatibility with the caller; confirm before removing.
    """

    # Parse arguments
    params: HighlighterParams = parse_arguments()

    # This string will be used for annotation of result GenBank file
    base_feature_note: str = f'generated by consensus-highlighter v{version}'

    # String for storing info about warnings: stays empty unless a sequence
    # is skipped, then flips the final message to "Completed with warnings!"
    with_warnings: str = ''

    # Read fasta records from input file
    print('Importing fasta from `{}`...'.format(params.target_fasta_fpath),
          end=' ')
    sys.stdout.flush()
    fasta_records: Sequence[SeqRecord] = pfr.parse_fasta_reference(
        params.target_fasta_fpath)
    print('done')

    # Create output directory and an empty output file
    # (note: `create_or_emply_file` [sic] is the helper's actual name)
    _create_outdir_from_outfile(params.outfpath)
    out.create_or_emply_file(params.outfpath)

    # Obtain path to coverage file
    coverage_fpath: str = out.conf_path_to_depth_file(params.outfpath)

    # Count coverages with samtools depth
    print('Silently counting coverages with `samtools depth`...', end=' ')
    sys.stdout.flush()
    cov_fpath: str = oc.count_cov_for_all_refs(params.target_fasta_fpath,
                                               params.bam_fpath,
                                               coverage_fpath)
    print('done\n')

    # Proceed with annotation: one pass per reference sequence
    rec: SeqRecord
    for rec in fasta_records:

        print(f'Processing sequence `{rec.description}`')

        # Obtain coverages for current sequence
        cov_array: CoverageArray = oc.get_coverage_for_reference(
            rec.id, cov_fpath)

        # Check length of the coverage array: empty means `samtools depth`
        # reported nothing for this id — skip the sequence with a warning
        if len(cov_array) == 0:
            print(
                f'!  Warning: no coverage information found for sequence `{rec.id}`.'
            )
            print(
                f"""!  Please, make sure that field `RNAME` (3-rd column) in your BAM file contains
!    id of this sequence specified in fasta header (i.e. `{rec.id}`).""")
            print('! Omitting this sequence.')
            print('=' * 10)
            with_warnings = ' with warnings'
            continue
        # end if

        # Coverage positions must match sequence length exactly, otherwise
        # feature coordinates would be wrong — skip with a warning
        if len(cov_array) != len(rec.seq):
            print(
                f"""!  Warning: length of sequence `{rec.id}` ({len(rec.seq)} bp)
!    is not equal to number of coverage positions ({len(cov_array)}) reported by `samtools depth`
!    and stored in coverage file `{cov_fpath}`.""")
            print(
                '!  Re-creating the bam file might be the solution of this issue.'
            )
            print('!  Omitting this sequence.')
            print('=' * 10)
            with_warnings = ' with warnings'
            continue
        # end if

        # Report average coverage, rounded to 2 decimal places
        mean_coverage = round(sts.mean(cov_array.coverages), 2)
        print(f'Average coverage: {mean_coverage}')

        cov_threshold: CoverageThreshold
        coverage_features: MutableSequence[SeqFeature]

        # Detect all necessary coverage features, one scan per threshold
        for cov_threshold in params.coverage_thresholds:

            print(
                f'Screening the sequence for regions with {cov_threshold.get_label()}...',
                end=' ')
            sys.stdout.flush()

            # Get coverage features
            coverage_features = hlft.highlight_coverage_features(
                cov_array, cov_threshold, base_feature_note)

            # Drop features duplicating ones already present on the record
            coverage_features = ddf.dedupl_features(coverage_features,
                                                    rec.features)

            # Append features to list
            rec.features.extend(coverage_features)
            print('done')
        # end for

        print(f'Writing annotated sequence to `{params.outfpath}`...', end=' ')
        sys.stdout.flush()

        # Write result GenBank record (appended to the shared output file)
        out.write_genbank_output(rec, params.topology, params.organism,
                                 params.outfpath)
        print('done')

        print('=' * 10)
    # end for

    print(f'Completed{with_warnings}!')