Example #1
def sample_distance_iter(seq_iter,
                         sample_size,
                         per_site=True,
                         aligned=False,
                         ignore_gaps=True,
                         alphabet=None,
                         aligner_tools=['mafft', 'muscle'],
                         rng=None):
    seqs = dataio.BufferedIter(seq_iter)
    seqs_to_sample = dataio.BufferedIter(seqs)
    for seq1 in seqs:
        samples = functions.sample_iter(iterable=seqs_to_sample,
                                        sample_size=sample_size,
                                        exclude=[seq1],
                                        exclude_attribute='id',
                                        rng=rng)
        for seq2 in samples:
            assert seq1.id != seq2.id
            d, drc = get_distances(seq1=seq1,
                                   seq2=seq2,
                                   per_site=per_site,
                                   aligned=aligned,
                                   ignore_gaps=ignore_gaps,
                                   alphabet=alphabet,
                                   aligner_tools=aligner_tools)
            yield seq1, seq2, d, drc
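
A minimal usage sketch (the input file name, format, sample size, and seed are placeholders; the imports follow the package layout shown in Example #8):

from seqsift.utils import GLOBAL_RNG, dataio

GLOBAL_RNG.seed(12345)  # seed for reproducible sampling (mirrors seqvet.py)
seqs = dataio.get_seq_iter(['seqs.fasta'], format='fasta', data_type='dna')
for seq1, seq2, d, drc in sample_distance_iter(seqs,
                                               sample_size=10,
                                               rng=GLOBAL_RNG):
    # drc is the reverse-complement distance (cf. the rev_comp_distance
    # column written by seqvet.py); a drc well below d hints at a
    # possible orientation error.
    if drc < d:
        print('{0} vs {1}: check orientation'.format(seq1.id, seq2.id))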
Example #2
def get_population_pair_diversity_summary(seq_iter1,
                                          seq_iter2,
                                          per_site=True,
                                          aligned=False,
                                          ignore_gaps=True,
                                          alphabet=None,
                                          aligner_tools=['mafft', 'muscle']):
    sum_diffs = 0.0
    seqs_1 = dataio.BufferedIter(seq_iter1)
    seqs_2 = dataio.BufferedIter(seq_iter2)
    pi_1 = average_number_of_pairwise_differences(seqs_1,
                                                  per_site=per_site,
                                                  aligned=aligned,
                                                  ignore_gaps=ignore_gaps,
                                                  alphabet=alphabet,
                                                  aligner_tools=aligner_tools)
    pi_2 = average_number_of_pairwise_differences(seqs_2,
                                                  per_site=per_site,
                                                  aligned=aligned,
                                                  ignore_gaps=ignore_gaps,
                                                  alphabet=alphabet,
                                                  aligner_tools=aligner_tools)
    assert pi_1 is not None
    assert pi_2 is not None
    n = 0
    for seq1 in seqs_1:
        for seq2 in seqs_2:
            sum_diffs += distance(seq1=seq1,
                                  seq2=seq2,
                                  per_site=per_site,
                                  aligned=aligned,
                                  ignore_gaps=ignore_gaps,
                                  alphabet=alphabet,
                                  aligner_tools=aligner_tools)
            n += 1
    assert n > 0
    pi_b = sum_diffs / n
    pi_w = (pi_1 + pi_2) / 2.0
    pi_net = pi_b - pi_w
    return {
        "pi_1": pi_1,
        "pi_2": pi_2,
        "pi_within": pi_w,
        "pi_between": pi_b,
        "pi_net": pi_net,
    }
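
The returned values follow the usual within/between decomposition: pi_within is the mean of the two within-population estimates, pi_between is the mean distance over all cross-population pairs, and pi_net = pi_between - pi_within (the net divergence). A hypothetical usage sketch (file names and format are placeholders):

from seqsift.utils import dataio

pop1 = dataio.get_seq_iter(['pop1.fasta'], format='fasta', data_type='dna')
pop2 = dataio.get_seq_iter(['pop2.fasta'], format='fasta', data_type='dna')
summary = get_population_pair_diversity_summary(pop1, pop2, per_site=True)
# For example, with pi_1 = 0.02, pi_2 = 0.04, and pi_between = 0.10:
#   pi_within = (0.02 + 0.04) / 2.0 = 0.03
#   pi_net    = 0.10 - 0.03         = 0.07
print(summary['pi_net'])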
Example #3
def duplicate_id_filter(seq_iter):
    ids = set()
    checked_seqs = dataio.BufferedIter()
    for seq in seq_iter:
        if seq.id in ids:
            for prev_seq in checked_seqs:
                if prev_seq.id == seq.id:
                    break
            if not sequtils.sequences_are_equal(prev_seq, seq):
                raise Exception('Found {0} more than once, but with '
                                'different data!'.format(seq.id))
            continue
        else:
            checked_seqs.append(seq)
        ids.add(seq.id)
        yield seq
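
The filter is itself a generator, so it can sit in a processing pipeline: each ID is yielded once, exact duplicate records are silently dropped, and an ID that reappears with different sequence data raises an exception. A minimal sketch (the input path and format are placeholders):

from seqsift.utils import dataio

seqs = dataio.get_seq_iter(['seqs.fasta'], format='fasta', data_type='dna')
# Consuming the generator triggers the checks; an Exception is raised if
# the same ID occurs with conflicting data.
unique_seqs = list(duplicate_id_filter(seqs))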
Example #4
def pairwise_distance_iter(seq_iter,
                           per_site=True,
                           aligned=False,
                           ignore_gaps=True,
                           alphabet=None,
                           aligner_tools=['mafft', 'muscle']):
    seqs = dataio.BufferedIter(seq_iter)
    for seq1, seq2 in itertools.combinations(seqs, 2):
        d, drc = get_distances(seq1=seq1,
                               seq2=seq2,
                               per_site=per_site,
                               aligned=aligned,
                               ignore_gaps=ignore_gaps,
                               alphabet=alphabet,
                               aligner_tools=aligner_tools)
        yield seq1, seq2, d, drc
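
One way the iterator might be consumed is to collect the largest distance seen for each sequence ID, similar to the per-sequence summaries seqvet.py writes (the input path and format are placeholders):

from seqsift.utils import dataio

seqs = dataio.get_seq_iter(['seqs.fasta'], format='fasta', data_type='dna')
max_dist = {}
for seq1, seq2, d, drc in pairwise_distance_iter(seqs, per_site=True):
    for s in (seq1, seq2):
        max_dist[s.id] = max(d, max_dist.get(s.id, 0.0))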
Example #5
def variable_columns(seq_iter):
    seqs = dataio.BufferedIter(seq_iter)
    ref_seq = None
    align_length = None
    is_variable = None
    for i, seq_record in enumerate(seqs):
        if align_length is None:
            align_length = len(seq_record)
            is_variable = [False] * align_length
            ref_seq = seq_record
            continue
        elif len(seq_record) != align_length:
            raise AlignmentError('Sequence {0} has unexpected '
                                 'length {1}.'.format(seq_record.name,
                                                      len(seq_record)))
        for j, character in enumerate(seq_record.seq):
            if character.lower() != ref_seq.seq[j].lower():
                is_variable[j] = True
    return is_variable, seqs
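
Because the BufferedIter is returned alongside the boolean mask, the caller can keep iterating over the buffered sequences after the scan. A usage sketch (assumes an already aligned input; the path and format are placeholders):

from seqsift.utils import dataio

seqs = dataio.get_seq_iter(['alignment.fasta'], format='fasta', data_type='dna')
is_variable, seqs = variable_columns(seqs)
variable_sites = [i for i, v in enumerate(is_variable) if v]
print('{0} variable columns'.format(len(variable_sites)))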
Example #6
def column_frequencies(seq_iter, character_list=['-', '?']):
    seqs = dataio.BufferedIter(seq_iter)
    char_list = [c.lower() for c in character_list]
    column_counts = []
    align_length = None
    for i, seq_record in enumerate(seqs):
        if align_length is None:
            align_length = len(seq_record)
            column_counts = [0] * align_length
        else:
            if len(seq_record) != align_length:
                raise AlignmentError('Sequence {0} has unexpected '
                                     'length {1}.'.format(
                                         seq_record.name, len(seq_record)))
        for j, character in enumerate(seq_record.seq):
            if character.lower() in char_list:
                column_counts[j] += 1
    column_freqs = [count / float(i + 1) for count in column_counts]
    return column_freqs, seqs
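
Note that the divisor i + 1 is the number of sequences seen, so the function assumes at least one input sequence. A usage sketch that flags columns dominated by gap/missing characters (the path, format, and 0.5 cutoff are placeholders):

from seqsift.utils import dataio

seqs = dataio.get_seq_iter(['alignment.fasta'], format='fasta', data_type='dna')
freqs, seqs = column_frequencies(seqs, character_list=['-', '?'])
gappy_columns = [i for i, f in enumerate(freqs) if f > 0.5]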
Example #7
def average_number_of_pairwise_differences(seq_iter,
                                           per_site=True,
                                           aligned=False,
                                           ignore_gaps=True,
                                           alphabet=None,
                                           aligner_tools=['mafft', 'muscle']):
    sum_diffs = 0.0
    seqs = dataio.BufferedIter(seq_iter)
    i = -1
    for i, (seq1, seq2) in enumerate(itertools.combinations(seqs, 2)):
        sum_diffs += distance(seq1=seq1,
                              seq2=seq2,
                              per_site=per_site,
                              aligned=aligned,
                              ignore_gaps=ignore_gaps,
                              alphabet=alphabet,
                              aligner_tools=aligner_tools)
    if i < 0:
        return None
    return sum_diffs / (i + 1)
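
Because the loop enumerates itertools.combinations(seqs, 2), the divisor i + 1 after the loop equals the number of pairs, n * (n - 1) / 2, so the return value is the mean pairwise distance (with per_site=True this corresponds to nucleotide diversity, pi); with fewer than two sequences the function returns None. A small sanity-check sketch:

import itertools

n = 5
pairs = list(itertools.combinations(range(n), 2))
assert len(pairs) == n * (n - 1) // 2  # 10 pairs for 5 sequences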
Example #8
File: seqvet.py  Project: joaks1/SeqSift
def main_cli():
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number of '
              'sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`--data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start the next reading frame at the next '
              'start codon after a stop codon. This option might be '
              'useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        if not args.seed:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--from' option or have a recognizable\n"
                  "file extension on the input file name.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths
            and (not args.data_type in ['dna', 'rna'])):
        log.error("`--summarize-reading-frame-lengths` is only compatible "
                  "with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    if (args.compare_translated and (not args.data_type in ['dna', 'rna'])):
        log.error("`-compare-translated` is only compatible with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    distances = sorted([(k, v) for k, v in iteritems(distances)],
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))