Exemplo n.º 1
0
 def test_empty_file(self):
     "Guessing the format of an empty file raises UnknownFormatError"
     empty_fhand = StringIO()
     try:
         guess_format(empty_fhand)
     except UnknownFormatError:
         # this is the expected outcome
         pass
     else:
         self.fail("UnknownFormatError expected")
Exemplo n.º 2
0
 def test_unkown(self):
     "Guessing the format of an unknown content raises UnknownFormatError"
     unknown_fhand = StringIO("xseq\nACTC\n")
     try:
         guess_format(unknown_fhand)
     except UnknownFormatError:
         # this is the expected outcome
         pass
     else:
         self.fail("UnknownFormatError expected")
Exemplo n.º 3
0
 def test_long_illumina(self):
     "The qualities look like illumina ones, but the reads are too long"
     read_length = 400
     # a single four line fastq record: title, sequence, separator, quals
     record_lines = ["@read", "T" * read_length, "+", "@" * read_length]
     fhand = StringIO("\n".join(record_lines) + "\n")
     try:
         guess_format(fhand)
     except UndecidedFastqVersionError:
         # this is the expected outcome
         pass
     else:
         self.fail("UndecidedFastqVersionError expected")
Exemplo n.º 4
0
    def test_fasta(self):
        "It tells apart the fasta and the qual contents"
        # every qual record holds eight quality values of 30
        quals = " ".join(["30"] * 8)
        long_qual = "".join([">seq%d\n%s\n" % (number, quals)
                             for number in (1, 2, 3)])

        # (file content, expected format)
        cases = ((">seq\nACTC\n", "fasta"),
                 (">seq\n10 20\n", "qual"),
                 (long_qual, "qual"))
        for content, expected_format in cases:
            fhand = StringIO(content)
            assert guess_format(fhand) == expected_format
Exemplo n.º 5
0
def seqio(in_fhands, out_fhands, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    The format of every input file is guessed with guess_format. Then:

      - One named input file that is already in out_format is copied to
        the first output file or, if copy_if_same_format is False, the
        output path is made a relative symlink to the input path.
      - One input and one output file are converted with SeqIO.convert.
      - One input and two output files with a 'fasta' out_format are
        written record by record: out_format to the first output and
        'qual' to the second one.

    A ValueError for which error_quality_disagree is true is re-raised as
    MalformedFile, any other ValueError is propagated unchanged.
    '''

    in_formats = [guess_format(fhand) for fhand in in_fhands]

    if (len(in_formats) == 1 and in_formats[0] == out_format and
        hasattr(in_fhands[0], 'name')):
        # Nothing to convert, the input is already in the requested format
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhands[0])
        else:
            rel_symlink(in_fhands[0].name, out_fhands[0].name)

    elif len(in_fhands) == 1 and len(out_fhands) == 1:
        try:
            SeqIO.convert(in_fhands[0], in_formats[0], out_fhands[0],
                          out_format)
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
    elif (len(in_fhands) == 1 and len(out_fhands) == 2 and
          out_format == 'fasta'):
        # The sequences go to the first output and the qualities to the
        # second one
        try:
            for seq in read_seqrecords([in_fhands[0]]):
                SeqIO.write([seq], out_fhands[0], out_format)
                SeqIO.write([seq], out_fhands[1], 'qual')
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
Exemplo n.º 6
0
    def test_fastq(self):
        "It guesses the format for the illumina and sanger fastq"

        txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
        txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
        txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
        # The backslash is one of the quality characters, so it is escaped
        # explicitly instead of relying on the invalid escape sequence "\]"
        txt += "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"
        fhand = StringIO(txt)
        assert guess_format(fhand) == "fastq-illumina"

        # A title line followed by a lone "@" should not be recognized
        fhand = StringIO("@HWI-EAS209\n@")
        try:
            # No result is compared here, the call itself is expected to raise
            guess_format(fhand)
            self.fail("UnknownFormatError expected")
        except UnknownFormatError:
            pass

        # sanger
        txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
        txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
        txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
        txt += "000000000000000000000000000000000000000000000000000000000000\n"
        fhand = StringIO(txt)
        assert guess_format(fhand) == "fastq"
Exemplo n.º 7
0
def count_seqs_in_files(fhands, file_format=GUESS_FORMAT):
    '''It counts the seqs in the given files.

    If file_format is GUESS_FORMAT or None the format is guessed
    independently for every file, otherwise the given format is used for
    all of them.

    It returns the total number of sequences found in all the files.
    '''
    guess_required = file_format == GUESS_FORMAT or file_format is None

    count = 0
    for fhand in fhands:
        # The format is kept in a per file variable. Overwriting file_format
        # would make the format guessed for the first file stick for the
        # remaining ones.
        if guess_required:
            fmt = guess_format(fhand)
        else:
            fmt = file_format

        if fmt == 'fasta':
            count += _count_seqs_in_fasta(fhand)
        elif 'fastq' in fmt:
            count += length(QualityIO.FastqGeneralIterator(fhand))
        else:
            # The already resolved format is handed over, so an explicitly
            # requested format is honored by the generic reader
            count += length(read_seqrecords([fhand], file_format=fmt))
    return count
Exemplo n.º 8
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    If file_format is None the format is guessed from the file content.
    It returns the index created by Biopython for the file in fpath.
    '''
    if file_format is None:
        # The handle is only needed to guess the format, so it is closed
        # right away
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    for fastq_format in ('fastq', 'fastq-sanger', 'fastq-solexa',
                         'fastq-illumina'):
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are put back even if the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
Exemplo n.º 9
0
def read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator of the seqrecords found in the given files.

    If file_format is GUESS_FORMAT or None the format of every file is
    guessed, otherwise the given format is used for all the files.
    The fasta, qual and fastq files are read with title2ids, any other
    format is handed over to SeqIO.parse.
    '''
    guess_required = file_format == GUESS_FORMAT or file_format is None

    def _seqrecords_in_file(handle):
        'It creates the seqrecord iterator for one file'
        kind = guess_format(handle) if guess_required else file_format
        if kind == 'fasta':
            return FastaIO.FastaIterator(handle, title2ids=title2ids)
        if kind == 'qual':
            return QualityIO.QualPhredIterator(handle, title2ids=title2ids)
        if kind in ('fastq', 'fastq-sanger'):
            return QualityIO.FastqPhredIterator(handle, title2ids=title2ids)
        if kind == 'fastq-solexa':
            return QualityIO.FastqSolexaIterator(handle, title2ids=title2ids)
        if kind == 'fastq-illumina':
            return QualityIO.FastqIlluminaIterator(handle,
                                                   title2ids=title2ids)
        # any other format is parsed by the generic Biopython reader
        return SeqIO.parse(handle, kind)

    # The list is built eagerly, so all the iterators are created before
    # the first seqrecord is requested
    per_file_iters = [_seqrecords_in_file(handle) for handle in fhands]
    return chain.from_iterable(per_file_iters)
Exemplo n.º 10
0
                print 'hello', error
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
    else:
        try:
            out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind)
        except RuntimeError, error:
            parser.error(error)

    out_format = parsed_args.out_format
    # The default format is the same as the first file
    if not out_format:
        if in_format == GUESS_FORMAT:
            out_format = guess_format(wrapped_fhands[0])
        else:
            out_format = in_format
    # The original fhands should be stored, because otherwise they would be
    # closed
    args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands,
            'out_format': out_format, 'original_in_fhands': in_fhands,
            'in_format': in_format}
    return args, parsed_args


def parse_basic_process_args(parser):
    '''It parses the command line adding the number of processes.

    It returns the dict created by parse_basic_args extended with the
    'processes' key, together with the parsed arguments.
    '''
    basic_args, namespace = parse_basic_args(parser)
    basic_args['processes'] = namespace.processes
    return basic_args, namespace