def test_fastq(self):
        "It guesses the format for the illumina and sanger fastq files"
        # The backslash inside the quality string is written as "\\" because
        # a bare "\]" is an invalid escape sequence; the resulting text is
        # the same.
        name = "HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
        seq = "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
        qual = "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"

        # one sequence line and one quality line per read
        fhand = StringIO("@" + name + seq + "+" + name + qual)
        assert guess_format(fhand) == "fastq-illumina"

        # sequence and qualities split over two lines each
        txt = "@" + name + seq + seq + "+" + name + qual + qual
        fhand = StringIO(txt + txt)
        assert guess_format(fhand) == "fastq-illumina-multiline"

        # A truncated record can not be guessed. The returned value is not
        # compared with anything because the call itself is expected to raise.
        fhand = StringIO("@HWI-EAS209\n@")
        try:
            guess_format(fhand)
        except UnknownFormatError:
            pass
        else:
            self.fail("UnknownFormatError expected")

        # sanger
        txt = "@" + name + seq + "+" + name
        txt += "000000000000000000000000000000000000000000000000000000000000\n"
        fhand = StringIO(txt)
        assert guess_format(fhand) == "fastq"
Пример #2
0
    def test_fastq(self):
        'It guesses the format for the illumina and sanger fastq files'
        # The backslash inside the quality string is written as '\\' because
        # a bare '\]' is an invalid escape sequence; the resulting text is
        # the same.
        name = 'HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
        seq = 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
        qual = 'efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n'

        # one sequence line and one quality line per read
        fhand = StringIO('@' + name + seq + '+' + name + qual)
        assert guess_format(fhand) == 'fastq-illumina'

        # sequence and qualities split over two lines each
        txt = '@' + name + seq + seq + '+' + name + qual + qual
        fhand = StringIO(txt + txt)
        assert guess_format(fhand) == 'fastq-illumina-multiline'

        # A truncated record can not be guessed. The returned value is not
        # compared with anything because the call itself is expected to raise.
        fhand = StringIO('@HWI-EAS209\n@')
        try:
            guess_format(fhand)
        except UnknownFormatError:
            pass
        else:
            self.fail('UnknownFormatError expected')

        # sanger
        txt = '@' + name + seq + '+' + name
        txt += '000000000000000000000000000000000000000000000000000000000000\n'
        fhand = StringIO(txt)
        assert guess_format(fhand) == 'fastq'
 def test_unkown(self):
     "It checks that an unknown format is reported with an error"
     # the first line starts neither with '>' nor with '@'
     unknown_content = StringIO("xseq\nACTC\n")
     try:
         guess_format(unknown_content)
     except UnknownFormatError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail("UnknownFormatError expected")
Пример #4
0
 def test_unkown(self):
     'It checks that an unknown format is reported with an error'
     # the first line starts neither with '>' nor with '@'
     unknown_content = StringIO('xseq\nACTC\n')
     try:
         guess_format(unknown_content)
     except UnknownFormatError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail('UnknownFormatError expected')
Пример #5
0
 def test_empty_file(self):
     'It checks that guessing the format of an empty file raises an error'
     empty_fhand = StringIO()
     try:
         guess_format(empty_fhand)
     except UnknownFormatError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail('UnknownFormatError expected')
 def test_empty_file(self):
     "It checks that guessing the format of an empty file raises an error"
     empty_fhand = StringIO()
     try:
         guess_format(empty_fhand)
     except UnknownFormatError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail("UndecidedFastqVersionError expected".replace("UndecidedFastqVersionError", "UnknownFormatError"))
 def test_long_illumina(self):
     "The qualities look like illumina ones, but the read is too long"
     read_length = 400
     # a single fastq record: name, sequence, separator and qualities
     record_lines = ["@read", "T" * read_length, "+", "@" * read_length]
     fhand = StringIO("\n".join(record_lines) + "\n")
     try:
         guess_format(fhand)
     except UndecidedFastqVersionError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail("UndecidedFastqVersionError expected")
Пример #8
0
 def test_long_illumina(self):
     'The qualities look like illumina ones, but the read is too long'
     read_length = 400
     # a single fastq record: name, sequence, separator and qualities
     record_lines = ['@read', 'T' * read_length, '+', '@' * read_length]
     fhand = StringIO('\n'.join(record_lines) + '\n')
     try:
         guess_format(fhand)
     except UndecidedFastqVersionError:
         pass
     else:
         # reaching this point means that no error was raised
         self.fail('UndecidedFastqVersionError expected')
Пример #9
0
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator of seqrecords.

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise the given format is used for all the files. The records
    of all the files are chained into one single stream.
    '''
    must_guess = file_format == GUESS_FORMAT or file_format is None

    record_streams = []
    for fhand in fhands:
        fmt = remove_multiline(guess_format(fhand) if must_guess
                               else file_format)

        # only the iterators created in this function take title2ids
        uses_title = fmt in ('fasta', 'qual') or 'fastq' in fmt
        title = title2ids if uses_title else None

        if fmt == 'fasta':
            records = FastaIterator(fhand, title2ids=title)
        elif fmt == 'qual':
            records = QualPhredIterator(fhand, title2ids=title)
        elif fmt in ('fastq', 'fastq-sanger'):
            records = FastqPhredIterator(fhand, title2ids=title)
        elif fmt == 'fastq-solexa':
            records = FastqSolexaIterator(fhand, title2ids=title)
        elif fmt == 'fastq-illumina':
            records = FastqIlluminaIterator(fhand, title2ids=title)
        else:
            # any other format is handled by the generic parser
            records = parse_into_seqrecs(fhand, fmt)
        record_streams.append(records)
    return chain.from_iterable(record_streams)
Пример #10
0
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator of seqrecords.

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise the given format is used for all the files. The records
    of all the files are chained into one single stream.
    '''
    must_guess = file_format == GUESS_FORMAT or file_format is None

    record_streams = []
    for fhand in fhands:
        fmt = remove_multiline(guess_format(fhand) if must_guess
                               else file_format)

        # only the iterators created in this function take title2ids
        uses_title = fmt in ('fasta', 'qual') or 'fastq' in fmt
        title = title2ids if uses_title else None

        if fmt == 'fasta':
            records = FastaIterator(fhand, title2ids=title)
        elif fmt == 'qual':
            records = QualPhredIterator(fhand, title2ids=title)
        elif fmt in ('fastq', 'fastq-sanger'):
            records = FastqPhredIterator(fhand, title2ids=title)
        elif fmt == 'fastq-solexa':
            records = FastqSolexaIterator(fhand, title2ids=title)
        elif fmt == 'fastq-illumina':
            records = FastqIlluminaIterator(fhand, title2ids=title)
        else:
            # any other format is handled by the generic parser
            records = parse_into_seqrecs(fhand, fmt)
        record_streams.append(records)
    return chain.from_iterable(record_streams)
Пример #11
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    fpath is the path of the file to index. When file_format is None the
    format is guessed from the content of the file. It returns the index
    created by the Biopython index function.
    '''
    if file_format is None:
        # the handle is only needed for the guess, so it is closed afterwards
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = _remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    for fastq_format in fastq_formats:
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # the original accessors are restored even if the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
Пример #12
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    fpath is the path of the file to index. When file_format is None the
    format is guessed from the content of the file. It returns the index
    created by the Biopython index function.
    '''
    if file_format is None:
        # the handle is only needed for the guess, so it is closed afterwards
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    for fastq_format in fastq_formats:
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # the original accessors are restored even if the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
    def test_fasta(self):
        "It guesses the fasta and qual formats"
        qual = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
        qual += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

        # pairs of file content and the format expected for it
        cases = [
            (">seq\nACTC\n", "fasta"),
            # multiline fasta
            (">seq\nACTC\nACTG\n>seq2\nACTG\n", "fasta"),
            # qual with a single sequence
            (">seq\n10 20\n", "qual"),
            # qual with several sequences
            (qual, "qual"),
        ]
        for content, expected_format in cases:
            assert guess_format(StringIO(content)) == expected_format
Пример #14
0
    def test_fasta(self):
        'It guesses the fasta and qual formats'
        qual = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
        qual += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

        # pairs of file content and the format expected for it
        cases = [
            ('>seq\nACTC\n', 'fasta'),
            # multiline fasta
            ('>seq\nACTC\nACTG\n>seq2\nACTG\n', 'fasta'),
            # qual with a single sequence
            ('>seq\n10 20\n', 'qual'),
            # qual with several sequences
            (qual, 'qual'),
        ]
        for content, expected_format in cases:
            assert guess_format(StringIO(content)) == expected_format
Пример #15
0
def _read_seqitems(fhands, file_format):
    '''It returns an iterator of seq items (tuples of name and chunk).

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise the given format is used for all the files.

    It raises a NotImplementedError, before any item is yielded, if the
    format of any of the files is neither fasta nor a one line fastq.
    '''
    seq_iters = []
    for fhand in fhands:
        # The format is kept in a variable local to the iteration. Writing
        # the guess into file_format would make the following files reuse
        # the format guessed for the first one.
        if file_format == GUESS_FORMAT or file_format is None:
            fmt = guess_format(fhand)
        else:
            fmt = file_format

        if fmt == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fmt and 'fastq' in fmt:
            seq_iter = _itemize_fastq(fhand)
        else:
            msg = 'Format not supported by the itemizers: ' + fmt
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fmt)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
Пример #16
0
def read_seqs(fhands,
              file_format=GUESS_FORMAT,
              out_format=None,
              prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    When file_format is GUESS_FORMAT the input format is guessed from the
    first file. prefered_seq_classes lists the seq classes to try, in order
    of preference; it defaults to [SEQITEM, SEQRECORD] and it is never
    modified. SEQITEM is not tried when an output format different from the
    input one is requested.

    It raises a ValueError if there is no seq class left to try or if one of
    the given classes is unknown, and a RuntimeError if none of the classes
    could read the input format.
    '''
    if not prefered_seq_classes:
        prefered_seq_classes = [SEQITEM, SEQRECORD]

    if file_format == GUESS_FORMAT:
        in_format = guess_format(fhands[0])
    else:
        in_format = file_format

    if out_format not in (None, GUESS_FORMAT) and in_format != out_format:
        # seqitems is incompatible with different input and output formats.
        # A new list is built so the list given by the caller is not changed.
        prefered_seq_classes = [kind for kind in prefered_seq_classes
                                if kind != SEQITEM]

    if not prefered_seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in prefered_seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands, in_format)
            except NotImplementedError:
                # this class can not read the format, try the next one
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands, in_format)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            # str() keeps the intended ValueError for non string classes
            raise ValueError('Unknown class for seq: ' + str(seq_class))
    raise RuntimeError('We should not be here, fixme')
Пример #17
0
def read_seqs(fhands, file_format=GUESS_FORMAT, out_format=None,
              prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    When file_format is GUESS_FORMAT the input format is guessed from the
    first file. prefered_seq_classes lists the seq classes to try, in order
    of preference; it defaults to [SEQITEM, SEQRECORD] and it is never
    modified. SEQITEM is not tried when an output format different from the
    input one is requested.

    It raises a ValueError if there is no seq class left to try or if one of
    the given classes is unknown, and a RuntimeError if none of the classes
    could read the input format.
    '''
    if not prefered_seq_classes:
        prefered_seq_classes = [SEQITEM, SEQRECORD]

    if file_format == GUESS_FORMAT:
        in_format = guess_format(fhands[0])
    else:
        in_format = file_format

    if out_format not in (None, GUESS_FORMAT) and in_format != out_format:
        # seqitems is incompatible with different input and output formats.
        # A new list is built so the list given by the caller is not changed.
        prefered_seq_classes = [kind for kind in prefered_seq_classes
                                if kind != SEQITEM]

    if not prefered_seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in prefered_seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands, in_format)
            except NotImplementedError:
                # this class can not read the format, try the next one
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands, in_format)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            # str() keeps the intended ValueError for non string classes
            raise ValueError('Unknown class for seq: ' + str(seq_class))
    raise RuntimeError('We should not be here, fixme')
Пример #18
0
def parse_basic_args(parser):
    '''It parses the command line and it returns a dict with the arguments.

    parser is the argument parser used to read the command line; its error
    method is called when an output file can not be compressed.

    NOTE(review): only the first part of the function is visible here, the
    code that builds and returns the result is not shown.
    '''
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if 'fastq' in in_format:
        # only the first input file is used for the guess
        guessed_in_format = guess_format(wrapped_fhands[0])
        if '-one_line' in guessed_in_format:
            in_format += '-one_line'
    else:
        guessed_in_format = None

    # OUTFILE is the name of the parsed attribute with the output file(s)
    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    # the output can be a list of fhands, each of them is replaced by the
    # result of compress_fhand
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
Пример #19
0
def parse_basic_args(parser):
    '''It parses the command line and it returns a dict with the arguments.

    parser is the argument parser used to read the command line; its error
    method is called when an output file can not be compressed.

    NOTE(review): only the first part of the function is visible here, the
    code that builds and returns the result is not shown.
    '''
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if 'fastq' in in_format:
        # only the first input file is used for the guess
        guessed_in_format = guess_format(wrapped_fhands[0])
        if '-one_line' in guessed_in_format:
            in_format += '-one_line'
    else:
        guessed_in_format = None

    # OUTFILE is the name of the parsed attribute with the output file(s)
    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    # the output can be a list of fhands, each of them is replaced by the
    # result of compress_fhand
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
Пример #20
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    The sequences found in in_fhands are written into out_fhand using the
    out_format. When there is only one input file and it is already in the
    output format no conversion is done: the content is copied or, if
    copy_if_same_format is False, rel_symlink is called with the names of
    the input and output files.

    It raises an IncompatibleFormatError if the output format is not
    supported or if the output requires qualities that are not available,
    and a MalformedFile if the writer reports a quality disagreement.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [remove_multiline(guess_format(fhand)) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError as error:
            # the known ValueErrors are translated, the rest are re-raised
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
Пример #21
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    The sequences found in in_fhands are written into out_fhand using the
    out_format. When there is only one input file and it is already in the
    output format no conversion is done: the content is copied or, if
    copy_if_same_format is False, rel_symlink is called with the names of
    the input and output files.

    It raises an IncompatibleFormatError if the output format is not
    supported or if the output requires qualities that are not available,
    and a MalformedFile if the writer reports a quality disagreement.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [remove_multiline(guess_format(fhand)) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError as error:
            # the known ValueErrors are translated, the rest are re-raised
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
Пример #22
0
def _read_seqitems(fhands, file_format):
    '''It returns an iterator of seq items (tuples of name and chunk).

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise the given format is used for all the files.

    It raises a NotImplementedError, before any item is yielded, if the
    format of any of the files is neither fasta nor a one line fastq. A
    ValueError raised by the call that creates the fastq itemizer is
    translated into a MalformedFile when it is a quality disagreement.
    '''
    seq_iters = []
    for fhand in fhands:
        # The format is kept in a variable local to the iteration. Writing
        # the guess into file_format would make the following files reuse
        # the format guessed for the first one.
        if file_format == GUESS_FORMAT or file_format is None:
            fmt = guess_format(fhand)
        else:
            fmt = file_format

        if fmt == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fmt and 'fastq' in fmt:
            try:
                seq_iter = _itemize_fastq(fhand)
            except ValueError as error:
                if error_quality_disagree(error):
                    raise MalformedFile(str(error))
                raise
        else:
            msg = 'Format not supported by the itemizers: ' + fmt
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fmt)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
Пример #23
0
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
    else:
        try:
            out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind)
        except RuntimeError, error:
            parser.error(error)

    # The default output format is the same as the first file
    if in_format == GUESS_FORMAT:
        if not guessed_in_format:
            guessed_in_format = guess_format(wrapped_fhands[0])
        out_format = guessed_in_format
    else:
        out_format = in_format

    # The original fhands should be stored, because otherwise they would be
    # closed
    args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands,
            'out_format': out_format, 'original_in_fhands': in_fhands,
            'in_format': in_format}
    return args, parsed_args


def parse_basic_parallel_args(parser):
    'It parses the command line and it returns a dict with the arguments.'
    args, parsed_args = parse_basic_args(parser)
Пример #24
0
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
    else:
        try:
            out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind)
        except RuntimeError, error:
            parser.error(error)

    # The default output format is the same as the first file
    if in_format == GUESS_FORMAT:
        if not guessed_in_format:
            guessed_in_format = guess_format(wrapped_fhands[0])
        out_format = guessed_in_format
    else:
        out_format = in_format

    # The original fhands should be stored, because otherwise they would be
    # closed
    args = {
        'out_fhand': out_fhand,
        'in_fhands': wrapped_fhands,
        'out_format': out_format,
        'original_in_fhands': in_fhands,
        'in_format': in_format
    }
    return args, parsed_args