Example #1
File: glimmer.py Project: sadikmu/mgkit
def parse_glimmer3(file_handle):
    """
    Parses an output file from glimmer3 and yields the header and prediction
    lines. Used to feed the :func:`mgkit.io.gff.from_glimmer3` function.

    Arguments:
        file_handle (str, file): file name or file handle to read from

    Yields:
        tuple: first element is the header of the sequence the gene was
        predicted on and the second is the prediction line
    """
    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'r')

    curr_seq = ''
    predictions = []

    for line in file_handle:
        line = line.strip()
        if line.startswith('>'):
            if len(predictions) > 0:
                for prediction in predictions:
                    yield curr_seq, prediction
            curr_seq = line[1:]
            predictions = []
        else:
            if line != '':
                predictions.append(line)
    else:
        # the else clause of the for loop always runs (there is no break):
        # flush the predictions collected for the last sequence in the file
        if len(predictions) > 0:
            for prediction in predictions:
                yield curr_seq, prediction
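
A minimal usage sketch, assuming a hypothetical file name and that mgkit.io.gff.from_glimmer3 accepts the header and prediction line, as the docstring above suggests:

# Usage sketch: build annotations from each (header, prediction) pair.
# 'glimmer3-output.txt' is a hypothetical file name.
from mgkit.io import gff

for header, prediction in parse_glimmer3('glimmer3-output.txt'):
    annotation = gff.from_glimmer3(header, prediction)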
Example #2
def load_htseq_counts(file_handle, conv_func=int):
    """
    .. versionchanged:: 0.1.15
        added *conv_func* parameter

    Loads an HTSeq-count result file

    Arguments:

        file_handle (file or str): file handle or string with file name
        conv_func (func): function to convert the number from string, defaults
            to *int*, but *float* can be used as well

    Yields:
        tuple: first element is the gene_id and the second is the count

    """

    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'rb')

    if getattr(file_handle, 'name', None) is not None:
        LOG.info("Loading HTSeq-count file %s", file_handle.name)

    for line in file_handle:
        line = line.decode('ascii')
        gene_id, count = line.rstrip().split('\t')

        if line.startswith('__') or (gene_id in SKIP):
            continue

        yield gene_id, conv_func(count)
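
Since load_htseq_counts yields (gene_id, count) pairs, the result can be collected directly into a dictionary. A minimal sketch; the file name is hypothetical:

# Usage sketch: materialise the counts into a dict
counts = dict(load_htseq_counts('sample-counts.tsv', conv_func=float))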
Example #3
def test_Annotation_to_file(gff_file, tmpdir):

    ann = gff.from_gff(gff_file[0])

    file_name = (tmpdir / 'test-write.gff').strpath
    file_handle = open_file(file_name, 'wb')
    ann.to_file(file_handle)
    file_handle.close()

    ann2 = next(gff.parse_gff(file_name))

    assert ann == ann2
Example #4
def test_write_fastq2(fastq_file, tmpdir):

    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath

    file_handle = open_file(file_name, 'w')

    fastq.write_fastq_sequence(file_handle, header, seq, qual)
    file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))
Example #5
def sample_command(verbose, prefix, number, prob, max_seq, fastq, gzip,
                   input_file):
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)
    LOG.info("Sampling %s file (%d) chunks with prefix (%s)",
             'FastQ' if fastq else 'Fasta', number, prefix)

    if (prob > 1) or (prob <= 0):
        utils.exit_script(
            "The probability value ({}) is outside the correct range"
            " (0 < p <= 1)".format(prob), 1)

    dist = scipy.stats.binom(1, prob)

    LOG.info("Probability of picking a sequence (%.5f), max number of seqs %d",
             prob, max_seq)
    name_mask = "%s-{0:05}.%s" % (prefix, 'fq' if fastq else 'fa')

    if gzip:
        name_mask += '.gz'
        LOG.info("Output files will be compressed (gzip)")

    output_files = [
        dict(h=open_file(name_mask.format(i), 'wb'), c=0)
        for i in range(number)
    ]

    load_func = load_fastq if fastq else fasta.load_fasta
    write_func = write_fastq_sequence if fastq else fasta.write_fasta_sequence

    for seq in load_func(input_file):
        # reached the maximum number of sequences for all samples
        if all(x['c'] == max_seq for x in output_files):
            break

        for output in output_files:
            if output['c'] == max_seq:
                continue

            if dist.rvs():
                write_func(output['h'], *seq)
                output['c'] += 1
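
The per-sequence decision in the loop above relies on scipy.stats.binom(1, prob) being a Bernoulli distribution, so each rvs() call returns 1 with probability prob and 0 otherwise. A small sketch of that behaviour:

# Sketch: binom(n=1, p) is a Bernoulli draw; rvs() returns 1 with probability p
import scipy.stats

dist = scipy.stats.binom(1, 0.25)
draws = [int(dist.rvs()) for _ in range(1000)]
# on average, roughly a quarter of the draws are 1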
Example #6
File: edit_gff.py Project: sadikmu/mgkit
def add_fields_from_table(verbose, key, attribute, only_edited, skip_rows,
                          separator, comment, table_file, key_index,
                          attr_index, input_file, output_file):

    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info("Key used is '%s' and attribute '%s'", key, attribute)
    LOG.info("N. rows skipped '%d' Key index is '%d' and attribute index '%d'",
             skip_rows, key_index, attr_index)

    if getattr(table_file, 'name', None) is not None:
        LOG.info("Reading values from (%s)", table_file.name)

    fields = dict(
        text_to_dict(open_file(table_file),
                     skip_lines=skip_rows,
                     sep=separator,
                     key_index=key_index,
                     value_index=attr_index,
                     encoding='ascii',
                     skip_empty=True,
                     skip_comment=comment))

    changed = 0

    for annotation in gff.parse_gff(input_file):
        try:
            key_ann_value = annotation.get_attr(key)
        except gff.AttributeNotFound:
            if only_edited:
                continue
            # the key attribute is missing: write the annotation unchanged
            # instead of reusing a value from a previous iteration
            annotation.to_file(output_file)
            continue
        try:
            annotation.set_attr(attribute, fields[key_ann_value])
            changed += 1
        except KeyError:
            if only_edited:
                continue
        annotation.to_file(output_file)

    LOG.info('Changed %d annotations', changed)
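
A hedged illustration of the kind of table the command above consumes, assuming a tab separator and the key in the first column with the attribute value in the second; the file name and values are hypothetical:

# Write a hypothetical two-column table: the first column is matched against
# the GFF attribute named by `key`, the second is stored under `attribute`
with open('values.tsv', 'w') as handle:
    handle.write('gene_00001\tK00001\n')
    handle.write('gene_00002\tK00845\n')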
Example #7
File: snpdat.py Project: sadikmu/mgkit
def snpdat_reader(f_handle):
    """
    Simple SNPDat reader.

    f_handle: file handle or string for the SNPDat result file

    :return: generator of SNPDatRow instances
    """

    if isinstance(f_handle, str):
        # open in binary mode, since each line is decoded from ascii below
        f_handle = open_file(f_handle, 'rb')
    LOG.info("Reading from file %s", f_handle.name)

    f_handle.readline()  # skips header line

    for line in f_handle:
        line = line.decode('ascii')
        try:
            yield SNPDatRow(line)
        except ValueError:
            LOG.critical(line)
            LOG.exception("Error reading line")
            # re-raise the original exception instead of a new, message-less one
            raise
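
A minimal usage sketch (the result file name is hypothetical); each yielded item is a SNPDatRow instance built from one line of the SNPDat output:

# Usage sketch: iterate over parsed SNPDat rows
for row in snpdat_reader('snpdat_results.txt'):
    # `row` is a SNPDatRow instance; its fields are defined elsewhere in the module
    ...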
Example #8
def keggmod_file(shared_datadir):
    return open_file(str(shared_datadir / 'kmod-entry1.txt'), 'rb').readlines()
Example #9
def glimmer_file(shared_datadir):
    return open_file(str(shared_datadir / 'glimmer3.txt'), 'rb').readlines()
Example #10
def hmmer_file(shared_datadir):
    return open_file(str(shared_datadir / 'test-hmmer-dom.txt'), 'rb').readlines()
Example #11
def gff_file(shared_datadir):
    return open_file(str(shared_datadir / 'test.gff'), 'rb').readlines()