def parse_glimmer3(file_handle):
    """
    Parses an output file from glimmer3 and yields the header and prediction
    lines. Used to feed the :func:`mgkit.io.gff.from_glimmer3` function.

    Arguments:
        file_handle (str, file): file name or file handle to read from

    Yields:
        tuple: first element is the header of the sequence the gene was
        predicted on and the second is the prediction line
    """
    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'r')

    curr_seq = ''
    predictions = []

    for line in file_handle:
        line = line.strip()
        if line.startswith('>'):
            # new sequence header: flush the predictions collected for the
            # previous sequence
            if len(predictions) > 0:
                for prediction in predictions:
                    yield curr_seq, prediction
            curr_seq = line[1:]
            predictions = []
        else:
            if line != '':
                predictions.append(line)

    # flush the predictions collected for the last sequence in the file
    if len(predictions) > 0:
        for prediction in predictions:
            yield curr_seq, prediction

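# A minimal usage sketch (not part of the module): the file name is made up,
# and from_glimmer3 is assumed here to accept the header and the prediction
# line as its first two arguments, as the docstring suggests; check its exact
# signature in the mgkit docs.
from mgkit.io import gff

for header, prediction in parse_glimmer3('glimmer3.txt'):
    annotation = gff.from_glimmer3(header, prediction)
    print(annotation)
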
def load_htseq_counts(file_handle, conv_func=int):
    """
    .. versionchanged:: 0.1.15
        added *conv_func* parameter

    Loads an HTSeq-count result file

    Arguments:
        file_handle (file or str): file handle or string with file name
        conv_func (func): function used to convert the count from a string;
            defaults to *int*, but *float* can be used as well

    Yields:
        tuple: first element is the gene_id and the second is the count
    """
    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'rb')

    if getattr(file_handle, 'name', None) is not None:
        LOG.info("Loading HTSeq-count file %s", file_handle.name)

    for line in file_handle:
        line = line.decode('ascii')
        gene_id, count = line.rstrip().split('\t')

        # skip the special counters appended by HTSeq-count (they start
        # with '__', e.g. __no_feature) and any gene_id listed in SKIP
        if line.startswith('__') or (gene_id in SKIP):
            continue

        yield gene_id, conv_func(count)

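# Since the function yields (gene_id, count) tuples, the result can be
# collected directly into a dictionary; a minimal sketch with a made-up
# file name, using float for non-integer (e.g. normalised) counts.
counts = dict(load_htseq_counts('sample1-counts.txt', conv_func=float))
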
def test_Annotation_to_file(gff_file, tmpdir):
    ann = gff.from_gff(gff_file[0])

    file_name = (tmpdir / 'test-write.gff').strpath

    file_handle = open_file(file_name, 'wb')
    ann.to_file(file_handle)
    file_handle.close()

    ann2 = next(gff.parse_gff(file_name))

    assert ann == ann2

def test_write_fastq2(fastq_file, tmpdir):
    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath

    file_handle = open_file(file_name, 'w')
    fastq.write_fastq_sequence(file_handle, header, seq, qual)
    file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))

def sample_command(verbose, prefix, number, prob, max_seq, fastq, gzip,
                   input_file):
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)
    LOG.info(
        "Sampling %s file (%d) chunks with prefix (%s)",
        'FastQ' if fastq else 'Fasta',
        number,
        prefix
    )

    if (prob > 1) or (prob <= 0):
        # the message is formatted with the offending value, otherwise the
        # {} placeholder is never filled
        utils.exit_script(
            "The probability value ({}) is outside the correct range"
            " (0 < p <= 1)".format(prob),
            1
        )

    # a binomial distribution with n=1 is a Bernoulli trial: each draw
    # decides whether a sequence is picked for a given output file
    dist = scipy.stats.binom(1, prob)

    LOG.info(
        "Probability of picking a sequence (%.5f), max number of seqs %d",
        prob,
        max_seq
    )

    name_mask = "%s-{0:05}.%s" % (prefix, 'fq' if fastq else 'fa')

    if gzip:
        name_mask += '.gz'
        LOG.info("Output files will be compressed (gzip)")

    output_files = [
        dict(h=open_file(name_mask.format(i), 'wb'), c=0)
        for i in range(number)
    ]

    load_func = load_fastq if fastq else fasta.load_fasta
    write_func = write_fastq_sequence if fastq else fasta.write_fasta_sequence

    for seq in load_func(input_file):
        # stop early once every sample has reached the maximum number of
        # sequences
        if all(x['c'] == max_seq for x in output_files):
            break
        for output in output_files:
            if output['c'] == max_seq:
                continue
            if dist.rvs():
                write_func(output['h'], *seq)
                output['c'] += 1

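# Each sequence is thus subjected to an independent Bernoulli trial per
# output file; a minimal, self-contained sketch of that sampling primitive
# (probability and draw count below are made up for illustration).
import scipy.stats

# each draw is 1 with probability p=0.01, 0 otherwise
picker = scipy.stats.binom(1, 0.01)

# over many draws the fraction of 1s converges to p
draws = picker.rvs(10000)
print(draws.sum())  # roughly 100
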
def add_fields_from_table(verbose, key, attribute, only_edited, skip_rows,
                          separator, comment, table_file, key_index,
                          attr_index, input_file, output_file):
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info("Key used is '%s' and attribute '%s'", key, attribute)
    LOG.info(
        "N. rows skipped '%d' Key index is '%d' and attribute index '%d'",
        skip_rows,
        key_index,
        attr_index
    )

    if getattr(table_file, 'name', None) is not None:
        LOG.info("Reading values from (%s)", table_file.name)

    fields = dict(
        text_to_dict(
            open_file(table_file),
            skip_lines=skip_rows,
            sep=separator,
            key_index=key_index,
            value_index=attr_index,
            encoding='ascii',
            skip_empty=True,
            skip_comment=comment
        )
    )

    changed = 0

    for annotation in gff.parse_gff(input_file):
        try:
            key_ann_value = annotation.get_attr(key)
        except gff.AttributeNotFound:
            if only_edited:
                continue
            # no key attribute: fall back to a value that cannot be in the
            # table, so the annotation is written unchanged below (avoids a
            # NameError on the lookup that follows)
            key_ann_value = None

        try:
            annotation.set_attr(attribute, fields[key_ann_value])
            changed += 1
        except KeyError:
            if only_edited:
                continue

        annotation.to_file(output_file)

    LOG.info('Changed %d annotations', changed)

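# A toy sketch of the lookup this command performs, using plain dicts in
# place of mgkit Annotation objects; all names and values below are made up.
rows = ['gene1\tkinase', 'gene2\ttransporter']  # hypothetical two-column table
fields = dict(row.split('\t') for row in rows)

annotations = [{'gene_id': 'gene1'}, {'gene_id': 'gene3'}]
for ann in annotations:
    try:
        ann['product'] = fields[ann['gene_id']]
    except KeyError:
        # with only_edited the annotation would be skipped instead of
        # being written through unchanged
        pass

print(annotations)
# [{'gene_id': 'gene1', 'product': 'kinase'}, {'gene_id': 'gene3'}]
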
def snpdat_reader(f_handle):
    """
    Simple SNPDat reader.

    f_handle: file handle or string for the SNPDat result file

    :return: generator of SNPDatRow instances
    """
    if isinstance(f_handle, str):
        # opened in binary mode, since each line is decoded from ascii in
        # the loop below
        f_handle = open_file(f_handle, 'rb')

    LOG.info("Reading from file %s", f_handle.name)

    f_handle.readline()  # skips the header line

    for line in f_handle:
        line = line.decode('ascii')
        try:
            yield SNPDatRow(line)
        except ValueError:
            LOG.critical(line)
            LOG.exception("Error reading line")
            raise ValueError

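# A minimal usage sketch with a hypothetical file name; each yielded item is
# a SNPDatRow built from one result line, so the generator can be consumed
# directly, e.g. to count the parsed rows.
n_rows = sum(1 for row in snpdat_reader('snpdat-results.txt'))
print(n_rows)
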
@pytest.fixture
def keggmod_file(shared_datadir):
    return open_file(str(shared_datadir / 'kmod-entry1.txt'), 'rb').readlines()

@pytest.fixture
def glimmer_file(shared_datadir):
    return open_file(str(shared_datadir / 'glimmer3.txt'), 'rb').readlines()

@pytest.fixture
def hmmer_file(shared_datadir):
    return open_file(str(shared_datadir / 'test-hmmer-dom.txt'), 'rb').readlines()

@pytest.fixture
def gff_file(shared_datadir):
    return open_file(str(shared_datadir / 'test.gff'), 'rb').readlines()