Exemplo n.º 1
0
def get_max_reads_length(reads_file, log, num_checked):
    """Return the length of the longest among the first reads of a file.

    Args:
        reads_file: Path of the reads file. Its type is detected with
            ``SeqIO.get_read_file_type``.
        log: Logger; receives the informational message and is passed to
            ``error`` when the file type is not recognised.
        num_checked: Maximum number of leading records to inspect
            (forwarded to ``itertools.islice``).

    Returns:
        The largest ``len(record)`` among the inspected records.

    Raises:
        ValueError: Raised by ``max`` when no record was read (empty file
            or ``num_checked`` equal to 0).
    """
    file_type = SeqIO.get_read_file_type(reads_file)
    if not file_type:
        error('Incorrect extension of reads file: ' + reads_file, log)

    handle = SeqIO.Open(reads_file, "r")
    try:
        # A generator keeps only the running maximum; there is no need to
        # materialise a list of every length first.
        max_reads_length = max(
            len(rec)
            for rec in itertools.islice(SeqIO.parse(handle, file_type),
                                        num_checked))
    finally:
        # Release the handle even when parsing fails.
        # NOTE(review): assumes SeqIO.Open returns a file-like object that
        # provides close() -- confirm.
        handle.close()

    log.info(reads_file + ': max reads length: ' + str(max_reads_length))
    return max_reads_length
Exemplo n.º 2
0
def collect_contigs(dataset, barcodes_dir, output_base, format):
    """Merge the per-barcode contig files into a single output file.

    For every barcode in ``dataset`` the file
    ``<barcodes_dir>/<barcode.id>/truseq_long_reads.<format>`` is read if it
    exists; barcodes without such a file are skipped silently. Each contig
    id is prefixed with ``"<barcode.id>-"`` before the contig is written.

    Args:
        dataset: Iterable of barcode objects exposing an ``id`` attribute.
        barcodes_dir: Directory holding one sub-directory per barcode id.
        output_base: Output path without extension; ``"." + format`` is
            appended to it.
        format: Sequence format name, used both as the file extension and as
            the format argument of ``SeqIO.parse`` / ``SeqIO.write``.
    """
    # Context managers guarantee that the output file and every input file
    # are closed, including when parsing or writing raises.
    with open(output_base + "." + format, "w") as output:
        for barcode in dataset:
            contigs_path = os.path.join(barcodes_dir, barcode.id,
                                        "truseq_long_reads." + format)
            if not os.path.exists(contigs_path):
                continue
            with open(contigs_path) as contigs_handle:
                for contig in SeqIO.parse(contigs_handle, format):
                    contig.id = barcode.id + "-" + contig.id
                    SeqIO.write(contig, output, format)
Exemplo n.º 3
0
def check_file_not_empty(input_filename, message="", log=None):
    """Report an error when a reads file contains no records.

    Files whose detected type is ``'bam'`` are not inspected. For any other
    type the first record is requested from ``SeqIO.parse``; if there is
    none, or if opening/parsing raises, ``error`` is called.

    Args:
        input_filename: Path of the reads file (``~`` is expanded and the
            path made absolute before the file is opened).
        message: Extra text appended in parentheses to the "file is empty"
            report.
        log: Logger forwarded to ``get_read_file_type`` and ``error``.
    """
    filename = abspath(expanduser(input_filename))
    file_type = get_read_file_type(input_filename, log)
    if file_type == 'bam':
        return

    def fill_placeholder(text):
        # Substitute the {FILE} placeholder as before, but fall back to the
        # raw text when it is not a format string (e.g. an integer errno)
        # or contains unrelated braces -- otherwise the error handler
        # itself would raise and hide the real problem.
        try:
            return text.format(FILE=filename)
        except (AttributeError, KeyError, IndexError, ValueError):
            return str(text)

    handle = None
    try:
        handle = SeqIO.Open(filename, "r")
        first_record = next(SeqIO.parse(handle, file_type), None)
    except Exception as inst:
        # Exceptions constructed without arguments have an empty ``args``.
        detail = inst.args[0] if inst.args else repr(inst)
        error(fill_placeholder(detail) + "\n\n" +
              fill_placeholder(traceback.format_exc()),
              log=log)
    else:
        # Reported outside the ``try`` body so that this report can never
        # be caught and re-reported by the handler above.
        if first_record is None:
            error("file is empty: %s (%s)" % (filename, message), log=log)
    finally:
        # NOTE(review): assumes SeqIO.Open returns a file-like object that
        # provides close() -- confirm.
        if handle is not None:
            handle.close()
Exemplo n.º 4
0
def get_max_reads_length(reads_file, log, num_checked):
    """Return the length of the longest among the first reads of a file.

    The file type is derived from the entry registered for ``reads_file``
    in ``options_storage.dict_of_prefixes`` when there is one, otherwise
    from ``reads_file`` itself.

    Args:
        reads_file: Path of the reads file.
        log: Logger; receives the informational message and is passed to
            ``error`` when the file type is not recognised.
        num_checked: Maximum number of leading records to inspect
            (forwarded to ``itertools.islice``).

    Returns:
        The largest ``len(record)`` among the inspected records.

    Raises:
        ValueError: Raised by ``max`` when no record was read (empty file
            or ``num_checked`` equal to 0).
    """
    if reads_file in options_storage.dict_of_prefixes:
        ext = options_storage.dict_of_prefixes[reads_file]
        file_type = SeqIO.get_read_file_type(ext)
    else:
        file_type = SeqIO.get_read_file_type(reads_file)

    if not file_type:
        error("incorrect extension of reads file: %s" % reads_file, log)

    handle = SeqIO.Open(reads_file, "r")
    try:
        # A generator keeps only the running maximum; there is no need to
        # materialise a list of every length first.
        max_reads_length = max(
            len(rec)
            for rec in itertools.islice(SeqIO.parse(handle, file_type),
                                        num_checked))
    finally:
        # Release the handle even when parsing fails.
        # NOTE(review): assumes SeqIO.Open returns a file-like object that
        # provides close() -- confirm.
        handle.close()

    log.info("%s: max reads length: %s" % (reads_file, str(max_reads_length)))
    return max_reads_length
Exemplo n.º 5
0
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    """Break and filter scaffolds using read alignments, then write results.

    The contigs are loaded from a FASTA file, quality is generated from the
    alignments, and the contigs are passed through ``SplitAndFilter`` with
    coverage, pattern and N breakers plus length and pattern filters. The
    result is written through ``OutputResults`` for both "fasta" and
    "fastq".

    Args:
        contigs_file: Path of the FASTA file with the input scaffolds.
        output_file: Output location passed to ``OutputResults``; the final
            log message refers to ``output_file + ".fastq"``.
        sam_files: Iterable of SAM file paths, each wrapped in
            ``sam_parser.Samfile`` and chained together.
        log: Logger used for progress messages.

    Note:
        ``pattern`` and ``rc_pattern`` are not defined in this function;
        they are presumably module-level names -- verify against the
        enclosing module.
    """
    import sys  # local import: only needed to choose the file mode below

    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")

    # The "U" mode flag was removed in Python 3.11 (open() raises
    # ValueError); on Python 3 plain text mode already applies universal
    # newlines. The flag is kept on Python 2, where it is still required.
    read_mode = "rU" if sys.version_info[0] < 3 else "r"
    # Materialise the records inside the ``with`` so the file can be closed
    # as soon as parsing is done.
    with open(contigs_file, read_mode) as contigs_handle:
        contigs = list(SeqIO.parse(contigs_handle, "fasta"))

    sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)
    # NOTE(review): the numeric arguments below (1500, 100, 50, 150, 3) are
    # thresholds whose exact meaning is defined by the helper classes --
    # confirm there before changing them.
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter, n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in " + output_file + ".fastq")
Exemplo n.º 6
0
def get_max_reads_length(reads_file, log, num_checked):
    """Return the length of the longest among the first reads of a file.

    Any exception raised while opening or parsing the file (including the
    ``ValueError`` from ``max`` when no record was read) is reported through
    ``error``; on success the maximum is logged.

    Args:
        reads_file: Path of the reads file.
        log: Logger forwarded to ``get_read_file_type`` and ``error`` and
            used for the informational message.
        num_checked: Maximum number of leading records to inspect
            (forwarded to ``itertools.islice``).

    Returns:
        The largest ``len(record)`` among the inspected records, or 0 if an
        exception was reported and ``error`` returned.
    """
    file_type = get_read_file_type(reads_file, log)
    max_reads_length = 0

    def fill_placeholder(text):
        # Substitute the {FILE} placeholder as before, but fall back to the
        # raw text when it is not a format string (e.g. an integer errno)
        # or contains unrelated braces -- otherwise the error handler
        # itself would raise and hide the real problem.
        try:
            return text.format(FILE=reads_file)
        except (AttributeError, KeyError, IndexError, ValueError):
            return str(text)

    handle = None
    try:
        handle = SeqIO.Open(reads_file, "r")
        # A generator keeps only the running maximum; there is no need to
        # materialise a list of every length first.
        max_reads_length = max(
            len(rec)
            for rec in itertools.islice(SeqIO.parse(handle, file_type),
                                        num_checked))
    except Exception as inst:
        # Exceptions constructed without arguments have an empty ``args``.
        detail = inst.args[0] if inst.args else repr(inst)
        error(fill_placeholder(detail) + "\n\n" +
              fill_placeholder(traceback.format_exc()),
              log=log)
    else:
        log.info("%s: max reads length: %s" %
                 (reads_file, str(max_reads_length)))
    finally:
        # NOTE(review): assumes SeqIO.Open returns a file-like object that
        # provides close() -- confirm.
        if handle is not None:
            handle.close()
    return max_reads_length