def save_dataset_files(dataset, outfile, verbosity_level=0, if_pickle=True, count_cassette=True, count_other=True, 
                       merge_boundary_features=True, sort_data_by='position', N_sequences_per_mutant=5, options="N/A"):
    """ Print summary and data to output file; optionally print summary to stdout; optionally pickle dataset to picklefile. 
    
    The options argument is only used to be printed in the header to make it clear how the file was generated - 
     it should be the applicable optparse options object if there is one, or a text message otherwise.
    """
    # print summary info to stdout if desired
    if verbosity_level>1: print "\nDATA SUMMARY:"
    if verbosity_level>0: dataset.print_summary(merge_boundary_features=merge_boundary_features, 
                                                count_cassette=count_cassette, count_other=count_other)
    # print full data to outfile
    if verbosity_level>1: print "printing output - time %s."%time.ctime()
    with open(outfile,'w') as OUTFILE:
        write_header_data(OUTFILE,options)
        OUTFILE.write("### SUMMARY:\n")
        dataset.print_summary(OUTFILE, line_prefix="#  ", header_prefix="## ", merge_boundary_features=merge_boundary_features,
                              count_cassette = count_cassette, count_other=count_other)
        OUTFILE.write("### HEADER AND DATA:\n")
        dataset.print_data(OUTPUT=OUTFILE, sort_data_by=sort_data_by, N_sequences=N_sequences_per_mutant, 
                           header_line=True, header_prefix='# ')
    # print pickled dataset to picklefile, if desired
    if if_pickle:
        outfile_basename = os.path.splitext(outfile)[0]
        pickled_outfile = outfile_basename + '.pickle'
        with open(pickled_outfile,'w') as PICKLEFILE:
            pickle.dump(dataset, PICKLEFILE, 0)
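# A minimal usage sketch for save_dataset_files (hypothetical file names; the dataset
#  comes from mutant_analysis_classes.read_mutant_file, which is used the same way in
#  main() below - this sketch is defined but never called automatically):
def _example_save_dataset_files():
    dataset = mutant_analysis_classes.read_mutant_file('mutants.txt')
    # writes mutants_out.txt (header + summary + data) and mutants_out.pickle
    save_dataset_files(dataset, 'mutants_out.txt', verbosity_level=1, if_pickle=True,
                       options="generated by _example_save_dataset_files")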
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing. 
    Print final dataset to outfile (if given); return final multi-dataset object and the list of dataset names in order.
    The options argument should be generated by an optparse parser.
    """

    # parse all infiles, print summaries to stdout if requested
    all_datasets = {}

    if options.dataset_names:
        dataset_names = options.dataset_names.split(',')
        if not len(dataset_names)==len(infiles):
            raise ValueError("If dataset names are provided via -D option, you must provide the same number of names "
                             +"as the total number of infiles! We have %s names and %s infiles."%(len(dataset_names), 
                                                                                                  len(infiles)))
    else:
        dataset_names = [os.path.splitext(os.path.basename(infile))[0] for infile in infiles]

    for dataset_name,infile in zip(dataset_names,infiles):
        if options.verbosity_level>1:   print "parsing input file %s - time %s."%(infile, time.ctime())
        if infile.endswith('.pickle'):
            current_dataset = unpickle(infile)
        else:
            current_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(infile=infile)
            current_dataset.count_adjacent_mutants(OUTPUT=None)
            # note - read_data_from_file doesn't deal with merging/counting info, so that will be wrong/missing
        all_datasets[dataset_name] = current_dataset
        if options.verbosity_level>1:   current_dataset.print_summary()
        elif options.verbosity_level>0: print "%s mutants in file %s"%(len(current_dataset), infile)
    
    # merge datasets into one multi-dataset object
    if options.verbosity_level>1:   print "merging the mutant data into combined dataset - time %s."%(time.ctime())
    multi_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(multi_dataset=True)
    multi_dataset.populate_multi_dataset(all_datasets, overwrite=False, check_gene_data=True)
    # make sure the datasets are in the same order as they were given on the command-line
    #  (using all_datasets to initialize multi_dataset didn't give an order, since all_datasets is a dictionary)
    multi_dataset.dataset_order = dataset_names
    # print varying amounts of summary data to stdout
    if options.verbosity_level>1:   multi_dataset.print_summary()
    elif options.verbosity_level>0: print "total %s mutants present in combined dataset"%(len(multi_dataset))

    ### optionally remove mutants based on another dataset
    if options.remove_mutants_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_from_file)
        old_N = len(multi_dataset)
        multi_dataset.remove_mutants_based_on_other_dataset(other_dataset, 
                 readcount_min=options.remove_mutants_readcount_min, perfect_reads=options.remove_mutants_min_is_perfect)
        if options.verbosity_level>0:   
            new_N = len(multi_dataset)
            print "removed %s mutants based on %s - %s mutants remaining in combined dataset"%(old_N - new_N, 
                                                                                   options.remove_mutants_from_file, new_N)

    # if requested, add gene annotation info from separate file
    if options.gene_annotation_file:
        if options.verbosity_level>1: 
            print "adding gene annotation from file %s - time %s."%(options.gene_annotation_file, time.ctime())
        multi_dataset.add_gene_annotation(options.gene_annotation_file, 
                                               if_standard_Cre_file=options.annotation_file_is_standard)

    # print full data to outfile, unless there is no outfile name given
    if outfile:
        if options.verbosity_level>1:   
            print "printing combined dataset output to file %s - time %s."%(outfile, time.ctime())
        with open(outfile,'w') as OUTFILE:
            write_header_data(OUTFILE,options)
            OUTFILE.write("### DATASET SUMMARIES:\n")
            multi_dataset.print_summary(OUTPUT=OUTFILE, line_prefix="#  ", header_prefix="## ")
            OUTFILE.write("### HEADER AND DATA:\n")
            multi_dataset.print_data(OUTPUT=OUTFILE, sort_data_by=options.sort_data_key, header_line=True)
        # TODO make a *.pickle outfile as well?

    return multi_dataset, dataset_names
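# A rough invocation sketch for the three-argument main() above (note that later
#  examples in this listing redefine main(); define_option_parser and the -D
#  dataset-names option are assumed from their uses above, and the outfile name
#  is hypothetical):
def _example_run_combining_main():
    parser = define_option_parser()
    (options, args) = parser.parse_args(['-D', 'sampleA,sampleB',
                                         'sampleA.pickle', 'sampleB.pickle'])
    # the positional args are the infiles to combine
    multi_dataset, dataset_names = main(args, 'combined_mutants.txt', options)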
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)   # copy, since cutadapt_tmpfiles may be rebound below
    
    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input", 
                                             options.total_read_number_only, False)
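        # (an assumption for readability: check_readcount's signature, inferred from the
        #  call sites in this function, looks like check_readcount(infile, INFOFILE=None,
        #  printing=False, description="", total_read_number_only=False,
        #  input_collapsed_to_unique=False), returning the read count; the real definition
        #  lives elsewhere in this codebase.)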

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function... 
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl), 
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
            # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version), 
            #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the 
            #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue) 
            #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it 
        if not if_running_cutadapt:
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            # rebind the per-end dicts to a single unsplit 'all' stream, so the loops
            #  below that iterate over cutadapt_tmpfiles/outfiles/cutadapt_readcount still work
            cutadapt_tmpfiles = {'all': trimmed_tmpfile}
            outfiles = {'all': options.outfile_basename + '.fa'}
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"','').replace("'",'').replace(' ',''):  continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s'%(adapter_seq, options.other_cutadapt_options)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! "\
                        "(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:                no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:     pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
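            #  (e.g. set.intersection(set(['a','b']), set(['b','c'])) returns set(['b']))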
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):     os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):     os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because 
            #    fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):     continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                    "fastx_collapser output", options.total_read_number_only, input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True, input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}    
            for extra_file in (wrong_start_file, no_cassette_file):
                # skip files that were never created because the corresponding step was skipped
                if not os.path.exists(extra_file):  continue
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True, 
                                                                             input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('#  "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                        (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount, trimmed_readcount])))
            final_output.append('#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type, 
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()), [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type], 
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[wrong_start_file], [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[no_cassette_file], [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0:  print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps, 
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):     os.remove(tmpfile)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any(
        [x in options.other_cutadapt_options
         for x in adapter_options.split()]):
        sys.exit(
            "Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
            % adapter_options +
            " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do that by collapsing to unique! MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (
            options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {
            '': options.outfile_basename + '_no-cassette-tmpfile.fa'
        }
        cutadapt_tmpfiles = {
            '': options.outfile_basename + '_cutadapt-tmpfile.fa'
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        ends = "5' 3'".split()
        outfiles = {
            end:
            options.outfile_basename + '_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        no_cassette_tmpfiles = {
            end: options.outfile_basename +
            '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles = {
            end: options.outfile_basename +
            '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE,
                                             bool(options.verbosity > 1),
                                             "original input",
                                             options.total_read_number_only,
                                             False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile,
                        wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE,
                                                bool(options.verbosity > 1),
                                                "first-base-trimming output",
                                                options.total_read_number_only,
                                                False)
            untrimmed_readcount = check_readcount(wrong_start_file, None,
                                                  False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the
        #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            # key by '' to match the unsplit cutadapt_tmpfiles/outfiles entries used in step 3
            cutadapt_readcount = {'': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime),
                                             ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                adapter_seqs = adapter_seqs.replace('"', '').replace(
                    "'", '').replace(' ', '')
                if not adapter_seqs: continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(
                    ['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                for extra_seq_category in ('untrimmed', 'too-short',
                                           'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (
                            extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (
                    full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command,
                                              INFOFILE,
                                              options.verbosity,
                                              shell=True,
                                              program_name="cutadapt for %s" %
                                              end_type)
                cutadapt_readcount[end_type] = check_readcount(
                    cutadapt_tmpfile, INFOFILE, bool(options.verbosity > 1),
                    "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(
                    no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! "\
                        "(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(
                        dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(
                *[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header,
                                     no_cassette_seqs[0][header].upper(),
                                     NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #    fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    cutadapt_tmpfile, outfile)
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    options.verbosity,
                    shell=True,
                    program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(
                    outfile,
                    INFOFILE,
                    bool(options.verbosity > 1),
                    "fastx_collapser output",
                    options.total_read_number_only,
                    input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(
                    outfile,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[
                        end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1: print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                # skip files that were never created because the corresponding step was skipped
                if not os.path.exists(extra_file): continue
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    extra_file)
                retcode = run_command_print_info_output(command,
                                                        None,
                                                        options.verbosity - 1,
                                                        shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(
                    extra_file,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = [
            "### Final read count info for %s (main output files %s)\n" %
            (infile, ', '.join(outfiles.values()))
        ]
        ]
        final_output.append("# starting total read count:\t%s\n" %
                            starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append(
                '# "good" read count after start trimming (%% of total):\t%s\n'
                %
                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append(
                '#  "bad" read count (wrong-start) (%% of total):\t%s\n' %
                value_and_percentages(untrimmed_readcount,
                                      [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                    % (end_type,
                       value_and_percentages(
                           cutadapt_readcount[end_type],
                           [starting_readcount, trimmed_readcount])))
            final_output.append(
                '#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                %
                value_and_percentages(no_cassette_readcount,
                                      [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append(
                '## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                % (end_type,
                   value_and_percentages(cutadapt_readcount[end_type],
                                         [starting_readcount])))
        final_output.append(
            '## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
            % value_and_percentages(
                starting_readcount - sum(cutadapt_readcount.values()),
                [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s unique sequence count after collapsing reads to unique sequences '
                    % end_type + '(%% of read count):\t%s\n' %
                    value_and_percentages(collapsed_readcount[end_type],
                                          [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append(
                    '# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[wrong_start_file],
                        [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append(
                    '# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[no_cassette_file],
                        [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original
                        ] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit(
            "Error: exactly one infile required! %s infiles provided: %s" %
            (len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([
            x in other_bowtie_options_split
            for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))
    ]):
        raise Exception(
            "Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
            "note that this program allows -v bowtie mode only.")
    if any([
            x in other_bowtie_options_split
            for x in ('-m -k -a --all'.split(' '))
    ]):
        raise Exception(
            "Cannot include -m/-a bowtie options in -B!  Use separate -m option for that."
        )

    specific_bowtie_options = '-v %s' % options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta': specific_bowtie_options += ' -f'
        elif infile_format == 'fastq': specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" %
                            infile_format)
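    # e.g. a fasta infile with options.allowed_errors==1 (hypothetical value) ends up
    #  with specific_bowtie_options == '-v 1 -f'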

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1: multiple_bowtie_option = '-a'
    else: multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)
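    # e.g. multiple_to_show == -1 -> '-a'; 0 or 1 -> '-k 2' (the floor of 2 is what makes
    #  multiple alignments distinguishable from unique ones); 10 -> '-k 10'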

    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any(
        [x in options.other_bowtie_options
         for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version",
                                      INFOFILE,
                                      printing_level=0,
                                      shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        #    an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #    based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (
            specific_bowtie_options, multiple_bowtie_option,
            options.other_bowtie_options, options.genome_bowtie_index, infile,
            tmpfile_genome)

        if options.bowtie_aln_file_genome is None:
            run_command_print_info_output(command,
                                          INFOFILE,
                                          printing_level=(not options.quiet),
                                          shell=True)
        else:
            options.keep_tmpfiles = True
            if not os.access(options.bowtie_aln_file_genome, os.R_OK):
                raise Exception(
                    "Can't read provided options.bowtie_aln_file_genome %s!" %
                    options.bowtie_aln_file_genome)
            text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                command, options.bowtie_aln_file_genome)
            print text
            INFOFILE.write('\n' + text + '\n')
            tmpfile_genome = options.bowtie_aln_file_genome

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s" % (
                specific_bowtie_options, '--all', options.other_bowtie_options,
                options.cassette_bowtie_index, infile, tmpfile_cassette)
            if options.bowtie_aln_file_cassette is None:
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    printing_level=(not options.quiet),
                    shell=True)
            else:
                options.keep_tmpfiles = True
                if not os.access(options.bowtie_aln_file_cassette, os.R_OK):
                    raise Exception(
                        "Can't read provided options.bowtie_aln_file_cassette %s!"
                        % options.bowtie_aln_file_cassette)
                text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                    command, options.bowtie_aln_file_cassette)
                print text
                INFOFILE.write('\n' + text + '\n')
                tmpfile_cassette = options.bowtie_aln_file_cassette

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(
                tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING
        #  and uses stderr both for normal output and for errors, AND gives no returncode.

        ### Parse the two alignment files in parallel, and merge them together (remove sub-optimal alignments,
        #    (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        #  Do all this WITHOUT reading the entire files into memory!  A bit tricky.
        if options.cassette_bowtie_index != 'NONE':
            aln_list_generator = aln_generator_from_two_samfiles_parallel(
                tmpfile_genome, tmpfile_cassette)
        else:
            aln_list_generator = aln_generator_from_single_samfile(
                tmpfile_genome)
        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            GENOMIC_UNIQUE_FILE = MULTIPLE_GENOMIC_FILE = CASSETTE_FILE = UNALIGNED_FILE = open(
                outfile_all, 'w')
            unaligned_as_fasta = False
        else:
            UNALIGNED_FILE = open(outfile_unaligned, 'w')
            CASSETTE_FILE = open(outfile_cassette, 'w')
            MULTIPLE_GENOMIC_FILE = open(outfile_multiple_genomic, 'w')
            GENOMIC_UNIQUE_FILE = open(outfile_genomic_unique, 'w')
            unaligned_as_fasta = True
        category_readcounts = {
            'unaligned': 0,
            'cassette': 0,
            'multiple-genomic': 0,
            'genomic-unique': 0,
            'cassette-multiple': 0
        }
        for (readname, full_aln_list) in aln_list_generator:
            reduced_aln_list = reduce_alignment_list(full_aln_list)
            final_aln_list = prioritize_cassette_reads(
                reduced_aln_list, if_cassette_function=is_cassette_chromosome)
            categorize_reads_print_to_files(
                readname,
                final_aln_list,
                category_readcounts,
                UNALIGNED_FILE,
                CASSETTE_FILE,
                MULTIPLE_GENOMIC_FILE,
                GENOMIC_UNIQUE_FILE,
                unaligned_as_fasta=unaligned_as_fasta,
                multiple_to_write=options.multiple_to_show,
                input_collapsed_to_unique=options.input_collapsed_to_unique,
                no_multi_cassette_warnings=options.no_multi_cassette_warnings)
        if options.dont_split_by_category:
            # all files are actually the same pointer, so only close once
            GENOMIC_UNIQUE_FILE.close()
        else:
            UNALIGNED_FILE.close()
            CASSETTE_FILE.close()
            MULTIPLE_GENOMIC_FILE.close()
            GENOMIC_UNIQUE_FILE.close()

        # delete alignment tmpfiles now that they've been parsed
        if not options.keep_tmpfiles:
            os.remove(tmpfile_genome)
            if options.cassette_bowtie_index != 'NONE':
                os.remove(tmpfile_cassette)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_readcounts.pop('cassette-multiple')
        total_reads = sum(category_readcounts.values())
        text2 = "# total reads:  %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_readcounts.items()):
            text = "# %s:  %s" % (category,
                                  value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write(
            "\n\n################## Metadata from input preprocessing ##################\n\n"
        )
        if options.input_metadata_file == 'NONE':
            INFOFILE.write(
                'Not looking for a metadata input file, as specified by options\n'
            )
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X_info.txt, but for X_5prime.txt it can be
                #  either X_5prime_info.txt or X_info.txt, so try both.
                #  (in the new preprocessing version all files are X_*prime.txt and the info files are X_info.txt;
                #   in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    if metafile_basename.endswith(
                            '_3prime') or metafile_basename.endswith(
                                '_5prime'):
                        options.input_metadata_file = metafile_basename[:-len(
                            '_3prime')] + '_info.txt'
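                # e.g. (hypothetical names) for infile 'runA_5prime.fa' this tries
                #  'runA_5prime_info.txt' first, then falls back to 'runA_info.txt'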
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file,
                                     INFOFILE,
                                     printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s"%(len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in ('-m -k -a --all'.split(' '))]):
        raise Exception("Cannot include -m/-a bowtie options in -B!  Use separate -m option for that.")

    specific_bowtie_options = '-v %s'%options.allowed_errors
    # check the split option list rather than the raw -B string, so longer flags can't false-match '-f'/'-q'
    if not any([x in other_bowtie_options_split for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format=='fasta':      specific_bowtie_options += ' -f'
        elif infile_format=='fastq':    specific_bowtie_options += ' -q'
        else:                           raise Exception("Cannot process auto-detected infile format %s!"%infile_format)
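    # (-f and -q are bowtie's fasta/fastq input-format flags; e.g. if check_fasta_fastq_format returns 'fasta' 
    #  for a hypothetical reads.fa infile, ' -f' is appended here)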

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:  multiple_bowtie_option = '-a' 
    else:                               multiple_bowtie_option = '-k %s'%max(options.multiple_to_show, 2)
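    # (bowtie's -k N reports up to N alignments per read and -a reports all of them, so e.g. multiple_to_show=1 
    #  still runs with '-k 2', letting reads with two alignments be told apart from truly unique ones)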

    # output file names: temporary for alignments, final (split or all), metadata info file. 
    outfile_suffix = '.sam' if any([x in other_bowtie_options_split for x in ('-S','--sam')]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'
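    # (as an illustration, outfile_basename 'myrun' with splitting enabled and no -S/--sam in -B would produce 
    #  myrun_tmp_genome.map, myrun_unaligned.fa, myrun_cassette.map, myrun_multiple-genomic.map (or .fa if 
    #  multiple_to_show is 0), myrun_genomic-unique.map and myrun_info.txt - 'myrun' is a made-up name)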

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie prints its summary to stderr rather than stdout, so the output has to be echoed to stdout 
        #    anyway - otherwise any error message would be invisible.  Detecting errors from the output contents 
        #    instead would be possible, but seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, multiple_bowtie_option, 
                                      options.other_bowtie_options, options.genome_bowtie_index, infile, tmpfile_genome)
        run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
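        # (the assembled command looks something like 'bowtie -v 1 -k 2 <options from -B> genome_index reads.fq 
        #  myrun_tmp_genome.map'; the concrete values in this example are invented)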

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, '--all', options.other_bowtie_options, 
                                                  options.cassette_bowtie_index, infile, tmpfile_cassette)
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text%(options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text%(options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING 
        #  and uses stderr both for normal output and for errors, AND gives no returncode. 

        ### Parse the two alignment files and merge them together (removing sub-optimal alignments, and removing 
        #    non-cassette alignments if there are cassette ones of equal quality); then remove the alignment files.
        readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_cassette, starting_dict=readname_to_aln_list)
        # MAYBE-TODO right now I'm reading the entire files into memory before merging and processing them, 
        #  which takes a fair amount of memory - could instead write something that would read both alignment files
        #  in parallel and do the merging and output-writing read-by-read.  Do that if I start getting memory issues.
        reduce_alignment_dict(readname_to_aln_list)
        prioritize_cassette_reads(readname_to_aln_list, if_cassette_function=is_cassette_chromosome)
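        # (e.g. a read with one 1-mismatch genomic alignment and one 1-mismatch cassette alignment keeps only 
        #  the cassette alignment after these two calls, since equal-quality cassette alignments take priority)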
        # delete alignment tmpfiles now that they've been parsed
        os.remove(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            os.remove(tmpfile_cassette)

        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            with open(outfile_all,'w') as ALL_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list, ALL_FILE, ALL_FILE, ALL_FILE, 
                                          ALL_FILE, unaligned_as_fasta=False, multiple_to_write=options.multiple_to_show, 
                                          input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                          no_warnings=options.quiet)
        else:
            with open(outfile_unaligned,'w') as UNALIGNED_FILE, open(outfile_cassette,'w') as CASSETTE_FILE, \
                 open(outfile_multiple_genomic,'w') as MULTIPLE_GENOMIC_FILE, \
                 open(outfile_genomic_unique,'w') as GENOMIC_UNIQUE_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, 
                                          CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, 
                                          unaligned_as_fasta=True, multiple_to_write=options.multiple_to_show, 
                                          input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                          no_warnings=options.quiet)

        ### print category_counts to INFOFILE (and stdout unless quiet) in a nice format
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_counts.pop('cassette-multiple', 0)    # default 0 in case the category is absent
        total_reads = sum(category_counts.values())
        text2 = "# total reads:  %s"%total_reads
        if options.input_collapsed_to_unique: text2 +=" (uncollapsed readcounts)"
        lines = [text1, text2]
        for category,count in sorted(category_counts.items()):
            text = "# %s:  %s"%(category, value_and_percentages(count, [total_reads]))
            if category=='cassette' and cassette_multiple:  
                text += ' (Warning: %s multiple!!)'%cassette_multiple
            lines.append(text)
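        # (each category line comes out looking something like '# genomic-unique:  1000 (50%)', though the exact 
        #  percentage formatting depends on value_and_percentages; the numbers here are invented)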
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X_info.txt, but for X_5prime.txt it can be either 
                #  X_5prime_info.txt or X_info.txt, so try both.
                #  (in the new preprocessing version all read files are X_*prime.txt and the info files are X_info.txt; 
                #   in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0] 
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    for suffix in ('_3prime', '_5prime'):
                        if metafile_basename.endswith(suffix):
                            options.input_metadata_file = metafile_basename[:-len(suffix)] + '_info.txt'
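                # (e.g. for infile reads_5prime.txt this tries reads_5prime_info.txt first, then reads_info.txt - 
                #  'reads' is just an illustrative name)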
                text = 'Automatically determining metadata input file name: %s\n'%options.input_metadata_file
            else:
                text = 'Metadata input file name provided in options: %s\n'%options.input_metadata_file
            if not options.quiet:
                print text,
            INFOFILE.write(text+'\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)