Example #1
def main():
    epi_text = ('If no NGS data directory is given, will automatically scan the following directories recursively: \n' +
                '\tread directory: {}'.format(default_read_dir) +
                '\n\tassembly directory: {}'.format(default_ass_dir)+
                '\n\tMiSeq directories: {}'.format(','.join(BML_read_locations)))

    parser = argparse.ArgumentParser(description='A program to collect information about NGS data files',epilog=epi_text)
    ### general info
    parser.add_argument('--version','-V',action='version',version='%(prog)s {}.{}'.format(SCRIPT_VERSION,SCRIPT_SUBVERSION))
#     parser.add_argument('--debug',action='store_true',help="Preserve intermediate files and do not update reference files")

    ### controls

    ### required
    parser.add_argument('--assembly_dir','-ad',help='Directory with assemblies')
    parser.add_argument('--read_dir','-rd',help='Directory with reads')
    parser.add_argument('--out_dir','-od',help='Output directory')   
    parser.add_argument('--MiSeq_dir','-md',help='Directory with MiSeq reads and sample sheets')
    parser.add_argument('--misname_file','-mf',help='Excel spreadsheet with name corrections',default=default_misname_file,type=str)
#     parser.add_argument('')   
    args = parser.parse_args()
    out_dir = args.out_dir if args.out_dir else utilities.safeMakeOutputFolder(_outputBase)
    ass_out = os.path.join(out_dir,assemblies_file)
    read_out = os.path.join(out_dir,reads_file)
    Mi_out = os.path.join(out_dir,MiSeq_files)
    mirror_out = os.path.join(out_dir,mirrored_reads)
#     NCBS_out = os.path.join(out_dir,NCBS_processed)
    if isinstance(args.misname_file,str):
        if os.path.exists(args.misname_file):
            pass ## TODO: stub; the name-correction spreadsheet is located but not yet applied
    if args.read_dir or args.assembly_dir or args.MiSeq_dir:
        if args.read_dir:
            listReadFilesWithNames(args.read_dir,outfile = read_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        if args.assembly_dir:
            listGenomeFilesWithNames(args.assembly_dir,outfile = ass_out, deep_search = True, verbose = False)
        if args.MiSeq_dir:
            listReadsFromMiSeqToplevel(args.MiSeq_dir, outfile=Mi_out, read_extension=read_ext, verbose=False, doAssignReadSets=False)
    else:
        print("\nStarting BML MiSeq reads...")
        df = listReadsFromMiSeqToplevel(BML_read_locations, outfile=Mi_out, read_extension=read_ext, verbose=True, doAssignReadSets=False)
        print("\tReported {} records".format(len(df)))
        print("Starting BCFB reads...")
        df = listReadFilesWithNames(default_read_dir,outfile = read_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        print("\tReported {} records".format(len(df)))        
        print("\nStarting BCFB assemblies...")
        df = listGenomeFilesWithNames(default_ass_dir,outfile = ass_out, deep_search = True, verbose = False)
        print("\tReported {} records".format(len(df)))

        
        ##Get the NCBS stuff
        print("Starting BCFB mirrored reads...") ##Note, this contains some files that were deleted from our main data directory
        NCBS_raw = listReadFilesWithNames(default_read_mirror,outfile = mirror_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        print("\tReported {} mirrored reads".format(len(NCBS_raw)))
Example #2
def __init__(self,
             out_directory,
             file_identifiers,
             output_basename='alleles.fasta'):
    assert isinstance(out_directory, str)
    assert isinstance(file_identifiers, set)
    self.directory = utilities.safeMakeOutputFolder(
        out_directory)  ##Tacks on timestamp // safeMakeDir does not
    self.ids = file_identifiers
    self.basename = output_basename
    self.sequence_files = dict()
    for locus in self.ids:
        filename = os.path.join(self.directory,
                                '{}_{}'.format(locus, self.basename))
        self.sequence_files[locus] = utilities.checkForOverwrite(
            filename)  ## Will not overwrite file
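A usage sketch for the constructor above. The snippet omits the enclosing class, so `AlleleFileSet` is a hypothetical name introduced here for illustration:

## 'AlleleFileSet' is a hypothetical name for the class owning the __init__ above.
files = AlleleFileSet('allele_run', {'abcZ', 'adk', 'aroE'})
for locus, path in files.sequence_files.items():
    print('{} alleles will be written to {}'.format(locus, path))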
Example #3
def __init__(self,primer_file,working_dir=None,generate_output=False):
    ###  Make writable directories
    if working_dir is None:
        working_dir = os.getcwd()
    ##utilities.safeMakeOutputFolder(os.path.join(working_dir,'AmpExtTemp'))
    self.generate_output = generate_output
    self.primers_dict = read_file_to_dict(primer_file)
    if generate_output:
        self.outDir = utilities.safeMakeOutputFolder(os.path.join(working_dir,'AmpliconExtractor'))
        self.sequence_files = {locus: os.path.join(self.outDir,'{}_primer-extracted_sequences.fasta'.format(locus)) for locus in self.primers_dict.keys()}
        self.amplicon_info_file = os.path.join(self.outDir,'amplicon_information.tab')
        self.tempDirObj = tempfile.TemporaryDirectory(suffix='_AmpExt', prefix='tmp', dir=self.outDir)
    else:
        self.outDir = self.sequence_files = self.amplicon_info_file = None
        self.tempDirObj = tempfile.TemporaryDirectory(suffix='_AmpExt', prefix='tmp', dir=working_dir)

    self.amplicon_info_list = []
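The `tempfile.TemporaryDirectory` object above is stored on `self` deliberately: the directory is removed as soon as the object is garbage-collected. A minimal standalone demonstration of that lifetime:

import os
import tempfile

tmp = tempfile.TemporaryDirectory(suffix='_AmpExt', prefix='tmp')
print(os.path.isdir(tmp.name))  ## True: directory exists while the object is alive
tmp.cleanup()                   ## explicit removal; also happens when tmp is garbage-collected
print(os.path.isdir(tmp.name))  ## False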
Example #4
def multiple(multi_args):
    if multi_args.force and multi_args.resume:
        print(
            "Exiting: the options 'force' and 'resume' are incompatible. Use only 'force' if you want to overwrite prior files."
        )
        return 1
    output_dir = multi_args.output if multi_args.output else utilities.safeMakeOutputFolder(
        _outputBase)
    utilities.safeMakeDir(output_dir)
    logFile = os.path.join(output_dir, "AssemblyCleanup.log")
    resultFile = os.path.join(output_dir, "AssemblyCleanupTable.tab")
    tempFile = utilities.appendToFilename(resultFile, '_temp')
    sys.stdout = utilities.Logger(logFile)
    assembler_name = None if multi_args.assembler is None else multi_args.assembler.lower()
    print("Parameters:")
    for k, v in vars(multi_args).items():
        print('{} : {}'.format(k, v))
    draft_location = multi_args.draft_location
    if os.path.isfile(draft_location):
        guideFrame = pd.read_table(draft_location)
        print('Loaded guide table from ' + draft_location)
        print("\t table contains {} records".format(len(guideFrame)))
    elif os.path.isdir(draft_location):
        print("Searching for files in " + os.path.abspath(draft_location))
        deep_search = False if multi_args.shallow_search_assemblies else True
        guideFrame = NGS_data_utilities.listGenomeFilesWithNames(
            draft_location,
            deep_search=deep_search,
            extension=multi_args.extension)
        if guideFrame is None or (len(guideFrame) == 0):  ##Check before using the frame below
            print("Exiting. Failed to retrieve any files")
            return 1
        ##Exclude reads
        size_limit = multi_args.size_limit
        if size_limit > 0:
            guideFrame['filesize'] = guideFrame.Filename.apply(os.path.getsize)
            small_enough = (guideFrame.filesize <= size_limit)
            if sum(small_enough) < len(guideFrame):
                print('Only {} of {} files pass the upper size limit of {}'.format(
                    sum(small_enough), len(guideFrame), size_limit))
                guideFrame = guideFrame[small_enough].copy()
        guideFrame = guideFrame[NGS_data_utilities.dfHeaders].copy()
        if assembler_name:
            guideFrame['assembler'] = assembler_name
            print('assigned assembler to be ' + assembler_name)
        else:  #This is not passed to AssemblyStats
            for i in guideFrame.index:
                if 'spades' in guideFrame.loc[i, 'Filename'].lower():
                    guideFrame.loc[i, 'assembler'] = 'spades'
                    print('assigned assembler to be spades for {}'.format(
                        guideFrame.loc[i, 'Lab_ID']))
        print('Calculating raw stats...')
        assemblyStats = AssemblyStats.calculateStats(
            guideFrame.Filename.tolist(),
            ass_format=assembler_name,
            image_dir=output_dir
        )  ##This will independently infer assembler from name unless given
        if assemblyStats is None or len(assemblyStats) == 0:  ##Check before the astype below
            print("Exiting. Failed to calculate assembly stats on input")
            return 1
        assemblyStats['Contig_Count'] = assemblyStats['Contig_Count'].astype(int)
        guideFrame = pd.merge(
            guideFrame, assemblyStats, how='left'
        )  ##Should merge on Filename. Don't want confusion if they share other fields
        if multi_args.BCFB_PacBio_Name:
            print('interpreting BCFB PacBio names...')
            for i in guideFrame.index:
                guideFrame.loc[i, 'Gaps'] = '.ro1m.' not in guideFrame.loc[i, 'Filename']
        else:
            guideFrame['Gaps'] = True  ### Assume no closed genomes unless stated
    else:
        print("Exiting. Unable to find the location of draft files: {}".format(
            draft_location))
        return 1
    print('Loaded data...')
    process = None
    if multi_args.reorient:
        process = 'RO'
    elif multi_args.discard:
        process = 'DIS'
    elif multi_args.discard_then_reorient:
        process = 'DIS_RO'
    else:
        print("Exiting. No processing specified")
        return 1
    expectedArgs = set(['working_dir', 'report_file', 'assembler'])
    # circle_new_start=None,reverse_contig=None,closed_circle=None,broken_circle=None,circularize_with_Ns=0,
    #                     length=250,coverage=10,report_file=None,reference=None,assembler=None
    if 'RO' in process:
        expectedArgs.update(RO_argset)
        if not os.path.isfile(multi_args.reference):
            print("Cannot find reference file. Exiting")
            return 1
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)

    tag = multi_args.tag if multi_args.tag else process
    print('Result files will have the tag "{}"'.format(tag))

    ##TODO test columns here

    permitted_fields = req_fields + list(expectedArgs)
    keep_fields = [x for x in guideFrame.columns if x in permitted_fields]
    parameterFrame = guideFrame[keep_fields].copy()
    if len(parameterFrame) == 0:
        return 1  ##Failure
    fail_list = []
    for i, row in parameterFrame.iterrows():  ##Row gets converted to keyword arguments; shares index with guideFrame
        assembly_file = row['Filename']
        if not os.path.isfile(assembly_file):
            print("Error: unable to find file: {}".format(assembly_file))
            output_file = 'error'
        else:
            print("Working on " + os.path.basename(assembly_file))
            print("\tat  {}".format(time.ctime()))
            del row['Filename']
            if 'Contig_Count' in row.index:
                if (str(row['Contig_Count']) == str(1)):
                    gaps = row['Gaps']
                    gap_bool = True  ##Safest default (will introduce contig breaks). But should probably skip reorientation
                    if isinstance(gaps, str):
                        if gaps.upper() == 'TRUE':
                            gap_bool = True
                        elif gaps.upper() == 'FALSE':
                            gap_bool = False
                        else:
                            print("unable to interpret 'gaps' notation: {}".
                                  format(gaps))
                            continue
                    elif isinstance(gaps, bool):
                        gap_bool = gaps
                    else:
                        print("unable to interpret 'gaps' notation: {}".format(
                            gaps))
                        continue
                    if gap_bool:
                        row['broken_circle'] = True  ##NOTE: with our bacteria, we assume circle
                    else:
                        row['closed_circle'] = True

                del row['Gaps']

            assembly_basename = utilities.appendToFilename(
                os.path.basename(assembly_file), '_' + tag)
            output_file = os.path.join(output_dir, assembly_basename)

            report_file = os.path.join(
                output_dir, os.path.basename(assembly_file)) + '.report.txt'
            has_out = os.path.isfile(output_file)
            has_rpt = os.path.isfile(report_file)
            if has_out or has_rpt:
                if multi_args.force:
                    if has_out:
                        print("Removing pre-existing file: {}".format(output_file))
                        os.remove(output_file)
                    if has_rpt:
                        print("Removing pre-existing file: {}".format(
                            report_file))
                        os.remove(report_file)
                else:
                    if not multi_args.resume:
                        print(
                            "Error: Refusing to overwrite pre-existing output files: \n\t{}\n\t{}"
                            .format(output_file, report_file))
                    continue
            try:
                open(output_file, 'a').close()
                os.remove(output_file)
            except IOError:
                print(
                    "Error. Do not have permission to write to output file \n\t{}"
                    .format(output_file))
                continue

            cleanup_args = vars(multi_args).copy()  ##TODO: put this up front?
            cleanup_args.update(row.to_dict())
            cleanup_args['working_dir'] = os.path.join(output_dir, 'work')
            cleanup_args = {
                k: v
                for k, v in cleanup_args.items() if k in expectedArgs
            }
            if 'Mean_Coverage' in row.index:
                proportion_cutoff = multi_args.coverage_proportion * row.loc[
                    'Mean_Coverage']
                min_coverage = max(multi_args.coverage, proportion_cutoff)
                cleanup_args['coverage'] = min_coverage
                del cleanup_args['Mean_Coverage']
            else:
                cleanup_args[
                    'coverage'] = multi_args.coverage  ##This should actually be irrelevant --
            try:
                print("Arguments:")
                print(cleanup_args)
                if cleanupAndWrite(assembly_file,
                                   output_file,
                                   report_file=report_file,
                                   **cleanup_args) != 0:  ##TODO: return stats
                    output_file = 'error'
                    fail_list.append(assembly_file)
            except Exception as e:
                fail_list.append(assembly_file)
                output_file = 'error'
                warn = "Exception on cleanupAndWrite:"
                utilities.printExceptionDetails(e, warn)
            print()  ##Blank line
        guideFrame.loc[i, 'CleanedFile'] = output_file
        guideFrame.to_csv(tempFile, index=False, sep='\t')
    print("Errors on {} files: ".format(len(fail_list)))
    print("\n\t".join(fail_list))
    if process in ['DIS',
                   'DIS_RO']:  ##recalculate stats for filtered contig sets
        assemblyStats2 = AssemblyStats.calculateStats(
            guideFrame.CleanedFile.tolist(), ass_format=assembler_name)
        if assemblyStats2 is not None:
            #             assemblyStats2.rename(columns={'Filename':'CleanedFile'},inplace=True)
            guideFrame = AssemblyStats.BeforeAndAfter(
                guideFrame.set_index("CleanedFile"),
                assemblyStats2.set_index('Filename'))
#             guideFrame = pd.merge(guideFrame,assemblyStats2,on='CleanedFile',suffixes=('_raw',''),how='outer')
    print("Reporting stats for {} genomes.".format(len(guideFrame)))
    guideFrame.fillna('N/A', inplace=True)
    utilities.safeOverwriteTable(resultFile, guideFrame, 'tab', index=False)
    return 0
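The inline 'Gaps' handling above (real booleans mixed with 'TRUE'/'FALSE' strings from the guide table) could be factored into a helper. A sketch of the same normalization; `parse_gaps_flag` is a name introduced here for illustration:

def parse_gaps_flag(gaps):
    ## Mirrors the inline logic in multiple(): bools pass through,
    ## 'TRUE'/'FALSE' strings (any case) are converted, anything else is None.
    if isinstance(gaps, bool):
        return gaps
    if isinstance(gaps, str):
        if gaps.upper() == 'TRUE':
            return True
        if gaps.upper() == 'FALSE':
            return False
    return None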
Example #5
def single(args):
    assembly_file = args.assembly
    if not os.path.isfile(assembly_file):
        print("Exiting. Unable to find file {}".format(assembly_file))
        return 1
#     assembly_format,assembly_compressed = utilities.guessFileFormat(assembly_file)
    if args.output:
        output_file = args.output
        output_dir = os.path.dirname(output_file)
    else:
        output_dir = utilities.safeMakeOutputFolder(_outputBase)
        basename = utilities.appendToFilename(os.path.basename(assembly_file),
                                              '_RO')
        output_file = os.path.join(output_dir, basename)
    logFile = os.path.join(output_dir, "AssemblyCleanup.log")
    sys.stdout = utilities.Logger(logFile)
    print(_outputBase)
    report_file = os.path.join(output_dir,
                               os.path.basename(assembly_file)) + '.report.txt'
    has_out = os.path.isfile(output_file)
    has_rpt = os.path.isfile(report_file)
    if has_out or has_rpt:
        if args.force:
            if has_out:
                print("Removing prexisting file: {}".format(output_file))
                os.remove(output_file)
            if has_rpt:
                print("Removing pre-existing file: {}".format(report_file))
                os.remove(report_file)
        else:
            print(
                "Exiting. Refusing to overwrite pre-existing output files: \n\t{}\n\t{}"
                .format(output_file, report_file))
            return 1

    try:
        open(output_file, 'a').close()
    except IOError:
        print("Exiting. Do not have permission to write to output file")
        return 1

###########Should probably be a method
    process = None
    if args.reorient:
        process = 'RO'
    elif args.discard:
        process = 'DIS'
    elif args.discard_then_reorient:
        process = 'DIS_RO'
    else:
        print("Exiting. No processing specified")
        return 1
    expectedArgs = set(['working_dir', 'report_file'])
    # circle_new_start=None,reverse_contig=None,closed_circle=None,broken_circle=None,circularize_with_Ns=0,
    #                     length=250,coverage=10,report_file=None,reference=None,assembler=None
    if 'RO' in process:
        expectedArgs.update(RO_argset)
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)

    cleanup_args = vars(args)
    cleanup_args = {k: v for k, v in cleanup_args.items() if k in expectedArgs}
    return cleanupAndWrite(assembly_file,
                           output_file,
                           report_file=report_file,
                           **cleanup_args)
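`sys.stdout = utilities.Logger(logFile)` appears in several of these examples, but `Logger` itself is not shown. A plausible tee implementation consistent with that usage; the real class may differ:

import sys

class Logger(object):
    ## Assumed behavior: echo everything written to stdout into a log file as well.
    def __init__(self, filename):
        self.terminal = sys.__stdout__
        self.log = open(filename, 'a')
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
    def flush(self):
        self.terminal.flush()
        self.log.flush()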
Example #6
def reorientClosedChromosome(raw_contig,
                             reference_file,
                             N_padding=-1,
                             working_dir=None,
                             set_steps=5,
                             set_window=5000):
    temp_dir = None
    if isinstance(working_dir, str):
        try:
            utilities.safeMakeDir(working_dir)
            temp_dir = working_dir
        except IOError:
            pass  ##Leave temp_dir as None
    if temp_dir is None:
        temp_dir = utilities.safeMakeOutputFolder('AssemblyCleanup_temp_')
    ## Setup blast database for the sequences you are searching against
    raw_contig_dict = SeqIO.to_dict(raw_contig)
    raw_contig_file = os.path.join(
        temp_dir, utilities.makeSafeName("-".join(raw_contig_dict.keys())))
    SeqIO.write(raw_contig, raw_contig_file, 'fasta')
    db_name = os.path.join(temp_dir, os.path.basename(raw_contig_file))
    BLASThelpers.makeblastdb(raw_contig_file)
    ##Get several chunks near the beginning of the reference file, as query
    ref_seqs = seq_utilities.seqs_guess_and_parse2list(reference_file)
    for rs in ref_seqs:
        rename = re.sub(r'\W', '_', rs.name)
        steps = set_steps
        window = set_window
        expected_search_length = steps * window - 1
        if len(rs) < expected_search_length:
            steps = len(rs) // window
            if steps == 0:
                steps = 1
                window = len(rs)
            search_length = steps * window
            print(
                "Reference sequence is only {}bp; dropping search sequence from {} to {}"
                .format(len(rs), expected_search_length, search_length))
        else:
            search_length = expected_search_length
        SearchWindows = []
        for i in range(0, search_length, window):
            end_base = i + window
            contig = rs[i:end_base]
            contig.id = 'fragment_{}_to_{}'.format(i, end_base)
            SearchWindows.append(contig)
        query_basename = rename + '_WindowsQuery.fasta'
        #         re.sub('[^\w\s-]', '', value).strip().lower())
        query_filename = os.path.join(temp_dir, query_basename)
        with open(query_filename, 'wt') as seq_out:
            SeqIO.write(SearchWindows, seq_out, 'fasta')
        ##Run BLAST
        outfile = os.path.join(temp_dir,
                               rename + '_' + os.path.basename(db_name))
        ##Note: may need to have "high stringency" and "low stringency" options. This is low stringency (for mapping to distant relatives). High stringency would increase perc_identity here and the qcovs filtering of "results"
        blast_cline = NcbiblastnCommandline(query=shlex.quote(query_filename),
                                            db=shlex.quote(db_name),
                                            outfmt=_outfmt_str,
                                            out=shlex.quote(outfile),
                                            evalue=1E-100,
                                            perc_identity=80,
                                            qcov_hsp_perc=25,
                                            num_threads=2)
        stdout = stderr = None
        try:
            stdout, stderr = blast_cline()
        except Exception as e:
            print("Blast failed on {} with {}...output below...".format(
                rename, reference_file))
            print("\t{}".format(stdout))
            print("\t{}".format(stderr))
            print(e)
            raise
        results = pd.read_table(outfile,
                                names=_outfmt_head)  #No headers in file
        results = results[results[bh['qcovs']] > 50].sort_values(
            bh['bitscore'],
            ascending=False)  ##Should already be sorted by bitscore
        full_start = full_end = 0  ##BLAST uses a 1 index
        first_hit = None
        coherent_fragments = 0
        for w in SearchWindows:
            window_hits = results[results[bh['qseqid']] == w.id]
            if len(window_hits) > 0:
                hit = window_hits.iloc[0]
                start = hit[bh['sstart']]
                end = hit[bh['send']]
                contig = hit[bh['sseqid']]
                forward = start < end
                if first_hit is None:  ##Serves as a sign that there was no prior hit
                    first_hit = hit
                    hit_contig = contig
                    full_start = start
                    full_end = end
                    full_forward = forward
                else:  ##Check that it is consistent with prior
                    in_order = (contig == hit_contig)
                    in_order &= full_forward == forward
                    in_order &= (full_end < start) == full_forward
                    in_order &= abs(end - full_end) < 2 * window
                    if in_order:
                        full_end = end
                        coherent_fragments += 1
                    else:
                        print("Warning: Contig {} is not in order. \nStopping".
                              format(w.id))
                        break  #For search windows
            else:
                print("Warning: Failed to find a match to fragment {}".format(
                    w.id))
                if coherent_fragments > 0:
                    print('Stopping since we have an anchor already')
                    break  #For search windows
        if coherent_fragments > 0:
            print("Shifting contig {} ({} bp)".format(
                hit_contig, len(raw_contig_dict[hit_contig])))
            new_contigs = shiftCirclarChromosome(raw_contig_dict[hit_contig],
                                                 full_start, not full_forward,
                                                 N_padding)
            del raw_contig_dict[hit_contig]
            for new_contig in new_contigs:
                raw_contig_dict[new_contig.id] = new_contig
                assert len(
                    new_contig
                ) != 0, 'Contig with length 0. Aborting. Contact developer'
            print('Rotating contig: {}'.format(hit_contig))
            print('Starting at {}'.format(full_start))
            if full_forward:
                print("keeping orientation")
            else:
                print("Reverse complement")
        else:
            print(
                'Aborting: Failed to identify the start position based on the reference genome.'
            )
            print(
                "\t Reorient contig by specifying args.circle_new_start and/or args.reverse_contig"
            )
            blast_results = outfile + '.tab'
            print('\t Saving BLAST results at ' + blast_results)
            results.to_csv(blast_results, sep='\t')
            return None
    return list(raw_contig_dict.values())
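`shiftCirclarChromosome` is called above but not defined in this excerpt. A simplified sketch of the rotation it presumably performs on a closed circular contig; the name, list return, and exact semantics (e.g. N padding, contig breaking) are assumptions:

from Bio.SeqRecord import SeqRecord

def shift_circular(record, new_start, reverse=False):
    ## Re-origin a circular sequence at new_start (1-based, as in BLAST output)
    ## and optionally reverse-complement it. Simplified: ignores N padding.
    rotated = record.seq[new_start - 1:] + record.seq[:new_start - 1]
    if reverse:
        rotated = rotated.reverse_complement()
    return [SeqRecord(rotated, id=record.id, description='')]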
Example #7
def main():
    parser = argparse.ArgumentParser(
        description=
        'A program to perform batched Mauve contig reordering (and someday more)'
    )
    ### general info
    parser.add_argument('--version',
                        '-V',
                        action='version',
                        version='%(prog)s {}.{}'.format(
                            SCRIPT_VERSION, SCRIPT_SUBVERSION))
    #     parser.add_argument('-p','--projectID',help='Provide an identifier that will be added to output directory and data table')
    #     parser.add_argument('-s','--setting_dir',help='Location of setting files')
    #     parser.add_argument('-m','--min_cov',help='Alternate minimum coverage',default=0.8,type=float)

    #     parser.add_argument('--debug',action='store_true',help="Preserve intermediate files and do not update reference files")

    ### controls
    parser.add_argument(
        '--find_mauve',
        action='store_true',
        help="Search for known Mauve directories if it is not on the path")
    parser.add_argument(
        '--search_subdirectories',
        action='store_true',
        help=
        "Search for draft genome files in subdirectories of specified folder")
    parser.add_argument(
        '-wd',
        '--working_directory',
        help=
        'Working directory for Mauve to align assemblies. Will make a new subdirectory if not specified, starting with: '
        + _outputBase)
    parser.add_argument(
        '-rd',
        '--result_directory',
        help=
        'Result directory for reoriented assemblies. Will use top level of working directory if not specified.'
    )

    ### required
    parser.add_argument('draft_dir', help='Location of draft assemblies')
    parser.add_argument('reference_genome',
                        help='Reference genome file to orient towards')

    args = parser.parse_args()

    assert os.path.isdir(args.draft_dir), "Draft_dir is not a directory"
    assert os.path.isfile(
        args.reference_genome), "Reference genome file does not exist"
    working_dir = os.path.abspath(
        args.working_directory
    ) if args.working_directory else utilities.safeMakeOutputFolder(
        _outputBase)
    result_dir = os.path.abspath(
        args.result_directory) if args.result_directory else working_dir

    mh = MauveHelper(args.find_mauve)
    if (mh.mauve_dir is None):
        sys.exit("Cannot Find the Mauve path")
    elif not os.path.isfile(mh.mauve_dir + mauve_jar):
        sys.exit("Cannot Find the Mauve jar file. Searched on this path: " +
                 mh.mauve_dir)
    else:
        reorder_stats = []
        draft_genomes = NGS_data_utilities.listGenomeFilesWithNames(
            os.path.abspath(args.draft_dir), None, args.search_subdirectories,
            True)
        if len(draft_genomes) > 0:
            for draft_file in draft_genomes['Filename']:
                print('starting with {}'.format(draft_file))
                reorder_stats.append(
                    mh.reorder_contigs(os.path.abspath(args.reference_genome),
                                       draft_file, working_dir, result_dir))
        else:
            print("Found no genomes. Exiting")
    try:
        statTable = pd.DataFrame(reorder_stats)
        statTable.to_csv(os.path.join(result_dir, "reorderStats.tab"),
                         sep='\t',
                         index=False)
    except Exception as e:
        print("Failure to save statistics...")
        print(e)
Example #8
def main():
    epi_text = ('If no NGS data directory is given, will automatically scan the following directories recursively: \n' +
                '\tread directory: {}'.format(default_read_dir) +
                '\n\tassembly directory: {}'.format(default_ass_dir)+
                '\n\tMiSeq directories: {}'.format(','.join(BML_read_locations)))

    parser = argparse.ArgumentParser(description='A program to collect information about NGS data files',epilog=epi_text)
    ### general info
    parser.add_argument('--version','-V',action='version',version='%(prog)s {}.{}'.format(SCRIPT_VERSION,SCRIPT_SUBVERSION))
#     parser.add_argument('--debug',action='store_true',help="Preserve intermediate files and do not update reference files")

    ### controls

    ### required
    parser.add_argument('--assembly_dir','-ad',help='Directory with assemblies')
    parser.add_argument('--read_dir','-rd',help='Directory with reads')
    parser.add_argument('--out_dir','-od',help='Output directory')   
    parser.add_argument('--MiSeq_dir','-md',help='Directory with MiSeq reads and sample sheets')
    parser.add_argument('--misname_file','-mf',help='Excel spreadsheet with name corrections',default=default_misname_file,type=str)
#     parser.add_argument('')   
    args = parser.parse_args()
    out_dir = args.out_dir if args.out_dir else utilities.safeMakeOutputFolder(_outputBase)
    
    logFile = os.path.join(out_dir,default_logfile)
    print("LogFile is : "+logFile)
    sys.stdout = utilities.Logger(logFile)    
    print(_outputBase)
    
    ass_out = os.path.join(out_dir,assemblies_file)
    read_out = os.path.join(out_dir,reads_file)
    Mi_out = os.path.join(out_dir,MiSeq_files)
    mirror_out = os.path.join(out_dir,mirrored_reads)
    motif_out = os.path.join(out_dir,'Motif_Extra.txt')
#     NCBS_out = os.path.join(out_dir,NCBS_processed)
    if isinstance(args.misname_file,str):
        if os.path.exists(args.misname_file):
            pass ## TODO: stub; the name-correction spreadsheet is located but not yet applied
    if args.read_dir or args.assembly_dir or args.MiSeq_dir:
        if args.read_dir:
            listReadFilesWithNames(args.read_dir,outfile = read_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        if args.assembly_dir:
            listGenomeFilesWithNames(args.assembly_dir,outfile = ass_out, deep_search = True, verbose = False)
        if args.MiSeq_dir:
            listReadsFromMiSeqToplevel(args.MiSeq_dir, outfile=Mi_out, read_extension=read_ext, verbose=False, doAssignReadSets=False)
    else:
        print("\nStarting BML MiSeq reads...")
        df = listReadsFromMiSeqToplevel(BML_read_locations, outfile=Mi_out, read_extension=read_ext, verbose=True, doAssignReadSets=False)
        print("\tReported {} records".format(len(df)))
        print("Starting BCFB reads...")
        df = listReadFilesWithNames(default_read_dir,outfile = read_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        print("\tReported {} records".format(len(df))) 
        ##Assemblies and associated files       
        print("\nStarting BCFB assemblies...")
        df = listGenomeFilesWithNames(default_ass_dir,outfile = ass_out, deep_search = True, verbose = False)
        print("\tReported {} records".format(len(df)))
        motif_base = df.Filename.str.replace(r'\.fasta$', '', regex=True)  ##str.rstrip would strip a character set, not the suffix
        for i in motif_base.index:
            f = motif_base[i]
            matches = glob.glob(f+"*"+motif_ext)
            if len(matches) > 1:
                print("Warning, found multiple motif files for {}. Selecting first one by glob.".format(f))
            elif len(matches) == 1:
                motif_base[i] = matches[0]
#         motif_files = df.Filename.str.rstrip('.fasta') + motif_ext
        motif_exists = motif_base.apply(os.path.isfile)
        print("\tIdentified {} associated motif files".format(sum(motif_exists)))
        if sum(motif_exists) > 0:
            df.loc[motif_exists,'Motif_Data'] = motif_base.loc[motif_exists]
        basepath = df.Filename.str.extract(r'(^.*\/[^.]*\.[^.]*)[^/]', expand=False)  ##expand=False keeps this a Series
        summary_files = basepath + '.summary'
        summary_exists = summary_files.apply(os.path.isfile)
        if sum(summary_exists) > 0:
            df.loc[summary_exists,'BLAST_summary'] = summary_files.loc[summary_exists]
        circlator_files = basepath + '.circlator.json'
        circlator_exists = circlator_files.apply(os.path.isfile)
        if sum(circlator_exists) > 0:
            df.loc[circlator_exists,'Circulator'] = circlator_files.loc[circlator_exists]
        df.to_csv(ass_out,sep='\t',index=False) ## overwrite file from listGenomeFilesWithNames
        ##Get all motif for comparison
        motif_list = []
        for rootdir, _, files in os.walk(default_ass_dir):
            motif_list += [os.path.join(rootdir,x) for x in files if x.endswith(motif_ext)] 
        known_motifs = df['Motif_Data'].tolist() if 'Motif_Data' in df.columns else []
        extra_motif = [x for x in motif_list if x not in known_motifs]
        if len(extra_motif) > 0:
            print("Found extra motif files. Saving list to {}".format(motif_out))
            with open(motif_out,'wt') as fout:
                for f in extra_motif:                   
                    print(f,file=fout)
        
        ##Get the NCBS stuff
        print("Starting BCFB mirrored reads...") ##Note, this contains some files that were deleted from our main data directory
        NCBS_raw = listReadFilesWithNames(default_read_mirror,outfile = mirror_out,read_extension=read_ext,verbose=False,doAssignReadSets=True)
        print("\tReported {} mirrored reads".format(len(NCBS_raw)))