Exemplo n.º 1
0
def make_VDJtools_dir(
                      indir,
                      outdir=False,
                      VDJout_dir=None,
                      genes=False,
                      emptycols=None,
                      overwrite=False,
                      filetype='fastq',
                      ):
    """Run ``reptools.make_VDJtools`` on every matching file in ``indir``.

    Each input file ``<name><ext>`` yields ``<name>.tab`` in the output
    directory.

    Args:
        indir: Directory scanned (non-recursively) for input files.
        outdir: Parent directory handed to ``reptools.build_path``; falls
            back to ``indir`` when falsy.
        VDJout_dir: Explicit output directory, handed to
            ``reptools.build_path`` together with the default name
            ``'VDJtools'``.
        genes: Gene mapping forwarded to ``reptools.make_VDJtools``; when
            falsy, ``{'V': 'V', 'J': 'J', 'C': 'C'}`` is used.
        emptycols: Forwarded to ``reptools.make_VDJtools``; ``None`` means
            ``['D']``.
        overwrite: When true, the output directory is removed recursively
            before being recreated.
        filetype: Given to ``reptools.select_filetypes`` to obtain the
            accepted extensions, and forwarded to
            ``reptools.make_VDJtools``.

    Returns:
        The output directory path, or ``None`` when no input file matched.
    """
    if not genes:
        genes = {'V': 'V', 'J': 'J', 'C': 'C'}
    # A fresh list per call: a list literal in the signature would be shared
    # between calls and could be mutated by the callee.
    if emptycols is None:
        emptycols = ['D']
    if not outdir:
        outdir = indir

    VDJout_dir, _ = reptools.build_path(True, VDJout_dir, 'VDJtools', outdir)

    filetypes = reptools.select_filetypes(filetype)
    typefiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn)[1] in filetypes
    ]
    if not typefiles:
        print('No files of specified type found.\n')
        return

    if overwrite:
        reptools.remove_dir(VDJout_dir, recursive=True)
    reptools.cautious_mkdir(VDJout_dir)

    for fn in typefiles:
        outfn = os.path.splitext(fn)[0] + '.tab'
        reptools.make_VDJtools(
            os.path.join(indir, fn),
            os.path.join(VDJout_dir, outfn),
            genes,
            emptycols,
            filetype,
        )

    # Returned for parity with the other *_dir helpers, which hand back the
    # directory they wrote to.
    return VDJout_dir
Exemplo n.º 2
0
def denoise_dir(
        indir,
        outdir=False,
        weight_by_qual=True,
        threshold=10,
        indel_threshold=100,
        FASTQout_dir=None,
        subs=True,
        indels=True,
        deambig=True,
        filetype='fastq',
        overwrite=False):
    """Run ``reptools.denoise_file`` on every matching file in ``indir``.

    Only FASTQ output is produced; FASTA output is switched off in this
    wrapper (``FASTAout=False`` is always passed to ``denoise_file``).

    Args:
        indir: Directory scanned (non-recursively) for input files.
            Extensions are compared case-insensitively.
        outdir: Parent directory handed to ``reptools.build_path``; falls
            back to ``indir`` when falsy.
        weight_by_qual, threshold, indel_threshold, subs, indels, deambig:
            Forwarded unchanged to ``reptools.denoise_file``.
        FASTQout_dir: Explicit output directory, handed to
            ``reptools.build_path`` together with the default name
            ``'denoisedCDR3'``.
        filetype: Given to ``reptools.select_filetypes`` to obtain the
            accepted extensions.
        overwrite: Forwarded to ``reptools.cautious_mkdir`` and to
            ``reptools.denoise_file``.

    Returns:
        The FASTQ output directory, or ``None`` when no input file matched.
    """
    filetypes = reptools.select_filetypes(filetype)
    infiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn.lower())[1] in filetypes
    ]

    if not infiles:
        print('No fastq files found.\n')
        return

    if not outdir:
        outdir = indir

    FASTQout_dir, _ = reptools.build_path(True, FASTQout_dir, 'denoisedCDR3',
                                          outdir)

    # Make the output directory, deleting pre-existing data if overwrite is
    # set. Called as reptools.cautious_mkdir, matching every other call site.
    if FASTQout_dir:
        reptools.cautious_mkdir(FASTQout_dir, overwrite=overwrite)

    for fn in infiles:
        FASTQout = reptools.make_unpaired_filepaths(FASTQout_dir,
                                                    os.path.splitext(fn)[0])
        # No FASTA path is built here: FASTA output is disabled, so the path
        # was computed and never used.
        reptools.denoise_file(os.path.join(indir, fn),
                              weight_by_qual=weight_by_qual,
                              threshold=threshold,
                              indel_threshold=indel_threshold,
                              FASTQout=True,
                              FASTAout=False,
                              FASTQout_fn=FASTQout,
                              change_logs=False,
                              subs=subs,
                              indels=indels,
                              deambig=deambig,
                              overwrite=overwrite)

    return FASTQout_dir
Exemplo n.º 3
0
def EEfilter_dir(indir,
                 outdir=False,
                 FASTQout_dir=None,
                 FASTAout_dir=False,
                 maxee=1,
                 overwrite=False,
                 filetype='fastq'):
    """Apply ``reptools.EEfilter_file`` to every matching file in ``indir``.

    Args:
        indir: Directory scanned (non-recursively) for input files.
            Extensions are compared case-insensitively.
        outdir: Parent directory handed to ``reptools.build_path``; falls
            back to ``indir`` when falsy.
        FASTQout_dir: Explicit FASTQ output directory, handed to
            ``reptools.build_path`` together with the default name
            ``'EEfilteredCDR3'``.
        FASTAout_dir: Accepted for interface compatibility but currently
            ignored: it is reset to ``False`` before use.
        maxee: Forwarded to ``reptools.EEfilter_file``.
        overwrite: Forwarded to ``reptools.cautious_mkdir``.
        filetype: Given to ``reptools.select_filetypes`` to obtain the
            accepted extensions.

    Returns:
        The FASTQ output directory, or ``None`` when no input file matched.

    Raises:
        ValueError: If neither output directory is set after resolution.
    """
    if not outdir:
        outdir = indir

    FASTQout_dir, _ignored = reptools.build_path(
        True, FASTQout_dir, 'EEfilteredCDR3', outdir)
    # FASTA output is switched off here regardless of the caller's argument.
    FASTAout_dir = False

    if not (FASTQout_dir or FASTAout_dir):
        raise ValueError(
            'Please supply one or both of FASTQout_dir and FASTAout_dir to EEfilter_dir()'
        )

    # Output directories are created before the input scan, so they exist
    # even when no input file is found.
    for target in (FASTQout_dir, FASTAout_dir):
        if not target:
            continue
        reptools.cautious_mkdir(target, overwrite=overwrite)

    accepted = reptools.select_filetypes(filetype)
    infiles = []
    for entry in os.listdir(indir):
        extension = os.path.splitext(entry.lower())[1]
        if extension in accepted:
            infiles.append(entry)

    if not infiles:
        print('No fastq files found.\n')
        return

    for entry in infiles:
        stem = os.path.splitext(entry)[0]
        fastq_path = reptools.make_unpaired_filepaths(FASTQout_dir, stem)
        fasta_path = reptools.make_unpaired_filepaths(FASTAout_dir, stem,
                                                      'fas')
        reptools.EEfilter_file(os.path.join(indir, entry),
                               FASTQout=fastq_path,
                               FASTAout=fasta_path,
                               maxee=maxee)

    return FASTQout_dir
Exemplo n.º 4
0
def counts_csv(csvfile,paireddirs=(),unpaireddirs=(),pairsuffixes=('_1','_2'),filetype=None,overwrite=False,basename=False):
    """Write a CSV table of per-sample sequence counts, one column per directory.

    The first column holds the sample root; each further column holds the
    count found for that root in one directory (``0`` when the directory has
    no file for it). Directories appear in the order ``paireddirs`` then
    ``unpaireddirs``.

    Args:
        csvfile: Path of the CSV file to write.
        paireddirs: Directories of paired files. The root of a file is its
            stem with the last ``len(pairsuffixes[0])`` characters removed
            (the stem is not checked to actually end with a suffix). Counts
            of all files sharing a root are summed.
        unpaireddirs: Directories of unpaired files; the root is the stem.
        pairsuffixes: Pair suffixes; only the length of the first is used.
        filetype: Given to ``reptools.select_filetypes`` to obtain the
            accepted extensions.
        overwrite: Allow replacing an existing ``csvfile``.
        basename: Use each directory's final path component as its column
            title instead of the path as given.

    Raises:
        IOError: If ``csvfile`` exists and ``overwrite`` is false.
    """
    import csv  # function-scope import: the file-level import block is not visible here

    if not overwrite and os.path.exists(csvfile):
        raise IOError('Target file already exists. To overwrite, set overwrite=True.')

    filetypes = reptools.select_filetypes(filetype)

    def matching_files(directory):
        """Return full paths of the files in *directory* with an accepted extension."""
        return [
            os.path.join(directory, fn) for fn in os.listdir(directory)
            if os.path.splitext(fn)[1] in filetypes
        ]

    def count_records(path):
        """Count sequences in *path*; empty files count as 0 without being parsed."""
        if os.path.getsize(path) == 0:
            return 0
        if os.path.splitext(path)[1] in ('.fastq', '.fq'):
            return reptools.fastqcounter(path)
        return reptools.fascounter(path)

    counts = {}
    for directory in paireddirs:
        suffix_len = len(pairsuffixes[0])
        per_root = counts[directory] = {}
        for path in matching_files(directory):
            stem = os.path.splitext(os.path.basename(path))[0]
            # Guard the empty-suffix case: stem[:-0] would be the empty string.
            root = stem[:-suffix_len] if suffix_len else stem
            # Accumulate per root within this directory. The earlier code
            # tested membership against the directory-level dict, so the
            # running total was reset for every file.
            # NOTE(review): this sums both mates of a pair - confirm that is
            # the figure wanted in the report.
            per_root[root] = per_root.get(root, 0) + count_records(path)

    for directory in unpaireddirs:
        per_root = counts[directory] = {}
        for path in matching_files(directory):
            root = os.path.splitext(os.path.basename(path))[0]
            per_root[root] = count_records(path)

    # list() on both sides so a tuple default can be combined with a
    # caller-supplied list.
    all_dirs = list(paireddirs) + list(unpaireddirs)
    all_roots = sorted({root for per_root in counts.values() for root in per_root})

    if basename:
        # normpath drops a trailing separator, which would otherwise give an
        # empty title.
        titles = [os.path.basename(os.path.normpath(path)) for path in all_dirs]
    else:
        titles = all_dirs

    # Text mode with newline='' as the csv module requires; lineterminator
    # keeps the plain '\n' row endings of the original format.
    with open(csvfile, 'w', newline='') as out_handle:
        writer = csv.writer(out_handle, lineterminator='\n')
        writer.writerow(['root'] + titles)
        for root in all_roots:
            writer.writerow(
                [root] + [counts[directory].get(root, 0) for directory in all_dirs]
            )
Exemplo n.º 5
0
def v081_CDR3slice_dir(
        indir,
        outdir,
        genedict,
        db_files,
        db_dir=False,
        hitsDir=False,
        store_search_out=False,
        mincols=(20, 15),
        id=(0.93, 0.93),
        strand='both',
        evalue=(0.001, 0.001),
        genes=('V', 'J'),
        locations=('VregionC104start', 'JregionF118start'),
        filetype='fastq',
        overwrite=False,
        usearchpath='usearch',
        stellarPath='stellar',
        swipePath='swipe',
        blastPath='blastn',
        makeblastdbPath='makeblastdb',
        alnout=False,  #for debugging use only
        algorithm='swipe',
        Vdb_length=30,
        threads=10,
        verbose=True,
        **kwargs):
    """Run ``reptools.test.v081_CDR3slice`` on every matching file in ``indir``.

    A shortened V database and matching gene dictionary are built with
    ``reptools.make_shorter_db`` (covering the last ``Vdb_length`` bases of V)
    and used for all files; both temporary files are removed afterwards, also
    when a slice call raises.

    Args:
        indir: Directory scanned (non-recursively) for input files.
        outdir: Output directory; each result keeps its input file name.
        genedict: Gene dictionary handed to ``reptools.make_shorter_db``.
        db_files: Database files; the first is shortened, the second is
            used as given.
        db_dir: Optional directory prepended to every entry of ``db_files``.
        hitsDir: Optional directory of existing hit files. Files whose
            extension matches the algorithm and whose stem ends with a gene
            name are passed on as ``hitFiles``.
        store_search_out: ``True`` (store under ``outdir/search_reports``),
            a path, or falsy (do not store).
        filetype: Given to ``reptools.select_filetypes`` to obtain the
            accepted extensions.
        overwrite: When true, ``outdir`` (and ``alnout``, if set) are
            removed before being recreated.
        alnout: Optional directory for ``.aln`` files (debugging only).
        algorithm: One of ``swipe``, ``stellar``, ``blast``, ``local`` or
            ``ublast`` (case-insensitive); selects the hit-file extension.
        Vdb_length: Forwarded to ``reptools.make_shorter_db``.
        mincols, id, strand, evalue, genes, locations, usearchpath,
        stellarPath, swipePath, blastPath, makeblastdbPath, threads,
        verbose, **kwargs: Forwarded unchanged to
            ``reptools.test.v081_CDR3slice``.

    Raises:
        ValueError: On an unknown ``algorithm``, or when ``hitsDir`` is set
            and the number of hit files found differs from ``len(genes)``.
    """
    import os
    import reptools
    import reptools.test

    filetypes = reptools.select_filetypes(filetype)
    typefiles = [
        fn for fn in os.listdir(indir) if os.path.splitext(fn)[1] in filetypes
    ]
    if not typefiles:
        print('No files of specified type found.\n')
        reptools.ensure_dir(outdir)
        return

    # Validate the algorithm before anything destructive happens, so a typo
    # cannot wipe an existing output directory.
    hit_extensions = {
        'swipe': '.tsv',
        'stellar': '.gff',
        'local': '.u14',
        'ublast': '.u14',
        'blast': '.b6',
    }
    hitExt = hit_extensions.get(algorithm.lower())
    if hitExt is None:
        raise ValueError(
            'Unknown algorithm: must be "swipe","stellar","blast","local" or "ublast". '
            'local uses usearch local.')

    if overwrite:
        reptools.remove_dir(outdir)
        if alnout: reptools.remove_dir(alnout)
    reptools.cautious_mkdir(outdir)
    if alnout: reptools.cautious_mkdir(alnout)

    if store_search_out is True:
        # store_search_out can be True, False, or a path to write search
        # reports to.
        store_search_out = os.path.join(outdir, 'search_reports')

    if store_search_out:
        reptools.cautious_mkdir(store_search_out)

    if db_dir:
        db_files = [os.path.join(db_dir, fn) for fn in db_files]

    # Candidate hit files: compare the *extension* with hitExt. The earlier
    # code compared the stem, so nothing ever matched. The listing does not
    # depend on the input file, so it is done once.
    hit_candidates = []
    if hitsDir:
        hit_candidates = [
            name for name in os.listdir(hitsDir)
            if os.path.splitext(name)[1] == hitExt
        ]

    # Make the modified V database and gene dictionary, restricted to the
    # last Vdb_length bases of V.
    (temp_Vdb,
     temp_genedict) = reptools.make_shorter_db(db_files[0], genedict,
                                               Vdb_length)
    try:
        db_files = [temp_Vdb, db_files[1]]  # new db_files list

        for fn in typefiles:
            if hitsDir:
                # One list entry per gene, in gene order; start from an empty
                # list so the unfiltered candidates are not included.
                # NOTE(review): hit files are not matched to the current
                # input file; the naming convention is not visible here, so
                # a hitsDir holding hits for several inputs will fail the
                # length check below.
                hitFiles = [
                    os.path.join(hitsDir, name)
                    for gene in genes
                    for name in hit_candidates
                    if os.path.splitext(name)[0].endswith(gene)
                ]
                if len(hitFiles) != len(genes):
                    raise ValueError('Missing hits file for %s' % fn)
            else:
                hitFiles = False
            infile = os.path.join(indir, fn)
            outfile = os.path.join(outdir, fn)
            alnoutfile = False
            if alnout:
                alnoutfile = os.path.join(alnout,
                                          os.path.splitext(fn)[0] + '.aln')
            reptools.test.v081_CDR3slice(infile,
                                         outfile,
                                         db_files=db_files,
                                         genedict=temp_genedict,
                                         hitFiles=hitFiles,
                                         store_search_out=store_search_out,
                                         mincols=mincols,
                                         id=id,
                                         strand=strand,
                                         evalue=evalue,
                                         genes=genes,
                                         locations=locations,
                                         usearchpath=usearchpath,
                                         stellarPath=stellarPath,
                                         swipePath=swipePath,
                                         blastPath=blastPath,
                                         makeblastdbPath=makeblastdbPath,
                                         alnout=alnoutfile,
                                         algorithm=algorithm,
                                         threads=threads,
                                         verbose=verbose,
                                         **kwargs)
    finally:
        # Remove both temporary files even if a slice call failed, and
        # attempt the second removal even if the first one raises.
        try:
            os.remove(temp_Vdb)
        finally:
            os.remove(temp_genedict)