Example #1
def wrapper(args):
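    # Parse noise parameters like "[0.2,1.0]" into a list of strings;
    # fall back to an empty list if noiseparam is missing or cannot be parsed.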
    
    try:
        npar = args.noiseparam.strip('[').strip(']').split(',')
    except:
        npar = []
    nbins = args.nbins
    # Run function
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i,delim_whitespace=True,
            dtype={'seqs':str,'batch':int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin,delim_whitespace=True,
            dtype={'seqs':str,'batch':int})
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError('Library already sorted!')
    model_df = io.load_model(args.model)
    output_df = main(
        df,model_df,args.noisemodel,npar,
        nbins,start=args.start,end=args.end)
    
    if args.out:
        outloc = open(args.out,'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth',int(1e8))

    # Validate dataframe for writing
    output_df = qc.validate_dataset(output_df,fix=True)
    io.write(output_df,outloc)
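The attribute names this wrapper reads (i, out, model, noisemodel, noiseparam, nbins, start, end) suggest how it is driven from the command line. The sketch below builds the equivalent argparse.Namespace by hand and is illustrative only: the paths, the noise-model name, and the parameter values are assumptions, not part of the library.

import argparse

# Hand-built Namespace carrying the attributes wrapper() reads above.
# In the real tool these come from an argparse parser; every value here
# is a hypothetical placeholder.
args = argparse.Namespace(
    i='library.txt',            # unsorted library file (assumed path)
    out='sorted_dataset.txt',   # output file (assumed path)
    model='model.txt',          # model file for io.load_model (assumed path)
    noisemodel='LogNormal',     # assumed noise-model name
    noiseparam='[0.2]',         # parsed above into ['0.2']
    nbins=4,
    start=0,
    end=None)
wrapper(args)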
Example #2
def wrapper(args):

    #validate some of the input arguments
    qc.validate_input_arguments_for_learn_model(
        foreground=args.foreground,background=args.background,alpha=args.penalty,
        modeltype=args.modeltype,learningmethod=args.learningmethod,
        start=args.start,end=args.end,iteration=args.iteration,
        burnin=args.burnin,thin=args.thin,pseudocounts=args.pseudocounts,)

    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    input_df = io.load_dataset(inloc)
    
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    #pdb.set_trace()

    output_df = main(input_df,lm=args.learningmethod,\
        modeltype=args.modeltype,db=args.db_filename,\
        LS_means_std=args.LS_means_std,\
        iteration=args.iteration,\
        burnin=args.burnin,thin=args.thin,start=args.start,end=args.end,\
        runnum=args.runnum,initialize=args.initialize,\
        foreground=args.foreground,background=args.background,\
        alpha=args.penalty,pseudocounts=args.pseudocounts,
        verbose=args.verbose)

    io.write(output_df,outloc)
Example #3
def wrapper(args):
    """ Commandline wrapper for main()
    """ 
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    input_df = io.load_dataset(inloc)
    output_df = main(input_df,bin=args.bin,start=args.start,end=args.end)
    io.write(output_df,outloc)
Example #4
def wrapper(args):
    """ Commandline wrapper for main()
    """  
    output_df = main(wtseq=args.wtseq, mutrate=args.mutrate,\
        numseq=args.numseqs,dicttype=args.type,tags=args.tags,\
        tag_length=args.tag_length)
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    io.write(output_df,outloc)
Example #5
def wrapper(args):
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    dataset_df = io.load_dataset(inloc)
    model_df = io.load_model(args.model)
    output_df = main(dataset_df=dataset_df, model_df=model_df,\
        left=args.left, right=args.right)
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    io.write(output_df,outloc,fast=args.fast)
Example #6
def wrapper(args):
    """ Commandline wrapper for main()
    """
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    input_df = io.load_dataset(inloc)
    output_df = main(input_df, start=args.start,end=args.end,\
        err=args.err, method=args.method, pseudocount=args.pseudocount)
    io.write(output_df,outloc)
Example #7
    def test_profile_ct_bincounts(self):
        """ Test the ability of sortseq_tools.profile_ct to count frequencies
        """

        print '\nIn test_profile_ct_bincounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=good_bin_num)
            print '(bin=%d)'%good_bin_num,

            # If bad or library, then profile_ct.main should raise SortSeqError
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype,',
                except:
                    print 'good (ERROR).'
                    raise

            # If good, then profile_ct.main should produce a valid df
            elif ('_good' in file_name) or ('dataset' in file_name):
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_bin_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good,',

                except:
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)'%bad_bin_num,
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
Example #8
    def test_preprocess(self):
        """ Test the ability of sortseq_tools.preprocess to collate data in multiple sequence files
        """

        print '\nIn test_preprocess...'
        file_names = glob.glob(self.input_dir+'files_*.txt')

        # Make sure there are files to test
        self.assertTrue(len(file_names)>0)

        for file_name in file_names:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]

            # If fasta or fastq, assume dna
            if ('fasta' in file_name) or ('fastq' in file_name):
                seq_type = 'dna'
            else:
                seq_type = None

            executable = lambda: preprocess.main(io.load_filelist(file_name),indir=self.input_dir, seq_type=seq_type)

            # If _good_, then preprocess.main should produce a valid df
            if ('_good' in file_name) or ('_fix' in file_name):
                try:
                    df = executable()
                    qc.validate_dataset(df)
                    out_file = self.output_dir+'dataset_%s.txt'%description
                    io.write(df,out_file)       # Test write
                    io.load_dataset(out_file)   # Test loading
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If _bad, then preprocess.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #9
def wrapper(args):
    """ Wrapper for functions io.load_* and io.write
    """  

    # Determine input and output
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout

    try:
        # Get load function corresponding to file type
        func = filetype_to_loadfunc_dict[str(args.type)]

        # Run load function on input
        df = func(inloc)

        # Write df to stdout or to outfile 
        io.write(df,outloc,fast=args.fast)

    except SortSeqError:
        raise
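filetype_to_loadfunc_dict is defined elsewhere in the module that contains this wrapper. A plausible reconstruction, using only loaders that appear in these examples (the key spellings are assumptions), would be:

# Hypothetical reconstruction of the mapping used above; the real
# definition lives alongside wrapper(), and the keys are assumed.
filetype_to_loadfunc_dict = {
    'filelist':     io.load_filelist,
    'tagkey':       io.load_tagkey,
    'model':        io.load_model,
    'dataset':      io.load_dataset,
    'profile_ct':   io.load_profile_ct,
    'profile_info': io.load_profile_info,
}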
Example #10
def wrapper(args):
    """ Commandline wrapper for main()
    """  

    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    
    # Get filelist
    filelist_df = io.load_filelist(inloc)
    inloc.close()

    # Get tagkeys dataframe if provided
    if args.tagkeys:
        tagloc = io.validate_file_for_reading(args.tagkeys) 
        tags_df = io.load_tagkey(tagloc)
        tagloc.close()
    else:
        tags_df = None
    
    output_df = main(filelist_df,tags_df=tags_df,seq_type=args.seqtype)
    io.write(output_df,outloc,fast=args.fast)
Example #11
    def generic_test(self,test_name,function_str,file_names,allbad=False):
        """ 
        Standardizes tests for different dataframe loading functions.
        The argument function_str must have "%s" where file_name goes. 
        Example:
        generic_test('test_io_load_tagkey','io.load_tagkey("%s")',file_names)
        """
        print '\nIn %s...'%test_name   

        # Make sure there are files to test
        self.assertTrue(len(file_names)>0)

        # For each file, run test
        for file_name in file_names:
            executable = lambda: eval(function_str%file_name)
            print '\t%s ='%file_name,
            if not allbad and any([c in file_name for c in \
                ['_good','_fix','_badio','_badtype']]):
                try:
                    df = executable()
                    self.assertTrue(df.shape[0]>=1)
                    # Write df
                    base_filename = file_name.split('/')[-1]
                    io.write(df,self.output_dir+'loaded_'+base_filename)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            elif allbad or ('_bad' in file_name):
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'bad.'
                except:
                    print 'good (ERROR).'
                    raise
            else:
                print 'what should I expect? (ERROR)'
                raise SortSeqError('Unrecognized class of file_name.')
        print '\tDone.'
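As the docstring notes, the caller passes a format string with %s where the file name goes. A typical call from another test method in the same class might look like this (the glob pattern is an assumed example):

# Hypothetical usage from within the same TestCase class.
file_names = glob.glob(self.input_dir + 'tagkey_*.txt')
self.generic_test('test_io_load_tagkey', 'io.load_tagkey("%s")', file_names)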
Example #12
    def test_profile_info(self):
        """ Test the ability of sortseq_tools.profile_info to compute mutation rates based on total count values
        """

        print '\nIn test_profile_info...'
        file_names = glob.glob(self.input_dir+'dataset_*.txt')
        for err in [True,False]:
            for file_name in file_names:
                print '\t%s, err=%s ='%(file_name,str(err)),
                description = file_name.split('_')[-1].split('.')[0]
                executable = lambda: \
                    profile_info.main(io.load_dataset(file_name),err=err)

                # If good, then profile_info.main should produce a valid df
                if '_good' in file_name:
                    try:
                        df = executable()
                        qc.validate_profile_info(df)
                        out_file = self.output_dir+\
                            'profile_info_%s_err_%s.txt'%(description,str(err))
                        io.write(df,out_file)
                        io.load_profile_info(out_file)
                        print 'good.'
                    except:
                        print 'bad (ERROR).'
                        raise

                # If bad, then profile_info.main should raise SortSeqError
                elif '_bad' in file_name:
                    try:
                        self.assertRaises(SortSeqError,executable)
                        print 'badtype.'
                    except:
                        print 'good (ERROR).'
                        raise

                # There are no other options
                else:
                    raise SortSeqError('Unrecognized class of file_name.')
Example #13
    def test_profile_ct_totalcounts(self):
        """ Test the ability of sortseq_tools.profile_ct to count frequencies based on total count values
        """

        print '\nIn test_profile_ct_totalcounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: profile_ct.main(io.load_dataset(file_name))

            # If good, then profile_ct.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_total_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #14
    def test_profile_ct_seqslicing(self):
        """ Test the ability of sortseq_tools.profile_ct to slice sequences properly, and to raise the correct errors
        """

        print '\nIn test_profile_ct_seqslicing...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable_good1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2,end=10)
            executable_good2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2)
            executable_good3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=2)
            executable_nopro =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=50,end=60)
            executable_bad1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=-1)
            executable_bad2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=100)
            executable_bad3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=20,end=10)

            # If good, then sequences will be valid
            if 'good' in file_name:
                try:
                    df = executable_good1()
                    io.write(df,self.output_dir+\
                        'profile_ct_splice2-10_%s.txt'%description)
                    executable_good2()
                    executable_good3()
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    if '_pro' in file_name:
                        self.assertRaises(SortSeqError,executable_nopro)
                    else:
                        df = executable_nopro()
                    print 'ok.'
                except:
                    print 'ok (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable_good1)
                    self.assertRaises(SortSeqError,executable_good2)
                    self.assertRaises(SortSeqError,executable_good3)
                    self.assertRaises(SortSeqError,executable_nopro)
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #15
def wrapper(args):
    """ Wrapper for function for scan_model.main()
    """

    # Prepare input to main
    model_df = io.load_model(args.model)
    seqtype, modeltype = qc.get_model_type(model_df)
    L = model_df.shape[0]
    if modeltype=='NBR':
        L += 1 
    
    chunksize = args.chunksize
    if not chunksize>0:
        raise SortSeqError(\
            'chunksize=%d must be positive'%chunksize)

    if args.numsites <= 0:
        raise SortSeqError('numsites=%d must be positive.'%args.numsites)

    if args.i and args.seq:
        raise SortSeqError('Cannot use flags -i and -s simultaneously.')

    # If sequence is provided manually
    if args.seq:
        pos_offset=0
        contig_str = args.seq

        # Add a bit on end if circular
        if args.circular:
            contig_str += contig_str[:L-1] 

        contig_list = [(contig_str,'manual',pos_offset)]

    # Otherwise, read sequence from FASTA file
    else:
        contig_list = []
        inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
        for i,record in enumerate(SeqIO.parse(inloc,'fasta')):
            name = record.name if record.name else 'contig_%d'%i

            # Split contig up into chunksize bits
            full_contig_str = str(record.seq)

            # Add a bit on end if circular
            if args.circular:
                full_contig_str += full_contig_str[:L-1] 

            # Define chunks containing chunksize sites
            start = 0
            end = start+chunksize+L-1
            while end < len(full_contig_str):
                contig_str = full_contig_str[start:end]
                contig_list.append((contig_str,name,start))
                start += chunksize
                end = start+chunksize+L-1
            contig_str = full_contig_str[start:]
            contig_list.append((contig_str,name,start))

        if len(contig_list)==0:
            raise SortSeqError('No input sequences to read.')

    # Compute results
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    output_df = main(model_df,contig_list,numsites=args.numsites,\
        verbose=args.verbose)

    # Write df to stdout or to outfile 
    io.write(output_df,outloc,fast=args.fast)
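The chunking arithmetic above (end = start + chunksize + L - 1) gives every chunk exactly chunksize window-start positions for a site of length L, so consecutive chunks overlap by L-1 bases and no window straddling a chunk boundary is lost. A standalone sketch of the same loop with toy values, independent of the module, illustrates the effect:

def chunk_contig(full_contig_str, chunksize, L):
    # Reproduce the slicing logic above: each chunk carries chunksize
    # window-start positions for windows of length L, with an L-1 overlap
    # between consecutive chunks; the final chunk holds the remainder.
    chunks = []
    start = 0
    end = start + chunksize + L - 1
    while end < len(full_contig_str):
        chunks.append((full_contig_str[start:end], start))
        start += chunksize
        end = start + chunksize + L - 1
    chunks.append((full_contig_str[start:], start))
    return chunks

# Toy example: 10-base sequence, 4 window starts per chunk, site length 3.
print chunk_contig('AACCGGTTAC', chunksize=4, L=3)
# -> [('AACCGG', 0), ('GGTTAC', 4)]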