Example #1
def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename = 'wmat.csv'):
    #consensus_pairing_csv_file has elements: 
    #uei index, beacon-umi index, target-umi index, read-count
    #if outfilename == None, does not print data to new files
    
    [bcn_dict,trg_dict,
     bcn_abund_dict,trg_abund_dict,
     bcn_div_dict,trg_div_dict] = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)       
    if len(trg_dict)==0 or len(bcn_dict)==0:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    sysOps.throw_status(['Generating feature list.',sysOps.statuslogfilename])
    trg_feature_dict_list = get_features_from_dict(trg_dict) #collects salient pieces of information on targets for printing in file later
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filter_mats(bcn_dict, trg_dict,
                                                                   bcn_div_dict, trg_div_dict, min_uei_count)

    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).',sysOps.statuslogfilename])
    del bcn_dict
    sysOps.throw_status(['Generating weight matrix.',sysOps.statuslogfilename])
    
    if len(trg_dict)==0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    if outfilename != None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)
    
    return trg_dict
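For orientation, a minimal sketch of the input format described in the comments above; the file name and index values are hypothetical, and the commented-out call assumes the surrounding matOps/sysOps module context.

# Hypothetical illustration of the consensus-pairing CSV consumed by generate_wmat():
# columns are uei index, beacon-umi index, target-umi index, read-count.
toy_rows = [(0, 5, 12, 4),   # UEI 0 links beacon-UMI 5 to target-UMI 12 with 4 reads
            (1, 5, 30, 2),
            (2, 7, 12, 3)]
with open('toy_consensus_pairing.csv', 'w') as f:
    for row in toy_rows:
        f.write(','.join(str(x) for x in row) + '\n')

# With the module context available, the weight matrix could then be built via, e.g.:
# trg_dict = generate_wmat('toy_consensus_pairing.csv', minreadcount=2, min_uei_count=2)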
Example #2
def define_nuc_degeneracy(c1):
    c1 = c1.upper()
    if (c1 in 'ACGTU'):
        return [c1]
    elif (c1 == 'N'):
        return ['A', 'C', 'G', 'T']
    elif (c1 == 'W'):
        return ['A', 'T']
    elif (c1 == 'S'):
        return ['C', 'G']
    elif (c1 == 'M'):
        return ['A', 'C']
    elif (c1 == 'K'):
        return ['G', 'T']
    elif (c1 == 'R'):
        return ['A', 'G']
    elif (c1 == 'Y'):
        return ['C', 'T']
    elif (c1 == 'B'):
        return ['C', 'G', 'T']
    elif (c1 == 'D'):
        return ['A', 'G', 'T']
    elif (c1 == 'H'):
        return ['A', 'C', 'T']
    elif (c1 == 'V'):
        return ['A', 'C', 'G']
    else:
        sysOps.throw_exception([
            'Error: ' + c1 +
            ' does not code for a single or degenerate nucleotide'
        ])
        sysOps.exitProgram()
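A small usage sketch, assuming define_nuc_degeneracy() is importable from this module: expand a degenerate IUPAC pattern into every concrete sequence it encodes.

import itertools

def expand_degenerate(pattern):
    # one list of allowed bases per position, then the Cartesian product
    per_base = [define_nuc_degeneracy(c) for c in pattern]
    return [''.join(bases) for bases in itertools.product(*per_base)]

# expand_degenerate('ASN') -> ['ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT']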
Example #3
    def generate_uxi_library(self):
        # Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs, sub-sampling data for rarefaction analyses)
        
        if not sysOps.check_file_exists('uxi_lib_tasklist.csv'):
            # create task list for library processing
            [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'uxi_lib_tasklist.csv','w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')
                            
        original_datapath = str(sysOps.globaldatapath)
        [my_task,time_start] = parallelOps.get_next_open_task('tasklog.csv', 'uxi_lib_tasklist.csv', 'generate_uxi_library')
        if not (my_task is None):

            sysOps.initiate_runpath(str(my_task[1]))
            myLibObj = libOps.libObj(settingsfilename = 'libsettings.txt', output_prefix = '_')
            if not sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt'):
                myLibObj.partition_fastq_library(discarded_sequence_path = "discarded_sequences.fastq", mean_phred_score_path = "mean_phred_scores.txt")
            self.generate_cluster_analysis()
                
            libOps.subsample(myLibObj.seqform_for_params,myLibObj.seqform_rev_params, myLibObj.output_prefix)
            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list([subdirname for subdirname in subdirnames if subdirname.startswith('sub')])
            sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
            for dirname in dirnames:
                sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
                self.generate_cluster_analysis()
        
            sysOps.globaldatapath = str(original_datapath)   
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
                sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
                sysOps.exitProgram()
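A short sketch of the task-list line format written above and how a claimed task splits into its fields; the path is hypothetical, and parallelOps.get_next_open_task() is assumed to hand the fields back in this split form.

task_line = 'generate_uxi_library;my_data//lib1//'   # hypothetical uxi_lib_tasklist.csv entry
my_task = task_line.strip('\n').split(';')           # -> ['generate_uxi_library', 'my_data//lib1//']
task_name, task_path = my_task[0], my_task[1]        # task_path is what initiate_runpath() receives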
Example #4
def threshold_cluster_uxi_prelinked(uxi_list,identical_uxi_filename,threshold,P=0,subsample = -1, prefix = ''):
    
    # Function will be called while loading linkage_file into uxi_list through load_linkage_file_to_list(linkage_file) in hashAlignments.py
    # Format of linkage file:
    #    uxi-sequence, self-read-number, RND: list of linked-to indices with self-index as first in line
    # linkage_list elements: [uxi-sequence,self-read-number,RND,[list of linked-to indices with self-index as first in line]])
            
    #sort uxi_list by decreasing RND
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2]) #note: sorted() returns a new list, but its rows are the same objects referenced by uxi_list
    index_vals = [-1 for i in range(num_uxi)]
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')
        
    for sorted_uxi_el in sorted_uxi_list: 
        #index_vals, indexed by _original_ position in the pre-sorted uxi_list (that position is stored as the first entry of row[3]), is initiated at -1
        #uxi's accepted into the cluster seeded by index i are given value i in index_vals
        #uxi's never assigned to any cluster retain index -1 (flagged as an error below)
        if index_vals[sorted_uxi_el[3][0]] < 0: #if this seed has index -1 (has not been assigned to any seed itself)
            index_vals[sorted_uxi_el[3][0]] = int(sorted_uxi_el[3][0]) # set cluster seed to itself
            
        my_index_val = int(index_vals[sorted_uxi_el[3][0]])
        
        for i in range(1,len(sorted_uxi_el[3])):
            if index_vals[sorted_uxi_el[3][i]] < 0: #connected read is unassigned -- assign to current cluster seed
                index_vals[sorted_uxi_el[3][i]] = my_index_val

    sysOps.throw_status('Consolidating clustered uxis ...')
    #consolidate clustered uxi's
    
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()
        
    index_str_vals = [str(int(x)) for x in index_vals]
    new_uxi_dict= dict()
    
    for i in range(num_uxi):
        my_index_str = index_str_vals[i] 
        if my_index_str in new_uxi_dict:
            new_uxi_dict[my_index_str].append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))
        else:
            new_uxi_dict[my_index_str] = [(uxi_list[i][0] + "_" + str(uxi_list[i][1]))]
            
    if(subsample<=0):
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_" + identical_uxi_filename,'w')
    else:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_sub" + str(subsample) + identical_uxi_filename,'w')
    
    i = 0
    for dict_el in new_uxi_dict:
        for el in new_uxi_dict[dict_el]:
            new_uxi_handle.write(str(i) + "_" + el + "\n")     
        i += 1   
        
    new_uxi_handle.close()
    
    print "Completed clustering."
    
    return True
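A minimal, self-contained sketch of the greedy seed assignment performed above, on a toy linkage list; sequences, read counts, and RND values are hypothetical.

# rows are [uxi-sequence, read-number, RND, [self-index, linked indices...]]
toy_uxi_list = [
    ['AAAA', 10, 0.9, [0, 1]],   # highest RND: becomes its own seed and absorbs index 1
    ['AAAT',  3, 0.4, [1, 0]],
    ['CCCC',  8, 0.7, [2]],      # unlinked: becomes its own seed
]
index_vals = [-1] * len(toy_uxi_list)
for row in sorted(toy_uxi_list, key=lambda r: -r[2]):
    if index_vals[row[3][0]] < 0:
        index_vals[row[3][0]] = row[3][0]   # self becomes cluster seed
    seed = index_vals[row[3][0]]
    for linked in row[3][1:]:
        if index_vals[linked] < 0:
            index_vals[linked] = seed
# index_vals -> [0, 0, 2]: uxis 0 and 1 cluster together, uxi 2 stands alone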
Example #5
    def generate_cluster_analysis(self):
        # Perform clustering analysis of UMI and UEI sequences, consolidate pairings and determine consensuses of these pairings
        
        sysOps.initiate_statusfilename()
        missing_uxi_files = sysOps.find_missing_uxi_files('libsettings.txt', '_')
        if len(missing_uxi_files)>0:
            sysOps.throw_exception('Missing uxi files: ' + str(missing_uxi_files))
        
        if(sysOps.check_file_exists('_for_uxi0.fasta')):
            sysOps.throw_status("Clustering for_uxi0")
            clustering_up_to_date_1 = hashAlignments.initiate_hash_alignment('_for_uxi0.fasta')
        else:
            clustering_up_to_date_1 = True
            sysOps.throw_status(sysOps.globaldatapath + '_for_uxi0.fasta does not exist. Skipping.')
        
        if(sysOps.check_file_exists('_for_uxi1.fasta')):
            sysOps.throw_status("Clustering for_uxi1")
            clustering_up_to_date_2 = hashAlignments.initiate_hash_alignment('_for_uxi1.fasta')
        else:
            clustering_up_to_date_2 = True
            sysOps.throw_status(sysOps.globaldatapath + '_for_uxi1.fasta does not exist. Skipping.')
            
        if(sysOps.check_file_exists('_rev_uxi0.fasta')):
            sysOps.throw_status("Clustering rev_uxi0")
            clustering_up_to_date_3 = hashAlignments.initiate_hash_alignment('_rev_uxi0.fasta')
        else:
            clustering_up_to_date_3 = True
            sysOps.throw_status(sysOps.globaldatapath + '_rev_uxi0.fasta does not exist. Skipping.')
            
        if (clustering_up_to_date_1 and clustering_up_to_date_2 and clustering_up_to_date_3):
            
            filter_val = 0.75 #maximum fraction of same-base permitted in a single UMI/UEI
            min_pairing_readcount = 2
            sysOps.throw_status('Clustering completed. Beginning final output.')
            
            if (sysOps.check_file_exists('thresh1_identical__for_uxi0.fasta') and sysOps.check_file_exists('thresh1_identical__for_uxi1.fasta') and sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta') 
                and not (sysOps.check_file_exists('consensus_pairing_filter' + str(filter_val) + '_uei_umi.csv'))):
                if not sysOps.check_file_exists("pairing_filter" + str(filter_val) + "_uei_umi.csv"):
                    dnamicOps.assign_umi_pairs('thresh1_identical__for_uxi1.fasta','thresh1_identical__for_uxi0.fasta','thresh1_identical__rev_uxi0.fasta', 
                                                 '_for_uxi1.fasta' , '_for_uxi0.fasta', '_rev_uxi0.fasta', 
                                                 'pairing',filter_val,False) # final parameter = False: excluding invalid amplicon sequences

                dnamicOps.assign_consensus_pairs("pairing_filter" + str(filter_val) + "_uei_umi.csv",min_pairing_readcount)
            else:
                sysOps.throw_status('Consensus-pairing file found pre-computed.')
                
            if (sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta') and not sysOps.check_file_exists('trg_amplicon_calls.csv')):
                #assign amplicon-identities to target umi's
                sysOps.throw_status('Assigning amplicon-identities and consensus sequences to target umis.')
                dnamicOps.assign_umi_amplicons('thresh1_identical__rev_uxi0.fasta','_rev_uxi0.fasta','_amp_match.txt', '_rev_amp0.fasta', 'trg_amplicon_calls.csv') 
Example #6
def get_next_uxi_file_entry(handle):

    header = handle.readline()
    if len(header) == 0:
        return [[], []]

    header = header.strip('\n').split('_')
    if len(header) != 3:
        sysOps.throw_exception(
            'Error in get_next_uxi_file_entry(): new line = ' +
            '_'.join(header))
    id_list = list()

    for i in range(int(header[2])):
        id_list.append(handle.readline().strip('\n'))

    return [header, id_list]
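A small usage sketch, assuming get_next_uxi_file_entry() is importable from this module: write a toy uxi file whose header lines carry three '_'-separated fields (the last being the number of read-ID lines that follow), then iterate entries until the empty header signals end-of-file. The field contents are hypothetical.

with open('toy_uxi_file.txt', 'w') as handle:
    handle.write('0_AACGT_2\n')     # hypothetical header: index, uxi-sequence, 2 read-IDs follow
    handle.write('read-001\n')
    handle.write('read-002\n')
    handle.write('1_TTGCA_1\n')
    handle.write('read-003\n')

with open('toy_uxi_file.txt') as handle:
    while True:
        [header, id_list] = get_next_uxi_file_entry(handle)
        if len(header) == 0:
            break
        print(header[1] + ' -> ' + str(len(id_list)) + ' read-IDs')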
Example #7
def group_uxi_reads(uxi_clust_file, uxi_list_file):
    #takes in clustered uxi file and identically-matched uxi file, generates a look-up of reads based on the identically-matched file
    uxi_dict = load_uxi_dict(uxi_list_file)
    uxi_clust_handle = open(sysOps.globaldatapath +uxi_clust_file,'rU')
    read_id_grouping = []
    uxis_and_readnums = [] #list of uxi-sequences and corresponding read-numbers, indexed as a list of lists with one-to-one correspondence to clusters
    for uxi_clust_line in uxi_clust_handle:
        [clust_index, my_uxi, read_num] = uxi_clust_line.strip('\n').split("_")
        clust_index = int(clust_index)
        if clust_index >= len(read_id_grouping):
            read_id_grouping.append([])
            uxis_and_readnums.append([])
        
        if not (my_uxi in uxi_dict):
            print "Error: could not find  " + my_uxi
            sysOps.throw_exception("Could not find  " + my_uxi)
            
        read_id_grouping[clust_index].extend(uxi_dict[my_uxi]) 
        
    uxi_clust_handle.close()
    
    return read_id_grouping
Example #8
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    
    # prune UEI data to exclude UMIs with UEI counts < min_uei_count
    
    if len(bcn_dict) == 0:
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
    
    deletion_iteration = 0
    is_list = None
    
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')
    
    while True:
        
        bcn_retained = 0
        trg_retained = 0
        bcn_deleted = list()
        trg_deleted = list()
        
        for bcn_el in bcn_div_dict:
            if bcn_div_dict[bcn_el]<min_uei_count:
                bcn_deleted.append(bcn_el)
            else:
                bcn_retained += 1
                
        for trg_el in trg_div_dict:
            if trg_div_dict[trg_el]<min_uei_count:
                trg_deleted.append(trg_el)
            else:
                trg_retained += 1
        
        #check if bcn_dict and trg_dict are still list or already converted to values
        if is_list is None:
            for bcn_el in bcn_dict:
                for trg_el in bcn_dict[bcn_el]:
                    is_list = (type(bcn_dict[bcn_el][trg_el]) is list)
                    break
                break
            
        if len(bcn_deleted)==0 and len(trg_deleted)==0:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break
            
        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted)) + ', retained ' + str(bcn_retained) + '+' + str(trg_retained) + '. is_list=' + str(is_list))
        
        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()
            
        for bcn_el in bcn_deleted:
            for trg_el in bcn_dict[bcn_el]:
                if is_list:
                    trg_div_dict[trg_el] -= len(trg_dict[trg_el][bcn_el])
                else:
                    trg_div_dict[trg_el] -= trg_dict[trg_el][bcn_el]
                del trg_dict[trg_el][bcn_el]
                
            del bcn_dict[bcn_el]
            del bcn_div_dict[bcn_el]
            
        for trg_el in trg_deleted:
            for bcn_el in trg_dict[trg_el]:
                if bcn_el in bcn_div_dict: #if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn_el] -= len(bcn_dict[bcn_el][trg_el])
                    else:
                        bcn_div_dict[bcn_el] -= bcn_dict[bcn_el][trg_el]
                    del bcn_dict[bcn_el][trg_el]
                
            del trg_dict[trg_el]
            del trg_div_dict[trg_el]
                        
        deletion_iteration += 1
    
    #check for consistency
    for bcn_el in bcn_dict:
        for trg_el in bcn_dict[bcn_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
                
    for trg_el in trg_dict:
        for bcn_el in trg_dict[trg_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
               
    
    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
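A toy illustration of the reciprocal dictionary structure filter_mats() consumes (before per-pair UEI lists are collapsed to scalar counts); the UMI keys and UEI identifiers are hypothetical.

# bcn_dict[bcn][trg] and trg_dict[trg][bcn] mirror each other; *_div_dict holds each UMI's total UEI count
bcn_dict = {'b0': {'t0': [101, 102], 't1': [103]}}          # beacon b0 has UEIs to targets t0 and t1
trg_dict = {'t0': {'b0': [101, 102]}, 't1': {'b0': [103]}}
bcn_div_dict = {'b0': 3}
trg_div_dict = {'t0': 2, 't1': 1}
# With min_uei_count = 2, target t1 is pruned and beacon b0's count drops to 2; in module context:
# filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, 2)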
Example #9
def print_final_results(trgcalls_filename, trgseq_filename):

    #output final_*.csv containing columns: index, x, y, ..., segment
    #output final_feat_*.csv containing columns: index, target-amplicon match (-1 for beacon),
    #                                            leading-match read-tally, total read-tally, consensus sequence ('N' for beacon)
    #
    [dirnames, filenames] = sysOps.get_directory_and_file_list()
    seq_dat_filename = [
        filename for filename in filenames if filename.startswith('seq_params')
    ]
    seq_dat_filename = seq_dat_filename[0][len('seq_params_'):]

    for result_dat_file in filenames:
        if (result_dat_file.startswith('Xumi_') and
                not (sysOps.check_file_exists('final_' + result_dat_file))):
            key_dat_file = 'key' + seq_dat_filename[
                (seq_dat_filename.find('_')):]
            if sysOps.check_file_exists(key_dat_file):
                coords_dict = dict()
                sysOps.throw_status('Generating final output for ' +
                                    sysOps.globaldatapath +
                                    str(result_dat_file))
                result = np.loadtxt(sysOps.globaldatapath + result_dat_file,
                                    delimiter=',')
                for i in range(result.shape[0]):
                    coords_dict[str(int(result[i, 0]))] = ','.join(
                        [str(x) for x in result[i, 1:]])

                trg_match_dict = dict()
                trg_match_file = open(
                    sysOps.globaldatapath + trgcalls_filename, 'rU')
                trg_seq_file = open(sysOps.globaldatapath + trgseq_filename,
                                    'rU')

                for line, fasta_record in itertools.izip(
                        trg_match_file, SeqIO.parse(trg_seq_file, "fasta")):
                    [trg_umi_index, max_match, max_tally,
                     tot_tally] = line.strip('\n').split(',')
                    trg_match_dict[trg_umi_index] = [
                        str(max_match),
                        str(max_tally),
                        str(tot_tally),
                        str(fasta_record.seq)
                    ]

                trg_match_file.close()
                trg_seq_file.close()

                outfile = open(
                    sysOps.globaldatapath + '//final_' + result_dat_file, 'w')
                outfile_feat = open(
                    sysOps.globaldatapath + '//final_feat_' + result_dat_file,
                    'w')

                bcn_excluded = 0
                trg_excluded = 0
                with open(sysOps.globaldatapath + key_dat_file,
                          'rU') as key_file:
                    for line in key_file:
                        [bcn0trg1, orig_index,
                         mle_index] = line.strip('\n').split(',')
                        #key file columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                        if mle_index in coords_dict:
                            outfile.write(orig_index + ',' +
                                          coords_dict[mle_index] + '\n')
                            if bcn0trg1 == '0':
                                outfile_feat.write(orig_index +
                                                   ',-1,-1,-1,N\n')
                            else:
                                outfile_feat.write(
                                    orig_index + ',' +
                                    ','.join(trg_match_dict[orig_index]) +
                                    '\n')
                        else:
                            if bcn0trg1 == '0':
                                bcn_excluded += 1
                            else:
                                trg_excluded += 1
                sysOps.throw_status(
                    str(bcn_excluded) + ' beacons, ' + str(trg_excluded) +
                    ' targets excluded from final estimation')
                outfile.close()
                outfile_feat.close()

            else:
                sysOps.throw_exception(sysOps.globaldatapath + key_dat_file +
                                       ' does not exist.')
    return
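A self-contained sketch of the key/coordinate join performed above: the key file maps each original cluster-index to the MLE processing index used in the Xumi_* coordinate file. Indices and coordinates here are hypothetical.

coords_dict = {'0': '1.5,-2.0', '1': '0.3,0.9'}    # MLE processing index -> 'x,y'
key_rows = [('0', '17', '0'),                      # (bcn0trg1, orig_index, mle_index)
            ('1', '42', '1'),
            ('1', '43', '5')]                      # MLE index 5 absent -> excluded from output
for bcn0trg1, orig_index, mle_index in key_rows:
    if mle_index in coords_dict:
        print(orig_index + ',' + coords_dict[mle_index])
# prints '17,1.5,-2.0' and '42,0.3,0.9'; the third row would be tallied as excluded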
Example #10
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Assumes CSV file with columns:
    1. UEI cluster-index
    2. Beacon UMI cluster-index
    3. Target UMI cluster-index
    4. Read-number
    5. Set-index
    '''

    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()

    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  #UEI cluster-index
            if index_str in uei_clust_index_dict:
                uei_clust_index_dict[index_str].append(
                    [int(row[1]),
                     int(row[2]),
                     int(row[3]),
                     int(row[4])]
                )  #append dictionary entry as list with row having indices of beacon- and target-umi clusters, the read-number, and the set-index (will all be 0 if invalid-amplicon reads are excluded)
            else:
                uei_clust_index_dict[index_str] = [[
                    int(row[1]),
                    int(row[2]),
                    int(row[3]),
                    int(row[4])
                ]]

    #replace each entry with umi pairing having plurality of reads, in same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  #detect ties, discard if tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if (row[2] >= min_pairing_readcount and row[2] > maxcount):
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif (row[2] >= min_pairing_readcount and row[2] > secondmaxcount):
                secondmaxcount = int(row[2])

        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires not only that the UEI have at least min_pairing_readcount reads,
            # but also that the plurality tally reach min_pairing_readcount and strictly exceed the runner-up
            uei_clust_index_dict[uei_clust_el] = list([
                int(maxcount_pair_bcn_index),
                int(maxcount_pair_trg_index),
                int(maxcount),
                int(maxcount_set_index)
            ])
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1

    sysOps.throw_status('Outputting consensus-pairs with at least ' +
                        str(min_pairing_readcount) +
                        ' read-plurality. Accepted ' + str(accepted_ueis) +
                        ' UEIs, discarded ' + str(discarded_ueis) +
                        ' UEIs ...')
    #index outputted as uei-index, beacon-umi-index, target-umi-index, read-count
    outfile_handle = open(
        sysOps.globaldatapath + "consensus_" + str(min_pairing_readcount) +
        "r_" + pairing_csv_file, 'w')

    for uei_clust_el in uei_clust_index_dict:
        if len(uei_clust_index_dict[uei_clust_el]) > 0:
            outfile_handle.write(
                uei_clust_el + "," +
                ",".join([str(s)
                          for s in uei_clust_index_dict[uei_clust_el]]) + "\n")

    outfile_handle.close()

    return
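A self-contained sketch of the per-UEI plurality rule applied above: keep the candidate (beacon, target) pairing whose read count reaches min_pairing_readcount and strictly exceeds every other candidate, and discard ties. The helper name and values are hypothetical.

def consensus_pair(rows, min_pairing_readcount=2):
    # rows: [beacon-umi index, target-umi index, read-count, set-index]
    rows = sorted(rows, key=lambda r: -r[2])
    best = rows[0]
    runner_up = rows[1][2] if len(rows) > 1 else 0
    if best[2] >= min_pairing_readcount and best[2] > runner_up:
        return best
    return None   # discarded: tie, or too few reads

print(consensus_pair([[5, 12, 4, 0], [5, 30, 1, 0]]))   # -> [5, 12, 4, 0]
print(consensus_pair([[5, 12, 2, 0], [5, 30, 2, 0]]))   # -> None (tie)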
Example #11
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    #function will tally reads counted for each target umi across each amplicon-call, and return a csv file with the following columns:
    #(target umi cluster-index),(leading amplicon-call),(reads for leading amplicon-call),(total reads counted)

    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    #outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath +
                            amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except:
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will be performed directly from sequence consensuses.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()
    trg_amp_seq_dict = dict()

    for trg_umi_record, amp_seq_record in itertools.izip(
            SeqIO.parse(trg_umi_handle, "fasta"),
            SeqIO.parse(amp_seq_handle, "fasta")):

        if not realign_amplicons:
            amp_match = int(amp_match_handle.readline().strip('\n'))
        else:
            amp_match = -1

        trg_umi_seq = str(trg_umi_record.seq)
        if trg_umi_seq in trg_umi_cluster_dict:
            trg_umi_index = str(
                trg_umi_cluster_dict[trg_umi_seq][0])  #uxi cluster-index
            if trg_umi_index in trg_umi_dict:
                if amp_match in trg_umi_dict[trg_umi_index]:
                    trg_umi_dict[trg_umi_index][
                        amp_match] += 1  #add 1, because every read is being entered
                else:
                    trg_umi_dict[trg_umi_index][amp_match] = 1
            else:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()
                trg_umi_dict[trg_umi_index][amp_match] = 1

            trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq),
                                                       1)

    trg_umi_handle.close()
    amp_seq_handle.close()
    if not realign_amplicons:
        amp_match_handle.close()

    csvfile = open(sysOps.globaldatapath + outfilename, 'w')
    fastafile = open(
        sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] +
        '.fasta', 'w')
    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
        trg_umi_index_dict = dict()

    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    for trg_umi_index in trg_umi_dict:
        max_tally = 0
        tot_tally = 0

        for amp_match in trg_umi_dict[trg_umi_index]:

            my_tally = trg_umi_dict[trg_umi_index][amp_match]

            if my_tally >= max_tally:
                max_tally = int(my_tally)
                max_match = int(amp_match)

            tot_tally += int(my_tally)

        consensus_seq = str(
            trg_amp_seq_dict[trg_umi_index].get_str_consensus())

        if realign_amplicons:
            # perform direct, un-gapped alignment of consensus_seq to reference options to obtain max_match
            max_match = -1
            max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
            min_mismatch_count = -1
            for i in range(len(ref_sequences)):
                all_subamplicons_pass = True
                start_index = 0
                tot_mismatches = 0
                for j in range(len(ref_sequences[i])
                               ):  # loop through sub-amplicon-sequences
                    ref_subamplicon_len = len(ref_sequences[i][j])
                    my_mismatches, minlen = alignOps.count_mismatches(
                        ref_sequences[i][j],
                        consensus_seq[start_index:(start_index +
                                                   ref_subamplicon_len)])
                    if minlen == 0:
                        all_subamplicons_pass = False
                        break
                    all_subamplicons_pass = all_subamplicons_pass and (
                        my_mismatches / float(minlen) <= max_mismatch_amplicon)
                    start_index += ref_subamplicon_len
                    tot_mismatches += my_mismatches
                if all_subamplicons_pass and (
                        max_match < 0 or tot_mismatches < min_mismatch_count):
                    # keep the reference with the fewest total mismatches
                    max_match = int(i)
                    min_mismatch_count = int(tot_mismatches)

        if max_match >= 0:
            csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                          str(max_tally) + "," + str(tot_tally) + "\n")
            fastafile.write(">" + trg_umi_index + '\n')
            fastafile.write(consensus_seq + '\n')
            if realign_amplicons:
                trg_umi_index_dict[trg_umi_index] = True
            accepted_consensus_sequences += 1
        else:
            inadmis_consensus_sequences += 1

    csvfile.close()
    fastafile.close()
    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences +
                            inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(
                sysOps.globaldatapath + consensus_filename,
                sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename,
                      'w') as new_consensus_file:
                with open(
                        sysOps.globaldatapath + 'unfiltered_' +
                        consensus_filename, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        consensus_list = old_consensus_file_line.strip(
                            '\n'
                        ).split(
                            ','
                        )  # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
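The realignment branch above relies on alignOps.count_mismatches(), whose implementation is not shown here; the stand-in below is only a sketch consistent with how its return values (mismatch count, length of the shorter sequence) are used by the caller.

def count_mismatches_sketch(ref_seq, query_seq):
    minlen = min(len(ref_seq), len(query_seq))
    mismatches = sum(1 for a, b in zip(ref_seq[:minlen], query_seq[:minlen]) if a != b)
    return mismatches, minlen

# count_mismatches_sketch('ACGT', 'ACTT') -> (1, 4); an empty query gives minlen == 0,
# which the caller above treats as a failed sub-amplicon alignment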
Example #12
    def dnamic_inference(self,
                         smle_infer=False,
                         msmle_infer=False,
                         segment_infer=False,
                         compute_local_solutions_only=True):
        # Perform image inference on the basis of raw output of DNA microscopy sequence analysis

        # Basic settings
        read_thresh = 2
        min_uei_count = 2
        output_dim = 2
        version = 1.0
        infer_dir = ''

        # raw data files
        consensus_pairing_csv_file = "..//consensus_" + str(
            read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
        outname = 'minuei' + str(min_uei_count) + 'DMv' + str(
            version) + '_' + str(read_thresh) + 'r_filter0.75'
        wmat_outfilename = 'wmat_' + outname + '.csv'
        param_name = 'minuei' + str(min_uei_count) + 'dim' + str(
            output_dim) + 'DMv' + str(version) + '_.csv'
        imagemodule_input_filename = 'data_' + param_name
        key_filename = 'key_' + param_name
        if not sysOps.check_file_exists('microscopy_tasklist.csv'):
            [subdirnames, filenames
             ] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'microscopy_tasklist.csv',
                      'w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('infer_smle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_msmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_segment;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_ptmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')

        original_datapath = str(sysOps.globaldatapath)
        if smle_infer:
            infer_dir = 'infer_smle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_smle')
        elif msmle_infer:
            infer_dir = 'infer_msmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_msmle')
        elif segment_infer:
            infer_dir = 'infer_segment//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_segment')
        else:
            infer_dir = 'infer_ptmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_ptmle')

        if not (my_task is None):

            sysOps.initiate_runpath(str(my_task[1]))

            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list(["."])
            subdirnames_nodatayet = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    not sysOps.check_file_exists(subdirname + '//' +
                                                 imagemodule_input_filename))
            ]
            subdirnames_nodatayet = [
                subdirnames_nodatayet[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_nodatayet
                ]))
            ]  # sort by descending read count
            subdirnames_dataalready = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    sysOps.check_file_exists(subdirname + '//' +
                                             imagemodule_input_filename))
            ]
            subdirnames_dataalready = [
                subdirnames_dataalready[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_dataalready
                ]))
            ]  # sort by descending read count
            dirnames.extend(subdirnames_nodatayet)
            dirnames.extend(subdirnames_dataalready)
            sysOps.throw_status('Checking directories ' +
                                sysOps.globaldatapath + ' ... ' +
                                str(dirnames) + ' for infer-subdirectories.')
            for dirname in dirnames:  # make inference directories
                try:
                    with open(
                            sysOps.globaldatapath + dirname + '//' +
                            infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                        tmpfile.write('test')
                    os.remove(sysOps.globaldatapath + dirname + '//' +
                              infer_dir + 'tmpfile.txt')
                    sysOps.throw_status('Directory ' + sysOps.globaldatapath +
                                        dirname + '//' + infer_dir +
                                        ' found already created.')
                except:
                    os.mkdir(sysOps.globaldatapath + dirname + '//' +
                             infer_dir)
                    sysOps.throw_status('Created directory ' +
                                        sysOps.globaldatapath + dirname +
                                        '//' + infer_dir)

            for dirname in dirnames:
                sysOps.initiate_runpath(
                    str(my_task[1]) + dirname + '//' + infer_dir)
                sysOps.initiate_statusfilename()
                sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)

                if not (sysOps.check_file_exists(key_filename) and
                        sysOps.check_file_exists(imagemodule_input_filename)
                        and sysOps.check_file_exists(
                            'read_' + imagemodule_input_filename) and
                        sysOps.check_file_exists('seq_params_' +
                                                 imagemodule_input_filename)):

                    sysOps.throw_status('Calling matOps.generate_wmat()')

                    trg_dict = matOps.generate_wmat(consensus_pairing_csv_file,
                                                    read_thresh, min_uei_count,
                                                    wmat_outfilename)
                    sysOps.throw_status('Completed matOps.generate_wmat()')
                    matOps.print_imagemodule_input(trg_dict,
                                                   imagemodule_input_filename,
                                                   key_filename, output_dim)
                    #print_imagemodule_input outputs
                    #    1. File key_filename containing 3 columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                    #    2. imagemodule_input_filename containing 4 columns: MLE processing index for beacon, MLE processing index for target, uei-count, max UEI read count
                    #    3. Summary file containing: Number of beacons inputted to MLE, number of targets inputted to MLE,
                else:
                    sysOps.throw_status(
                        'Image-module input pre-computed. Proceeding ...')

                #optimOps.test_ffgt()

                if sysOps.check_file_exists(imagemodule_input_filename):
                    if segment_infer:
                        optimOps.run_mle(
                            imagemodule_input_filename,
                            False,
                            False,
                            True,
                            compute_local_solutions_only,
                        )  # segmentation only
                    elif msmle_infer:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         True, False,
                                         compute_local_solutions_only)  # msMLE
                    elif smle_infer:
                        optimOps.run_mle(imagemodule_input_filename, True,
                                         False, False,
                                         compute_local_solutions_only)  # sMLE
                    else:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         False, False,
                                         compute_local_solutions_only)  # ptMLE

                    if not compute_local_solutions_only:
                        dnamicOps.print_final_results(
                            '..//trg_amplicon_calls.csv',
                            '..//trg_amplicon_calls.fasta')
                    else:
                        sysOps.exitProgram()
                else:
                    sysOps.throw_status('Could not locate ' +
                                        sysOps.globaldatapath +
                                        imagemodule_input_filename)

            sysOps.globaldatapath = str(original_datapath)
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task),
                                          time_start):
                sysOps.throw_exception('Task ' + str(my_task) +
                                       ' no longer exists in log ' +
                                       sysOps.globaldatapath + 'tasklog.csv' +
                                       ' -- exiting.')
                sysOps.exitProgram()

        return
Example #13
    def sim_reads(self):
        simLibObj = libOps.libObj(settingsfilename='libsettings.txt',
                                  output_prefix='_')
        enforced_rev_read_len = 100
        [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(
            simLibObj.filter_amplicon_window)
        rev_read_len = int(enforced_rev_read_len)
        '''
        simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in current object's memory
        Form of these variables is a list of the following:
            Element 1: [start_pos,end_pos]
            Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_)
            Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_)
            Element 4: np.ndarray(ambig_vec, dtype=np.bool_)
        '''
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()

        for_umi_seqs = list()
        rev_umi_seqs = list()
        rev_umi_amplicon_list = list()
        uei_seqs = list()
        base_order = 'ACGT'

        sysOps.throw_status('Generating simulated sequences ...')
        amplicon_list = list()
        if "-amplicon" in simLibObj.mySettings:
            amplicon_list = [
                simLibObj.mySettings["-amplicon"][i].upper().split(',')
                for i in range(len(simLibObj.mySettings["-amplicon"]))
            ]

        for for_umi_i in range(self.Nbcn):
            for_param_index = np.random.randint(
                len(simLibObj.seqform_for_params))
            if len(simLibObj.seqform_for_params[for_param_index]) > 1:
                sysOps.throw_exception(
                    'Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                    + str(len(simLibObj.seqform_for_params[for_param_index])))
                sysOps.exitProgram()
            my_for_umi_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_for_umi_param[0]
            seq_bool_vec = my_for_umi_param[1]
            my_for_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            for_umi_seqs.append([int(for_param_index), str(my_for_umi)])

        for for_uei_i in range(self.Nuei):
            for_param_index = 0  # there should be no difference across UMI's
            my_for_uei_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][1]
            [start_pos, end_pos] = my_for_uei_param[0]
            seq_bool_vec = my_for_uei_param[1]
            my_for_uei = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_uei += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            uei_seqs.append(str(my_for_uei))

        for rev_umi_i in range(self.Ntrg):
            rev_param_index = np.random.randint(
                len(simLibObj.seqform_rev_params))
            my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_rev_umi_param[0]
            seq_bool_vec = my_rev_umi_param[1]
            my_rev_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_rev_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            if len(amplicon_list) == 0:
                encoded_amplicon = str('')
            else:
                this_gsp_primer_amplicon_pair = list(
                    amplicon_list[np.random.randint(len(amplicon_list))]
                )  # already properly oriented
                # generate single error on amplicon
                lenamp = len(this_gsp_primer_amplicon_pair[1])
                rand_loc = np.random.randint(lenamp)
                this_gsp_primer_amplicon_pair[1] = str(
                    this_gsp_primer_amplicon_pair[1][:rand_loc] +
                    base_order[np.random.randint(4)] +
                    this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
                encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)

            tmp_umi_index = float(rev_umi_i)

            if tmp_umi_index == 0:
                encoded_amplicon += base_order[0]
            else:
                for myexponent in range(
                        int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1,
                        -1):
                    mydigit = np.floor(tmp_umi_index /
                                       np.power(4.0, myexponent))
                    encoded_amplicon += base_order[int(mydigit)]
                    tmp_umi_index -= mydigit * np.power(4.0, myexponent)

            rev_umi_seqs.append(
                [int(rev_param_index),
                 str(my_rev_umi),
                 str(encoded_amplicon)])

        sysOps.throw_status('Writing simulated reads ...')

        for filename in filenames:
            if filename.endswith('_sim_ueifile.csv'):
                ueifile = np.int64(
                    np.loadtxt(sysOps.globaldatapath + filename,
                               delimiter=','))
                newdirname = filename[:filename.find('_')]
                read_list = list()
                for i in range(ueifile.shape[0]):
                    for myread in range(ueifile[i, 3]):
                        read_list.append(np.array([ueifile[i, :3]]))
                read_list = np.concatenate(
                    read_list, axis=0
                )  # re-write array so that there is now one row per read
                # randomly permute:
                read_list = read_list[
                    np.random.permutation(read_list.shape[0]), :]

                for_chararray = np.chararray((for_read_len))
                rev_chararray = np.chararray((rev_read_len))
                for_fastq_outfile = open(newdirname + '_for.fastq', "w")
                rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
                for i in range(read_list.shape[0]):
                    for_param_index = for_umi_seqs[read_list[i, 1]][0]
                    for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                    rev_param_index = rev_umi_seqs[read_list[i, 2]][
                        0]  # both beacon and target indices are, at this point, independently indexed from 0
                    rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                    rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                    uei_seq = uei_seqs[read_list[i, 0]]

                    for j in range(for_read_len):
                        for_chararray[j] = 'N'
                    for j in range(rev_read_len):
                        rev_chararray[j] = 'N'

                    my_for_umi_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_for_umi_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = for_umi_seq[j]

                    my_for_uei_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][1]
                    [start_pos, end_pos] = my_for_uei_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = uei_seq[j]

                    for my_for_param in simLibObj.seqform_for_params[
                            for_param_index][0]['P']:
                        [start_pos, end_pos] = my_for_param[0]
                        for j in range(end_pos - start_pos):
                            for_chararray[j + start_pos] = base_order[np.where(
                                my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]

                    my_rev_umi_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_rev_umi_param[0]
                    for j in range(end_pos - start_pos):
                        rev_chararray[j + start_pos] = rev_umi_seq[j]
                    my_rev_amp_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['A'][0]
                    start_pos = my_rev_amp_param[0][0]
                    for j in range(len(rev_amp_seq)):
                        rev_chararray[j + start_pos] = rev_amp_seq[j]

                    if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                        for my_rev_param in simLibObj.seqform_rev_params[
                                rev_param_index][0]['P']:
                            [start_pos, end_pos] = my_rev_param[0]
                            for j in range(end_pos - start_pos):
                                rev_chararray[j +
                                              start_pos] = base_order[np.where(
                                                  my_rev_param[1][(4 * j):(
                                                      4 * (j + 1))])[0][0]]

                    for_record = SeqIO.SeqRecord(
                        Seq.Seq(for_chararray.tostring()))
                    for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                    for_record.description = ''
                    for_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(for_read_len)])
                    rev_record = SeqIO.SeqRecord(
                        Seq.Seq(rev_chararray.tostring()))
                    rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                    rev_record.description = ''
                    rev_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(rev_read_len)])
                    SeqIO.write(for_record, for_fastq_outfile, "fastq")
                    SeqIO.write(rev_record, rev_fastq_outfile, "fastq")

                for_fastq_outfile.close()
                rev_fastq_outfile.close()
                os.mkdir(newdirname)
                with open('libsettings.txt', 'rU') as oldsettingsfile:
                    with open(newdirname + '//libsettings.txt',
                              'w') as newsettingsfile:
                        for oldsettings_row in oldsettingsfile:
                            if oldsettings_row.startswith('-source_for'):
                                newsettingsfile.write('-source_for ..//' +
                                                      newdirname +
                                                      '_for.fastq\n')
                            elif oldsettings_row.startswith('-source_rev'):
                                newsettingsfile.write('-source_rev ..//' +
                                                      newdirname +
                                                      '_rev.fastq\n')
                            else:
                                newsettingsfile.write(oldsettings_row)

        sysOps.throw_status('Done.')
        return
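A self-contained sketch of the base-4 'ACGT' encoding of the target-UMI index that sim_reads() appends to each simulated amplicon (most-significant digit first), together with the matching decode; the helper names are ours, not part of the module.

base_order = 'ACGT'

def encode_umi_index(index):
    if index == 0:
        return base_order[0]
    digits = ''
    while index > 0:
        digits = base_order[index % 4] + digits
        index //= 4
    return digits

def decode_umi_index(encoded):
    index = 0
    for c in encoded:
        index = index * 4 + base_order.index(c)
    return index

# encode_umi_index(9) -> 'GC' (9 = 2*4 + 1); decode_umi_index('GC') -> 9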
Example #14
def compare(clustfile1,
            clustfile2,
            comparison_file_name,
            rev_comp,
            read_thresh=2,
            filter_substr_list=[],
            filter_val=0.75):
    #rev_comp = True/False depending on need of reverse-complement being taken
    #filter_val = maximum fraction of bases in uxi allowed to be the same

    #all filtering of legitimate comparison occurs here, at the front end
    print "Beginning comparison between " + clustfile1 + " and " + clustfile2

    #Stage 1 of comparison: determine total read-abundance of clusters in clustfile1 and clustfile2,
    #assign to abund_dict1 and abund_dict2

    abund_dict1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict1:
                    abund_dict1[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict1[uxi_index]['reads'] += my_numreads

    abund_dict2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict2:
                    abund_dict2[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict2[uxi_index]['reads'] += my_numreads

    #Stage 2 of comparison: enter actual uxi sequences into dict_clust1 and dict_clust2,
    #storing each sequence's cluster-index, read-count, and shared-flag

    dict_clust1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])
                this_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]
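                # keep only clusters that pass the read threshold, contain no
                # disallowed substring, and are not low-complexity (the most
                # frequent base may account for at most filter_val of the uxi)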
                if abund_dict1[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust1[this_uxi] = [
                        uxi_index, my_numreads, False
                    ]  #final entry corresponds to being shared

    print "Completed first cluster-file input. Reading second cluster-file; writing output to cross_comparisons//" + comparison_file_name

    comparison_handle = open(
        sysOps.globaldatapath + 'cross_comparisons//' + comparison_file_name,
        'w')

    dict_clust2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])  #references clustfile2
                #my_uxi references clustfile2 uxi sequences
                #this_uxi references clustfile1 uxi sequences
                my_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                this_uxi = str(my_uxi)
                if (rev_comp):
                    this_uxi = str(Seq.Seq(this_uxi).reverse_complement())
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]

                if abund_dict2[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust2[my_uxi] = [uxi_index, my_numreads, False]
                    if this_uxi in dict_clust1:
                        dict_clust1[this_uxi][2] = True
                        dict_clust2[my_uxi][2] = True
                        if str(dict_clust1[this_uxi][0]) not in abund_dict1:
                            sysOps.throw_exception(
                                'A: ' + str(dict_clust1[this_uxi][0]) +
                                ' not in abund_dict1')
                            sysOps.exitProgram()
                        if str(uxi_index) not in abund_dict2:
                            sysOps.throw_exception('B: ' + str(uxi_index) +
                                                   ' not in abund_dict2')
                            sysOps.exitProgram()

                        abund_dict1[str(
                            dict_clust1[this_uxi][0])]['is_shared'] = True
                        abund_dict2[str(uxi_index)]['is_shared'] = True

                        comparison_handle.write(
                            str(this_uxi) + "," +
                            str(dict_clust1[this_uxi][0]) + "," +
                            str(dict_clust1[this_uxi][1]) + "," +
                            str(abund_dict1[str(dict_clust1[this_uxi][0])]
                                ['reads']) + "," +
                            str(dict_clust2[my_uxi][0]) + "," +
                            str(dict_clust2[my_uxi][1]) + "," +
                            str(abund_dict2[str(dict_clust2[my_uxi][0])]
                                ['reads']) + "\n")

    comparison_handle.close()

    #count number unique shared and unique unshared
    num_unique_shared = [0, 0]
    num_unique_unshared = [0, 0]
    read_abundance_shared = [0, 0]
    read_abundance_unshared = [0, 0]

    for uxi_index1 in abund_dict1:
        if abund_dict1[uxi_index1]['is_shared']:
            num_unique_shared[0] += 1
            read_abundance_shared[0] += abund_dict1[uxi_index1]['reads']
        else:
            num_unique_unshared[0] += 1
            read_abundance_unshared[0] += abund_dict1[uxi_index1]['reads']

    for uxi_index2 in abund_dict2:
        if abund_dict2[uxi_index2]['is_shared']:
            num_unique_shared[1] += 1
            read_abundance_shared[1] += abund_dict2[uxi_index2]['reads']
        else:
            num_unique_unshared[1] += 1
            read_abundance_unshared[1] += abund_dict2[uxi_index2]['reads']

    return [
        num_unique_shared, num_unique_unshared, read_abundance_shared,
        read_abundance_unshared
    ]
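A minimal usage sketch of compare() follows (directory and file names are hypothetical; paths are taken relative to sysOps.globaldatapath, and the cross_comparisons directory must already exist, as it does when the function is called from crosscomparison_analysis):

import alignOps

# compare clusters of the same uxi file across two library directories,
# without reverse-complementing, requiring at least 2 reads per cluster
[num_shared, num_unshared, reads_shared, reads_unshared] = alignOps.compare(
    'lib_A//thresh1_identical__for_uxi0.fasta',  # hypothetical cluster file 1
    'lib_B//thresh1_identical__for_uxi0.fasta',  # hypothetical cluster file 2
    'libA_libB_comparison.csv',                  # written under cross_comparisons//
    False)                                       # rev_comp
# num_shared[0]/num_shared[1] count unique clusters of file 1/file 2 that were
# matched in the other file; reads_shared/reads_unshared give the corresponding
# total read abundances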
Exemplo n.º 15
0
def gather_rarefaction_data(conditions_filename = 'conditions.csv', outfilename = 'rarefaction_file.txt', raw_uxi_files = ['_for_uxi0.fasta','_for_uxi1.fasta','_rev_uxi0.fasta']):
    
    #use conditions file conditions_filename to specify output order
    dirnames = list()
    with open(sysOps.globaldatapath + conditions_filename, 'rU') as conditions_handle:
        for myline in conditions_handle:
            thisline = myline.strip('\n').split(',')
            dirnames.append('lib_' + str(thisline[0]) + '_' + str(thisline[1]) + '_' + str(thisline[2]))
    
    outfile_1r = open(sysOps.globaldatapath +'1r_' + outfilename,'w')
    outfile_2r = open(sysOps.globaldatapath +'2r_' + outfilename,'w')
    outfile_3r = open(sysOps.globaldatapath +'3r_' + outfilename,'w')
    
    for dir in dirnames:
        print 'Gathering rarefaction data for directory ' + sysOps.globaldatapath + dir
        sum_reads_raw = 0
        with open(sysOps.globaldatapath +dir + '/' + raw_uxi_files[0],'rU') as uxi_file_handle:
            #first UMI/UEI file in list to count raw reads
            for uxi_record in SeqIO.parse(uxi_file_handle,'fasta'):
                sum_reads_raw += 1
        
        subsample = 500
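        # the sweep doubles the subsample size each pass (500, 1000, 2000, ...);
        # when the subsampled cluster files run out, it falls back to the full
        # (unsubsampled) files for a final pass (any error also ends the sweep)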
        terminate = False
        while not terminate:
            all_diversities = []
            try:
                for my_raw_uxi_file in raw_uxi_files:
                    try:
                        cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_sub' + str(subsample) + my_raw_uxi_file,'rU')
                        consensus_pairing_csv_file = dir + '/consensus_2r_sub' + str(subsample) + 'pairing_filter0.75_uei_umi.csv'
                    except:
                        terminate = True
                        try:
                            cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_' + my_raw_uxi_file,'rU')
                            consensus_pairing_csv_file = dir + '/consensus_2r_pairing_filter0.75_uei_umi.csv'
                        except:
                            sysOps.throw_exception('Directory ' + sysOps.globaldatapath + dir + ' does not contain clustered file' +  sysOps.globaldatapath +dir + '/thresh1_identical_' + my_raw_uxi_file + '. Skipping ...')
                            break
                        
                        subsample = sum_reads_raw
                        
                    cluster_dict = dict()
                    for myline in cluster_file_handle:
                        thisline = myline.strip('\n').split('_')
                        if thisline[0] in cluster_dict:
                            cluster_dict[thisline[0]] += int(thisline[2])
                        else:
                            cluster_dict[thisline[0]] = int(thisline[2])
                            
                    cluster_file_handle.close()
    
                    diversity = [0,0,0] #first element is 1-read-gated diversity, second is 2-read-gated, third is 3-read-gated
                    for el in cluster_dict:
                        if cluster_dict[el]>=3:
                            diversity[0] += 1
                            diversity[1] += 1
                            diversity[2] += 1
                        elif cluster_dict[el]>=2:
                            diversity[0] += 1
                            diversity[1] += 1
                        else:
                            diversity[0] += 1
                            
                    all_diversities.append(diversity)
    
                #if sysOps.check_file_exists(consensus_pairing_csv_file):
                if False:  # temporarily disabled -- restore the commented condition above to re-enable
                    sysOps.throw_status('Found ' + sysOps.globaldatapath + consensus_pairing_csv_file + '.')
                    min_uei_count = 2  
                    min_umi_readcount = 2
                    outname = 'minb' + str(min_uei_count) + 'v' + str(0) + '_' + str(min_umi_readcount) + 'r_filter0.75'
                    wmat_outfilename = 'noabundcorr_wmat_' + outname + '.csv'
                    sysOps.throw_status('Calling matOps.generate_wmat()')
                    [num_unique_trg, num_unique_bcn, trg_dict] = matOps.generate_wmat(consensus_pairing_csv_file, min_umi_readcount, min_umi_readcount, min_uei_count, wmat_outfilename = None)
                    
                    if num_unique_bcn>0:
                        filtered_minb_diversity_2r = [num_unique_bcn, sum([trg_dict[trg_el] for trg_el in trg_dict]), num_unique_trg]
                    else:
                        filtered_minb_diversity_2r = [0,0,0]
                else:
                    sysOps.throw_status(sysOps.globaldatapath + consensus_pairing_csv_file + ' not found.')
                    filtered_minb_diversity_2r = []
                    
                outfile_1r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[0]) for my_diversity in all_diversities])]) + '\n')
                outfile_2r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[1]) for my_diversity in all_diversities]), ','.join([str(s) for s in filtered_minb_diversity_2r])]) + '\n')
                outfile_3r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[2]) for my_diversity in all_diversities])]) + '\n')                 
            
            except:
                terminate = True
                
            subsample *= 2
        
    outfile_1r.close()
    outfile_2r.close()
    outfile_3r.close()
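The read-gating above can be stated compactly; a minimal equivalent sketch (the helper name is hypothetical and not part of the pipeline):

def read_gated_diversity(cluster_read_counts, max_gate=3):
    # cluster_read_counts: dict mapping cluster index -> total reads;
    # element k of the result counts clusters with at least k+1 reads
    return [sum(1 for n in cluster_read_counts.values() if n >= gate)
            for gate in range(1, max_gate + 1)]

# example: read counts of 1, 2 and 5 give [3, 2, 1]
assert read_gated_diversity({'a': 1, 'b': 2, 'c': 5}) == [3, 2, 1]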
Exemplo n.º 16
0
    def crosscomparison_analysis(self, args):

        sysOps.initiate_statusfilename()
        list_of_dirs = list()

        file_to_compare = args[1]

        with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
            for myline in csvfile:
                thisline = myline.strip('\n').split(',')
                subdir = 'lib_' + str(thisline[0]) + '_' + str(
                    thisline[1]) + '_' + str(thisline[2])
                list_of_dirs.append(subdir)

        print "Beginning comparison analysis"
        print "File to compare = " + file_to_compare
        print "Directories = " + ",".join(list_of_dirs)

        try:
            os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
        except:
            sysOps.throw_exception(
                'cross_comparisons directory already exists. Terminating comparison analysis.'
            )
            sysOps.exitProgram()

        shared_num_unique_matrix = list()
        unshared_num_unique_matrix = list()
        shared_read_abund_matrix = list()
        unshared_read_abund_matrix = list()

        for i in range(len(list_of_dirs)):
            shared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            shared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
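        # each pairwise comparison fills two asymmetric entries: [ind1][ind2]
        # holds library ind1's counts for the (ind1, ind2) comparison, and
        # [ind2][ind1] holds library ind2's counts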

        for ind1 in range(len(list_of_dirs)):
            for ind2 in range(ind1):
                dir1 = list_of_dirs[ind1]
                dir2 = list_of_dirs[ind2]
                clustfile1 = dir1 + "//" + file_to_compare
                clustfile2 = dir2 + "//" + file_to_compare
                dir1_abbrev = dir1[(
                    dir1.rfind('/') + 1
                ):]  #strip superdirectory path -- requires that individual directory names be unique
                dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
                sysOps.throw_status('Began writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                [
                    num_unique_shared, num_unique_unshared,
                    read_abundance_shared, read_abundance_unshared
                ] = alignOps.compare(
                    clustfile1, clustfile2,
                    dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare,
                    False)
                sysOps.throw_status('Completed writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                shared_num_unique_matrix[ind1][ind2] = num_unique_shared[0]
                shared_num_unique_matrix[ind2][ind1] = num_unique_shared[1]
                unshared_num_unique_matrix[ind1][ind2] = num_unique_unshared[0]
                unshared_num_unique_matrix[ind2][ind1] = num_unique_unshared[1]
                print str(num_unique_unshared[0]) + ' -> unshared_num_unique_matrix[' + str(ind1) + '][' + str(ind2) + ']'
                shared_read_abund_matrix[ind1][ind2] = read_abundance_shared[0]
                shared_read_abund_matrix[ind2][ind1] = read_abundance_shared[1]
                unshared_read_abund_matrix[ind1][
                    ind2] = read_abundance_unshared[0]
                unshared_read_abund_matrix[ind2][
                    ind1] = read_abundance_unshared[1]

        print shared_num_unique_matrix
        print unshared_num_unique_matrix
        print shared_read_abund_matrix
        print unshared_read_abund_matrix

        with open(sysOps.globaldatapath + 'comparison_matrices.csv', 'w') as compare_matrix_file:
            for i1 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_num_unique_matrix[i1]]) + '\n')

            for i2 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_num_unique_matrix[i2]]) + '\n')

            for i3 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_read_abund_matrix[i3]]) + '\n')

            for i4 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_read_abund_matrix[i4]]) + '\n')
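A minimal invocation sketch (both file names are hypothetical): the method takes an argv-style list in which args[1] names the per-directory cluster file to compare and args[2] names the conditions CSV listing the lib_* directories, mirroring the 'compare' branch of the pipeline dispatch.

import sysOps

# assumes sysOps.globalmasterProcess has been initialized by the pipeline's entry point
sysOps.globalmasterProcess.crosscomparison_analysis(
    ['compare', 'thresh1_identical__for_uxi0.fasta', 'conditions.csv'])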
Exemplo n.º 17
0
def parse_seqform(parseable, amplicon_option=None):
    '''
    Parse input from the -seqform_for or -seqform_rev tag in the settings file.
    parseable must contain fields separated by '|' characters, each of the form X_position1:position2, where
    X is one of the following characters:
    1. P -- primer
    2. S -- spacer
    3. A -- amplicon
    4. U -- uxi
    X's may be repeated (there may be multiple primers, spacers, and amplicons).
    If a field has the form X_N_position1:position2 (with a string N between two underscores), N is a sequence against which the input is aligned and a match-score stored (N's in the case of a uxi).
    Final form of each returned my_seqform dictionary entry is:
    Character1: [[[positionA1,positionA2],filter-sequence A (="" if none given)],[[positionB1,positionB2],filter-sequence B (="" if none given)]]
    '''
    my_seqform = dict()
    parseable = parseable.split("|")
    for this_parseable in parseable:
        my_elements = this_parseable.split("_")
        try:
            if (len(my_elements) < 3):
                my_char = my_elements[0].upper()
                seq = ""
                boundaries = my_elements[1].split(":")
            else:
                my_char = my_elements[0].upper()
                seq = my_elements[1]
                boundaries = my_elements[2].split(":")

            if (len(boundaries[0]) == 0):
                boundaries = [None, int(boundaries[1])]
            elif (len(boundaries[1]) == 0):
                boundaries = [int(boundaries[0]), None]
            else:
                boundaries = [int(boundaries[0]), int(boundaries[1])]
                if (boundaries[1] - boundaries[0] != len(seq)
                        and len(my_elements) == 3):
                    sysOps.throw_exception(
                        'Error: mismatch between filter boundary-indices and filter string-size, boundaries='
                        + str(boundaries) + ", seq=" + seq)

        except:
            print "Error parsing seqform " + this_parseable
            sysOps.throw_exception(["Error parsing seqform " + this_parseable])
            sysOps.exitProgram()

        if my_char not in "PSAU":
            sysOps.throw_status([
                "Ignoring this_parseable=" + this_parseable +
                " -- unrecognized character-type."
            ])
        else:
            if my_char == "A" and type(amplicon_option) == str and type(
                    boundaries[1]) != int:
                start_pos = int(boundaries[0])
                for sub_seq in amplicon_option.split(','):
                    len_sub_seq = len(sub_seq)
                    seq_bool_vec = np.zeros(4 * len_sub_seq, dtype=np.bool_)
                    capital_bool_vec = np.zeros(4 * len_sub_seq,
                                                dtype=np.bool_)
                    ambig_vec = np.zeros(len_sub_seq, dtype=np.bool_)
                    ambig_seq_to_np(sub_seq, seq_bool_vec, capital_bool_vec,
                                    ambig_vec)
                    if my_char in my_seqform:
                        my_seqform[my_char].append(
                            [[start_pos, start_pos + len_sub_seq],
                             seq_bool_vec[:], capital_bool_vec, ambig_vec])
                    else:
                        my_seqform[my_char] = [[[
                            start_pos, start_pos + len_sub_seq
                        ], seq_bool_vec, capital_bool_vec, ambig_vec]]
                    start_pos += len_sub_seq
                # since original type(boundaries[1]) != int, re-set final boundaries[1] = None
                my_seqform[my_char][len(my_seqform[my_char]) - 1][0][1] = None
            else:
                seq_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                capital_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                ambig_vec = np.zeros(len(seq), dtype=np.bool_)
                ambig_seq_to_np(seq, seq_bool_vec, capital_bool_vec, ambig_vec)
                if my_char in my_seqform:
                    my_seqform[my_char].append([
                        boundaries, seq_bool_vec, capital_bool_vec, ambig_vec
                    ])
                else:
                    my_seqform[my_char] = [[
                        boundaries, seq_bool_vec, capital_bool_vec, ambig_vec
                    ]]

    return my_seqform
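A minimal usage sketch of parse_seqform (the seqform string below is hypothetical): a 4-base primer at positions 0-3, a 6-base uxi at positions 4-9, and an amplicon running from position 10 to the end of the read.

my_seqform = parse_seqform('P_GACT_0:4|U_NNNNNN_4:10|A_10:')
# my_seqform now has keys 'P', 'U' and 'A'; each value is a list of
# [[start, end], seq_bool_vec, capital_bool_vec, ambig_vec] entries,
# with end == None when the right boundary is left open (as in 'A_10:')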
Exemplo n.º 18
0
    elif sys.argv[0].endswith('infer'):
        compute_local_solutions_only = False
        if len(sys.argv) > 1 and sys.argv[1] == 'local':
            sysOps.throw_status('Performing local computing function alone.')
            compute_local_solutions_only = True
        if sys.argv[0] == 'smle_infer':
            sysOps.globalmasterProcess.dnamic_inference(
                True, False, False, compute_local_solutions_only)
        elif sys.argv[0] == 'msmle_infer':
            sysOps.globalmasterProcess.dnamic_inference(
                False, True, False, compute_local_solutions_only)
        elif sys.argv[0] == 'ptmle_infer':
            sysOps.globalmasterProcess.dnamic_inference(
                False, False, False, compute_local_solutions_only)
        elif sys.argv[0] == 'segment_infer':
            sysOps.globalmasterProcess.dnamic_inference(
                False, False, True, compute_local_solutions_only)
    elif sys.argv[0] == 'layout':
        upstream.generate_data_layout()
    elif (len(sys.argv) > 2 and sys.argv[0] == 'compare'):
        sysOps.globalmasterProcess.crosscomparison_analysis(sys.argv)
    elif (sys.argv[0] == 'stats'):
        summaryAnalysis.gather_rarefaction_data()
        summaryAnalysis.gather_raw_read_stats()
        summaryAnalysis.gather_stats()
        summaryAnalysis.gather_cluster_stats()
    else:
        sysOps.throw_exception('Unrecognized pipeline input: ' + str(sys.argv))

    print "Completed run."