def loadOligoFeaturesAndReadCounts(oligo_id, sample_names):
    """Load generated-indel features for one oligo, optionally joined with read fractions.

    The features file and reads file are located under FEATURES_DIR and
    READS_DIR, in the sub-directory reported by getFileForOligoIdx for this
    oligo.

    Args:
        oligo_id: Oligo identifier; used to look up the oligo index and to
            build the two file names.
        sample_names: Column names in the reads file whose counts are summed
            per indel. When empty, no reads file is read.

    Returns:
        When ``sample_names`` is empty: the feature table returned by
        readFeaturesData with an added 'Indel' column copied from its index.
        Otherwise: the inner join of the feature table (on its index) with
        the reads table (on its 'Indel' column), carrying an 'Indel' and a
        'Frac Sample Reads' column from the reads side.

    Raises:
        Exception: if the summed reads over the retained rows equal zero.
    """
    subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id), ext='')

    features_suffix = '/%s_gen_indel_features.txt' % oligo_id
    reads_suffix = '/%s_gen_indel_reads.txt' % oligo_id
    features_path = FEATURES_DIR + '/' + subdir + features_suffix
    reads_path = READS_DIR + '/' + subdir + reads_suffix

    # The cut site is fetched as in the established flow, although its value
    # is not used further in this function.
    cut_site = getCutSite(features_path)
    features, _feature_cols = readFeaturesData(features_path)

    # No samples requested: hand back the features alone, exposing the index
    # as an explicit 'Indel' column.
    if len(sample_names) == 0:
        features['Indel'] = features.index
        return features

    # The first line of the reads file is skipped before the header row.
    reads = pd.read_csv(reads_path, skiprows=1, sep='\t')

    # Per-indel total across the requested samples, plus a 0.5 pseudocount.
    reads['Sum Sample Reads'] = reads[sample_names].sum(axis=1) + 0.5

    # The 'All Mutated' row is excluded before normalising.
    reads = reads.loc[reads['Indel'] != 'All Mutated']
    mutated_total = reads['Sum Sample Reads'].sum()
    if mutated_total == 0:
        raise Exception('No Mutated Reads in %s' % reads_path)
    reads['Frac Sample Reads'] = reads['Sum Sample Reads'] / mutated_total

    return pd.merge(
        features,
        reads[['Indel', 'Frac Sample Reads']],
        left_index=True,
        right_on='Indel',
        how='inner',
    )
def predictMutations(theta_file, target_seq, pam_idx, add_null=True):
    """Predict the mutational profile for a target sequence using stored model thetas.

    Steps: read the model from ``theta_file``; run the external
    INDELGENTARGET_EXE program to generate candidate indels into a temporary
    file; compute features for those indels into a second temporary file;
    check the computed features cover the model's feature columns; compute
    the predicted profile and the in-frame percentage.

    Both temporary files are created in the current working directory and are
    removed before returning, including when an intermediate step raises.

    Args:
        theta_file: Path passed to readTheta to load the model parameters and
            their feature column names.
        target_seq: Target sequence; passed to the indel generator and the
            feature calculation, and used to build the null ('-') read.
        pam_idx: PAM index passed to the indel generator; ``pam_idx - 3`` is
            passed to the feature calculation (presumably the cut-site
            position -- confirm against calculateFeaturesForGenIndelFile).
        add_null: When true, add a '-' entry to the predicted profile (value
            1000) and to the representative reads (the target sequence from
            ``left_trim`` onwards).

    Returns:
        Tuple ``(p_predict, rep_reads, in_frame_perc)``. ``in_frame_perc`` is
        computed before the '-' entry is added.

    Raises:
        Exception: if any model feature column is missing from the computed
            features.
        subprocess.CalledProcessError: if the indel generator exits non-zero.
    """
    theta, _, theta_feature_columns = readTheta(theta_file)

    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (target_seq, random.randint(0, 100000))
    tmp_features_file = 'tmp_features_%s_%d.txt' % (target_seq, random.randint(0, 100000))

    left_trim = 0
    try:
        # Generate indels with the external executable.
        cmd = INDELGENTARGET_EXE + ' %s %d %s' % (target_seq, pam_idx, tmp_genindels_file)
        print(cmd)
        subprocess.check_call(cmd.split())
        rep_reads = fetchRepReads(tmp_genindels_file)

        # Pick the indel with the smallest size (ties broken by indel string)
        # and locate the start of its representative read in the target.
        if len(rep_reads) > 0:
            isize, smallest_indel = min((tokFullIndel(x)[1], x) for x in rep_reads)
        else:
            isize, smallest_indel = 0, '-'
        if isize > 0:
            # str.find returns -1 when the prefix is absent; slicing from -1
            # would reduce the null read to the last character, so fall back
            # to no trimming in that case.
            left_trim = max(target_seq.find(rep_reads[smallest_indel][:10]), 0)

        # Compute features for all generated indels.
        calculateFeaturesForGenIndelFile(tmp_genindels_file, target_seq, pam_idx - 3, tmp_features_file)
        feature_data, feature_columns = readFeaturesData(tmp_features_file)
    finally:
        # Always clean up whatever temporary files were actually created.
        for tmp_file in (tmp_genindels_file, tmp_features_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    # When the computed features are not exactly the model's columns,
    # restrict the table to 'Indel' plus the model's columns.
    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]

    # Predict the profile.
    p_predict, _ = computePredictedProfile(feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    # NOTE(review): if both counts can be zero (e.g. no indels generated) this
    # division fails or yields nan depending on the numeric type -- confirm
    # the desired behaviour with callers.
    in_frame_perc = in_frame * 100.0 / (in_frame + out_frame)

    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc