Exemplo n.º 1
0
def loadOligoFeaturesAndReadCounts(oligo_id, sample_names):
    """Load generated-indel features for an oligo, optionally with read fractions.

    The features file and reads file are located under FEATURES_DIR and
    READS_DIR in the subdirectory reported by getFileForOligoIdx.

    Args:
        oligo_id: Identifier used to look up the oligo index and to build
            the per-oligo file names.
        sample_names: Column names in the reads file to sum over. When
            empty, no reads file is read.

    Returns:
        With sample names given: the inner join of the feature table (by
        its index) with the 'Indel' and 'Frac Sample Reads' columns of the
        reads table. Without sample names: the feature table with an
        'Indel' column copied from its index.

    Raises:
        Exception: if the summed sample reads (excluding the 'All Mutated'
            row) total zero.
    """
    subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id), ext='')

    features_path = '/'.join(
        [FEATURES_DIR, subdir, '%s_gen_indel_features.txt' % oligo_id])
    reads_path = '/'.join(
        [READS_DIR, subdir, '%s_gen_indel_reads.txt' % oligo_id])

    # The return value is not used here; the call is retained as-is.
    getCutSite(features_path)
    features, _ = readFeaturesData(features_path)

    # No samples requested: expose the index as an 'Indel' column and stop.
    if len(sample_names) == 0:
        features['Indel'] = features.index
        return features

    # Tab-separated reads table; the first line of the file is skipped.
    reads = pd.read_csv(reads_path, skiprows=1, sep='\t')

    # Per-indel total over the requested samples, offset by 0.5.
    summed = reads[sample_names].sum(axis=1)
    reads['Sum Sample Reads'] = summed + 0.5

    # Drop the aggregate 'All Mutated' row before normalising.
    keep_rows = reads['Indel'] != 'All Mutated'
    reads = reads.loc[keep_rows]

    denominator = reads['Sum Sample Reads'].sum()
    if denominator == 0:
        raise Exception('No Mutated Reads in %s' % reads_path)
    reads['Frac Sample Reads'] = reads['Sum Sample Reads'] / denominator

    read_fractions = reads[['Indel', 'Frac Sample Reads']]
    return pd.merge(features,
                    read_fractions,
                    left_index=True,
                    right_on='Indel',
                    how='inner')
Exemplo n.º 2
0
def predictMutations(theta_file, target_seq, pam_idx, add_null=True):
    """Predict the indel profile for a target sequence from stored model thetas.

    Runs the external indel generator (INDELGENTARGET_EXE) on the target,
    computes features for the generated indels, and evaluates the model
    read from ``theta_file`` on those features. Two temporary files are
    written to the current working directory and are always removed before
    returning, including when an intermediate step raises.

    Args:
        theta_file: Path passed to readTheta to load the model parameters
            and the feature column names they were stored with.
        target_seq: Target sequence; passed to the indel generator and used
            in the temporary file names.
        pam_idx: Integer PAM index passed to the indel generator;
            ``pam_idx - 3`` is passed to the feature calculation.
        add_null: When true, add a '-' entry to both the predicted profile
            (value 1000) and the representative reads (the target sequence
            from ``left_trim`` onwards).

    Returns:
        Tuple ``(p_predict, rep_reads, in_frame_perc)``: the predicted
        profile, the representative reads keyed by indel, and the in-frame
        percentage ``in_frame * 100 / (in_frame + out_frame)``.

    Raises:
        subprocess.CalledProcessError: if the indel generator exits with a
            non-zero status.
        Exception: if the model's feature names are not all present among
            the computed feature columns.
    """
    theta, _, theta_feature_columns = readTheta(theta_file)

    left_trim = 0
    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (
        target_seq, random.randint(0, 100000))
    # Named later (same point as before); None means "not created yet".
    tmp_features_file = None

    try:
        # Generate indels
        cmd = INDELGENTARGET_EXE + ' %s %d %s' % (
            target_seq, pam_idx, tmp_genindels_file)
        print(cmd)
        subprocess.check_call(cmd.split())
        rep_reads = fetchRepReads(tmp_genindels_file)

        # Pick the indel with the smallest size (ties broken by indel string).
        if len(rep_reads) > 0:
            isize, smallest_indel = min(
                (tokFullIndel(x)[1], x) for x in rep_reads)
        else:
            isize, smallest_indel = 0, '-'
        if isize > 0:
            # Locate the first 10 characters of that read within the target.
            left_trim = target_seq.find(rep_reads[smallest_indel][:10])

        # Compute features for all generated indels
        tmp_features_file = 'tmp_features_%s_%d.txt' % (
            target_seq, random.randint(0, 100000))
        calculateFeaturesForGenIndelFile(
            tmp_genindels_file, target_seq, pam_idx - 3, tmp_features_file)
        feature_data, feature_columns = readFeaturesData(tmp_features_file)
    finally:
        # Clean up temporary files even if a step above failed, so aborted
        # runs do not leave stale files in the working directory.
        for tmp_path in (tmp_genindels_file, tmp_features_file):
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    # Restrict the computed features to the model's columns when the two
    # sets of names do not coincide.
    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]
        feature_columns = theta_feature_columns

    # Predict the profile
    p_predict, _ = computePredictedProfile(
        feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    # NOTE(review): no guard for in_frame + out_frame == 0 — confirm whether
    # fetchIndelSizeCounts can return all-zero counts.
    in_frame_perc = in_frame * 100.0 / (in_frame + out_frame)
    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc