def main(infile, labelfile, outfile):
    """Annotate each gff line of `infile` with a 'mirna_label' info field.

    The label is looked up (by mirbase-id prefix, e.g. "hsa-mir-123") in the
    dictionary built from `labelfile`.  Lines without miRNA coordinates get
    'NA'; ids absent from the dictionary get 'unknown'.  Writes the
    annotated gff to `outfile` and prints its path.
    """
    label_dict = _labelfile2dict_matchMirna(labelfile)
    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                line = line.strip().split('\t')
                chrom = line[0]
                strand = line[6]
                info = line[8].strip(';')
                info = re.split('[;@]', info)
                mstart = get_value_from_keycolonvalue_list('mirna_start', info)
                mstop = get_value_from_keycolonvalue_list('mirna_stop', info)
                #mirna = ','.join([chrom,mstart,mstop,strand])
                if mstart == '' and mstop == '':
                    label = 'NA'
                else:
                    mirna = get_value_from_keycolonvalue_list('mirbase_id', info)
                    # keep only the "xxx-yyy-123" prefix of the mirbase id
                    mirna = re.match(r'^(\w*-\w*-\d*)', mirna).group(1)
                    # `in` instead of dict.has_key() (removed in Python 3)
                    if mirna in label_dict:
                        label = label_dict[mirna]
                    else:
                        label = 'unknown'
                info.append('mirna_label:' + label)
                line[8] = ';'.join(info)
                newline = '\t'.join(line)
                out.write(newline + '\n')
    # single-argument print() behaves identically under Python 2 and 3
    print(outfile)
def _cleanup_extra_positions(infile, outfile):
    """Drop lines whose PROX and CORR sections disagree on miRNA position.

    `infile` lines carry '@'-separated info sections; when both the
    mirna_proximity (section 1) and correlation (section 2) annotations are
    present, the line is kept only if their mirna_start/mirna_stop agree.
    Lines with a missing section are kept unchanged.  Returns `outfile`.
    """
    ## cleanup of extra positions
    ## compare miRNA positions in PROX & CORR
    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                l = line.split('\t')
                descript = l[8].split('@')
                if (descript[1] != '') and (descript[2] != '\n'):
                    info_mprox = descript[1].split(';')
                    prox_start = get_value_from_keycolonvalue_list(
                        'mirna_start', info_mprox)
                    prox_stop = get_value_from_keycolonvalue_list(
                        'mirna_stop', info_mprox)
                    info_corr = descript[2].split(';')
                    corr_start = get_value_from_keycolonvalue_list(
                        'mirna_start', info_corr)
                    corr_stop = get_value_from_keycolonvalue_list(
                        'mirna_stop', info_corr)
                    # bugfix: the stop comparison was `prox_stop == prox_stop`
                    # (always True), so mismatching stops were never detected
                    if (prox_start == corr_start) and \
                       (prox_stop == corr_stop):
                        out.write(line)
                else:
                    out.write(line)
    return outfile
def _verify_valid_distance(infile):
    """Split `infile` into valid-distance rows and bad TSS/miRNA pairs.

    Rows whose info column carries distance 'NA' (chromosome or strand
    mismatch between TSS and miRNA) go to '<infile>.badpair'; the rest go to
    '<infile>.validdistance'.  An empty badpair file is deleted again.
    Returns the path of the valid-distance file.
    """
    out_good = infile + '.validdistance'
    out_bad = infile + '.badpair'
    with open(out_bad, 'w') as outB:
        with open(out_good, 'w') as outG:
            with open(infile) as f:
                for l in f:
                    l = l.strip().split('\t')
                    info = l[8].split(';')
                    d = get_value_from_keycolonvalue_list('distance', info)
                    if d == 'NA':
                        chrom = l[0]
                        start = l[3]
                        stop = l[4]
                        strand = l[6]
                        mirna = get_value_from_keycolonvalue_list(
                            'mirna_query', info)
                        badpair = 'chr%s:%s..%s,%s\t%s' % (chrom, start, stop,
                                                           strand, mirna)
                        outB.write(badpair + '\n')
                    else:
                        outG.write('\t'.join(l) + '\n')

    if os.stat(out_bad).st_size != 0:
        # print() with one argument works under both Python 2 and 3
        print("## There are some bad positions in your input file:")
        print("## chromosome or strand differences between TSS and miRNA pair")
        print(out_bad)
    else:
        os.remove(out_bad)
    return out_good
def _make_newline(l, d):
    """Assemble a gff-format output line for a TSS/miRNA pair `d` bp apart."""
    info = l[8].split(';')
    # parsed for reference only: consumed solely by the commented-out
    # region fields below
    region_up = get_value_from_keycolonvalue_list('region_start', info)
    region_down = get_value_from_keycolonvalue_list('region_stop', info)

    attrs = re.sub(' |"', '', l[17]).strip().split(';')
    pairs = [
        ('distance', str(d)),
        #('region_start', region_up), ('region_stop', region_down),
        ('mirna_acc', get_value_from_keycolonvalue_list('ACC', attrs, '=')),
        ('mirbase_id', get_value_from_keycolonvalue_list('ID', attrs, '=')),
        ('mirna_start', l[12]),
        ('mirna_stop', l[13]),
    ]
    new_info = ';'.join('%s:%s' % kv for kv in pairs)

    columns = [l[0], l[1], l[2],        # chrom, source, feature
               l[3], l[4],              # peak start / stop
               str(distance_score(d)),  # miRNA proximity score
               l[6],                    # strand
               '.',
               new_info]
    return '\t'.join(columns)
def _swap_columns(f_cage, f_out):
    """Swap gff columns 4/5 with the start/stop values of the info column.

    The start/stop from the info column become gff start/stop, while the old
    columns 4/5 are preserved in the info column as region_start/region_stop.
    """
    with open(f_out, 'w') as out:
        with open(f_cage) as f:
            for l in f:
                l = l.strip().split('\t')
                tss_up = l[3]
                tss_down = l[4]
                info = l[8].split(';')
                start = get_value_from_keycolonvalue_list('start', info)
                stop = get_value_from_keycolonvalue_list('stop', info)
                ## new info column
                # list comprehension instead of filter(): under Python 3
                # filter() returns an iterator, which has no .append();
                # behaviour under Python 2 is identical
                info = [x for x in info
                        if not (x.startswith('start:') or
                                x.startswith('stop:'))]
                info.append('region_start:%s;region_stop:%s' % (tss_up,
                                                                tss_down))
                l[8] = ';'.join(info)
                ## new start & stop
                l[3] = start
                l[4] = stop
                out.write('\t'.join(l) + '\n')
    return
def _pull_putative_prom(somefile, somelist, is_strict):
    """Collect putative-promoter rows of `somefile` into the dict `somelist`.

    Keys are 'chrom,start,stop,strand,mirna'; values are
    [count, prob_prom, n_libraries, features].  For a repeated position the
    entry with the higher probability (ties broken by count) is kept and the
    library counter is incremented.  When `is_strict`, rows whose
    mirna_prox feature is 0 are skipped.

    NOTE(review): `count` and `prob_prom` remain *strings*, so the >/>=
    comparisons below are lexicographic -- confirm that upstream number
    formatting makes this equivalent to numeric ordering.
    """
    with open(somefile) as f:
        for l in f:
            l = l.strip().split('\t')
            label = l[13]
            if label.startswith('prom'):
                if is_strict:
                    mprox = float(
                        get_value_from_keycolonvalue_list(
                            'mirna_prox', l[7].split(';')))
                    if mprox == 0:
                        continue
                chrom = l[0]
                start = l[3]
                stop = l[4]
                count = l[5]
                strand = l[6]
                prob_prom = l[11]
                # miRNA position from the mirna_proximity ('@' section 1)
                info = l[8].split('@')[1].split(';')
                mirna = '%s-%s' % (
                    get_value_from_keycolonvalue_list('mirna_start', info),
                    get_value_from_keycolonvalue_list('mirna_stop', info))
                if mirna == '-':
                    mirna = 'NA-NA'
                try:
                    # the third '@' section (correlation) may be absent;
                    # in that case the id suffix is simply skipped
                    info = l[8].split('@')[2].split(';')
                    mirnaid = get_value_from_keycolonvalue_list(
                        'mirbase_id', info)
                    if mirnaid != '':
                        mirna = '%s:%s' % (mirna, mirnaid)
                    else:
                        mirna = '%s:%s' % (mirna, 'NA')
                except:
                    pass
                position = '%s,%s,%s,%s,%s' % (chrom, start, stop, strand,
                                               mirna)
                ## consider position with max prob
                if position in somelist:
                    old_count, old_prob, nlib, _ = somelist[position]
                    if (old_prob > prob_prom) or \
                       (old_prob == prob_prom and old_count >= count):
                        # existing entry wins: only count the extra library
                        somelist[position][2] += 1
                        continue
                    else:
                        somelist[position] = [count, prob_prom, nlib + 1,
                                              l[7]]
                else:
                    somelist[position] = [count, prob_prom, 1, l[7]]
    return somelist
def _verify_mirbaseID(gff_infile, gff_outfile):
    """Copy only lines whose last two '@'-sections agree on mirbase_id.

    A line is kept when the two ids are equal, or when either section has
    no mirbase_id at all.
    """
    with open(gff_outfile, 'w') as out:
        with open(gff_infile) as f:
            for line in f:
                sections = line.strip().split('\t')[8].split('@')
                id_a = get_value_from_keycolonvalue_list(
                    'mirbase_id', sections[-2].split(';'))
                id_b = get_value_from_keycolonvalue_list(
                    'mirbase_id', sections[-1].split(';'))
                keep = (id_a == id_b) or (id_a == '') or (id_b == '')
                if keep:
                    out.write(line)
    return
def _average_conservation(f_cons, f_aver_cons):
    """Average the per-base conservation scores of each region.

    Each input line is '<info>,<score>,<score>,...' where the info field
    carries start/stop as key:value pairs.  Writes
    'start<TAB>stop<TAB>mean' lines to `f_aver_cons`.
    """
    with open(f_aver_cons, 'w') as out:
        with open(f_cons) as f:
            for l in f:
                l = l.split(',')
                info = l[0].split(';')
                start_pos = get_value_from_keycolonvalue_list('start', info)
                stop_pos = get_value_from_keycolonvalue_list('stop', info)
                try:
                    av_conservation = lmean([float(i) for i in l[1:]])
                except Exception:
                    # was a bare `except:`; keep the 0.0 fallback for rows
                    # with missing/non-numeric scores, but no longer swallow
                    # KeyboardInterrupt/SystemExit
                    av_conservation = 0.0
                out.write('\t'.join([start_pos, stop_pos,
                                     str(av_conservation)]) + '\n')
def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata,
                          f_out):
    """Merge line-aligned feature files into a single gff-like matrix.

    The four inputs must have the same number of lines (line i of each file
    describes the same region).  For each gff line, the cpg (col 4 of
    sorted_cpg), conservation (col 3 of sorted_avcons) and tata affinity
    (col 8 of sorted_tata) values are joined into the features column of
    `f_out`.  Exits with an error message if the line counts differ.
    """
    ## check that all input files contain the same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        # bugfix: the counts must be a tuple for the % operator; the original
        # `% n_g, n_c, n_a, n_t` raised TypeError instead of the message
        sys.exit('Error: line count of feature files are not all equal:%s,%s,%s,%s'
                 % (n_g, n_c, n_a, n_t))

    ## create matrix
    lcount = 0
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            for l in f:
                lcount += 1
                l = l.strip().split('\t')
                c = l[0]
                region_up = l[3]    # 500bp upstream of start; not used
                region_down = l[4]  # 500bp downstream of start; not used
                count = l[5]
                strand = l[6]
                info = l[8].split(';')
                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop = get_value_from_keycolonvalue_list('stop', info)
                CpG_value = linecache.getline(
                    sorted_cpg, lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(
                        sorted_avcons, lcount).strip().split('\t')[2]
                except IndexError:
                    # missing/short conservation line -> default to 0
                    # (narrowed from a bare except:)
                    conservation = '0'
                affinity = linecache.getline(
                    sorted_tata, lcount).strip().split('\t')[7]
                features = ';'.join(['cpg:' + CpG_value,
                                     'cons:' + conservation,
                                     'tata:' + affinity])
                new_info = ';'.join(['region_start:' + region_up,
                                     'region_stop:' + region_down])
                line = '\t'.join([c, l[1], l[2], peak_start, peak_stop,
                                  count, strand, features, new_info])
                out.write(line + '\n')
def _make_newline(l, d):
    """Format one gff output line for a TSS/miRNA pair at distance `d`.

    NOTE(review): duplicate of an identical _make_newline defined earlier in
    this module; the later definition is the one in effect at import time.
    """
    ## output in gff format
    mirna_proximity = str(distance_score(d))
    chrom = l[0]
    peak_start = l[3]
    peak_stop = l[4]
    strand = l[6]
    info = l[8].split(';')
    # region_* are parsed but only referenced by the commented-out field below
    region_up = get_value_from_keycolonvalue_list('region_start', info)
    region_down = get_value_from_keycolonvalue_list('region_stop', info)
    mirna_start = l[12]
    mirna_stop = l[13]
    # strip blanks/quotes from the miRNA attribute column before splitting
    mirna_info = re.sub(' |"', '', l[17]).strip().split(';')
    mirna_acc = get_value_from_keycolonvalue_list('ACC', mirna_info, '=')
    mirbase_id = get_value_from_keycolonvalue_list('ID', mirna_info, '=')
    new_info = ';'.join([
        'distance:' + str(d),
        #'region_start:'+region_up, 'region_stop:'+region_down,
        'mirna_acc:' + mirna_acc,
        'mirbase_id:' + mirbase_id,
        'mirna_start:' + mirna_start,
        'mirna_stop:' + mirna_stop
    ])
    newline = '\t'.join([
        chrom, l[1], l[2],
        #'overlap', 'putative_tss',
        peak_start, peak_stop, mirna_proximity, strand, '.', new_info
    ])
    return newline
def gff_unify_features(gff_a, gff_b, fname, dfvalue, f_out,
                       retainSourceFeature=False):
    """Intersect two gff files and merge feature `fname` of gff_b into gff_a.

    Lines of `gff_a` with no overlap in `gff_b` get the default value
    `dfvalue` and an empty second info section.  Info columns are joined
    with '@'.  When `retainSourceFeature` is True the original columns 2/3
    are kept; otherwise they become 'putative_tss' and the matched
    mirbase id.
    """
    ## unify
    f_out_tmp = f_out + '.tmp'
    bedtools_intersect(gff_a, gff_b, f_out_tmp)

    ## parse
    with open(f_out, 'w') as out:
        with open(f_out_tmp) as f:
            for l in f:
                l = l.strip().split('\t')
                chrom = l[0]
                start = l[3]
                stop = l[4]
                count = l[5]
                strand = l[6]
                features = l[7]
                info_a = l[8]
                _chrom = l[9]
                if chrom == _chrom:
                    ## yes overlap of features w/ mirna_proximity
                    x_b = l[14]
                    info_b = l[17]
                    mirbase_id = get_value_from_keycolonvalue_list(
                        'mirbase_id', info_b.split(';'))
                else:
                    x_b = dfvalue
                    info_b = ''
                    mirbase_id = '.'
                features = '%s;%s:%s' % (features, fname, x_b)
                new_info = info_a + '@' + info_b
                if retainSourceFeature:
                    newline = '\t'.join([chrom, l[1], l[2], start, stop,
                                         count, strand, features, new_info])
                else:
                    newline = '\t'.join([chrom, 'putative_tss', mirbase_id,
                                         start, stop, count, strand,
                                         features, new_info])
                out.write(newline + '\n')
    # os.remove instead of shelling out to `rm`: portable and no risk of
    # shell interpretation of the path
    os.remove(f_out_tmp)
    return
def estimate_betas(f_trainingset, trainingfeatures):
    """Estimate the beta parameters for `trainingfeatures` via the R script.

    'mirna_prox' is excluded from the R fit; its beta is set afterwards to
    the minimum of the fitted feature betas and inserted back at its
    original position.  Returns the betas as floats (intercept first).

    NOTE(review): pops 'mirna_prox' from the caller's `trainingfeatures`
    list without restoring it -- confirm callers do not rely on the list
    staying intact.
    """
    if 'mirna_prox' in trainingfeatures:
        mprox = trainingfeatures.index('mirna_prox')
        trainingfeatures.pop(mprox)
        add_mprox = True
    else:
        add_mprox = False

    ## preprocess to right format for Rscript
    f_intermediate = f_trainingset + '.intermediate.tmp'
    with open(f_intermediate, 'w') as out:
        with open(f_trainingset) as f:
            for l in f:
                l = l.strip().split('\t')
                features = l[7].split(';')
                fvalues = []
                for i in trainingfeatures:
                    try:
                        fvalues.append(float(
                            get_value_from_keycolonvalue_list(i, features)))
                    except ValueError:
                        fvalues.append(0)
                fvalues = [str(i) for i in fvalues]
                # background rows are the negative class
                if 'back' in l[2].lower():
                    label = '0'
                else:
                    label = '1'
                out.write('\t'.join([label] + fvalues) + '\n')

    ## estimating the beta parameters
    ## note: mirna_proximity is not considered in the fitting of betas
    f_beta_tmp = f_trainingset + '.parameters_beta.tmp'
    os.system('R --slave --vanilla --args ' + f_intermediate + ' ' +
              f_beta_tmp + ' < external/choose_beta_params.R')
    betas = []
    with open(f_beta_tmp) as f:
        for l in f:
            betas.append(l.strip())

    ## estimate beta for mirna_proximity
    betas = [float(b) for b in betas]
    if add_mprox:
        beta_mprox = min(betas[1:])
        betas.insert(mprox + 1, beta_mprox)

    # cleanup via os.remove instead of shelling out to `rm`
    os.remove(f_intermediate)
    os.remove(f_beta_tmp)
    return betas
def estimate_betas(f_trainingset, trainingfeatures):
    """Estimate the beta parameters for `trainingfeatures` via the R script.

    'mirna_prox' is left out of the R fit; its beta is set afterwards to the
    minimum of the fitted feature betas and re-inserted at its original
    position.  Returns the betas as floats (intercept first).

    NOTE(review): duplicate of an identical estimate_betas defined earlier
    in this module; the later definition wins at import time.  Also pops
    'mirna_prox' from the caller's list without restoring it.
    """
    if 'mirna_prox' in trainingfeatures:
        mprox = trainingfeatures.index('mirna_prox')
        trainingfeatures.pop(mprox)
        add_mprox = True
    else:
        add_mprox = False

    ## preprocess to right format for Rscript
    f_intermediate = f_trainingset + '.intermediate.tmp'
    with open(f_intermediate, 'w') as out:
        with open(f_trainingset) as f:
            for l in f:
                l = l.strip().split('\t')
                features = l[7].split(';')
                fvalues = []
                for i in trainingfeatures:
                    try:
                        fvalues.append(float(
                            get_value_from_keycolonvalue_list(i, features)))
                    except ValueError:
                        fvalues.append(0)
                fvalues = [str(i) for i in fvalues]
                # background rows are the negative class
                if 'back' in l[2].lower():
                    label = '0'
                else:
                    label = '1'
                out.write('\t'.join([label] + fvalues) + '\n')

    ## estimating the beta parameters
    ## note: mirna_proximity is not considered in the fitting of betas
    f_beta_tmp = f_trainingset + '.parameters_beta.tmp'
    os.system('R --slave --vanilla --args '+f_intermediate+' '+f_beta_tmp+\
              ' < external/choose_beta_params.R')
    betas = []
    with open(f_beta_tmp) as f:
        for l in f:
            betas.append(l.strip())

    ## estimate beta4 (for mirna_proximity)
    betas = [float(b) for b in betas]
    if add_mprox:
        beta_mprox = min(betas[1:])
        betas.insert(mprox+1, beta_mprox)

    os.system('rm %s %s' % (f_intermediate, f_beta_tmp))
    return betas
def _swap_columns(f_cage, f_out):
    """Swap gff columns 4/5 with the start/stop values of the info column.

    The start/stop from the info column become gff start/stop, while the old
    columns 4/5 are preserved in the info column as region_start/region_stop.
    """
    with open(f_out, 'w') as out:
        with open(f_cage) as f:
            for l in f:
                l = l.strip().split('\t')
                tss_up = l[3]
                tss_down = l[4]
                info = l[8].split(';')
                start = get_value_from_keycolonvalue_list('start', info)
                stop = get_value_from_keycolonvalue_list('stop', info)
                ## new info column
                # list comprehension instead of filter(): under Python 3
                # filter() returns an iterator, which has no .append();
                # behaviour under Python 2 is identical
                info = [x for x in info
                        if not (x.startswith('start:') or
                                x.startswith('stop:'))]
                info.append('region_start:%s;region_stop:%s' % (tss_up,
                                                                tss_down))
                l[8] = ';'.join(info)
                ## new start & stop
                l[3] = start
                l[4] = stop
                out.write('\t'.join(l) + '\n')
    return
def _check_labelling(infile, labelfile):
    """Return a path to a mirna_label-annotated version of `infile`.

    Inspects only the first line: if it already carries a 'mirna_label'
    info key the file is returned unchanged; otherwise labelling (lb.main)
    is run and the path of the new '.label' file is returned.
    """
    ## simple check
    # bugfix: isLabelled was unbound (NameError) when `infile` is empty
    isLabelled = False
    with open(infile) as f:
        for l in f:
            info = l.strip().split('\t')[8].split(';')
            label = get_value_from_keycolonvalue_list('mirna_label', info)
            isLabelled = (label != '')
            break

    if isLabelled:
        return infile
    else:
        # single-argument print() works under both Python 2 and 3
        print('## No labelling is found, proceed with labelling...')
        outfile = '%s.label' % infile
        lb.main(infile, labelfile, outfile)
        return outfile
def _readcount_finder(somefile, somelist, get_id=False): with open(somefile) as f: for l in f: l = l.split('\t') chrom = l[0] start = l[3] stop = l[4] strand = l[6] position = '%s,%s,%s,%s' % (chrom, start, stop, strand) if position in somelist: x = l[5] if get_id: info = l[8].strip().split(';') sid = get_value_from_keycolonvalue_list('id', info) somelist[position].append(sid + ':' + x) else: somelist[position].append(x) return somelist
def read_data(f_trainingset, trainingfeatures):
    """Parse the training-set gff into model-input tuples.

    Each returned item is (chrom, start, stop, strand, count,
    trainingfeatures, fvalues, z1, z2).  Rows with zero read count are
    skipped; background rows get priors (z1, z2) = (0.0, 1.0), all others
    (0.5, 0.5).  Feature values that fail float conversion default to 0.
    """
    data = []
    with open(f_trainingset) as f:
        for row in f:
            cols = row.split('\t')
            count = float(cols[5])
            if count == 0:
                continue
            feature_list = cols[7].split(';')
            fvalues = []
            for name in trainingfeatures:
                try:
                    value = float(
                        get_value_from_keycolonvalue_list(name, feature_list))
                except ValueError:
                    value = 0
                fvalues.append(value)
            ## probability of promoter (z1) & background (z2)
            if 'back' in cols[2].lower():
                z1, z2 = 0.0, 1.0
            else:
                z1, z2 = 0.5, 0.5
            data.append((cols[0], int(cols[3]), int(cols[4]), cols[6],
                         count, trainingfeatures, fvalues, z1, z2))
    return data
def promi2(f_param, listoffeatures, infile, outfile):
    """Score every row of `infile` with the PROmiRNA model.

    Reads the model parameters from `f_param`, computes promoter/background
    priors and probabilities for each row's feature values, and writes the
    input line with five extra columns (prior_prom, prior_back, prob_prom,
    prob_back, prediction) to `outfile`.  Exits if the number of betas does
    not match the feature list.
    """
    mu1, mu2, lambda1, lambda2, betas = _read_params(f_param)
    if len(betas) != len(listoffeatures) + 1:
        sys.exit("ERROR: number of betas does not match number of features")

    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                line = line.strip()
                cols = line.split('\t')
                x = float(cols[5])
                feature_items = cols[7].split(';')
                fvalues = []
                for name in listoffeatures:
                    try:
                        fvalues.append(float(
                            get_value_from_keycolonvalue_list(
                                name, feature_items)))
                    except ValueError:
                        fvalues.append(0)
                p_prom, p_back, prior_prom, prior_back = promirna.promirna(
                    x, mu1, mu2, lambda1, lambda2, betas, fvalues)
                prediction = _make_prediction(prior_prom, p_prom, p_back)
                out.write(line + '\t%s\t%s\t%s\t%s\t%s\n' % (
                    prior_prom, prior_back, p_prom, p_back, prediction))
    return
def _read_dat(gff_infile):
    """Read a labelled gff into a pandas DataFrame.

    Columns: tss, mirna (position strings), mirbase_id, label,
    Distance (signed), distance (absolute), correlation.
    """
    dat = {}
    n = 0
    with open(gff_infile) as f:
        for l in f:
            n += 1
            l = l.strip().split('\t')
            chrom = l[0]
            tstart = l[3]
            tstop = l[4]
            strand = l[6]
            tss = ','.join([chrom, tstart, tstop, strand])
            info = l[8].split(';')
            mirbase_id = get_value_from_keycolonvalue_list('mirbase_id', info)
            mstart = get_value_from_keycolonvalue_list('mirna_start', info)
            # bugfix: previously read 'mirna_start' a second time, so
            # mstop always equalled mstart
            mstop = get_value_from_keycolonvalue_list('mirna_stop', info)
            label = get_value_from_keycolonvalue_list('mirna_label', info)
            if label == '':
                label = 'NA'
            mirna = ','.join([chrom, mstart, mstop, strand])
            features = l[7].split(';')
            corr = get_value_from_keycolonvalue_list('corr', features)
            # NOTE(review): the helper returns a *string*, so `!= 0` compares
            # str to int and is always True (even for '0'); probably meant
            # `!= '0'` -- kept as-is to preserve behaviour.
            if get_value_from_keycolonvalue_list('mirna_prox', features) != 0:
                distance = get_value_from_keycolonvalue_list('distance', info)
                if distance == '':
                    distance = 0
                dat[n] = [tss, mirna, mirbase_id, label,
                          distance, abs(float(distance)), corr]

    dat = pd.DataFrame.from_dict(dat, orient='index')
    dat.columns = ['tss', 'mirna', 'mirbase_id', 'label',
                   'Distance', 'distance', 'correlation']
    return dat
def _index_feat(gff_ufeat, has_mirna): pairid_index = {} with open(gff_ufeat) as f: c = 0 for l in f: c += 1 chrom, _, _, start, stop, _, strand, _, info = l.strip().rsplit('\t') info = re.split('[;@]', info) pid = '.'.join([chrom, start, stop, strand]) if has_mirna: mirna = get_value_from_keycolonvalue_list('mirna_query', info) val = '%s:%s' % (mirna, c) else: val = c try: pairid_index[pid].append(val) except KeyError: pairid_index[pid] = [val] return pairid_index
def read_data(f_trainingset, trainingfeatures):
    """Parse the training-set gff into model-input tuples.

    Each item is (chrom, start, stop, strand, count, trainingfeatures,
    fvalues, z1, z2); zero-count rows are skipped and background rows get
    priors (0.0, 1.0) instead of (0.5, 0.5).

    NOTE(review): duplicate of an identical read_data defined earlier in
    this module; the later definition is the one in effect.
    """
    data = []
    with open(f_trainingset) as f:
        for l in f:
            l = l.split('\t')
            count = float(l[5])
            if count != 0:
                chrom = l[0]
                start = int(l[3])
                stop = int(l[4])
                strand = l[6]
                features = l[7].split(';')
                fvalues = []
                for i in trainingfeatures:
                    try:
                        fvalues.append(float(
                            get_value_from_keycolonvalue_list(i, features)))
                    except ValueError:
                        # non-numeric / missing feature value defaults to 0
                        fvalues.append(0)
                label = l[2].lower()
                ##probability of promoter (z1) & background (z2)
                if 'back' in label:
                    z1 = 0.0
                    z2 = 1.0
                else:
                    z1 = 0.5
                    z2 = 0.5
                item = (chrom, start, stop, strand, count,
                        trainingfeatures, fvalues, z1, z2)
                data.append(item)
    return data
def _index_feat(gff_ufeat, has_mirna):
    """Index the lines of `gff_ufeat` by position id 'chrom.start.stop.strand'.

    Returns a dict mapping each position id to a list of 1-based line
    numbers; when `has_mirna` is true each entry is '<mirna_query>:<lineno>'.

    NOTE(review): duplicate of an identical _index_feat defined earlier in
    this module; the later definition is the one in effect.
    """
    pairid_index = {}
    with open(gff_ufeat) as f:
        c = 0
        for l in f:
            c += 1
            chrom, _, _, start, stop, _, strand, _, info = l.strip().rsplit(
                '\t')
            info = re.split('[;@]', info)
            pid = '.'.join([chrom, start, stop, strand])
            if has_mirna:
                mirna = get_value_from_keycolonvalue_list('mirna_query', info)
                val = '%s:%s' % (mirna, c)
            else:
                val = c
            try:
                pairid_index[pid].append(val)
            except KeyError:
                pairid_index[pid] = [val]
    return pairid_index
def feature_closest_corr(
        f_querygff,
        f_mirbasegff,   ## sRNAseq to gff format
        m_mirna, m_tss, m_back,  ## matrices with expression values
        f_tcfilesinput, ## will determine columnID for m_back
        method,         ## correlation method
        outfile,
        verbose=False):
    """Add a TSS->closest-miRNA expression-correlation feature to `f_querygff`.

    Pipeline: (1) convert the sRNAseq matrix to gff, (2) pair each query
    region with its closest downstream miRNA via `bedtools closest`,
    (3) pair CAGE/background and sRNAseq samples by library id, (4) compute
    expression correlations with `method`, and (5) unify the 'corr' feature
    into `outfile`.  Intermediate files go to '<outfile>_intermediates'.
    Returns the path of the correlation gff.
    """
    ## files
    d = outfile + '_intermediates'
    ensure_dir(d, False)
    fo_mirnagff = os.path.join(d, 'srnaseq_pos.gff')
    fo_closest = os.path.join(d, 'closest_tss-mirna.txt')
    f_pos_pairing = os.path.join(d, 'pairing_position.txt')
    f_sample_pairing = os.path.join(d, 'pairing_sample.txt')
    fo_corr = os.path.join(d, 'closest_corr.gff')

    ## 1. sRNAseq to gff format
    _find_miRNA_pos(m_mirna, f_mirbasegff, fo_mirnagff)

    ## 2a. find closest pair
    cmd = ('bedtools closest -a ' + f_querygff + ' -b ' + fo_mirnagff +
           ' -s -iu -D a -t first > ' + fo_closest)
    if verbose: print("STATUS: finding closest pair...")
    if verbose: print(cmd)
    os.system(cmd)

    ## 2b. get pairing info: position
    ## -> seq_id, seq_line, mirna_info, mirna_line, label
    if verbose: print('STATUS: identifying pairing info: position...')
    with open(f_pos_pairing + '.posSet', 'w') as out_pos:
        with open(f_pos_pairing + '.negSet', 'w') as out_neg:
            # map 'chr:start..stop,strand' -> 1-based line number of the
            # CAGE expression matrix
            cageseq_dict = {}
            with open(m_tss) as f:
                linenum = 0
                for l in f:
                    linenum += 1
                    if l.startswith('#') \
                            or l.startswith('00Annotation') \
                            or l.startswith('01STAT'):
                        continue
                    pos, _ = l.split('\t', 1)
                    chrom, start, _, stop, strand = re.split('[:.,]', pos)
                    start, stop = _determine_region(start, stop)
                    pos = '%s:%s..%s,%s' % (chrom, start, stop, strand)
                    cageseq_dict[pos] = linenum

            # same for the background matrix (position is column 4)
            background_dict = {}
            with open(m_back) as f:
                linenum = 0
                for l in f:
                    linenum += 1
                    pos = l.split('\t')[3]
                    background_dict[pos] = linenum

            with open(fo_closest) as f:
                for l in f:
                    l = l.strip()
                    _, dist = l.rsplit('\t', 1)  # renamed: `d` is the dir above
                    dist = int(dist)
                    # keep pairs within 50kb downstream only
                    if (dist >= 0) and (dist <= 50000):
                        l = l.split('\t')
                        label = l[2]
                        pos = 'chr%s:%s..%s,%s' % (l[0], l[3], l[4], l[6])
                        seq_id = 'title=%s' % pos
                        mirna_line = l[16]
                        mirna_info = ','.join([
                            'title=' + l[17].split(':')[1],
                            'mirbase_id=' + l[11],
                            'mirna_start=' + l[12],
                            'mirna_stop=' + l[13]
                        ])
                        if label == 'BACK':
                            # background rows are keyed by their region
                            info = l[8].split(';')
                            pos = 'chr%s:%s..%s,%s' % (
                                l[0],
                                get_value_from_keycolonvalue_list(
                                    'region_start', info),
                                get_value_from_keycolonvalue_list(
                                    'region_stop', info),
                                l[6])
                            try:
                                seq_line = str(background_dict[pos])
                                newline = '\t'.join([seq_id, seq_line,
                                                     mirna_info, mirna_line])
                                out_neg.write(newline + '\n')
                            except KeyError:
                                # position absent from the matrix: skip
                                continue
                        else:
                            try:
                                seq_line = str(cageseq_dict[pos])
                                newline = '\t'.join([seq_id, seq_line,
                                                     mirna_info, mirna_line])
                                out_pos.write(newline + '\n')
                            except KeyError:
                                continue

    ## 3. get pairing info: sample
    ## -> sampleID, cage_column_index, srnaseq_matrix_column_index, (cid,mid)
    if verbose: print('STATUS: identifying pairing info: samples...')
    cage_id_pattern = re.compile(r'^tpm.*(CNhs.*\..*)$')
    back_id_pattern = re.compile(r'^.*(CNhs.*?\..*?)\..*$')
    srnaseq_id_pattern = re.compile(r'^.*(SRh.*?\..*?)\.')

    # CAGE matrix: sample id -> 'sample_id:column_index' entries
    cage_index = {}
    with open(m_tss) as f:
        for l in f:
            if l.startswith('00Annotation'):
                l = l.strip().split('\t')
                c = 0
                for header in l:
                    if header.startswith('tpm'):
                        cage_sample_id = cage_id_pattern.match(header).group(1)
                        cage_id = cage_sample_id.split('.')[1]
                        try:
                            cage_index[cage_id].append(
                                '%s:%s' % (cage_sample_id, c))
                        except KeyError:
                            cage_index[cage_id] = [
                                '%s:%s' % (cage_sample_id, c)]
                    c += 1
                break

    # background files list: sample id -> 'sample_id:line' (offset 6)
    back_index = {}
    with open(f_tcfilesinput) as f:
        line = 6
        for l in f:
            cage_sample_id = back_id_pattern.match(l).group(1)
            cage_id = cage_sample_id.split('.')[1]
            try:
                back_index[cage_id].append('%s:%s' % (cage_sample_id, line))
            except KeyError:
                back_index[cage_id] = ['%s:%s' % (cage_sample_id, line)]
            line += 1

    # sRNAseq matrix: sample id -> 'sample_id:column_index'
    srnaseq_index = {}
    with open(m_mirna) as f:
        for l in f:
            if l.startswith('ID'):
                l = l.strip().split('\t')
                c = 0
                for header in l:
                    if header.endswith('.bam'):
                        srnaseq_sample_id = srnaseq_id_pattern.match(
                            header).group(1)
                        srnaseq_id = srnaseq_sample_id.split('.')[1]
                        try:
                            srnaseq_index[srnaseq_id].append(
                                '%s:%s' % (srnaseq_sample_id, c))
                        except KeyError:
                            srnaseq_index[srnaseq_id] = [
                                '%s:%s' % (srnaseq_sample_id, c)]
                    c += 1
                break

    ## combine: one line per (cage, srnaseq) column pair sharing a sample id
    with open(f_sample_pairing + '.posSet', 'w') as out:
        sample_ids = set(cage_index.keys()).intersection(srnaseq_index.keys())
        for k in sample_ids:
            for c in cage_index[k]:
                for m in srnaseq_index[k]:
                    cid, cindex = c.split(':')
                    mid, mindex = m.split(':')
                    out.write('\t'.join([k, cindex, mindex,
                                         '%s,%s' % (cid, mid)]) + '\n')
    with open(f_sample_pairing + '.negSet', 'w') as out:
        sample_ids = set(back_index.keys()).intersection(srnaseq_index.keys())
        for k in sample_ids:
            for c in back_index[k]:
                for m in srnaseq_index[k]:
                    cid, cindex = c.split(':')
                    mid, mindex = m.split(':')
                    out.write('\t'.join([k, cindex, mindex,
                                         '%s,%s' % (cid, mid)]) + '\n')

    ## 4. compute correlation
    # bugfix: the original format string had no %s placeholder, so
    # `% method` raised "not all arguments converted" at runtime
    if verbose: print('STATUS: computing correlation (%s)...' % method)
    _compute_correlation(f_pos_pairing + '.posSet',
                         f_sample_pairing + '.posSet',
                         m_tss, m_mirna, fo_corr + '.posSet',
                         method, 'putative_tss')
    _compute_correlation(f_pos_pairing + '.negSet',
                         f_sample_pairing + '.negSet',
                         m_back, m_mirna, fo_corr + '.negSet',
                         method, 'background')
    # concatenate: background set first, then the positive set
    with open(fo_corr, 'w') as out:
        with open(fo_corr + '.negSet') as f:
            for l in f:
                out.write(l)
        with open(fo_corr + '.posSet') as f:
            for l in f:
                out.write(l)
    os.remove(fo_corr + '.negSet')
    os.remove(fo_corr + '.posSet')

    ## 5. unify
    if verbose: print('STATUS: creating "%s"' % outfile)
    gff_unify_features.main(f_querygff, fo_corr, 'corr', '0', outfile, True)
    return fo_corr
def main(files, outdir, N, percent_lib, is_get_id, f_config, verbose=False):
    """Build the training set for promoter prediction.

    Creates positive (miRNA-proximal CAGE peaks) and negative (background)
    candidate sets, extracts cpg/cons/tata and mirna_proximity features,
    filters the background (cpg <= 0.5, cons <= 0.2, tata <= 0.1,
    mirna_prox == 0), samples N negatives, and writes TrainingSet.gff in
    `outdir`.  When sRNAseq/CAGEseq matrices are configured, also adds the
    closest-pair correlation feature and returns TrainingSet-corr.gff.
    """
    if os.path.isdir(outdir):
        sys.exit('## ERROR: "%s" already exists' % outdir)

    cparser = SafeConfigParser()
    cparser.read(f_config)

    # NOTE(review): unconditionally overrides the `verbose` parameter;
    # looks like leftover debugging -- confirm before removing.
    verbose = True

    f_mirbasegff = cparser.get('mirbase', 'gff2')
    f_chromsizes = cparser.get('genome', 'chromsizes')
    f_repeats = cparser.get('genome', 'repeats')
    f_ensembl = cparser.get('genome', 'ensemblgtf')
    f_fasta = cparser.get('genome', 'fasta')
    d_phastcons = cparser.get('cons', 'phastcons')
    TRAP = cparser.get('tata', 'trap')
    f_psemmatrix = cparser.get('tata', 'psem')
    f_traincfg = cparser.get('configs', 'tcconfig')
    m_mirna = cparser.get('correlation', 'srnaseqmatrix')
    m_tss = cparser.get('correlation', 'cageseqmatrix')
    corrmethod = cparser.get('correlation', 'corrmethod')

    f_trainingset = os.path.join(outdir, 'TrainingSet.gff')
    outdir1 = f_trainingset + '_intermediates'
    ensure_dir(outdir, False)
    ensure_dir(outdir1, False)

    _files = glob.glob(files)

    ## creating auxillary file for negative set
    f_fiveprimegff = '../data/hsa.five_prime.gff'
    if not os.path.exists(f_fiveprimegff):
        if verbose:
            print('STATUS: creating "%s" auxillary file...' % f_fiveprimegff)
        extract_tss_from_ensembl(f_ensembl, f_fiveprimegff)

    ## create training set
    gff_ts_pos = os.path.join(outdir1, 'trainingset_pos.gff')
    gff_ts_neg = os.path.join(outdir1, 'trainingset_neg.gff')
    if verbose:
        print('STATUS: creating positive candidate set...')
    create_positiveset(percent_lib, _files, f_mirbasegff, N, gff_ts_pos,
                       is_get_id)
    if verbose:
        print('STATUS: creating negative candidate set...')
    create_negativeset(f_chromsizes, f_repeats, f_fiveprimegff, f_traincfg,
                       N, gff_ts_neg)
    shutil.move(os.path.join(outdir1, 'tc-norm_negSet'),
                os.path.join(outdir, 'tc-norm_negSet'))

    ## feature extraction: cpg, cons, tata (features.py)
    if verbose:
        print('STATUS: extracting features cpg/cons/tata...')
    gff_1kbfeatures_pos = os.path.join(outdir1, 'features1kb_ts_pos.gff')
    gff_1kbfeatures_neg = os.path.join(outdir1, 'features1kb_ts_neg.gff')
    features.main(gff_ts_pos, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_pos)
    features.main(gff_ts_neg, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_neg)

    ## feature extraction: mirna_proximity
    if verbose:
        print('STATUS: extracting features mirna_proximity...')
    gff_mirnaprox_pos = os.path.join(outdir1, 'featureMprox_ts_pos.gff')
    gff_mirnaprox_neg = os.path.join(outdir1, 'featureMprox_ts_neg.gff')
    mirna_proximity.main(gff_ts_pos, f_mirbasegff, gff_mirnaprox_pos)
    mirna_proximity.main(gff_ts_neg, f_mirbasegff, gff_mirnaprox_neg)

    gff_features_pos = os.path.join(outdir1, 'Features_ts_pos.gff')
    gff_features_neg = os.path.join(outdir1, 'Features_ts_neg.gff')
    gff_unify_features.main(gff_1kbfeatures_pos, gff_mirnaprox_pos,
                            'mirna_prox', '0', gff_features_pos, True)
    gff_unify_features.main(gff_1kbfeatures_neg, gff_mirnaprox_neg,
                            'mirna_prox', '0', gff_features_neg, True)

    ## create final training set ...
    ## background must pass: cpg <= 0.5, cons <= 0.2, tata <= 0.1, mirna_prox == 0
    if verbose:
        print('STATUS: creating final training set...')
    good_background = gff_features_neg + \
        '_cpglt0.5-conslt0.2-tatalt0.1-mproxeq0.gff'
    with open(good_background, 'w') as out:
        with open(gff_features_neg) as f:
            for line in f:
                info = line.strip().split('\t')[7].split(';')
                cpg = float(get_value_from_keycolonvalue_list('cpg', info))
                cons = float(get_value_from_keycolonvalue_list('cons', info))
                tata = float(get_value_from_keycolonvalue_list('tata', info))
                mprx = float(
                    get_value_from_keycolonvalue_list('mirna_prox', info))
                if cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mprx == 0:
                    out.write(line)

    wc = line_count(good_background)
    selectedlines = random.sample(range(1, wc + 1), N)
    with open(f_trainingset, 'w') as out:
        ## writing negative set: N randomly sampled good-background lines
        for i in selectedlines:
            out.write(linecache.getline(good_background, i))
        ## writing positive set
        with open(gff_features_pos) as f:
            ## mirna_prox extraction lists all pairs within 50kb upstream of
            ## a mirna, so one tss may pair with many mirnas; keep only the
            ## first (= minimum-distance) entry per tss
            seen = set()  # set membership is O(1); was a linear list scan
            for line in f:
                l = line.split('\t')
                pos = ','.join([l[0], l[3], l[4], l[6]])
                if pos not in seen:
                    seen.add(pos)
                    out.write(line)

    if not (os.path.isfile(m_mirna) and os.path.isfile(m_tss)):
        return f_trainingset

    ## create final training set with feature:correlation of closest tss->miRNA
    if verbose:
        print('STATUS: creating final training set with correlation of closest tss->miRNA...')
    f_trainingset2 = os.path.join(outdir, 'TrainingSet-corr.gff')
    m_back = glob.glob('%s/tc-norm_negSet/*tpm_rle.matrix' % outdir)[0]
    f_tcfilesinput = os.path.join(outdir, 'tc-norm_negSet', 'files.txt')
    feature_closest_corr(f_trainingset, f_mirbasegff, m_mirna, m_tss, m_back,
                         f_tcfilesinput, corrmethod, f_trainingset2)
    return f_trainingset2