import math
import random

import funcs  # project-local helpers (run bookkeeping, patient/diagnosis lookups)


def first_run(dataset, fixed, outdir, pat, patsubset, patruns, run, testsize):
    run = funcs.establish_run('boruta', fixed, outdir, run)
    patients = set()
    if patsubset is not None:
        # Patient indices in the subset files are local to each data set, so shift
        # them by the number of patients already counted ('done') to get global indices.
        done = 0
        for name in dataset.keys():
            with open('%s%s/%s_patients_%d.txt'
                      % (dataset[name], patsubset, patsubset, patruns[name])) as file:
                selected = [int(line.strip()) for line in file.readlines()]
            patients = patients.union([p + done for p in selected])
            done += pat[name]
    else:
        patients = set(range(sum(pat.values())))
    case, control = funcs.patients_diagnoses(dataset, patients)
    if testsize != 0:
        # Draw the test set evenly from cases and controls (at least one of each).
        half = round(len(patients) * testsize) / 2
        testpat = set(random.sample(case, max(math.floor(half), 1))
                      + random.sample(control, max(math.ceil(half), 1)))
        trainpat = patients - testpat
        with open('%stestpat_%d.txt' % (outdir, run), 'w') as ts:
            ts.write('\n'.join(str(s) for s in sorted(testpat)))
    else:
        testpat = set()
        trainpat = patients
    return run, testpat, trainpat
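# Usage sketch for first_run; every name, path, and count below is an
# illustrative assumption, not a value from the repository, and a real call
# needs the directory layout that funcs expects.
def _first_run_usage_sketch():
    from collections import OrderedDict
    dataset = OrderedDict([('setA', 'data/setA/'), ('setB', 'data/setB/')])
    pat = {'setA': 120, 'setB': 80}  # patients per data set
    run, testpat, trainpat = first_run(
        dataset, fixed=False, outdir='data/boruta/', pat=pat,
        patsubset=None, patruns=None, run=None, testsize=0.2)
    # With testsize=0.2, about 40 of the 200 patients land in testpat, drawn
    # evenly from cases and controls, and are saved to data/boruta/testpat_<run>.txt.
    return run, testpat, trainpat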
def find_crossed(dataset, chrlist, fixed, run, borutaruns, perc):
    """
    Searching for crossed SNPs among the given data sets and writing them into files.
    :param dataset: (dict) keys are names of data sets, values are directories to folders with them
    :param chrlist: (list) chromosomes for analysis
    :param fixed: (boolean) if the run number can be overwritten
    :param run: (int or None) number of run given as a parameter - None if not given
    :param borutaruns: (dict) boruta run number for each data set
    :param perc: (int) percentile passed through to subset_funcs.first_intersection
    :return: number of crossed SNPs for the given data sets
    """
    crossed_snps = 0
    runs = {}
    for setname in dataset.keys():
        runs[setname] = funcs.establish_run('crossed', fixed, dataset[setname] + 'crossed/', run)
    for ch in chrlist:
        print('Analysis for chromosome %d has started!' % ch)
        crossed, ref = subset_funcs.first_intersection(dataset, ch, borutaruns, perc)
        '''
        for setname in list(dataset.keys())[2:]:
            set = open('%smatrices/snps_chr%d.txt' % (dataset[setname], ch), 'r')
            crossed = subset_funcs.next_intersection(set, crossed, ref, ch)
        '''
        # For every data set, write the row index of each crossed SNP, one per line.
        for n, setname in enumerate(dataset.keys()):
            file = open('%scrossed/crossed_snps_chr%d_%d.txt'
                        % (dataset[setname], ch, runs[setname]), 'w')
            for snp in sorted(crossed.keys()):
                file.write('%d\n' % crossed[snp][n])
            file.close()
        crossed_snps += len(crossed)
        print('For chr %d found %d crossed SNPs.' % (ch, len(crossed)))
    print('Run information for every data set is being written to the file.')
    for setname in dataset.keys():
        funcs.runs_file_add(
            'crossed', dataset[setname] + 'crossed/', runs[setname],
            '%d\t%s\t%s\t%s\t%d\t%s\t%d\n'
            % (runs[setname], setname,
               ', '.join(k for k in dataset.keys() if k != setname),
               funcs.make_chrstr(chrlist), crossed_snps,
               ','.join(map(str, borutaruns.values())), perc))
    return crossed_snps
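# Shape of the 'crossed' mapping consumed above, inferred from how find_crossed
# writes it out (an assumption about subset_funcs.first_intersection's output,
# not its documented contract); keys and indices below are illustrative.
def _crossed_shape_sketch():
    # One row index per data set, in the order of dataset.keys():
    crossed = {
        1001: [17, 5],  # SNP keyed 1001: row 17 in the first set, row 5 in the second
        1002: [18, 9],
    }
    # find_crossed writes column n of this mapping, sorted by key, one index per
    # line, to crossed_snps_chr<ch>_<run>.txt for data set n.
    return crossed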
def map_locs_to_rows(directory, infile, run, fixed, name, analysistype='frombed'):
    run = funcs.establish_run(analysistype, fixed, '%s%s/' % (directory, analysistype), run)
    bedfile = open(infile, 'r')
    bedline = bedfile.readline().strip().split()
    ch = int(bedline[0].strip('chr'))
    pos = int(bedline[2])
    chrlist = [ch]
    numsnps = 0
    while bedline:
        out = open('%s%s/%s_snps_chr%d_%d.txt'
                   % (directory, analysistype, analysistype, ch, run), 'w')
        print('Rewriting SNPs for chr %d' % ch)
        with open('%smatrices/snps_chr%d.txt' % (directory, ch), 'r') as snpfile:
            # Scan the SNP matrix rows in order; every time the current BED position
            # is found, record its row index and advance to the next BED line
            # (this assumes every BED position occurs in the matrix; a missing
            # position would make the outer loop rescan this chromosome).
            for i, snpline in enumerate(snpfile):
                if snpline.startswith(str(pos)):
                    out.write('%d\n' % i)
                    numsnps += 1
                    bedline = bedfile.readline().strip().split()
                    if not bedline:
                        break
                    pos = int(bedline[2])
                    if int(bedline[0].strip('chr')) != ch:
                        # The BED file moved on to another chromosome: start a
                        # new output file for it on the next pass of the outer loop.
                        ch = int(bedline[0].strip('chr'))
                        chrlist.append(ch)
                        break
        out.close()
    bedfile.close()
    funcs.runs_file_add(
        analysistype, '%s%s/' % (directory, analysistype), run,
        '%d\t%s\t%s\t%d\t%s\t' % (run, infile, name, numsnps, funcs.make_chrstr(chrlist)))
    print('%d SNPs were rewritten to a %s file!' % (numsnps, analysistype))
    return 0
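# Parsing sketch for a single BED line as map_locs_to_rows reads it (the
# coordinates are made up): note it is the third column, bedline[2], that gets
# matched against the positions in matrices/snps_chr<ch>.txt.
def _bedline_parse_sketch():
    bedline = 'chr7\t10149\t10150\n'.strip().split()
    ch = int(bedline[0].strip('chr'))  # 7 - strip('chr') drops the leading 'chr' characters
    pos = int(bedline[2])              # 10150
    return ch, pos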
if sys.argv[q] == '-thresh':
    thresh = float(sys.argv[q + 1])

if 'pp' in globals():
    # Use pp as the default boruta run number for every data set that
    # was not given one explicitly.
    if borutaruns is None:
        borutaruns = OrderedDict((n, pp) for n in dataset.keys())
    else:
        for name in [nn for nn in dataset.keys() if nn not in borutaruns.keys()]:
            borutaruns[name] = pp

if 'outdir' in globals():
    run = funcs.establish_run('deficient', fixed, outdir, run)
    output = outdir

l = 0    # number of deficient SNPs found so far
all = 0  # total number of SNPs checked (note: shadows the built-in all())
for name, directory in dataset.items():
    print('Analysis for %s dataset' % name)
    if 'outdir' not in globals():
        output = '%sdeficient/' % directory
        run = funcs.establish_run('deficient', fixed, output, run)
    for ch in chrlist:
        print('Checking SNPs for chromosome %d has just started!' % ch)
        locs, al = find_weak(ch, directory, perc, borutaruns[name], thresh)
        l += len(locs)
        all += al
        file = open('%sdeficient_snps_chr%d_%d.txt' % (output, ch, run), 'w')
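# Assumed contract of find_weak, inferred only from the call above (the real
# implementation is not shown here): for one chromosome it returns
#   locs - locations (row indices) of SNPs judged deficient at 'thresh'
#   al   - the total number of SNPs examined on that chromosome
# so that l and all accumulate the deficient and overall counts across chromosomes.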
    funcs.runs_file_add(
        'boruta', outdir, borutarun,
        '%d\t%s\t%d\t%s\t%s\t%s\t%s\t%.2f\t%s\t%d\t%s\n'
        % (borutarun, '+'.join(dataset.keys()), len(trainpat) + len(testpat),
           patsubset, patruns_string, snpsubset, snpruns_string, testsize,
           ','.join(map(str, perc)), r, funcs.make_chrstr(chrlist)))
else:
    funcs.runs_file_rewrite('boruta', outdir, towrite)

if not boruta_only:
    if classrun is None and run is not None:
        classrun = run
    # determine the class run number
    classrun = funcs.establish_run('class', fixed, outdir, classrun)
    scores_file = open('%sclass_scores_%d.txt' % (outdir, classrun), 'w', 1)  # line-buffered
    # writing the heading to the class_scores file
    if frombed and cv:
        scores_file.write('perc\tSNPs\tdataset_train_score\tdataset_test_score\t'
                          'dataset_AUC\ttestset_score\ttestset_AUC\n')
    else:
        scores_file.write('perc\tSNPs\ttrain_score\ttest_score\tAUC\n')
    if makey:
        build_y_matrices(dataset, borutarun, outdir, funcs.patients(dataset), testpat, trainpat)
    if cv and dataset and testset:
        assert frombed
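# Layout of the boruta_runs record written at the top of this fragment, read off
# the format string (the column labels are descriptive guesses, not a header
# taken from the source):
#   run  datasets  n_patients  patsubset  patruns  snpsubset  snpruns  testsize  perc  r  chromosomes
# e.g. (illustrative values only):
#   3    setA+setB  200  similar  1,1  -  -  0.20  80,90  5000  1-23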
if 'rr' in globals():
    # Use rr as the default run number for every data set that was not given one.
    if runs is None:
        runs = OrderedDict((n, rr) for n in dataset.keys())
    else:
        for name in [nn for nn in dataset.keys() if nn not in runs.keys()]:
            runs[name] = rr

# determine the number of patients in each of the given data sets
pat = funcs.patients(dataset)

if not continuation:
    for name in dataset.keys():
        runs[name] = funcs.establish_run('boruta', fixed, outdir, runs[name])
    testpat, trainpat = read_patlist(dataset, outdir, pat, patsubset, patruns, testsize)
else:
    dataset, outdir, patruns, perc, r, snpsubset, snpruns, testpat, testsize, towrite, trainpat = \
        read_boruta_params(chrlist, continuation, dataset, fixed, pat, runs)

# running the Boruta analysis
pooling(chrlist, dataset, outdir, pat, perc, r, borutarun, snpsubset, snpruns, testpat, trainpat)

# saving information about the finished run to the boruta_runs file
if not continuation:
    if patruns is None:
        patruns_string = '-'
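# Assumed contract of read_patlist, inferred only from the call above: like
# first_run's non-continuation path, it is expected to return the test and
# train patient index sets (testpat, trainpat) built from the patsubset
# selection files and testsize; this is a reading of the call site, not its
# documentation.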
    toremove = lower_threshold(lower, linkage, pat)
    print('Number of patients to remove (below the lower threshold %.4f): %d'
          % (lower, len(toremove)))
else:
    print('No lower threshold given')
    toremove = set()

if 'upper' in globals():
    selected = upper_threshold(upper, linkage)
    print('Number of patients outside the biggest cluster (the upper threshold %.4f): %d'
          % (upper, pat - len(selected)))
else:
    print('No upper threshold given')
    selected = list(range(sims.shape[0]))

final = [el for el in selected if el not in toremove]
print('Number of selected patients: %d (%d removed)' % (len(final), pat - len(final)))
diagnoses_dist(dir, final)

run = funcs.establish_run('similar', fixed, outdir, run)
with open('%ssimilar_patients_%d.txt' % (outdir, run), 'w') as file:
    file.write('\n'.join(str(p) for p in final))
funcs.runs_file_add(
    'similar', outdir, run,
    '%d\t%s\t%.4f\t%.4f\t%d\t%d\n' % (run, name, lower, upper, len(final), pat))
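# Read-back sketch for the file written above: one integer patient index per
# line, the same per-line format that the boruta step's patsubset branch parses.
# The function name is illustrative, not part of the repository.
def _read_similar_patients_sketch(outdir, run):
    with open('%ssimilar_patients_%d.txt' % (outdir, run)) as f:
        return [int(line.strip()) for line in f]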