예제 #1
0
def first_run(dataset, fixed, outdir, pat, patsubset, patruns, run, testsize):
    """
    Establish a 'boruta' run and split the analysed patients into test and train sets.

    Patients are numbered globally: the indices read for a data set are shifted
    by the number of patients of all data sets that precede it in `dataset`.

    :param dataset: (dict) the keys are names of data sets, values are directories to folders with them
    :param fixed: (boolean) if number of run can be overwritten
    :param outdir: (str) output directory prefix; the list of test patients is written there
    :param pat: (dict) number of patients for every data set name
    :param patsubset: (str or None) name of the patient subset to read; None selects all patients
    :param patruns: (dict) run number of the patient subset for every data set name
        (read only when patsubset is not None)
    :param run: (int or None) number of run given as a parameter - None if not given
    :param testsize: (number) fraction of the selected patients used as the test set; 0 disables the split
    :return: (run, testpat, trainpat) - the established run number, the set of test patients
        and the set of train patients
    """

    run = funcs.establish_run('boruta', fixed, outdir, run)

    if patsubset is not None:
        patients = set()
        offset = 0
        for name in dataset.keys():
            subset_path = '%s%s/%s_patients_%d.txt' % (
                dataset[name], patsubset, patsubset, patruns[name])
            with open(subset_path) as subset_file:
                # Blank lines (e.g. a trailing newline) carry no patient index.
                selected = [
                    int(line.strip()) for line in subset_file if line.strip()
                ]
            patients.update(p + offset for p in selected)
            offset += pat[name]
    else:
        patients = set(range(sum(pat.values())))

    case, control = funcs.patients_diagnoses(dataset, patients)
    if testsize != 0:
        # The test set is drawn half from cases and half from controls,
        # with at least one patient from each group.
        half = round(len(patients) * testsize) / 2
        # random.sample() accepts only sequences on Python >= 3.11, so the
        # populations are materialised as lists first.
        testpat = set(
            random.sample(list(case), max(math.floor(half), 1)) +
            random.sample(list(control), max(math.ceil(half), 1)))
        trainpat = patients - testpat
        with open('%stestpat_%d.txt' % (outdir, run), 'w') as ts:
            ts.write('\n'.join([str(s) for s in sorted(testpat)]))
    else:
        testpat = set()
        trainpat = patients

    return run, testpat, trainpat
def find_crossed(dataset, chrlist, fixed, run, borutaruns, perc):
    """
    Searching for crossed SNPs among given data sets, writing them into files.

    For every chromosome and every data set a file
    '<dataset dir>crossed/crossed_snps_chr<ch>_<run>.txt' is written; afterwards
    one line describing the run is added to the 'crossed' runs file of each data set.

    :param dataset: (dict) the keys are name of data sets, values are directories to folders with them
    :param chrlist: (list) chromosomes for analysis
    :param fixed: (boolean) if number of run can be overwritten
    :param run: (int or None) number of run given as a parameter - None if not given
    :param borutaruns: (dict) passed to subset_funcs.first_intersection; its values are recorded in the runs file
    :param perc: passed to subset_funcs.first_intersection; recorded in the runs file as an integer
    :return: number of crossed SNPs for the given data sets
    """

    crossed_snps = 0
    runs = {}
    for setname, directory in dataset.items():
        runs[setname] = funcs.establish_run('crossed', fixed,
                                            directory + 'crossed/', run)

    for ch in chrlist:

        print('Analysis for chromosome %d has started!' % ch)
        crossed, ref = subset_funcs.first_intersection(dataset, ch, borutaruns,
                                                       perc)
        # NOTE(review): a previously disabled code path intersected the third
        # and further data sets via subset_funcs.next_intersection(); only
        # first_intersection() is used now and `ref` is not consumed here.

        # crossed[snp][n] is the value written for the n-th data set.
        ordered_snps = sorted(crossed.keys())
        for n, setname in enumerate(dataset.keys()):
            out_path = '%scrossed/crossed_snps_chr%d_%d.txt' % (
                dataset[setname], ch, runs[setname])
            # The context manager closes the file even if a write fails.
            with open(out_path, 'w') as out:
                for snp in ordered_snps:
                    out.write('%d\n' % crossed[snp][n])

        crossed_snps += len(crossed)
        print('For chr %d found %d crossed SNPs.' % (ch, len(crossed)))

    print('Run information for every dataset is writing to the file.')

    chrstr = funcs.make_chrstr(chrlist)
    borutaruns_str = ','.join(map(str, borutaruns.values()))
    for setname in dataset.keys():
        other_sets = ', '.join([k for k in dataset.keys() if k != setname])
        funcs.runs_file_add(
            'crossed', dataset[setname] + 'crossed/', runs[setname],
            '%d\t%s\t%s\t%s\t%d\t%s\t%d\n' %
            (runs[setname], setname, other_sets, chrstr, crossed_snps,
             borutaruns_str, perc))

    return crossed_snps
def _line_has_position(snpline, pos):
    """Return True if snpline starts with exactly the number pos (not a longer number)."""
    prefix = str(pos)
    if not snpline.startswith(prefix):
        return False
    # Reject e.g. '1234...' when looking for 123.
    return not snpline[len(prefix):len(prefix) + 1].isdigit()


def map_locs_to_rows(directory,
                     infile,
                     run,
                     fixed,
                     name,
                     analysistype='frombed'):
    """
    Translate positions listed in a BED-like file into row numbers of the SNP files.

    Every line of `infile` is split on whitespace; the first field is the
    chromosome ('chr<number>') and the third field is the position. For each
    chromosome, '<directory>matrices/snps_chr<ch>.txt' is scanned once from the
    top and the 0-based number of every line starting with the wanted position
    is written to '<directory><analysistype>/<analysistype>_snps_chr<ch>_<run>.txt'.
    The scan is sequential, so entries of `infile` must appear in the same
    order as the lines of the SNP file.

    :param directory: (str) directory prefix of the data set
    :param infile: (str) path to the BED-like input file
    :param run: (int or None) number of run given as a parameter - None if not given
    :param fixed: (boolean) if number of run can be overwritten
    :param name: recorded in the runs file
    :param analysistype: (str) name of the output subfolder, file prefix and runs file
    :return: 0
    :raises ValueError: if a position from `infile` is not found in the SNP file
        (previously this case rescanned and truncated the output file endlessly)
    """

    run = funcs.establish_run(analysistype, fixed,
                              '%s%s/' % (directory, analysistype), run)
    numsnps = 0
    with open(infile, 'r') as bedfile:
        bedline = bedfile.readline().strip().split()
        ch = int(bedline[0].strip('chr'))
        pos = int(bedline[2])
        chrlist = [ch]
        while bedline:
            out_path = '%s%s/%s_snps_chr%d_%d.txt' % (
                directory, analysistype, analysistype, ch, run)
            snp_path = '%smatrices/snps_chr%d.txt' % (directory, ch)
            # One output file per chromosome; closed before the next one opens.
            with open(out_path, 'w') as out:
                print('Rewriting SNPs for chr %s' % ch)
                with open(snp_path, 'r') as snpfile:
                    for i, snpline in enumerate(snpfile):
                        if not _line_has_position(snpline, pos):
                            continue
                        out.write('%d\n' % i)
                        numsnps += 1
                        bedline = bedfile.readline().strip().split()
                        if not bedline:
                            # End of the BED file (or a blank line): done.
                            break
                        pos = int(bedline[2])
                        next_ch = int(bedline[0].strip('chr'))
                        if next_ch != ch:
                            # Continue with the SNP file of the next chromosome.
                            ch = next_ch
                            chrlist.append(ch)
                            break
                    else:
                        # SNP file exhausted while a position is still pending.
                        raise ValueError(
                            'Position %d on chr %d from %s was not found in %s'
                            % (pos, ch, infile, snp_path))
    # NOTE(review): unlike the other runs-file lines in this project, this
    # line ends with a tab rather than a newline - confirm it is intended.
    funcs.runs_file_add(
        analysistype, '%s%s/' % (directory, analysistype), run,
        '%d\t%s\t%s\t%d\t%s\t' %
        (run, infile, name, numsnps, funcs.make_chrstr(chrlist)))
    print('%s SNPs were rewritten to a %s file!' % (numsnps, analysistype))
    return 0
예제 #4
0
    if sys.argv[q] == '-thresh':
        thresh = float(sys.argv[q + 1])

# When a common run number `pp` is defined, use it for every data set that has
# no explicit entry in `borutaruns`.
if 'pp' in globals():
    if borutaruns is None:
        # No per-data-set runs at all: map every data set name to `pp`.
        borutaruns = OrderedDict([
            (n, num) for n, num in zip(dataset.keys(), [pp] * len(dataset))
        ])
    else:
        # Fill in only the data sets missing from `borutaruns`.
        for name in [
                nn for nn in dataset.keys() if nn not in borutaruns.keys()
        ]:
            borutaruns[name] = pp

# With a common output directory the 'deficient' run is established once here;
# otherwise it is established per data set inside the loop below.
if 'outdir' in globals():
    run = funcs.establish_run('deficient', fixed, outdir, run)
    output = outdir

l = 0  # running total of len(locs) over all data sets and chromosomes
all = 0  # running total of the second value returned by find_weak
for name, directory in dataset.items():
    print('Analysis for %s dataset' % name)
    if 'outdir' not in globals():
        # Per-data-set output folder and run number.
        # NOTE(review): `run` is overwritten here and fed into the next
        # iteration's establish_run call - confirm this is intended.
        output = '%sdeficient/' % directory
        run = funcs.establish_run('deficient', fixed, output, run)
    for ch in chrlist:
        print('Checking SNPs for chromosome %d has just started!' % ch)
        locs, al = find_weak(ch, directory, perc, borutaruns[name], thresh)
        l += len(locs)
        all += al
        # NOTE(review): opened without a context manager; it must be closed
        # by code outside this excerpt.
        file = open('%sdeficient_snps_chr%d_%d.txt' % (output, ch, run), 'w')
예제 #5
0
        funcs.runs_file_add(
            'boruta', outdir, borutarun,
            '%d\t%s\t%d\t%s\t%s\t%s\t%s\t%.2f\t%s\t%d\t%s\n' %
            (borutarun, '+'.join(dataset.keys()), len(trainpat) + len(testpat),
             patsubset, patruns_string, snpsubset, snpruns_string, testsize,
             ','.join(list(map(str, perc))), r, funcs.make_chrstr(chrlist)))
    else:
        funcs.runs_file_rewrite('boruta', outdir, towrite)

# Classification stage: skipped entirely when only Boruta was requested.
if not boruta_only:

    # Fall back to the general run number when no class run was given.
    if classrun is None and run is not None:
        classrun = run

    # Determine the number of the class run.
    classrun = funcs.establish_run('class', fixed, outdir, classrun)
    # Third argument is buffering=1, i.e. line-buffered in text mode.
    # NOTE(review): opened without a context manager; it must be closed by
    # code outside this excerpt.
    scores_file = open('%sclass_scores_%d.txt' % (outdir, classrun), 'w', 1)
    if frombed and cv:
        # Header with separate dataset/testset score and AUC columns.
        scores_file.write(
            'perc\tSNPs\tdataset_train_score\tdataset_test_score\tdataset_AUC\ttestset_score\ttestset_AUC\n'
        )
    else:
        scores_file.write('perc\tSNPs\ttrain_score\ttest_score\tAUC\n'
                          )  # default heading of the class_scores file

    if makey:
        build_y_matrices(dataset, borutarun, outdir, funcs.patients(dataset),
                         testpat, trainpat)

    # NOTE(review): `assert` is stripped under `python -O`; if this guards user
    # input it should be an explicit check.
    if cv and dataset and testset:
        assert frombed
# When a common run number `rr` is defined, use it for every data set that has
# no explicit entry in `runs`.
if 'rr' in globals():
    if runs is None:
        # No per-data-set runs at all: map every data set name to `rr`.
        runs = OrderedDict([
            (n, num) for n, num in zip(dataset.keys(), [rr] * len(dataset))
        ])
    else:
        # Fill in only the data sets missing from `runs`.
        for name in [nn for nn in dataset.keys() if nn not in runs.keys()]:
            runs[name] = rr

# Determine the number of patients in the given data sets.
pat = funcs.patients(dataset)

if not continuation:
    # NOTE(review): `dataset` is used as a dict above (dataset.keys()), so
    # iterating it directly yields keys only and the two-name unpacking looks
    # wrong - presumably `dataset.items()` (or just the keys) was intended.
    # NOTE(review): `runs` may still be None here when `rr` is not defined.
    for name, directory in dataset:
        runs[name] = funcs.establish_run('boruta', fixed, outdir, runs[name])
    testpat, trainpat = read_patlist(dataset, outdir, pat, patsubset, patruns,
                                     testsize)

else:
    # Continuation: restore the parameters of the earlier run.
    dataset, outdir, patruns, perc, r, snpsubset, snpruns, testpat, testsize, towrite, trainpat = \
        read_boruta_params(chrlist, continuation, dataset, fixed, pat, runs)

    # Running Boruta analysis.
    # NOTE(review): as indented here, this runs only in the continuation
    # branch, and `borutarun` is not assigned anywhere in this excerpt.
    pooling(chrlist, dataset, outdir, pat, perc, r, borutarun, snpsubset,
            snpruns, testpat, trainpat)

    # Saving information about the finished run to the boruta_runs file.
    # NOTE(review): inside the `else` of `if not continuation` this condition
    # can never be true - the indentation of this excerpt may be off.
    if not continuation:
        if patruns is None:
            patruns_string = '-'
예제 #7
0
    toremove = lower_threshold(lower, linkage, pat)
    print('Number of patients to remove (below the lower threshold %.4f): %d' %
          (lower, len(toremove)))
else:
    print('No lower threshold given')
    toremove = set()

# Apply the optional upper threshold; without it every patient
# (every row of `sims`) stays selected.
if 'upper' in globals():
    selected = upper_threshold(upper, linkage)
    print(
        'Number of patients out of the biggest cluster (the upper threshold %.4f): %d'
        % (upper, pat - len(selected)))
else:
    print('No upper threshold given')
    selected = [i for i in range(sims.shape[0])]

# Drop the patients rejected by the lower threshold, keeping the order of `selected`.
final = [el for el in selected if el not in toremove]

print('Number of selected patients: %d (%d removed)' %
      (len(final), pat - len(final)))
diagnoses_dist(dir, final)

run = funcs.establish_run('similar', fixed, outdir, run)
# One patient index per line; the context manager closes the file even if
# the write fails.
with open('%ssimilar_patients_%d.txt' % (outdir, run), 'w') as file:
    file.write('\n'.join([str(p) for p in final]))

# NOTE(review): `upper` (and apparently `lower`) are optional - see the
# `in globals()` checks - yet both are formatted unconditionally here, which
# raises NameError when a threshold was not given. The intended placeholder
# value for the runs file needs to be decided by the author.
funcs.runs_file_add(
    'similar', outdir, run, '%d\t%s\t%.4f\t%.4f\t%d\t%d\n' %
    (run, name, lower, upper, len(final), pat))