def createJaccardDistribution(file_name1, file_name2, gene_database_name):
    """Bootstrap a distribution of Jaccard indices for two peak files.

    Both files are looked up under
    ``MEDIA_ROOT/peak_file_db/<gene_database_name>/``.  Each file is shuffled
    10000 times against the mm9 chromosome sizes and every shuffled pair is
    handed to ``bootstrapRandom`` in a pool of 4 worker processes.

    Returns the list produced by ``pool.map(bootstrapRandom, ...)``, one entry
    per shuffled pair.
    """
    path = os.path.join(settings.MEDIA_ROOT, 'peak_file_db',
                        str(gene_database_name))
    file1 = bt(os.path.join(path, file_name1))
    file2 = bt(os.path.join(path, file_name2))

    # The chromosome-size file never changes between replicates, so build
    # its path once instead of on every iteration.
    chrom_sizes = os.path.join(settings.MEDIA_ROOT,
                               'genome_chrom_sizes') + '/mm9.chrom.sizes'

    # Create shuffled BED files of file1 and file2
    randomised_files = [
        (file1.shuffle(g=chrom_sizes, chrom=True),
         file2.shuffle(g=chrom_sizes, chrom=True))
        for _ in range(10000)
    ]

    # set multiprocessing
    pool = Pool(processes=4)
    try:
        # returns list of jaccard indices
        bootstrapped_jaccard = pool.map(bootstrapRandom, randomised_files)
    finally:
        # close() must come before join(); the reverse order raises because
        # the pool is still accepting work.
        pool.close()
        pool.join()
    return bootstrapped_jaccard
def createJaccardDistribution(file_name1, file_name2, genome_chrom_size,
                              bootstrap_num=10000, process_num=4):
    """Return the sorted bootstrap results for two BED files.

    The two ``bt`` objects and the chromosome-size value are published as the
    module globals ``file1``, ``file2`` and ``genome_chrom_sizes``;
    ``bootstrapRandom`` is then mapped over ``range(bootstrap_num)`` in a pool
    of ``process_num`` worker processes.  The collected values are returned
    through ``numpy.sort``.
    """
    global file1, file2, genome_chrom_sizes

    print(file_name1, file_name2)

    # Publish the inputs as module globals (presumably read by
    # bootstrapRandom in the workers -- it only receives the replicate number).
    file1 = bt(file_name1)
    file2 = bt(file_name2)
    genome_chrom_sizes = genome_chrom_size

    workers = Pool(processes=process_num)
    replicate_ids = range(bootstrap_num)
    jaccard_values = workers.map(bootstrapRandom, replicate_ids)
    workers.close()
    workers.join()

    return numpy.sort(jaccard_values)
def calculate_jaccard_index(file_paths):
    """Compute the Jaccard index for a pair of BED file paths.

    ``file_paths`` must hold exactly two paths; they are processed in sorted
    order.  Returns a 5-tuple ``(first_basename, second_basename, proxdis,
    cutoff, jaccard)`` where ``proxdis`` and ``cutoff`` are the last two
    ``_``-separated fields of the first path and ``jaccard`` is the index as
    a string.
    """
    first_path, second_path = sorted(file_paths)

    first_bed = bt(first_path)
    second_bed = bt(second_path)
    index = first_bed.jaccard(second_bed)['jaccard']

    # Jaccard index values range between 0 and 1 (inclusive), so NaN is
    # represented by the out-of-range sentinel -1.
    if math.isnan(index):
        index = -1

    name_fields = first_path.split("_")
    proxdis = name_fields[-2]
    cutoff = name_fields[-1]

    first_name = first_path.split("/")[-1]
    second_name = second_path.split("/")[-1]
    return first_name, second_name, proxdis, cutoff, str(index)
def parse_annotations(f, genome):
    """
    Parse the annotations and write them into the db.

    ``f`` is passed to ``bt`` and must expose a ``name`` attribute (used for
    logging).  Features are read in chunks of 1024; each chunk is inserted
    into ``seqdb.features`` and the distinct 'Strain' attribute values seen in
    the chunk are added to the ``strains`` set of the ``seqdb.genomestrains``
    document matching ``genome``.

    Exits the process with status 1 if the file contains a malformed line.
    """
    feat = bt(f)
    logging.info('processing: %s', f.name)
    chunk_size = 1024

    con = Connection()
    seqdb = con['seqdb']
    features = seqdb.features
    strains = seqdb.genomestrains

    small_feature_chunks = ichunked(feat, chunk_size)
    try:
        for chunk in small_feature_chunks:
            feature_list = []
            chunk_strains = set()
            for a in chunk:
                feature = {
                    'seqid': a[0],
                    'source': a[1],
                    'type': a[2],
                    'start': int(a[3]),
                    'end': int(a[4]),
                    'score': a[5],
                    'strand': a[6],
                    'phase': a[7],
                    'attributes': a.attrs
                }
                feature['attributes']['genome'] = genome
                feature_list.append(feature)
                # Not every feature carries a 'Strain' attribute; only a
                # missing key is expected here, so nothing else is swallowed.
                try:
                    chunk_strains.add(feature['attributes']['Strain'])
                except KeyError:
                    pass
            strains.update(
                {'genome': genome},
                {'$addToSet': {'strains': {'$each': list(chunk_strains)}}}
            )
            features.insert(feature_list, safe=True)
    except pybedtools.cbedtools.MalformedBedLineError:
        logging.error("Gff file is not formated corectly, please validate it!")
        # Report failure to the calling shell; status 0 would signal success.
        sys.exit(1)
    logging.info('finished processing file')
    return
def snps_by_location(args):
    """
    Group and annotate SNPs by their location.

    Best is to use snpEff or other tools, but for quick course results
    this function can be used.

    ``args.annot`` and ``args.snps`` are iterables of line iterables (they
    are chained together); ``args.snps[0].name`` determines the output
    directory handed to ``annotate_snps``.  Nothing is returned; the elapsed
    time is logged.
    """
    t0 = time.time()

    # save annotations and snps as single temp bedtools files
    annotations = bt(chain(*args.annot)).saveas().fn
    snps = bt(chain(*args.snps)).saveas().fn

    # group features by type, one worker process per type
    featuretypes = ['gene', 'CDS', 'exon', 'five_prime_UTR',
                    'three_prime_UTR']
    results = _map_in_pool(
        subset_features,
        [(ftype, annotations) for ftype in featuretypes])
    features_by_type = {ftype: feature for (ftype, feature) in results}

    # clean up features_by_type by removing empty bts from dict
    # (iterate over a copy of the keys: the dict is mutated in the loop)
    for k in list(features_by_type):
        if not bt(features_by_type[k]).count():
            del features_by_type[k]

    # create intron intervals if there are gene annotations
    if 'gene' in features_by_type:
        genes = features_by_type['gene']
        if 'exon' in features_by_type:
            # if there are exons use them to determine introns
            exons = features_by_type['exon']
            introns = bt(genes).subtract(bt(exons))
            introns = introns.sort().merge().remove_invalid()
            features_by_type['intron'] = introns.saveas().fn
            # after getting introns, exons are not needed any more
            del features_by_type['exon']
        else:
            # otherwise use 'CDS' and UTR information
            needed_set = set(['CDS', 'three_prime_UTR', 'five_prime_UTR'])
            if needed_set.issubset(features_by_type):
                cdss = features_by_type['CDS']
                utr3s = features_by_type['three_prime_UTR']
                utr5s = features_by_type['five_prime_UTR']
                introns = bt(genes).subtract(bt(cdss)).subtract(bt(utr3s))
                introns = introns.subtract(bt(utr5s)).sort().merge()
                introns = introns.remove_invalid()
                features_by_type['intron'] = introns.saveas().fn

    # group snps by feature overlap
    results = _map_in_pool(
        subset_snps,
        [(ftype, features_by_type, snps) for ftype in features_by_type])
    # (named snps_by_loc so the local does not shadow this function)
    snps_by_loc = {ftype: feature for (ftype, feature) in results}

    # remove 'gene' snps, they're not needed anymore (may be absent)
    snps_by_loc.pop('gene', None)

    # remove duplicate SNPs: intronic SNPs must not also be CDS/UTR SNPs ...
    if 'intron' in snps_by_loc:
        intronic = bt(snps_by_loc['intron'])
        for loc in ('CDS', 'five_prime_UTR', 'three_prime_UTR'):
            if loc in snps_by_loc:
                intronic = intronic.intersect(bt(snps_by_loc[loc]), v=True)
        snps_by_loc['intron'] = intronic.saveas().fn

    # ... and UTR SNPs must not also be CDS SNPs
    if 'CDS' in snps_by_loc:
        cdss = bt(snps_by_loc['CDS'])
        for loc in ('five_prime_UTR', 'three_prime_UTR'):
            if loc in snps_by_loc:
                utrs = bt(snps_by_loc[loc]).intersect(cdss, v=True)
                snps_by_loc[loc] = utrs.saveas().fn

    # annotate snps
    data_dir = os.path.dirname(args.snps[0].name)
    _map_in_pool(
        annotate_snps,
        [(item, data_dir) for item in snps_by_loc.items()])

    t = time.time() - t0
    logging.info("time elapsed: %d", t)


def _map_in_pool(func, jobs):
    """Run ``func`` over ``jobs`` with one worker process per job.

    Returns the list of results, or an empty list when there are no jobs
    (``multiprocessing.Pool(0)`` would raise).  The pool is always closed and
    joined so worker processes do not outlive the call.
    """
    jobs = list(jobs)
    if not jobs:
        return []
    pool = multiprocessing.Pool(len(jobs))
    try:
        return pool.map(func, jobs)
    finally:
        pool.close()
        pool.join()
def subset_snps(job):
    """Select the SNPs that overlap features of one type.

    ``job`` is a ``(featuretype, features, snps)`` tuple: ``features`` maps
    feature types to values accepted by ``bt`` and ``snps`` is the SNP input
    for ``bt``.  A single tuple argument is taken so the function can be used
    directly with ``Pool.map``; it is unpacked in the body because
    tuple parameters in the signature are Python-2-only syntax.

    Returns ``(featuretype, path)`` where ``path`` is the file name of the
    saved intersection.
    """
    featuretype, features, snps = job
    s = bt(snps)
    # u=True: report each SNP once if it overlaps any feature of this type
    snps_in_location = s.intersect(bt(features[featuretype]),
                                   u=True).saveas().fn
    logging.info("subsetting snps in " + featuretype)
    return (featuretype, snps_in_location)
def subset_features(job):
    """Extract the annotations of a single feature type.

    ``job`` is a ``(featuretype, annotations)`` tuple, where ``annotations``
    is the input for ``bt``.  A single tuple argument is taken so the function
    can be used directly with ``Pool.map``; it is unpacked in the body because
    tuple parameters in the signature are Python-2-only syntax.

    Returns ``(featuretype, path)`` where ``path`` is the file name of the
    saved, filtered annotations.
    """
    featuretype, annotations = job
    a = bt(annotations)
    features_of_type = a.filter(featuretype_filter, featuretype).saveas().fn
    logging.info("subsetting features in " + featuretype)
    return (featuretype, features_of_type)
features_of_type = a.filter(featuretype_filter, featuretype).saveas().fn logging.info("subsetting features in " + featuretype) return ((featuretype, features_of_type)) def subset_snps((featuretype, features, snps)): s = bt(snps) snps_in_location = s.intersect(bt(features[featuretype]), u=True).saveas().fn logging.info("subsetting snps in " + featuretype) return((featuretype, snps_in_location)) def annotate_snps(((loc, snps), data_dir)): annotated_snps = bt(snps).each(annotate_location, loc) fname = os.path.join(data_dir, loc + "_snps.gff") annotated_snps.saveas(fname) def snps_by_location(args): """ Group and annotate SNPs by their location. Best is to use snpEff or other tools, but for quick course results this function can be used. """ t0 = time.time() # save annotations and snps as single temp bedtools files annotations = bt(chain(*args.annot)).saveas().fn
def tfClassifyResult(request):
    """Render the TF classification result page.

    Reads the analysis parameters from ``request.session``.  When a valid
    ``VariableInputForm`` is submitted, the new parameters are stored in the
    session and the client is redirected to ``/result``.  Otherwise:

    1. every user peak file is split into ``_proximal_<cutoff>`` and
       ``_distal_<cutoff>`` files,
    2. pairwise Jaccard fold-change, p-value and distance matrices are built
       for the proximal and the distal files,
    3. the matrices are reordered according to the dendrogram(s) selected by
       the session's ``heatmap`` setting, and
    4. ``tfClassify.html`` is rendered with the JSON-encoded result.
    """
    # cutoff value from post
    cutoff = long(request.session['cut_off'])
    pvalue = float(request.session['pvalue'])
    peak_database_names = request.session['peak_database_names']
    gene_database_name = request.session['gene_database_name']
    peakfile_names = request.session['peakfile_names']
    heatmap = request.session['heatmap']

    form = VariableInputForm(
        request.POST or None, request.FILES or None,
        initial={
            'cut_off': cutoff,
            'selected_peaks': '\n'.join(peak_database_names),
            'uploaded_peak_File': peakfile_names,
            'pvalue': pvalue,
        })
    # leave previous choices at field
    form.fields['uploaded_peak_File'].choices = tuple(
        (x, x) for x in peakfile_names)

    if form.is_valid():
        # new user input form is submitted
        cleaned_data = form.cleaned_data
        cutoff = cleaned_data.get('cut_off')
        pvalue = cleaned_data.get('pvalue')
        new_peak_File = request.FILES.getlist('new_peak_File')
        peakfile_names = request.POST.getlist('uploaded_peak_File')
        createPeakToBedFile(new_peak_File, str(request.session.session_key),
                            gene_database_name)
        for name in new_peak_File:
            peakfile_names.append(name.name)
        request.session['cut_off'] = cutoff
        request.session['pvalue'] = pvalue
        peak_name_strings = cleaned_data.get('selected_peaks')
        peak_database_names = peak_name_strings.rstrip().split()
        request.session['peak_database_names'] = peak_database_names
        request.session['gene_database_name'] = gene_database_name
        request.session['peakfile_names'] = peakfile_names
        request.session['heatmap'] = cleaned_data.get('heatmap')
        # redirect to this page with different parameters
        return redirect('/result')

    user_path = os.path.join(settings.MEDIA_ROOT, 'users_peak_files',
                             str(request.session.session_key),
                             str(gene_database_name))

    # THIS IS WHERE USER INPUT PEAK FILE IS SEPARATED TO PROXIMAL AND DISTAL
    # (joining with '' yields the directory with a trailing separator)
    upload_prefix = os.path.join(user_path, '')
    for name in peakfile_names:
        _split_peak_file_by_cutoff(upload_prefix + name, cutoff)

    user_uploaded_filename = peakfile_names
    fname = sorted(peakfile_names + peak_database_names)
    matrix_size = len(fname)

    start_time = time.time()

    (proximal_matrix, proximal_pval_matrix, proximal_dist_matrix,
     proximal_fc_limit) = _build_jaccard_matrices(
         "proximal", fname, user_uploaded_filename, user_path, cutoff)
    (distal_matrix, distal_pval_matrix, distal_dist_matrix,
     distal_fc_limit) = _build_jaccard_matrices(
         "distal", fname, user_uploaded_filename, user_path, cutoff)

    # heatmap plots styles: pick the row/column order of each heatmap
    proximal_dendrogram = {}
    distal_dendrogram = {}
    if heatmap == 'Independent':
        proximal_dendrogram = _cluster_dendrogram(proximal_dist_matrix, fname)
        distal_dendrogram = _cluster_dendrogram(distal_dist_matrix, fname)
        p_new_ls = proximal_dendrogram['ivl']
        d_new_ls = distal_dendrogram['ivl']
    elif heatmap == 'Follow proximal':
        proximal_dendrogram = _cluster_dendrogram(proximal_dist_matrix, fname)
        distal_dendrogram = "None"
        p_new_ls = d_new_ls = proximal_dendrogram['ivl']
    elif heatmap == 'Follow distal':
        distal_dendrogram = _cluster_dendrogram(distal_dist_matrix, fname)
        proximal_dendrogram = "None"
        p_new_ls = d_new_ls = distal_dendrogram['ivl']
    else:
        # unknown style: no ordering, the ordered matrices stay all-zero
        p_new_ls = []
        d_new_ls = []

    # reorder the matrices by the new order of the dendrogram
    ordered_proximal_matrix, ordered_proximal_pval_matrix = \
        _reorder_by_labels(p_new_ls, fname, proximal_matrix,
                           proximal_pval_matrix, pvalue)
    ordered_distal_matrix, ordered_distal_pval_matrix = \
        _reorder_by_labels(d_new_ls, fname, distal_matrix,
                           distal_pval_matrix, pvalue)

    proc_time = time.time() - start_time

    json_data = json.dumps({
        'p_filename': p_new_ls,
        'd_filename': d_new_ls,
        'matrix_size': matrix_size,
        'proximal_matrix': ordered_proximal_matrix,
        'proximal_pval_matrix': ordered_proximal_pval_matrix,
        'proximal_dendrogram': proximal_dendrogram,
        'distal_matrix': ordered_distal_matrix,
        'distal_pval_matrix': ordered_distal_pval_matrix,
        'distal_dendrogram': distal_dendrogram,
        'proxdist_fc_limit': [proximal_fc_limit, distal_fc_limit],
        'proc_time': proc_time
    })

    table = Peaks_db.objects.all().values('protein', 'fileID', 'num_peaks',
                                          'cells', 'labs', 'year')
    context = {
        'form': form,
        'table': table,
        'peakfile_names': peakfile_names,
        'json_data': json_data,
        'proximal_dendrogram': proximal_dendrogram,
        'distal_dendrogram': distal_dendrogram
    }
    return render(request, 'tfClassify.html', context)


def _split_peak_file_by_cutoff(base_path, cutoff):
    """Split one peak file into proximal and distal files.

    Each line of ``base_path`` whose trailing number is <= ``cutoff`` goes to
    ``<base_path>_proximal_<cutoff>``; every other line goes to
    ``<base_path>_distal_<cutoff>``.
    """
    limit = int(cutoff)
    with open(base_path, 'rb') as orig_file, \
            open(base_path + '_proximal_' + str(cutoff), 'w') as prox_file, \
            open(base_path + '_distal_' + str(cutoff), 'w') as dist_file:
        for line in orig_file:
            # NOTE(review): a line that does not end in
            # "<digits><whitespace>\n" (e.g. a final line without a newline)
            # makes gene_dist None and raises AttributeError below -- confirm
            # whether such lines should be skipped or rejected.
            gene_dist = re.search(r'\d+\s\n', line)
            if int(gene_dist.group(0)) <= limit:
                prox_file.write(line)
            else:
                dist_file.write(line)


def _locate_peak_file(file_id, suffix, user_uploaded, user_path):
    """Return ``(full_filename, path, is_user_uploaded)`` for one file id."""
    if file_id in user_uploaded:
        full_filename = file_id + suffix
        return full_filename, os.path.join(user_path, full_filename), True
    record = Peaks_db.objects.get(fileID=file_id)
    full_filename = record.origFile + suffix
    rows = Peaks_db_file.objects.filter(filename=full_filename).values('path')
    return full_filename, os.path.join(rows[0]['path'], full_filename), False


def _build_jaccard_matrices(region, fname, user_uploaded, user_path, cutoff):
    """Build the symmetric pairwise matrices for ``region``.

    ``region`` is ``"proximal"`` or ``"distal"``.  Returns
    ``(fc_matrix, pval_matrix, dist_matrix, fc_limit)`` where ``fc_limit`` is
    ``[min, max]`` of the off-diagonal fold changes (``[inf, -inf]`` when
    there is no valid pair).
    """
    size = len(fname)
    fc_matrix = [[0] * size for _ in range(size)]
    pval_matrix = [[0] * size for _ in range(size)]
    dist_matrix = [[0] * size for _ in range(size)]
    fc_limit = [float("inf"), float("-inf")]

    # Resolve every file once instead of once per matrix cell.
    suffix = '_' + region + '_' + str(cutoff)
    files = [_locate_peak_file(file_id, suffix, user_uploaded, user_path)
             for file_id in fname]

    for i in range(size):
        name_i, path_i, user_i = files[i]
        if not user_i:
            jaccard_indices = getPrecomputedJaccardValuePerFile(name_i)

        # Only the lower triangle is computed; each value is mirrored.
        for j in range(i):
            name_j, path_j, user_j = files[j]
            if user_i or user_j:
                # at least one user file: no precomputed value exists,
                # so the jaccard is calculated here
                jaccard_index = bt(path_i).jaccard(bt(path_j))['jaccard']
            else:
                jaccard_index = jaccard_indices[name_j]

            if math.isnan(jaccard_index) or jaccard_index < 0:
                # undefined overlap: no enrichment, maximal distance
                jaccard_fc = 0
                jaccard_pval = 1
                jaccard_dist = 1
            else:
                jaccard_fc = calculateJaccardFC(jaccard_index, region)
                jaccard_pval = calculateJaccardPval(jaccard_index, region)
                jaccard_dist = 1 - jaccard_index
                # Track min and max independently; an if/elif chain would
                # never record a minimum for the first value seen.
                fc_limit[0] = min(fc_limit[0], jaccard_fc)
                fc_limit[1] = max(fc_limit[1], jaccard_fc)

            fc_matrix[i][j] = fc_matrix[j][i] = jaccard_fc
            pval_matrix[i][j] = pval_matrix[j][i] = jaccard_pval
            dist_matrix[i][j] = dist_matrix[j][i] = jaccard_dist

        # a file compared with itself
        fc_matrix[i][i] = calculateJaccardFC(1, region)
        pval_matrix[i][i] = 0
        dist_matrix[i][i] = 0

    return fc_matrix, pval_matrix, dist_matrix, fc_limit


def _cluster_dendrogram(dist_matrix, labels):
    """Return the single-linkage dendrogram of a square distance matrix."""
    dist_vector = ssd.squareform(dist_matrix)
    linkage_matrix = linkage(dist_vector, "single", 'euclidean')
    return dendrogram(linkage_matrix, labels=labels)


def _reorder_by_labels(order, fname, fc_matrix, pval_matrix, pvalue):
    """Reorder the fold-change and p-value matrices to follow ``order``.

    Cells whose p-value exceeds ``pvalue`` get ``-inf`` as fold change.
    Returns ``(ordered_fc, ordered_pval_labels)``; both are
    ``len(fname)``-square and stay zero-filled where ``order`` has no entry.
    """
    size = len(fname)
    ordered_fc = [[0] * size for _ in range(size)]
    ordered_pval = [[0] * size for _ in range(size)]
    # find the index of each ordered item in the original matrix
    source = [fname.index(label) for label in order]
    for i, index1 in enumerate(source):
        for j, index2 in enumerate(source):
            cell_pval = pval_matrix[index1][index2]
            if cell_pval <= pvalue:
                ordered_fc[i][j] = fc_matrix[index1][index2]
            else:
                ordered_fc[i][j] = float("-inf")
            ordered_pval[i][j] = 'p-value: {:1.5f}'.format(cell_pval)
    return ordered_fc, ordered_pval