def test_read_transfac_alphabet_superset(self):
    """Read the sample matrix with explicitly supplied alphabets.

    The test passes when neither call raises.
    """
    # 'TCGA' is a plain reordering of the nucleotides; 'TCGAXYZ' is a
    # superset. Supplied alphabet can be a superset of the de facto
    # alphabet, in which case the reader reverts to the de facto alphabet.
    for supplied_alphabet in ('TCGA', 'TCGAXYZ'):
        with data_stream("transfac_matrix.txt") as stream:
            Motif.read_transfac(stream, alphabet=supplied_alphabet)
def test_reverse(self):
    """Check that ``reverse()`` mirrors the position order of a motif.

    The same TRANSFAC matrix is read twice; one copy is reversed and every
    row of it is compared with the mirrored row of the untouched copy.
    """
    # Context managers close the streams even if parsing or an assertion
    # below fails.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)
    with data_stream("transfac_matrix.txt") as f2:
        m2 = Motif.read_transfac(f2)

    m2.reverse()

    (K, N) = np.shape(m2)
    for k in range(K):
        for n in range(N):
            # row k of the original must equal row K-1-k of the reversed copy
            assert m[k, n] == m2[K - k - 1, n]
def test_reverse_complement(self):
    """Check ``reverse_complement()`` against ``complement()`` + ``reverse()``.

    Two copies of the same matrix are read; applying the two separate
    operations to one copy must give the same array as the combined
    operation applied to the other.
    """
    # Context managers close the streams even if parsing or the assertion
    # below fails.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)
    with data_stream("transfac_matrix.txt") as f2:
        m2 = Motif.read_transfac(f2)

    m.complement()
    m.reverse()

    m2.reverse_complement()

    assert (m.array == m2.array).all()
def test_complement(self):
    """Check that ``complement()`` swaps A<->T and G<->C at every position."""
    # Context managers close the streams even if parsing or an assertion
    # below fails.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)
    with data_stream("transfac_matrix.txt") as f2:
        m2 = Motif.read_transfac(f2)

    m2.complement()

    # only the number of positions is needed; the alphabet size is unused
    K = np.shape(m2)[0]
    for k in range(K):
        assert (m[k, 'A'] == m2[k, 'T'])
        assert (m[k, 'G'] == m2[k, 'C'])
        assert (m[k, 'C'] == m2[k, 'G'])
        assert (m[k, 'T'] == m2[k, 'A'])
def test_complement(self):
    """Check that ``complement()`` exchanges each base with its partner.

    After complementing one copy of the matrix, the count of every base at
    every position must equal the count of the paired base (A/T, G/C) in
    the untouched copy.
    """
    # ``with`` guarantees both streams are closed, including when the parser
    # raises or one of the assertions fails.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)
    with data_stream("transfac_matrix.txt") as f2:
        m2 = Motif.read_transfac(f2)

    m2.complement()

    # number of motif positions (first dimension of the matrix)
    K = np.shape(m2)[0]
    for k in range(K):
        assert m[k, "A"] == m2[k, "T"]
        assert m[k, "G"] == m2[k, "C"]
        assert m[k, "C"] == m2[k, "G"]
        assert m[k, "T"] == m2[k, "A"]
def test_reindex(self):
    """Check that ``reindex()`` reorders the alphabet but keeps the counts.

    The reindexed motif must report the new alphabet ``"TCGA"`` while a
    lookup by letter returns the same value as in the original motif.
    """
    # ``with`` closes the stream even if parsing raises.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)

    m2 = m.reindex("TCGA")

    assert (str(m2.alphabet) == "TCGA")

    # the first 12 positions are compared letter by letter
    for k in range(0, 12):
        for a in "AGCT":
            assert m[k, a] == m2[k, a]
def test_read_transfac(self):
    """Read the sample TRANSFAC matrices and spot-check the parsed counts."""
    # Each stream lives in its own ``with`` block so that it is closed
    # exactly once, even when parsing fails.
    with data_stream("transfac_matrix.txt") as f:
        m = Motif.read_transfac(f)
    assert m[3, 'A'] == 0.0
    assert m[0, 'G'] == 2.0
    assert np.shape(m.array) == (12, 4)

    with data_stream("transfac_matrix2.txt") as f:
        m = Motif.read_transfac(f)
    assert m[3, 'A'] == 3.0
    assert m[0, 'G'] == 152.0
    assert np.shape(m.array) == (15, 4)

    # this one has extra Ps on start of each line; it only has to parse
    with data_stream("transfac_matrix3.txt") as f:
        m = Motif.read_transfac(f)
def get_weblogos(args):
    """Build standard weblogos per convolutional filter.

    Every non-empty file in ``args.in_dir`` whose name matches
    ``_motifs_filter_<N>...<args.file_ext>`` is turned into a sequence logo,
    which is written to ``args.out_dir`` as ``weblogo_<name>.png`` and
    ``weblogo_<name>.eps``. Files for which the logo data cannot be built
    (``ValueError`` / ``RuntimeError``) are reported and skipped.

    Args:
        args: object with the attributes ``out_dir`` (output directory,
            created if missing), ``train_data`` (path loaded with
            ``np.load``), ``in_dir`` (directory holding the motif files) and
            ``file_ext`` (``".fasta"`` or ``".transfac"``).

    Raises:
        ValueError: if a matching input file is found but ``args.file_ext``
            is neither ``".fasta"`` nor ``".transfac"``.
    """
    # create output directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Background GC content: mean over axes 1 and 0, then the sum of
    # channels 1 and 2.
    # NOTE(review): assumes the last axis is ordered A, C, G, T - confirm.
    samples = np.load(args.train_data, mmap_mode='r')
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])

    # The extension is escaped so that its "." only matches a literal dot.
    name_pattern = re.compile(
        "_motifs_filter_[0-9]+.*" + re.escape(args.file_ext))

    # for each convolutional filter
    for filename in os.listdir(args.in_dir):
        in_path = os.path.join(args.in_dir, filename)
        if not name_pattern.search(filename) or os.stat(in_path).st_size == 0:
            continue

        c_filter = re.search("filter_[0-9]+", filename).group()
        filter_index = c_filter.replace("filter_", "")
        print("Processing filter: " + filter_index)

        if args.file_ext == ".fasta":
            # load motifs from fasta file; the handle is closed right after
            # reading
            with open(in_path) as fin:
                seqs = read_seq_data(fin)
            prior = parse_prior(str(gc_content), seqs.alphabet)
            try:
                data = LogoData.from_seqs(seqs, prior)
            except (ValueError, RuntimeError) as err:
                print(err)
                continue
        elif args.file_ext == ".transfac":
            # load count matrix from transfac file
            with open(in_path) as fin:
                motif = Motif.read_transfac(fin)
            prior = parse_prior(str(gc_content), motif.alphabet)
            try:
                data = LogoData.from_counts(motif.alphabet, motif, prior)
            except (ValueError, RuntimeError) as err:
                print(err)
                continue
        else:
            # Without this branch ``data`` would be undefined below.
            raise ValueError(
                "Unsupported file extension: {!r} (expected '.fasta' or "
                "'.transfac')".format(args.file_ext))

        # set logo options
        options = LogoOptions()
        options.logo_title = "filter " + filter_index
        options.color_scheme = classic
        options.stack_width = std_sizes["large"]
        options.resolution = 300

        # save filter logo, first as PNG and then as EPS
        l_format = LogoFormat(data, options)
        for suffix, formatter in ((".png", png_formatter),
                                  (".eps", eps_formatter)):
            rendered = formatter(data, l_format)
            out_path = os.path.join(
                args.out_dir,
                "weblogo_" + filename.replace(args.file_ext, suffix))
            with open(out_path, 'wb') as out_file:
                out_file.write(rendered)
def get_weblogos_ext(args):
    """Build extended weblogos per convolutional filter with nucleotide coloring.

    For every non-empty ``*_motifs_filter_<N>*.fasta`` file in
    ``args.fasta_dir`` the matching per-nucleotide contribution scores are
    read from ``args.scores_dir``, averaged per logo position and base, and
    mapped onto the ``coolwarm`` colormap. The logo is written to
    ``args.out_dir`` as ``weblogo_extended_<name>.png`` and ``.eps``.

    Args:
        args: object with the attributes ``gain`` (scores are normalised
            from ``[-1/gain, 1/gain]`` to ``[0, 255]``), ``train_data``
            (path loaded with ``np.load``), ``out_dir``, ``fasta_dir``,
            ``scores_dir`` and ``logo_dir``. If ``logo_dir`` is set, the
            logo is built from the ``*_seq_weighting.transfac`` count matrix
            found there; otherwise it is built from the fasta sequences.

    Filters without a transfac file (when ``logo_dir`` is set), without a
    score file, or with unreadable fasta data are skipped.
    """
    s_max = 1 / args.gain
    s_min = -s_max

    # Background GC content: mean over axes 1 and 0, then the sum of
    # channels 1 and 2 (C and G in ``letter_dict`` below).
    samples = np.load(args.train_data, mmap_mode='r')
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])
    at_content = 1 - gc_content
    base_pseudocounts = np.array(
        [at_content, gc_content, gc_content, at_content]) / 2.0

    # create output directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    letter_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # nucleotide color scheme: blue - grey - red
    colormap = plt.cm.coolwarm

    # for each convolutional filter
    for file_fasta in os.listdir(args.fasta_dir):
        fasta_path = os.path.join(args.fasta_dir, file_fasta)
        if not (re.search(r"_motifs_filter_[0-9]+.*\.fasta", file_fasta)
                and os.stat(fasta_path).st_size > 0):
            continue

        c_filter = re.search("filter_[0-9]+", file_fasta).group()
        filter_index = c_filter.replace("filter_", "")
        print("Processing filter: " + filter_index)

        # Raw strings keep "\." a regex escape, not a string escape.
        scores_pattern = (
            r"rel_filter_" + str(filter_index) + r"_nucleotides\.csv")
        file_scores = [
            filename for filename in os.listdir(args.scores_dir)
            if re.search(scores_pattern, filename)
        ]
        assert len(file_scores) < 2, \
            "Multiple score files for filter {}".format(filter_index)

        # load transfac files
        file_transfac = []
        if args.logo_dir:
            transfac_pattern = (
                r"filter_" + str(filter_index) + r"_seq_weighting\.transfac")
            file_transfac = [
                filename for filename in os.listdir(args.logo_dir)
                if re.search(transfac_pattern, filename)
            ]
            if len(file_transfac) == 0:
                continue
            assert len(file_transfac) < 2, \
                "Multiple transfac files for filter {}".format(filter_index)

        # Skip (rather than fail with an IndexError) when there is no score
        # file, mirroring the handling of a missing transfac file.
        if not file_scores:
            print("No contribution scores for filter {}, skipping.".format(
                filter_index))
            continue

        # load nucleotide contribution scores
        contribution_scores = []
        scores_path = os.path.join(args.scores_dir, file_scores[0])
        with open(scores_path, 'r') as csvfile:
            for ind, row in enumerate(csv.reader(csvfile)):
                # only every second row (odd index) is parsed as scores
                if ind % 2 == 1:
                    contribution_scores.append(
                        np.array(row, dtype=np.float32))

        # load motifs from fasta file
        try:
            with open(fasta_path) as fin:
                seqs = read_seq_data(fin)
        except (IOError, ValueError):
            print("No data, skipping.")
            continue

        if args.logo_dir:
            # load weighted count matrix from transfac file
            with open(os.path.join(args.logo_dir, file_transfac[0])) as fin:
                motif = Motif.read_transfac(fin)
            prior = parse_prior(str(gc_content), motif.alphabet)
            data = LogoData.from_counts(motif.alphabet, motif, prior)
            out_png_name = os.path.join(
                args.out_dir,
                "weblogo_extended_"
                + file_transfac[0].replace(".transfac", ".png"))
            out_eps_name = os.path.join(
                args.out_dir,
                "weblogo_extended_"
                + file_transfac[0].replace(".transfac", ".eps"))
        else:
            prior = parse_prior(str(gc_content), seqs.alphabet)
            data = LogoData.from_seqs(seqs, prior)
            out_png_name = os.path.join(
                args.out_dir,
                "weblogo_extended_" + file_fasta.replace(".fasta", ".png"))
            out_eps_name = os.path.join(
                args.out_dir,
                "weblogo_extended_" + file_fasta.replace(".fasta", ".eps"))

        # keep only the first sequence seen for every sequence name
        seen = set()
        seqs_unique = []
        for seq in seqs:
            if seq.name not in seen:
                seen.add(seq.name)
                seqs_unique.append(seq)
        assert len(contribution_scores) == len(seqs_unique), \
            "Numbers of contribution scores and sequences differ."

        # compute mean contribution score per nucleotide and logo position
        motif_len = len(seqs_unique[0])
        mean_scores = np.zeros((motif_len, len(seqs.alphabet)))
        counts = np.zeros_like(data.counts.array)
        for r_id, read in enumerate(seqs_unique):
            for pos, base in enumerate(read):
                column = letter_dict.get(str(base))
                # bases outside A/C/G/T are ignored
                if column is not None:
                    mean_scores[pos, column] += contribution_scores[r_id][pos]
                    counts[pos, column] += 1

        # add pseudocount to avoid division by 0
        pseudocounts = np.tile(base_pseudocounts, (motif_len, 1))
        mean_scores /= (counts + pseudocounts)

        # normalize scores to [0, 255] and assign color according to the
        # selected color scheme
        norm_scores = ((mean_scores - s_min) / (s_max - s_min)) * 255
        color_rules = []
        for base, column in letter_dict.items():
            for pos in range(motif_len):
                custom_color = matplotlib.colors.rgb2hex(
                    colormap(int(norm_scores[pos, column])))
                color_rules.append(SymbolIndexColor(base, [pos], custom_color))

        # set logo options
        options = LogoOptions()
        options.logo_title = "filter " + str(filter_index)
        options.color_scheme = ColorScheme(color_rules)
        options.stack_width = std_sizes["large"]
        options.resolution = 300

        # save filter logo
        l_format = LogoFormat(data, options)
        png = png_formatter(data, l_format)
        with open(out_png_name, 'wb') as out_file:
            out_file.write(png)
        eps = eps_formatter(data, l_format)
        with open(out_eps_name, 'wb') as out_file:
            out_file.write(eps)