예제 #1
0
    def test_read_transfac_alphabet_superset(self):
        # A supplied alphabet may be a superset of the de facto alphabet
        # found in the file; the parser reverts to the de facto alphabet.
        # Both calls simply must parse without raising.
        for alphabet in ("TCGA", "TCGAXYZ"):
            with data_stream("transfac_matrix.txt") as f:
                Motif.read_transfac(f, alphabet=alphabet)
예제 #2
0
    def test_reverse(self):
        """reverse() must flip the motif row-wise: row k maps to row K-k-1."""
        # Context managers close the streams even when an assertion fails
        # (the original closed them only on the success path).
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        with data_stream("transfac_matrix.txt") as f2:
            m2 = Motif.read_transfac(f2)
        m2.reverse()

        (K, N) = np.shape(m2)
        for k in range(K):
            for n in range(N):
                assert m[k, n] == m2[K - k - 1, n]
예제 #3
0
    def test_reverse_complement(self):
        """complement() followed by reverse() must equal reverse_complement()."""
        # Context managers close the streams even when an assertion fails
        # (the original closed them only on the success path).
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        with data_stream("transfac_matrix.txt") as f2:
            m2 = Motif.read_transfac(f2)

        m.complement()
        m.reverse()

        m2.reverse_complement()

        assert (m.array == m2.array).all()
예제 #4
0
    def test_complement(self):
        """complement() must swap the A<->T and G<->C columns at every position."""
        # Context managers close the streams even when an assertion fails
        # (the original closed them only on the success path).
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        with data_stream("transfac_matrix.txt") as f2:
            m2 = Motif.read_transfac(f2)
        m2.complement()

        (K, N) = np.shape(m2)
        for k in range(K):
            # Each base in the original equals its complement in m2.
            for base, comp in (('A', 'T'), ('G', 'C'), ('C', 'G'), ('T', 'A')):
                assert m[k, base] == m2[k, comp]
예제 #5
0
    def test_complement(self):
        """complement() must swap the A<->T and G<->C columns at every position."""
        # Context managers close the streams even when an assertion fails
        # (the original closed them only on the success path).
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        with data_stream("transfac_matrix.txt") as f2:
            m2 = Motif.read_transfac(f2)
        m2.complement()

        (K, N) = np.shape(m2)
        complement_pairs = (("A", "T"), ("G", "C"), ("C", "G"), ("T", "A"))
        for k in range(K):
            for base, comp in complement_pairs:
                assert m[k, base] == m2[k, comp]
예제 #6
0
    def test_reindex(self):
        """reindex() to a permuted alphabet must keep letter-addressed values."""
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        m2 = m.reindex("TCGA")

        assert str(m2.alphabet) == "TCGA"

        # Derive the motif length from the data instead of hard-coding 12,
        # so the test keeps working if the fixture changes.
        (K, _) = np.shape(m)
        for k in range(K):
            for a in "AGCT":
                assert m[k, a] == m2[k, a]
예제 #7
0
    def test_read_transfac(self):
        """Spot-check values and shapes parsed from three transfac fixtures."""
        # 'with' replaces the manual close; the original also called
        # f.close() twice on the first stream.
        with data_stream("transfac_matrix.txt") as f:
            m = Motif.read_transfac(f)
        assert m[3, 'A'] == 0.0
        assert m[0, 'G'] == 2.0
        assert np.shape(m.array) == (12, 4)

        with data_stream("transfac_matrix2.txt") as f:
            m = Motif.read_transfac(f)
        assert m[3, 'A'] == 3.0
        assert m[0, 'G'] == 152.0
        assert np.shape(m.array) == (15, 4)

        # this one has extra Ps on start of each line
        with data_stream("transfac_matrix3.txt") as f:
            m = Motif.read_transfac(f)
예제 #8
0
def get_weblogos(args):
    """Build standard weblogos per convolutional filter.

    For every per-filter motif file in ``args.in_dir`` (fasta or transfac,
    selected by ``args.file_ext``) a sequence logo is rendered and written
    to ``args.out_dir`` as both .png and .eps.
    """
    # create output directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # GC content of the training set serves as the background prior.
    samples = np.load(args.train_data, mmap_mode='r')
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])

    # for each convolutional filter
    for file in os.listdir(args.in_dir):

        if bool(re.search("_motifs_filter_[0-9]+.*" + args.file_ext, file)) and \
                os.stat(args.in_dir + "/" + file).st_size > 0:
            c_filter = re.search("filter_[0-9]+", file).group()
            filter_index = c_filter.replace("filter_", "")
            print("Processing filter: " + filter_index)

            # 'with' closes the stream deterministically (it leaked before).
            with open(args.in_dir + "/" + file) as fin:

                # load motifs from fasta file
                if args.file_ext == ".fasta":
                    seqs = read_seq_data(fin)
                    prior = parse_prior(str(gc_content), seqs.alphabet)
                    try:
                        data = LogoData.from_seqs(seqs, prior)
                    except (ValueError, RuntimeError) as err:
                        print(err)
                        continue

                # load count matrix from transfac file
                elif args.file_ext == ".transfac":
                    motif = Motif.read_transfac(fin)
                    prior = parse_prior(str(gc_content), motif.alphabet)
                    try:
                        data = LogoData.from_counts(motif.alphabet, motif, prior)
                    except (ValueError, RuntimeError) as err:
                        print(err)
                        continue

                else:
                    # Unknown extension: 'data' would be unbound below,
                    # raising NameError in the original code.
                    continue

            # set logo options
            options = LogoOptions()
            options.logo_title = "filter " + filter_index
            options.color_scheme = classic
            options.stack_width = std_sizes["large"]
            options.resolution = 300

            # save filter logo
            l_format = LogoFormat(data, options)
            png = png_formatter(data, l_format)
            with open(
                    args.out_dir + "/weblogo_" +
                    file.replace(args.file_ext, ".png"), 'wb') as out_file:
                out_file.write(png)
            eps = eps_formatter(data, l_format)
            with open(
                    args.out_dir + "/weblogo_" +
                    file.replace(args.file_ext, ".eps"), 'wb') as out_file:
                out_file.write(eps)
예제 #9
0
def get_weblogos_ext(args):
    """Build extended weblogos per convolutional filter with nucleotide coloring.

    Each logo letter is colored by the mean nucleotide contribution score
    for its position (blue = negative, red = positive), read from the
    per-filter score csv files in ``args.scores_dir``. Logos are written
    to ``args.out_dir`` as .png and .eps.
    """
    # Scores are mapped onto the colormap over [-1/gain, 1/gain].
    s_max = 1 / args.gain
    s_min = -s_max

    samples = np.load(args.train_data, mmap_mode='r')
    gc_content = np.sum(np.mean(np.mean(samples, axis=1), axis=0)[1:3])
    at_content = 1 - gc_content
    # Per-base pseudocounts proportional to background composition (ACGT order).
    base_pseudocounts = np.array(
        [at_content, gc_content, gc_content, at_content]) / 2.0

    # create output directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    letter_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # nucleotide color scheme: blue - grey - red
    colormap = plt.cm.coolwarm

    # for each convolutional filter
    for file_fasta in os.listdir(args.fasta_dir):
        if bool(re.search("_motifs_filter_[0-9]+.*" + ".fasta", file_fasta)) and \
                os.stat(args.fasta_dir + "/" + file_fasta).st_size > 0:
            c_filter = re.search("filter_[0-9]+", file_fasta).group()
            filter_index = c_filter.replace("filter_", "")
            print("Processing filter: " + filter_index)
        else:
            continue

        file_transfac = []
        # raw string avoids the invalid '\.' escape in a normal string literal
        file_scores = [
            filename for filename in os.listdir(args.scores_dir) if bool(
                re.search(
                    "rel_filter_" + str(filter_index) +
                    r"_nucleotides\.csv", filename))
        ]
        assert len(
            file_scores) < 2, "Multiple score files for filter {}".format(
                filter_index)

        # load transfac files
        if args.logo_dir:
            file_transfac = [
                filename for filename in os.listdir(args.logo_dir) if bool(
                    re.search(
                        "filter_" + str(filter_index) +
                        "_seq_weighting.transfac", filename))
            ]
            if len(file_transfac) == 0:
                continue
            assert len(file_transfac
                       ) < 2, "Multiple transfac files for filter {}".format(
                           filter_index)

        # load nucleotide contribution scores (every odd csv row holds scores)
        contribution_scores = []
        with open(args.scores_dir + "/" + file_scores[0], 'r') as csvfile:
            reader = csv.reader(csvfile)
            for ind, row in enumerate(reader):
                if ind % 2 == 1:
                    scores = np.array(row, dtype=np.float32)
                    contribution_scores.append(scores)

        # load motifs from fasta file; 'with' closes the handle (it leaked before)
        try:
            with open(args.fasta_dir + "/" + file_fasta) as fin:
                seqs = read_seq_data(fin)
        except (IOError, ValueError):
            print("No data, skipping.")
            continue

        # load weighted count matrix from transfac file
        if args.logo_dir:
            with open(args.logo_dir + "/" + file_transfac[0]) as fin:
                motif = Motif.read_transfac(fin)
            prior = parse_prior(str(gc_content), motif.alphabet)
            data = LogoData.from_counts(motif.alphabet, motif, prior)
            out_png_name = args.out_dir + "/weblogo_extended_" + file_transfac[
                0].replace(".transfac", ".png")
            out_eps_name = args.out_dir + "/weblogo_extended_" + file_transfac[
                0].replace(".transfac", ".eps")
        else:
            prior = parse_prior(str(gc_content), seqs.alphabet)
            data = LogoData.from_seqs(seqs, prior)
            out_png_name = args.out_dir + "/weblogo_extended_" + file_fasta.replace(
                ".fasta", ".png")
            out_eps_name = args.out_dir + "/weblogo_extended_" + file_fasta.replace(
                ".fasta", ".eps")

        # deduplicate sequences by name, keeping first occurrence in order
        # (set.add returns None, so 'not seen.add(...)' is always True)
        seq_names = [seq.name for seq in seqs]
        seen = set()
        seqs_unique = [
            seqs[idx] for idx, seq_name in enumerate(seq_names)
            if seq_name not in seen and not seen.add(seq_name)
        ]

        assert len(contribution_scores) == len(
            seqs_unique
        ), "Numbers of contribution scores and sequences differ."

        # compute mean contribution score per nucleotide and logo position
        mean_scores = np.zeros((len(seqs_unique[0]), len(seqs.alphabet)))
        counts = np.zeros_like(data.counts.array)
        for r_id, read in enumerate(seqs_unique):
            for pos, base in enumerate(read):
                base = str(base)
                if base in letter_dict.keys():
                    mean_scores[
                        pos,
                        letter_dict[base]] += contribution_scores[r_id][pos]
                    counts[pos, letter_dict[base]] += 1

        # add pseudocount to avoid division by 0
        motif_len = len(seqs_unique[0])
        pseudocounts = np.reshape(
            np.concatenate([base_pseudocounts] * motif_len, axis=0),
            [motif_len, 4])
        mean_scores /= (counts + pseudocounts)

        # normalize scores to [0, 255] and assign color according the selected color scheme
        norm_scores = ((mean_scores - s_min) / (s_max - s_min)) * 255
        color_rules = []
        for base in letter_dict:
            for pos in range(len(seqs[0])):
                custom_color = matplotlib.colors.rgb2hex(
                    colormap(int(norm_scores[pos, letter_dict[base]])))
                color_rules.append(SymbolIndexColor(base, [pos], custom_color))

        # set logo options
        options = LogoOptions()
        options.logo_title = "filter " + str(filter_index)
        options.color_scheme = ColorScheme(color_rules)
        options.stack_width = std_sizes["large"]
        options.resolution = 300

        # save filter logo
        l_format = LogoFormat(data, options)
        png = png_formatter(data, l_format)
        with open(out_png_name, 'wb') as out_file:
            out_file.write(png)
        eps = eps_formatter(data, l_format)
        with open(out_eps_name, 'wb') as out_file:
            out_file.write(eps)