def generate_data(contact_threshold, sequence_separation, pdb_dir, psicov_dir):
    """Count residue pairs below distance cutoffs for every protein.

    Every ``*psc`` file in ``psicov_dir`` that has a matching
    ``<name>.pdb`` in ``pdb_dir`` is processed; alignments without a PDB
    file are skipped silently.  The protein length ``L`` is the length of
    the first line of the alignment file (trailing whitespace removed).

    Args:
        contact_threshold: iterable of cutoffs; a pair (i, j) is counted
            when ``distance_map[i, j] < cutoff``.
        sequence_separation: iterable of minimum separations; only pairs
            with ``j - i > separation`` are counted.
        pdb_dir: directory holding the ``.pdb`` files.
        psicov_dir: directory holding the ``*psc`` alignment files.

    Returns:
        dict: ``result[cutoff][separation]`` is a dict with the parallel
        lists ``'L'`` and ``'number of contacts'`` (one entry per
        processed protein).
    """
    number_contacts = {
        contact_thr: {
            seqsep: {'L': [], 'number of contacts': []}
            for seqsep in sequence_separation
        }
        for contact_thr in contact_threshold
    }

    for alignment_file in glob.glob(psicov_dir + "/*psc"):
        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            continue

        print(protein)

        # Only the first line is needed; close the handle right away
        # instead of leaking one descriptor per alignment.
        with open(alignment_file) as handle:
            L = len(handle.readline().rstrip())

        distance_map = pdb.distance_map(pdb_file, L)

        for contact_thr in contact_threshold:
            residue_i, residue_j = np.where(distance_map < contact_thr)
            # j - i is positive only in the upper triangle, so each
            # unordered pair is counted at most once.
            separation = residue_j - residue_i

            for seqsep in sequence_separation:
                stats = number_contacts[contact_thr][seqsep]
                stats["L"].append(L)
                stats["number of contacts"].append(
                    int(np.count_nonzero(separation > seqsep)))

    return number_contacts
def generate_data(contact_threshold, sequence_separation, pdb_dir, psicov_dir):
    """Collect, per protein, the number of residue pairs under each cutoff.

    Alignments are taken from ``psicov_dir`` (glob ``*psc``); an alignment
    is used only if ``pdb_dir`` contains ``<name>.pdb``, where ``<name>``
    is the part of the file name before the first dot.  The protein
    length ``L`` is read as the length of the alignment's first line.

    Args:
        contact_threshold: iterable of cutoffs applied as
            ``distance_map < cutoff``.
        sequence_separation: iterable of separations applied as
            ``j - i > separation``.
        pdb_dir: directory with the PDB files.
        psicov_dir: directory with the alignment files.

    Returns:
        dict: nested as ``[cutoff][separation]`` with the keys ``'L'``
        and ``'number of contacts'``, each a list with one value per
        processed protein.
    """
    number_contacts = {}
    for contact_thr in contact_threshold:
        number_contacts[contact_thr] = {}
        for seqsep in sequence_separation:
            number_contacts[contact_thr][seqsep] = {
                'L': [],
                'number of contacts': []
            }

    alignment_files = glob.glob(psicov_dir + "/*psc")

    for alignment_file in alignment_files:
        name = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + name + ".pdb"

        if not os.path.exists(pdb_file):
            continue

        print(name)

        # Use a context manager so the file is closed after reading the
        # single line that determines the protein length.
        with open(alignment_file) as alignment_handle:
            L = len(alignment_handle.readline().rstrip())

        distance_map = pdb.distance_map(pdb_file, L)

        for contact_thr in contact_threshold:
            residue_i, residue_j = np.where(distance_map < contact_thr)
            # Hoisted: the index difference does not depend on seqsep.
            index_distance = residue_j - residue_i

            for seqsep in sequence_separation:
                nr_pairs = int(np.count_nonzero(index_distance > seqsep))
                entry = number_contacts[contact_thr][seqsep]
                entry["L"].append(L)
                entry["number of contacts"].append(nr_pairs)

    return number_contacts
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Collect coupling values per distance bin for several methods.

    Proteins are taken from the ``*psc`` files in ``alignment_dir``.  A
    protein is skipped when any method lacks its
    ``<protein>.filt.braw.gz`` file or when ``sqrt(N) / L`` of the
    alignment is below 0.3.  Positions whose gap fraction (as returned by
    ``ali.compute_gaps_per_position``) exceeds 0.3 are masked out of the
    distance map.

    Args:
        braw_dirs: dict mapping a method name to the directory holding
            that method's braw files.
        alignment_dir: directory with the ``*psc`` alignment files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        bin_size: a (method, bin) list stops receiving values once it
            holds at least this many; the last addition may overshoot.
        ab: ``'all'`` to use ``bu.compute_l2norm_from_brawfile`` scores,
            otherwise a string whose characters 0 and 2 are keys of
            ``io.AMINO_INDICES`` selecting one entry of ``x_pair``.

    Returns:
        dict: ``result[method][bin_name]`` is a list of values, where
        ``bin_name`` looks like ``"1: 0-5"``.
    """
    # define distance bins
    bins = [0, 5, 8, 12, 15, 20, np.inf]

    max_nr_couplings_per_protein = 500

    # list() keeps methods[0] valid on Python 3 as well.
    methods = list(braw_dirs.keys())
    bin_names = [
        str(idx + 1) + ": " + str(bins[idx]) + "-" + str(bins[idx + 1])
        for idx in range(len(bins) - 1)
    ]
    couplings_per_bin = {
        method: {bin_name: [] for bin_name in bin_names}
        for method in methods
    }

    # iterate over proteins
    for psc_file in glob.glob(alignment_dir + "/*psc"):
        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {
            method: braw_dirs[method] + "/" + protein + ".filt.braw.gz"
            for method in methods
        }
        if not all(os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        # Built only for proteins that pass the diversity filter.
        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(
                    braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_idx, bin_name in enumerate(bin_names):
            cb_lower = bins[bin_idx]
            cb_upper = bins[bin_idx + 1]

            rows, cols = np.where((distance_map > cb_lower)
                                  & (distance_map < cb_upper))

            # Shuffle to remove positioning bias.  Permuting an index
            # list (instead of zipping/unzipping the pairs) keeps the
            # result indexable when the bin is empty for this protein.
            order = list(range(len(rows)))
            random.shuffle(order)
            order = np.array(order, dtype=int)
            rows = rows[order]
            cols = cols[order]

            for method in methods:
                if len(couplings_per_bin[method][bin_name]) >= bin_size:
                    continue

                if ab == 'all':
                    values = braw[method][rows, cols]
                else:
                    values = braw[method].x_pair[rows, cols,
                                                 io.AMINO_INDICES[ab[0]],
                                                 io.AMINO_INDICES[ab[2]]]

                couplings_per_bin[method][bin_name].extend(
                    values.tolist()[:max_nr_couplings_per_protein])

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name,
                len(couplings_per_bin[methods[0]][bin_name])))

        # stop condition: all bins (of the first method) are full
        if all(
                len(values) >= bin_size
                for values in couplings_per_bin[methods[0]].values()):
            break

    return couplings_per_bin
def collect_data(pdb_dir, alignment_dir, distance_definition, size):
    """Collect pairwise distances grouped by the amino acids of the query.

    The first ``size`` entries of ``os.listdir(pdb_dir)`` are processed.
    An entry is skipped when ``alignment_dir`` has no
    ``<protein>.filt.psc`` file for it.  The first alignment row is used
    as the query sequence.

    Args:
        pdb_dir: directory with the PDB files.
        alignment_dir: directory with the ``.filt.psc`` alignments.
        distance_definition: passed through to ``pdb.distance_map``.
        size: maximum number of directory entries to look at.

    Returns:
        dict: ``result[seq_sep][key]`` for ``seq_sep`` in 1, 6, 12, 24.
        ``key`` is ``"<a>-<b>"`` for every combination of the first 20
        entries of ``io.AMINO_ACIDS`` (a list of distances), plus
        ``'all'`` (a numpy array concatenating all those lists).
    """
    sequence_separations = [1, 6, 12, 24]

    distances_ab = {}
    for seq_sep in sequence_separations:
        distances_ab[seq_sep] = {}
        for a in io.AMINO_ACIDS[:20]:
            for b in io.AMINO_ACIDS[:20]:
                distances_ab[seq_sep][a + "-" + b] = []

    for pdb_file in os.listdir(pdb_dir + "/")[:size]:
        protein = os.path.basename(pdb_file).split(".")[0]
        print(protein)

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            continue
        alignment = io.read_alignment(alignment_file)
        L = alignment.shape[1]

        query_sequence = alignment[0]
        dist_matrix = pdb.distance_map(pdb_dir + "/" + pdb_file, L,
                                       distance_definition)

        for seq_sep in sequence_separations:
            idx_i, idx_j = np.triu_indices(L, k=seq_sep)

            if len(idx_i) == 0:
                continue

            pair_distances = dist_matrix[idx_i, idx_j]
            aa_i = query_sequence[idx_i]
            aa_j = query_sequence[idx_j]

            # Drop pairs involving code 20 (skipped by the original loop
            # as well) and pairs without a defined distance before
            # building any dictionary key.
            keep = (aa_i != 20) & (aa_j != 20) & ~np.isnan(pair_distances)

            per_pair = distances_ab[seq_sep]
            for code_a, code_b, distance in zip(aa_i[keep], aa_j[keep],
                                                pair_distances[keep]):
                key = io.AMINO_ACIDS[code_a] + "-" + io.AMINO_ACIDS[code_b]
                per_pair[key].append(distance)

    for seq_sep in distances_ab:
        # list() is required on Python 3, where values() is a view that
        # np.concatenate cannot consume.
        distances_ab[seq_sep]['all'] = np.concatenate(
            list(distances_ab[seq_sep].values()))

    return distances_ab
# Example #5
# 0
def main():
    """Sample couplings at contacting residue pairs and plot them.

    For every ``*braw*`` file in ``braw_dir`` up to ``max_per_protein``
    residue pairs with ``distance < 8`` (upper triangle, ``k=1``) are
    drawn at random, excluding pairs that touch an alignment column with
    more than 20% zeros (treated as gaps here).  Their 20x20 coupling
    blocks are flattened to 400 columns and stored together with ``L``,
    ``Neff``, ``Diversity``, ``sum_wij`` and ``ratio_0.2L_Neff``.  The
    resulting frame is handed to ``plots.plot_coupling_vs_neff`` once per
    statistic.

    Raises:
        IOError: if ``braw_dir`` does not exist.
    """

    ### Parse arguments
    # NOTE: argparse ignores ``default`` on positional arguments without
    # ``nargs='?'``; all six arguments are therefore mandatory.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("alignment_dir",
                        type=str,
                        help="path to alignment files")
    parser.add_argument("nr_couplings",
                        type=int,
                        default=10000,
                        help="number of couplings")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    parser.add_argument("max_per_protein",
                        type=int,
                        default=100,
                        help="maximum numbr couplings per protein")

    args = parser.parse_args()

    # The parsed values are used as given; the former hard-coded
    # "debugging" paths silently overrode every command line argument.
    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    nr_couplings = args.nr_couplings
    plot_out = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # Per-protein frames are gathered and concatenated once at the end;
    # growing a DataFrame row block by row block copies it every time.
    frames = []
    nr_collected = 0

    for braw_file in glob.glob(braw_dir + "/*braw*"):

        if nr_collected > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        pdb_file = pdb_dir + "/" + protein + ".pdb"
        psicov_file = alignment_dir + "/" + protein + ".filt.psc"
        if not (os.path.exists(pdb_file) and os.path.exists(psicov_file)):
            print("PDB or alignment file missing for {0}. Skip protein.".format(
                protein))
            continue

        #-------------get couplings and metadata ----------------------------
        braw = raw.parse_msgpack(braw_file)
        msa_meta = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msa_meta['neff']
        L = msa_meta['ncol']
        N = msa_meta['nrow']
        diversity = np.sqrt(N) / L

        #-------------filter contacts ---------------------------------------
        dist_matrix = pdb.distance_map(pdb_file)

        # residue pairs closer than 8 count as contacts
        contact_map = dist_matrix < 8

        # select all residue pairs within contact threshold (i < j)
        contact_i, contact_j = np.where(np.triu(contact_map, k=1))

        #--------------filter gap columns -----------------------------------
        psicov = io.read_alignment(psicov_file)

        columns_with_many_gaps = set(
            col for col in range(L)
            if float(psicov[:, col].tolist().count(0)) / N > 0.2)

        # Keep only pairs where neither residue lies in a gapped column.
        keep = np.array([
            i not in columns_with_many_gaps
            and j not in columns_with_many_gaps
            for i, j in zip(contact_i, contact_j)
        ], dtype=bool)
        contact_i = contact_i[keep]
        contact_j = contact_j[keep]

        nr_contacts = len(contact_i)

        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(nr_contacts,
                                         replace=False,
                                         size=min(max_per_protein,
                                                  nr_contacts))
        couplings = braw.x_pair[contact_i[random_sample],
                                contact_j[random_sample], :20, :20].reshape(
                                    len(random_sample), 400)
        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        frames.append(df)
        nr_collected += len(df)
        print("nr of couplings: {0}".format(nr_collected))

    if frames:
        # The empty frame goes first so the column order matches what
        # repeated appending produced.
        coupling_df = pd.concat([coupling_df] + frames)

    plot_targets = [
        ("neff", 'Neff'),
        ("diversity", 'Diversity'),
        ("L", 'L'),
        ("ratio_0.2L_Neff", 'ratio_0.2L_Neff'),
    ]
    for file_tag, column in plot_targets:
        plot_file = plot_out + "/coupling_matrix_" + file_tag + "_" + str(
            nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
def main():
    """Move alignments that fail the quality filters to another directory.

    Every file in the alignment directory whose ``<protein>.pdb`` exists
    in the PDB directory is read as a psicov alignment and checked
    against these criteria: number of sequences, protein length bounds,
    mean gap fraction per position, and number of residue pairs with
    ``distance < contact_threshold`` at index distance >= the sequence
    separation.  A file violating at least one criterion is moved with
    ``os.rename`` into the output directory; each violated criterion is
    printed.
    """

    parser = argparse.ArgumentParser(
        description=
        "Generate SEQATOM sequences from deprecated database or recompute")

    # The three paths are mandatory: without them the script used to die
    # later with a TypeError from ``None + "/..."``.
    parser.add_argument("-a",
                        "--alignment",
                        dest="ali",
                        required=True,
                        help="path to alignment files")
    parser.add_argument("-p",
                        "--pdb",
                        dest="pdb",
                        required=True,
                        help="path to pdb files")
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        required=True,
                        help="path to filter directory")
    parser.add_argument("--min-N",
                        dest="minN",
                        default=10,
                        type=int,
                        help="Minimum number of sequences")
    parser.add_argument("--max-gap-percentage",
                        dest="maxGap",
                        default=0.8,
                        type=float,
                        help="Maximum percentage of gaps in alignment")
    parser.add_argument("--max-L",
                        dest="maxL",
                        default=600,
                        type=float,
                        help="Maximum length of protein")
    parser.add_argument("--min-L",
                        dest="minL",
                        default=20,
                        type=float,
                        help="Minimum length of protein")
    parser.add_argument("--min-contacts",
                        dest="mincontacts",
                        default=1,
                        type=int,
                        help="Minimum number of contacts")
    parser.add_argument(
        "--contact-threshold",
        dest="contact_threshold",
        default=8,
        type=int,
        help="Contact defined as distance between Cbeta atoms < threshold")
    parser.add_argument(
        "--sequence-separation",
        dest="seqsep",
        default=12,
        type=int,
        help=
        "Consider only residues separated by this many positions in sequence.")

    args = parser.parse_args()
    alignment_dir = args.ali
    pdb_dir = args.pdb
    output_dir = args.output

    minL = args.minL
    maxL = args.maxL
    minN = args.minN
    maxgappercentage = args.maxGap
    mincontacts = args.mincontacts
    contact_threshold = args.contact_threshold
    seqsep = args.seqsep

    for alignment_file in glob.glob(alignment_dir + "/*"):
        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            print(
                "PDB file {0} does not exist. Skip protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(alignment_file, format="psicov")

        N = alignment.shape[0]
        L = alignment.shape[1]

        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        distance_map = pdb.distance_map(pdb_file, L)
        nr_contacts = np.sum(
            distance_map[np.triu_indices(L, k=seqsep)] < contact_threshold)

        # Collect every violated criterion (instead of a flag named after
        # the ``filter`` builtin); all of them are reported.
        violations = []
        if N < minN:
            violations.append(
                "Alignment size {0} is smaller than filter threshold of {1}".
                format(N, minN))

        if L < minL:
            violations.append(
                "Protein length {0} is smaller than filter threshold of {1}".
                format(L, minL))

        if L > maxL:
            violations.append(
                "Protein length {0} is bigger than filter threshold of {1}".
                format(L, maxL))

        if percent_gaps > maxgappercentage:
            violations.append(
                "Percentag of gaps in alignment ({0}) is larger than filter threshold of {1}"
                .format(percent_gaps, maxgappercentage))

        if nr_contacts < mincontacts:
            violations.append(
                "Number of contacts (contact_thr = {0}, sequence separation = {1}) in protein structure ({2}) is less than {3}"
                .format(contact_threshold, seqsep, nr_contacts, mincontacts))

        for message in violations:
            print(message)

        if violations:
            dest_alignment_file = output_dir + "/" + os.path.basename(
                alignment_file)
            os.rename(alignment_file, dest_alignment_file)
            print("Successfully moved {0} to {1}".format(
                alignment_file, dest_alignment_file))
def main():
    """Build a table of couplings at contacting pairs and plot it.

    Iterates over the ``*braw*`` files in ``braw_dir``.  Per protein, the
    residue pairs (i < j) with ``distance < 8`` are determined, pairs
    touching an alignment column with more than 20% zeros (treated as
    gaps here) are dropped, and at most ``max_per_protein`` of the
    remaining pairs are sampled without replacement.  Each sampled pair
    contributes one row holding its flattened 20x20 coupling block plus
    ``L``, ``Neff``, ``Diversity``, ``sum_wij`` and ``ratio_0.2L_Neff``.
    Collection stops once more than ``nr_couplings`` rows exist; the
    table is then passed to ``plots.plot_coupling_vs_neff`` for four
    statistics.

    Raises:
        IOError: if ``braw_dir`` does not exist.
    """

    ### Parse arguments
    # NOTE: ``default`` has no effect on positional arguments without
    # ``nargs='?'``, so every argument must be supplied.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir",         type=str,   help="path to binary raw files")
    parser.add_argument("pdb_dir",          type=str,   help="path to pdb files")
    parser.add_argument("alignment_dir",    type=str,   help="path to alignment files")
    parser.add_argument("nr_couplings",     type=int,   default=10000, help="number of couplings")
    parser.add_argument("plot_out",         type=str,   help="path to plot file")
    parser.add_argument("max_per_protein",  type=int,   default=100, help="maximum numbr couplings per protein")

    args = parser.parse_args()

    # Command line values are honoured; the hard-coded debugging paths
    # that used to overwrite them have been removed.
    braw_dir        = args.braw_dir
    pdb_dir         = args.pdb_dir
    alignment_dir   = args.alignment_dir
    nr_couplings    = args.nr_couplings
    plot_out        = args.plot_out
    max_per_protein = args.max_per_protein

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    coupling_df = pd.DataFrame(columns=list(range(400)) + ['Neff'])

    # Collect per-protein frames and concatenate once after the loop.
    protein_frames = []
    total_rows = 0

    braw_files = glob.glob(braw_dir + "/*braw*")
    for braw_file in braw_files:

        if total_rows > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        pdb_file = pdb_dir + "/" + protein + ".pdb"
        psicov_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(pdb_file) or not os.path.exists(psicov_file):
            print("PDB or alignment file missing for {0}. Skip protein.".format(protein))
            continue

        #-------------get couplings and metadata ----------------------------
        braw = raw.parse_msgpack(braw_file)
        msafile = braw.meta['workflow'][0]['parameters']['msafile']
        neff = msafile['neff']
        L = msafile['ncol']
        N = msafile['nrow']
        diversity = np.sqrt(N) / L

        #-------------filter contacts ---------------------------------------
        dist_matrix = pdb.distance_map(pdb_file)

        # residue pairs closer than 8 count as contacts
        contact_map = dist_matrix < 8

        # select all residue pairs within contact threshold (i < j)
        rows, cols = np.where(np.triu(contact_map, k=1))

        #--------------filter gap columns -----------------------------------
        psicov = io.read_alignment(psicov_file)

        percent_gaps_per_column = [float(psicov[:, l].tolist().count(0)) / N for l in range(L)]
        gapped = set(i for i, j in enumerate(percent_gaps_per_column) if j > 0.2)

        # Boolean mask of pairs where both residues are in well-covered
        # columns; safe when nothing (or everything) is removed.
        keep = np.array([(i not in gapped) and (j not in gapped) for i, j in zip(rows, cols)],
                        dtype=bool)
        rows = rows[keep]
        cols = cols[keep]

        nr_contacts = len(rows)

        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(nr_contacts, replace=False, size=min(max_per_protein, nr_contacts))
        couplings = braw.x_pair[rows[random_sample], cols[random_sample], :20, :20].reshape(len(random_sample), 400)
        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        protein_frames.append(df)
        total_rows += len(df)
        print("nr of couplings: {0}".format(total_rows))

    if protein_frames:
        # Empty frame first: preserves the column order of the former
        # append-based accumulation.
        coupling_df = pd.concat([coupling_df] + protein_frames)

    for file_tag, column in (("neff", 'Neff'),
                             ("diversity", 'Diversity'),
                             ("L", 'L'),
                             ("ratio_0.2L_Neff", 'ratio_0.2L_Neff')):
        plot_file = plot_out + "/coupling_matrix_" + file_tag + "_" + str(nr_couplings) + ".html"
        plots.plot_coupling_vs_neff(coupling_df, column, plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab, cd, cb_lower, cb_upper):
    """Gather couplings of two amino-acid pairs at the same residue pairs.

    For every ``*braw.gz`` file in ``braw_dir`` the residue pairs with
    ``cb_lower < distance < cb_upper`` are selected (positions with a
    gap fraction above 0.3 are masked first).  A pair is kept only when
    ``Nij * q_i * q_j`` exceeds the evidence threshold for both ``ab``
    and ``cd``.  At most 500 values per protein are added, and the loop
    ends once the ``ab`` list holds at least 5000 values.

    Args:
        braw_dir: directory with the ``*braw.gz`` files.
        alignment_dir: directory with ``<protein>.filt.psc`` files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        ab: string whose characters 0 and 2 are keys of
            ``io.AMINO_INDICES``.
        cd: second pair, same format as ``ab``.
        cb_lower: exclusive lower distance bound.
        cb_upper: exclusive upper distance bound.

    Returns:
        dict: ``{ab: [...], cd: [...]}`` with the collected couplings.
    """
    couplings = {ab: [], cd: []}

    # fixed settings
    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 80
    max_nr_couplings = 5000
    diversity_thr = 0.3
    a, b = ab[0], ab[2]
    c, d = cd[0], cd[2]

    # iterate over proteins
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(
                pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(
                braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".
                  format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < diversity_thr:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # residue pairs inside the requested distance window
        in_window = (distance_map > cb_lower) & (distance_map < cb_upper)
        residue_i, residue_j = np.where(in_window)

        Nij = AF.Nij[residue_i, residue_j]
        idx_a = io.AMINO_INDICES[a]
        idx_b = io.AMINO_INDICES[b]
        idx_c = io.AMINO_INDICES[c]
        idx_d = io.AMINO_INDICES[d]

        evidence_ab = (Nij * AF.single_frequencies[residue_i, idx_a]
                       * AF.single_frequencies[residue_j, idx_b])
        evidence_cd = (Nij * AF.single_frequencies[residue_i, idx_c]
                       * AF.single_frequencies[residue_j, idx_d])

        # both amino-acid pairs need sufficient evidence
        enough_evidence = ((evidence_ab > evidence_threshold)
                           & (evidence_cd > evidence_threshold))
        residue_i = residue_i[enough_evidence]
        residue_j = residue_j[enough_evidence]

        if len(residue_i) == 0:
            continue

        ab_values = braw.x_pair[residue_i, residue_j, idx_a, idx_b].tolist()
        cd_values = braw.x_pair[residue_i, residue_j, idx_c, idx_d].tolist()
        couplings[ab].extend(ab_values[:max_nr_couplings_per_protein])
        couplings[cd].extend(cd_values[:max_nr_couplings_per_protein])

        print("\nprotein {0}  size: {1}".format(protein, len(couplings[ab])))

        # stop condition: enough couplings collected for ab
        if len(couplings[ab]) >= max_nr_couplings:
            break

    return couplings
# Example #9
# 0
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Collect couplings of one amino-acid pair for several distance bins.

    For every ``*braw.gz`` file in ``braw_dir`` and every bin that is not
    yet full, the residue pairs with ``lower < distance < upper`` are
    selected (positions with a gap fraction above 0.3 are masked first).
    A pair is kept when ``Nij[i, j] * q_i(a) * q_j(b)`` exceeds 100.  At
    most 500 values per protein and bin are added; a bin counts as full
    at 10000 values and the loop ends when all bins are full.

    Args:
        braw_dir: directory with the ``*braw.gz`` files.
        alignment_dir: directory with ``<protein>.filt.psc`` files.
        pdb_dir: directory with ``<protein>.pdb`` files.
        ab: string whose characters 0 and 2 are keys of
            ``io.AMINO_INDICES``.

    Returns:
        dict: ``result[bin_name]`` with the keys ``'couplings'`` (list),
        ``'lower'`` and ``'upper'`` (the bin's distance bounds).
    """
    # define (partly overlapping) distance bins
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0, 'upper': 8},
        'bin2': {'couplings': [], 'lower': 5, 'upper': 10},
        'bin3': {'couplings': [], 'lower': 8, 'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50},
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000
    a = ab[0]
    b = ab[2]

    # iterate over proteins
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):
        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(
                pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(
                braw_file))
            continue

        if not os.path.exists(alignment_file):
            print(
                "Alignment file {0} does not exist. Skip this protein.".format(
                    alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            bin_data = couplings_per_bin[bin_name]

            if len(bin_data['couplings']) >= max_couplings_per_bin:
                continue

            cb_lower = bin_data['lower']
            cb_upper = bin_data['upper']

            residue_i, residue_j = np.where((distance_map > cb_lower)
                                            & (distance_map < cb_upper))

            # Pair counts must be read at (i, j); indexing (i, i) picked
            # the diagonal entry of residue i for every pair.
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
            q_j_b = AF.single_frequencies[residue_j, io.AMINO_INDICES[b]]

            evidence = Nij * q_i_a * q_j_b

            enough_evidence = evidence > evidence_threshold
            residue_i = residue_i[enough_evidence]
            residue_j = residue_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[
                residue_i, residue_j, io.AMINO_INDICES[a],
                io.AMINO_INDICES[b]].tolist()[:max_nr_couplings_per_protein]
            bin_data['couplings'].extend(ab_coupling)

        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name,
                len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(
                len(bindict['couplings']) >= max_couplings_per_bin
                for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def main():
    """Plot the distribution of two pairwise couplings against each other.

    All settings are read from positional command-line arguments.  For every
    ``*braw.gz`` file in ``braw_dir`` the matching alignment and PDB file are
    looked up.  Residue pairs (i < j) are kept when their distance lies
    strictly between ``dist_lower`` and ``dist_upper`` and their pair count
    ``Nij`` exceeds ``Nij_threshold``.  The couplings of the two requested
    amino-acid pairs (``ab`` and ``cd``, e.g. ``R-E``) are collected for those
    residue pairs until more than ``size`` values were gathered and are then
    passed to ``plots.plot_pairwise_couplings_density``.
    """

    ### Parse arguments
    # NOTE(review): the description looks copy-pasted from another script.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary_raw_files")
    parser.add_argument("alignment_dir",
                        type=str,
                        help="path to alignment files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("ab", type=str, help="ab in range(400)")
    parser.add_argument("cd", type=str, help="cd in range(400)")
    # NOTE: the arguments below are positional and therefore required;
    # argparse never falls back to the ``default`` values given here.
    parser.add_argument("dist_lower",
                        type=int,
                        default=0,
                        help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper",
                        type=int,
                        default=8,
                        help="Upper Cbeta distance threshold")
    parser.add_argument(
        "Nij_threshold",
        type=int,
        default=100,
        help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size", type=int, help="number of pairs ij")
    parser.add_argument("plot_dir", type=str, help="where to save the plot")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    ab = args.ab
    cd = args.cd
    dist_lower = args.dist_lower
    dist_upper = args.dist_upper
    Nij_threshold = args.Nij_threshold
    size = args.size
    plot_dir = args.plot_dir

    #debugging
    # pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    # braw_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/"
    # alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    # ab='R-E'
    # cd='E-R'
    # dist_lower = 0
    # dist_upper = 8
    # Nij_threshold = 100
    # size = 10000
    # plot_dir='/home/vorberg/'

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {ab: [], cd: []}
    for braw_file in braw_files:
        # stop as soon as enough couplings were collected
        if len(couplings[ab]) > size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) +
                  " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        # parse the couplings only once all required input files are known
        # to exist
        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol

        print(protein)

        indices_upper_tri = np.triu_indices(L, k=1)

        #filter pair indices that have specified Cb distances
        dist_upper_tri = pdb.distance_map(pdb_file, L)[indices_upper_tri]
        indices_dist_true = np.where((dist_upper_tri > dist_lower)
                                     & (dist_upper_tri < dist_upper))[0]

        #filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        #get pair indices that fulfill both requirements (sorted, so the
        #order of the collected couplings is deterministic)
        indices_merge = np.intersect1d(indices_dist_true, indices_Nij_true)

        #get couplings for filtered pairs; the selection is shared by both
        #amino-acid pairs so that the two lists stay aligned pair by pair
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        selected_couplings = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected_couplings[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected_couplings[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))

    plot_file = plot_dir + "/pairwise_couplings_" + ab + "_" + cd + "_Nijthreshold" + str(
        Nij_threshold) + "_Cbdistance_" + str(dist_lower) + "_" + str(
            dist_upper) + ".html"
    title = "Couplings {0} vs  {1} <br> Nij threshold: {2},  {3} <= Cb_ij <= {4}".format(
        ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):
    """Collect couplings of one amino-acid pair, grouped into distance bins.

    Every ``*braw.gz`` file in ``braw_dir`` is processed together with
    ``<protein>.pdb`` from ``pdb_dir`` and ``<protein>.filt.psc`` from
    ``alignment_dir``.  Proteins with a missing input file or with a
    diversity ``sqrt(N) / L`` below 0.3 are skipped.

    ``ab`` is a string such as ``'R-E'``; characters 0 and 2 name the two
    amino acids.  Returns a dict ``{bin_name: {'couplings': [...],
    'lower': ..., 'upper': ...}}``.  Collection stops once every bin holds at
    least 10000 couplings; a single protein adds at most 500 couplings to a
    bin.
    """

    #define distance bins (bounds are exclusive; note that the bins overlap)
    couplings_per_bin={
        'bin1': {
            'couplings' : [],
            'lower':0,
            'upper':8
        },
        'bin2': {
            'couplings': [],
            'lower': 5,
            'upper': 10
        },
        'bin3': {
            'couplings': [],
            'lower': 8,
            'upper': 12
        },
        'bin4': {
            'couplings': [],
            'lower': 10,
            'upper': 15
        },
        'bin5': {
            'couplings': [],
            'lower': 20,
            'upper': 50
        }
    }


    max_nr_couplings_per_protein = 500
    sequence_separation=10
    evidence_threshold = 100
    max_couplings_per_bin = 10000
    a = ab[0]
    b = ab[2]

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        # braw_file = braw_files[0]

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        #mask highly gapped positions (NaN never satisfies the bin comparisons)
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # the amino-acid indices do not depend on the bin
        index_a = io.AMINO_INDICES[a]
        index_b = io.AMINO_INDICES[b]

        # iterate over pairs for bins
        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            bin_data = couplings_per_bin[bin_name]

            if len(bin_data['couplings']) >= max_couplings_per_bin:
                continue

            residue_i, residue_j = np.where((distance_map > bin_data['lower']) &
                                            (distance_map < bin_data['upper']))

            # pair count of the residue pair (i, j); must be indexed with
            # both i and j, not with i twice
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, index_a]
            q_j_b = AF.single_frequencies[residue_j, index_b]

            enough_evidence = (Nij * q_i_a * q_j_b) > evidence_threshold
            residue_i = residue_i[enough_evidence]
            residue_j = residue_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[residue_i, residue_j, index_a, index_b].tolist()
            bin_data['couplings'].extend(ab_coupling[:max_nr_couplings_per_protein])

        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all(len(bindict['couplings']) >= max_couplings_per_bin
               for bindict in couplings_per_bin.values()):
            break

    return couplings_per_bin
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):
    """Collect couplings per distance bin for several methods.

    ``braw_dirs`` maps a method name to the directory holding its
    ``<protein>.filt.braw.gz`` files.  Every ``*psc`` file found in
    ``alignment_dir`` is processed; a protein is skipped when a braw file of
    any method or its PDB file is missing, or when its diversity
    ``sqrt(N) / L`` is below 0.3.

    ``ab`` is either ``'all'`` (values come from
    ``bu.compute_l2norm_from_brawfile(..., apc=True)``) or a string such as
    ``'R-E'`` whose characters 0 and 2 select the coupling in ``x_pair``.

    Returns ``{method: {bin_name: [values]}}``.  A bin accepts values while it
    holds fewer than ``bin_size`` entries and a protein adds at most 500
    values per bin; collection stops once every bin of the first method is
    full.
    """

    #define distance bins
    bins=[0, 5, 8, 12, 15, 20, np.inf]

    max_nr_couplings_per_protein = 500

    # materialise the keys: a dict view cannot be indexed on Python 3
    methods = list(braw_dirs)

    # bin names in bin order, e.g. "1: 0-5"
    bin_names = [
        str(idx + 1) + ": " + str(lower) + "-" + str(upper)
        for idx, (lower, upper) in enumerate(zip(bins[:-1], bins[1:]))
    ]

    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {}
        for bin_name in bin_names:
            couplings_per_bin[method][bin_name] = []

    if not methods:
        # nothing to collect without at least one method
        return couplings_per_bin

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:

        # psc_file = psc_files[0]
        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[method] = braw_dirs[method] + "/" + protein + ".filt.braw.gz"

        if any(not os.path.exists(braw_files[method]) for method in methods):
            print("Skip this protein (braw files does not exist).")
            continue

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(psc_file, format="psicov")

        # apply the cheap diversity filter before computing the distance map
        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions (NaN never satisfies the bin comparisons)
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_name, cb_lower, cb_upper in zip(bin_names, bins[:-1], bins[1:]):

            residue_i, residue_j = np.where((distance_map > cb_lower) & (distance_map < cb_upper))

            # shuffle the pairs to remove positioning bias and keep only as
            # many as a single protein may contribute; an empty bin simply
            # yields empty index arrays
            order = list(range(len(residue_i)))
            random.shuffle(order)
            order = np.array(order[:max_nr_couplings_per_protein], dtype=int)
            rows = residue_i[order]
            cols = residue_j[order]

            for method in methods:
                if len(couplings_per_bin[method][bin_name]) < bin_size:
                    if ab == 'all':
                        ab_coupling = braw[method][rows, cols].tolist()
                    else:
                        ab_coupling = braw[method].x_pair[rows, cols, io.AMINO_INDICES[ab[0]], io.AMINO_INDICES[ab[2]]].tolist()

                    couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[methods[0]][bin_name])))

        # stop condition: all bins are full
        if all(len(v) >= bin_size for v in couplings_per_bin[methods[0]].values()):
            break

    return couplings_per_bin
def main():
    """Plot the distribution of two pairwise couplings against each other.

    All settings are read from positional command-line arguments.  For every
    ``*braw.gz`` file in ``braw_dir`` the matching alignment and PDB file are
    looked up.  Residue pairs (i < j) are kept when their distance lies
    strictly between ``dist_lower`` and ``dist_upper`` and their pair count
    ``Nij`` exceeds ``Nij_threshold``.  The couplings of the two requested
    amino-acid pairs (``ab`` and ``cd``, e.g. ``R-E``) are collected for those
    residue pairs until more than ``size`` values were gathered and are then
    passed to ``plots.plot_pairwise_couplings_density``.
    """

    ### Parse arguments
    # NOTE(review): the description looks copy-pasted from another script.
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir",         type=str,   help="path to binary_raw_files")
    parser.add_argument("alignment_dir",    type=str,   help="path to alignment files")
    parser.add_argument("pdb_dir",          type=str,   help="path to pdb files")
    parser.add_argument("ab",               type=str,   help="ab in range(400)")
    parser.add_argument("cd",               type=str,   help="cd in range(400)")
    # NOTE: the arguments below are positional and therefore required;
    # argparse never falls back to the ``default`` values given here.
    parser.add_argument("dist_lower",       type=int,   default=0, help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper",       type=int,   default=8, help="Upper Cbeta distance threshold")
    parser.add_argument("Nij_threshold",    type=int,   default=100, help="Minimum number of non-gapped sequences at positions i and j ")
    parser.add_argument("size",             type=int,   help="number of pairs ij")
    parser.add_argument("plot_dir",         type=str,   help="where to save the plot")


    args = parser.parse_args()

    braw_dir        = args.braw_dir
    pdb_dir         = args.pdb_dir
    alignment_dir   = args.alignment_dir
    ab              = args.ab
    cd              = args.cd
    dist_lower      = args.dist_lower
    dist_upper      = args.dist_upper
    Nij_threshold   = args.Nij_threshold
    size            = args.size
    plot_dir        = args.plot_dir

    #debugging
    # pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    # braw_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/"
    # alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    # ab='R-E'
    # cd='E-R'
    # dist_lower = 0
    # dist_upper = 8
    # Nij_threshold = 100
    # size = 10000
    # plot_dir='/home/vorberg/'


    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {ab: [], cd: []}
    for braw_file in braw_files:
        # stop as soon as enough couplings were collected
        if len(couplings[ab]) > size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found. ")
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found. ")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found. ")
            continue

        # parse the couplings only once all required input files are known
        # to exist
        braw = raw.parse_msgpack(braw_file)
        L  = braw.ncol

        print(protein)

        indices_upper_tri  =  np.triu_indices(L, k=1)

        #filter pair indices that have specified Cb distances
        dist_upper_tri = pdb.distance_map(pdb_file, L)[indices_upper_tri]
        indices_dist_true = np.where((dist_upper_tri > dist_lower) & (dist_upper_tri < dist_upper))[0]

        #filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        #get pair indices that fulfill both requirements (sorted, so the
        #order of the collected couplings is deterministic)
        indices_merge = np.intersect1d(indices_dist_true, indices_Nij_true)

        #get couplings for filtered pairs; the selection is shared by both
        #amino-acid pairs so that the two lists stay aligned pair by pair
        braw_reshaped =  braw.x_pair[:,:,:20,:20].reshape(L,L,400)
        selected_couplings = braw_reshaped[indices_upper_tri][indices_merge]
        couplings[ab].extend(selected_couplings[:, io.AB_INDICES[ab]])
        couplings[cd].extend(selected_couplings[:, io.AB_INDICES[cd]])

        print("Nr of couplings: {0}".format(len(couplings[ab])))


    plot_file = plot_dir + "/pairwise_couplings_" + ab + "_"+ cd + "_Nijthreshold" + str(Nij_threshold) + "_Cbdistance_" + str(dist_lower) +"_" + str(dist_upper) + ".html"
    title="Couplings {0} vs  {1} <br> Nij threshold: {2},  {3} <= Cb_ij <= {4}".format(ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def main():
    """Move alignments that fail the filter criteria to the output directory.

    Every file in the alignment directory whose ``<protein>.pdb`` exists is
    read as a psicov alignment and checked against the thresholds given on
    the command line: number of sequences, protein length, mean gap
    percentage and number of contacts in the structure.  Each violated
    criterion is reported; an alignment that violates at least one of them
    is moved to the output directory.
    """
    # function-scope import: only this entry point needs it
    import shutil

    # NOTE(review): the description looks copy-pasted from another script;
    # it does not describe what this function does.
    parser = argparse.ArgumentParser(description="Generate SEQATOM sequences from deprecated database or recompute")

    # the three directories are mandatory: without them the path
    # concatenations below would fail on None
    parser.add_argument("-a", "--alignment",    dest="ali",     required=True,              help="path to alignment files")
    parser.add_argument("-p", "--pdb",          dest="pdb",     required=True,              help="path to pdb files")
    parser.add_argument("-o", "--output",       dest="output",  required=True,              help="path to filter directory")
    parser.add_argument("--min-N",              dest="minN",    default=10,     type=int,   help="Minimum number of sequences")
    parser.add_argument("--max-gap-percentage", dest="maxGap",  default=0.8,    type=float, help="Maximum percentage of gaps in alignment")
    parser.add_argument("--max-L",              dest="maxL",    default=600,    type=float, help="Maximum length of protein")
    parser.add_argument("--min-L",              dest="minL",    default=20,     type=float, help="Minimum length of protein")
    parser.add_argument("--min-contacts",       dest="mincontacts", default=1,  type=int,   help="Minimum number of contacts")
    parser.add_argument("--contact-threshold",  dest="contact_threshold", default=8, type=int, help="Contact defined as distance between Cbeta atoms < threshold")
    parser.add_argument("--sequence-separation",  dest="seqsep", default=12, type=int,      help="Consider only residues separated by this many positions in sequence.")

    args = parser.parse_args()
    alignment_dir  = args.ali
    pdb_dir  = args.pdb
    output_dir     = args.output

    minL = args.minL
    maxL = args.maxL
    minN = args.minN
    maxgappercentage = args.maxGap
    mincontacts = args.mincontacts
    contact_threshold = args.contact_threshold
    seqsep = args.seqsep

    aln_files = glob.glob(alignment_dir + "/*")


    for alignment_file in aln_files:
        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(alignment_file, format="psicov")

        N = alignment.shape[0]
        L = alignment.shape[1]

        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        # count residue pairs at least seqsep apart whose distance is below
        # the contact threshold
        distance_map = pdb.distance_map(pdb_file, L)
        nr_contacts = np.count_nonzero(distance_map[np.triu_indices(L, k=seqsep)] < contact_threshold)

        # one message per violated criterion
        violations = []
        if N < minN:
            violations.append("Alignment size {0} is smaller than filter threshold of {1}".format(N, minN))

        if L < minL:
            violations.append("Protein length {0} is smaller than filter threshold of {1}".format(L, minL))

        if L > maxL:
            violations.append("Protein length {0} is bigger than filter threshold of {1}".format(L, maxL))

        if percent_gaps > maxgappercentage:
            violations.append("Percentag of gaps in alignment ({0}) is larger than filter threshold of {1}".format(percent_gaps, maxgappercentage))

        if nr_contacts < mincontacts:
            violations.append("Number of contacts (contact_thr = {0}, sequence separation = {1}) in protein structure ({2}) is less than {3}".format(contact_threshold,seqsep, nr_contacts, mincontacts))

        for message in violations:
            print(message)

        if violations:
            dest_alignment_file = output_dir + "/" + os.path.basename(alignment_file)
            # shutil.move also works when the output directory lives on a
            # different filesystem, where a plain rename would fail
            shutil.move(alignment_file, dest_alignment_file)
            print("Successfully moved {0} to {1}".format(alignment_file, dest_alignment_file))
def collect_data(braw_dir, alignment_dir, pdb_dir, pairs, lower_cb_distance, upper_cb_distance):
    """Collect couplings for several amino-acid pairs within one distance range.

    Every ``*braw.gz`` file in ``braw_dir`` is processed together with
    ``<protein>.pdb`` from ``pdb_dir`` and ``<protein>.filt.psc`` from
    ``alignment_dir``.  Proteins with a missing input file or with a
    diversity ``sqrt(N) / L`` below 0.3 are skipped.

    ``pairs`` is an iterable of strings such as ``'R-E'``; characters 0 and 2
    name the two amino acids.  Only residue pairs whose distance lies strictly
    between ``lower_cb_distance`` and ``upper_cb_distance`` and whose evidence
    exceeds 100 contribute.  Returns ``{pair: [couplings]}``.  Collection
    stops once every pair holds at least 1000 couplings; a single protein adds
    at most 500 couplings to a pair.
    """

    couplings_per_pair={}
    for pair in pairs:
        couplings_per_pair[pair] = []


    max_nr_couplings_per_protein = 500
    sequence_separation=8
    evidence_threshold = 100
    max_couplings_per_bin = 1000

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        # braw_file = braw_files[0]

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        #mask highly gapped positions (NaN never satisfies the comparisons below)
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # residue pairs in the requested distance range; identical for all
        # amino-acid pairs, so they are determined once per protein
        in_range_i, in_range_j = np.where((distance_map > lower_cb_distance) & (distance_map < upper_cb_distance))

        # iterate over amino-acid pairs
        for pair in pairs:

            if len(couplings_per_pair[pair]) >= max_couplings_per_bin:
                continue

            if len(in_range_i) == 0:
                continue

            index_a = io.AMINO_INDICES[pair[0]]
            index_b = io.AMINO_INDICES[pair[2]]

            # pair count of the residue pair (i, j); must be indexed with
            # both i and j, not with i twice
            Nij = AF.Nij[in_range_i, in_range_j]
            q_i_a = AF.single_frequencies[in_range_i, index_a]
            q_j_b = AF.single_frequencies[in_range_j, index_b]
            q_ij_ab = AF.pairwise_frequencies[in_range_i, in_range_j, index_a, index_b]

            # element-wise maximum: one evidence value per residue pair
            # (np.max over the two arrays would collapse to a single scalar)
            evidence = np.maximum(Nij * q_i_a * q_j_b, Nij * q_ij_ab)

            enough_evidence = evidence > evidence_threshold
            residue_i = in_range_i[enough_evidence]
            residue_j = in_range_j[enough_evidence]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[residue_i, residue_j, index_a, index_b].tolist()
            couplings_per_pair[pair].extend(ab_coupling[:max_nr_couplings_per_protein])


        summary = "\n\nprotein {0}".format(protein)
        for pair in sorted(couplings_per_pair.keys()):
            summary += "\n{0:<8} : {1}".format(pair, len(couplings_per_pair[pair]))
        print(summary)

        # stop condition: all bins are full
        if all(len(couplings_per_pair[pair]) >= max_couplings_per_bin for pair in pairs):
            break

    return couplings_per_pair
def collect_data(pdb_dir, alignment_dir, distance_definition, size):
    """Collect residue-residue distances per amino-acid pair.

    The first ``size`` entries of ``os.listdir(pdb_dir)`` are processed; an
    entry is skipped when ``<protein>.filt.psc`` is missing in
    ``alignment_dir``.  The amino acids are taken from the first sequence of
    the alignment and the distances from
    ``pdb.distance_map(pdb_file, L, distance_definition)``.

    Returns ``{seq_sep: {'A-B': [distances], ..., 'all': array}}`` for the
    sequence separations 1, 6, 12 and 24, where ``'all'`` is the
    concatenation of the per-pair lists.  NaN distances and pairs in which
    either residue has the index 20 are left out.
    """

    pdb_files = os.listdir(pdb_dir + "/")

    sequence_separations = [1, 6, 12, 24]

    # one (initially empty) distance list per ordered amino-acid pair
    distances_ab = {}
    for seq_sep in sequence_separations:
        distances_ab[seq_sep] = {}
        for a in io.AMINO_ACIDS[:20]:
            for b in io.AMINO_ACIDS[:20]:
                distances_ab[seq_sep][a + "-" + b] = []

    for pdb_file in pdb_files[:size]:
        #pdb_file=pdb_files[0]

        protein = os.path.basename(pdb_file).split(".")[0]
        print(protein)

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            continue
        alignment = io.read_alignment(alignment_file)
        L = alignment.shape[1]

        query_sequence = alignment[0]
        dist_matrix = pdb.distance_map(pdb_dir + "/" + pdb_file, L,
                                       distance_definition)

        for seq_sep in sequence_separations:
            indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(
                L, k=seq_sep)

            if len(indices_upper_tri_i) == 0:
                continue

            distances_ab_seqsep = dist_matrix[indices_upper_tri_i,
                                              indices_upper_tri_j]
            AA_a = query_sequence[indices_upper_tri_i]
            AA_b = query_sequence[indices_upper_tri_j]

            # drop undefined distances and pairs involving index 20 in one
            # vectorised step instead of testing every pair in Python
            usable = (~np.isnan(distances_ab_seqsep)) & (AA_a != 20) & (AA_b != 20)

            distances_for_seqsep = distances_ab[seq_sep]
            for aa_i, aa_j, distance in zip(AA_a[usable], AA_b[usable],
                                            distances_ab_seqsep[usable]):
                ab = io.AMINO_ACIDS[aa_i] + "-" + io.AMINO_ACIDS[aa_j]
                distances_for_seqsep[ab].append(distance)

    for seq_sep in distances_ab:
        # list(): np.concatenate needs a sequence, not a dict view
        distances_ab[seq_sep]['all'] = np.concatenate(
            list(distances_ab[seq_sep].values()))

    return distances_ab