def main(results_dir=None):
    """Build a 0/1 design-by-design matrix from score files and prune it.

    Every file in ``results_dir`` whose name matches ``.*\\.score`` is read.
    The 2nd and 4th ``_``-separated fields of the file name are looked up in
    ``design_list`` to get the row and column.  The cell becomes 1 when
    ``how_many_purples_in_file`` reports at least 12 for the file, else 0.

    The matrix is then reduced in a loop: the value stored in the last entry
    of the degree vector is passed to ``remove_from_matrix``, the result is
    run through ``clean_all_zeros`` and the degree vector is recomputed,
    until ``are_all_ones`` accepts it.  Intermediate state is printed after
    every step.

    Args:
        results_dir: directory holding the score files.  ``None`` (the
            default) selects the location that used to be hard-coded.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
        ValueError: a parsed name is not present in ``design_list``.
    """
    import numpy as np
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    # Only read here (printed in the loop); presumably maintained by the
    # helper functions -- verify against their definitions.
    global coh_names, doc_names

    if results_dir is None:
        results_dir = ('/Users/jonathan/eden/no_backup/designs/'
                       'Ct_8parts_10.2/prediction/results/')

    design_list = ['ct11', 'ct12', 'ct13', 'ct15', 'ct16', 'ct17', 'ct29', 'ct31', 'ct33', 'ct36', 'ct38', 'ct41',
                   'ct44', 'ct45', 'ct46', 'ct47', 'ct48', 'ct49', 'ct50', 'ct51', 'ct52', 'ct53', 'ct54', 'ct55',
                   'ct59']
    mtrx = np.zeros([len(design_list), len(design_list)], dtype=int)

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir(results_dir) if score_pattern.match(x)]

    for score_file in score_file_list:
        fields = score_file.split('_')
        coh_name, doc_name = fields[1], fields[3]
        purple_num = int(how_many_purples_in_file(os.path.join(results_dir, score_file)))
        row = design_list.index(coh_name)
        col = design_list.index(doc_name)
        mtrx[row, col] = 1 if purple_num >= 12 else 0

    # NOTE(review): the degree vector is computed before clean_all_zeros is
    # applied, exactly as in the original; confirm that ordering is intended.
    dof_vec = find_degree_vector(mtrx)
    print(dof_vec)
    print(mtrx)
    mtrx = clean_all_zeros(mtrx)
    while not are_all_ones(dof_vec):
        # NOTE(review): return value is discarded, as in the original.
        new_all_ones(dof_vec)
        # list(...) keeps this working when dict.values() is a view (Python 3).
        to_remove = list(dof_vec[-1].values())[0]
        mtrx = remove_from_matrix(mtrx, to_remove)
        mtrx = clean_all_zeros(mtrx)
        dof_vec = find_degree_vector(mtrx)
        print(mtrx)
        print(dof_vec)
        print(coh_names)
        print(doc_names)
Exemplo n.º 2
0
def main():
    """Collect per-pair counts from score files and show them as a heat map.

    Every file in the current directory whose name matches ``.*\\.score`` is
    parsed as ``<prefix>_<coh name>_VS_<doc name>_<suffix>``: the coh name is
    everything after the first ``_`` on the left of ``_VS_``, the doc name is
    everything before the last ``_`` on the right of it.  A DataFrame with
    one column per coh name and one row per doc name is filled with
    ``int(how_many_purples_in_file(file))`` and a copy is handed to
    ``show_prediction_heat_map``.

    Cells with no matching file keep the initial value
    ``-1 * len(doc_name_list)``.

    Raises:
        IndexError: a score-file name does not contain ``_VS_``.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    def parse_names(file_name):
        # Split a score-file name into its (coh name, doc name) pair.
        parts = file_name.split('_VS_')
        coh = '_'.join(parts[0].split('_')[1:])
        doc = '_'.join(parts[1].split('_')[:-1])
        return coh, doc

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir('.') if score_pattern.match(x)]

    names_by_file = {name: parse_names(name) for name in score_file_list}
    coh_name_list = sorted(set(coh for coh, _ in names_by_file.values()))
    doc_name_list = sorted(set(doc for _, doc in names_by_file.values()))

    # Scalar fill instead of Series([value], index=...): a one-element list
    # paired with a longer index is rejected by recent pandas releases.
    df = DataFrame(-1 * len(doc_name_list), index=doc_name_list, columns=coh_name_list)

    for score_file in score_file_list:
        coh_name, doc_name = names_by_file[score_file]
        purple_num = int(how_many_purples_in_file(score_file))
        # .loc writes into df itself; df[col][row] = ... is chained
        # assignment and may silently modify a temporary copy.
        df.loc[doc_name, coh_name] = purple_num

    show_prediction_heat_map(df.copy())
Exemplo n.º 3
0
def main():
    """Find the largest cliques of compatible coh/doc pairs and plot two.

    Steps, all driven by the ``.*\\.score`` files in the current directory
    (coh name = 2nd ``_`` field of the file name, doc name = 4th):

    1. Fill ``df`` with 1 where ``how_many_purples_in_file`` reports at least
       10 and 0 otherwise; ``df_true_score`` keeps the raw count.
    2. Build a graph whose nodes are the ``(coh, doc)`` pairs flagged 1; two
       nodes are connected when both cross pairs (coh of one with doc of the
       other) are flagged 0.
    3. Enumerate cliques with ``networkx.find_cliques`` and keep those of
       maximum size.
    4. Pick the best clique via ``best_clique_by_purples``, then the best one
       among the cliques with minimal ``clique_similarity`` to it, print the
       results and show a heat map for each.

    If the graph has no cliques (no pair reached the threshold) a message is
    printed and the function returns.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import itertools
    import os
    import re
    import networkx as nx

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir('./') if score_pattern.match(x)]
    name_pairs = [(a.split('_')[1], a.split('_')[3]) for a in score_file_list]
    coh_name_list = sorted(set(coh for coh, _ in name_pairs))
    doc_name_list = sorted(set(doc for _, doc in name_pairs))

    # Scalar fill instead of Series([value], index=...): a one-element list
    # paired with a longer index is rejected by recent pandas releases.
    unfilled = -1 * len(doc_name_list)
    df = DataFrame(unfilled, index=doc_name_list, columns=coh_name_list)
    df_true_score = DataFrame(unfilled, index=doc_name_list, columns=coh_name_list)

    for score_file, (coh_name, doc_name) in zip(score_file_list, name_pairs):
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # .loc writes into the frame itself; df[col][row] = ... is chained
        # assignment and may silently modify a temporary copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0
        df_true_score.loc[doc_name, coh_name] = purple_num

    G = nx.Graph()
    for coh in coh_name_list:
        for doc in doc_name_list:
            if df.at[doc, coh] == 1:
                G.add_node((coh, doc))

    # G.nodes() exists in both NetworkX 1.x and 2.x (nodes_iter was removed
    # in 2.0).  The edge condition is symmetric, so each unordered pair only
    # needs to be tested once; a node paired with itself never qualifies
    # because its own cell is 1.
    nodes = list(G.nodes())
    for (c1, d1), (c2, d2) in itertools.combinations(nodes, 2):
        if df.at[d2, c1] == 0 and df.at[d1, c2] == 0:
            G.add_edge((c1, d1), (c2, d2))

    cliques = list(nx.find_cliques(G))
    if not cliques:
        # max() below would raise ValueError on an empty sequence.
        print('no coh/doc pair reached the threshold, nothing to analyse')
        return
    max_len = max(len(a) for a in cliques)
    max_cliques = [a for a in cliques if len(a) == max_len]
    print(len(max_cliques))

    clique_coh_list, clique_doc_list = coh_doc_set_span_maximal_cliques(max_cliques)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('cohs that span entire clique list %s' % (clique_coh_list,))
    print('docs that span entire clique list %s' % (clique_doc_list,))

    best_ranker, best_rank = best_clique_by_purples(max_cliques, df_true_score)
    print('best ranker\n%s %s' % (best_ranker, best_rank))

    # Find the maximum-size cliques that are least similar to the best one.
    min_similarity = min(clique_similarity(best_ranker, a) for a in max_cliques)
    min_similars = [a for a in max_cliques
                    if clique_similarity(best_ranker, a) == min_similarity]
    best_min_similar_ranker, best_min_similar_rank = best_clique_by_purples(min_similars, df_true_score)
    print('%s %s' % (best_min_similar_ranker, best_min_similar_rank))
    print(min_similarity)

    # True-score heat map for the best-ranked clique.
    show_clique_heatmap(best_ranker, df_true_score, coh_name_list, doc_name_list)
    # True-score heat map for the least similar clique.
    show_clique_heatmap(best_min_similar_ranker, df_true_score, coh_name_list, doc_name_list)
Exemplo n.º 4
0
def main(results_dir=None):
    """Build a 0/1 design-by-design matrix from score files and prune it.

    Each file in ``results_dir`` matching ``.*\\.score`` contributes one
    cell: the 2nd and 4th ``_``-separated fields of its name are looked up in
    ``design_list`` to get the row and column, and the cell is set to 1 when
    ``how_many_purples_in_file`` reports at least 12 for the file, else 0.

    Afterwards the matrix is reduced step by step.  While ``are_all_ones``
    rejects the degree vector, the value stored in the vector's last entry
    is removed with ``remove_from_matrix``, the matrix is passed through
    ``clean_all_zeros`` and the degree vector is recomputed.  The matrix,
    the vector and the ``coh_names`` / ``doc_names`` globals are printed
    after every step.

    Args:
        results_dir: directory holding the score files.  ``None`` (the
            default) selects the location that used to be hard-coded.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
        ValueError: a parsed name is not present in ``design_list``.
    """
    import numpy as np
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    # Only read here (printed in the loop); presumably maintained by the
    # helper functions -- verify against their definitions.
    global coh_names, doc_names

    if results_dir is None:
        results_dir = ('/Users/jonathan/eden/no_backup/designs/'
                       'Ct_8parts_10.2/prediction/results/')

    design_list = [
        'ct11', 'ct12', 'ct13', 'ct15', 'ct16', 'ct17', 'ct29', 'ct31', 'ct33',
        'ct36', 'ct38', 'ct41', 'ct44', 'ct45', 'ct46', 'ct47', 'ct48', 'ct49',
        'ct50', 'ct51', 'ct52', 'ct53', 'ct54', 'ct55', 'ct59'
    ]
    mtrx = np.zeros([len(design_list), len(design_list)], dtype=int)

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [
        x for x in os.listdir(results_dir) if score_pattern.match(x)
    ]

    for score_file in score_file_list:
        fields = score_file.split('_')
        coh_name, doc_name = fields[1], fields[3]
        purple_num = int(
            how_many_purples_in_file(os.path.join(results_dir, score_file)))
        row = design_list.index(coh_name)
        col = design_list.index(doc_name)
        mtrx[row, col] = 1 if purple_num >= 12 else 0

    # NOTE(review): the degree vector is computed before clean_all_zeros is
    # applied, exactly as in the original; confirm that ordering is intended.
    dof_vec = find_degree_vector(mtrx)
    print(dof_vec)
    print(mtrx)
    mtrx = clean_all_zeros(mtrx)
    while not are_all_ones(dof_vec):
        # NOTE(review): return value is discarded, as in the original.
        new_all_ones(dof_vec)
        # list(...) keeps this working when dict.values() is a view (Python 3).
        to_remove = list(dof_vec[-1].values())[0]
        mtrx = remove_from_matrix(mtrx, to_remove)
        mtrx = clean_all_zeros(mtrx)
        dof_vec = find_degree_vector(mtrx)
        print(mtrx)
        print(dof_vec)
        print(coh_names)
        print(doc_names)
Exemplo n.º 5
0
def main():
    """Iteratively prune a 0/1 coh-by-doc table, plotting every step.

    The table is built from the ``.*\\.score`` files in the current
    directory: the 2nd ``_`` field of a file name is the coh name (column),
    the 4th is the doc name (row), and the cell is 1 when
    ``how_many_purples_in_file`` reports at least 10 for the file, else 0.
    Cells without a file keep the initial value ``-1 * len(doc_name_list)``.

    While ``all_dof_ones`` rejects the table, the value stored in the last
    entry of ``find_degree_vector(df)`` is removed with ``remove_from_df``,
    the table is passed through ``clean_zeroes``, progress is printed and the
    current table is shown with ``show_prediction_heat_map``.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir('./') if score_pattern.match(x)]
    name_pairs = [(a.split('_')[1], a.split('_')[3]) for a in score_file_list]
    coh_name_list = sorted(set(coh for coh, _ in name_pairs))
    doc_name_list = sorted(set(doc for _, doc in name_pairs))

    # Scalar fill instead of Series([value], index=...): a one-element list
    # paired with a longer index is rejected by recent pandas releases.
    df = DataFrame(
        -1 * len(doc_name_list), index=doc_name_list, columns=coh_name_list)

    for score_file, (coh_name, doc_name) in zip(score_file_list, name_pairs):
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # .loc writes into df itself; df[col][row] = ... is chained
        # assignment and may silently modify a temporary copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0

    i = 1
    while not all_dof_ones(df):
        dof_vec = find_degree_vector(df)
        # list(...) keeps this working when dict.values() is a view (Python 3).
        df = remove_from_df(df, list(dof_vec[-1].values())[0])

        df = clean_zeroes(df)
        print('printing dof for %i time' % i)
        print(dof_vec[-1])
        print(find_degree_vector(df))
        # The former "if i > -1" guard was always true (i starts at 1).
        show_prediction_heat_map(df)
        i += 1
Exemplo n.º 6
0
def main():
    """Iteratively prune a 0/1 coh-by-doc table, plotting every step.

    Score files (names matching ``.*\\.score`` in the current directory)
    supply the table: column = 2nd ``_`` field of the file name (coh name),
    row = 4th field (doc name), value = 1 when ``how_many_purples_in_file``
    reports at least 10 for the file and 0 otherwise.  Cells without a file
    keep the initial value ``-1 * len(doc_name_list)``.

    The loop runs until ``all_dof_ones`` accepts the table.  Each pass takes
    the value stored in the last entry of ``find_degree_vector(df)``, removes
    it with ``remove_from_df``, applies ``clean_zeroes``, prints progress and
    shows the table with ``show_prediction_heat_map``.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir('./')
                       if score_pattern.match(x)]
    name_pairs = [(a.split('_')[1], a.split('_')[3]) for a in score_file_list]
    coh_name_list = sorted(set(coh for coh, _ in name_pairs))
    doc_name_list = sorted(set(doc for _, doc in name_pairs))

    # Scalar fill instead of Series([value], index=...): a one-element list
    # paired with a longer index is rejected by recent pandas releases.
    df = DataFrame(-1 * len(doc_name_list), index=doc_name_list, columns=coh_name_list)

    for score_file, (coh_name, doc_name) in zip(score_file_list, name_pairs):
        purple_num = int(how_many_purples_in_file('./'+score_file))
        # .loc writes into df itself; df[col][row] = ... is chained
        # assignment and may silently modify a temporary copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0

    i = 1
    while not all_dof_ones(df):
        dof_vec = find_degree_vector(df)
        # list(...) keeps this working when dict.values() is a view (Python 3).
        df = remove_from_df(df, list(dof_vec[-1].values())[0])

        df = clean_zeroes(df)
        print('printing dof for %i time' % i)
        print(dof_vec[-1])
        print(find_degree_vector(df))
        # The former "if i > -1" guard was always true (i starts at 1).
        show_prediction_heat_map(df)
        i += 1
Exemplo n.º 7
0
def main():
    """Find the largest cliques of compatible coh/doc pairs and plot two.

    Input is every file in the current directory matching ``.*\\.score``;
    the 2nd ``_`` field of the name is the coh name and the 4th the doc name.

    * ``df`` holds 1 where ``how_many_purples_in_file`` reports at least 10
      for the pair's file and 0 otherwise; ``df_true_score`` holds the raw
      count.
    * A graph is built with one node per ``(coh, doc)`` pair flagged 1.  Two
      nodes are joined when both cross pairs (coh of one with doc of the
      other) are flagged 0.
    * Cliques come from ``networkx.find_cliques``; only those of maximum
      size are kept.
    * ``best_clique_by_purples`` selects the best clique, and again the best
      among the cliques with minimal ``clique_similarity`` to it.  Both are
      printed and shown with ``show_clique_heatmap``.

    If the graph has no cliques (no pair reached the threshold) a message is
    printed and the function returns.

    Raises:
        IndexError: a score-file name has fewer than four ``_`` fields.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import itertools
    import os
    import re
    import networkx as nx

    # Raw string: same pattern as before, without the invalid '\.' escape.
    score_pattern = re.compile(r'.*\.score')
    score_file_list = [x for x in os.listdir('./') if score_pattern.match(x)]
    name_pairs = [(a.split('_')[1], a.split('_')[3]) for a in score_file_list]
    coh_name_list = sorted(set(coh for coh, _ in name_pairs))
    doc_name_list = sorted(set(doc for _, doc in name_pairs))

    # Scalar fill instead of Series([value], index=...): a one-element list
    # paired with a longer index is rejected by recent pandas releases.
    unfilled = -1 * len(doc_name_list)
    df = DataFrame(unfilled, index=doc_name_list, columns=coh_name_list)
    df_true_score = DataFrame(
        unfilled, index=doc_name_list, columns=coh_name_list)

    for score_file, (coh_name, doc_name) in zip(score_file_list, name_pairs):
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # .loc writes into the frame itself; df[col][row] = ... is chained
        # assignment and may silently modify a temporary copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0
        df_true_score.loc[doc_name, coh_name] = purple_num

    G = nx.Graph()
    for coh in coh_name_list:
        for doc in doc_name_list:
            if df.at[doc, coh] == 1:
                G.add_node((coh, doc))

    # G.nodes() exists in both NetworkX 1.x and 2.x (nodes_iter was removed
    # in 2.0).  The edge condition is symmetric, so each unordered pair only
    # needs to be tested once; a node paired with itself never qualifies
    # because its own cell is 1.
    nodes = list(G.nodes())
    for (c1, d1), (c2, d2) in itertools.combinations(nodes, 2):
        if df.at[d2, c1] == 0 and df.at[d1, c2] == 0:
            G.add_edge((c1, d1), (c2, d2))

    cliques = list(nx.find_cliques(G))
    if not cliques:
        # max() below would raise ValueError on an empty sequence.
        print('no coh/doc pair reached the threshold, nothing to analyse')
        return
    max_len = max(len(a) for a in cliques)
    max_cliques = [a for a in cliques if len(a) == max_len]
    print(len(max_cliques))

    clique_coh_list, clique_doc_list = coh_doc_set_span_maximal_cliques(
        max_cliques)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('cohs that span entire clique list %s' % (clique_coh_list,))
    print('docs that span entire clique list %s' % (clique_doc_list,))

    best_ranker, best_rank = best_clique_by_purples(max_cliques, df_true_score)
    print('best ranker\n%s %s' % (best_ranker, best_rank))

    # Find the maximum-size cliques that are least similar to the best one.
    min_similarity = min(
        clique_similarity(best_ranker, a) for a in max_cliques)
    min_similars = [
        a for a in max_cliques
        if clique_similarity(best_ranker, a) == min_similarity
    ]
    best_min_similar_ranker, best_min_similar_rank = best_clique_by_purples(
        min_similars, df_true_score)
    print('%s %s' % (best_min_similar_ranker, best_min_similar_rank))
    print(min_similarity)

    # True-score heat map for the best-ranked clique.
    show_clique_heatmap(best_ranker, df_true_score, coh_name_list,
                        doc_name_list)
    # True-score heat map for the least similar clique.
    show_clique_heatmap(best_min_similar_ranker, df_true_score, coh_name_list,
                        doc_name_list)