示例#1
0
def run(json_file, kallisto_spec, eVIPP_predict, output_name):
    """Save an UpSet plot of the specific genes shared between pathways.

    Args:
        json_file: Path to a JSON file mapping gene-set names to lists of
            gene identifiers.
        kallisto_spec: Tab-separated file; the values of its ``#gene_id``
            column are the genes to keep.
        eVIPP_predict: Tab-separated file; the values of its ``Pathway``
            column are the gene sets to keep.
        output_name: Path the figure is written to.

    Nothing is plotted or written unless more than one of the listed
    pathways is present in ``json_file``.
    """
    # Sets give O(1) membership tests; with lists every gene of every pathway
    # triggered a full scan of the gene list.
    spec_genes = set(
        pd.read_csv(kallisto_spec, sep="\t",
                    index_col="#gene_id").index.tolist())
    pathways = set(
        pd.read_csv(eVIPP_predict, sep="\t",
                    index_col="Pathway").index.tolist())

    with open(json_file) as f:
        gene_set_dict = json.load(f)

    # Keep only the requested pathways, each restricted to the specific genes.
    spec_dict = {
        name: [gene for gene in genes if gene in spec_genes]
        for name, genes in gene_set_dict.items()
        if name in pathways
    }

    if len(spec_dict) > 1:
        e = from_contents(spec_dict)

        upsetplot.plot(e,
                       sort_by='cardinality',
                       sort_categories_by='cardinality',
                       show_counts=True)
        plt.savefig(output_name)
        plt.clf()
示例#2
0
def getLevels(R, L, k):
    """Grow itemsets one level at a time and show an UpSet plot per level.

    ``R[n][i]`` is an itemset of level ``n`` and ``L[n][i]`` the collection
    passed to ``intersection`` for it (presumably transaction ids - confirm
    against the caller). Each level-``n`` itemset is combined with every
    level-1 entry for which ``checkExists`` returns False; the combination is
    kept when the intersection has at least ``k`` elements.

    ``R`` and ``L`` are extended in place. The function returns ``None`` as
    soon as a level produces no combinations.
    """
    level = 1
    while True:
        candidates = []
        candidate_tids = []
        for i, itemset in enumerate(R[level]):
            for j, single in enumerate(R[1]):
                # Fix this to work with lists
                if checkExists(single, itemset) == False:
                    shared = intersection(L[level][i], L[1][j])
                    if len(shared) >= k:
                        # Level-1 entries are bare items, deeper levels are lists.
                        if level == 1:
                            candidates.append([itemset, single])
                        else:
                            candidates.append(itemset + [single])
                        candidate_tids.append(shared)

        if len(candidates) == 0:
            return

        R.append(candidates)
        L.append(candidate_tids)
        nxt = level + 1
        R[nxt], L[nxt] = checkDuplicates(R[nxt], L[nxt])

        sizes = [len(tids) for tids in L[nxt]]
        print("\nLevel ", nxt, "-->  Number of itemsets = ", len(R[nxt]))
        print("\n")

        upset = from_memberships(R[nxt], data=sizes)
        plot(upset)
        pyplot.show()
        level = nxt
def make_UpSetPlot(GT, bp_lists, vcf_names):
    """Plot and save the overlap of breakpoints between SV callers.

    Args:
        GT: Sequence of all breakpoints; one row per entry.
        bp_lists: One collection of breakpoints per caller, in the same order
            as ``vcf_names``.
        vcf_names: Caller names, used as column names and in the file name.

    The figure is saved as ``'UpSetPlot <names>'`` in the working directory
    and then shown.
    """
    # One boolean column per caller: is each breakpoint of GT in its list?
    # (The original referenced an undefined name and appended the column once
    # per breakpoint, so columns never lined up with vcf_names.)
    dic = {}
    for vcf, bplist in zip(vcf_names, bp_lists):
        dic[vcf] = [bp in bplist for bp in GT]
    dic['breakpoints'] = GT

    df = pd.DataFrame(dic)
    cols = df.columns.difference(['breakpoints']).tolist()
    s = df.groupby(cols).size()

    plot(s, show_counts='%d', sort_by="cardinality")
    plt.title('Intersection breakpoints SV callers')

    # Save before show(): once the window is closed the current figure is
    # gone and gcf() would hand back a new, empty one.
    currentfig = plt.gcf()
    currentfig.savefig('UpSetPlot %s' % ' '.join(vcf_names))

    plt.show()
示例#4
0
    def upset_members(self, threshold=0, path=None, plot_upset=False, show_counts_bool=True, exclude_singletons_from_threshold=False, threshold_dual_cats=None, exclude_skids=None):
        """Group ids by the combination of cell types they belong to.

        Args:
            threshold: Minimum number of ids a combination needs to be selected.
            path: Prefix of the PDF written when ``plot_upset`` is true.
            plot_upset: Draw and save the UpSet plot of the selected ids.
            show_counts_bool: Annotate the plot with counts.
            exclude_singletons_from_threshold: Combinations made of a single
                cell type are selected regardless of ``threshold``.
            threshold_dual_cats: Only used together with
                ``exclude_singletons_from_threshold``; combinations of at most
                two cell types are also selected when they reach this size.
            exclude_skids: Ids removed before combinations are formed.

        Returns:
            ``(cat_types, cats_selected, skids_excluded)``: every combination
            as a ``Celltype``, the selected ones, and the ids that belong only
            to unselected combinations.
        """
        contents = {}
        for celltype in self.Celltypes:
            contents[celltype.get_name()] = celltype.get_skids()

        data = from_contents(contents)

        # `is not None` instead of `!= None`: the latter is evaluated
        # element-wise on numpy arrays and makes the `if` raise.
        if exclude_skids is not None:
            ind_dict = dict((k, i) for i, k in enumerate(data.id.values))
            inter = set(ind_dict).intersection(exclude_skids)
            indices = [ind_dict[x] for x in inter]
            data = data.iloc[np.setdiff1d(range(0, len(data)), indices)]

        # One Celltype per distinct membership pattern, named after the
        # member cell types joined with ' and '.
        unique_indices = np.unique(data.index)
        cat_types = [Celltype(' and '.join([data.index.names[i] for i, value in enumerate(index) if value]),
                     list(data.loc[index].id)) for index in unique_indices]

        cat_bool = []
        for cat in cat_types:
            size = len(cat.get_skids())
            name = cat.get_name()
            keep = size >= threshold
            if exclude_singletons_from_threshold:
                # categories with no intersection ('singletons') dodge the threshold
                keep = keep or (' and ' not in name)
                if threshold_dual_cats is not None:
                    # Names are joined with ' and ', so that is what must be
                    # counted to recognise combinations of at most two cell
                    # types (the original counted '+', which never occurs in
                    # the separator and therefore matched every category).
                    keep = keep or (size >= threshold_dual_cats and name.count(' and ') < 2)
            cat_bool.append(bool(keep))

        cats_selected = list(np.array(cat_types)[cat_bool])
        skids_selected = [x for sublist in [cat.get_skids() for cat in cats_selected] for x in sublist]

        # keep only the rows whose id belongs to a selected combination
        ind_dict = dict((k, i) for i, k in enumerate(data.id.values))
        inter = set(ind_dict).intersection(skids_selected)
        indices = [ind_dict[x] for x in inter]

        data = data.iloc[indices]

        # ids that are not plotted because their combination was not selected
        all_skids = [x for sublist in [cat.get_skids() for cat in cat_types] for x in sublist]
        skids_excluded = list(np.setdiff1d(all_skids, skids_selected))

        if plot_upset:
            if show_counts_bool:
                plot(data, sort_categories_by=None, show_counts='%d')
            else:
                plot(data, sort_categories_by=None)

            if threshold_dual_cats is None:
                plt.savefig(f'{path}_excluded{len(skids_excluded)}_threshold{threshold}.pdf', bbox_inches='tight')
            else:
                plt.savefig(f'{path}_excluded{len(skids_excluded)}_threshold{threshold}_dual-threshold{threshold_dual_cats}.pdf', bbox_inches='tight')

        return (cat_types, cats_selected, skids_excluded)
示例#5
0
    def start(self):
        """Load the p-value columns, optionally FDR-correct them, and save an UpSet plot.

        One figure is written per entry of ``self.extensions`` as
        ``<self.outdir>/<self.name>.<extension>``.
        """
        self.print_arguments()
        print("Loading data.")
        decon_df = self.load_file(self.decon_path, nrows=None)

        # Only columns whose name contains "pvalue" are analysed.
        pvalue_columns = [col for col in decon_df.columns if "pvalue" in col]
        decon_df = decon_df[pvalue_columns]

        if self.calc_fdr:
            print("Calculating FDR.")
            _, decon_df = self.bh_correct(decon_df)
            variable = "FDR"
        else:
            variable = "p-value"
        self.print_n_signif(df=decon_df, variable=variable)

        print("Preprocessing data.")
        parsed = self.parse_df(decon_df, self.alpha)
        counts = self.count(parsed)
        counts = counts[counts > 0]
        print(counts)

        print("Creating plot.")
        up.plot(counts, sort_by='cardinality', show_counts=True)
        for extension in self.extensions:
            filename = "{}.{}".format(self.name, extension)
            plt.savefig(os.path.join(self.outdir, filename))
        plt.close()
示例#6
0
def test_two_sets(set1, set2):
    """Regression check: plotting must not fail when some set has no items."""
    frame = pd.DataFrame({'val': [5, 7],
                          'set1': set1,
                          'set2': set2})
    values = frame.set_index(['set1', 'set2'])['val']
    fig = matplotlib.figure.Figure()
    plot(values, fig)
示例#7
0
def test_dataframe_raises():
    """``plot`` on this DataFrame must raise ValueError matching 'sum_over must be'."""
    columns = {'val': [5, 7],
               'set1': [False, True],
               'set2': [True, True]}
    df = pd.DataFrame(columns).set_index(['set1', 'set2'])
    fig = matplotlib.figure.Figure()
    with pytest.raises(ValueError, match='sum_over must be'):
        plot(df, fig)
示例#8
0
def Upsetplotting(name_file,name_output,folder):
    """Save an UpSet plot of the 12 most frequent region combinations.

    Args:
        name_file: Tab-separated file with one header line. Fields 7 to 12
            (0-based) are read as the flags Upstream, UTR5, Exon, Intron,
            UTR3 and Downstream; a flag is set when the field equals ``"1"``.
        name_output: Prefix of the output file name.
        folder: Directory the PDF is written to.
    """
    regions = ["Upstream", "UTR5", "Exon", "Intron", "UTR3", "Downstream"]
    first_field = 7

    rows = []
    # `with` closes the handle; the original left the file open.
    with open(name_file) as data:
        next(data, None)  # skip the header line (tolerates an empty file)
        for line in data:
            # Strip the newline so that a flag in the last field of a line
            # can still compare equal to "1"; split once instead of six times.
            fields = line.rstrip('\n').split('\t')
            rows.append([fields[first_field + offset] == "1"
                         for offset in range(len(regions))])

    concat = pd.DataFrame(rows, columns=regions)
    result = concat.groupby(regions).size()
    result = result.nlargest(12)

    plot(result, sort_by = "cardinality")
    pyplot.suptitle("Intersection size")
    pyplot.savefig(folder+"/"+name_output+"_Upsetplot.pdf")
示例#9
0
def generate_upsetplot(rapport, names, min_alt, path):
    """Save an UpSet plot of the SNVs supported in each sample.

    Args:
        rapport: DataFrame with one column per sample in ``names``. Each cell
            is a string whose part before the first ``'/'`` is an integer
            read count. After ``reset_index()`` the frame must expose an
            ``SNV`` column.
        names: Sample column names.
        min_alt: Minimum read count for an SNV to count as supported.
        path: Directory the PNG ``upsetplot_<min_alt>.png`` is written to.
    """
    # Parse the count in front of '/' once, column-wise, instead of once per
    # row for the filter and again per column for the masks.
    alt_counts = rapport[names].apply(
        lambda column: column.str.split('/').str[0].astype(int))
    supported = alt_counts >= min_alt

    # remove SNVs that reach min_alt in none of the samples
    supported = supported[supported.any(axis=1)]

    # Boolean columns directly; no patching of True/False into string columns.
    upsetframe = supported.reset_index()

    samples = [c for c in upsetframe.columns if c != 'SNV']
    samples_count_series = upsetframe.fillna(False).groupby(
        samples).count()['SNV']

    upsetplot.plot(samples_count_series, sort_by='cardinality')
    current_figure = plt.gcf()
    plt.title("Overlaps strict filtered SNVs", fontsize=15)
    plt.ylabel("SNV count")

    current_figure.savefig(
        os.path.join(path, 'upsetplot_' + str(min_alt) + '.png'))
示例#10
0
def plot_intersection(ins_dict, save_fig=False):
    """
    Visualize an upsetplot displaying, for every combination of instruments,
    the count returned by ``_count_intersection`` for their subject sets

    Parameters
    ----------
    ins_dict: dictionary
        maps an instrument name to an object whose ``index`` holds subjects
    save_fig: bool
        save the plot as pdf under ``ut.out_folder`` instead of showing it
    """

    ins_names = list(ins_dict.keys())

    # every non-empty combination of instrument names, smallest first
    list_comb = [
        list(combo)
        for size in range(1, len(ins_names) + 1)
        for combo in combinations(ins_names, size)
    ]

    list_uniquesubj = [[set(ins_dict[n].index) for n in lc]
                       for lc in list_comb]
    int_counts = [_count_intersection(subjects)
                  for subjects in list_uniquesubj]

    inter_plot = from_memberships(list_comb, data=int_counts)
    plot(inter_plot,
         show_counts='%d',
         element_size=50,
         orientation='horizontal')

    if not save_fig:
        plt.show()
        return
    plt.savefig(os.path.join(ut.out_folder, 'intersection_plot'),
                format='pdf')
示例#11
0
def test_dataframe_raises():
    """``plot`` on this DataFrame must raise ValueError asking for subset_size."""
    columns = {
        'val': [5, 7],
        'set1': [False, True],
        'set2': [True, True]
    }
    df = pd.DataFrame(columns).set_index(['set1', 'set2'])
    fig = matplotlib.figure.Figure()
    with pytest.raises(ValueError, match='Please specify subset_size or '):
        plot(df, fig)
def plot_protein_upset(protein_dict):
    """Draw the UpSet plot for ``protein_dict`` and save it as SVG and PNG.

    ``protein_dict`` is handed to ``upsetplot.from_contents``; the figures are
    written to the working directory.
    """
    upsetplot.plot(
        upsetplot.from_contents(protein_dict),
        sort_by='cardinality',
        subset_size='auto',
        facecolor='#21918cff',
    )
    plt.title("Distribution of Protein Overlap")

    for filename in ("Protein_upset.svg", "Protein_upset.png"):
        plt.savefig(filename)
示例#13
0
 def upsetplot(self, data, title, outdir, extension):
     """Plot the non-zero counts of ``data`` and save the figure.

     The file is ``<outdir>/<title>_upsetplot.<extension>``; underscores in
     ``title`` are shown as spaces in the figure heading.
     """
     all_counts = self.count(data)
     nonzero = all_counts[all_counts > 0]
     up.plot(nonzero, sort_by='cardinality', show_counts=True)

     heading = '{}'.format(title.replace("_", " "))
     plt.suptitle(heading, fontsize=18, fontweight='bold')

     filename = "{}_upsetplot.{}".format(title, extension)
     plt.savefig(os.path.join(outdir, filename))
     plt.close()
示例#14
0
def writeIntersectionPlot(inputIterators, iter):
    """Save an UpSet plot of which rows are known to which input.

    Args:
        inputIterators: Objects with ``name`` and ``id`` attributes; one set
            is built per object.
        iter: Iterable of rows exposing ``getMeta(id)``. A row belongs to an
            input's set unless ``getMeta`` returns
            ``CircRow.META_INDEX_CIRC_NOT_IN_DB`` for that input's id.

    The figure is written to ``./output/out.png``.
    """
    # Materialise once: a generator or other one-shot iterable would be
    # exhausted after the first input, leaving every later set empty.
    rows = list(iter)

    contents = {}
    for circIter in inputIterators:
        contents[circIter.name] = [
            c for c in rows
            if (c.getMeta(circIter.id) != CircRow.META_INDEX_CIRC_NOT_IN_DB)
        ]

    df = from_contents(contents)
    plot(df, facecolor="red", sort_by="cardinality", show_counts='%d')
    pyplot.savefig('./output/out.png')
示例#15
0
def build_diagrams(consolidated_data, graph_storage_path, condition):
    """Save one plot for the 'down' data and one for the 'up' data.

    The figures are written to ``graph_storage_path`` as
    ``<condition>_down_.png`` and ``<condition>_up_.png``.
    """
    dn_list, doses_list, up_list = unpack_consolidated_data(consolidated_data)
    index = build_graph_index(doses_list)

    # Pair each data list with its file-name tag instead of testing a loop
    # counter with `is 0` (identity comparison against an int literal).
    for tag, data_list in (('_down_', dn_list), ('_up_', up_list)):
        ser = build_graph_data(data_list, index)
        plot(ser)
        pyplot.savefig(os.path.join(graph_storage_path, condition + tag + '.png'))
示例#16
0
def test_plot_smoke_test(kw):
    """Smoke test: ``plot`` runs with the keyword arguments in ``kw``."""
    X = generate_data(n_samples=100)

    # With an explicit figure, which must then be renderable to PNG
    explicit_fig = matplotlib.figure.Figure()
    plot(X, explicit_fig, **kw)
    explicit_fig.savefig(io.BytesIO(), format='png')

    # Without a figure: exactly one new pyplot figure, and it has axes
    figures_before = len(plt.get_fignums())
    plot(X, **kw)
    figures_after = len(plt.get_fignums())
    assert figures_after - figures_before == 1
    assert plt.gcf().axes
 def plot_peptide_upset(self, save=False):
     """Draw the UpSet plot for ``self.peptide_dict``.

     With ``save`` set, the figure is also written to the working directory
     as SVG and PNG.
     """
     upsetplot.plot(
         upsetplot.from_contents(self.peptide_dict),
         sort_by='cardinality',
         subset_size='auto',
         facecolor='#21918cff',
     )
     plt.title("Distribution of Peptide Overlap")

     if not save:
         return
     for filename in ("Peptide_upset.svg", "Peptide_upset.png"):
         plt.savefig(filename)
示例#18
0
def upset(index):
    """Print a summary for column ``index`` of ``sets`` and plot it.

    Rows of the module-level ``clusters`` are selected where
    ``sets[:, index] > 0``. Nothing is plotted when there are more than 40
    distinct entries or more than 20 distinct labels.
    """
    selected_rows = np.where(sets[:, index] > 0)
    selection = clusters[selected_rows]
    items, counts = np.unique(selection, return_counts=True)

    subset = from_memberships(items, counts)
    sub_classes = np.unique([label for labels in items for label in labels])

    print("Root Class: ", unique_clusters[index])
    print("# Papers: ", len(selection))
    print("# Labels: ", len(sub_classes))
    print("# Classes: ", len(items))

    too_many = len(items) > 40 or len(sub_classes) > 20
    if too_many:
        print("Too many items")
        return
    plot(subset)
示例#19
0
def create_plot(gnps_task, metadata_column, metadata_terms,
                intensity_threshold):
    """Render an UpSet plot of metadata terms per feature.

    Args:
        gnps_task: Identifier passed to ``_get_task_df`` to load the data.
        metadata_column: Column whose values are grouped per ``featureid``.
        metadata_terms: Terms to keep; other values of the column are ignored.
        intensity_threshold: Rows with ``featurearea`` at or below this value
            (converted with ``float``) are dropped.

    Returns:
        A one-element list with an ``html.Img`` whose source is
        ``/plot/<uuid>``; the SVG is written to ``./output/<uuid>.svg``.
    """
    data_df = _get_task_df(gnps_task)

    metadata_terms = set(metadata_terms)

    INTENSITY_THRESHOLD = float(intensity_threshold)

    data_df = data_df[data_df["featurearea"] > INTENSITY_THRESHOLD]

    # For every feature: which of the requested terms occur for it.
    # Errors propagate unchanged; the former bare `except` only printed
    # "ERROR" before re-raising.
    membership = [
        list(set(group_df[metadata_column]) & metadata_terms)
        for _, group_df in data_df.groupby("featureid")
    ]

    upset_data_df = from_memberships(membership)

    plot(upset_data_df,
         subset_size="count",
         sort_by="cardinality",
         orientation="horizontal",
         show_counts=True)

    uuid_save = str(uuid.uuid4())
    try:
        pyplot.savefig("./output/{}.svg".format(uuid_save))
    finally:
        # Each call creates a new pyplot figure; close it so repeated calls
        # do not accumulate open figures.
        pyplot.close()

    return [html.Img(src="/plot/{}".format(uuid_save))]
    def plot_species_intersections(self, color, ignore_counts=0, orientation='horizontal'):
        """Plot how many distinct entries each species combination holds.

        Args:
            color: Face colour passed to the plot.
            ignore_counts: Combinations with this many entries or fewer are
                left out.
            orientation: Orientation passed to the plot.

        Returns:
            Whatever ``plot`` returns.
        """
        species_groups, _ = self.orthogroups_sets()

        memberships = list(species_groups)
        data = [len(set(species_groups[k])) for k in memberships]

        structured_data = from_memberships(memberships, data=data)

        species_dict = {'P8084_finalAssembly': 'P.betacei',
                        'P_cactorum_10300': 'P.cactorum',
                        'P_infestans_RefSeq': 'P. infestans',
                        'P_palmivora_LILI_trCDS': 'P.palmivora',
                        'P_parasitica_INRA310': 'P.parasitica',
                        'P_ramorum_Pr102': 'P.ramorum',
                        'P_sojae_V3': 'P.sojae'}

        # Species without a display name keep their original identifier
        # instead of aborting the plot with a KeyError.
        new_names = [species_dict.get(old_name, old_name)
                     for old_name in structured_data.index.names]
        structured_data.index.names = new_names

        structured_data = structured_data[structured_data > ignore_counts].copy()
        p = plot(structured_data,
                 orientation=orientation,
                 show_counts=True,
                 facecolor=color,
                 element_size=40)

        return p
示例#21
0
def main():
    """Legacy entry point: combine the input VCFs and save ``UpSetPlot.png``.

    Prints a deprecation notice to stderr first.
    """
    args = get_args()
    sys.stderr.write("Deprecation warning:\n"
                     "This script is here for legacy purposes.\n"
                     "You should use _surpyvor upset_ instead.")

    ignore_type = "-1" if args.ignore_type else "1"
    normalized = [normalize_vcf(s) for s in args.variants]
    combined_vcf = survivor(samples=normalized,
                            distance=args.distance,
                            ignore_type=ignore_type,
                            minlength=args.minlength,
                            save=args.store)

    # Fall back to the input paths when no explicit names were given.
    set_names = args.names or args.variants
    upsets = make_sets(vcf=combined_vcf, names=set_names)
    plot(upsets, sort_by='cardinality')
    plt.savefig("UpSetPlot.png")
示例#22
0
 def generate_report(self, report_path=tempfile.gettempdir()):
     """Write availability reports for all entries into ``report_path``.

     Produces ``detailed_counts.md`` (one row per entry, one column per
     source), ``summary.md`` (number of entries per combination of sources)
     and ``global_upset.png``, then prints the entries found in no source and
     those matching the second filter below.

     Args:
         report_path: Output directory; created if missing.
     """
     os.makedirs(report_path, exist_ok=True)

     source_names = [source_class.SOURCE for source_class in SOURCE_CLASSES]
     rows = {}
     for entry in self.entries:
         rows[entry.biotools_id] = [
             source.is_available() for source in entry.sources.values()
         ]
     df = pd.DataFrame.from_dict(rows, orient='index', columns=source_names)

     # Computed once; used for both the plot and the summary table.
     counts = df.groupby(source_names).size()
     plot(counts, show_counts=True)

     # Explicit UTF-8: the tick/cross characters cannot be encoded by every
     # platform default encoding.
     with open(os.path.join(report_path, 'detailed_counts.md'),
               'w', encoding='utf-8') as md_file:
         df.replace({
             True: '✓',
             False: '🗙'
         }).to_markdown(buf=md_file, tablefmt='github')
     with open(os.path.join(report_path, 'summary.md'), 'w',
               encoding='utf-8') as md_file:
         summary_df = counts.copy()
         pretty_index = []
         for idx_row in summary_df.index:
             pretty_index.append(tuple(
                 'No ' + summary_df.index.names[cell_idx]
                 if not cell else summary_df.index.names[cell_idx]
                 for cell_idx, cell in enumerate(idx_row)
             ))
         # The original called reindex() and discarded the result, so the
         # readable labels never reached the file. Relabel the copy instead.
         if pretty_index:
             summary_df.index = pd.MultiIndex.from_tuples(
                 pretty_index, names=counts.index.names)
         summary_df.to_markdown(buf=md_file, tablefmt='github')
     pyplot.savefig(os.path.join(report_path, 'global_upset.png'))
     print(df[(df["biotools"] == False) & (df["bioschemas"] == False) &
              (df["OEB"] == False) & (df["OEB Metrics"] == False) &
              (df["Debian"] == False) & (df["BioConda"] == False) &
              (df["BioContainers"] == False) & (df["Biii"] == False)])
     # NOTE(review): this filter requires Biii == False while every other
     # source must be True - confirm that this is intended.
     print(df[(df["biotools"] == True) & (df["bioschemas"] == True) &
              (df["OEB"] == True) & (df["OEB Metrics"] == True) &
              (df["Debian"] == True) & (df["BioConda"] == True) &
              (df["BioContainers"] == True) & (df["Biii"] == False)])
示例#23
0
def test_matrix_plot_margins(x):
    """Non-regression test addressing a bug where there are large whitespace
       margins around the matrix when the number of intersections is large"""
    matrix_axes = plot(x)['matrix']

    # Each matrix column is expected to take up one unit on the x-axis
    low, high = matrix_axes.get_xlim()
    assert len(x) - 1 == high - low
示例#24
0
def test_matrix_plot_margins(x, orientation):
    """Non-regression test addressing a bug where there are large whitespace
       margins around the matrix when the number of intersections is large"""
    matrix_axes = plot(x, orientation=orientation)['matrix']

    # The checked axis is x for horizontal plots and y otherwise; each entry
    # of `x` is expected to take up one unit along it.
    if orientation == 'horizontal':
        lim = matrix_axes.get_xlim()
    else:
        lim = matrix_axes.get_ylim()
    assert len(x) - 1 == lim[1] - lim[0]
示例#25
0
    def start(self):
        """Load the data, apply ``self.bh_correct`` and save an UpSet plot.

        One figure is written per entry of ``self.extensions`` as
        ``<self.outdir>/eQTL_upsetplot.<extension>``.
        """
        self.print_arguments()
        print("Loading data.")
        decon_df = self.load_file(self.decon_path, nrows=None)

        print("Calculating FDR.")
        _, decon_fdr_df = self.bh_correct(decon_df)

        print("Preprocessing data.")
        parsed = self.parse_df(decon_fdr_df, self.alpha)
        all_counts = self.count(parsed)
        counts = all_counts[all_counts > 0]
        print(counts)

        print("Creating plot.")
        up.plot(counts, sort_by='cardinality', show_counts=True)
        for extension in self.extensions:
            filename = "eQTL_upsetplot.{}".format(extension)
            plt.savefig(os.path.join(self.outdir, filename))
        plt.close()
示例#26
0
def plotSetIntersections(df: DataFrame, labels: List[str],
                         unique_id: str) -> None:
    """ Plots set sizes and their intersections
        Args:
            df:                         dataframe with the label columns and the unique id column
            labels:                     names of the label columns; values are cast to bool
            unique_id:                  name of the column that is counted per label combination

        Returns:
            None
    """

    selected_columns = labels + [unique_id]
    as_bool = df[selected_columns].astype(bool)
    per_combination = as_bool.groupby(labels).count()
    counts = per_combination[unique_id]

    upsetplot.plot(counts,
                   subset_size="sum",
                   show_counts="%d",
                   sort_by="cardinality")
    plt.suptitle("Multiple tags per comment")
    plt.show()
示例#27
0
def create_upset(df, cols, **args):
    """Plot how often each combination of ``cols`` is set in ``df``.

    A column counts as set in a row when its value differs from ``""``,
    ``False`` and ``0``. Extra keyword arguments are forwarded to the plot.
    """
    ps = powerset(cols)

    # Number of rows per True/False pattern over `cols`.
    counts_by_combo = collections.Counter(
        tuple(r[c] != "" and r[c] != False and r[c] != 0 for c in cols)
        for _, r in df.iterrows()
    )

    # Keep the patterns that occur at least once, in powerset order.
    observed = []
    for s in ps:
        boolvec = makebool(s, cols)
        occurrences = counts_by_combo[tuple(boolvec)]
        if occurrences > 0:
            observed.append((boolvec, occurrences))
    boolvecs = [boolvec for boolvec, _ in observed]
    counts = [occurrences for _, occurrences in observed]

    multiindex = pd.MultiIndex.from_tuples(boolvecs, names=cols)
    upsetdata = pd.Series(counts, index=multiindex)
    upsetdata = upsetdata.sort_values(ascending=False)
    usp.plot(upsetdata, sort_by='cardinality', **args)
示例#28
0
    def start(self):
        """Read the gene table, group genes per cell-type group and save an UpSet plot.

        The figure is written to ``<self.outdir>/markergenes_upsetplot.png``.
        """
        print("Loading data.")
        df = pd.read_excel(self.data_path, header=0, index_col=None, sheet_name="Sheet2")
        print(df["CellType"].unique())
        df["group"] = df["CellType"].map(self.trans_dict)
        print(df)

        print("Preprocessing data.")
        # One set of gene names per group.
        data = {
            ct: set(df.loc[df["group"] == ct, "GeneName"].tolist())
            for ct in df["group"].unique()
        }
        print(data)
        all_counts = self.count(data)
        counts = all_counts[all_counts > 0]
        print(counts)

        print("Creating plot.")
        up.plot(counts, sort_by='cardinality', show_counts=True)
        output_file = os.path.join(self.outdir, "markergenes_upsetplot.png")
        plt.savefig(output_file)
        plt.close()
示例#29
0
def plot_upset(ax):
    """Draw the UpSet plot of the hard-coded TP53 / MDM4 patient counts.

    ``ax`` is accepted but not used by the body.
    """
    memberships = [
        [' TP53 WT', ' MDM4 WT'],
        [' TP53 WT', ' MDM4 amp.'],
        [' TP53 mutant', ' MDM4 WT'],
        [' TP53 mutant', ' MDM4 amp.'],
    ]
    data = np.array([795., 27., 182., 7.])
    example = from_memberships(memberships, data=data)

    # The result of plot() is unpacked into exactly four names, none of which
    # is used afterwards.
    intersections, matrix, shading, totals = plot(example,
                                                  with_lines=True,
                                                  show_counts=True,
                                                  element_size=50)
    plt.ylabel('Number of patients', fontproperties)
示例#30
0
def plot_graph(res, path):
    """ From upset_plot data Plot upset plots and store corresponding data

    Args:
        res: Mapping of a result type to a dict whose keys are category
            memberships and whose values are the matching counts.
        path: Output directory. Figures go to ``<path>/figures/<type>.png``
            and the data to ``<path>/data_upset.json``.
    """

    path_figures = f"{path}/figures"
    import os
    os.makedirs(path_figures, exist_ok=True)

    # Defined up front so that an empty `res` writes empty lists instead of
    # failing with a NameError after the loop.
    liste_cats = []
    data_out = []
    for typ_res, dic in res.items():
        liste_cats = sorted(dic.keys())
        data_out = [dic[cat] for cat in liste_cats]
        example = from_memberships(liste_cats, data=data_out)
        plot(example)
        pyplot.savefig(f"{path_figures}/{typ_res}.png")
        # One figure per result type; close it so they do not pile up.
        pyplot.close()

    print(f"  figures stored in '{path_figures}/'")

    # NOTE(review): only the categories and counts of the last result type
    # are written - confirm whether every type should be stored.
    path_upset = f"{path}/data_upset.json"
    write_json_file(path_upset, [liste_cats, data_out])
    print(f"  output file in upset plot format stored in '{path_upset}'")
    print(f"  output file in upset plot format stored in '{path_upset}'")