def propNoteGraph(data_test, b_u, b_i, mu, L, R):
    # Give the interesting graphic
    index_note = np.arange(1, 6)
    count_1 = np.zeros([5, 2])
    count_2 = np.zeros([5, 2])
    notes = DataFrame(count_1, index=index_note, columns=['BON', 'MAUVAIS'])
    notes_naif = DataFrame(count_2, index=index_note, columns=['BON', 'MAUVAIS'])
    for r in range(data_test.shape[0]):
        # r_pred = round(mu + b_u[data_test.user_id.values[r]] + b_i[data_test.movie_id.values[r]] + X[data_test.user_id.values[r], data_test.movie_id.values[r]])
        mean = mu + b_u[data_test[r, 0]] + b_i[data_test[r, 1]]
        r_pred = round(mean + np.dot(L[data_test[r, 0], :], R[data_test[r, 1], :]))
        r_pred = min(5, r_pred)
        r_pred = max(1, r_pred)
        r_true = int(round(mean + data_test[r, 2]))
        r_naif = round(mean)
        if r_naif == r_true:
            notes_naif.BON[r_true] += 1
        else:
            notes_naif.MAUVAIS[r_true] += 1
        if r_pred == r_true:
            notes.BON[r_true] += 1
        else:
            notes.MAUVAIS[r_pred] += 1
    notes_naif_prop = notes_naif.div(notes_naif.sum(1), axis=0)
    notes_prop = notes.div(notes.sum(1), axis=0)
    notes_naif_VS_algo = pd.concat([notes_prop.BON, notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO', 'NAIF']
    return notes_naif_VS_algo
def find_degree_vector(dfi):
    from pandas import DataFrame as DF
    results = [{dof: (dfi.columns[i], 'coh')} for i, dof in enumerate(DF.sum(dfi))]
    temp = [{dof: (dfi.index[i], 'doc')} for i, dof in enumerate(DF.sum(dfi, axis=1))]
    results.extend(temp)
    # results = [{i[0]: (i[1], 'coh')} for i in DF.sum(dfi)]
    return sorted(results)
def hmm_build(alphabet, aln, threshold, sigma): '''Given alphabet, multiple alignment aln, insertion threshold and pseudocount sigma, return the profile HMM transition and emission matrix.''' aln_cols = list(zip(*(aln))) m, n = len(aln), len(aln_cols) # m sequences, n columns # indices of columns where '-' count is below threshold match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold] # state names k = len(match_cols) # k states states_ = ['M{0} D{0} I{0}'.format(i).split() for i in range(1, k + 1)] states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E'] # building matrices transitions = DataFrame(data=0.0, columns=states, index=states) emissions = DataFrame(data=0.0, columns=alphabet, index=states) for seq in aln: # iterate through each sequence state_ix = 0 last_state = 'S' for i in range(n): if i in match_cols: state_ix += 1 if seq[i] != '-': current_state = 'M' + str(state_ix) emissions.loc[current_state, seq[i]] += 1 else: current_state = 'D' + str(state_ix) transitions.loc[last_state, current_state] += 1 last_state = current_state elif seq[i] != '-': current_state = 'I' + str(state_ix) transitions.loc[last_state, current_state] += 1 emissions.loc[current_state, seq[i]] += 1 last_state = current_state transitions.loc[last_state, 'E'] += 1 # scale rows to [0, 1] transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3) emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3) #add pseudocounts transitions.iloc[:2, 1:4] += sigma transitions.iloc[-4:-1, -2:] += sigma for i in range(k): transitions.iloc[i*3-1:i*3+2, i*3+1:i*3+4] += sigma emissions.iloc[i*3+1:i*3+3, :] += sigma emissions.iloc[-2, :] += sigma # scale again transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3) emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3) return transitions, emissions
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1),
                       index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
def summaryStatDataFrame():
    df = DataFrame(np.arange(12).reshape(4, 3),
                   index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                   columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
    df.index.names = ['key1', 'key2']
    df.columns.names = ['state', 'color']
    print(df)
    print('Sum of key1: \n{}'.format(df.sum(level='key1')))
    print('Sum of key2: \n{}'.format(df.sum(level='key2')))
    print('Sum of state: \n{}'.format(df.sum(level='state', axis=1)))
    print('Sum of color: \n{}'.format(df.sum(level='color', axis=1)))
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use by
    stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
def _get_most_frequent_word(lower_rank_bound: int,
                            dtm_data_frame: pd.DataFrame) -> pd.DataFrame:
    """Get the most frequent words in final_matrix and words.

    The new count matrix will consist of only the most frequent words in
    the whole corpus.
    :param lower_rank_bound: the lowest rank to remain in the matrix
        (the rank is determined by the word's number of appearances
        in the whole corpus, ranked from high to low)
    :param dtm_data_frame: the dtm in the form of a pandas data frame;
        the indices (rows) are segment names and the columns are words.
    :return: dtm data frame with only the most frequent words
    """
    # get the word count of each word in the corpus (a pandas series)
    corpus_word_count: pd.Series = dtm_data_frame.sum(axis='index')
    # sort the word list
    sorted_word_count: pd.Series = corpus_word_count.sort_values(ascending=False)
    # get the first "lower_rank_bound" number of items
    most_frequent_counts: pd.Series = sorted_word_count.head(lower_rank_bound)
    # get the most frequent words (the index of the count)
    most_frequent_words = most_frequent_counts.index
    return dtm_data_frame[most_frequent_words]
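# A small usage sketch for _get_most_frequent_word above (the term and segment
# names are invented): with rank bound 2, only the two most frequent terms
# across the whole corpus are kept as columns.
import pandas as pd

dtm = pd.DataFrame({"the": [4, 3], "cat": [1, 2], "sat": [0, 1]},
                   index=["seg-1", "seg-2"])
print(_get_most_frequent_word(2, dtm))  # keeps only the "the" and "cat" columns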
def getNgrams(query, corpus, startYear, endYear, smoothing):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=1)
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    if res:
        data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
        df = DataFrame(data)
        df_sum = df.sum(axis=1)
        final_sum = df_sum.loc[[0]]
    else:
        df = DataFrame()
    # `filename` (and `final_sum`, when no results are returned) are assumed to
    # be defined elsewhere in the module.
    final_sum.to_csv(filename, mode='a', header=False, index=False)
    print('Data saved to %s' % filename)
    return req.url, params['content'], df
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    print("got %d signals from %d devices" % (len(signals), len(devices)))
    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")
    # let's only keep frequencies with all signals present
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total": candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort_values(["weakest", "total"],
                                              ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))
    signals.to_csv("spectrum.csv")

    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter
    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use by
    stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {} (threshold = {}, only research-threads)".format(
            project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
def hmm_build(alphabet, aln, threshold): '''given alphabet, multiple alignment aln, and insertion threshold, return the profile HMM transition and emission matrix.''' aln_cols = list(zip(*(aln))) m, n = len(aln), len(aln_cols) # m sequences, n columns # indices of columns where '-' count is below threshold match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold] # state names k = len(match_cols) # k states states_ = [('M'+ str(i), 'D' + str(i), 'I' + str(i)) for i in range(1, k + 1)] states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E'] # building matrices transitions = DataFrame(data=0.0, columns=states, index=states) emissions = DataFrame(data=0.0, columns=alphabet, index=states) for seq in aln: # iterate through each sequence state_ix = 0 last_state = 'S' for i in range(n): if i in match_cols: state_ix += 1 if seq[i] != '-': current_state = 'M' + str(state_ix) emissions.loc[current_state, seq[i]] += 1 else: current_state = 'D' + str(state_ix) transitions.loc[last_state, current_state] += 1 last_state = current_state elif seq[i] != '-': current_state = 'I' + str(state_ix) transitions.loc[last_state, current_state] += 1 emissions.loc[current_state, seq[i]] += 1 last_state = current_state transitions.loc[last_state, 'E'] += 1 # normalize rows transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3) emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3) return transitions, emissions
def agg(self):
    dframe = DataFrame(index=[0])
    columns = [
        Series([col]) for col in [self.column.sum(), len(self.column)]]
    dframe = self._build_dframe(dframe, columns)
    dframe = DataFrame([dframe.sum().to_dict()])
    return self._add_calculated_column(dframe)
def agg(self):
    dframe = DataFrame(index=self.column.index)
    dframe = self._build_dframe(dframe, self.columns)
    column_names = [self._name_for_idx(i) for i in range(0, 2)]
    dframe = dframe.dropna(subset=column_names)
    dframe = DataFrame([dframe.sum().to_dict()])
    return self._add_calculated_column(dframe)
def numpy_dot(): ''' Imagine a point system in which each country is awarded 4 points for each gold medal, 2 points for each silver medal, and one point for each bronze medal. Using the numpy.dot function, create a new dataframe called 'olympic_points_df' that includes: a) a column called 'country_name' with the country name b) a column called 'points' with the total number of points the country earned at the Sochi olympics. You do not need to call the function in your code when running it in the browser - the grader will do that automatically when you submit or test it. ''' countries = ['Russian Fed.', 'Norway', 'Canada', 'United States', 'Netherlands', 'Germany', 'Switzerland', 'Belarus', 'Austria', 'France', 'Poland', 'China', 'Korea', 'Sweden', 'Czech Republic', 'Slovenia', 'Japan', 'Finland', 'Great Britain', 'Ukraine', 'Slovakia', 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan'] gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0] bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1] olympic_medal_counts = {'country_name':countries, 'gold': Series(gold), 'silver': Series(silver), 'bronze': Series(bronze)} df = DataFrame(olympic_medal_counts) gold_points = df[['gold']].applymap(lambda x: x*4) silver_points = df[['silver']].applymap(lambda x: x*2) bronze_points = df[['bronze']].applymap(lambda x: x*1) medal_points = DataFrame({'gold_points': gold_points.ix[:,0], 'silver_points': silver_points.ix[:,0], 'bronze_points':bronze_points.ix[:,0]}) medal_sums = medal_points.sum(axis=1) #instructors solution #medal_counts = df[['gold', 'silver', 'bronze']] #points = numpy.dot(medal_counts, [4, 2, 1]) olympic_points_df = DataFrame({'country_name': countries, 'points': medal_sums}) return olympic_points_df
def test_transpose_empty_preserves_datetimeindex(self):
    # GH#41382
    df = DataFrame(index=DatetimeIndex([]))
    expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None)
    result1 = df.T.sum().index
    result2 = df.sum(axis=1).index
    tm.assert_index_equal(result1, expected)
    tm.assert_index_equal(result2, expected)
def test_nan_int_timedelta_sum(self):
    # GH 27185
    df = DataFrame(
        {
            "A": Series([1, 2, NaT], dtype="timedelta64[ns]"),
            "B": Series([1, 2, np.nan], dtype="Int64"),
        }
    )
    expected = Series({"A": Timedelta(3), "B": 3})
    result = df.sum()
    tm.assert_series_equal(result, expected)
def uncondExact2x2DF(df: pd.DataFrame, **kwargs) -> pd.Series:
    from rpy2.robjects.packages import importr
    assert df.shape == (2, 2), "Input dataframe must be of shape 2x2"
    exact2x2 = importr("exact2x2")
    c1 = int(df.iloc[0, 0])
    c2 = int(df.iloc[1, 0])
    n1, n2 = [int(x) for x in df.sum(axis=1)]
    res_d = uncondExact2x2(c1, n1, c2, n2, **kwargs)
    return pd.Series(res_d)
def show_ordered_ssm ( self, df:pd.DataFrame, name:str = 'NotProvided', ticks_interval:int =10, figsize:Tuple[int,int] =(10,9), isAnnot:bool=False ) -> pd.DataFrame: """ Get the Self-Similarity Matrix in an ordered form, where the first column has the highest sum. Parameters ---------- :param df : The Self-Similarity Matrix in pandas DataFrame format, where indexs and columns are same i.e. caseIDs. :param name : The name to be shown on the plot title. (default: NotProvided). :param ticks_interval : The interval of ticks for the Self-Similarity heatmap. :param figsize : Figure size of the Heatmap plot (default: (10,10)). :param isAnnot : True, will annotate each cell with its similarity value (default: False). Plots ----- Heatmap : heatmap of the ordered Self-Similarity Matrix. Returns ------- DataFrame : Ordered Self-Similarity Matrix. NaN represents that a caseID was not compared for the similarity. """ ordered_series = df.sum( axis=1).sort_values( ascending=False) lis = ordered_series.index.tolist() df_1 = df[lis] df_temp = df_1.reindex(lis) plt.figure( figsize=figsize) ax = sns.heatmap( df_temp, cmap='viridis', xticklabels=ticks_interval, yticklabels=ticks_interval, fmt='g', annot=isAnnot, annot_kws={'size': 9} ) ax.invert_xaxis() plt.yticks(rotation=0) plt.title('Self-Similarity Matrix (ordered) for : '+name) return df_temp
def filter_empty_trajectories(data: pandas.DataFrame) -> pandas.DataFrame:
    # If the input files have whitespace characters in a line, they are imported
    # as additional trajectories with 0% at every timepoint, so remove any
    # trajectory that is 0% at all timepoints.
    # Need to first isolate the columns dedicated to the experimental timepoints.
    row_totals = data.sum(axis=1)
    index_to_drop = row_totals[row_totals == 0.0].index
    index_to_keep = [i for i in data.index if i not in index_to_drop]
    data = data.reindex(index_to_keep)
    return data
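# Hypothetical input for filter_empty_trajectories above: each row is a
# trajectory and each column a timepoint; the all-zero row is dropped.
import pandas

trajectories = pandas.DataFrame({0: [0.1, 0.0], 15: [0.4, 0.0]},
                                index=["traj-1", "traj-2"])
print(filter_empty_trajectories(trajectories))  # only "traj-1" remains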
class DetectionEvaluation(BaseEvaluation): """ DetectionEvaluations have a different number of predictions from the number of ground truth annotations. An example would be detecting lung nodules in a CT volume, or malignant cells in a pathology slide. """ def merge_ground_truth_and_predictions(self): self._cases = concat( [self._ground_truth_cases, self._predictions_cases], keys=["ground_truth", "predictions"]) def cross_validate(self): expected_keys = set(self._ground_truth_cases[self._join_key]) submitted_keys = set(self._predictions_cases[self._join_key]) missing = expected_keys - submitted_keys if missing: self._raise_missing_predictions_error(missing=missing) extra = submitted_keys - expected_keys if extra: self._raise_extra_predictions_error(extra=extra) def score(self): cases = set(self._ground_truth_cases[self._join_key]) self._case_results = DataFrame() for idx, case in enumerate(cases): self._case_results = self._case_results.append(self.score_case( idx=idx, case=self._cases.loc[self._cases[self._join_key] == case], ), ignore_index=True) self._aggregate_results = self.score_aggregates() def score_aggregates(self): aggregate_results = super().score_aggregates() totals = self._case_results.sum() for s in totals.index: aggregate_results[s]["sum"] = totals[s] tp = aggregate_results["true_positives"]["sum"] fp = aggregate_results["false_positives"]["sum"] fn = aggregate_results["false_negatives"]["sum"] aggregate_results["precision"] = tp / (tp + fp) aggregate_results["recall"] = tp / (tp + fn) aggregate_results["f1_score"] = 2 * tp / ((2 * tp) + fp + fn) return aggregate_results
def test_reduce_mixed_frame(self):
    # GH 6806
    df = DataFrame({
        "bool_data": [True, True, False, False, False],
        "int_data": [10, 20, 30, 40, 50],
        "string_data": ["a", "b", "c", "d", "e"],
    })
    df.reindex(columns=["bool_data", "int_data", "string_data"])
    test = df.sum(axis=0)
    tm.assert_numpy_array_equal(test.values,
                                np.array([2, 150, "abcde"], dtype=object))
    tm.assert_series_equal(test, df.T.sum(axis=1))
def add_total_row(df: pd.DataFrame) -> pd.DataFrame:
    """
    Appends a row with the column-wise totals to the final table.

    :param df: final dataframe with the three concatenated tables
    :return: dataframe with the added row of sums
    """
    sumrow = pd.DataFrame(columns=df.columns)
    indx = sumrow.index
    sumrow = sumrow.append(df.sum(numeric_only=True), ignore_index=True)
    sumrow.index = indx.union(["Total"])
    df = pd.concat([sumrow, df])
    return df
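# A minimal usage sketch for add_total_row above (assumes a pandas version
# before 2.0, where DataFrame.append is still available); the column names
# are made up.
import pandas as pd

sales = pd.DataFrame({"q1": [10, 20], "q2": [5, 15]}, index=["north", "south"])
print(add_total_row(sales))  # the "Total" row holds the column sums 30 and 20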
def _get_priors_and_counts(self, df: pd.DataFrame) -> (int, int, int, int):
    sums = df.sum(axis=0)
    total_success_count = sums['success_case_count']
    total_non_success_count = sums['non_success_count']
    total = total_success_count + total_non_success_count
    # P(success) & P(not success)
    p_success = total_success_count / total
    p_non_success = total_non_success_count / total
    return p_success, p_non_success, total_success_count, total_non_success_count
def merge_profiles(name, output): while True: n = GetRunningTasks(name) if n == 0: print("%s, Merging profiles ... " % current_time()) files = glob.glob("%s/*/*gene_abund.tab" % (output)) if files: files = sorted(files) dict_merge = {} for f in files: with open(f) as handle: for line in islice(handle, 1, None): line = line.strip().split("\t") k_map = line[1] k_RNA = f.split("/")[-2] count = float(line[8]) # TPM if k_map in dict_merge: dict_merge[k_map][k_RNA] = count else: tmp_dic = {} tmp_dic[k_RNA] = count dict_merge[k_map] = tmp_dic df = DataFrame(dict_merge).T df = df.fillna(value=0) ### fill NA to 0 df_sum = DataFrame(df.sum(axis=1), columns=['sum']) df = df.join(df_sum) df = df.sort_values(by="sum", ascending=False) ### sort by sum df.drop(['sum'], axis=1, inplace=True) merge_out = os.path.join(output, "merge_gene_TPM.txt") df.to_csv(merge_out, sep="\t", header=True, index=True, index_label="gene", float_format="%.2f") return merge_out break else: print( "\n### Merge profiles failed, it is not exsit in %s/*/ \n" % (output)) exit(1) else: print("%s, Waitiing for task finished, remaining %d tasks" % (current_time(), n)) time.sleep(10)
def compute_countries_sto_multipliers(years: List[int], countries: List[str], sto_inflows_df: pd.DataFrame, ror_inflows_df: pd.DataFrame, ror_capacity_ds: pd.Series) -> pd.Series: """ Computing STO multipliers mapping cell runoff to approximated hourly-sampled reservoir inflows. Parameters ---------- years: List[int] List of years. countries: List[str] ISO codes of the countries for which we want to obtain STO multipliers. sto_inflows_df: pd.DataFrame Data frame with STO (GWh) inflow time series for each geographical unit across the time horizon considered. ror_inflows_df: pd.DataFrame Data frame with ROR (p.u.) capacity factors for each geographical unit across the time horizon considered. ror_capacity_ds: pd.Series Series with ROR hydro capacities (GW) for each geographical unit considered. Returns ------- sto_multipliers_ds: pd.Series STO multipliers per country. """ # Compute yearly per country ror electricity production ror_inflows_yearly = ror_inflows_df.groupby( ror_inflows_df.index.year).sum() ror_production_yearly = ror_inflows_yearly.multiply( ror_capacity_ds.dropna(), axis=1).transpose() ror_production_yearly_per_country = ror_production_yearly.groupby( ror_production_yearly.index.str[:2]).sum() # Get total hydro-electric production and remove ROR production to get STO production sto_production_yearly_per_country = get_hydro_production( years=years, countries=countries) countries_with_ror = set(countries).intersection( set(ror_production_yearly_per_country.index)) sto_production_yearly_per_country.loc[countries_with_ror] -= \ ror_production_yearly_per_country.loc[countries_with_ror] # For some countries (like LV and IE), computed ROR potential is bigger than the Eurostat total hydro generation # leading to negative STO production values so we clip it. sto_production_per_country = sto_production_yearly_per_country.clip( lower=0.).sum(axis=1) sto_inflows_per_country = sto_inflows_df.sum().groupby( sto_inflows_df.columns.str[:2]).sum() sto_multipliers_ds = sto_production_per_country / sto_inflows_per_country return sto_multipliers_ds
def weights_sum_to_one(weights: pd.DataFrame):
    sum_weights = weights.sum(axis=1)
    sum_weights[sum_weights == 0.0] = 0.0001
    weight_multiplier = 1.0 / sum_weights
    weight_multiplier_array = np.array([weight_multiplier] * len(weights.columns))
    weight_values = weights.values
    normalised_weights_np = weight_multiplier_array.transpose() * weight_values
    normalised_weights = pd.DataFrame(normalised_weights_np,
                                      columns=weights.columns,
                                      index=weights.index)
    return normalised_weights
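# A small usage sketch for weights_sum_to_one above; the instrument names are
# invented. Each non-zero row of the result sums to 1; an all-zero row stays at
# zero, and the 0.0001 floor on the row sum just avoids dividing by zero.
import numpy as np
import pandas as pd

weights = pd.DataFrame({"bond": [0.2, 0.0], "equity": [0.6, 0.0]},
                       index=pd.date_range("2021-01-01", periods=2))
print(weights_sum_to_one(weights))  # first row becomes 0.25 / 0.75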
def get_tail_labels(df: pd.DataFrame, ql=[0.03, 1.]) -> list:
    """
    Find the underrepresented targets.
    Underrepresented targets are those which are observed less than the median
    occurrence. Targets beyond a quantile limit are filtered out.
    """
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & (irlbl < irlbl.quantile(ql[1]))]  # filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_labels = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_labels
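# Hypothetical call of get_tail_labels above on a tiny 0/1 label matrix; with
# these column counts (10, 8, 2, 1) only "rare" both survives the quantile
# filter and exceeds the median imbalance threshold.
import pandas as pd

labels = pd.DataFrame({
    "common":   [1] * 10,
    "frequent": [1] * 8 + [0] * 2,
    "rare":     [1] * 2 + [0] * 8,
    "extreme":  [1] * 1 + [0] * 9,
})
print(get_tail_labels(labels))  # ['rare']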
def test_sum_timedelta64_skipna_false():
    # GH#17235
    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    arr[-1, -1] = "Nat"
    df = DataFrame(arr)

    result = df.sum(skipna=False)
    expected = Series([pd.Timedelta(seconds=12), pd.NaT])
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=0, skipna=False)
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=1, skipna=False)
    expected = Series([
        pd.Timedelta(seconds=1),
        pd.Timedelta(seconds=5),
        pd.Timedelta(seconds=9),
        pd.NaT,
    ])
    tm.assert_series_equal(result, expected)
def order_optimum(df: DataFrame, k1: Union[int, float], k2: Union[int, float],
                  z: Union[int, float], depth: int):
    """
    :param df: DataFrame, stock moving data
    :param k1: int or float, in case of perishable product - storage cost +
        purchase price, else - only storage cost
    :param k2: int or float, selling price
    :return: int or float, optimal order quantity
    """
    if depth == 1:
        return stock_optimum(df, k1, k2) - z
    return stock_optimum(df, k1, k2) - z + depth * df.sum(axis=1).mean()
def test_stale_cached_series_bug_473(self):
    # this is chained, but ok
    with option_context('chained_assignment', None):
        Y = DataFrame(np.random.random((4, 4)),
                      index=('a', 'b', 'c', 'd'),
                      columns=('e', 'f', 'g', 'h'))
        repr(Y)
        Y['e'] = Y['e'].astype('object')
        Y['g']['c'] = np.NaN
        repr(Y)
        result = Y.sum()  # noqa
        exp = Y['g'].sum()  # noqa
        assert pd.isna(Y['g']['c'])
def gauge_chart_histogram_cross(responses, categories):
    datVals = str(dic['Data_values'][responses.name == dic['Spring_2017_Question_Code']].values[0]).split(';')
    # Blank values are pulled in as NaN
    if datVals == ['nan']:
        datVals = ['']
    description = responses.describe()
    h = 2 * (description['75%'] - description['25%']) / np.power(description['count'], 1. / 3.)
    nbins = int(np.round((description['max'] - description['min']) / h))
    bins = np.histogram(responses, nbins)[1]
    stack = DataFrame(columns=np.arange(nbins), index=categories.columns)
    for i in range(len(stack)):
        stack.iloc[i] = np.histogram(responses[categories.iloc[:, i]], bins)[0]
    (100 * stack.T / stack.sum(1)).T[::-1].plot(kind='barh', stacked=True, width=1,
                                                edgecolor='w', legend=False,
                                                align='edge', figsize=(12, 6))
    plt.title("\n".join(wrap(str(dic['Question_Text'][responses.name == dic['Spring_2017_Question_Code']].values[0]), 88)),
              size='medium')
    for i in range(len(stack.T)):
        plt.axvline(np.cumsum((stack.T / stack.sum(1)).T.iloc[0])[::-1][i] * 100,
                    color='lightgray')
    plt.axis('tight')
    plt.xticks(np.arange(0, 101, 10),
               ('%i%% ' * 11 % tuple(np.arange(0, 101, 10))).split())
    plt.legend(bbox_to_anchor=(0.55, -0.05, 0.5, 0), ncol=len(datVals), fontsize='small')
    plt.subplots_adjust(left=0.18, right=0.92)
def check_lineage(genotypes: pandas.DataFrame, lineages: pandas.Series):
    ancestor = get_ancestor_series(genotypes, 0.97)
    genotypes = genotypes[[
        i for i in genotypes.columns if i not in ancestor.index
    ]]
    frequencies = genotypes.sum()
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(frequencies)
    plt.show()
def _compute_quantile_accuracies(heatmap: pd.DataFrame) -> Tuple[float, float]:
    """Computes the accuracy within 1st and 2nd quantile."""
    # TODO(): Add overall accuracy result.
    # Create filters to calculate accuracy within 1st and 2nd quantile.
    mask_1st_quantile = (np.eye(*heatmap.shape) +
                         np.eye(*heatmap.shape, k=1) +
                         np.eye(*heatmap.shape, k=-1))
    mask_2nd_quantile = (mask_1st_quantile +
                         np.eye(*heatmap.shape, k=2) +
                         np.eye(*heatmap.shape, k=-2))
    # Calculate accuracy.
    all_sum = heatmap.sum().sum()
    accuracy_1st_quantile = (heatmap * mask_1st_quantile).sum().sum() / all_sum
    accuracy_2nd_quantile = (heatmap * mask_2nd_quantile).sum().sum() / all_sum
    return accuracy_1st_quantile, accuracy_2nd_quantile
def left_right():
    low, high, n = -1 << 10, 1 << 10, 1 << 20
    left = DataFrame(np.random.randint(low, high, (n, 7)),
                     columns=list("ABCDEFG"))
    left["left"] = left.sum(axis=1)
    # one-2-one match
    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()
    right.columns = right.columns[:-1].tolist() + ["right"]
    right.index = np.arange(len(right))
    right["right"] *= -1
    return left, right
def c_input(self, o_iot: pd.DataFrame, q_iot: pd.DataFrame, p_tau: float):
    factor = 1 / (1 - (o_iot.sum(axis=0) / q_iot) * p_tau)
    A = np.array([
        self.indicator("q", i)
        - (factor.loc[i] * (np.sum([self.indicator("d", j, i) for j in Sector], axis=0)
                            + self.indicator("x", M.I, i)
                            + self.indicator("xtilde", M.L, i)
                            + self.indicator("xtilde", M.K, i)))
        for i in Sector
    ])
    # normalization
    normalization = np.array([1 / q_iot.loc[i] for i in Sector])
    A = np.multiply(A, normalization[:, None])
    self.update_constraint(Bound(None, None, A, np.zeros(A.shape[0])))
def operate(cls, value: DataFrame) -> Series:
    """
    Apply the sum operation across a DataFrame's columns.

    :param value: The DataFrame of distributions to sum across.
    """
    if isinstance(value, DataFrame):
        result = value.sum(axis=1)
        names_csv = ', '.join(value.columns.to_list())
        result.name = f'sum({names_csv})'
        return result
    else:
        raise TypeError('value for Sum aggregator must be DataFrame')
def weighted_average_of_impurity(df: pd.DataFrame,
                                 impurity_func: Callable[[pd.Series], float]) -> float:
    col_sum = df.sum(axis=1)
    col_gini = df.apply(impurity_func, axis=1)
    res = sum(col_gini * col_sum / sum(col_sum))
    print('<<<<Frame Gini>>>>')
    print('Sum of Columns')
    print(col_sum)
    print('Gini of Columns')
    print(col_gini)
    print('Result')
    print(res)
    return res
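# Hypothetical call of weighted_average_of_impurity above using Gini impurity
# as the per-row impurity function; the 2x2 counts table is made up.
import pandas as pd

def gini(counts: pd.Series) -> float:
    p = counts / counts.sum()
    return 1.0 - (p ** 2).sum()

split_counts = pd.DataFrame({"yes": [3, 1], "no": [1, 3]}, index=["left", "right"])
print(weighted_average_of_impurity(split_counts, gini))  # 0.375 for this split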
def _normalise(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalises dataframe

    Args:
        df: Raw dataframe

    Returns:
        Normalised dataframe
    """
    norm_df = df / df.sum(axis=0)
    norm_df.fillna(1.0 / df.shape[0], inplace=True)
    return norm_df
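# A minimal sketch of calling _normalise above: each column is scaled to sum to
# 1, and an all-zero column (which divides to NaN) falls back to a uniform 1/n.
import pandas as pd

raw = pd.DataFrame({"a": [1.0, 3.0], "b": [0.0, 0.0]})
print(_normalise(raw))  # column "a" becomes 0.25 / 0.75, column "b" becomes 0.5 / 0.5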
def get_deseq2_stats(df: pd.DataFrame, subsets: List[List[T]], min_total_row_count: int = 0) -> pd.DataFrame: """Use the R bioconductor package 'limma' to perform a differential expression analysis of count like data (e.g. miRNA). See package documentation for more details. :param df: Matrix of counts, where each column is a sample and each row a feature. :param subsets: The two subsets to compare with each other. :param min_total_row_count: Drop rows that have in total less than than min_total_row_count reads :return: Results of the analysis in form of a Dataframe (p, logFC, ...) """ logger.debug("Computing deseq2 stats") if len(subsets) != 2: error = "This method currently only supports exactly two " \ "subsets as this is the most common use case. Support " \ "for more subsets will be added later." logger.exception(error) raise ValueError(error) # flatten subset flattened_subsets = [x for subset in subsets for x in subset] # discard columns that are not in a subset df = df[flattened_subsets] # filter rows with too few reads total_row_counts = df.sum(axis=1) keep = total_row_counts[total_row_counts >= min_total_row_count].index df = df.loc[keep] # pandas df -> R df r_count_data = pandas2ri.py2ri(df) # py2ri is stupid and makes too many assumptions. # These two lines restore the column order r_count_data.colnames = list(OrderedDict.fromkeys(flattened_subsets)) r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets)) # see package documentation condition = ['s{}'.format(i) for i, subset in enumerate(subsets) for _ in subset] r_condition = robj.FactorVector(robj.StrVector(condition)) r_col_data = r['DataFrame'](condition=r_condition) r_design = robj.Formula('~ condition') r_design.environment['condition'] = r_condition r_dds = r['DESeqDataSetFromMatrix'](r_count_data, r_col_data, r_design) r_dds = r['DESeq'](r_dds, parallel=True) r_res = r['results'](r_dds) # R result table to Python pandas r_res = r['as.data.frame'](r_res) results = pandas2ri.ri2py(r_res) results.insert(0, 'feature', list(r['row.names'](r_res))) return results
def plot(tvec: pd.DataFrame, data: pd.DataFrame, combineZones: bool): ''' Function to plot the data :param tvec: Time vector for all the data measurements :param combineZones: Whether or not to sum all the zones into 1 value :param data: The data to plot :return: Does not return anything, will show a plot on the screen. ''' if tvec.empty or data.empty: raise Exception("The dataset is empty") if isinstance(tvec, pd.Index): x = tvec xLabel = f"Time ({x.name})" else: x = pd.to_datetime(tvec) xLabel = "Time (default)" if combineZones: if len(data) < 25: plt.bar(x, data.sum(axis=1)) else: plt.plot(x, data.sum(axis=1)) plt.xlabel(xLabel) plt.ylabel("Consumption (Wh)") plt.title("Energy consumption (all zones)") plt.show() else: fig, axs = plt.subplots(2, 2, figsize=(16, 10)) axs = axs.flatten() for index, column in enumerate(data): if len(data) < 25: axs[index].bar(x, data[column]) axs[index].set(xlabel=xLabel, ylabel="Consumption (Wh)") axs[index].set_title(f"Energy consumption ({column})") else: axs[index].plot(x, data[column]) axs[index].set(xlabel=xLabel, ylabel="Consumption (Wh)") axs[index].set_title(f"Energy consumption ({column})") plt.show()
def _get_fovdf(dbcon, evalset: str, min_ps_per_fov: int, min_fovs_per_p: int):
    # get fov metadata
    fovmetas = read_sql_query(
        f"""
        SELECT "fovname", "participants_{evalset}" AS "participants"
        FROM "fov_meta"
        WHERE "participants_{evalset}" NOT NULL
        ;""", dbcon)
    fovnames = list(set(fovmetas.loc[:, 'fovname']))
    fovmetas.index = fovmetas.loc[:, 'fovname']
    # init dataframe of fovs and who annotated them
    fovdf = DataFrame(0, index=fovnames, columns=ir.NPs)
    for fovname, row in fovmetas.iterrows():
        for p in row['participants'].split(','):
            if p in ir.NPs:
                fovdf.loc[fovname, p] = 1
    # only keep participants and fovs if > a certain threshold
    fovdf = fovdf.loc[:, fovdf.sum(axis=0) >= min_fovs_per_p]
    fovdf = fovdf.loc[fovdf.sum(axis=1) >= min_ps_per_fov, :]
    return fovdf, fovnames
def c_demand(self, p_delta: Mapping[Tuple[Sector, FinalUse], float],
             ytilde_iot: pd.DataFrame):
    const = np.array([
        np.sum([p_delta[i, u] * ytilde_iot.loc[i, u] for u in FinalUse])
        for i in Sector
    ])
    A = np.array([self.indicator("y", i) for i in Sector])
    normalization = np.array(
        [1 / ytilde_iot.sum(axis=1).loc[i] for i in Sector])
    const = np.multiply(
        const, normalization)  # if normalized, should all be 1 if p_delta is 1
    A = np.multiply(A, normalization[:, None])
    return Bound(A, const, None, None)
def getDF(self):
    # Converts to data frame
    col = ["ER+", "ER-", "Control"]
    keys = self.setKeys()
    keys.insert(0, "Total")
    ret = DataFrame(zeros((len(keys), len(col)), dtype=int),
                    columns=col, index=keys)
    for k in keys:
        ret.loc[k, "ER+"] = self.pos.count(k)
        ret.loc[k, "ER-"] = self.neg.count(k)
        ret.loc[k, "Control"] = self.control.count(k)
    ret.loc["Total"] = ret.sum()
    return ret
def assert_melt(df: pd.DataFrame, eval_metric: str = "percent_strong") -> None:
    pair_ids = set_pair_ids()
    df = df.loc[:, [pair_ids[x]["index"] for x in pair_ids]]
    index_sums = df.sum().tolist()

    assert_error = "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
    assert_error = "{err} This is a fatal error providing incorrect results".format(
        err=assert_error)

    if eval_metric == "percent_strong":
        assert index_sums[0] != index_sums[1], assert_error
    elif eval_metric == "precision_recall":
        assert index_sums[0] == index_sums[1], assert_error
    elif eval_metric == "grit":
        assert index_sums[0] == index_sums[1], assert_error
class I8Merge(object):

    params = ['inner', 'outer', 'left', 'right']
    param_names = ['how']

    def setup(self, how):
        low, high, n = -1000, 1000, 10**6
        self.left = DataFrame(np.random.randint(low, high, (n, 7)),
                              columns=list('ABCDEFG'))
        self.left['left'] = self.left.sum(axis=1)
        self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
        self.right = self.right.reset_index(drop=True)
        self.right['right'] *= -1

    def time_i8merge(self, how):
        merge(self.left, self.right, how=how)
def generate_probability_vector_result(output_path):
    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values
    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    # points_frame = points_frame.drop_duplicates()
    points_array = points_frame.values
    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)
    # print(distance_frame)
    # print(distance_frame.sum(axis=1))
    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
def edbSave():
    """Read EDB codes from the user's clipboard and fetch the corresponding
    EDB indicator data through the Wind API."""
    # get the codes from the clipboard plus the user-supplied start and end dates
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()
    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep=" ")[1].rjust(10))
    return df
def calc_happiness(order, guest_dict):
    df = DataFrame(columns=order, index=order)
    for idx, guest in enumerate(order[:-1]):
        # print "{} -> {}: {}".format(
        #     guest,
        #     order[idx+1],
        #     gd[guest][order[idx+1]]
        # )
        df[guest][order[idx + 1]] = guest_dict[guest][order[idx + 1]]
        df[order[idx + 1]][guest] = guest_dict[order[idx + 1]][guest]
    df[order[0]][order[-1]] = guest_dict[order[0]][order[-1]]
    df[order[-1]][order[0]] = guest_dict[order[-1]][order[0]]
    return df.sum().sum()
def predict_random_category(y_test, n=1000):
    """
    Uses bootstrapping to compute the expected prediction by chance for each category.

    Parameters:
        y_test (array): Labels
        n (int): the number of times to randomize.

    Returns:
        Series containing the accuracy for each class.
    """
    # Create a data frame with random predictions.
    random_ = DataFrame({i: shuffle_predict(y_test) for i in range(n)})
    random_.index = y_test
    # Calculate the chance-level accuracy per class.
    random_ = 1.0 * random_.sum(axis=1) / n
    grouped = random_.groupby(level=0)
    mean = grouped.mean() * 100.0
    sd = grouped.std() * 100.0
    return mean, sd
def two_column_summary(df, index, column, do_totals=True, do_prob=False): """returns a DataFrame contingency (frequency) summary table for two columns. arguments: df -- input DataFrame index -- the column used to summarize vertically column -- the column used to summarize vertically to_total -- places a row and column to summarize the total along that axis do_prob -- instead of return frequency, return probablity """ # test input if not (column in df.columns): raise ValueError("[two_column_summary] '%s' no a valid column name" % column) if not (index in df.columns): raise ValueError("[two_column_summary] '%s' no a valid column name" % index) # group for each column unique_col_values = df[column].unique() cols = [] for v in unique_col_values: mask = df[column]==v cols.append(df[mask].groupby(index)) # glue groups back together df_summary = DataFrame() for idx, c in enumerate(cols): d = c.count() d.columns = [unique_col_values[idx]] df_summary = pandas.concat([df_summary, d], axis=1) # add total if do_totals: df_summary['total'] = df_summary.apply(sum, axis=1) df_summary.ix['total'] = df_summary.sum() # make into probablity if do_prob: df_summary = df_summary/df_summary.ix['total']['total'] return df_summary
def plot_centre_crowd(self, thresh=2, show_threads=False, **kwargs): """Plotting evolution of number of participants close to centre""" project, show, _ = ac.handle_kwargs(**kwargs) data = self.__get_centre_distances(thresh, split=False) data_close = DataFrame({ '6 hours': data[data <= .25].count(axis=1), '12 hours': data[(data <= .5) & (data > .25)].count(axis=1), '24 hours': data[(data <= 1) & (data > .5)].count(axis=1)}, columns=['6 hours', '12 hours', '24 hours']) plt.style.use(SETTINGS['style']) y_max = data_close.sum(axis=1).max() _, axes = plt.subplots() data_close.plot(kind="area", ax=axes, stacked=True, color=['darkslategray', 'steelblue', 'lightgray']) axes.set_yticks(range(1, y_max + 1)) axes.set_ylabel("Number of participants") axes.set_title("Crowd close to the centre of discussion in {}".format( project)) axes.xaxis.set_ticks_position('bottom') axes.yaxis.set_ticks_position('left') if show_threads: self.__show_threads(axes) ac.show_or_save(show)
def process(self, start_time: datetime, end_time: datetime, input:DataFrame): if str(self.name) not in '+-*/': raise ValueError("Unknown math function: " + str(self.name)) ret = DataFrame() # two args means we're doing A + B if len(self._args) == 2: left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0] right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1] for l_col in left.columns: for r_col in right.columns: if self.name == '+': t = left[l_col] + right[r_col] elif self.name == '-': t = left[l_col] - right[r_col] elif self.name == '*': t = left[l_col] * right[r_col] elif self.name == '/': t = left[l_col] / right[r_col] else: raise ValueError("Unknown operator: " + str(self.name)) t = DataFrame(t) t.columns = [l_col + self.name + r_col] print(left.head()) print(right.head()) print(t.head()) ret = ret.combine_first(t) else: # everything is in the input DataFrame ret = DataFrame(input.sum(axis=0)) ret.columns = [' + '.join(input.columns)] return ret
def fix_event_type(df: DataFrame): ''' Not sure yet. :param df: Dataframe object. :return: Modified Dataframe. ''' a = time.time() colsf = df['id'].ravel() # list of all IDs unique = pd.Series(colsf).unique() # get unique IDs u_counts = [] # list of unique counts (UNUSED) counts_bucket = [] # bucket of counts (UNUSED) df = pd.get_dummies(df) # create dummy variables todrop = df.sum() < 50 # get columns where sum of dummy column < 50 dropcols = df.columns[todrop] # get those column names df = df.drop(dropcols, axis=1) # drop those columns df['num_events'] = 0 # create number of events columns, set to 0 # print(df.columns) print(str(len(unique))) for ii in range(0,len(unique)): # loop through all the unique IDs subset = df.loc[df['id'] == unique[ii]] # subset by that ID the_dummies = subset.columns != 'id' # get all columns that do not equal that ID aa = subset.iloc[:, subset.columns != 'id'].sum().tolist() # get all of those columns to list event_sum = np.sum(aa) # sum all of those # aa = aa.set_index([[subset.index[0]]]) # subset.iloc[:,subset.columns != 'id'] = aa df = df.set_value(subset.index, the_dummies, aa) df = df.set_value(subset.index, 'num_events', event_sum) # df.loc[subset.index] = subset df = df.drop_duplicates('id') print(df) b = time.time() print(b-a) return df
for i, group in enumerate(Groups):
    TotalCells_Area[SGroup[i] + '_Mean'] = TotalCells_Area[group].mean(axis=1)
    TotalCells_Area[SGroup[i] + '_Values'] = TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i] + '_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i] + '_Serror'] = TotalCells_Area[SGroup[i] + '_Stdev'] / np.sqrt(TotalCells_Area[SGroup[i] + '_Values'])

## Saving Table
# (raw strings keep the Windows backslashes from being treated as escape sequences)
TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2 p16 p30 p60 Septotemporal\SGZ\Total_Cells_C57_SW_SGZ.csv')
SGZArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2 p16 p30 p60 Septotemporal\SGZ\SGZArea_C57_SW_SGZ.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure2 p16 p30 p60 Septotemporal\SGZ\Total_Cells_Area_C57_SW_SGZ.csv')

TotalCells[Dictionary['C57_p30']]
TotalCells[Dictionary['C57']]

# Calculating Density for C57 and C57_p30
TotalCellsSum = TotalCells.sum()
SGZAreaSum = SGZArea.sum()
Density = TotalCellsSum / SGZAreaSum
Density = Density[0:13]
DensityTable = Series([Density[0:4].mean(), Density[4:9].mean(), Density[9:13].mean()],
                      index=['C57 P16', 'C57 P30', 'C57 P60'])
C57_p16_Error = Density[0:4].std() / sqrt(Density[0:4].count())
C57_p30_Error = Density[4:9].std() / sqrt(Density[4:9].count())
C57_Error = Density[9:13].std() / sqrt(Density[9:13].count())
# Density.to_csv(r'C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

# Plotting Density Graph
plt.figure()
DensityTable.plot(kind='bar', yerr=[C57_p16_Error, C57_p30_Error, C57_Error])
plt.ylabel('Density of Prox1 in SGZ')
plt.xticks(rotation=0)
print("df.head - first 5 rows")
print(df.head())

import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()
print("df['names'].unique()")
print(uqNames)

print("df.names.describe()")
print(df['Names'].describe())

df = df.groupby("Names")  # group by name
print(df)
df = df.sum()  # applies sum to each groupby object
print(df)
# the above is equivalent to: select sum(Births) from df group by Names;

Sorted = df.sort_values(by="Births", ascending=False)
print(Sorted.head(1))  # or df['Births'].max()

# Create graph
df['Births'].plot(kind="bar")
print("The most popular name")
df.sort_values(by='Births', ascending=False)
class Scores(object): """ Parameters ---------- uri : str, optional modality : str, optional Returns ------- scores : `Scores` Examples -------- >>> s = Scores(uri='video', modality='speaker') >>> s[Segment(0,1), 's1', 'A'] = 0.1 >>> s[Segment(0,1), 's1', 'B'] = 0.2 >>> s[Segment(0,1), 's1', 'C'] = 0.3 >>> s[Segment(0,1), 's2', 'A'] = 0.4 >>> s[Segment(0,1), 's2', 'B'] = 0.3 >>> s[Segment(0,1), 's2', 'C'] = 0.2 >>> s[Segment(2,3), 's1', 'A'] = 0.2 >>> s[Segment(2,3), 's1', 'B'] = 0.1 >>> s[Segment(2,3), 's1', 'C'] = 0.3 """ @classmethod def from_df( cls, df, uri=None, modality=None, aggfunc=np.mean ): """ Parameters ---------- df : DataFrame Must contain the following columns: 'segment', 'track', 'label' and 'value' uri : str, optional Resource identifier modality : str, optional Modality aggfunc : func Value aggregation function in case of duplicate (segment, track, label) tuples Returns ------- """ dataframe = pivot_table( df, values=PYANNOTE_SCORE, index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL, aggfunc=aggfunc ) annotation = Annotation(uri=uri, modality=modality) for index, _ in dataframe.iterrows(): segment = Segment(*index[0]) track = index[1] annotation[segment, track] = '' labels = dataframe.columns return cls(uri=uri, modality=modality, annotation=annotation, labels=labels, values=dataframe.values) def __init__(self, uri=None, modality=None, annotation=None, labels=None, values=None, dtype=None): super(Scores, self).__init__() names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] if annotation: annotation = annotation.copy() index = Index( [s + (t, ) for s, t in annotation.itertracks()], name=names) else: annotation = Annotation(uri=uri, modality=modality) index = MultiIndex(levels=[list() for name in names], labels=[list() for name in names], names=names) self.annotation_ = annotation columns = None if labels is None else list(labels) data = None if values is None else np.array(values) dtype = np.float if values is None else values.dtype self.dataframe_ = DataFrame(data=data, dtype=dtype, index=index, columns=columns) self.hasChanged_ = True self.modality = modality self.uri = uri def copy(self): self._reindexIfNeeded() copied = self.__class__(uri=self.uri, modality=self.modality) copied.dataframe_ = self.dataframe_.copy() copied.annotation_ = self.annotation_.copy() copied.hasChanged_ = self.hasChanged_ return copied # del scores[segment] # del scores[segment, :] # del scores[segment, track] def __delitem__(self, key): if isinstance(key, Segment): segment = key self.dataframe_.drop(tuple(segment), axis=0, inplace=True) del self.annotation_[segment] self.hasChanged_ = True elif isinstance(key, tuple) and len(key) == 2: segment, track = key self.dataframe_.drop(tuple(segment) + (track, ), axis=0, inplace=True) del self.annotation_[segment, track] self.hasChanged_ = True else: raise KeyError('') # value = scores[segment, track, label] def __getitem__(self, key): if len(key) == 2: key = (key[0], '_', key[1]) segment, track, label = key return self.dataframe_.at[tuple(segment) + (track, ), label] # scores[segment, track, label] = value # scores[segment, label] ==== scores[segment, '_', label] def __setitem__(self, key, value): if len(key) == 2: key = (key[0], '_', key[1]) segment, track, label = key # do not add empty track if not segment: return self.dataframe_.at[tuple(segment) + (track,), label] = value self.annotation_[segment, track] = label self.hasChanged_ = True def __len__(self): """Number of annotated segments""" return 
len(self.annotation_) def __nonzero__(self): return self.__bool__() def __bool__(self): """False if annotation is empty""" return True if self.annotation_ else False def __contains__(self, included): """Check if segments are annotated Parameters ---------- included : `Segment` or `Timeline` Returns ------- contains : bool True if every segment in `included` is annotated, False otherwise. """ return included in self.annotation_ def __iter__(self): """Iterate over sorted segments""" return iter(self.annotation_.get_timeline()) def __reversed__(self): """Reverse iterate over sorted segments""" return reversed(self.annotation_.get_timeline()) def itersegments(self): return iter(self) def tracks(self, segment): """Set of tracks for query segment Parameters ---------- segment : `Segment` Query segment Returns ------- tracks : set Set of tracks for query segment """ return self.annotation_.get_tracks(segment) def has_track(self, segment, track): """Check whether a given track exists Parameters ---------- segment : `Segment` Query segment track : Query track Returns ------- exists : bool True if track exists for segment """ return self.annotation_.has_track(segment, track) def get_track_by_name(self, track): """Get all tracks with given name Parameters ---------- track : any valid track name Requested name track Returns ------- tracks : list List of (segment, track) tuples """ return self.annotation_.get_track_by_name(track) def new_track(self, segment, candidate=None, prefix=None): """Track name generator Parameters ---------- segment : Segment prefix : str, optional candidate : any valid track name Returns ------- track : str New track name """ return self.annotation_.new_track(segment, candidate=None, prefix=None) def itertracks(self): """Iterate over annotation as (segment, track) tuple""" return self.annotation_.itertracks() def itervalues(self): """Iterate over scores as (segment, track, label, value) tuple""" # make sure segment/track pairs are sorted self._reindexIfNeeded() labels = self.labels() # yield one (segment, track, label) tuple per loop for index, columns in self.dataframe_.iterrows(): segment = Segment(*index[:-1]) track = index[-1] for label in labels: value = columns[label] if not np.isnan(value): yield segment, track, label, value def get_track_scores(self, segment, track): """Get all scores for a given track. Parameters ---------- segment : Segment track : hashable segment, track must be a valid track Returns ------- scores : dict {label: score} dictionary """ return dict(self.dataframe_.xs(tuple(segment) + (track, ))) def labels(self, unknown=True): """List of labels Parameters ---------- unknown : bool, optional When False, do not return Unknown instances When True, return any label (even Unknown instances) Returns ------- labels : list Sorted list of existing labels Remarks ------- Labels are sorted based on their string representation. 
""" labels = sorted(self.dataframe_.columns, key=str) if unknown: return labels else: return [l for l in labels if not isinstance(l, Unknown)] def _reindexIfNeeded(self): if not self.hasChanged_: return names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] new_index = Index( [s + (t, ) for s, t in self.annotation_.itertracks()], name=names) self.dataframe_ = self.dataframe_.reindex(new_index) self.hasChanged_ = False return def retrack(self): """ """ self._reindexIfNeeded() retracked = self.copy() annotation = self.annotation_.retrack() retracked.annotation_ = annotation names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] new_index = Index( [s + (t, ) for s, t in annotation.itertracks()], name=names) retracked.dataframe_.index = new_index return retracked def apply(self, func, axis=0): applied = self.copy() applied.dataframe_ = self.dataframe_.apply(func, axis=axis) applied.hasChanged_ = True return applied def rank(self, ascending=False): """ Parameters ---------- ascending : boolean, default False False for ranks by high (0) to low (N-1) Returns ------- rank : `Scores` """ ranked = self.copy() ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending) ranked.hasChanged_ = True return ranked def nbest(self, n, ascending=False): """ Parameters ---------- n : int Size of n-best list ascending : boolean, default False False for ranks by high (0) to low (N-1) Returns ------- nbest : `Scores` New scores where only n-best are kept. """ filtered = self.copy() ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending) filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n, other=np.NaN) filtered.hasChanged_ = True return filtered def subset(self, labels, invert=False): """Scores subset Extract scores subset based on labels Parameters ---------- labels : set Set of labels invert : bool, optional If invert is True, extract all but requested `labels` Returns ------- subset : `Scores` Scores subset. """ self._reindexIfNeeded() if not isinstance(labels, set): raise TypeError('labels must be provided as a set of labels.') if invert: labels = set(self.labels()) - labels else: labels = labels & set(self.labels()) subset = Scores(uri=self.uri, modality=self.modality) subset.annotation_ = self.annotation_ subset.dataframe_ = self.dataframe_[list(labels)] return subset def to_annotation(self, threshold=-np.inf, posterior=False): """ Parameters ---------- threshold : float, optional Each track is annotated with the label with the highest score. Yet, if the latter is smaller than `threshold`, label is replaced with an `Unknown` instance. posterior : bool, optional If True, scores are posterior probabilities in open-set identification. If top model posterior is higher than unknown posterior, it is selected. Otherwise, label is replaced with an `Unknown` instance. """ if not self: return Annotation(uri=self.uri, modality=self.modality) best = self.nbest(1, ascending=False) large_enough = best.copy() if posterior: unknown_posterior = 1. 
- self.dataframe_.sum(axis=1) large_enough.dataframe_ = ( ((best.dataframe_.T > unknown_posterior) & (best.dataframe_.T > threshold)).T ) else: large_enough.dataframe_ = ( (best.dataframe_.T > threshold).T ) large_enough.dataframe_.where(best.dataframe_.notnull(), inplace=True, other=np.NaN) annotation = Annotation(uri=self.uri, modality=self.modality) for segment, track, label, value in large_enough.itervalues(): label = label if value else Unknown() annotation[segment, track] = label return annotation def map(self, func): """Apply function to all values""" mapped = self.copy() mapped.dataframe_ = self.dataframe_.applymap(func) mapped.hasChanged_ = True return mapped def crop(self, focus, mode='strict'): """Crop on focus Parameters ---------- focus : `Segment` or `Timeline` mode : {'strict', 'loose', 'intersection'} In 'strict' mode, only segments fully included in focus coverage are kept. In 'loose' mode, any intersecting segment is kept unchanged. In 'intersection' mode, only intersecting segments are kept and replaced by their actual intersection with the focus. Returns ------- cropped : same type as caller Cropped version of the caller containing only tracks matching the provided focus and mode. Remarks ------- In 'intersection' mode, the best is done to keep the track names unchanged. However, in some cases where two original segments are cropped into the same resulting segments, conflicting track names are modified to make sure no track is lost. """ if isinstance(focus, Segment): return self.crop(Timeline([focus], uri=self.uri), mode=mode) self._reindexIfNeeded() cropped = self.copy() if mode in ['strict', 'loose']: new_annotation = self.annotation_.crop(focus, mode=mode) keep = [new_annotation.has_track(segment, track) for segment, track in self.itertracks()] cropped.dataframe_ = self.dataframe_[keep] cropped.annotation_ = new_annotation cropped.hasChanged_ = True return cropped elif mode in ['intersection']: raise NotImplementedError('') # # two original segments might be cropped into the same resulting # # segment -- therefore, we keep track of the mapping # intersection, mapping = timeline.crop(coverage, # mode=mode, mapping=True) # # # create new empty annotation # A = self.__class__(uri=self.uri, modality=self.modality) # # for cropped in intersection: # for original in mapping[cropped]: # for track in self.tracks(original): # # try to use original track name (candidate) # # if it already exists, create a brand new one # new_track = A.new_track(cropped, candidate=track) # # copy each value, column by column # for label in self.dataframe_.columns: # value = self.dataframe_.get_value((original, track), # label) # A.dataframe_ = A.dataframe_.set_value((cropped, new_track), # label, value) # # return A def __str__(self): """Human-friendly representation""" if self: self._reindexIfNeeded() return str(self.dataframe_) else: return "" def _repr_png_(self): from .notebook import repr_scores return repr_scores(self)
for i, group in enumerate(Groups):
    TotalCells_Area[SGroup[i] + '_Mean'] = TotalCells_Area[group].mean(axis=1)
    TotalCells_Area[SGroup[i] + '_Values'] = TotalCells_Area[group].count(axis=1)
    TotalCells_Area[SGroup[i] + '_Stdev'] = TotalCells_Area[group].std(axis=1)
    TotalCells_Area[SGroup[i] + '_Serror'] = TotalCells_Area[SGroup[i] + '_Stdev'] / np.sqrt(TotalCells_Area[SGroup[i] + '_Values'])

## Saving Table
# (raw strings keep the Windows backslashes from being treated as escape sequences)
TotalCells.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_CreBax_C_H_Hilus.csv')
HilusArea.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\HilusArea_CreBax_C_H_Hilus.csv')
TotalCells_Area.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure3 Cre Bax\Coronals_Horizontals\Total_Cells_Area_C_H_CreBax_Hilus.csv')

# Calculating Density of All
TotalCellsSum = TotalCells.sum()
HilusAreaSum = HilusArea.sum()
Density = TotalCellsSum / HilusAreaSum
DensityTable = Series([Density[P30_Positive_Tam].mean(), Density[P30_Negative_Tam].mean(),
                       Density[P60_Positive_Tam].mean(), Density[P60_Negative_Tam].mean()],
                      index=['P30_Positive_Tam', 'P30_Negative_Tam',
                             'P60_Positive_Tam', 'P60_Negative_Tam'])
P30_Positive_Tam_Error = Density[P30_Positive_Tam].std() / sqrt(Density[P30_Positive_Tam].count())
P30_Negative_Tam_Error = Density[P30_Negative_Tam].std() / sqrt(Density[P30_Negative_Tam].count())
P60_Positive_Tam_Error = Density[P60_Positive_Tam].std() / sqrt(Density[P60_Positive_Tam].count())
P60_Negative_Tam_Error = Density[P60_Negative_Tam].std() / sqrt(Density[P60_Negative_Tam].count())
# Density.to_csv(r'C:\Users\KBermudez-Hernandez\Documents\Dropbox\Figures\Figure1 p16 p30 p60 density\Prox_density_bar_graph\P16 P30 P60 Prox1 Density.csv')

# Plotting Density Graph
plt.figure()
import numpy as np

df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']

## Summarizing and Computing Descriptive Statistics
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()          # column sums
df.sum(axis=1)    # sum row by row
df
(7.10 - 4.5) / 2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()       # accumulation
df.describe()     # multiple summary statistics in one shot

obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()

## Correlation and Covariance
import pandas.io.data as web

all_data = {}
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7, 10]],
                   index=['a', 'b', 'c', 'd'],
                   columns=['one', 'two'])
    print(df)
    print('Column Sum: \n{}'.format(df.sum(axis=0)))
    print('Row Sum: \n{}'.format(df.sum(axis=1)))
    print('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print('Index with min Value: \n{}'.format(df.idxmin()))
    print('Summary Statistic: \n{}'.format(df.describe()))