def make_USrepresentative_df():
    """Scrape and tidy US Representative primary results for both parties."""

    def parse_party_table(url, n_candidate_cols, party):
        df = pd.read_html(url)[0]
        df.columns = ['county'] + ['candidate{}'.format(i + 1)
                                   for i in range(n_candidate_cols)]
        df['county'] = df['county'].fillna('')
        # Each congressional district is a separate block headed by 'DISTRICT'.
        splits = df[df.county.str.startswith('DISTRICT')].index.tolist()
        splits.append(df.shape[0])
        frames = []
        for split in range(len(splits) - 1):
            df_ = df.iloc[splits[split]:splits[split + 1]]
            df_ = df_.drop(df_.index[0])
            df_.columns = df_.iloc[0]
            df_ = df_.drop(df_.index[0])
            df_.columns = ['county'] + list(df_.columns[1:])
            df_ = df_.dropna(subset=[df_.columns.values[1]])
            df_ = df_.dropna(axis=1)
            df_ = pd.melt(df_, id_vars=['county'],
                          value_vars=list(df_.columns[1:]))
            df_.columns = ['county', 'candidate', 'votes']
            df_ = df_[df_['county'] != '']
            df_['party'] = party
            df_['candidate'] = df_['candidate'].str.lstrip('*')
            # Raw string and explicit regex=True avoid pandas warnings.
            df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '',
                                                            regex=True)
            df_['candidate'] = df_['candidate'].str.rstrip('()')
            df_['office'] = 'US Representative'
            frames.append(df_)
        return pd.concat(frames)

    # DataFrame.append is deprecated; build each party's table with the
    # shared parser and concatenate once.
    return pd.concat([
        parse_party_table(URLS['dem_USrepresentative'], 6, 'Democratic'),
        parse_party_table(URLS['rep_USrepresentative'], 5, 'Republican'),
    ])
def most_probable_words(model, vocabulary, num_words):
    """
    Return a DataFrame of the most probable words for each topic,
    given a model, vocabulary, and number of words.
    """
    # Create an array of the vocabulary sorted by topic-word
    # probability, one row per topic.
    vocab = np.asarray(vocabulary)[np.argsort(model.topic_word_)]
    wp = np.sort(model.topic_word_)
    # Select the n most probable words, which are the right-most columns
    # of the sorted arrays.  (Slicing with -num_words:-1 would silently
    # drop the single most probable word.)
    words = vocab[:, -num_words:]
    words = pd.DataFrame(words.T)
    words['rank'] = words.index
    words = pd.melt(words, id_vars='rank')
    word_probs = wp[:, -num_words:]
    word_probs = pd.DataFrame(word_probs.T)
    word_probs['rank'] = word_probs.index
    word_probs = pd.melt(word_probs, id_vars='rank')
    ww = words.merge(word_probs, on=['rank', 'variable'])
    ww.columns = ['rank', 'topic', 'word', 'prob']
    return ww
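# A minimal usage sketch for most_probable_words with hypothetical documents;
# it assumes a fitted topic model exposing `topic_word_` (as the `lda`
# package's LDA does) and a scikit-learn CountVectorizer for the vocabulary.
import lda
from sklearn.feature_extraction.text import CountVectorizer

docs = ["chocolate cake flour sugar", "garlic onion olive oil",
        "flour butter sugar vanilla", "tomato basil olive oil"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
model = lda.LDA(n_topics=2, n_iter=200, random_state=1)
model.fit(X)

# One row per (rank, topic) pair, with the word and its probability.
top = most_probable_words(model, vectorizer.get_feature_names_out(), 3)
print(top.sort_values(['topic', 'prob'], ascending=[True, False]))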
def test_tidy_splicing_with_expression(self, test_study):
    test = test_study.tidy_splicing_with_expression

    common_id = 'common_id'
    sample_id = 'sample_id'
    event_name = 'event_name'

    splicing_common_id = test_study.splicing.feature_data[
        test_study.splicing.feature_expression_id_col]

    # Tidify splicing
    splicing = test_study.splicing.data
    splicing_index_name = test_study._maybe_get_axis_name(splicing, axis=0)
    splicing_columns_name = test_study._maybe_get_axis_name(splicing, axis=1)

    splicing_tidy = pd.melt(splicing.reset_index(),
                            id_vars=splicing_index_name,
                            value_name='psi',
                            var_name=splicing_columns_name)

    rename_columns = {}
    if splicing_index_name == 'index':
        rename_columns[splicing_index_name] = sample_id
    if splicing_columns_name == 'columns':
        rename_columns[splicing_columns_name] = event_name
        splicing_columns_name = event_name
    splicing_tidy = splicing_tidy.rename(columns=rename_columns)

    # Create a column of the common id on which to join splicing
    # and expression
    splicing_names = splicing_tidy[splicing_columns_name]
    if isinstance(splicing_names, pd.Series):
        splicing_tidy[common_id] = splicing_tidy[
            splicing_columns_name].map(splicing_common_id)
    else:
        splicing_tidy[common_id] = [
            test_study.splicing.feature_renamer(x)
            for x in splicing_names.itertuples(index=False)]

    splicing_tidy = splicing_tidy.dropna()

    # Tidify expression
    expression = test_study.expression.data_original
    expression_index_name = test_study._maybe_get_axis_name(expression,
                                                            axis=0)
    expression_columns_name = test_study._maybe_get_axis_name(expression,
                                                              axis=1)

    expression_tidy = pd.melt(expression.reset_index(),
                              id_vars=expression_index_name,
                              value_name='expression',
                              var_name=common_id)
    # This will only do anything if there is a column named "index" so
    # no need to check anything
    expression_tidy = expression_tidy.rename(columns={'index': sample_id})
    expression_tidy = expression_tidy.dropna()

    splicing_tidy.set_index([sample_id, common_id], inplace=True)
    expression_tidy.set_index([sample_id, common_id], inplace=True)

    true = splicing_tidy.join(expression_tidy, how='inner').reset_index()
    pdt.assert_frame_equal(test, true)
def get_song_recs(ratings, n_features):
    '''
    Takes new movie ratings from a website user and returns
    recommended song titles.
    '''
    path_to_songs_sf = '/home/cully/Documents/capstone/data/flask_songs_sf'
    path_to_movies_sf = '/home/cully/Documents/capstone/data/flask_movies_sf'
    songs_sf = gl.load_sframe(path_to_songs_sf)
    songs_df = songs_sf.to_dataframe()
    value_vars = [x for x in songs_df.columns if x != 'id']
    ids = [x for x in songs_df.index]
    if 'id' not in songs_df.columns:
        songs_df.insert(0, 'id', ids)
    songs_melted = gl.SFrame(pd.melt(songs_df, id_vars='id',
                                     value_vars=value_vars))
    songs_rec = gl.factorization_recommender.create(songs_melted,
                                                    user_id='id',
                                                    item_id='variable',
                                                    target='value',
                                                    num_factors=n_features)
    _, _, songs_item_intercept, songs_item_factors, songs_intercept = \
        get_rec_coeffs(songs_rec)
    movies_sf = gl.load_sframe(path_to_movies_sf)
    movies_df = movies_sf.to_dataframe()
    value_vars = [x for x in movies_df.columns if x != 'id']
    new_ratings = {movie_dict[name]: int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings],
                                    orient='columns').replace(-1, np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [str(i) for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id',
                                      value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted,
                                                     user_id='id',
                                                     item_id='variable',
                                                     target='value',
                                                     num_factors=n_features)
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = \
        get_rec_coeffs(movies_rec)
    # Score all songs for the newest user (last row of the user factors).
    comb = np.dot(np.array(movies_user_factors)[-1],
                  np.array(songs_item_factors).T)
    return songs_df.columns[1:][np.argsort(comb)[::-1]]
def wrapper(name):
    global pltsize
    Xt, Yt = loadData(name, 'train')
    Xv, Yv = loadData(name, 'validate')
    w = Train(Xt, Yt, 0)
    print('Classification Error (TR): ',
          classifyErr(LRPredict(w, Xt), Yt, 0.5), name)
    print('Classification Error (VAL): ',
          classifyErr(LRPredict(w, Xv), Yv, 0.5), name)
    # Pass the computed titles through; the original passed '' instead.
    t1 = 'Classification Error vs Decision Boundary - ' + name + ': Training'
    t2 = 'Classification Error vs Decision Boundary - ' + name + ': Validation'
    plotCEDB(w, Xt, Yt, t1)
    plotCEDB(w, Xv, Yv, t2)
    t1 = 'Logistic Regression - ' + name + ': Training'
    t2 = 'Logistic Regression - ' + name + ': Validation'
    plotDecisionBoundary(w, Xt, Yt, LRPredict, [0.5], t1)
    plotDecisionBoundary(w, Xv, Yv, LRPredict, [0.5], t2)
    l = array(linspace(0, 100, 101))
    tErr, tClass, vErr, vClass = GridL(Xt, Yt, Xv, Yv, l)
    DF1 = pd.DataFrame({'TR': pd.Series(tClass),
                        'VAL': pd.Series(vClass),
                        'Lambda': pd.Series(l)})
    DF1 = pd.melt(DF1, id_vars=['Lambda'])
    DF2 = pd.DataFrame({'TR': pd.Series(tErr),
                        'VAL': pd.Series(vErr),
                        'Lambda': pd.Series(l)})
    DF2 = pd.melt(DF2, id_vars=['Lambda'])
    title1 = 'Classification Error vs Lambda - ' + name
    title2 = 'Logistic Loss vs Lambda - ' + name
    p1 = ggplot(DF1, aes(x='Lambda', y='value', color='variable')) + \
        geom_line(size=4) + ggtitle(title1) + ylab('Error') + \
        theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print(p1)
    p2 = ggplot(DF2, aes(x='Lambda', y='value', color='variable')) + \
        geom_line(size=4) + ggtitle(title2) + ylab('Error') + \
        theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print(p2)
def find_avg_dataframe(df, log=None, value_vars=None):
    # Avoid a mutable default argument; value_vars is mutated below.
    value_vars = list(value_vars) if value_vars else []
    try:
        avg_col = None
        for col in df.columns:
            if 'average' in str(col):
                avg_col = col
        if avg_col is not None:
            df_avg = pd.melt(df, id_vars=['year'], value_vars=[avg_col])
            if len(value_vars) == 0:
                all_columns = list(df.columns)
                all_columns.remove(avg_col)
                all_columns.remove('year')
                value_vars = all_columns
            elif avg_col in value_vars:
                value_vars.remove(avg_col)
            df_lng = pd.melt(df, id_vars=['year'], value_vars=value_vars)
            print("Found average dataframe")
            return df_avg, df_lng
    except KeyError as ke:
        if log:
            logging.error(str(ke))
        else:
            print("Could not find average dataframe")
    # Also reached when no average column was found.
    return pd.DataFrame(), pd.DataFrame()
def main():
    """Load up all the performances and do some stats"""
    log_files = []
    performances = []
    for local_file in os.listdir("data"):
        if local_file.endswith(".log"):
            log_files.append("data/" + local_file)
    print("Loading the performances.")
    for log in log_files:
        performances.append(MetatonePerformanceLog(log))
    # Also load up the experiment design dataframe to merge with the data!
    experiment_design = pd.read_csv("2015-MetatoneStudy-ExperimentDesign.csv",
                                    index_col='time', parse_dates=True)

    print("Finding the lengths.")
    performer_length_dict = {}
    for perf in performances:
        performer_length_dict.update(perf.performer_lengths())
    performance_length_frame = pd.DataFrame.from_dict(performer_length_dict,
                                                      orient="index")
    performance_length_frame['time'] = performance_length_frame.index
    performers = performances[0].performers().tolist()
    long_performance_lengths = pd.melt(performance_length_frame,
                                       id_vars=['time'],
                                       value_vars=performers)
    long_performance_lengths = long_performance_lengths.replace(
        {'variable': DEVICE_SEATS})
    long_performance_lengths.to_csv("performance_lengths.csv")

    print("Creating Gesture Scores.")
    for perf in performances:
        perf.print_gesture_score()  # prints a gesture-score pdf for reference

    print("Creating performance info dataframe.")
    perf_data = {}
    for perf in performances:
        perf_data.update({perf.first_touch_timestamp(): {
            "raw_new_ideas": perf.raw_new_ideas,
            "new_idea_changes": perf.count_new_idea_interface_changes(),
            "button_presses": perf.count_button_interface_changes(),
            "flux": perf.ensemble_flux(),
            "entropy": perf.ensemble_entropy()}})
    performance_data = pd.DataFrame.from_dict(perf_data, orient="index")
    performance_data.to_csv("performance_data.csv")

    print("Creating performer button press dataframe")
    performer_presses = {}
    for perf in performances:
        performer_presses.update(perf.button_interface_changes_by_performer())
    button_changes_frame = pd.DataFrame.from_dict(performer_presses,
                                                  orient="index")
    button_experiment_frame = pd.concat(
        [experiment_design, button_changes_frame], axis=1)
    performers = performances[0].performers().tolist()
    button_experiment_frame['time'] = button_experiment_frame.index
    long_button_frame = pd.melt(
        button_experiment_frame,
        id_vars=['time', 'perf_number', 'group', 'performance', 'button',
                 'server', 'overall'],
        value_vars=performers,
        var_name='seat',
        value_name='button_presses')
    long_button_frame = long_button_frame.replace({'seat': DEVICE_SEATS})
    long_button_frame['performer'] = np.vectorize(
        lambda x, y: PARTICIPANTS[x][y])(long_button_frame['group'],
                                         long_button_frame['seat'])
    long_button_frame.to_csv("button_presses_per_performer.csv")
def sales_to_db(self, kk_nullfall, kk_planfall):
    '''store the sales matrices in database'''
    # sum up sales, join them on index to a dataframe, replace missing
    # entries (e.g. no entries for planned markets in nullfall -> sales = 0)
    sales_nullfall = kk_nullfall.sum(axis=1)
    sales_planfall = kk_planfall.sum(axis=1)
    df_sales_null = pd.DataFrame(sales_nullfall, columns=['umsatz_nullfall'])
    df_sales_plan = pd.DataFrame(sales_planfall, columns=['umsatz_planfall'])
    df_sales = df_sales_null.join(df_sales_plan, how='outer')
    df_sales.fillna(0, inplace=True)
    df_sales['id'] = df_sales.index
    df_sales['umsatz_differenz'] = (
        (df_sales['umsatz_planfall'] / df_sales['umsatz_nullfall']) * 100
        - 100)
    df_sales.fillna(0, inplace=True)
    self.parent_tbx.dataframe_to_table('Maerkte', df_sales, pkeys=['id'])

    # invert the pivoted tables
    kk_nullfall['id_markt'] = kk_nullfall.index
    kk_planfall['id_markt'] = kk_planfall.index
    df_nullfall = pd.melt(kk_nullfall, value_name='kk_strom_nullfall',
                          id_vars='id_markt')
    df_planfall = pd.melt(kk_planfall, value_name='kk_strom_planfall',
                          id_vars='id_markt')

    # join the results to the cell table
    cells = self.parent_tbx.table_to_dataframe('Beziehungen_Maerkte_Zellen')
    del cells['kk_strom_nullfall']
    del cells['kk_strom_planfall']
    cells = cells.merge(df_nullfall, on=['id_siedlungszelle', 'id_markt'],
                        how='left')
    cells = cells.merge(df_planfall, on=['id_siedlungszelle', 'id_markt'],
                        how='left')
    cells.fillna(0, inplace=True)
    cells.sort_values(by=['id_markt', 'id_siedlungszelle'], inplace=True)

    # should be identical, but take both anyway
    sum_null = cells.groupby('id_siedlungszelle',
                             as_index=False)['kk_strom_nullfall'].sum()
    sum_plan = cells.groupby('id_siedlungszelle',
                             as_index=False)['kk_strom_planfall'].sum()
    cells = cells.merge(sum_null, on=['id_siedlungszelle'],
                        suffixes=('', '_sum'))
    cells = cells.merge(sum_plan, on=['id_siedlungszelle'],
                        suffixes=('', '_sum'))
    cells['kk_bindung_nullfall'] = (cells['kk_strom_nullfall'] * 100
                                    / cells['kk_strom_nullfall_sum'])
    cells['kk_bindung_planfall'] = (cells['kk_strom_planfall'] * 100
                                    / cells['kk_strom_planfall_sum'])

    # deletion of old entries and inserting is faster than updating
    self.parent_tbx.delete_rows_in_table('Beziehungen_Maerkte_Zellen')
    arcpy.AddMessage(u'Schreibe Kenngrößen in Datenbank...')
    self.parent_tbx.insert_dataframe_in_table('Beziehungen_Maerkte_Zellen',
                                              cells)
def create_line_plot(plot_title, y_label, df, log, value_vars=None):
    # Avoid a mutable default argument.
    value_vars = list(value_vars) if value_vars else []
    # Transform the columns into id, variable, and value columns,
    # using the year column as the id.
    try:
        df_lng = pd.melt(df, id_vars=['year'], value_vars=value_vars)
    except KeyError as ke:
        if log:
            logging.error(str(ke))
        return None

    plot = ggplot(aes(x='year', y='value', color='variable'), data=df_lng)
    plot += geom_line(aes(x='year', y='value', color='variable'), data=df_lng)
    plot += ggtitle(plot_title)
    plot += xlab('Year')
    plot += ylab(y_label)
    fig = plot.draw()
    return fig
def save_data_for_frontend(model, vectorizer, df):
    doc_ids = np.argsort(model.doc_topic_, axis=0)[-5:-1, :].T
    doc_probs = np.sort(model.doc_topic_, axis=0)[-5:-1, :].T
    topic_total_probs = np.sum(doc_probs, axis=1)

    # Extract and prepare the most probable words: split bigrams and take
    # the unique set of the resulting word list.
    w = p.most_probable_words(model, vectorizer.get_feature_names(), 10)
    word_data = collections.defaultdict(list)
    for topic, g in w.groupby('topic'):
        word_data[topic] = ', '.join(
            [word.capitalize() for word in p.unique(itertools.chain(
                *g.sort_values('prob', ascending=False)['word']
                .str.split(' ').values))])
    # Pickle writes binary data, so the file must be opened in 'wb' mode.
    with open('frontend/app/word_data.pkl', 'wb') as f:
        pickle.dump(word_data, f)

    di = pd.DataFrame(doc_ids)
    di['topic'] = di.index
    di = pd.melt(di, id_vars='topic')
    di.columns = ['topic', 'rank', 'key']
    dp = pd.DataFrame(doc_probs)
    dp['topic'] = dp.index
    dp = pd.melt(dp, id_vars='topic')
    dp.columns = ['topic', 'rank', 'prob']
    dd = pd.merge(di, dp)

    # Merge in document data for the most probable documents.
    df['topic'] = np.argmax(model.doc_topic_, axis=1).T
    df['topic_prob'] = np.max(model.doc_topic_, axis=1).T
    df['key'] = df.index
    most_probable_docs = pd.merge(df, dd)

    # TODO: do the decoding here.
    most_probable_docs['ingredient_txt'] = [
        w for w in most_probable_docs['ingredient_txt'].str.split('\n')
        if w != []]

    doc_data = collections.defaultdict(list)
    for topic, g in most_probable_docs.groupby('topic'):
        rows = g.sort_values('prob')[['ingredient_txt', 'image', 'url',
                                      'title', 'key']].values
        doc_data[topic] = [dict(zip(['ingredient', 'image', 'url', 'title',
                                     'key'], row)) for row in rows]
    with open('frontend/app/doc_data.pkl', 'wb') as f:
        pickle.dump(doc_data, f)

    engine = p.make_engine()
    df.to_sql('clean_recipes', engine, if_exists='replace')
def test_melt():
    pdf = pd.DataFrame({"A": list("abcd") * 5,
                        "B": list("XY") * 10,
                        "C": np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 4)

    list_eq(dd.melt(ddf), pd.melt(pdf))
    list_eq(dd.melt(ddf, id_vars="C"), pd.melt(pdf, id_vars="C"))
    list_eq(dd.melt(ddf, value_vars="C"), pd.melt(pdf, value_vars="C"))
    list_eq(
        dd.melt(ddf, value_vars=["A", "C"], var_name="myvar"),
        pd.melt(pdf, value_vars=["A", "C"], var_name="myvar"),
    )
    list_eq(
        dd.melt(ddf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
        pd.melt(pdf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
    )
def additional_rows(table_career, var_value):

    def _fill_variables(row, var_value):
        years, values, starts, ends = yearly_value_converter(
            row[var_value], row.time_unit, row.start_date, row.end_date)
        col_y = ['year_{}'.format(i) for i in range(len(years))]
        col_v = ['value_{}'.format(i) for i in range(len(years))]
        col_s = ['start_{}'.format(i) for i in range(len(years))]
        col_e = ['end_{}'.format(i) for i in range(len(years))]
        row[col_y] = years
        row[col_v] = values
        row[col_s] = starts
        row[col_e] = ends
        return row

    table = table_career.copy()
    year_vars = ['year_{}'.format(i) for i in range(20)]
    value_vars = ['value_{}'.format(i) for i in range(20)]
    start_vars = ['start_{}'.format(i) for i in range(20)]
    end_vars = ['end_{}'.format(i) for i in range(20)]
    for year, value, start, end in zip(year_vars, value_vars,
                                       start_vars, end_vars):
        table[year] = np.nan
        table[value] = np.nan
        table[start] = np.nan
        table[end] = np.nan
    table = table.apply(lambda x: _fill_variables(x, var_value), axis=1)

    id_vars = [var_name for var_name in table.columns
               if var_name not in year_vars + value_vars + start_vars
               + end_vars]
    df_years = pd.melt(table, id_vars=id_vars, value_vars=year_vars,
                       var_name='var_year', value_name='year_from_melt')
    to_concat = [df_years]
    # Dict lookup instead of eval() on a constructed variable name.
    melt_vars = {'value': value_vars, 'start': start_vars, 'end': end_vars}
    for to_add in ['value', 'start', 'end']:
        df_type = pd.melt(table, id_vars=['noind', 'start_date'],
                          value_vars=melt_vars[to_add],
                          var_name='var_' + to_add,
                          value_name=to_add + '_from_melt')
        assert (df_years['noind'] == df_type['noind']).all()
        df_type.drop(['noind', 'start_date'], inplace=True, axis=1)
        assert df_years.shape[0] == df_type.shape[0]
        to_concat += [df_type]
    # pd.concat's join_axes argument was removed in pandas 1.0;
    # reindexing each frame to df_years' index is equivalent here.
    df = pd.concat([frame.reindex(df_years.index) for frame in to_concat],
                   axis=1)
    del to_concat, table
    gc.collect()

    df = df.loc[df.value_from_melt.notnull(), :]
    df.drop([var_value, 'year', 'start_date', 'end_date', 'var_value',
             'var_year', 'var_start', 'var_end'], inplace=True, axis=1)
    df.rename(columns={'value_from_melt': var_value,
                       'year_from_melt': 'year',
                       'end_from_melt': 'end_date',
                       'start_from_melt': 'start_date'}, inplace=True)
    df['time_unit'] = 'year'
    return df.sort_values(['noind', 'year', 'start_date'])
def parse_sub(sub, office, district):
    sub = sub.reset_index(drop=True)
    # Special case these. Needs to be cleaned up and generalized.
    if (office, district) == ('U.S. House', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:-1].reset_index(drop=True),
                         sub.iloc[10:14, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    elif (office, district) == ('State Assembly', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    elif (office, district) == ('U.S. House', '24'):
        sub = pd.concat([sub.iloc[0:6, 0:-1].reset_index(drop=True),
                         sub.iloc[7:13, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    sub.columns = ['county'] + \
        sub.iloc[:, 1:-1].iloc[0].fillna('').tolist() + ['office']
    sub = sub.dropna(axis=1, how='all')
    sub = sub.rename(columns=parse_candidate)
    parties = sub.iloc[:, 1:-1].iloc[1].to_dict()
    sub = sub[sub.county.isin(COUNTIES)]
    sub = pd.melt(sub, id_vars=['county', 'office'],
                  value_vars=sub.columns.tolist()[1:-1],
                  var_name='candidate', value_name='votes')
    sub['party'] = sub.candidate.apply(lambda x: parties[x])
    sub = sub.assign(office=office, district=district)
    return sub[fieldnames]
def timePlotLine(data):
    normalize = input("Would you like to normalize the y-axis? (y/n): ")
    geneNamesDict = {}
    for _, row in data.iterrows():
        geneNamesDict[row['Gene']] = 1
    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Time'])
    geneList = geneNamesDict.keys()
    ylabel = input("What should the y-axis label be?: ")
    counter = 1
    for key in geneList:
        plt.figure(counter)
        tempTable = data[key]
        tempTable = tempTable.T
        tempTable = tempTable.dropna(axis=1, how='any')
        if normalize == 'y':
            tempTable = tempTable / np.amax(tempTable.values)
        tempTable['Time'] = tempTable.index
        tempTable = pd.melt(tempTable, id_vars='Time')[['Time', 'value']]
        sns.regplot(x='Time', y='value', data=tempTable, scatter=True)
        plt.title(key)
        plt.ylabel(ylabel)
        plt.xlabel('Time(min)')
        counter += 1
    plt.show()
def FormatToPrevise(df2_1, df2_2):
    # Drop the DATETIME index so the two Historian frames can be merged.
    df2_1 = df2_1.reset_index(drop=False)
    df2_2 = df2_2.reset_index(drop=False)
    # Convert Historian files to VTQ format
    # (DATETIME, TAGNAME, DESCRIPTION, VALUE).
    mdf = pd.merge(
        pd.melt(df2_1, id_vars=['DATETIME'], var_name='TAGNAME',
                value_name='DESCRIPTION')[['TAGNAME', 'DESCRIPTION']],
        pd.melt(df2_2, id_vars=['DATETIME'], var_name='TAGNAME',
                value_name='VALUE'),
        on=['TAGNAME'])
    # Order columns to match the VTQ format.
    mdf = mdf[['DATETIME', 'TAGNAME', 'DESCRIPTION', 'VALUE']]
    return mdf
def check_interval(filename):
    df = pd.read_csv(inputdir + filename)
    df.rename(columns=lambda x: x[:8] if x != 'Timestamp' else x,
              inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df.set_index(pd.DatetimeIndex(df['Timestamp']), inplace=True)
    # resample(..., how='sum') is deprecated; use the method chain instead.
    df_re = df.resample('M').sum()
    cols = list(df_re)
    df_re.reset_index(inplace=True)
    df_long = pd.melt(df_re, id_vars='index', value_vars=cols)
    df_long.rename(columns={'index': 'Timestamp',
                            'variable': 'Building_Number',
                            'value': 'Electricity_(KWH)'}, inplace=True)
    df_long['month'] = df_long['Timestamp'].map(lambda x: x.month)
    df_long['year'] = df_long['Timestamp'].map(lambda x: x.year)
    col_str = ','.join(['\'{0}\''.format(x) for x in cols])
    conn = uo.connect('all')
    with conn:
        df = pd.read_sql('SELECT Building_Number, year, month, '
                         '[Electricity_(KWH)] FROM EUAS_monthly '
                         'WHERE Building_Number IN ({0}) '
                         'AND year = \'2015\''.format(col_str), conn)
    df_long.drop('Timestamp', axis=1, inplace=True)
    df_all = pd.merge(df, df_long, how='left',
                      on=['Building_Number', 'year', 'month'],
                      suffixes=['_EUAS', '_ION'])
    df_all['ratio'] = df_all['Electricity_(KWH)_ION'] / \
        df_all['Electricity_(KWH)_EUAS'].map(lambda x: round(x, 3))
    df_all['percent_diff'] = df_all['ratio'].map(lambda x: abs(1 - x) * 100.0)
    return df_all
def __call__(self, df):
    df_cols = df.columns.values.tolist()
    id_vals = [col._name for col in self.args[2]]
    id_vars = [col for col in df_cols if col not in id_vals]
    key = self.args[0]
    value = self.args[1]
    # Positional pandas.melt arguments: frame, id_vars, value_vars,
    # var_name, value_name — so the columns named in args[2] are melted
    # and everything else is kept as an identifier.
    return pandas.melt(df, id_vars, id_vals, key, value)
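# For reference, a self-contained sketch of the direct pandas call this
# wrapper reduces to, using hypothetical column names: the columns in
# self.args[2] become value_vars, the rest stay as id_vars, and
# args[0]/args[1] supply var_name/value_name.
import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'a': [10, 20], 'b': [30, 40]})
# Equivalent to calling the wrapper with key='key', value='value',
# and columns ('a', 'b') named in args[2]:
tidy = pd.melt(df, id_vars=['id'], value_vars=['a', 'b'],
               var_name='key', value_name='value')
print(tidy)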
def gg_funcs(functions, bottom, top, N=1000, labels=["Baseline"],
             title="Consumption and Cash-on-Hand", ylab="y", xlab="x",
             loc=loc, ltitle='Variable', file_name=None):
    if type(functions) == list:
        function_list = functions
    else:
        function_list = [functions]
    step = (top - bottom) / N
    x = np.arange(bottom, top, step)
    fig = pd.DataFrame({'x': x})
    # Evaluate each function on the grid, one labelled column per function.
    for i, function in enumerate(function_list):
        fig[labels[i]] = function(x)
    fig = pd.melt(fig, id_vars=['x'])
    g = gg.ggplot(fig) + \
        mp.base_plot + mp.line + mp.point + \
        mp.theme_bw(base_size=9) + mp.fte_theme + mp.colors + \
        gg.labs(title=title, y=ylab, x=xlab) + mp.legend_f(loc) + \
        mp.legend_t_c(ltitle) + mp.legend_t_s(ltitle)
    if file_name is not None:
        mp.ggsave(file_name, g)
    return g
def get_rep_data(C_prog_arg, rep):
    strain_1_file = root_path + "/" + str(C_prog_arg) \
        + "/J0." + str(rep) + "." + str(C_prog_arg)
    strain_2_file = root_path + "/" + str(C_prog_arg) \
        + "/J1." + str(rep) + "." + str(C_prog_arg)
    newind = np.arange(260) + 1
    strain_1 = pd.read_csv(strain_1_file, sep="\t", header=None, names=newind)
    strain_2 = pd.read_csv(strain_2_file, sep="\t", header=None, names=newind)
    n = np.shape(strain_1)[0]
    strain_1['strain'] = 1
    strain_2['strain'] = 2
    strain_1['t'] = np.linspace(0, n * dt * prntime / 365, n)
    strain_2['t'] = np.linspace(0, n * dt * prntime / 365, n)
    out = pd.merge(strain_1, strain_2,
                   # on=('t', 'strain'),
                   how='outer')
    # out = pd.concat([strain_1, strain_2], axis=1)
    out['rep'] = rep
    out = pd.melt(out, id_vars=['t', 'strain', 'rep'],
                  value_vars=list(newind),
                  var_name="city_newind", value_name='inc')
    return out
def expand(predicted, tsv_file):
    tsv = pd.read_csv(tsv_file, sep='\t')
    on = []
    for col in tsv.columns:
        if 'Dependency=' in col:
            new_col = col.replace(
                'Dependency=',
                'building_characteristics_report.').lower().replace(' ', '_')
            tsv = tsv.rename(columns={col: new_col})
            on.append(new_col)
    try:
        predicted = predicted.reset_index()
        predicted = predicted.merge(tsv, on=on, how='left')
    except KeyError as ke:
        sys.exit('Column {} does not exist.'.format(ke))
    id_vars = []
    value_vars = []
    for col in predicted.columns:
        if 'Option=' in col:
            value_vars.append(col)
        else:
            id_vars.append(col)
    # Melted column is named after the tsv file, lowercased and
    # underscore-separated.
    var_name = 'building_characteristics_report.{}'.format(
        os.path.basename(tsv_file).replace('.tsv', '')
        .lower().replace(' ', '_'))
    melted = pd.melt(predicted, id_vars=id_vars, value_vars=value_vars,
                     var_name=var_name, value_name='frac')
    melted = melted.set_index('_id')
    melted[var_name] = melted[var_name].str.replace('Option=', '')
    return melted
def plot_clf_polar(clf, cmap=None, key='nickname', n_topics=60, n_top=3,
                   labels=None, topics=None, mask=None, selection='top',
                   metric='correlation', max_val=None):
    import pandas as pd
    import seaborn as sns

    # Set up topic nicknames
    word_keys = pd.read_csv(
        "../data/unprocessed/abstract_topics_filtered/topic_sets/topic_keys"
        + str(n_topics) + "-july_cognitive.csv")
    word_keys['topic_name'] = "topic" + word_keys['topic'].astype('str')

    o_fi = pd.DataFrame(clf.odds_ratio)
    # Melt feature importances, and add top_words for each feature
    o_fi['region'] = range(1, o_fi.shape[0] + 1)
    o_fis_melt = pd.melt(o_fi, var_name='topic_order',
                         value_name='importance', id_vars=['region'])

    word_keys = pd.merge(
        pd.DataFrame(np.array([range(0, clf.feature_importances.shape[1]),
                               clf.feature_names]).T,
                     columns=['topic_order', 'topic_name']),
        word_keys)
    word_keys.topic_order = word_keys.topic_order.astype('int')

    o_fis_melt = pd.merge(o_fis_melt, word_keys)
    o_fis_melt['abs_imp'] = np.abs(o_fis_melt['importance'])

    if mask is not None:
        o_fis_melt = o_fis_melt[o_fis_melt.region.isin(mask)]
    if topics is not None:
        o_fis_melt = o_fis_melt[o_fis_melt[key].isin(topics)]

    pplot = pd.pivot_table(o_fis_melt, values='importance',
                           index=[key], columns=['region'])

    if cmap is None:
        cmap = sns.color_palette('Set1', clf.feature_importances.shape[0])
    if mask is not None:
        cmap = [n[0] for n in sorted(zip(np.array(cmap)[np.array(mask) - 1],
                                         mask), key=lambda tup: tup[1])]

    return plot_polar(pplot, overplot=True, palette=cmap, n_top=n_top,
                      metric=metric, selection=selection, label_size=30,
                      labels=labels, max_val=max_val)
def tx_modes_plot(consensus_data, ordered_genomes, tx_mode_plot_tgt):
    ordered_groups = ['transMap', 'transMap+TM', 'transMap+TMR',
                      'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR', 'CGP', 'PB',
                      'Other']
    ordered_groups = OrderedDict([[frozenset(x.split('+')), x]
                                  for x in ordered_groups])

    def split_fn(s):
        return ordered_groups.get(
            frozenset(s['Transcript Modes'].replace('aug', '').split(',')),
            'Other')

    modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes')
    df = modes_df.pivot(index='genome',
                        columns='Transcript Modes').transpose().reset_index()
    df['Modes'] = df.apply(split_fn, axis=1)
    df = df[['Modes'] + ordered_genomes]
    # itervalues() is Python 2 only; values() preserves insertion order too.
    ordered_values = [x for x in ordered_groups.values()
                      if x in set(df['Modes'])]
    with tx_mode_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        title_string = 'Transcript modes in protein coding consensus gene set'
        ylabel = 'Number of transcripts'
        if len(ordered_genomes) > 1:
            df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values,
                                                 ordered=True)
            df = df.sort_values('Ordered Modes')
            df = df[['Ordered Modes'] + ordered_genomes].set_index(
                'Ordered Modes')
            df = df.fillna(0)
            generic_stacked_barplot(df, pdf, title_string, df.index, ylabel,
                                    ordered_genomes, 'Transcript mode(s)',
                                    bbox_to_anchor=(1.25, 0.7))
        else:
            generic_barplot(pd.melt(df, id_vars='Modes'), pdf,
                            'Transcript mode(s)', ylabel, title_string,
                            x='Modes', y='value', order=ordered_values)
def test_linetype():
    meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']],
                       id_vars='date')
    p = ggplot(aes(x='date', y='value', colour='variable',
                   linetype='variable', shape='variable'),
               data=meat_lng) + \
        geom_line() + geom_point() + \
        ylim(0, 3000)
    assert_same_ggplot(p, "legend_linetype")
def generateBathroomTilePlot(bl_vs_change_json):
    df = pd.read_json(bl_vs_change_json)
    summary_regions = [
        'ctx-lh-parsorbitalis', 'ctx-rh-parsorbitalis',
        'ctx-rh-lateralorbitofrontal', 'ctx-lh-lateralorbitofrontal',
        'ctx-rh-frontalpole', 'ctx-rh-parstriangularis',
        'ctx-lh-frontalpole', 'ctx-lh-parstriangularis',
        'ctx-lh-caudalanteriorcingulate', 'ctx-rh-rostralmiddlefrontal',
        'ctx-lh-caudalmiddlefrontal', 'ctx-rh-caudalanteriorcingulate',
        'ctx-rh-rostralanteriorcingulate', 'ctx-lh-rostralmiddlefrontal',
        'ctx-rh-caudalmiddlefrontal', 'ctx-lh-superiorparietal',
        'ctx-rh-isthmuscingulate', 'ctx-lh-rostralanteriorcingulate',
        'ctx-rh-parsopercularis', 'ctx-rh-superiorparietal',
        'ctx-lh-parsopercularis', 'ctx-rh-medialorbitofrontal',
        'ctx-lh-isthmuscingulate', 'ctx-lh-supramarginal',
        'ctx-lh-inferiorparietal', 'ctx-rh-supramarginal',
        'ctx-lh-superiorfrontal', 'ctx-rh-superiorfrontal',
        'ctx-rh-middletemporal', 'ctx-lh-middletemporal',
        'ctx-rh-inferiorparietal', 'ctx-rh-superiortemporal',
        'ctx-lh-posteriorcingulate', 'ctx-lh-precuneus',
        'ctx-lh-medialorbitofrontal', 'ctx-lh-superiortemporal',
        'ctx-rh-posteriorcingulate', 'ctx-rh-precuneus']
    ordering = {x: i for i, x in enumerate(summary_regions)}
    rank_by = summary_regions  # could take subset of cortical summary regions
    subjects = GROUPS['increasing_low']['N']
    df = df[df['rid'].isin(subjects)]
    baseline_keys = ["%s_bl" % _ for _ in rank_by]
    change_keys = ["%s_change" % _ for _ in summary_regions]
    df['rank'] = df[baseline_keys].mean(axis=1)
    keep_keys = ['rid', 'rank'] + change_keys
    df = df[keep_keys]
    df_long = pd.melt(df, id_vars=['rank'], value_vars=change_keys)
    # sort change
    df_long['variable'] = [_.replace('_change', '')
                           for _ in df_long['variable']]
    df_long['variable'] = ['%s_%s' % (str(ordering[_]).zfill(2), _)
                           for _ in df_long['variable']]
    print(ggplot(aes(x='variable', y='rank'), data=df_long)
          + geom_tile(aes(fill='value'))
          + theme(axis_text_x=element_text(angle=270, size=8),
                  axis_text_y=element_text(size=6)))
def parse_dates2(df):
    # Cast any integer column (except the first) to float;
    # convert_objects was removed from pandas, so use pd.to_numeric.
    for k in range(len(df.columns)):
        vtype = df.apply(pd.to_numeric, errors='ignore').dtypes[k]
        if vtype == 'int64' and k != 0:
            df[df.columns[k]] = df[df.columns[k]].astype(float)
    df.drop(df.columns[-1], axis=1, inplace=True)
    df.rename(columns={'HORA UTC': 'date'}, inplace=True)
    df = pd.melt(df, id_vars=["date"]).rename(columns={'variable': 'hour'})
    df['hour'] = df['hour'].astype(str)
    df.hour = df.hour.apply(lambda x: '%04i' % int(x))
    df.hour = df.hour.apply(lambda x: x[:2])
    df.date = df.apply(
        lambda x: pd.to_datetime(x.date, format="%Y-%m-%d")
        + timedelta(hours=int(x.hour)), axis=1)
    # `var` is expected to be defined in the enclosing scope.
    df.rename(columns={'value': var}, inplace=True)
    df.drop('hour', axis=1, inplace=True)
    df.set_index('date', inplace=True)
    df.sort_index(inplace=True)
    return df
def parse_los_angeles():
    output_columns = ['county', 'precinct', 'office', 'district', 'party',
                      'candidate', 'votes']
    sovc_zip_url = 'https://www.lavote.net/documents/SVC/3744_SVC_Excel.zip'
    sovc_zip = requests.get(sovc_zip_url)
    if sovc_zip.status_code != 200:
        return
    f = tempfile.NamedTemporaryFile()
    f.write(sovc_zip.content)
    sovc_zf = zipfile.ZipFile(f.name)
    df = pd.read_excel(sovc_zf.open(
        '34TH_CONGRESS_DIST_U-T_06-06-17_Voter_Nominated_by_Precinct'
        '_3744-5055.xls'))
    df.columns = df.loc[1]
    df = df[df.TYPE == 'TOTAL']
    table = pd.melt(df, id_vars=['PRECINCT'],
                    value_vars=df.columns.tolist()[8:-1],
                    var_name='candidate', value_name='votes').assign(
        county='Los Angeles', office='U.S. House', district='34').rename(
        columns={'PRECINCT': 'precinct'}).replace({'candidate': candidates})
    parties = {k: 'DEM' for k in candidates.values()}
    table['party'] = table.candidate.apply(lambda x: parties[x])
    for x in ['candidate', 'district', 'office', 'precinct', 'county']:
        table = table.sort_values(by=x, kind='mergesort')
    table[output_columns].to_csv(
        '2017/20170606__ca__special__general__los_angeles__precinct.csv',
        index=False)
def main():
    df = pd.read_csv("./attention.csv")
    df = pd.melt(df, ["subidr", "attnr"], var_name="solutions",
                 value_name="score")
    # The melted column names end in a digit; keep just that digit.
    df.solutions = df.solutions.str[-1].astype(int)
    df.columns = ["subject", "attention", "solutions", "score"]
    df.to_csv("attention.csv")
def donutchart(*args, **kwargs):
    # Get info from the submitted data.
    data = kwargs.get('data', 'None')
    ids = kwargs.get('ids', 'None')
    vals = kwargs.get('vals', 'None')
    val_name = kwargs.get('val_name', 'None')
    v_name = kwargs.get('v_name', 'None')
    out_file = kwargs.get('out_file', 'None')
    # The original condition short-circuited on any truthy value; the intent
    # is to bail out when any required argument is missing.
    if 'None' in (data, ids, vals, val_name):
        return "Data must be submitted"
    df = df_from_json(data)
    df = df.sort_values("total", ascending=False)
    df = pd.melt(df, id_vars=[ids], value_vars=[vals],
                 value_name=val_name, var_name=v_name)
    d = Donut(df, label=[ids, v_name], values=v_name,
              text_font_size='8pt', hover_text='vals')
    output_file(out_file)
    save(d)
def reshape(school_attendance):
    # reshape the data into a more normal form
    x = pd.melt(school_attendance,
                id_vars=['school_year', 'lea_name', 'lea_number',
                         'school_number', 'school_name', 'grade_level'])

    def get_sex(v):
        if v.endswith('_Male'):
            return 'Male'
        if v.endswith('_Female'):
            return 'Female'
        assert False, 'can not get here'

    def get_race(v):
        # str.rstrip strips a character set, not a suffix, so slicing is
        # used to drop the '_Male' / '_Female' endings.
        if v.endswith('_Male'):
            return v[:-5].replace('_', ' ')
        if v.endswith('_Female'):
            return v[:-7].replace('_', ' ')
        assert False, 'can not get here'

    x['sex'] = x.variable.map(get_sex)
    x['race'] = x.variable.map(get_race)
    x['attendance'] = x.value
    del x['variable']
    del x['value']
    return x
def process_load_data(filename):
    # import data
    df = pd.read_csv(filename, parse_dates=[[1, 2, 3]], thousands=',')
    # unpivot
    df = pd.melt(df, id_vars=['year_month_day', 'zone_id'], var_name='hour')
    # drop rows where value is NaN
    df.dropna(inplace=True)
    # drop where zone_id = 21 [this is just a total row that occurs in
    # solution data only]
    df = df[df.zone_id != 21]
    # create datetime col
    df.hour = df.hour.str.replace('h', '')
    df.hour = pd.to_timedelta(df.hour.astype(int) - 1, unit='h')
    df['datetime'] = df.year_month_day + df.hour
    # drop and reorder columns
    df = df[['datetime', 'zone_id', 'value']].copy()
    # add weights
    df['weight'] = 1
    # increase weight on future predictions - where datetime > 2008-06-30
    # 05:30 (a leading zero such as 05 is a syntax error in Python 3)
    predictions_start_datetime = datetime(2008, 6, 30, 5, 30, 0)
    df.loc[df['datetime'] > predictions_start_datetime, 'weight'] = 8
    # add trend variable [incremental number of hours]
    trend_start_datetime = datetime(2004, 1, 1, 0, 0, 0)
    df['trend'] = (df.datetime - trend_start_datetime) / \
        np.timedelta64(1, 'h') + 1
    df = df.sort_values(by=['zone_id', 'datetime'], ascending=[True, True])
    return df
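# A quick check of the trend arithmetic on toy timestamps (a sketch with
# made-up data, not from the original): dividing a timedelta by
# np.timedelta64(1, 'h') converts it to a float number of hours.
import numpy as np
import pandas as pd
from datetime import datetime

trend_start = datetime(2004, 1, 1, 0, 0, 0)
ts = pd.Series(pd.to_datetime(['2004-01-01 00:00', '2004-01-02 00:00']))
trend = (ts - trend_start) / np.timedelta64(1, 'h') + 1
print(trend.tolist())  # [1.0, 25.0]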
def run_t_test_app():
    st.header('■t-test')
    st.write('To compare the results of two tests, e.g., examine the '
             'difference in performance by teaching method.')

    st.sidebar.subheader('Data Upload')

    df_edu = pd.read_csv("data/eng_sample_data_t_test.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
        b64 = base64.b64encode(object_to_download.encode()).decode()
        return (f'<a href="data:file/txt;base64,{b64}" '
                f'download="{download_filename}">{download_link_text}</a>')

    tmp_download_link = download_link(df_edu, 'sample_ttest.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    uploaded_file = st.sidebar.file_uploader(
        "File upload (Drag and drop or use [Browse files] button to import "
        "csv file. Only utf-8 format is available.)",
        type=["csv"])

    try:
        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')
            if display_data:
                st.dataframe(df_edu)
        else:
            df_edu = pd.read_csv('data/eng_sample_data_t_test.csv')
            show_df = st.sidebar.checkbox('Show DataFrame')
            if show_df:
                st.write(df_edu)

        # F-test for equality of variances between the two groups.
        A_var = np.var(df_edu.iloc[:, 0], ddof=1)
        B_var = np.var(df_edu.iloc[:, 1], ddof=1)
        A_df = len(df_edu) - 1
        B_df = len(df_edu) - 1
        f = A_var / B_var
        one_sided_pval1 = stats.f.cdf(f, A_df, B_df)
        one_sided_pval2 = stats.f.sf(f, A_df, B_df)
        two_sided_pval = min(one_sided_pval1, one_sided_pval2)

        st.subheader(
            "Confirmation of equality of variance between the two groups "
            "(p-value < 0.05: unequal variances, Welch's t-test is applied; "
            "p-value > 0.05: equal variances, Student's t-test is applied)")
        dist = round(two_sided_pval, 3)
        st.write('F ', round(f, 3))
        st.write('p-value:', round(two_sided_pval, 3))

        if dist < 0.05:
            # equal_var=False selects Welch's t-test; without it both
            # branches would run Student's test.
            result_w = stats.ttest_ind(df_edu.iloc[:, 0], df_edu.iloc[:, 1],
                                       equal_var=False)
            st.subheader('t-test results (Welch)')
            st.write(result_w)
        else:
            result_s = stats.ttest_ind(df_edu.iloc[:, 0], df_edu.iloc[:, 1])
            st.subheader('t-test results (Student)')
            st.write(result_s)

        st.set_option('deprecation.showPyplotGlobalUse', False)
        st.write(sns.catplot(x='variable', y='value', kind='box',
                             data=pd.melt(df_edu)))
        plt.title('Comparison between the two groups', fontsize=15)
        plt.show()
        st.pyplot()

    except Exception:
        st.header('ERROR: Data inconsistency. Check the format of the '
                  'uploaded data.')
        print('Data inconsistency error')
# From long format:
#   client    propcode  propfp   pctile  adj
#   test      127306    2B1B-A4  0.9     69.23
#   test      127306    2B1B-A4  1       0
# to something like this:
#   client    propcode  propfp      0    0.1    0.2    0.3    0.4    0.5    0.6    0.7   0.8          0.9    1
#   venterra  127306    1B1B-A123*  100  99.11  98.32  97.05  95.53  93.99  92.16  89.9  85.5         65.05  0
#   venterra  127306    2B1B-A4     100  98.91  97.82  96.73  95.64  94.55  93.46  90.4  88.65333333  69.23  0
# Python code for it would be:
df_fix = df.pivot_table(index=['client', 'propcode', 'propfp'],
                        columns='pctile', values='adj')
df_fix.columns = df_fix.columns.get_level_values('pctile')
df_fix.reset_index(inplace=True)

# From pivot-table format back to the original format, use melt.
v = ['client', 'propcode', 'propfp']
df = pd.melt(df_fix, id_vars=v, var_name='pctile', value_name='adj')

"""
h2o.ai in python
"""
import h2o

# Start the h2o cluster / shut it down
h2o.init()
h2o.cluster().shutdown()
h2o.cluster().show_status()  # check cluster status

# Data exchange between pandas and h2o
df_h2o = h2o.H2OFrame(df)    # import pandas dataframe to h2o frame
df = df_h2o.as_data_frame()  # export h2o frame to pandas dataframe
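# A round-trip sketch on toy rows (values are made up) confirming that
# pivot_table followed by melt recovers the original long layout:
import pandas as pd

long_df = pd.DataFrame({'client': ['test'] * 3,
                        'propcode': [127306] * 3,
                        'propfp': ['2B1B-A4'] * 3,
                        'pctile': [0.8, 0.9, 1.0],
                        'adj': [88.65, 69.23, 0.0]})
wide = long_df.pivot_table(index=['client', 'propcode', 'propfp'],
                           columns='pctile', values='adj')
wide.columns = wide.columns.get_level_values('pctile')
wide.reset_index(inplace=True)
back = pd.melt(wide, id_vars=['client', 'propcode', 'propfp'],
               var_name='pctile', value_name='adj')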
import pandas as pd

filename = 'Inj_Prodbywell.xls'
df = pd.read_table(filename)

df_inj = pd.melt(
    df,
    id_vars=['Apino', 'Company', 'Inj_type', 'Field', 'Formation', 'Year'],
    value_vars=[
        'Jan_Inj', 'Feb_Inj', 'Mar_Inj', 'Apr_Inj', 'May_Inj', 'Jun_Inj',
        'Jul_Inj', 'Aug_Inj', 'Sep_Inj', 'Oct_Inj', 'Nov_Inj', 'Dec_Inj'
    ],
    var_name="Month_val",
    value_name="Inj")
df_inj['Month_val'] = df_inj['Month_val'].replace(to_replace='_Inj',
                                                  value='', regex=True)
month_transform = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}
df_inj['Month_val'] = df_inj['Month_val'].map(month_transform)
# model8-3 avoids the warning this way, but sampling does not go well...
# Predictive distribution
probs = (2.5, 50, 97.5)
qua = np.transpose(np.percentile(mcmc_sample['y_new'], probs, axis=0),
                   axes=(1, 2, 0))
d_est = pandas.DataFrame(qua.reshape((-1, 3)),
                         columns=['p{}'.format(p) for p in probs])
d_est['PersonID'] = np.repeat(np.arange(N) + 1, T_new)
d_est['Time'] = np.tile(Time_new, N)
print(d_est)

Time_tbl = pandas.Series(Time, index=['Time{}'.format(t) for t in Time])
d = pandas.melt(data_conc2, id_vars='PersonID', var_name='Time',
                value_name='Y')
d['Time'] = Time_tbl[d['Time']].values

_, axes = plt.subplots(4, 4, figsize=figaspect(7 / 8) * 1.5)
for (row, col), ax in np.ndenumerate(axes):
    person = row * 4 + col + 1
    ax.fill_between('Time', 'p2.5', 'p97.5',
                    data=d_est.query('PersonID==@person'),
                    color='k', alpha=1 / 5)
    ax.plot('Time', 'p50', data=d_est.query('PersonID==@person'), color='k')
    ax.scatter('Time', 'Y', data=d.query('PersonID==@person'), color='k')
    if row < 3:
# .copy() avoids SettingWithCopyWarning when renaming the sliced frame.
gives_df = data[[
    'playerId', 'meanNumberOfGives_A', 'meanNumberOfGives_B',
    'meanNumberOfGives_C', 'meanNumberOfGives_D'
]].copy()
gives_df.rename(columns={
    'meanNumberOfGives_A': 'A',
    'meanNumberOfGives_B': 'B',
    'meanNumberOfGives_C': 'C',
    'meanNumberOfGives_D': 'D'
}, inplace=True)

gives_melted_df = pd.melt(gives_df, id_vars=['playerId'],
                          value_vars=['A', 'B', 'C', 'D'])
gives_melted_df.rename(columns={
    'variable': 'ScoreSystem',
    'value': 'gives'
}, inplace=True)
# print(gives_melted_df)

#################################### Takes

takes_df = data[[
    'playerId', 'meanNumberOfTakes_A', 'meanNumberOfTakes_B',
    'meanNumberOfTakes_C', 'meanNumberOfTakes_D'
def rank_genes_groups_violin(adata, groups=None, n_genes=20, gene_names=None,
                             gene_symbols=None, use_raw=None, key=None,
                             split=True, scale='width', strip=True,
                             jitter=True, size=1, ax=None, show=None,
                             save=None):
    """\
    Plot ranking of genes for all tested comparisons.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    groups : list of `str`, optional (default: `None`)
        List of group names.
    n_genes : `int`, optional (default: 20)
        Number of genes to show. Is ignored if `gene_names` is passed.
    gene_names : `None` or list of `str` (default: `None`)
        List of genes to plot. Is only useful if interested in a custom gene
        list, which is not the result of
        :func:`scanpy.api.tl.rank_genes_groups`.
    gene_symbols : `str`, optional (default: `None`)
        Key for field in `.var` that stores gene symbols if you do not want
        to use `.var_names` displayed in the plot.
    use_raw : `bool`, optional (default: `None`)
        Use `raw` attribute of `adata` if present. Defaults to the value
        that was used in :func:`~scanpy.api.tl.rank_genes_groups`.
    split : `bool`, optional (default: `True`)
        Whether to split the violins or not.
    scale : `str`, optional (default: 'width')
        See `seaborn.violinplot`.
    strip : `bool`, optional (default: `True`)
        Show a strip plot on top of the violin plot.
    jitter : `int`, `float`, `bool`, optional (default: `True`)
        If set to 0, no points are drawn. See `seaborn.stripplot`.
    size : `int`, optional (default: 1)
        Size of the jitter points.
    {show_save_ax}
    """
    if key is None:
        key = 'rank_genes_groups'
    groups_key = str(adata.uns[key]['params']['groupby'])
    if use_raw is None:
        use_raw = bool(adata.uns[key]['params']['use_raw'])
    reference = str(adata.uns[key]['params']['reference'])
    groups_names = (adata.uns[key]['names'].dtype.names
                    if groups is None else groups)
    if isinstance(groups_names, str):
        groups_names = [groups_names]
    axs = []
    for group_name in groups_names:
        if gene_names is None:
            gene_names = adata.uns[key]['names'][group_name][:n_genes]
        df = pd.DataFrame()
        new_gene_names = []
        for g in gene_names:
            if adata.raw is not None and use_raw:
                X_col = adata.raw[:, g].X
            else:
                X_col = adata[:, g].X
            if issparse(X_col):
                X_col = X_col.toarray().flatten()
            new_gene_names.append(
                g if gene_symbols is None else adata.var[gene_symbols][g])
            df[g] = X_col
        df['hue'] = adata.obs[groups_key].astype(str).values
        if reference == 'rest':
            df.loc[df['hue'] != group_name, 'hue'] = 'rest'
        else:
            df.loc[~df['hue'].isin([group_name, reference]), 'hue'] = np.nan
        df['hue'] = df['hue'].astype('category')
        df_tidy = pd.melt(df, id_vars='hue', value_vars=new_gene_names)
        x = 'variable'
        y = 'value'
        hue_order = [group_name, reference]
        import seaborn as sns
        _ax = sns.violinplot(x=x, y=y, data=df_tidy, inner=None,
                             hue_order=hue_order, hue='hue', split=split,
                             scale=scale, orient='vertical', ax=ax)
        if strip:
            _ax = sns.stripplot(x=x, y=y, data=df_tidy, hue='hue',
                                dodge=True, hue_order=hue_order,
                                jitter=jitter, color='black', size=size,
                                ax=_ax)
        _ax.set_xlabel('genes')
        _ax.set_title('{} vs. {}'.format(group_name, reference))
        _ax.legend_.remove()
        _ax.set_ylabel('expression')
        _ax.set_xticklabels(gene_names, rotation='vertical')
        writekey = ('rank_genes_groups_'
                    + str(adata.uns[key]['params']['groupby'])
                    + '_' + group_name)
        utils.savefig_or_show(writekey, show=show, save=save)
        axs.append(_ax)
    if show == False:
        return axs
def index():
    # extract data needed for visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)

    category_melt = pd.melt(df,
                            id_vars=['id', 'message', 'original', 'genre'],
                            var_name='category')
    category_counts = category_melt.groupby('category').sum()['value']
    category_names = list(category_counts.index)

    df['message_len'] = df.message.str.len()
    message_lens = df['message_len']

    # create visuals
    graphs = [
        {
            'data': [Bar(x=genre_names, y=genre_counts)],
            'layout': {
                'title': 'Distribution of Message Genres',
                'yaxis': {'title': "Count"},
                'xaxis': {'title': "Genre"},
            }
        },
        {
            'data': [Bar(x=category_names, y=category_counts)],
            'layout': {
                'title': 'Distribution of Message Categories',
                'yaxis': {'title': "Count"},
                'xaxis': {'title': "Category", 'tickangle': -45},
            }
        },
        {
            'data': [{'type': 'histogram', 'x': message_lens}],
            'layout': {
                'title': 'Histogram of Message Lengths',
                'yaxis': {'title': "Count"},
            }
        },
    ]

    # encode plotly graphs in JSON
    ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)

    # render web page with plotly graphs
    return render_template('master.html', ids=ids, graphJSON=graphJSON)
# Check that index column was added

# In[46]:

common_dict['index_col'].head()

# <b> Unpivot columns other than the index </b>

# In[47]:

common_dict_melt = pd.melt(common_dict, id_vars=['index_col'])

# In[48]:

common_dict_melt.head()

# <p> <b> Remove {} </b> </p>

# In[49]:

common_dict_melt['value'] = common_dict_melt['value'].map(
    lambda x: x.lstrip('{').rstrip('}'))
zip_ref.close()

'''
Process data
'''
# read in csv file as Dataframe
df = pd.read_csv(raw_data_file_unzipped + '/SE4ALLData.csv')

# subset for renewable energy consumption data
df_subset = df[df['Indicator Name'].str.contains(
    'Renewable energy consumption')]

# convert tables from wide form (each year is a column) to long form
# (a single column of years and a single column of values)
year_list = [str(year) for year in range(1990, 2017)]  # check
df_long = pd.melt(df_subset, id_vars=['Country Name', 'Country Code'],
                  value_vars=year_list, var_name='year',
                  value_name='renewable energy consumption')

# convert year column from object to integer
df_long.year = df_long.year.astype('int64')

# save processed dataset to csv
processed_data_file = data_dir + dataset_name + '_edit.csv'
df_long.to_csv(processed_data_file, index=False)

'''
Upload processed data to Carto
'''
print('Uploading processed data to Carto.')
# set up carto authentication using local variables for username
# (CARTO_WRI_RW_USER) and API key (CARTO_WRI_RW_KEY)
auth_client = APIKeyAuthClient(api_key=os.getenv('CARTO_WRI_RW_KEY'),
                       aggfunc='sum')

# In[27]:

table1 = pd.pivot_table(table, index='cust_id', columns='type',
                        values='Monetary', fill_value=0,
                        aggfunc=np.sum).reset_index()  # reset the index back to the original

# In[28]:

pd.melt(table1,
        id_vars='cust_id',
        value_vars=['Normal', 'Special_offer'],
        value_name='Monetary',
        var_name='TYPE')

# ### 5.1.8 Assignment and conditional assignment

# #### 1. Assignment

# In[29]:

sample = pd.DataFrame({
    'name': ['Bob', 'Lindy', 'Mark', 'Miki', 'Sully', 'Rose'],
    'score': [99, 78, 999, 77, 77, np.nan],
    'group': [1, 1, 1, 2, 1, 2],
})
            min_permuted_scores[(tissue, subset, 'permuted', 'min')].append(
                lasso_perm.score(X_final_test_sub, y_test_sub))

real = pd.DataFrame.from_dict(scores)
real = real.T.reset_index()
permuted = pd.DataFrame.from_dict(permuted_scores)
permuted = permuted.T.reset_index()
min_real = pd.DataFrame.from_dict(min_scores)
min_real = min_real.T.reset_index()
min_permuted = pd.DataFrame.from_dict(min_permuted_scores)
min_permuted = min_permuted.T.reset_index()
everything = pd.concat([real, permuted, min_real, min_permuted])
everything = everything.rename(
    columns={
        'level_0': 'tissue',
        'level_1': 'training_set_size',
        'level_2': 'type',
        'level_3': 'test_set_size'
    })
everything = pd.melt(
    everything,
    id_vars=['tissue', 'training_set_size', 'type', 'test_set_size'],
    value_vars=[0, 1, 2])
with open('test_results', 'wb') as f:
    pickle.dump(everything, f)
def eia_mecs_energy_call(**kwargs):
    """
    Convert response for calling url to pandas dataframe,
    begin parsing df into FBA format
    :param kwargs: potential arguments include:
        url: string, url
        response_load: df, response from url call
        args: dictionary, arguments specified when running
            flowbyactivity.py ('year' and 'source')
    :return: pandas dataframe of original source data
    """
    # load arguments necessary for function
    response_load = kwargs['r']
    args = kwargs['args']

    # load .yaml file containing information about each energy table
    # (the .yaml includes information such as column names, units,
    # and which rows to grab)
    filename = 'EIA_MECS_energy tables'
    sourcefile = datapath + filename + '.yaml'
    with open(sourcefile, 'r') as f:
        table_dict = yaml.safe_load(f)

    # read raw data into dataframe
    # (include both Sheet 1 (data) and Sheet 2 (relative standard errors))
    df_raw_data = pd.read_excel(io.BytesIO(response_load.content),
                                sheet_name=0, header=None)
    df_raw_rse = pd.read_excel(io.BytesIO(response_load.content),
                               sheet_name=1, header=None)

    # retrieve table name from cell A3 of Excel file
    table = df_raw_data.iloc[2][0]
    # drop the table description (retain only table name)
    table = table.split(' ')[0]

    # for each of the census regions...
    # - grab the appropriate rows and columns
    # - add column names
    # - "unpivot" dataframe from wide format to long format
    # - add columns denoting census region, relative standard error, units
    # - concatenate census region data into master dataframe
    df_data = pd.DataFrame()
    for region in table_dict[args['year']][table]['regions']:

        # grab relevant columns
        # (this is a necessary step because code was retaining some
        # seemingly blank columns)
        # determine number of columns in table, based on number of
        # column names
        num_cols = len(table_dict[args['year']][table]['col_names'])
        # keep only relevant columns
        df_raw_data = df_raw_data.iloc[:, 0:num_cols]
        df_raw_rse = df_raw_rse.iloc[:, 0:num_cols]

        # grab relevant rows
        # get indices for relevant rows
        grab_rows = table_dict[args['year']][table]['regions'][region]
        grab_rows_rse = table_dict[args['year']][table]['rse_regions'][region]
        # keep only relevant rows
        df_data_region = pd.DataFrame(
            df_raw_data.loc[grab_rows[0] - 1:grab_rows[1] - 1]).reindex()
        df_rse_region = pd.DataFrame(
            df_raw_rse.loc[grab_rows_rse[0] - 1:
                           grab_rows_rse[1] - 1]).reindex()

        # assign column names
        df_data_region.columns = table_dict[args['year']][table]['col_names']
        df_rse_region.columns = table_dict[args['year']][table]['col_names']

        # "unpivot" dataframe from wide format to long format
        # ('NAICS code' and 'Subsector and Industry' are identifier variables)
        # (all other columns are value variables)
        df_data_region = pd.melt(
            df_data_region,
            id_vars=table_dict[args['year']][table]['col_names'][0:2],
            value_vars=table_dict[args['year']][table]['col_names'][2:],
            var_name='FlowName',
            value_name='FlowAmount')
        df_rse_region = pd.melt(
            df_rse_region,
            id_vars=table_dict[args['year']][table]['col_names'][0:2],
            value_vars=table_dict[args['year']][table]['col_names'][2:],
            var_name='FlowName',
            value_name='Spread')

        # add census region
        df_data_region['Location'] = region
        # add relative standard error data
        df_data_region = pd.merge(df_data_region, df_rse_region)

        # add units
        # if table name ends in 1, units must be extracted from flow names
        if table[-1] == '1':
            flow_name_array = df_data_region['FlowName'].str.split(
                r'\s+\|+\s')
            df_data_region['FlowName'] = flow_name_array.str[0]
            df_data_region['Unit'] = flow_name_array.str[1]
        # if table name ends in 2, units are 'Trillion Btu'
        elif table[-1] == '2':
            df_data_region['Unit'] = 'Trillion Btu'
            df_data_region['FlowName'] = df_data_region['FlowName']

        data_type = table_dict[args['year']][table]['data_type']
        if data_type == 'nonfuel consumption':
            df_data_region['Class'] = 'Other'
        elif data_type == 'fuel consumption':
            df_data_region['Class'] = 'Energy'

        # remove extra spaces before 'Subsector and Industry' descriptions
        df_data_region['Subsector and Industry'] = \
            df_data_region['Subsector and Industry'].str.lstrip(' ')

        # concatenate census region data with master dataframe
        df_data = pd.concat([df_data, df_data_region])

    return df_data
# continent/regional aggregates
agg_list = ['ARB', 'CSS', 'EAS', 'EMU', 'LCN', 'MEA', 'PSS', 'SAS', 'SSF']
agg = fs[fs.country_code.isin(agg_list)]

# individual countries - remove country names that represent aggregates
agg_country_code_list = [
    'ARB', 'CSS', 'EAS', 'EAP', 'CEA', 'EMU', 'ECS', 'ECA', 'CEU', 'EUU',
    'HPC', 'HIC', 'NOC', 'OEC', 'LCN', 'LAC', 'CLA', 'LDC', 'LMY', 'LIC',
    'LMC', 'MEA', 'MNA', 'CME', 'MIC', 'NAC', 'OED', 'OSS', 'PSS', 'SST',
    'SAS', 'CSA', 'SSF', 'SSA', 'CAA', 'UMC', 'WLD'
]
fs = fs[~fs.country_code.isin(agg_country_code_list)]

# reshape to put years in rows instead of columns
fs = pd.melt(fs, id_vars=['country', 'country_code', 'indicator'],
             var_name='year')
agg = pd.melt(agg, id_vars=['country', 'country_code', 'indicator'],
              var_name='year')
world = pd.melt(world, id_vars=['country', 'country_code', 'indicator'],
                var_name='year')

# reshape again to put indicators in columns instead of rows & save results
fs = pd.pivot_table(fs, values='value',
                    index=['country', 'country_code', 'year'],
                    columns=['indicator'])
agg = pd.pivot_table(agg, values='value',
cints = ions["CALIBRATED_INTENSITY"][full_anchor_ions]
cints_swim = cints[:, :9]
cints_udmse = cints[:, 9:]
cints_swim_cv = scipy.stats.variation(cints_swim, axis=1)
cints_udmse_cv = scipy.stats.variation(cints_udmse, axis=1)
results = scipy.stats.ttest_rel(cints_swim_cv, cints_udmse_cv)
log.printMessage(
    "SWIM/UDMSE median cvs (ttest: {}, pval: {}): {} {}".format(
        results[0],
        results[1],
        np.median(cints_swim_cv),
        np.median(cints_udmse_cv),
    ))
d = pd.melt(
    pd.DataFrame(np.stack([
        cints_swim_cv,
        cints_udmse_cv,
    ]).T, columns=["SWIM-DIA", "HDMSE"]),
)
d["Y"] = 1
d["Acquisition"] = d["variable"]
tmp = sns.violinplot(x='value', y='Y', hue='Acquisition', split=True,
                     data=d, inner="quartile", gridsize=1000, orient="h")
tmp = plt.ylabel("Relative Frequency")
tmp = plt.xlabel("CV Of Fully Reproducible Aggregates")
tmp = plt.yticks([])
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Plot stat vs legendary
    data = pd.read_csv("Pokemon.csv")
    stat = data.drop(['#', 'Type 1', 'Type 2', 'Total', 'Generation', 'Name'],
                     axis=1)
    stat = pd.melt(stat, id_vars=['Legendary'], var_name="stat")
    plt.figure()
    sns.swarmplot(x="stat", y="value", data=stat,
                  hue="Legendary").get_figure().savefig(
                      "Results/Stat_vs_Le.png")

    # Plot type vs legendary
    stat1 = data[['Type 1', 'Legendary']]
    stat2 = data[['Type 2', 'Legendary']]
    typ = data['Type 2'].unique()
    dic_type = {typ[i]: i for i in range(19)}
    # one row per type: [type name, legendary count as Type 1,
    #                    legendary count as Type 2, total as Type 1]
    val = [[typ[i], 0, 0, 0] for i in range(19)]
    for i in range(800):
        val[dic_type[stat1.values[i][0]]][3] += 1
        if stat1.values[i][1]:
            val[dic_type[stat1.values[i][0]]][1] += 1
            val[dic_type[stat2.values[i][0]]][2] += 1
    df = pd.DataFrame(val, columns=['Type Name', 'Type 1', 'Type 2', 'Total'])
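# The manual counting loop above can be expressed more compactly with pandas
# itself; a hedged alternative sketch (not the author's code) using crosstab:
import pandas as pd

data = pd.read_csv("Pokemon.csv")
# counts of legendary vs non-legendary per primary and secondary type
print(pd.crosstab(data['Type 1'], data['Legendary']))
print(pd.crosstab(data['Type 2'], data['Legendary']))
# totals per primary type
print(data['Type 1'].value_counts())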
# add any new lines to wmata.rail_lines

# wmata.rail_stations #########################################################
# wmata.rail_lines_served #####################################################
rail_stations = requests.get('http://api.wmata.com/Rail.svc/json/jStations',
                             headers=headers)
rail_stations = return_data(rail_stations, 'Stations')
rail_stations = rail_stations.loc[:, [
    'Code', 'Name', 'Lat', 'Lon',
    'LineCode1', 'LineCode2', 'LineCode3', 'LineCode4'
]]
rail_lines_served = pandas.melt(
    rail_stations,
    id_vars=['Code', 'Name', 'Lat', 'Lon'],
    value_vars=['LineCode1', 'LineCode2', 'LineCode3', 'LineCode4'],
    var_name='Split',
    value_name='LineCode').loc[:, ['Code', 'LineCode']]
rail_lines_served = rail_lines_served[rail_lines_served['LineCode'].notnull()]
rail_stations = rail_stations.loc[:, ['Code', 'Name', 'Lat', 'Lon']]

# rail stations
# uppercase the station name
# map the MAR ID
# for unknown records, add to unknown locations table
# for records where no match on wmata_station_code, station, and mar_id, add
# map back the station ids
# update rail_stations_operational
# expire stations where no record in new table, record in operational
# extend expiration date where record in both
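# A minimal sketch of the melt + notnull pattern above, which turns repeated
# "LineCode1..4" columns into one row per (station, line) pair; the station
# codes and line codes below are illustrative only:
import pandas as pd

stations = pd.DataFrame({
    'Code': ['A01', 'C01'],
    'LineCode1': ['RD', 'BL'],
    'LineCode2': [None, 'OR'],
})
served = pd.melt(stations, id_vars=['Code'],
                 value_vars=['LineCode1', 'LineCode2'],
                 value_name='LineCode')
served = served[served['LineCode'].notnull()].loc[:, ['Code', 'LineCode']]
print(served)  # one row per station/line, null line slots dropped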
# (continuation of a per-gene loop; the loop header is not shown in this
# excerpt)
diff_gene_expression_df.append(sig_dict)
p_val_list.append(p_val)

_, p_val_corrected = fdrcorrection(p_val_list)
diff_gene_expression_df = pd.DataFrame(diff_gene_expression_df)
diff_gene_expression_df['p_val'] = p_val_corrected
diff_gene_expression_df['sig_level'] = diff_gene_expression_df['p_val'].apply(
    lambda x: man_utils.pval_to_sig(x))
diff_gene_expression_df = diff_gene_expression_df.loc[
    diff_gene_expression_df.sig_level != 'n.s.', :]
gene_sig_grouped = diff_gene_expression_df.groupby('gene')

diff_param_data_inh = pd.melt(hof_param_data_inh_lines,
                              id_vars=['Cell_id', 'Cre_line'],
                              value_vars=significant_parameters,
                              var_name='conductance',
                              value_name='value')
inh_expr_df = pd.melt(inh_expression_data,
                      id_vars=['sample_id', 'Cre_line'],
                      value_vars=gene_types,
                      var_name='gene',
                      value_name='cpm')
hue_levels = inh_lines
tick_fontsize = 16
axis_fontsize = 16
sns.set(style='whitegrid')
for channel_, genes in channel_correlate_dict.items():
    cond_ = 'gbar_%s.somatic' % channel_
    if cond_ not in significant_parameters:
        continue
def load_jhu_us_time_series(branch="master"):
    """
    Loads the JHU US timeseries data and transforms it so we are happy with it.
    """
    cases = pd.read_csv(CASES_URL.format(branch))
    deaths = pd.read_csv(DEATHS_URL.format(branch))
    lookup_table = pd.read_csv(LOOKUP_TABLE_URL.format(branch))
    keep_lookup_cols = ["UID", "Population"]
    lookup_table = lookup_table[keep_lookup_cols]

    # melt cases
    id_vars, dates = parse_columns(cases)
    cases_df = pd.melt(
        cases,
        id_vars=id_vars,
        value_vars=dates,
        value_name="cases",
        var_name="date",
    )

    # melt deaths
    id_vars, dates = parse_columns(deaths)
    deaths_df = pd.melt(
        deaths,
        id_vars=id_vars,
        value_vars=dates,
        value_name="deaths",
        var_name="date",
    )

    # join
    merge_cols = [
        "UID", "iso2", "iso3", "code3", "FIPS", "Admin2",
        "Province_State", "Country_Region", "Lat", "Long_", "date",
    ]
    m1 = pd.merge(cases_df, deaths_df, on=merge_cols, how="left")
    df = pd.merge(m1.drop(columns="Population"), lookup_table,
                  on="UID", how="left")

    keep_cols = [
        "Province_State", "Admin2", "FIPS", "Lat", "Long_",
        "date", "cases", "deaths", "Population",
    ]
    df = (
        df[keep_cols]
        .assign(
            date=pd.to_datetime(df.date)
            .dt.tz_localize("US/Pacific")
            .dt.normalize()
            .dt.tz_convert("UTC"),
        )
        .rename(
            columns={
                "FIPS": "fips",
                "Long_": "Lon",
                "Province_State": "state",
                "Admin2": "county",
            }
        )
    )

    # Fix fips
    df = df.pipe(coerce_fips_integer)
    df["fips"] = df.fips.astype(str)
    df["fips"] = df.apply(correct_county_fips, axis=1)
    for col in ["state", "county", "fips"]:
        df[col] = df[col].fillna("")

    return df.sort_values(sort_cols).reset_index(drop=True)
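# parse_columns is not shown above; a plausible sketch of its intent (an
# assumption for illustration, not the project's actual helper) that splits
# JHU's wide layout into id columns and the date columns to be melted:
import re


def parse_columns(df):
    """Split columns into (id_vars, date_cols) for the melts above."""
    # hypothetical: JHU date columns look like '1/22/20', '1/23/20', ...
    date_pattern = re.compile(r"\d{1,2}/\d{1,2}/\d{2}")
    dates = [c for c in df.columns if date_pattern.fullmatch(c)]
    id_vars = [c for c in df.columns if c not in dates]
    return id_vars, dates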
# Montreal geojson
with open(DATA_PATH.joinpath('montreal_shapefile.geojson'),
          encoding='utf-8') as shapefile:
    mtl_geojson = json.load(shapefile)

# Montreal cases per borough
# cases = pd.read_csv(DATA_PATH.joinpath('cases.csv'), encoding='utf-8',
#                     na_values='na').dropna(axis=1, how='all')
# borough_tbc = cases[-1:]  # Nb. of cases with borough TBC
# cases_df = cases[:-1]  # Nb. of cases with known borough
# cases_long = pd.melt(cases_df, id_vars='borough',
#                      var_name='date', value_name='cases')
cases_per1000_df = pd.read_csv(DATA_PATH.joinpath('cases_per1000.csv'),
                               encoding='utf-8',
                               na_values='na').dropna(axis=1, how='all')
cases_per1000_long = pd.melt(reduce_cols(cases_per1000_df, 10),
                             id_vars='borough',
                             var_name='date', value_name='cases_per_1000')

# Montreal data
data_mtl = pd.read_csv(DATA_PATH.joinpath('data_mtl.csv'), encoding='utf-8',
                       na_values='na')

# QC data
data_qc = pd.read_csv(DATA_PATH.joinpath('data_qc.csv'), encoding='utf-8',
                      na_values='na')

# Last update date
# Display 1 day after the latest data, as data from the previous day are
# posted
latest_mtl_date = datetime.date.fromisoformat(
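# reduce_cols is not defined in this excerpt; a plausible sketch of its
# intent (an assumption for illustration, not the app's actual helper) that
# keeps the 'borough' id column plus every nth date column to thin out the
# time axis before melting:
def reduce_cols(df, n):
    """Keep 'borough' and every nth remaining column."""
    date_cols = [c for c in df.columns if c != 'borough']
    return df[['borough'] + date_cols[::n]]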
big_frame = pd.concat(dfs, ignore_index=True)
big_frame_1 = pd.DataFrame(
    big_frame[fields[0]].loc[big_frame['fire'] == 1],
    columns=[fields[0]]).assign(year=key).rename(columns={feature: "fire"})
big_frame_0 = pd.DataFrame(
    big_frame[fields[0]].loc[big_frame['fire'] == 0],
    columns=[fields[0]]).assign(year=key).rename(
        columns={feature: "non_fire"})
del big_frame
new_list.append(big_frame_1)
new_list.append(big_frame_0)
del dfs

cdf = pd.concat(new_list, ignore_index=True)
mdf = pd.melt(cdf, id_vars=['year'], var_name='fire').dropna()
if i == 0:
    min_lim = mdf.value.min()
    max_lim = mdf.value.max()
    sp = (max_lim - min_lim) / 10
ax = sns.boxplot(ax=axes[i], x="year", y="value", hue="fire",
                 palette=['red', 'green'], data=mdf)
# ax.set_ylim([int(min_lim), int(max_lim)])
import pandas as pd

data1 = {'Student': ['Ice Bear', 'Panda', 'Grizzly'], 'Math': [80, 95, 79]}
grades1 = pd.DataFrame(data1, columns=['Student', 'Math'])
data2 = {'Student': ['Ice Bear', 'Panda', 'Grizzly'],
         'Electronics': [85, 81, 83]}
grades2 = pd.DataFrame(data2, columns=['Student', 'Electronics'])
data3 = {'Student': ['Ice Bear', 'Panda', 'Grizzly'], 'GEAS': [90, 79, 93]}
grades3 = pd.DataFrame(data3, columns=['Student', 'GEAS'])
data4 = {'Student': ['Ice Bear', 'Panda', 'Grizzly'], 'ESAT': [93, 89, 88]}
grades4 = pd.DataFrame(data4, columns=['Student', 'ESAT'])

merge = pd.merge(grades1, grades2, how='right', on='Student')
merge1 = pd.merge(merge, grades3, how='right', on='Student')
mergefinal = pd.merge(merge1, grades4, how='right', on='Student')
mergelong = pd.melt(mergefinal, id_vars='Student',
                    var_name='Subject', value_name='Grades')
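# For reference, printing the melted frame should yield one row per
# student/subject pair (12 rows for 3 students x 4 subjects), roughly:
print(mergelong)
#      Student   Subject  Grades
# 0   Ice Bear      Math      80
# 1      Panda      Math      95
# ...
# 11   Grizzly      ESAT      88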
import numpy as np
import pandas as pd
import iqplot
from bokeh.io import output_file, show
import microtubule_pkg as mt

output_file("interactive_fig1.html")
rg = np.random.default_rng(1284)

lbl_df = pd.read_csv('gardner_time_to_catastrophe_dic_tidy.csv')
labeled = lbl_df.loc[lbl_df["labeled"] == True,
                     "time to catastrophe (s)"].values
unlabeled = lbl_df.loc[lbl_df["labeled"] == False,
                       "time to catastrophe (s)"].values

# Make plots for tubulin concentration data
# taken from HW9.1
df = pd.read_csv('gardner_mt_catastrophe_only_tubulin.csv', comment='#')
df = pd.melt(df, value_vars=['12 uM', '7 uM', '9 uM', '10 uM', '14 uM'],
             var_name='tubulin concentrations',
             value_name='time to catastrophe (s)')
df = df.dropna()
concen = ['12 uM', '7 uM', '9 uM', '10 uM', '14 uM']


def tub_stripbox(conc):
    return iqplot.stripbox(
        title=('Microtubule Time to Catastrophe '
               'against Tubulin Concentration'),
        data=df.loc[df['tubulin concentrations'] == conc],
        q='time to catastrophe (s)',
        # color_column='year',
        q_axis='x',
        jitter=True,
        whisker_caps=True,
        display_points=False,
        marker_kwargs=dict(alpha=0.5, size=1),
        box_kwargs=dict(fill_color=None, line_color='grey'),
    )
    # (body of the heatmap() helper; its opening is not shown in this
    # excerpt -- see the sketch after this snippet)
    size_scale = 500
    ax.scatter(
        x=x.map(x_to_num),  # Use mapping for x
        y=y.map(y_to_num),  # Use mapping for y
        s=size * size_scale,  # Square sizes, proportional to size parameter
        marker='s'  # Use square as scatterplot marker
    )

    # Show column labels on the axes
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)
    fig.show()


data = pd.read_csv(
    'https://raw.githubusercontent.com/drazenz/heatmap/master/autos.clean.csv')
columns = [
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'city-mpg', 'price'
]
corr = data[columns].corr()
# Unpivot the dataframe, so we can get pairs of arrays for x and y
corr = pd.melt(corr.reset_index(), id_vars='index')
corr.columns = ['x', 'y', 'value']
heatmap(x=corr['x'], y=corr['y'], size=corr['value'].abs())
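# The top of the heatmap function is elided above; a hedged sketch of a
# complete minimal version under the same approach (the label-to-position
# mappings and figure setup are assumptions, not the original code):
import matplotlib.pyplot as plt


def heatmap(x, y, size):
    fig, ax = plt.subplots()
    # map each distinct label to an integer position on its axis
    x_labels = sorted(x.unique())
    y_labels = sorted(y.unique())
    x_to_num = {v: i for i, v in enumerate(x_labels)}
    y_to_num = {v: i for i, v in enumerate(y_labels)}
    size_scale = 500
    ax.scatter(x=x.map(x_to_num), y=y.map(y_to_num),
               s=size * size_scale, marker='s')
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)
    plt.show()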
steps = 10
adj = [(i + 1) * 0.01 / steps for i in range(steps)]
myDF = myModel.makeSensAnalDF(
    {col: adj for col in myDF.columns if col != "RSS"}, myDF)
myRSS = myModel.getRSSforParamiters(myDF, RS["indep_cond"], myPaths)
myDF['RSS'] = myRSS
myDF2 = myDF.copy()
tempDS = myDF.iloc[0]
# express every run as a percent change relative to the baseline (first) run
myDF2 = (myDF2 - tempDS) / tempDS
myDF2 = pd.melt(myDF2, id_vars=['RSS'])
myDF2 = myDF2[myDF2["value"] != 0.0]
myDF2 = myDF2[myDF2["value"].notna()]
myDF2 = myDF2.sort_values(by=['RSS'])
myDF2 = myDF2.rename(columns={
    "value": "parameter change%",
    "RSS": "RSS change%"
})
myDF["index"] = index
myDF2["index"] = index
myList1.append(myDF)
myList2.append(myDF2)

myDF = pd.concat(myList1)
myDF2 = pd.concat(myList2)
myDF.to_csv(os.path.join(data_dir, "sensitivity.csv"))
def newcsv(set1, data_dict, set2=None):
    df = pd.DataFrame(data=data_dict)
    melt = pd.melt(df, id_vars=list(data_dict.keys())[0],
                   var_name='Period', value_name='Average Temperature')
    print(melt)
    return melt
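# A hedged usage sketch with a made-up data_dict; the first key becomes the
# id column (set1/set2 are unused by the function body shown, so a
# placeholder is passed):
temps = {
    'City': ['Boston', 'Austin'],
    '2019': [11.2, 20.8],
    '2020': [11.6, 21.1],
}
long_temps = newcsv(None, temps)
# long_temps has columns: City, Period, Average Temperature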
    # (continuation of a widget call whose opening is not shown in this
    # excerpt)
    list(hidr.set_index('UH_nome').index),
)
data = hidr.set_index('UH_nome').loc[powerplant]
left_column, right_column = st.beta_columns([1, 3])
table_vis = data.filter(
    like='vol/volutilmax_itr',
    axis=1).T.reset_index(drop=True).reset_index().set_index('index')
left_column.write(table_vis)
data = data.filter(like='vol/volutilmax_itr',
                   axis=1).T.reset_index(drop=True).reset_index()
data = pd.melt(data, id_vars=["index"])
data.columns = ['iteration', 'UH_nome', 'vol/volutilmax']
st.write('')
chart = (
    alt.Chart(data)
    .mark_area(opacity=0.2)
    # .mark_line()
    .encode(
        x="iteration:N",
        y=alt.Y("vol/volutilmax:Q", stack=None),
        color="UH_nome:N",
    )
)
right_column.altair_chart(chart, use_container_width=True)

st.subheader('After seek goal')
st.write('\n')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 30 16:55:56 2019

@author: root
"""
import pandas as pedo

# input data
dmath = {'Student': ['Ice bear', 'Panda', 'Grizzly'], 'Math': [80, 95, 79]}
delectronics = {'Student': ['Ice bear', 'Panda', 'Grizzly'],
                'Electronics': [85, 81, 83]}
dgeas = {'Student': ['Ice bear', 'Panda', 'Grizzly'], 'GEAS': [90, 79, 93]}
desat = {'Student': ['Ice bear', 'Panda', 'Grizzly'], 'ESAT': [93, 89, 88]}

# dataframes
math = pedo.DataFrame(dmath)
elecs = pedo.DataFrame(delectronics)
geas = pedo.DataFrame(dgeas)
esat = pedo.DataFrame(desat)

# merge
grades = pedo.merge(pedo.merge(pedo.merge(math, elecs), geas), esat)

# wide to long (melt)
messy = pedo.melt(grades, id_vars=['Student'],
                  value_vars=['Math', 'Electronics', 'GEAS', 'ESAT']).rename(
                      columns={'variable': 'Subject', 'value': 'Grades'})
def analyze_color(rgb_img, mask, hist_plot_type=None, label="default"):
    """Analyze the color properties of an image object

    Inputs:
    rgb_img        = RGB image data
    mask           = Binary mask made from selected contours
    hist_plot_type = None, 'all', 'rgb', 'lab' or 'hsv'
    label          = optional label parameter, modifies the variable name
                     of observations recorded

    Returns:
    analysis_image = histogram output

    :param rgb_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param hist_plot_type: str
    :param label: str
    :return analysis_image: plotnine figure
    """
    if len(np.shape(rgb_img)) < 3:
        fatal_error("rgb_img must be an RGB image")

    # Mask the input image
    masked = cv2.bitwise_and(rgb_img, rgb_img, mask=mask)
    # Extract the blue, green, and red channels
    b, g, r = cv2.split(masked)
    # Convert the BGR image to LAB and extract the lightness, green-magenta,
    # and blue-yellow channels
    lab = cv2.cvtColor(masked, cv2.COLOR_BGR2LAB)
    l, m, y = cv2.split(lab)
    # Convert the BGR image to HSV and extract the hue, saturation, and value
    # channels
    hsv = cv2.cvtColor(masked, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    # Color channel dictionary
    channels = {"b": b, "g": g, "r": r,
                "l": l, "m": m, "y": y,
                "h": h, "s": s, "v": v}

    # Histogram plot types
    hist_types = {"ALL": ("b", "g", "r", "l", "m", "y", "h", "s", "v"),
                  "RGB": ("b", "g", "r"),
                  "LAB": ("l", "m", "y"),
                  "HSV": ("h", "s", "v")}

    if hist_plot_type is not None and hist_plot_type.upper() not in hist_types:
        fatal_error("The histogram plot type was " + str(hist_plot_type) +
                    ', but can only be one of the following: None, "all", '
                    '"rgb", "lab", or "hsv"!')

    # Store histograms, plotting colors, and plotting labels
    # (plot label, graph color) for each channel key
    channel_info = {"b": ("blue", "blue"),
                    "g": ("green", "forestgreen"),
                    "r": ("red", "red"),
                    "l": ("lightness", "dimgray"),
                    "m": ("green-magenta", "magenta"),
                    "y": ("blue-yellow", "yellow"),
                    "h": ("hue", "blueviolet"),
                    "s": ("saturation", "cyan"),
                    "v": ("value", "orange")}
    histograms = {
        c: {"label": name, "graph_color": color,
            "hist": [float(i[0]) for i in
                     cv2.calcHist([channels[c]], [0], mask, [256], [0, 255])]}
        for c, (name, color) in channel_info.items()
    }

    # Create list of bin labels for 8-bit data
    binval = np.arange(0, 256)

    analysis_image = None
    # Create a dataframe of bin labels and histogram data
    dataset = pd.DataFrame({'bins': binval,
                            'blue': histograms["b"]["hist"],
                            'green': histograms["g"]["hist"],
                            'red': histograms["r"]["hist"],
                            'lightness': histograms["l"]["hist"],
                            'green-magenta': histograms["m"]["hist"],
                            'blue-yellow': histograms["y"]["hist"],
                            'hue': histograms["h"]["hist"],
                            'saturation': histograms["s"]["hist"],
                            'value': histograms["v"]["hist"]})

    # Make the histogram figure using plotnine
    if hist_plot_type is not None:
        if hist_plot_type.upper() == 'RGB':
            df_rgb = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['blue', 'green', 'red'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_rgb, aes(x='bins', y='Pixels',
                                           color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blue', 'green', 'red']))
        elif hist_plot_type.upper() == 'LAB':
            df_lab = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['lightness', 'green-magenta',
                                         'blue-yellow'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_lab, aes(x='bins', y='Pixels',
                                           color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['yellow', 'magenta', 'dimgray']))
        elif hist_plot_type.upper() == 'HSV':
            df_hsv = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['hue', 'saturation', 'value'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_hsv, aes(x='bins', y='Pixels',
                                           color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blueviolet', 'cyan', 'orange']))
        elif hist_plot_type.upper() == 'ALL':
            s = pd.Series(['blue', 'green', 'red', 'lightness',
                           'green-magenta', 'blue-yellow', 'hue',
                           'saturation', 'value'], dtype="category")
            color_channels = ['blue', 'yellow', 'green', 'magenta',
                              'blueviolet', 'dimgray', 'red', 'cyan', 'orange']
            df_all = pd.melt(dataset, id_vars=['bins'], value_vars=s,
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_all, aes(x='bins', y='Pixels',
                                           color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(color_channels))
        analysis_image = hist_fig

    # Hue values of zero are red but are also the value for pixels where hue
    # is undefined. The hue value of a pixel will be undefined when the color
    # values are saturated. Therefore, hue values of 0 are excluded from the
    # calculations below.

    # Calculate the median hue value
    # (rescaled from the encoded 0-179 range to the 0-359 degree range)
    hue_median = np.median(h[np.where(h > 0)]) * 2

    # Calculate the circular mean and standard deviation of the encoded hue
    # values (rescaled from the encoded 0-179 range to the 0-359 degree range)
    hue_circular_mean = stats.circmean(h[np.where(h > 0)], high=179, low=0) * 2
    hue_circular_std = stats.circstd(h[np.where(h > 0)], high=179, low=0) * 2

    # Plot or print the histogram
    if hist_plot_type is not None:
        params.device += 1
        if params.debug == 'print':
            hist_fig.save(os.path.join(params.debug_outdir,
                                       str(params.device) +
                                       '_analyze_color_hist.png'),
                          verbose=False)
        elif params.debug == 'plot':
            print(hist_fig)

    # Store into global measurements
    # RGB signal values are in an unsigned 8-bit scale of 0-255
    rgb_values = [i for i in range(0, 256)]
    # Hue values are in a 0-359 degree scale,
    # every 2 degrees at the midpoint of the interval
    hue_values = [i * 2 + 1 for i in range(0, 180)]
    # Percentage values on a 0-100 scale (lightness, saturation, and value)
    percent_values = [round((i / 255) * 100, 2) for i in range(0, 256)]
    # Diverging values on a -128 to 127 scale (green-magenta and blue-yellow)
    diverging_values = [i for i in range(-128, 128)]

    if hist_plot_type is not None:
        hist_type = hist_plot_type.upper()
        # (trait name, channel key, bin labels) for each group of signals
        signal_specs = {
            "RGB": [("blue", "b", rgb_values),
                    ("green", "g", rgb_values),
                    ("red", "r", rgb_values)],
            "LAB": [("lightness", "l", percent_values),
                    ("green-magenta", "m", diverging_values),
                    ("blue-yellow", "y", diverging_values)],
            "HSV": [("hue", "h", hue_values),
                    ("saturation", "s", percent_values),
                    ("value", "v", percent_values)],
        }
        for group, specs in signal_specs.items():
            if hist_type in (group, 'ALL'):
                for name, c, bin_labels in specs:
                    freqs = histograms[c]["hist"]
                    if name == "hue":
                        # only the 180 encoded hue bins are meaningful
                        freqs = freqs[0:180]
                    outputs.add_observation(
                        sample=label,
                        variable=name + '_frequencies',
                        trait=name + ' frequencies',
                        method='plantcv.plantcv.analyze_color',
                        scale='frequency', datatype=list,
                        value=freqs, label=bin_labels)

    # Always save hue stats
    outputs.add_observation(sample=label, variable='hue_circular_mean',
                            trait='hue circular mean',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees', datatype=float,
                            value=hue_circular_mean, label='degrees')
    outputs.add_observation(sample=label, variable='hue_circular_std',
                            trait='hue circular standard deviation',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees', datatype=float,
                            value=hue_circular_std, label='degrees')
    outputs.add_observation(sample=label, variable='hue_median',
                            trait='hue median',
                            method='plantcv.plantcv.analyze_color',
                            scale='degrees', datatype=float,
                            value=hue_median, label='degrees')

    # Store images
    outputs.images.append(analysis_image)

    return analysis_image
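# A minimal sketch of why circular statistics are used for hue above: on the
# encoded 0-179 circular scale, hues of 5 and 175 are close to each other
# (both near red), and circmean reflects that where a plain mean would not.
# The sample values are illustrative only.
import numpy as np
from scipy import stats

encoded_hues = np.array([5, 175, 3, 177])
plain_mean = encoded_hues.mean()                           # 90, mid-scale
circ_mean = stats.circmean(encoded_hues, high=179, low=0)  # wraps near 0, red
print(plain_mean, circ_mean * 2)  # *2 rescales to 0-359 degrees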
def load_data(data_path):
    # Subtitles info
    subs_df = pd.read_csv(data_path / 'prep/all_subtitles.csv')

    # IMDB data, only the top 150 movies
    imdb_df = pd.read_csv(data_path / 'prep/imdb_top250_movies.csv')
    imdb_df = imdb_df.loc[imdb_df['top_250_rank'] <= 150]
    movie_info_df = imdb_df.loc[:, ['imdb_id', 'title', 'year', 'rating',
                                    'genres', 'top_250_rank', 'color_info']]

    # Movie duration dict
    with open(data_path / 'prep/movie_duration_dict.pk', 'rb') as r:
        movie_duration_dict = pickle.load(r)

    # Silences info
    silences_df = pd.read_csv(data_path / 'prep/silences_info.csv')
    silences_df = pd.merge(left=silences_df, right=movie_info_df,
                           on='imdb_id', how='inner')
    silences_df.loc[:, 'total_duration'] = silences_df['imdb_id'].apply(
        lambda x: movie_duration_dict[x])
    silences_df.loc[:, 'pos_rel'] = \
        100 * silences_df['start'] / silences_df['total_duration']
    silences_df.loc[:, 'dur_rel'] = \
        100 * silences_df['duration'] / silences_df['total_duration']

    # Movie summary info
    movies_df = pd.read_csv(data_path / 'prep/movies_infos.csv')
    aux_dict = {'silence_dur': 'Silence',
                'dialogue_dur': 'Dialogue',
                'other_dur': 'Other sounds'}
    cols = movies_df.columns.tolist()
    for k, v in aux_dict.items():
        cols.remove(k)
    movies_melt = pd.melt(movies_df, id_vars=cols,
                          value_vars=list(aux_dict.keys()))
    movies_melt.loc[:, 'var_name'] = movies_melt['variable'].map(aux_dict)

    # Positions info
    positions_df = data_sound_type_share_by__position(subs_df, silences_df,
                                                      movie_info_df, 1)

    # Umap data
    umap_df = pd.read_csv(data_path / 'prep/umap_df.csv')

    return {'subs_df': subs_df,
            'silences_df': silences_df,
            'movies_df': movies_df,
            'movies_melt_df': movies_melt,
            'positions_df': positions_df,
            'umap_df': umap_df}
# Import data
data = pd.read_excel(
    r"D:\OneDrive\Documentos OK\Python Scripts\WIOD_SEA_Nov16 (2).xlsx",
    sheet_name='DATA')
df = pd.DataFrame(data)

# Filter to Brazil
df_bra = df[df['country'] == 'BRA']

# Melt the year columns into a single column
df_bra_melt = pd.melt(df_bra,
                      id_vars=["country", "variable", "description", "code"],
                      var_name='year')

# Pivot the 'variable' values into columns (pivot_table)
df_bra_pivot = pd.pivot_table(df_bra_melt,
                              index=["country", "description",
                                     "code", "year"],
                              columns="variable",
                              values="value")
df_bra_pivot = df_bra_pivot.reset_index()

# Drop columns that are not needed
df_Bra = df_bra_pivot.drop(
    ['country', 'COMP', 'EMP', 'EMPE', 'GO_PI', 'II_PI', 'VA_PI', 'VA_QI'],
    axis=1)