def results(selected_states="none", words="none"):
    if selected_states == "none" or words == "none":
        return "Make sure you selected your state(s) AND submitted a topic of interest. Press the back button to retry."
    else:
        print("Calculating...")
        states = selected_states.replace(",", " ")
        # Do some magic here to correspond states with senators
        # models = np.where(senators in model)
        # models is an array of the filenames of models
        # word2ids is an array of the filenames of word2ids
        senators = []
        sen_state = []
        models = ["app/models/model_architecture_set_0.json",
                  "app/models/model_architecture_set_1.json"]
        word2ids = ["app/models/word2id_set_0.json",
                    "app/models/word2id_set_1.json"]
        weights = ['app/models/model_weights_set_0.h5',
                   'app/models/model_weights_set_1.h5']
        k = len(models)
        sentiments = []
        for i in range(k):
            (model, word2id) = model_setup(models[i], word2ids[i], weights[i])
            sentiments.append(predict_sentiment(words, model, word2id))
            print(i)
        table_results = Table().with_columns("Senators", senators,
                                             "State", sen_state,
                                             "Sentiment", sentiments)
        text_results = table_results.as_text()
        return render_template("results.html", selected_states=selected_states,
                               words=words, result=text_results)

def game_filter(csv_file):
    # Takes in a csv filepath of one of the EightThirtyFour data sets
    # and filters the data to games with a lead of 10 points or fewer in
    # the last 6 minutes of the game.
    pbp = Table().read_table(csv_file)
    unique_games = pbp.group('GAME_ID').column(0)
    print(unique_games)
    last_quarter = pbp.where('PERIOD', predicates.are.equal_to(4))
    transformed_minutes = last_quarter.apply(time_string_to_number, 'PCTIMESTRING')
    last_quarter_and_minutes = last_quarter.with_column(
        'TIME', transformed_minutes)
    between_six_and_seven = last_quarter_and_minutes.where(
        'TIME', predicates.are.below_or_equal_to(6.5))
    close_games = make_array()
    for game in unique_games:
        game_scores_only = between_six_and_seven.where(
            'GAME_ID', predicates.are.equal_to(game)).select('TIME', 'SCORE').where(
                'SCORE', predicates.are.not_equal_to('nan'))
        score = game_scores_only.row(0).item(1)
        t1, t2 = score.split('-')
        if abs(int(t1) - int(t2)) <= 10:
            close_games = np.append(close_games, game)
    return close_games

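# Hedged usage sketch for game_filter above: the CSV path is hypothetical; any
# EightThirtyFour-style play-by-play export with GAME_ID, PERIOD, PCTIMESTRING
# and SCORE columns should work.
# close_game_ids = game_filter('eightthirtyfour_pbp_2017.csv')
# print(len(close_game_ids), 'games were within 10 points late in the 4th quarter')
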
def sanitize_dataframe(df: Table):
    """Sanitize a DataFrame to prepare it for serialization.

    Copied from the ipyvega project.

    * Make a copy
    * Convert categoricals to strings.
    * Convert np.bool_ dtypes to Python bool objects
    * Convert np.int dtypes to Python int objects
    * Convert floats to objects and replace NaNs/infs with None.
    * Convert DateTime dtypes into appropriate string representations
    """
    import numpy as np
    from pandas import notnull  # notnull is used below for the object-dtype case

    if df is None:
        return None  # raise InternalLogicalError("Cannot sanitize empty df")

    df = df.copy()

    def to_list_if_array(val):
        if isinstance(val, np.ndarray):
            return val.tolist()
        else:
            return val

    for col_name in df.labels:
        dtype = df.column(col_name).dtype
        if str(dtype) == 'category':
            # XXXX: work around bug in to_json for categorical types
            # https://github.com/pydata/pandas/issues/10778
            df[col_name] = df[col_name].astype(str)
        elif str(dtype) == 'bool':
            # convert numpy bools to objects; np.bool is not JSON serializable
            df[col_name] = df[col_name].astype(object)
        elif np.issubdtype(dtype, np.integer):
            # convert integers to objects; np.int is not JSON serializable
            df[col_name] = df[col_name].astype(object)
        elif np.issubdtype(dtype, np.floating):
            # For floats, convert to Python float: np.float is not JSON serializable
            # Also convert NaN/inf values to null, as they are not JSON serializable
            col = df[col_name]
            bad_values = np.isnan(col) | np.isinf(col)
            df[col_name] = np.where(bad_values, None, col).astype(object)
            # col.astype(object)[~bad_values] = None
        elif str(dtype).startswith('datetime'):
            # Convert datetimes to strings
            # astype(str) will choose the appropriate resolution
            new_column = df[col_name].astype(str)
            new_column[new_column == 'NaT'] = ''
            df[col_name] = new_column
        elif dtype == object:
            # Convert numpy arrays saved as objects to lists
            # Arrays are not JSON serializable
            col = np.vectorize(to_list_if_array)(df[col_name])
            df[col_name] = np.where(notnull(col), col, None).astype(object)
    return df

def loadChipTables(*chipData):
    '''
    Loads the color chip information into two datascience Tables.

    Args:
        chipData (tuple or dict): a variable length argument consisting of
            either a tuple with a dictionary mapping each Munsell coordinate
            to its WCS chip number, and a dictionary mapping each WCS chip
            number to its Munsell coordinates, or those dictionaries in that
            order

    Returns:
        the same information in two datascience Table objects

    Examples:
        >>> munsellInfo = readChipData('./WCS_data_core/chip.txt')
        >>> coordToIndexTable, indexToCoordTable = loadChipTables(munsellInfo)
        ...
        >>> coord_to_index, index_to_coord = readChipData('./WCS_data_core/chip.txt')
        >>> coordToIndexTable, indexToCoordTable = loadChipTables(coord_to_index, index_to_coord)
    '''
    if len(chipData) == 0:
        print('Warning: No data provided')
        return Table(), Table()
    if len(chipData) == 1:
        chipData = chipData[0]
    elif len(chipData) > 2:
        raise ValueError('chipData can accept at most two arguments')

    # coordinate to index
    # get the dictionaries
    coord_to_index = chipData[0]
    # sort the keys and get the sorted values
    sorted_keys, sorted_values = _get_sorted_keys_and_values(coord_to_index)
    coordToIndexTable = Table().with_columns('Coordinate', sorted_keys,
                                             'Index', sorted_values)

    # index to lightness, hue coordinate
    # get the dictionaries
    index_to_coord = chipData[1]
    # sort the keys and get the sorted values
    sorted_keys, sorted_values = _get_sorted_keys_and_values(index_to_coord)
    # unzip the lightness and hue values
    unzipped_values = [v for v in zip(*sorted_values)]
    # get the lightness and hue values
    lightness, hue = unzipped_values
    indexToCoordTable = Table().with_columns('Coordinate', sorted_keys,
                                             'Lightness', lightness,
                                             'Hue', hue)

    return coordToIndexTable, indexToCoordTable

def transition_probability(self, values):
    """
    For a multivariate probability distribution, assigns transition
    probabilities, i.e. P(Y | X).

    Parameters
    ----------
    values : List or Array
        Values that must correspond to the domain in the same order

    Returns
    -------
    Table
        A probability distribution with those probabilities
    """
    if any(np.array(values) < 0):
        warnings.warn('Probability cannot be negative')
    states = self.column(0)
    self = multi_domain(Table(), 'Source', states, 'Target', states)
    return_table = self.with_column('Probability', values)
    _transition_warn(return_table)
    return return_table

def summary(self):
    b0, bs = self.params
    sum_tbl = Table().with_columns([
        ("Param", ['Intercept'] + self.input_labels),
        ("Coefficient", [b0] + list(bs)),
    ])
    return sum_tbl

def fast_run_expectancy(retro, re):
    TABLE_FLAG = False
    if isinstance(retro, _Table):
        TABLE_FLAG = True
        retro = retro.to_df()
        re = re.to_df()

    re = re.set_index(['Outs', 'Start_Bases'])

    # Build current out-runner states
    idx = list(zip(retro['Outs'], retro['Start_Bases']))
    # Extract run potentials
    retro['Run_Expectancy'] = re.loc[idx].values

    next_outs = retro['Outs'] + retro['Event_Outs']
    # Build next out-runner states
    idx = list(zip(next_outs, retro['End_Bases']))
    # Extract run potentials
    retro['Run_Expectancy_Next'] = re.loc[idx].values

    # When the inning ends, there are 3 outs. That is not in the run
    # expectancy matrix so inning ending plate appearances will have an NA
    # value here. We fill those with 0.
    retro['Run_Expectancy_Next'].fillna(0, inplace=True)

    return _Table.from_df(retro) if TABLE_FLAG else retro

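# Hedged usage sketch for fast_run_expectancy above: the file names are
# hypothetical; `retro` needs Outs, Start_Bases, End_Bases and Event_Outs
# columns, and `re` is a run-expectancy matrix keyed by Outs and Start_Bases.
# retro = _Table.read_table('retrosheet_events_2019.csv')
# re_matrix = _Table.read_table('run_expectancy_2019.csv')
# retro_with_re = fast_run_expectancy(retro, re_matrix)
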
def loadSpeakerTable(speakerData):
    '''
    Loads the speaker info into a datascience Table.

    Args:
        speakerData (dict): a dictionary mapping each language to a list of
            its speakers' age and gender information

    Returns:
        the same information in a datascience Table
    '''
    # create lists for the information
    language = []
    speaker = []
    age = []
    gender = []
    # loop through the languages
    for lang in speakerData:
        # loop through the speakers
        for spkr in speakerData[lang]:
            # add to the lists
            language.append(lang)
            speaker.append(spkr)
            # get the age and gender
            a, g = speakerData[lang][spkr][0]
            age.append(a)
            gender.append(g)
    # turn into a table
    speakerTable = Table().with_columns('Language', language,
                                        'Speaker', speaker,
                                        'Age', age,
                                        'Gender', gender)
    return speakerTable

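# Hedged usage sketch for loadSpeakerTable above: the languages, speaker
# numbers, ages and genders are made-up placeholders; the nesting
# (language -> speaker -> [(age, gender), ...]) is inferred from the loops above.
# demo_speakers = {'LangA': {1: [('32', 'M')], 2: [('27', 'F')]},
#                  'LangB': {1: [('45', 'F')]}}
# loadSpeakerTable(demo_speakers)   # Table with Language, Speaker, Age, Gender
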
def most_common_lineup_position(retro):
    TABLE_FLAG = False
    if isinstance(retro, _Table):
        TABLE_FLAG = True
        retro = retro.to_df()

    # Order of operations:
    # 1. Get PA counts
    # 2. Turn Lineup_Order into a column
    # 3. Rename column to PA
    # 4. Sort on PA in descending order
    lineup_pos = retro.groupby(['Batter_ID', 'Lineup_Order'])['Inning'].\
        count().\
        reset_index(level='Lineup_Order').\
        rename(columns={'Inning': 'PA'}).\
        sort_values('PA', ascending=False)

    # Duplicates indicate other positions. By keeping first, we keep the most
    # common due to the sorting
    most_common = ~lineup_pos.index.duplicated(keep='first')

    lineup_pos = lineup_pos.loc[most_common, ['Lineup_Order']].sort_index()

    if TABLE_FLAG:
        return _Table.from_df(lineup_pos.reset_index())
    else:
        return lineup_pos

def steady_state(self):
    """
    Finds the stationary distribution of the Markov Chain.

    Returns
    -------
    Table
        Distribution.

    Examples
    --------
    >>> states = ['A', 'B']
    >>> transition_matrix = np.array([[0.1, 0.9],
    ...                               [0.8, 0.2]])
    >>> mc = MarkovChain.from_matrix(states, transition_matrix)
    >>> mc.steady_state()
    Value | Probability
    A     | 0.666667
    B     | 0.333333
    """
    # Steady state is the left eigenvector that corresponds to eigenvalue=1.
    w, vl = scipy.linalg.eig(self.matrix, left=True, right=False)

    # Find index of eigenvalue = 1.
    index = np.isclose(w, 1)
    eigenvector = np.real(vl[:, index])[:, 0]
    probabilities = eigenvector / sum(eigenvector)

    # Zero out floating point errors that are negative.
    indices = np.logical_and(np.isclose(probabilities, 0), probabilities < 0)
    probabilities[indices] = 0
    return Table().values(self.states).probabilities(probabilities)

def transition_function(self, pfunc):
    """
    Assigns transition probabilities to a Distribution via a probability
    function. The probability function is applied to each value of the
    domain. The domain values must be in the first column.

    Parameters
    ----------
    pfunc : variate function
        Conditional probability function of the distribution (P(Y | X))

    Returns
    -------
    Table
        Table with those probabilities in its final column
    """
    states = self.column(0)
    self = multi_domain(Table(), 'Source', states, 'Target', states)
    domain_names = self.labels
    values = np.array(self.apply(pfunc, *domain_names)).astype(float)
    if any(values < 0):
        warnings.warn('Probability cannot be negative')
    conditioned_var = self.labels[0]
    all_other_vars = ','.join(self.labels[1:])
    return_table = self.with_column(
        'P({} | {})'.format(all_other_vars, conditioned_var), values)
    _transition_warn(return_table)
    return return_table

def test(data_structures: List[Union[List[str], Deque[str]]],
         sample_sizes: List[int]) -> Table:
    """ Return the test results in a Table. """
    append_times, pop_times = [], []
    data_structure_labels = []
    elements = []
    for N in sample_sizes:
        print('%d elements' % (N))
        print('-' * 20)
        for ds in data_structures:
            append_time, pop_time = performance(ds, N)
            output(append_time, pop_time, ds)
            append_times.append(append_time)
            pop_times.append(pop_time)
            data_structure_labels.append(type_name(ds))
            elements.append(N)
        print()
    print()
    # print()
    table = Table().with_columns('Data structure', data_structure_labels,
                                 'Elements', elements,
                                 'Append time(nano sec)', append_times,
                                 'Pop time(nano sec)', pop_times)
    return table

def fill_null(table, fill_column=None, fill_value=None, fill_method=None):
    TABLE_FLAG = False
    if isinstance(table, _Table):
        TABLE_FLAG = True
        table = table.to_df()

    data = table[fill_column] if fill_column is not None else table
    data = data.fillna(value=fill_value, method=fill_method)

    return _Table.from_df(data) if TABLE_FLAG else data

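# Hedged usage sketch for fill_null above (assumes _Table is this module's alias
# for datascience.Table; the tiny table is made up for illustration):
# demo = _Table().with_columns('player', ['a', 'b', 'c'],
#                              'score', [10.0, np.nan, 7.0])
# fill_null(demo, fill_value=0)   # returns a Table with the NaN replaced by 0
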
def __init__(self, num_notes=20, starting_note="0"):
    """Load the note probability table and store the generation parameters."""
    self.res_path = str(Path("components/res/"))
    self.data_table = Table.read_table(self.res_path + "/probability_table.csv")
    self.notes = self.data_table.column("octave")
    self.num_notes = num_notes
    self.starting_note = starting_note

def emp_dist(values):
    """
    Takes an array of values and returns an empirical distribution.

    Parameters
    ----------
    values : array
        Array of values that will be grouped by the distribution

    Returns
    -------
    Table
        A distribution

    Examples
    --------
    >>> x = make_array(1, 1, 1, 1, 1, 2, 3, 3, 3, 4)
    >>> emp_dist(x)
    Value | Proportion
    1     | 0.5
    2     | 0.1
    3     | 0.3
    4     | 0.1
    """
    total = len(values)
    position_counts = Table().with_column('position', values).group(0)
    new_dist = Table().values(position_counts.column(0))
    return new_dist.with_column(
        'Proportion', position_counts.column(1) / total
    )

def transpose(cls, tbl, category_col, time_col='Day', time_less=None):
    """Transpose a table whose rows are categories (in category_col) and whose
    remaining column labels are time stamps."""
    time_col_vals = [lbl for lbl in tbl.labels if not lbl == category_col]
    xtbl = Table().with_column(time_col, time_col_vals)
    vals = tbl.drop(category_col)
    for lbl, row_vals in zip(tbl[category_col], vals.rows):
        xtbl[lbl] = row_vals
    return TimeTable.from_table(xtbl, time_col, time_less)

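# Hedged usage sketch for the TimeTable.transpose classmethod above: the column
# names and values are made up; each row of `wide` holds one category with its
# per-day values, and the result has one column per category.
# wide = Table().with_columns('Country', ['A', 'B'],
#                             '2020-01-01', [1, 4],
#                             '2020-01-02', [2, 6])
# TimeTable.transpose(wide, 'Country', time_col='Day')
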
def lm_fit(self, output_label, model_fun, x_column_or_label=None):
    if x_column_or_label is None:
        input_labels = [
            lbl for lbl in self.labels if not lbl == output_label
        ]
        f_values = [
            model_fun(*row) for row in self.select(input_labels).rows
        ]
        p = len(input_labels)
    else:
        f_values = model_fun(self._get_column(x_column_or_label))
        p = 1
    fit_tbl = Table(["Quantity", "Value"])
    return fit_tbl.with_rows([
        ("Residual standard error", self.RSE(output_label, f_values)),
        ("R^2", self.R2(output_label, f_values)),
        ("F-statistic", self.F_stat(output_label, f_values, p))
    ])

def from_file(self, filepath_or_buffer, *args, **vargs):
    try:
        table = Table.read_table(filepath_or_buffer, *args, **vargs)
        df_name = find_name()
        return self.create_with_table_wrap(table, df_name)
    except FileNotFoundError:
        red_print(f"File {filepath_or_buffer} does not exist!")
    except UserError as err:
        red_print(err)

def forecast_table(self, past, ahead, inc=1):
    """Project a TimeTable forward. inc must match the interval."""
    last_time = self[self.time_column][-1]
    past_times = self[self.time_column][-past-1:-1]
    fore_time = np.arange(last_time + inc, last_time + inc + ahead, inc)

    def project(lbl):
        m, b = np.polyfit(past_times, self[lbl][-past-1:-1], 1)
        return [m*time + b for time in fore_time]

    xtbl = Table().with_columns(
        [(self.time_column, fore_time)] +
        [(label, project(label)) for label in self.categories])
    return self.copy().append(xtbl)

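# Hedged usage sketch for forecast_table above (`cases_by_day` stands in for any
# TimeTable instance): fit a line to the last 7 observations of every category
# column and project 3 more time steps.
# forecast = cases_by_day.forecast_table(past=7, ahead=3, inc=1)
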
def datasci():
    cong_dict = [houseDictPerYear(y) for y in range(1990, 2018)]
    sen_dict = [senateDictPerYear(y) for y in range(1990, 2018)]
    house = Table().with_columns(
        "Year", np.arange(1990, 2018),
        "House Bi-Partisan",
        np.array([(x['100'] + x['95']) * 100 / x['total'] for x in cong_dict]),
        "House Non Partisan",
        np.array([x['nonpart'] * 100 / x['total'] for x in cong_dict]),
        "House Collaborative",
        np.array([x['together'] * 100 / x['total'] for x in cong_dict]))
    senate = Table().with_columns(
        "Year", np.arange(1990, 2018),
        "Senate Bi-Partisan",
        np.array([(x['100'] + x['95']) * 100 / x['total'] for x in sen_dict]),
        "Senate Non Partisan",
        np.array([x['nonpart'] * 100 / x['total'] for x in sen_dict]),
        "Senate Collaborative",
        np.array([x['together'] * 100 / x['total'] for x in sen_dict]))
    total = senate.join("Year", house)
    # print(total)
    # total.scatter("Year")
    plt.axis([1989, 2020, 0, 100])
    plt.plot(total.column("Year"), total.column("House Bi-Partisan"),
             'k', c='g', label="House Bi-Partisan")
    plt.plot(total.column("Year"), total.column("Senate Bi-Partisan"),
             'k', c='y', label="Senate Bi-Partisan")
    # plt.plot(total.column("Year"), total.column("House Non Partisan"), '*', c='g', label="House Non Partisan")
    # plt.plot(total.column("Year"), total.column("Senate Non Partisan"), '*', c='y', label="Senate Non Partisan")
    # plt.plot(total.column("Year"), total.column("House Collaborative"), '.', c='g', label="House Collaboration")
    # plt.plot(total.column("Year"), total.column("Senate Collaborative"), '.', c='y', label="Senate Collaboration")
    drawParties(plt, "both")
    # plt.legend(bbox_to_anchor=(0.5, -0.15))
    plt.legend(loc=4)
    plt.ylabel("Percentage of Total Votes")
    plt.xlabel("Years (1990-2017)")
    plt.savefig("data.png", dpi=400)
    plt.show()

def Cor(self):
    """Create a correlation matrix of numeric columns as a table."""
    assert (self.num_rows > 0)
    num_labels = [
        lbl for lbl in self.labels if isinstance(self[lbl][0], Number)
    ]
    tbl = self.select(num_labels)
    Cor_tbl = Table().with_column("Param", num_labels)
    for lbl in num_labels:
        Cor_tbl[lbl] = [self.Cor_coef(lbl, xlbl) for xlbl in num_labels]
    return Cor_tbl

def extend_table(self, ahead, inc=1):
    """Project a TimeTable forward from the last interval. inc must match the interval."""
    last_time = self[self.time_column][-1]
    fore_time = np.arange(last_time + inc, last_time + inc + ahead, inc)

    def project(lbl):
        b = self[lbl][-1]
        m = self[lbl][-1] - self[lbl][-2]
        return [m*(time+1)*inc + b for time in range(ahead)]

    xtbl = Table().with_columns(
        [(self.time_column, fore_time)] +
        [(label, project(label)) for label in self.categories])
    return self.copy().append(xtbl)

def get_first_from_group(table, groupby):
    TABLE_FLAG = False
    if isinstance(table, _Table):
        TABLE_FLAG = True
        table = table.to_df()

    out = table.sort_values(groupby).\
        drop_duplicates(subset=groupby, keep='first')

    if TABLE_FLAG:
        return _Table.from_df(out)
    else:
        return out

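# Hedged usage sketch for get_first_from_group above (assumes _Table is the
# datascience.Table alias used in this module; the data is made up):
# demo = _Table().with_columns('team', ['A', 'A', 'B'], 'score', [3, 1, 2])
# get_first_from_group(demo, 'team')   # one row kept per unique 'team' value
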
def to_table(self, data=None):
    """
    Loads the table into memory, and converts it into a UCB DataScience Table.

    Check out http://data8.org for more information.
    """
    if not _berktb:
        print("You don't have the Berkeley DataScience library installed")
        return
    if not data:
        data = self.db.c.execute(self._formulate()).fetchall()
    cols = list(self.columns.keys())
    return Table.from_rows(data, cols)

def lm_summary_1d(self, y_column_or_label, x_label):
    b0, b1 = self.regression_1d_params(y_column_or_label, x_label)
    r_model = lambda x: b0 + x * b1
    SE_b0, SE_b1 = self.SE_1d_params(y_column_or_label, x_label, r_model)
    sum_tbl = Table().with_column('Param', ['intercept', x_label])
    sum_tbl['Coefficient'] = [b0, b1]
    sum_tbl['Std Error'] = (SE_b0, SE_b1)
    sum_tbl['t-statistic'] = np.array([b0, b1]) / sum_tbl['Std Error']
    sum_tbl['95% CI'] = [(b0 - 2 * SE_b0, b0 + 2 * SE_b0),
                         (b1 - 2 * SE_b1, b1 + 2 * SE_b1)]
    sum_tbl['99% CI'] = [(b0 - 3 * SE_b0, b0 + 3 * SE_b0),
                         (b1 - 3 * SE_b1, b1 + 3 * SE_b1)]
    return sum_tbl

def merge(t1, t2, on, how='outer', fillna=True):
    DS_FLAG = False
    if isinstance(t1, _Table):
        t1 = t1.to_df()
        DS_FLAG = True
    if isinstance(t2, _Table):
        t2 = t2.to_df()

    full_t = _pd.merge(t1, t2, how=how, left_on=on, right_on=on)
    if fillna:
        full_t.fillna(0, inplace=True)

    if DS_FLAG:
        return _Table.from_df(full_t)
    else:
        return full_t

def merge(t1, t2, on, how='outer', fillna=True):
    import pandas as pd
    from datascience import Table

    DS_FLAG = False
    if isinstance(t1, Table):
        t1 = t1.to_df()
        DS_FLAG = True
    if isinstance(t2, Table):
        t2 = t2.to_df()

    full_t = pd.merge(t1, t2, how=how, left_on=on, right_on=on)
    if fillna:
        full_t.fillna(0, inplace=True)

    if DS_FLAG:
        return Table.from_df(full_t)
    else:
        return full_t

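# Hedged usage sketch for merge above (assumes datascience.Table is imported at
# the call site; the data is made up): an outer join on 'id' with unmatched
# cells filled with 0.
# left = Table().with_columns('id', [1, 2], 'x', [10, 20])
# right = Table().with_columns('id', [2, 3], 'y', [5, 6])
# merge(left, right, on='id')
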
def summary(self):
    b0, bs = self.params
    sum_tbl = Table().with_columns([
        ("Param", ['Intercept'] + self.input_labels),
        ("Coefficient", [b0] + list(bs)),
    ])
    sum_tbl['Std Error'] = self.source_table.SE_params(
        self.output_label, (b0, bs))
    sum_tbl['t-statistic'] = sum_tbl['Coefficient'] / sum_tbl['Std Error']
    sum_tbl['95% CI'] = [
        (b - 2 * se, b + 2 * se)
        for b, se in zip(sum_tbl['Coefficient'], sum_tbl['Std Error'])
    ]
    sum_tbl['99% CI'] = [
        (b - 3 * se, b + 3 * se)
        for b, se in zip(sum_tbl['Coefficient'], sum_tbl['Std Error'])
    ]
    return sum_tbl

def lm_summary(self, output_label):
    intercept, slopes = self.regression_params(output_label)
    mdl = ML_Table._make_model(intercept, slopes)
    input_labels = [lbl for lbl in self.labels if not lbl == output_label]
    sum_tbl = Table().with_column('Param', ['Intercept'] + input_labels)
    sum_tbl['Coefficient'] = [intercept] + list(slopes)
    sum_tbl['Std Error'] = self.SE_params(output_label, (intercept, slopes))
    sum_tbl['t-statistic'] = sum_tbl['Coefficient'] / sum_tbl['Std Error']
    sum_tbl['95% CI'] = [
        (b - 2 * se, b + 2 * se)
        for b, se in zip(sum_tbl['Coefficient'], sum_tbl['Std Error'])
    ]
    sum_tbl['99% CI'] = [
        (b - 3 * se, b + 3 * se)
        for b, se in zip(sum_tbl['Coefficient'], sum_tbl['Std Error'])
    ]
    return sum_tbl

def event(self, x):
    """
    Shows the probability that the distribution takes on a value x or a list
    of values x.

    Parameters
    ----------
    x : float or Iterable or function
        An event represented either as an indicator function, a specific
        value in the domain, or a subset of the domain

    Returns
    -------
    Table
        Shows the probabilities of each value in the event

    Examples
    --------
    >>> dist = Table().values([1, 2, 3, 4]).probabilities([1/4, 1/4, 1/4, 1/4])
    >>> dist.event(2)
    Domain | Probability
    2      | 0.25

    >>> dist.event([2, 3])
    Domain | Probability
    2      | 0.25
    3      | 0.25
    """
    check_valid_probability_table(self)
    if callable(x):
        t = self.where(self.apply(x, 0))
        print('P(Event) = {0}'.format(sum(t.column(1))))
        return t
    # collections.Iterable was removed in Python 3.10; use collections.abc
    if not isinstance(x, collections.abc.Iterable):
        x = [x]
    probabilities = [self.prob_event(k) for k in x]
    print('P(Event) = {0}'.format(sum(probabilities)))
    return Table().with_columns('Outcome', x, 'Probability', probabilities)

def loadNamingTable(namingData):
    '''
    Loads the naming data into a datascience Table.

    Args:
        namingData (dict): a hierarchical dictionary mapping each language to
            each speaker's naming data, which maps each color index to their
            given color term

    Returns:
        the same information in a datascience Table
    '''
    # create lists for the information
    language = []
    speaker = []
    index = []
    color_term = []
    # loop through the languages
    for lang in namingData:
        # loop through the speakers
        for spkr in namingData[lang]:
            # loop through the color index
            for i in namingData[lang][spkr]:
                # get the color term
                term = namingData[lang][spkr][i]
                # add to the lists
                language.append(lang)
                speaker.append(spkr)
                index.append(i)
                color_term.append(term)
    # turn into a table
    namingTable = Table().with_columns('Language', language,
                                       'Speaker', speaker,
                                       'Index', index,
                                       'Term', color_term)
    return namingTable

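# Hedged usage sketch for loadNamingTable above: the language, speaker, chip
# indices and color terms are placeholders; the nesting (language -> speaker ->
# {chip index: term}) is inferred from the loops above.
# demo_naming = {'LangA': {1: {17: 'red', 18: 'blue'}},
#                'LangB': {1: {17: 'rouge'}}}
# loadNamingTable(demo_naming)   # Table with Language, Speaker, Index, Term
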
from flask import Flask, jsonify, request
from datascience import Table
from intervaltree import Interval, IntervalTree

prefixcode = ''

t = Table.read_table('CourseWhere.csv')
trees = {day: IntervalTree() for day in ['M', 'T', 'W', 'R', 'F', 'S']}
for row in t.rows:
    for day in row[5]:
        trees[day][row[6]:row[7]] = row

room_table = t.group('Building', collect=set).select(['Building', 'Facility set'])
room_list = {building.lower(): rooms
             for building, rooms in zip(room_table['Building'],
                                        room_table['Facility set'])}

app = Flask(__name__)


def class_to_dict(clas):
    convert = lambda x: x if isinstance(x, str) else int(x)
    return {label: convert(v) for label, v in zip(t.column_labels, clas)}


@app.route(prefixcode + '/rooms/<building>/<room>')
def get_room(building, room):
    weekday = request.args.get('day', 'M')
    if weekday not in "MTWRFS":
        weekday = 'M'
    values = [class_to_dict(v)
              for v in t.where('Building', building).where('Room', room).rows]
    values = [v for v in values if weekday in v['Days']]
    values = sorted(values, key=lambda x: x['Start'])