def joined_df():
    """Loop through all districts in the state to build the summaries dataframe."""
    # The seven district inspection files share the same naming pattern.
    all_districts = [
        f"ftp://dbprftp.state.fl.us/pub/llweb/{n}fdinspi.csv" for n in range(1, 8)
    ]
    # Collect every district first, then concatenate once; re-creating the list
    # inside the loop (as before) kept only the last district.
    insp_list = []
    for district in all_districts:
        insp_list.append(read_summaries(district))
    df_insp = pd.concat(insp_list, axis=0)
    return df_insp
def save_num_cat(project_relative_root_path, local_project):
    url = project_relative_root_path + local_project + '_0_num.xlsx'
    df_num = pd.read_excel(url)
    url = project_relative_root_path + local_project + '_0_cat.xlsx'
    df_cat = pd.read_excel(url)
    # Survey-export metadata columns that are not needed downstream.
    delete_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId', 'Progress',
        'Duration (in seconds)', 'Finished'
    ]
    df_num = df_num.drop(columns=delete_columns)
    df_cat = df_cat.drop(columns=delete_columns)
    # Drop the first row of each frame.
    df_num = df_num.iloc[1:, ]
    df_cat = df_cat.iloc[1:, ]
    # Suffix the categorical columns so they do not collide with the numeric ones.
    df_cat = df_cat.add_suffix('_cat')
    df_all = pd.concat([df_num, df_cat], sort=False, axis=1)
    url = project_relative_root_path + local_project + '_0_all.csv'
    print(f'url (save 0_all.csv): {url}')
    df_all.to_csv(url, index=False, encoding='utf-8')
    print(df_all.shape)
    return df_all
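# A minimal usage sketch (not from the original source): the root path and
# project name below are illustrative placeholders; save_num_cat expects
# '<root><project>_0_num.xlsx' and '<root><project>_0_cat.xlsx' to exist and
# writes '<root><project>_0_all.csv' next to them.
if __name__ == '__main__':
    df_all = save_num_cat('projects/', 'survey_2021')  # hypothetical paths
    print(df_all.head())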
def __init__(self):
    # Read the data set into a pandas dataframe.
    df1 = pd.read_csv('static/data/DB_2009-2010.csv')
    df2 = pd.read_csv('static/data/DB_2010-2011.csv')
    # Concatenate both years so we work with a single data set.
    self.dataF = pd.concat([df1, df2])
    self.dataF = self.data()
    self.dataMonths = self.dataMonths()
    self.data_products = self.data_products()
def test_pandas_concatenate(self):
    d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
    d2 = DataFrame(data=[[1, 1.1], [3, 3.3], [5, 5.5], [7, 7.7], [9, 9.9]],
                   columns=["A", "B"], index=[1, 2, 3, 4, 5])
    # Concatenating with keys builds a MultiIndex of (key, original index).
    result = pandas.concat([d1, d2], keys=[1, 2])
    self.assertEqual(result["A"][1][2], 4)
    self.assertEqual(result["A"][2][2], 3)
    # d1 has no column "B", so its rows are NaN there after the concat.
    self.assertTrue(numpy.isnan(result["B"][1][1]))
    self.assertFloatEqual(result["B"][2][4], 7.7)
def historicalDataEnd(self, idx: int, start: str, end: str):
    super().historicalDataEnd(idx, start, end)
    sym = self.id2hist[idx]['symbol']
    r = self.id2hist[idx]['data'].rename(sym).fillna(0)
    # NOTE: `with threading.Lock():` creates a fresh lock on every call and so
    # does not actually synchronize anything; a lock shared across callbacks
    # (e.g. one created in __init__) is needed for the cache update to be safe.
    with threading.Lock():
        l = self.cache
        self.cache = pandas.concat([l, r], axis=1)
    self.logger.info('{}) {}'.format(len(self.id2hist), sym))
    self.id2hist.pop(idx, None)
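# A sketch of the shared-lock pattern the note above assumes (class and method
# names are illustrative, not from the original source): create one lock up
# front and reuse that same object in every callback that touches the cache.
import threading
import pandas

class CacheHolder:
    def __init__(self):
        self.cache = pandas.DataFrame()
        self.cache_lock = threading.Lock()   # shared across all callbacks

    def add_column(self, series: "pandas.Series") -> None:
        with self.cache_lock:                # same lock object every time
            self.cache = pandas.concat([self.cache, series], axis=1)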
def combine_rankings(dflow, dfunq, scoring_func=None):
    d1 = apply_rank_agg(dflow, scoring_func=scoring_func)
    d2 = apply_rank_agg(dfunq, scoring_func=scoring_func)
    d3 = pd.concat([d1, d2], axis=1)
    # Label the two halves of the concatenated frame with a two-level header.
    d3.columns = pd.MultiIndex.from_product([('Low_card', 'Unique'), list(d1)])
    sort_cols = [
        ('Unique', 'Median'),
        ('Low_card', 'Median'),
        ('Unique', 'Fails'),
    ]
    return d3.drop(('Low_card', 'Fails'), axis=1).sort_values(
        sort_cols, ascending=[True, True, True])
def process_directory(self, path="", skip_lines=0, colnames=None):
    # Avoid a mutable default argument for the column names.
    if colnames is None:
        colnames = []
    base_path = self.base_dir
    if path != "":
        base_path = os.path.join(base_path, path)
    files = ReadCsv.all_files(base_path)
    data = None
    all_paths = map(lambda file: os.path.join(base_path, file), files)
    for file_path in all_paths:
        temp = self.read_csv(file_path, skip_lines, colnames)
        if data is None:
            data = temp
        else:
            # Append each file's rows to the frame accumulated so far.
            data = p.concat([data, temp])
    return data
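# A standalone sketch of the same pattern (the directory layout is an assumed
# example, not part of the original class): read every CSV in a directory and
# concatenate the frames once at the end instead of growing a DataFrame per file.
import glob
import os
import pandas as pd

def concat_directory(base_path: str) -> pd.DataFrame:
    frames = [pd.read_csv(fp)
              for fp in sorted(glob.glob(os.path.join(base_path, "*.csv")))]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()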
def get_map2(self):
    df_production_map = self.data_access.get_df_produccion()
    df_fincas_map = self.data_access.get_df_finca()
    # Average stems per plant for each farm.
    df_production_map_2 = df_production_map.groupby('finca')['tallos_planta'].mean()
    df_production_map_2 = df_production_map_2.reset_index()
    df_production_map_2 = df_production_map_2.sort_values(by=['finca'])
    df_fincas_map2 = df_fincas_map.sort_values(by=['finca']).reset_index()
    # Align the production averages with the farm metadata side by side.
    result = pd.concat([df_production_map_2, df_fincas_map2], axis=1, join='inner')
    # Drop the duplicated 'finca' column produced by the concat.
    result2 = result.loc[:, ~result.columns.duplicated()]
    return result2
def eater(file, table_name):
    global tables
    with open(file, "r") as f:
        j = json.loads(f.read())
    if j["op"] == "c":
        # "c" appears to denote a create/insert event: build a one-row frame and append it.
        basic_dict = {"id": j["id"], "ts": j["ts"]}
        basic_dict.update(j["data"])
        row_df = pd.DataFrame(basic_dict, columns=basic_dict.keys(),
                              index=[basic_dict["id"]])
        if tables[table_name].empty:
            tables[table_name] = row_df
        else:
            tables[table_name] = pd.concat([tables[table_name], row_df])
    else:
        # Update event: write each changed field onto the row whose id matches.
        for k, v in j["set"].items():
            if k in tables[table_name].columns:
                tables[table_name][k] = tables[table_name][k].where(
                    tables[table_name]['id'] != j['id'], v)
            else:
                # Create the new column first (broadcast None); the original
                # `[None]` only works when the table has a single row.
                tables[table_name][k] = None
                tables[table_name][k] = tables[table_name][k].where(
                    tables[table_name]['id'] != j['id'], v)
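# A hedged usage sketch: eater reads one JSON change event per file, so the demo
# below writes a single "c" (create) event to a temporary file and feeds it in.
# The "orders" table name and event payload are illustrative assumptions.
import json
import tempfile
import pandas as pd

tables = {"orders": pd.DataFrame()}
event = {"op": "c", "id": 1, "ts": 1700000000, "data": {"item": "widget", "qty": 3}}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    json.dump(event, tmp)
eater(tmp.name, "orders")
print(tables["orders"])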
# incNums = []
crime_to_st = []
crime_dist = []
for i, crime in crime_data.iterrows():
    loc = eval(crime['Location'])  # get the location of the crime (lat, long)
    # find the closest street: min((CrimeStreet.distFromStreet(loc), street) for st in streets)
    st_of_crime = min([(streets[st].distFromStreet(loc), st) for st in streets])
    streets[st_of_crime[1]].addCrime(crime['Category'])  # CS.addCrime(crime type)
    crime_to_st.append(st_of_crime[1])
    crime_dist.append(st_of_crime[0])
    # incNums.append(crime['IncidntNum'])

# cats, days, dates, times and locs are assumed to have been built from
# crime_data earlier in the script.
# starts = pd.Series([edge['startCoords'] for edge in trimmed_edges])
# ends = pd.Series([edge['endCoords'] for edge in trimmed_edges])
# dists = pd.Series([edge['distance'] for edge in trimmed_edges])
keys = ['Category', 'DayOfWeek', 'Date', 'Time', 'Location', 'StreetMatch', 'Distance']
crime_df = pd.concat([pd.Series(cats), pd.Series(days), pd.Series(dates),
                      pd.Series(times), pd.Series(locs), pd.Series(crime_to_st),
                      pd.Series(crime_dist)], axis=1, keys=keys)
# crime_df.to_csv("crimes_with_streets.csv")
print('finished matching crimes to streets')
stEdges = pd.read_csv("cal.cedge.csv")
stNodes = pd.read_csv("cal.cnode.csv")
print("COLUMNS FOR EDGES: ", stEdges.columns)
print("COLUMNS FOR NODES: ", stNodes.columns)

startCoords = []
endCoords = []
# DataFrame.as_matrix() is deprecated; .values returns the same ndarray.
nodes = stNodes.values
for i, edge in stEdges.iterrows():
    start = int(edge['startID'])
    end = int(edge['endID'])
    # look up the coordinates of each endpoint by node id
    startCoords.append((float(nodes[start][2]), float(nodes[start][1])))
    endCoords.append((float(nodes[end][2]), float(nodes[end][1])))

startCoords = pd.Series(startCoords, name='startCoords')
endCoords = pd.Series(endCoords, name='endCoords')
df = pd.concat([stEdges['EdgeID'], startCoords, endCoords, stEdges['distance']], axis=1)
# print(df)
df.to_csv("edgeLocs.csv")
from narratives.coded.parse import parse_atlas_output
from narratives.coded.convert import convert_doc2docx
from narratives.coded.transform import transform
from narratives.coded.process import process
from glob import glob
import pandas
import numpy
from numpy.random import choice
from collections import Counter
import feather

frames = []
for doc in glob('./data/*.doc'):
    # Convert each .doc to .docx, then parse the Atlas.ti output of the .docx.
    convert_doc2docx(doc, './data/')
    frames.append(parse_atlas_output(doc + "x"))
data = pandas.concat(frames)

transformed_data = transform(data, 'code')
# Randomly assign each segment to TRAIN (70%) or TEST (30%).
transformed_data['dataset'] = choice(['TRAIN', 'TEST'],
                                     len(transformed_data['segment']),
                                     p=(0.7, 0.3))
print(Counter(transformed_data['dataset']))

processed_data = process(transformed_data, 'segment')
print(processed_data)
feather.write_dataframe(processed_data, './data/coded_data.feather')
def neural_network(self):
    datos = self.data_access.get_df_produccion()
    datos.drop(['Bloque', 'Nave', 'Lado', 'Cama', 'Id Cama', 'Piloto/homogenea', 'Area',
                'Suma de Indice tallos/M2', 'Suma de Indice tallos/planta', 'Notas'],
               axis=1, inplace=True)
    datos.rename({'Fecha Siembra': 'fecha_siembra', 'Año Semana': 'ano_semana', 'UP': 'finca',
                  'Tipo': 'tipo', 'Variedad': 'variedad', 'Fecha siembra': 'fecha_siembra',
                  'Concatenado': 'concatenado', 'Tallos producidos': 'tallos', 'Edad': 'edad',
                  'Cantidad': 'cantidad_plantas', 'Fiesta': 'fiesta'}, axis=1, inplace=True)
    datos['coeficiente'] = datos['tallos'] / datos['cantidad_plantas']
    datos['ano_semana'] = datos['ano_semana'].astype(str)

    colores = self.data_access.get_df_variedad_color()
    colores.rename(columns={'Variedad': 'variedad'}, inplace=True)

    fechas = self.data_access.get_df_fechas()
    fechas['dia'] = pd.to_datetime(fechas['dia'])
    semanas = fechas.groupby('ano_semana').max().reset_index()
    semanas['ano_semana'] = semanas['ano_semana'].astype(str)

    estaciones = self.get_df_estacion()
    fincas = self.get_df_finca()
    fincas.rename(columns={'FINCAS': 'nombre', 'SIGLA': 'sigla', 'LATITUD': 'latitud',
                           'LONGITUD': 'longitud'}, inplace=True)

    # Match every farm with its closest weather station (Euclidean distance on lat/long).
    finca_estac = pd.DataFrame()
    for i in list(fincas.sigla.unique()):
        temp = fincas[fincas['sigla'] == i]
        temp = pd.concat([temp, estaciones], ignore_index=True)
        temp['nombre'] = temp.iloc[0, 0]
        temp['sigla'] = temp.iloc[0, 1]
        temp['latitud'] = temp.iloc[0, 2]
        temp['longitud'] = temp.iloc[0, 3]
        temp.dropna(inplace=True)
        temp['distancia'] = np.sqrt((temp['LATITUD'] - temp['latitud'])**2 +
                                    (temp['LONGITUD'] - temp['longitud'])**2)
        temp.sort_values(['sigla', 'distancia'], ignore_index=True, inplace=True)
        temp = temp.head(1)
        finca_estac = pd.concat([finca_estac, temp])
    finca_estac = finca_estac.reset_index(drop=True)

    # Calculate average production curves for every farm-variety-age-week combination.
    promedio = pd.DataFrame(columns=('ansema', 'up', 'variedad', 'edad', 'indiceplan'))
    datos.ano_semana = datos.ano_semana.astype(int)
    for i in list(datos['ano_semana'].sort_values().unique()):
        desde = i - 199
        datos_filt = datos[(datos['ano_semana'] >= desde) & (datos['ano_semana'] < i - 4)].copy()
        curva_promed = datos_filt.groupby(['finca', 'variedad', 'edad'])['coeficiente'].mean().reset_index()
        curva_promed['ano_semana'] = i
        curva_promed['ano_semana'] = (curva_promed['ano_semana']).astype(str)
        promedio = pd.concat([promedio, curva_promed], ignore_index=True)
    promedio = promedio.set_index(['ano_semana', 'finca', 'variedad', 'edad']).to_dict('index')

    def curva_promedio(ansem, up, variedad, edad):
        try:
            return promedio[ansem, up, variedad, edad]['coeficiente']
        except KeyError:
            return 0

    datos.ano_semana = datos.ano_semana.astype(str)
    datos = datos.merge(semanas, on='ano_semana', how='left')
    datos['mes_dato'] = datos.dia.dt.month

    recons = datos.sort_values(['concatenado', 'edad']).reset_index(drop=True)
    # Build lagged production features (up to 10 weeks back); a lag is only valid
    # when the shifted row belongs to the same 'concatenado' series.
    lag_prod = 10
    for i in tqdm(range(1, lag_prod + 1)):
        strprod = str(i) + 'sem_atras'
        strconc = str(i) + 'concat_atras'
        recons[strprod] = recons['coeficiente'].shift(i)
        recons[strconc] = recons['concatenado'].shift(i)
        vald = str(i) + 'valido'
        recons[vald] = recons.apply(lambda ff: 1 if ff[strconc] == ff['concatenado'] else 0, axis=1)
        recons[strprod] = recons.apply(lambda x: x[strprod] if x[vald] == 1 else 0, axis=1)
        recons.drop(columns={strconc}, inplace=True)
        recons.drop(columns={vald}, inplace=True)
    recons.drop(columns={'Unnamed: 0'}, inplace=True)
    recons = recons.merge(colores[['variedad', 'Color']], how='left', on='variedad')

    # Add the standard-curve column for each variety-farm combination.
    recons.ano_semana = (recons.ano_semana).astype(str)
    recons['curva_metodo_finca'] = recons.apply(
        lambda x: curva_promedio(x['ano_semana'], x['finca'], x['variedad'], x['edad']), axis=1)
    recons = recons[recons['tipo'].isin(['Minicarnation', 'Carnation'])]
    recons.Color.fillna('NoColor', inplace=True)
    recons['edad^2'] = recons['edad']**2
    recons['edad^3'] = recons['edad']**3

    # Neural network
    consolidado_rn = pd.DataFrame()
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from tensorflow import keras
    from tensorflow.keras import layers
    recons = recons[recons['dia'] >= '01/01/2018']
    y_hat_rn = pd.Series(name='y_hat_falso')
    # NOTE: 'recons_test' is used below but never defined in this snippet;
    # it is presumably a hold-out split built elsewhere.
    for i in recons.tipo.unique():
        for j in recons_test[recons_test['tipo'] == i]['Color'].unique():
            temp_test = recons_test[(recons_test['tipo'] == i) & (recons_test['Color'] == j)]
            df_clean_test = pd.concat([temp_test[['edad', 'edad^2', 'edad^3', 'mes_dato',
                                                  '5sem_atras', '6sem_atras', '7sem_atras',
                                                  '8sem_atras', '9sem_atras', '10sem_atras',
                                                  # '11sem_atras', '12sem_atras', '13sem_atras',
                                                  # '14sem_atras', '15sem_atras',
                                                  'curva_metodo_finca', 'coeficiente']],
                                       pd.get_dummies(temp_test['variedad']),
                                       pd.get_dummies(temp_test['finca'])], axis=1)
            df_clean_test.fillna(value=0, inplace=True)
            y_real_test = df_clean_test.coeficiente
            X_real_test = df_clean_test.drop('coeficiente', axis=1)

            temp = recons[(recons['tipo'] == i) & (recons['Color'] == j)]
            temp = temp[temp['variedad'].isin(temp_test['variedad'].unique())]
            temp = temp[temp['finca'].isin(temp_test['finca'].unique())]
            df_clean = pd.concat([temp[['edad', 'edad^2', 'edad^3', 'mes_dato',
                                        '5sem_atras', '6sem_atras', '7sem_atras',
                                        '8sem_atras', '9sem_atras', '10sem_atras',
                                        # '11sem_atras', '12sem_atras', '13sem_atras',
                                        # '14sem_atras', '15sem_atras',
                                        'curva_metodo_finca', 'coeficiente']],
                                  pd.get_dummies(temp['variedad']),
                                  pd.get_dummies(temp['finca'])], axis=1)
            df_clean.fillna(value=0, inplace=True)
            y = df_clean.coeficiente
            X = df_clean.drop('coeficiente', axis=1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                                random_state=42)

            scaler = StandardScaler()
            X_train_std = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            neurons = 256
            model = keras.Sequential([
                layers.Dense(neurons, activation='relu', input_shape=[len(X_train_std.columns)]),
                layers.Dense(neurons, activation='relu'),
                layers.Dense(1, activation='relu')  # output layer
            ])
            model.compile(loss='mse', optimizer='adam')
            history = model.fit(X_train_std, y_train, epochs=100,
                                validation_split=0.2, verbose=0, batch_size=100)

            X_norm = scaler.transform(X_real_test)
            indice = X_real_test.reset_index()['index']
            y_hat = model.predict(X_norm)
            y_hat = pd.Series(y_hat[0:, 0], name='y_hat')
            y_hat.index = X_real_test.index
            y_hat_rn = pd.concat([y_hat_rn, y_hat], axis=1)

    y_hat_rn.drop(columns={'y_hat_falso'}, inplace=True)
    ser_y_hat = np.sum(y_hat_rn, axis=1)
    y_hat_rn['y_hat_red_n'] = ser_y_hat
    validacion_y_hat = y_hat_rn[['y_hat_red_n']]
    validacion_final = pd.concat([recons_test, validacion_y_hat], axis=1)
def replace_and_concat(column_name, df):
    # One-hot encode the given column and splice the dummy columns into the frame.
    replacement = pandas.get_dummies(df[column_name], prefix=column_name)
    replacement = replacement.set_index(df.index)
    df.drop(column_name, axis=1, inplace=True)
    df = pandas.concat([df, replacement], axis=1)
    return df
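# A minimal usage sketch: one-hot encode a single categorical column in place of
# the original column. The toy frame below is illustrative.
import pandas

toy = pandas.DataFrame({"city": ["A", "B", "A"], "sales": [10, 20, 15]})
encoded = replace_and_concat("city", toy)
print(encoded.columns.tolist())  # ['sales', 'city_A', 'city_B']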
errors='ignore')

# clean and transform agent records
for row in jdata:
    row['agentActiveYr'] = row['reportYear']
    row['agentTitle'] = 'REGISTERED AGENT'
    # remove unwanted officers
    row.pop('officersList', None)

# export agent records from in-memory python objects to dataframes
df_agents = json_normalize(jdata, None, errors='ignore')

# combine agents and officers into a single dataframe set
df = pd.concat([df_agents, df_officers])

# trim all fields on all rows
df = trimAllColumns(df)
df = removePeriodsFromAllColumns(df)

# remove duplicates
groupby_list = list(set(df.columns) - set(['agentTitle']))
df = df.groupby(groupby_list).agg({
    'agentTitle': combineRows,
})

# remove multi-level index prior to JSON export
df = df.reset_index()
df = df.sort_index(axis=1)
df = df.sort_values(['taxpayerId'])
def run(self, surface_only=True, improvements_only=True, progress=True, view=None):
    """Run the differential flux variability analysis.

    Parameters
    ----------
    surface_only : bool, optional
        If only the surface of the n-dimensional production envelope
        should be scanned (defaults to True).
    improvements_only : bool, optional
        If only grid points should be scanned that constitute an
        improvement in production over the reference state (defaults to True).
    progress : bool, optional
        If a progress bar should be shown.
    view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
        A parallelization view (defaults to SequentialView).

    Returns
    -------
    pandas.Panel
        A pandas Panel containing a results DataFrame for every grid point scanned.
    """
    with TimeMachine() as tm:
        # Make sure that the design_space_model is restored to its original state later.
        for variable in self.variables:
            reaction = self.design_space_model.reactions.get_by_id(variable)
            # `int` acts as a no-op "do"; the undo restores the original bound.
            tm(do=int, undo=partial(setattr, reaction, 'lower_bound', reaction.lower_bound))
            tm(do=int, undo=partial(setattr, reaction, 'upper_bound', reaction.upper_bound))
        target_reaction = self.design_space_model.reactions.get_by_id(self.objective)
        tm(do=int, undo=partial(setattr, target_reaction, 'lower_bound', target_reaction.lower_bound))
        tm(do=int, undo=partial(setattr, target_reaction, 'upper_bound', target_reaction.upper_bound))

        if view is None:
            view = config.default_view

        included_reactions = [
            reaction.id for reaction in self.reference_model.reactions
            if reaction.id not in self.exclude
        ] + self.variables + [self.objective]

        self.reference_flux_dist = pfba(self.reference_model, fraction_of_optimum=0.99)
        self.reference_flux_ranges = flux_variability_analysis(
            self.reference_model, reactions=included_reactions, view=view,
            remove_cycles=False, fraction_of_optimum=0.75).data_frame

        self._init_search_grid(surface_only=surface_only, improvements_only=improvements_only)

        func_obj = _DifferentialFvaEvaluator(self.design_space_model, self.variables,
                                             self.objective, included_reactions)
        if progress:
            progress = ProgressBar(len(self.grid))
            results = list(progress(view.imap(func_obj, self.grid.iterrows())))
        else:
            results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result)
                         for (point, fva_result) in results)
        reference_intervals = self.reference_flux_ranges[['lower_bound', 'upper_bound']].values
        for sol in six.itervalues(solutions):
            intervals = sol[['lower_bound', 'upper_bound']].values
            gaps = [self._interval_gap(interval1, interval2)
                    for interval1, interval2 in my_zip(reference_intervals, intervals)]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                normalizer = sol.lower_bound[self.normalize_ranges_by]
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol[['lower_bound', 'upper_bound']].values / normalizer
                    sol['normalized_gaps'] = [
                        self._interval_gap(interval1, interval2)
                        for interval1, interval2 in my_zip(reference_intervals,
                                                           normalized_intervals)]
                else:
                    sol['normalized_gaps'] = [numpy.nan] * len(sol.lower_bound)
            else:
                sol['normalized_gaps'] = gaps

        # Treat reference fluxes below the threshold as zero.
        ref_upper_bound = self.reference_flux_ranges.upper_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)
        ref_lower_bound = self.reference_flux_ranges.lower_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)

        collection = list()
        for key, df in six.iteritems(solutions):
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) &
                   (ref_upper_bound != 0) & (ref_lower_bound != 0), 'KO'] = True
            df.loc[((ref_upper_bound < 0) & (df.lower_bound > 0) |
                    ((ref_lower_bound > 0) & (df.upper_bound < 0))), 'flux_reversal'] = True
            # NOTE: the original first clause, `(df.lower_bound <= 0) & (df.lower_bound > 0)`,
            # is always False; it presumably meant to test the reference upper bound,
            # mirroring the second clause.
            df.loc[((ref_upper_bound <= 0) & (df.lower_bound > 0)) |
                   ((ref_lower_bound >= 0) & (df.upper_bound <= 0)), 'suddenly_essential'] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index], dtype=bool)
            not_reversible = numpy.logical_not(is_reversible)
            df.loc[((df.lower_bound == -1000) & (df.upper_bound == 1000) & is_reversible) |
                   ((df.lower_bound == 0) & (df.upper_bound == 1000) & not_reversible) |
                   ((df.lower_bound == -1000) & (df.upper_bound == 0) & not_reversible),
                   'free_flux'] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)

        # multi_index = [(key[0][1], key[1][1]) for key in solutions]
        # solutions_multi_index = pandas.concat(list(solutions.values()),
        #                                       axis=0, keys=multi_index)
        # solutions_multi_index.index.set_names(['biomass', 'production',
        #                                        'reaction'], inplace=True)
        total = pandas.concat(collection, ignore_index=True, copy=False)
        total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
        total.index = total['reaction']
        return DifferentialFVAResult(total, self.envelope, self.reference_flux_ranges,
                                     self.reference_flux_dist)
        fail_count[2] += 1
        continue
    if end[0] > 38.5 or end[0] < 37.5:
        fail_count[3] += 1
        continue
    trimmed_edges.append(e)

print(fail_count)
print(len(trimmed_edges))
# print(trimmed_edges)

edgeIDs = pd.Series([edge['EdgeID'] for edge in trimmed_edges])
starts = pd.Series([edge['startCoords'] for edge in trimmed_edges])
ends = pd.Series([edge['endCoords'] for edge in trimmed_edges])
dists = pd.Series([edge['distance'] for edge in trimmed_edges])
trimmed_df = pd.concat([edgeIDs, starts, ends, dists], axis=1,
                       keys=['EdgeID', 'startCoords', 'endCoords', 'distance'])
# trimmed_df = pd.concat(trimmed_edges, axis=0, keys=[edge['EdgeID'] for edge in trimmed_edges])
# print(trimmed_df)
trimmed_df.to_csv("trimmed_edges.csv")

# cats = df['Category']
# print(type(cats).__name__)
# print(len(cats))
# for i, crime in df.iterrows():
#     find the minimum (distFromStreet, CrimeStreet) pair
#     add the crime to that street
def run(self, surface_only=True, improvements_only=True, progress=True,
        view=None, fraction_of_optimum=1.0):
    """Run the differential flux variability analysis.

    Parameters
    ----------
    surface_only : bool, optional
        If only the surface of the n-dimensional production envelope
        should be scanned (defaults to True).
    improvements_only : bool, optional
        If only grid points should be scanned that constitute an
        improvement in production over the reference state (defaults to True).
    progress : bool, optional
        If a progress bar should be shown.
    view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
        A parallelization view (defaults to SequentialView).
    fraction_of_optimum : float, optional
        A value between zero and one that determines the width of the
        flux ranges of the reference solution. The lower the value,
        the larger the ranges.

    Returns
    -------
    pandas.Panel
        A pandas Panel containing a results DataFrame for every grid point scanned.
    """
    # Calculate the reference state.
    self.reference_flux_dist = pfba(self.reference_model,
                                    fraction_of_optimum=fraction_of_optimum)
    self.reference_flux_ranges = flux_variability_analysis(
        self.reference_model, reactions=self.included_reactions, view=view,
        remove_cycles=False,
        fraction_of_optimum=fraction_of_optimum).data_frame
    self.reference_flux_ranges[
        self.reference_flux_ranges.abs() < non_zero_flux_threshold] = 0.0
    reference_intervals = self.reference_flux_ranges.loc[
        self.included_reactions, ['lower_bound', 'upper_bound']].values

    if self.normalize_ranges_by is not None:
        logger.debug(self.reference_flux_ranges.loc[self.normalize_ranges_by, ])
        # The most obvious flux to normalize by is the biomass reaction
        # flux. This is probably always greater than zero. Just in case
        # the model is defined differently or some other normalizing
        # reaction is chosen, we use the absolute value.
        norm = abs(self.reference_flux_ranges.at[self.normalize_ranges_by, "lower_bound"])
        if norm > non_zero_flux_threshold:
            normalized_reference_intervals = reference_intervals / norm
        else:
            raise ValueError(
                "The reaction that you have chosen for normalization '{}' "
                "has zero flux in the reference state. Please choose another "
                "one.".format(self.normalize_ranges_by))

    with TimeMachine() as tm:
        # Make sure that the design_space_model is restored to its original state later.
        for variable in self.variables:
            reaction = self.design_space_model.reactions.get_by_id(variable)
            tm(do=int, undo=partial(setattr, reaction, 'lower_bound', reaction.lower_bound))
            tm(do=int, undo=partial(setattr, reaction, 'upper_bound', reaction.upper_bound))
        target_reaction = self.design_space_model.reactions.get_by_id(self.objective)
        tm(do=int, undo=partial(setattr, target_reaction, 'lower_bound', target_reaction.lower_bound))
        tm(do=int, undo=partial(setattr, target_reaction, 'upper_bound', target_reaction.upper_bound))

        if view is None:
            view = config.default_view

        self._init_search_grid(surface_only=surface_only, improvements_only=improvements_only)

        func_obj = _DifferentialFvaEvaluator(self.design_space_model, self.variables,
                                             self.objective, self.included_reactions)
        if progress:
            progress = ProgressBar(len(self.grid))
            results = list(progress(view.imap(func_obj, self.grid.iterrows())))
        else:
            results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result)
                         for (point, fva_result) in results)

        for sol in solutions.values():
            # Zero out numerically insignificant fluxes, then compute the gaps
            # to the reference intervals.
            sol[sol.abs() < non_zero_flux_threshold] = 0.0
            intervals = sol.loc[self.included_reactions, ['lower_bound', 'upper_bound']].values
            gaps = [self._interval_gap(interval1, interval2)
                    for interval1, interval2 in zip(reference_intervals, intervals)]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                # See comment above regarding normalization.
                normalizer = abs(sol.lower_bound[self.normalize_ranges_by])
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol.loc[
                        self.included_reactions,
                        ['lower_bound', 'upper_bound']].values / normalizer
                    sol['normalized_gaps'] = [
                        self._interval_gap(interval1, interval2)
                        for interval1, interval2 in zip(normalized_reference_intervals,
                                                        normalized_intervals)]
                else:
                    sol['normalized_gaps'] = numpy.nan
            else:
                sol['normalized_gaps'] = gaps

        # Determine where the reference flux range overlaps with zero.
        zero_overlap_mask = numpy.asarray([
            self._interval_overlap(interval1, (0, 0)) > 0
            for interval1 in reference_intervals], dtype=bool)

        collection = list()
        for key, df in solutions.items():
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) &
                   (~zero_overlap_mask), 'KO'] = True
            df.loc[((self.reference_flux_ranges.upper_bound < 0) & (df.lower_bound > 0) |
                    ((self.reference_flux_ranges.lower_bound > 0) & (df.upper_bound < 0))),
                   'flux_reversal'] = True
            df.loc[(zero_overlap_mask & (df.lower_bound > 0)) |
                   (zero_overlap_mask & (df.upper_bound < 0)),
                   'suddenly_essential'] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index], dtype=bool)
            not_reversible = ~is_reversible
            df.loc[((df.lower_bound == -1000) & (df.upper_bound == 1000) & is_reversible) |
                   ((df.lower_bound == 0) & (df.upper_bound == 1000) & not_reversible) |
                   ((df.lower_bound == -1000) & (df.upper_bound == 0) & not_reversible),
                   'free_flux'] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)

    # multi_index = [(key[0][1], key[1][1]) for key in solutions]
    # solutions_multi_index = pandas.concat(list(solutions.values()),
    #                                       axis=0, keys=multi_index)
    # solutions_multi_index.index.set_names(['biomass', 'production',
    #                                        'reaction'], inplace=True)
    total = pandas.concat(collection, ignore_index=True, copy=False)
    total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
    total.index = total['reaction']
    return DifferentialFVAResult(total, self.envelope, self.reference_flux_ranges)
def build_fingerprint_matrices(self):
    # pathnames: list of paths to each piece for which a fingerprint matrix should be built
    # number_of_fingerprints: however many fingerprints you need
    interval_settings = self.interval_settings
    fingerprint_matrices = {}

    # Load pickled fingerprints
    if self.fp_pickle_path is not None:
        if os.path.isfile(self.fp_pickle_path):
            print("Found pickled fingerprints at '" + self.fp_pickle_path + "', importing...")
            with open(self.fp_pickle_path, 'rb') as fp_pickle:
                fingerprint_matrices = pickle.load(fp_pickle)
        else:
            print("Warning: was asked to look for pickled fingerprints at '" + self.fp_pickle_path + "'")
            print("Couldn't find any -- new pickle file will be created.")

    number_of_fingerprints = self.number_of_fingerprints
    for path in self.pathnames:
        # Skip pickled fingerprints
        if os.path.basename(path) in fingerprint_matrices.keys():
            continue

        # Setup for each piece
        # print("Indexing " + path)
        piece = IndexedPiece(path)
        piece_stream = music21.converter.parseFile(path)

        # LM: Get time signature and determine strong beats
        time_sigs = piece.get_data([metre.TimeSignatureIndexer])

        # Assuming no time signature change in whole piece, assign offsets to strong beats
        if time_sigs['metre.TimeSignatureIndexer']['0'].iloc[0] == '6/8' or \
                time_sigs['metre.TimeSignatureIndexer']['0'].iloc[0] == '9/8':
            strong_beat_offsets = 1.5
            measures = 4
        else:
            strong_beat_offsets = 1.0
            measures = 4

        # LM: Get total number of offsets
        numer, denom = time_sigs['metre.TimeSignatureIndexer']['0'].iloc[0].split('/')
        # Four bars worth of offsets, ignoring anacrusis...
        # Add an extra strong beat at end
        total_offsets = int(numer) * measures * 4.0 / int(denom) + strong_beat_offsets

        interval_settings['quarterLength'] = strong_beat_offsets
        interval_settings['intervalDistance'] = strong_beat_offsets
        interval_settings['subsection'] = (0.0, total_offsets)

        # LM: Build strong-interval frame
        strong_intervals = self.__build_strong_intervals(piece, interval_settings,
                                                         strong_beat_offsets, total_offsets)
        # LM: Build weak-interval frame
        weak_intervals = self.__build_weak_intervals(piece, interval_settings,
                                                     strong_beat_offsets, total_offsets)

        # LM: Assemble results
        # 1. Prepare strong_intervals -- had to change this due to change in representation...
        #    take off final column (start of new bar)
        strong_intervals = strong_intervals.T.iloc[:-1].T
        strong_intervals = self.__shift_matrix(strong_intervals)
        # Had to change this due to change in representation.... take off final row
        # strong_intervals = strong_intervals.iloc[:]
        # 2. Prepare weak_intervals:
        weak_intervals = weak_intervals.iloc[:]
        weak_intervals.index = my_range(strong_beat_offsets, strong_beat_offsets,
                                        total_offsets + strong_beat_offsets)
        # 3. Row of 0s --- added after discussion with Laura pertaining to fingerprint representation
        zeros = DataFrame(Series([0.0] * (len(weak_intervals))))
        zeros.index = (my_range(strong_beat_offsets, strong_beat_offsets,
                                total_offsets + strong_beat_offsets))
        zeros = zeros.T
        # 4. Append
        fingerprint_frame = pandas.concat([weak_intervals.T, zeros, strong_intervals])
        fingerprint_frame.index = (['w'] + fingerprint_frame.index.tolist()[1:])

        # piece_stream.show('musicxml', 'MuseScore')
        # DataFrame(Series([0.0]*(len(weak_intervals)+1))).reindex(range(1, len(weak_intervals)+1)).T

        fingerprint_matrices[os.path.basename(path)] = fingerprint_frame

        number_of_fingerprints -= 1
        if 0 == number_of_fingerprints:
            print("Max Number of Fingerprints Reached")
            break

    return fingerprint_matrices