def __call__(self, df):
    if len(self.args) >= 2:
        if not isinstance(self.args[0], dplython.later.Later) or \
                not isinstance(self.args[1], dplython.later.Later):
            raise ValueError(
                "Arguments must be of the form \"X.column1, X.column2, ...\"")
        sp_key = self.args[0]._name
        sp_value = self.args[1]._name
    else:
        raise ValueError(
            "You must provide at least two arguments, the key and the value.")

    all_id_cols = []
    all_value_cols = list(df.columns)
    if len(self.args) > 2:
        if 'exclude' in self.kwargs and self.kwargs['exclude'] == True:
            for arg in self.args[2:]:
                if not isinstance(arg, dplython.later.Later):
                    raise ValueError(
                        "Arguments must be of the form \"X.column1, X.column2, ...\"")
                all_id_cols.append(arg._name)
                all_value_cols.remove(arg._name)
        else:
            all_id_cols = list(df.columns)
            all_value_cols = []
            for arg in self.args[2:]:
                if not isinstance(arg, dplython.later.Later):
                    raise ValueError(
                        "Arguments must be of the form \"X.column1, X.column2, ...\"")
                all_id_cols.remove(arg._name)
                all_value_cols.append(arg._name)

    outdf = DplyFrame(
        df.melt(id_vars=all_id_cols, value_vars=all_value_cols))
    cols = list(outdf.columns)
    cols[-2:] = sp_key, sp_value
    outdf.columns = cols
    return outdf
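# Usage sketch for the melt-style verb whose __call__ is defined above. The class
# name is not shown in this snippet, so the `gather` binding below is assumed, and
# the columns are invented purely for illustration.
import pandas as pd
from dplython import DplyFrame, X

wide = DplyFrame(pd.DataFrame({
    "id": [1, 2],
    "jan": [10, 20],
    "feb": [30, 40],
}))

# The first two arguments name the new key/value columns; any further arguments
# select the value columns to melt (or, with exclude=True, the id columns to keep).
long = wide >> gather(X.month, X.sales, X.jan, X.feb)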
def create_task2():
    df = clean_df(request.json)
    print(df["Category"])
    df['supplier'] = df['Category'].apply(lambda x: supp(x))
    df = DplyFrame(df) >> group_by(X.supplier) >> summarize(
        max1=most_common(X.Name))
    print(df)
    # df_fav = df >> mutate(new = supp(X.Category))
    jsondf = df.to_json(orient='records')
    return jsondf
def __call__(self, df):
    if len(self.args) >= 2:
        if not isinstance(self.args[0], dplython.later.Later) or \
                not isinstance(self.args[1], dplython.later.Later):
            raise ValueError(
                "Arguments must be of the form \"X.column1, X.column2, ...\"")
        sp_key = self.args[0]._name
        sp_value = self.args[1]._name
    else:
        raise ValueError(
            "You must provide at least two arguments, the key and the value.")

    multiindex = [s for s in df.columns if s != sp_key and s != sp_value]
    outdf = DplyFrame(
        df.set_index(multiindex).pivot(columns=sp_key,
                                       values=sp_value)).reset_index()
    outdf.columns.name = None
    outdf = outdf[multiindex + list(dict.fromkeys(df[sp_key]))]
    return outdf
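# Usage sketch for the pivot-style verb whose __call__ is defined above. As with
# the melt-style verb earlier, the class name is not shown, so the `spread` binding
# below is assumed and the data is invented.
import pandas as pd
from dplython import DplyFrame, X

long = DplyFrame(pd.DataFrame({
    "id": [1, 1, 2, 2],
    "month": ["jan", "feb", "jan", "feb"],
    "sales": [10, 30, 20, 40],
}))

# The first argument names the key column to spread into new columns, the second
# the value column; every remaining column becomes part of the row index first.
wide = long >> spread(X.month, X.sales)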
def read_delim(f, delim, col_names=True):
    assert isinstance(f, str)
    assert isinstance(delim, str)
    assert isinstance(col_names, bool)

    if col_names == True:
        col_names = 0
    else:
        col_names = None

    df = DplyFrame(pd.read_csv(filepath_or_buffer=f, header=col_names, sep=delim))
    if col_names is None:
        df.columns = [''.join(map(str, list(n)))
                      for n in zip(cycle(['X']), range(1, df.shape[1] + 1))]
    return df
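# Hedged usage sketch for read_delim above. It relies on pandas, DplyFrame and
# itertools.cycle being in scope; the file paths here are made up. With
# col_names=False the columns are renamed X1..Xn by the branch above.
from itertools import cycle

df_named = read_delim("data.tsv", "\t")                  # first row becomes the header
df_anon = read_delim("data_no_header.tsv", "\t", False)  # columns become X1, X2, ...
print(df_anon.columns.tolist())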
def clean_df(requestjson):
    # data = request.json
    data = requestjson
    dd = data["file2"]
    df = pd.DataFrame(dd)
    df.columns = df.columns.map(lambda x: re.sub(r'\W+', '', x))
    a = DplyFrame(df)
    return a
def add_day(df, countriez):
    df_temp = df.copy()
    df = pd.DataFrame(index=range(0, 100000))
    for country in countriez:
        data = (DplyFrame(df_temp) >> sift(X.country == country))
        df_filt = (data >> mutate(day=range(1, len(data) + 1)))
        df = pd.concat([df, df_filt], sort=False).dropna(how='all')
    return df
def dply(self):
    """
    Return dplyr frame for the read table.

    dplyr is an R inspired wrapper to process Pandas tables in a flow-like
    manner. See https://github.com/dodger487/dplython and
    https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
    for more details about dplyr. dplyr and nuts-ml use the same syntax (>>)
    for chaining functions and integrate nicely with each other.

    :return: dplyr dataframe instead of Pandas dataframe.
    :rtype: DplyFrame
    """
    return DplyFrame(self.dataframe)
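# Minimal sketch of what the DplyFrame returned by dply() supports; the sample
# data and column names below are invented for illustration.
import pandas as pd
from dplython import DplyFrame, X, sift, select, head

frame = DplyFrame(pd.DataFrame({
    "filename": ["a.png", "b.png", "c.png"],
    "label": ["cat", "dog", "cat"],
}))
result = (frame
          >> sift(X.label == "cat")       # filter rows, dplyr-style
          >> select(X.filename, X.label)  # keep only these columns
          >> head(5))
print(result)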
# cummin(series)
diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)

# cumprod(series)
diamonds >> select(X.price) >> mutate(price_cumprod=cumprod(X.price)) >> head(6)

# Extending dfply with custom functions
# https://github.com/kieferk/dfply/blob/master/examples/basics-extending-functionality.ipynb

############### dplython #################
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by, summarize,
                      DelayFunction)

df = DplyFrame(df)
df >> head(5)
df >> sample_n(5)
df >> select(X.name, X.category, X.country, X.role, X.description)
df >> sift(X.category == 'Leadership')  # as in pandas, use bitwise logical operators like |, & (, is the same as &)
df >> arrange(X.country)  # couldn't find a way to sort descending, so moved to the dfply library
df >> mutate(carat_bin=X.carat.round())
df >> group_by(X.category) >> summarize(num_of_people=X.name.count())

# It's possible to pass the entire dataframe using X._
# The special Later name, "_", will refer to the entire DataFrame.

# Combine multiple (df >> sift(X.name != 'Unsung hero') >> group_by(X.category)
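# Hedged sketch of the X._ and DelayFunction features mentioned in the comments
# above, using the diamonds sample frame bundled with dplython. DelayFunction
# wraps an ordinary function so it can take X expressions inside a >> pipeline;
# exact return types may differ from this sketch.
from dplython import X, diamonds, DelayFunction, select, head

@DelayFunction
def row_count(frame):
    # evaluated lazily: `frame` is the concrete DataFrame flowing through the pipe
    return len(frame)

print(diamonds >> select(X.carat, X.price) >> head(5))
# X._ refers to the entire DataFrame, so a delayed function can consume it whole
print(diamonds >> row_count(X._))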
# min-max scaling helper; the def line was missing from this fragment and is
# inferred from the MinMaxScaler call referenced below
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)


# train parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(pd.read_csv('./bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
# data = MinMaxScaler(data)
data = tf.layers.batch_normalization(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # next close price
    print(_x, "->", _y)
"""Plotting distribution of feature coefficients for ARIMAX models.""" import json import pandas as pd from dplython import DplyFrame import matplotlib matplotlib.use('TkAgg') from ggplot import (ggplot, scale_color_brewer, geom_histogram, scale_x_continuous, facet_wrap, labs, ggtitle, scale_y_continuous, theme_gray, aes, geom_boxplot) # noqa: E402 # data frame processing; creating a master df for faceting plots json_filename = "arima_1.json" coefficient_dict = json.load(open(json_filename)) coefficients = DplyFrame( pd.DataFrame.from_dict(coefficient_dict, orient='index')) coefficients = coefficients.transpose() folder_name = json_filename.split('.')[0] dfs_to_concat = [] for feature in [ "home_goal", "away_goal", "home_yellow", "away_yellow", "home_red", "away_red" ]: # create a new long-form dataframe for clean plotting purposes values_dict = { "significant": coefficients[feature]["significant"], "insignificant": coefficients[feature]["unsignificant"] } df = pd.DataFrame.from_dict(values_dict, orient='index')
output = output.drop(columns='content')

# filter to keep only the bigrams
bigr = output[output['word'].str.contains("_")]

# FROM THIS PART, 2 STRATEGIES: SAVE THE OUTPUT AND CONTINUE WITH R, OR GO AHEAD WITH PYTHON

# 5. plotting
# 5.1 aggregating for plotting
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by, summarize,
                      DelayFunction)

dfr = DplyFrame(output)
dfr = (dfr >> group_by(X.word, X.source) >> summarize(tot=X.count.sum()))
dff = (dfr >> select(X.word, X.tot))

# 5.2 wordcloud
# turn the word frequencies into a dict
d = {}
for a, x in dff.values:
    d[a] = x

wordcloud = WordCloud(width=1000, height=1000, background_color='white',
                      min_font_size=15,
                      max_font_size=120).generate_from_frequencies(frequencies=d)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
def load_data(input_dir, crsrd_id):
    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    cctv_log['DATE'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).date)
    cctv_log['HOUR'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).hour)
    cctv_log['MINUTE'] = (
        pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).minute) // 30) * 30
    cctv_log['temp_DAY'] = pd.to_datetime(cctv_log['DATE']).dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)   # mon - fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  # sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  # sun

    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())

    # Extract records of selected crossroad
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame from existing dataset
    tf = DplyFrame(
        df0.drop_duplicates(
            ['DATE', 'DAY', 'HOUR', 'MINUTE'], keep='last')) >> select(
                X.DATE, X.DAY, X.HOUR, X.MINUTE)

    # Process the data structure into a pivot
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = tf
    for cctv in cctv_list:
        a = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1, a, how='left',
                       on=['DATE', 'DAY', 'HOUR', 'MINUTE'],
                       suffixes=('', '_' + str(cctv)))

    df1 = df1.set_index(['DATE', 'DAY', 'HOUR', 'MINUTE'])
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    df1['TOTAL_TRF'] = DplyFrame(df1.iloc[:, 4:3 + len(cctv_list) * 2].sum(
        axis=1, skipna=True))
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)

    # Name the cctv id and direction - for tod_traffic_analyzer
    cols = [cctv + '_GO_RATE' for cctv in cctv_list]
    cols.extend([cctv + '_LEFT_RATE' for cctv in cctv_list])
    cols = sorted(cols)
    cols = ['TOD'] + cols + ['TOTAL_TRF']

    return df1, cols
# Read in temp file to match
import pandas as pd

co2_file_to_match = "annual_avg_co2_GFDL-ESM2M_rcp45_r1i1p1.csv"
esm_co2_data = pd.read_csv(co2_file_to_match).rename(columns={
    "value.1...": "co2_value"
}).set_index('year')
esm_co2 = esm_co2_data.co2_value * 1000000
# print esm_co2_data.head(5)
# print esm_co2.head(5)

from numpy import mean
from dplython import (DplyFrame, X, mutate)

CONCENTRATION_CO2 = "simpleNbox.Ca"
hector_co2 = pyhector.run(pyhector.rcp45)[CONCENTRATION_CO2].loc[esm_co2.index]
comp = DplyFrame({"hector": hector_co2, "esm": esm_co2})


def difference_quantifier(esm_series, hector_run_series):
    calculate_df = DplyFrame({"hector": hector_run_series, "esm": esm_series})
    calculate_df = calculate_df >> mutate(percentdiff=(X.hector - X.esm) / X.esm)
    return mean(abs(calculate_df.percentdiff))


# print difference_quantifier(esm_co2, hector_co2)


def hector_runner(params, comp_data, var):
    hector_output = pyhector.run(
        pyhector.rcp45, {
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 6.4, 4.8
import numpy as np
import seaborn as sns
sns.set_theme(style="ticks", palette="pastel")
import altair as alt

firsts = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/firsts.csv'
)
firsts.to_csv('/Users/vivekparashar/Downloads/firsts.csv')

# Create/convert a pandas dataframe to a dplython df
firsts = DplyFrame(firsts)
firsts.columns
firsts.gender.unique()
firsts.category.unique()

# firsts df summary by category
t1 = (firsts >> mutate(year_grp=((X.year / 10).round()) * 10) >> group_by(
    X.year_grp, X.category) >> summarize(nrows=X.accomplishment.count()))
c1 = alt.Chart(t1).mark_circle().encode(x='year_grp:O', y='category:O',
                                        size='nrows:Q')
c3 = alt.Chart(t1).mark_bar().encode(x='year_grp', y='nrows', color='category')

# firsts df summary by gender
t2 = (firsts >> mutate(year_grp=((X.year / 10).round()) * 10) >> group_by(
    X.year_grp, X.gender) >> summarize(nrows=X.accomplishment.count()))
# Django is only used here to fetch the data from djangodb
if __name__ == "__main__":
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    application = DjangoWhiteNoise(get_wsgi_application())

    from dplython import DplyFrame
    from article_fetcher import ArticleFetcher
    from tagger.models import Article
    import pandas as pd

    # Trained data
    label_df = DplyFrame(
        pd.read_csv(
            "/Users/danchecketts/PycharmProjects/taggernews/supervised_topics.csv"))

    RETRY = True
    DEBUG = False
    DEBUG_FETCH_MAX = 100
    FETCH_NOT_CACHED = True

    loops = 0
    skipped_due_state = 0
    retry_articles = 0
    loaded_from_db = 0
    uncached_articles = []

    for article_id in label_df.id:
temp_df = temp_df.set_index(match_data.index)
temp_df["time"] = match_data["time"]

# append to long form df and plot
preds_df = pd.concat([longform_df, temp_df], axis=0, ignore_index=True)
sifted_df = preds_df >> sift((X.match_id == match_id)
                             | (X.match_id == opponent_match_id)
                             | (X.match_id == model_no))
plot_matches(sifted_df, date, filename_out)


# read & dissect df
longform_df_og = pd.read_csv("../../LongForm/longform.csv",
                             dtype={'shorthand_search_vol': int})
longform_df = DplyFrame(longform_df_og)
cols = longform_df.columns
y_var = longform_df["shorthand_search_vol"]  # search volume
longform_df['ones'] = 1

# MLR MODEL #1: USING ONLY MATCH STAGES TO PREDICT SEARCH VOLUME
# note the difference in argument order; y_var is dependent, x_vars independent
# using Stage 0 as "reference level"; only vars are stage_1-4_indicators
x_var_list = [
    "ones", "stage_1_ind", "stage_2_ind", "stage_3_ind", "stage_4_ind"
]
x_vars = longform_df[x_var_list]
lm = sm.OLS(y_var, x_vars).fit()
with open('model1.txt', 'w') as f:
    print >> f, lm.summary()

predict_on_match_id(lm=lm,
# min-max scaling helper; the def line was missing from this fragment and is
# inferred from the MinMaxScaler call below
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)


# train parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(
    pd.read_csv('/home/yeolpyeong/bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
data = MinMaxScaler(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # next close price
    print(_x, "->", _y)
    dataX.append(_x)
def main(argv):
    yURL = None
    outdir = None
    maxFrames = 500

    yURL = input("Enter the youtube url:")
    outdir = input("Enter the output directory:")
    maxFrames = int(input("Enter the maximum number of frames to check:"))

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(yURL, faceDet, faceDet2,
                                                faceDet3, faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    resultsDf = pd.DataFrame()
    frameId = 0
    for image in pframes:
        print("posting frame %d of %d" % (frameId, len(pframes)))
        # sending the frame image to MS cognitive services
        resultMS = processRequest(image, headers)
        # isinstance == type()
        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    resFrameList = []
                    for res in result['scores'].items():
                        resFrameList.append(
                            (frameId, res[0], res[1],
                             result["faceRectangle"]['left'],
                             result["faceRectangle"]['top'],
                             result["faceRectangle"]['width'],
                             result["faceRectangle"]['height']))
                    appendDf = pd.DataFrame(resFrameList,
                                            columns=[
                                                "frameId", "emotionLabel",
                                                "conf", "faceleft", "facetop",
                                                "faceW", "faceH"
                                            ])
                    resultsDf = resultsDf.append(appendDf)
        time.sleep(2)
        frameId += 1

    # print(resultsDf)
    # we append all the data to the dataframe
    # http://bluescreen.club/2017/06/18/import-pandas-as-pd/
    # then we convert the dataframe to a DplyFrame object which allows us to do
    # higher level data analytics
    # for this one, we will select out the top most ranking face frames for each
    # of the emotions
    # microsoft provides us with around 8 emotions
    # so we sort out 8 faces for 8 emotions and then save them accordingly
    dfFaces = DplyFrame(resultsDf)
    # print(dfFaces)
    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max()) >>
        sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))
    topFaces = topFaces.drop_duplicates()
    # print(topFaces)

    i = 0
    for index, row in topFaces.iterrows():
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        confid = int(row["conf"] * 100)
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # save cropped face
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        # if you wish to put a rectangle on the faces then uncomment below
        # cv2.rectangle(image, (faceL, faceT),
        #               (faceL + faceW, faceT + faceH),
        #               color=(255, 0, 0), thickness=5)
        # cv2.putText(image, emotion, (faceL, faceT - 10),
        #             cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
        # cv2.imwrite(os.path.expanduser("%s/%s.jpg" % (outdir, emotion)), image)
        i += 1
def __rrshift__(self, other):
    return self.__call__(DplyFrame(other.copy(deep=True)))
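# A note on what the hook above buys: when the left operand of >> is a plain
# pandas DataFrame (which does not handle >> for a verb object), Python falls
# back to the verb's __rrshift__, which copies the frame into a DplyFrame and
# applies the verb. Hedged sketch below, reusing the hypothetical `gather`
# binding from the earlier melt-style example.
import pandas as pd
from dplython import X

plain = pd.DataFrame({"id": [1, 2], "jan": [10, 20], "feb": [30, 40]})

# dispatches through __rrshift__ because `plain` is not a DplyFrame
long_form = plain >> gather(X.month, X.sales, X.jan, X.feb)

# ...which is equivalent to calling the hook directly:
long_form_explicit = gather(X.month, X.sales, X.jan, X.feb).__rrshift__(plain)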
home_yellows = df.home_yellows.apply(lambda x: len(literal_eval(x)))
away_yellows = df.away_yellows.apply(lambda x: len(literal_eval(x)))
home_reds = df.home_reds.apply(lambda x: len(literal_eval(x)))
away_reds = df.away_reds.apply(lambda x: len(literal_eval(x)))

data = pd.DataFrame({'home_goals': home_goals, 'away_goals': away_goals,
                     'home_yellows': home_yellows, 'away_yellows': away_yellows,
                     'home_reds': home_reds, 'away_reds': away_reds})
df_container.append(data)

# concatenate all into one master data frame & generate descriptive stats
master_df = DplyFrame(pd.concat(df_container))
print("home goals", "\n", master_df.home_goals.describe())
print("away goals", "\n", master_df.away_goals.describe())
print("home yellows", "\n", master_df.home_yellows.describe())
print("away yellows", "\n", master_df.away_yellows.describe())
print("home reds", "\n", master_df.home_reds.describe())
print("away reds", "\n", master_df.away_reds.describe())

print("frequency of home goals", len(master_df >> sift(X.home_goals > 0)))
print("frequency of away goals", len(master_df >> sift(X.away_goals > 0)))
print("frequency of home yellows", len(master_df >> sift(X.home_yellows > 0)))
print("frequency of away yellows", len(master_df >> sift(X.away_yellows > 0)))
print("frequency of home reds", len(master_df >> sift(X.home_reds > 0)))
print("frequency of away reds", len(master_df >> sift(X.away_reds > 0)))

goals = master_df.apply(lambda row: row.home_goals + row.away_goals, axis=1)
temp_df = pd.DataFrame.from_records(preds, columns=labels)
temp_df = temp_df.set_index(match_data.index)
temp_df["time"] = match_data["time"]

# append to long form df and plot
preds_df = pd.concat([longform_df, temp_df], axis=0, ignore_index=True)
sifted_df = preds_df >> sift((X.match_id == match_id)
                             | (X.match_id == opponent_match_id)
                             | (X.match_id == model_no))
plot_matches(sifted_df, date, filename_out)


# read initial data
longform_df = DplyFrame(
    pd.read_csv("../../LongForm/longform.csv",
                dtype={'shorthand_search_vol': float}))

# process data
longform_df["date"] = longform_df.match_id.apply(
    lambda x: "20" + x.split("20")[-1])
longform_df['date_time'] = longform_df['date'].astype(
    str) + " " + longform_df['time'].astype(str)
longform_df['date_time'] = pd.to_datetime(longform_df['date_time'],
                                          errors="coerce",
                                          infer_datetime_format=True)

# fit several ARIMA(2, 0, 2) models
# TODO: why did Schwartz use 2, 0, 2 as model parameters?
# TODO: uncomment all below once you figure out prediction / plotting for arima 3+ (smaller models)
start_time = time.clock()
dt = d0.ply_where(X.usertype == 'Subscriber').ply_select(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
    slng=X.longitude_start * math.pi / 180,
    elng=X.longitude_end * math.pi / 180)
print('pandas_ply: ' + str(round(time.clock() - start_time, 2)) + ' seconds.')

# dplython
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by, summarize,
                      DelayFunction)

start_time = time.clock()
dt = DplyFrame(d0) >> sift(X.usertype == 'Subscriber') >> mutate(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
    slng=X.longitude_start * math.pi / 180,
    elng=X.longitude_end * math.pi / 180)
print('dplython: ' + str(round(time.clock() - start_time, 2)) + ' seconds.')

# dfply
from dfply import *
import pandas as pd

start_time = time.clock()
dt = d0 >> mask(X.usertype == 'Subscriber') >> mutate(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
def czMatchmaker(data, Q, precursor_fasta):
    data = pd.read_csv(
        "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)

    precursors = data >> \
        sift(X.tag == 'precursor') >> \
        select(X.active, X.neutral, X.estimates)

    fragments = data >> sift(X.tag != 'precursor') >> \
        group_by(X.tag, X.active, X.broken_bond) >> \
        summarize(estimates=X.estimates.sum())

    I_on_fragments = {}
    optiminfos = {}
    for break_point, data in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(data, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    cations_fragmented_I = sum(
        sum(I_on_fragments[bP][p] for p in I_on_fragments[bP])
        for bP in I_on_fragments)

    I_no_reactions = precursors >> \
        sift(X.active == Q, X.neutral == 0) >> \
        select(X.estimates)
    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = precursors >> \
        sift(X.active != Q) >> \
        rename(ETnoD=X.neutral, I=X.estimates) >> \
        mutate(PTR=Q - X.ETnoD - X.active) >> \
        select(X.ETnoD, X.PTR, X.I)

    I_prec_no_frag = prec_ETnoD_PTR_I >> \
        summarize(I=X.I.sum())
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    precursorNoReactions = precursors >> \
        sift(X.active == Q) >> \
        select(X.estimates)

    prec_ETnoD_PTR_I = prec_ETnoD_PTR_I >> \
        mutate(I_PTR=crossprod(X.PTR, X.I),
               I_ETnoD=crossprod(X.ETnoD, X.I)) >> \
        summarize(I_PTR=X.I_PTR.sum(), I_ETnoD=X.I_ETnoD.sum())

    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    I_frags = dict(
        (bP, sum(I_on_fragments[bP][pairing] for pairing in I_on_fragments[bP]))
        for bP in I_on_fragments)
    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    prob_frag = Counter(
        dict((int(bP), I_frags[bP] / I_frag_total) for bP in I_frags))
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (
        I_no_reactions + I_frag_total + I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
def main(argv):
    ytURL = None
    outdir = None
    maxFrames = 500
    try:
        opts, args = getopt.getopt(argv, "hy:o:m:",
                                   ["yturl=", "odir=", "maxframes="])
    except getopt.GetoptError:
        print 'Error: shellScript.py -y <yturl> -o <odir> -m <maxframes>'
        sys.exit(2)
    # print opts
    for opt, arg in opts:
        if opt == '-h':
            print 'help: shellScript.py -y <yturl> -o <odir> -m <maxframes>'
            sys.exit()
        elif opt in ("-y", "--yturl"):
            print("--yturl={}".format(arg))
            ytURL = arg
        elif opt in ("-o", "--odir"):
            print("--odir={}".format(arg))
            outdir = arg
        elif opt in ("-m", "--maxframes"):
            print("--maxframes={}".format(arg))
            maxFrames = int(arg)

    if ytURL is None:
        print 'bad yt: shellScript.py -y <yturl> -o <odir> -m <maxframes>'
        sys.exit()

    if outdir is None:
        print 'bad outdir: shellScript.py -y <yturl> -o <odir> -m <maxframes>'
        sys.exit()

    if not isinstance(maxFrames, (int, long)):
        print 'bad maxFrames: shellScript.py -y <yturl> -o <odir> -m <maxframes>'
        sys.exit()

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(ytURL, faceDet, faceDet2,
                                                faceDet3, faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    resultsDf = pd.DataFrame()
    frameId = 0
    for image in pframes:
        print("posting frame %d of %d" % (frameId, len(pframes)))
        resultMS = processRequest(image, headers)

        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    resFrameList = []
                    for res in result['scores'].items():
                        resFrameList.append(
                            (frameId, res[0], res[1],
                             result["faceRectangle"]['left'],
                             result["faceRectangle"]['top'],
                             result["faceRectangle"]['width'],
                             result["faceRectangle"]['height']))
                    appendDf = pd.DataFrame(resFrameList,
                                            columns=[
                                                "frameId", "emotionLabel",
                                                "conf", "faceleft", "facetop",
                                                "faceW", "faceH"
                                            ])
                    resultsDf = resultsDf.append(appendDf)
        time.sleep(2)
        frameId += 1

    dfFaces = DplyFrame(resultsDf)

    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max()) >>
        sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))
    topFaces = topFaces.drop_duplicates()
    # print(topFaces)

    i = 0
    for index, row in topFaces.iterrows():
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        confid = int(row["conf"] * 100)
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # save cropped face
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        cv2.rectangle(image, (faceL, faceT), (faceL + faceW, faceT + faceH),
                      color=(255, 0, 0), thickness=5)
        cv2.putText(image, emotion, (faceL, faceT - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

        cv2.imwrite(os.path.expanduser("%s/box%s.jpg" % (outdir, emotion)),
                    image)
        i += 1