def merge_flights_history(mdate):
    vdp = get_vdropbox()

    # Check for monthly folders and get all parquets inside
    for folder in vdp.ls(c.PATH_HISTORY):
        is_date_folder = re.search(r"\d{4}_\d{2}", folder)

        if is_date_folder and ("." not in folder) and (folder < f"{mdate:%Y_%m}"):
            log.info(f"Merging '{folder}' vflights history")
            sub_folder = f"{c.PATH_HISTORY}/{folder}"

            # Read all daily parquets
            dfs = []
            for file in vdp.ls(sub_folder):
                if file.endswith(".parquet"):
                    dfs.append(vdp.read_parquet(f"{sub_folder}/{file}"))

            # Export them as a single parquet file
            df = pd.concat(dfs)
            vdp.write_parquet(df, f"{sub_folder}.parquet")
            log.success(f"Successfully merged '{folder}' vflights history")

            # Delete the original folder
            vdp.delete(sub_folder)
def weights_selection(models):
    WEIGHTS = []
    log.success('|SELECT THE WEIGHTS FOR THE MODELS|')
    for m in models:
        log.success('select the weights for: {}'.format(m))
        WEIGHTS.append(float(input()))
    return WEIGHTS
def show(title, options, can_exit=True, main_menu=False, decorator='++'):
    """
    Display a menu

    options: dictionary in which each key is a string and each value is a
        tuple (string, function), representing the text of the option and
        the function that will be called when the related key is entered
        as input
        ex: { 'a': ('option a', print) } : display 'option a' and, when 'a'
        is pressed, call the function 'print'
    """
    log.success('{} {} {}'.format(decorator, title, decorator))
    for s, f in options.items():
        log.warning('({}) {}'.format(s, f[0]))
    if can_exit:
        log.warning('(x) Exit')
    wrong_choice = True
    while wrong_choice:
        arg = input()
        print()
        try:
            if arg == 'x' and can_exit:
                wrong_choice = False
                quit_menu(main_menu)
            else:
                funct = options[arg][1]
                wrong_choice = False
                res = funct()
                quit_menu(main_menu)
                return res
        except KeyError:
            log.error('Invalid option, retry:')
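# A minimal usage sketch for show(), illustrating the options-dict shape the
# docstring describes (key -> (label, zero-argument callable)). The handler
# names here are hypothetical and the example is kept commented out so that
# nothing runs on import:
#
# def do_train():
#     print('training...')
#
# show('MAIN MENU', {
#     't': ('Train the model', do_train),
#     'g': ('Print a greeting', lambda: print('hello')),
# }, can_exit=True, main_menu=True)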
def create_one_report(dfs, mdate):
    """Creates a report for one month"""
    data = extract_data.main(dfs, mdate, export_data=False)
    create_report.main(mdate, data=data)
    log.success(f"Report {mdate:%Y-%m} created")
def cluster_ensemble(clip, path_sparse, path_dense):
    sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(clip=clip)
    log.success('Cluster 1 (interactions count <= {}): {} playlists'.format(clip, len(sparse_pl)))
    log.success('Cluster 2 (interactions count > {}): {} playlists'.format(clip, len(dense_pl)))

    # filter target playlists from the 2 clusters
    s1 = set(sparse_pl)
    s2 = set(dense_pl)
    s_target = set(data.get_target_playlists())
    s1_target = s1 & s_target
    s2_target = s2 & s_target
    sparse_pl = pd.DataFrame({'playlist_id': list(s1_target)})
    dense_pl = pd.DataFrame({'playlist_id': list(s2_target)})

    df_sparse = pd.read_csv(path_sparse)
    df_dense = pd.read_csv(path_dense)

    cluster1 = df_sparse.merge(sparse_pl)
    cluster2 = df_dense.merge(dense_pl)

    final = pd.concat([cluster1, cluster2])
    final.to_csv(path_or_buf='submissions/cluster_ensemble' + t.strftime('_%H-%M-%S'), index=False)
def on_packet(packet: bytes):
    global should_close, chunk_index, file_bytes

    index, packet = packet[:4], packet[4:]
    index, = struct.unpack("i", index)
    if index != chunk_index:
        return

    file_bytes += packet
    log(f"Received chunk {chunk_index}")

    if chunk_index == chunks_count - 1:
        # The MD5 check is basically redundant: the correct order and
        # integrity of the individual packets are already guaranteed
        if md5 == hashlib.md5(file_bytes).digest():
            connection.status = b"md5 ok"
            with open(file_name, "wb") as file:
                file.write(file_bytes)
            log.success("Receive successful")
            should_close = True
        else:
            connection.status = b"md5 error"
            log.error("MD5 error", file=sys.stderr)
    else:
        connection.status = b"received " + struct.pack("i", chunk_index)
        chunk_index += 1
def upush_summary(update_repo_path, dest_base, update_files, update_fails,
                  temp_path, need_sync=None, debug=False):
    pusher = pushupdate.UpdatePusher(
        update_repo_path, dest_base, update_files=update_files,
        fails=update_fails, temp_path=temp_path, debug=debug)
    pusher.print_summary()
    if need_sync:
        log.success("Updated trees (might need sync):")
        helpers.print_list(map(os.path.basename, need_sync), nl_after=True)
    log.info(log.term.warn("Don't forget to push changes to update repo:"))
    log.info(update_repo_path)
    log.info("")
def option_selection_evaluation_2():
    log.success('|EVALUATE OR SAVE THE MATRIX?|')
    log.warning('\'s\' save the matrix')
    log.warning('\'e\' evaluate the matrix')
    log.warning('\'c\' create the CSV')
    selection = input()[0]
    if selection in ['s', 'e', 'c']:
        return selection
    else:
        log.info('wrong mode')
        exit(0)
def urls():
    # regex for URLs
    urls = findall(url_regex, context.default_txt())
    if len(urls) == 0:
        log.fail('No URLs')
        return
    log.success('URLs:')
    for url in urls:
        log.info(url, indent=1)
def resources():
    # regex for resources, e.g. /api/v2
    resources = findall(resource_regex, context.default_txt())
    if len(resources) == 0:
        log.fail('No Resources')
        return
    log.success('Resources:')
    for res in resources:
        log.info(res[1], indent=1)
def comments():
    # Use BeautifulSoup to extract all comments
    soup = BeautifulSoup(context.default_txt(), 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    if len(comments) == 0:
        log.fail('No comments')
        return
    log.success('Comments:')
    for c in comments:
        log.info(c, indent=1)
def save_r_hat(self, evaluation):
    r_hat = self.W_sparse
    r_hat = check_matrix(r_hat, format='csr')

    # create dir if it does not exist
    if evaluation:
        filename = 'raw_data/saved_r_hat_evaluation/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    else:
        filename = 'raw_data/saved_r_hat/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    sps.save_npz(filename, r_hat)
    log.success('R_hat successfully saved in: {}.npz'.format(filename))
def option_selection_evaluation(type):
    if type == 'SIM':
        # LET USER CHOOSE OPTIONS
        log.success('STUDY HARD | WORK HARD | F**K HARD |')
        log.warning('\'s\' to save the r_hat in saved_r_hat_evaluation')
        log.warning('\'m\' to compute the MAP@10')
        option = input()[0]
        if option == 's':
            urm_filter_tracks = data.get_urm_train_1()
            rel_path = 'saved_r_hat_evaluation/'
            log.success('SELECT A NAME FOR THE MATRIX')
            name = input()
        elif option == 'm':
            urm_filter_tracks = data.get_urm_train_1()
            rel_path = None
            name = None
        else:
            log.warning('Invalid option, exiting')
            exit(0)
        return name, urm_filter_tracks, rel_path
    elif type == 'R_HAT':
        # LET USER CHOOSE OPTIONS
        log.success('STUDY HARD | WORK HARD | F**K HARD |')
        log.warning('\'s\' to save the r_hat in saved_r_hat')
        log.warning('\'e\' to EXPORT and get a SUB')
        option = input()[0]
        if option == 's':
            log.success('SELECT A NAME FOR THE MATRIX')
            name = input()
            urm_filter_tracks = data.get_urm()
            rel_path = 'saved_r_hat/'
            EXPORT = False
        elif option == 'e':
            log.success('SELECT A NAME FOR THE SUB')
            name = input()
            urm_filter_tracks = data.get_urm()
            rel_path = None
            EXPORT = True
        else:
            log.warning('Invalid option, exiting')
            exit(0)
        return name, urm_filter_tracks, rel_path, EXPORT
def user_agents():
    # Get standard length of response
    length = context.default_len()
    log.info(f'Standard Response Length: {length}')

    # Iterate through all User-Agents, comparing response length to the original
    # If different, print that out
    for agent in user_agents_list:
        r = context.session.get(context.url, headers={'User-Agent': agent})
        if len(r.text) != length:
            log.info(agent, indent=1)
            log.success(f'Response size is different: {len(r.text)}', indent=2)
        else:
            log.fail(agent, indent=1)
            log.fail('Default length', indent=2)
def test_post():
    r = context.session.post(context.url)
    if r.status_code == 501:
        log.fail('POST request throws Code 501 (Unsupported Method)')
    elif r.status_code == 200:
        log.success('POST accepted!')
        length = len(r.text)
        if context.default_len() == length:
            log.fail('GET and POST responses are of the same length', indent=1)
        else:
            log.success('GET and POST responses are of different lengths!', indent=1)
    else:
        log.info(f'POST returns Code {r.status_code} - could be something there')
def get_jwts():
    response = get_full_response(context.default_req)
    jwts = findall(jwt_regex, response)
    if len(jwts) == 0:
        log.fail('No JWTs')
        return
    log.success('JWTs found:')
    for jwt in jwts:
        log.success(jwt, indent=1)
        # The last section of a JWT is the signature; the others are
        # base64url-encoded without padding, so restore the padding and use
        # the URL-safe decoder (assumes `from base64 import urlsafe_b64decode`
        # in the imports)
        for section in jwt.split('.')[:-1]:
            log.info(urlsafe_b64decode(section + '=' * (-len(section) % 4)), indent=2)
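# A hedged worked example of the decoding above: JWT segments are base64url
# strings without padding, so the padding must be restored before decoding.
# The header below is the canonical {"alg":"HS256","typ":"JWT"} example:
#
# from base64 import urlsafe_b64decode
# header = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9'
# urlsafe_b64decode(header + '=' * (-len(header) % 4))  # b'{"alg":"HS256","typ":"JWT"}'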
def redirects():
    # Grab the request's history (each entry is an intermediate response)
    history = context.default_req.history
    if len(history) == 0:
        log.fail('No redirects')
        return
    log.success('Redirects:')
    for resp in history:
        red = Fore.RED + str(resp.status_code) + Fore.RESET
        red = red.ljust(20, ' ')
        red += resp.url
        log.info(red, indent=1)
        redirect(resp.url)
def fit(self, clip=7):
    sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(clip=clip)
    log.success('Cluster 1 (interactions count <= {}): {} playlists'.format(clip, len(sparse_pl)))
    log.success('Cluster 2 (interactions count > {}): {} playlists'.format(clip, len(dense_pl)))

    # filter target playlists from the 2 clusters
    s1 = set(sparse_pl)
    s2 = set(dense_pl)
    s_target = set(data.get_target_playlists())
    s1_target = s1 & s_target
    s2_target = s2 & s_target

    self.sparse_pl = list(s1_target)
    self.dense_pl = list(s2_target)
def normalization_mode_selection():
    log.success('|SELECT THE NORMALIZATION MODE|')
    log.warning('\'1\' MAX MATRIX')
    log.warning('\'2\' MAX ROW')
    log.warning('\'3\' L2 NORM')
    log.warning('\'4\' NONE')
    selection = input()[0]
    if selection == '1':
        NORMALIZATION_MODE = 'MAX_MATRIX'
    elif selection == '2':
        NORMALIZATION_MODE = 'MAX_ROW'
    elif selection == '3':
        NORMALIZATION_MODE = 'L2'
    elif selection == '4':
        NORMALIZATION_MODE = 'NONE'
    else:
        log.error('wrong mode')
        exit(0)
    return NORMALIZATION_MODE
def print_summary(good, fails, good_s, fail_s):
    n_good = len(good)
    n_fails = len(fails)
    n_all = n_good + n_fails

    if good:
        log.success("\n%d updates %s:" % (n_good, good_s))
        fmt = '{t.bold}{upf}:{t.normal}\n{up}'
        if len(good[0]) == 2:
            l = map(lambda x: fmt.format(t=log.term, upf=x[0], up=x[1]), good)
        else:
            l = []
            for upf, up, builds in good:
                bstr = '\n'.join(map(str, builds))
                l.append(fmt.format(t=log.term, upf=upf, up=bstr))
        helpers.print_list(l)
    if fails:
        log.error("\n%d updates %s:" % (n_fails, fail_s))
        fmt = "{t.warn}{upf}:{t.normal} {err}"
        l = map(lambda x: fmt.format(t=log.term, upf=x[0], err=str(x[1])), fails)
        helpers.print_list(l)
def extract_gcal_confusions(exclude_other=True, merge_study=True, min_alpha=0.1):
    vdp = get_vdropbox()
    dfg = vdp.read_parquet(PATH_GCAL_DATA)

    df_aux = clear_potential_confusions(dfg, exclude_other, merge_study)
    df_matrix = get_confusion_matrix(df_aux, col_text="summary", col_category="calendar")
    df_confusions = filter_confusions(df_matrix, min_alpha)

    num_confusions = df_confusions.shape[0]
    if num_confusions > 0:
        log.warning(f"There are {num_confusions} confusions in google calendar. Exporting them")
        vdp.write_excel(df_confusions, PATH_CONFUSIONS)
    else:
        log.success("There are no confusions in google calendar")
def redirect(url):
    if '?' not in url:
        log.fail('Redirect contains no GET parameters', indent=2)
        return

    # URL-decode the query string and split it into parameters
    params = unquote(url.split('?')[1]).split('&')
    for p in params:
        # split only on the first '=' so values containing '=' stay intact
        name, value = p.split('=', 1)
        if re.match(url_regex, value):
            log.success('Redirect appears to contain a URL - possible RFI or use of another URL scheme?', indent=2)
            continue
        if re.match(resource_regex, value):
            log.success('Redirect appears to contain a resource - possible LFI?', indent=2)
            continue
        if re.match(file_regex, value):
            log.success('Redirect appears to contain a filename - possible LFI?', indent=2)
            continue
for chunk_index in range(chunks_count):
    log(f"Sending chunk {chunk_index + 1}/{chunks_count}")
    status = connection.remote_status
    while status != b"received " + struct.pack("i", chunk_index) and \
            status != b"md5 ok" and status != b"md5 error":
        connection.status = b"packet " + struct.pack("i", chunk_index)
        chunk = struct.pack("i", chunk_index) + \
            file_bytes[chunk_index * chunk_size:(chunk_index + 1) * chunk_size]
        connection.send_message(b"packet", chunk)
        status = connection.remote_status

# check md5
log("Checking MD5")
while True:
    if status == b"md5 ok":
        log.success("Send successful")
        exit(0)
    if status == b"md5 error":
        log.error("MD5 mismatch")
        break
    # keep polling the receiver until it reports an MD5 result; without this
    # refresh the loop would spin forever on a stale status
    status = connection.remote_status

# Reset the receiver
while not connection.remote_status == b"header received":
    connection.send_message(b"reset", b"")
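# A small sketch of the framing used by sender and receiver above: every
# packet carries a 4-byte chunk index produced by struct.pack("i", ...).
# Note that "i" without a byte-order prefix is native-endian, so this assumes
# both ends run on machines with the same byte order:
#
# header = struct.pack("i", 3)             # 4-byte chunk index
# packet = header + b"payload bytes"
# index, = struct.unpack("i", packet[:4])  # receiver recovers index == 3
# payload = packet[4:]                     # the remaining chunk data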
def export_csv_wizard(recommendations):
    log.info('Choose a name for the CSV:')
    name = input()
    exportcsv(recommendations, name=name)
    log.success('CSV saved!')
def wizard_hybrid():
    SIM_MATRIX = ['saved_sim_matrix', 'saved_sim_matrix_evaluation']
    R_HAT = ['saved_r_hat', 'saved_r_hat_evaluation']
    SAVE = ['saved_sim_matrix', 'saved_r_hat']
    EVALUATE = ['saved_sim_matrix_evaluation', 'saved_r_hat_evaluation']

    start = time.time()
    matrices_array, folder, models = hb.create_matrices_array()
    print('matrices loaded in {:.2f} s'.format(time.time() - start))
    log.success('You have loaded: {}'.format(models))

    NORMALIZATION_MODE = normalization_mode_selection()

    if folder in SAVE:
        WEIGHTS = weights_selection(models)
        if folder in SIM_MATRIX:
            name, urm_filter_tracks, rel_path = option_selection_save('SIM')
            hybrid_rec = HybridSimilarity(matrices_array,
                                          normalization_mode=NORMALIZATION_MODE,
                                          urm_filter_tracks=urm_filter_tracks)
            sps.save_npz('raw_data/' + rel_path + name,
                         hybrid_rec.get_r_hat(weights_array=WEIGHTS))
        if folder in R_HAT:
            name, urm_filter_tracks, rel_path, EXPORT = option_selection_save('R_HAT')
            hybrid_rec = HybridRHat(matrices_array,
                                    normalization_mode=NORMALIZATION_MODE,
                                    urm_filter_tracks=urm_filter_tracks)
            if EXPORT:
                N = ask_number_recommendations()
                recommendations = hybrid_rec.recommend_batch(
                    weights_array=WEIGHTS,
                    target_userids=data.get_target_playlists(),
                    N=N)
                exportcsv(recommendations, path='submission', name=name)
            else:
                sps.save_npz('raw_data/' + rel_path + name,
                             hybrid_rec.get_r_hat(weights_array=WEIGHTS))
    elif folder in EVALUATE:
        log.success('|WHAT DO YOU WANT TO DO?|')
        log.warning('\'1\' BAYESIAN SEARCH VALIDATION')
        log.warning('\'2\' HAND-CRAFTED WEIGHTS')
        mode = input()[0]

        # BAYESIAN SEARCH
        if mode == '1':
            log.success('|SELECT A NUMBER OF |||ITERATIONS||| FOR THE ALGORITHM|')
            iterations = float(input())
            urm_filter_tracks = data.get_urm_train_1()
            if folder in SIM_MATRIX:
                hybrid_rec = HybridSimilarity(matrices_array,
                                              normalization_mode=NORMALIZATION_MODE,
                                              urm_filter_tracks=urm_filter_tracks)
            if folder in R_HAT:
                hybrid_rec = HybridRHat(matrices_array,
                                        normalization_mode=NORMALIZATION_MODE,
                                        urm_filter_tracks=urm_filter_tracks)
            hybrid_rec.validate(iterations=iterations,
                                urm_test=data.get_urm_test_1(),
                                userids=data.get_target_playlists())
        # MANUAL WEIGHTS
        elif mode == '2':
            WEIGHTS = weights_selection(models)
            urm_filter_tracks = data.get_urm_train_1()
            chose = option_selection_evaluation_2()  # save, evaluate or csv
            if chose == 's':
                log.success('|CHOOSE A NAME FOR THE MATRIX...|')
                name = input()
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(matrices_array,
                                                  normalization_mode=NORMALIZATION_MODE,
                                                  urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(matrices_array,
                                            normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                sps.save_npz('raw_data/saved_r_hat_evaluation/' + name,
                             hybrid_rec.get_r_hat(weights_array=WEIGHTS))
                sym_rec = symmetric_recommender_creator(models, type,
                                                        NORMALIZATION_MODE,
                                                        urm_filter_tracks=data.get_urm_train_2())
                sps.save_npz('raw_data/saved_r_hat_evaluation_2/' + name,
                             sym_rec.get_r_hat(weights_array=WEIGHTS))
            elif chose == 'e':
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(matrices_array,
                                                  normalization_mode=NORMALIZATION_MODE,
                                                  urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(matrices_array,
                                            normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                N = ask_number_recommendations()
                print('Recommending...')
                recs = hybrid_rec.recommend_batch(weights_array=WEIGHTS,
                                                  target_userids=data.get_target_playlists(),
                                                  N=N)
                hybrid_rec.evaluate(recommendations=recs,
                                    test_urm=data.get_urm_test_1())

                # export the recommendations
                log.success('Do you want to save the CSV with these recommendations? (y/n)')
                if input()[0] == 'y':
                    export_csv_wizard(recs)

                sym_rec = symmetric_recommender_creator(models, type,
                                                        NORMALIZATION_MODE,
                                                        urm_filter_tracks=data.get_urm_train_2())
                recs2 = sym_rec.recommend_batch(weights_array=WEIGHTS,
                                                target_userids=data.get_target_playlists())
                sym_rec.evaluate(recommendations=recs2,
                                 test_urm=data.get_urm_test_2())
            elif chose == 'c':
                if folder in R_HAT:
                    hybrid_rec = HybridRHat(matrices_array,
                                            normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                    N = ask_number_recommendations()
                    print('Recommending...')
                    recs = hybrid_rec.recommend_batch(weights_array=WEIGHTS,
                                                      target_userids=data.get_target_playlists(),
                                                      N=N)
                    export_csv_wizard(recs)
                else:
                    log.error('not implemented yet')
    else:
        log.error('WRONG FOLDER')
def ask_number_recommendations():
    log.success('Select the number of recommendations (default: 10)')
    # fall back to the advertised default when the user just presses Enter
    answer = input()
    N = int(answer) if answer.strip() else 10
    return N
                   shrink=shrink, threshold=threshold, implicit=implicit,
                   alpha=alpha, beta=beta, l=l, c=c, export=False)


"""
If this file is executed, test the SPLUS distance metric
"""
if __name__ == '__main__':
    print()
    log.success('++ What do you want to do? ++')
    log.warning('(t) Test the model with some default params')
    log.warning('(r) Save the R^')
    log.warning('(s) Save the similarity matrix')
    # log.warning('(v) Validate the model')
    log.warning('(x) Exit')
    arg = input()[0]
    print()

    model = CFUserBased()
    if arg == 't':
        # recs = model.recommend_batch(userids=data.get_target_playlists(), urm=data.get_urm_train())
        # model.evaluate(recommendations=recs, test_urm=data.get_urm_test())
        model.test(distance=CFUserBased.SIM_SPLUS, k=600, alpha=0.25,
                        action='store_true')
    args = parser.parse_args()

    # Set the Context
    context.url = fix_url(args.url)
    context.file = fix_filepath(args.output)
    context.session = Session()
    context.default_req = grab(text=False)
    if args.agent:
        # use the User-Agent supplied via the agent argument
        context.session.headers.update({'User-Agent': args.agent})
    if args.cookies:
        context.session.cookies.update(cookie_string_to_dict(args.cookies))
    if args.hide:
        context.hide_fail = True
    if args.username and args.password:
        context.session.auth = (args.username, args.password)

    # Log it all
    log.success(f'Analysing {context.url}')
    log.success(f'Saving output to {context.file}')

    # Execute the different modules
    recon.execute(not args.nagent)