def plot(experiment, output_dir="evaluation/single_scenario", input_dir="results/"): # setup directories for this plot input_dir = os.path.join(experiment, input_dir) output_dir = os.path.join(experiment, output_dir) ensure_dir(output_dir, rm=True) print input_dir print output_dir # load data ed = data.ExperimentData(path=input_dir) ed.normalize_times() # go over all scenarios and call plot methods for s in ed.scenarios.itervalues(): single_scenario_plot( s, output_dir, yfield=["pps_local", "pps_global"], yname="packets per second", xlim=120, label_rename_func=label_rename_generic_performance) single_scenario_plot( s, output_dir, yfield=["pcount_local", "pcount_global"], yname="# pattern matches", xlim=120, label_rename_func=label_rename_matchexample) single_scenario_plot( s, output_dir, yfield=["matchcount_local", "matchcount_global"], yname="# pattern matches", xlim=120, label_rename_func=label_rename_matchexample) single_scenario_plot( s, output_dir, yfield=["t_request_local", "t_request_global"], yname="state request delay [s]", xlim=120)
def retranslateUi(self, MainWindow):
    _translate = QtCore.QCoreApplication.translate
    MainWindow.setWindowTitle(_translate("MainWindow", "Add Teacher"))
    self.label.setText(_translate("MainWindow", "Title:"))
    self.label_2.setText(_translate("MainWindow", "First Name:"))
    self.label_3.setText(_translate("MainWindow", "Last Name:"))
    self.label_4.setText(_translate("MainWindow", "Designation:"))
    self.label_5.setText(_translate("MainWindow", "Gender:"))
    self.titleDrop.setItemText(0, _translate("MainWindow", "Dr."))
    self.titleDrop.setItemText(1, _translate("MainWindow", "Mr."))
    self.titleDrop.setItemText(2, _translate("MainWindow", "Mrs."))
    self.titleDrop.setItemText(3, _translate("MainWindow", "Ms."))
    self.firstName.setPlaceholderText(_translate("MainWindow", "First Name"))
    self.lastName.setPlaceholderText(_translate("MainWindow", "Last Name"))
    self.designationDrop.setItemText(0, _translate("MainWindow", "Asst. Professor"))
    self.designationDrop.setItemText(1, _translate("MainWindow", "Professor"))
    self.designationDrop.setItemText(2, _translate("MainWindow", "Assoc. Professor"))
    self.mGender.setText(_translate("MainWindow", "M"))
    self.fGender.setText(_translate("MainWindow", "F"))
    self.resetBtn.setText(_translate("MainWindow", "Reset"))
    self.cancelBtn.setText(_translate("MainWindow", "Cancel"))
    self.addBtn.setText(_translate("MainWindow", "Add"))
    self.label_6.setText(_translate("MainWindow", "Teacher Id:"))
    # derive the next teacher id from the number of existing training folders,
    # zero-padding single-digit ids (RJITCSEIT01 ... RJITCSEIT09, then RJITCSEIT10)
    helper.ensure_dir('Training/')
    s = os.listdir('Training')
    if len(s) < 9:
        id = 'RJITCSEIT0' + str(len(s) + 1)
    else:
        id = 'RJITCSEIT' + str(len(s) + 1)
    self.label_7.setText(_translate("MainWindow", id))
def create_dataset(teacher_id):
    faceDetect = cv2.CascadeClassifier(
        'Cascade/haarcascade_frontalface_default.xml')
    eye_cascade = cv2.CascadeClassifier('Cascade/haarcascade_eye.xml')
    cam = cv2.VideoCapture(1)
    sample_count = 0
    helper.ensure_dir('Training/')
    directory = 'Training/' + teacher_id + '/'
    helper.ensure_dir(directory)
    s = len(os.listdir(directory))  # continue numbering after existing samples
    while True:
        ret, img = cam.read()
        if not ret:  # skip frames the camera failed to deliver
            continue
        # OpenCV captures frames in BGR order, so convert from BGR (not RGB)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = faceDetect.detectMultiScale(gray, 1.3, 5)
        for x, y, w, h in faces:
            gray_face = cv2.resize((gray[y:y + h, x:x + w]), (110, 110))
            eyes = eye_cascade.detectMultiScale(gray_face)
            # one sample is written per detected eye, i.e. only faces with visible eyes are kept
            for ex, ey, ew, eh in eyes:
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                sample_count += 1
                cv2.imwrite(
                    directory + teacher_id + '_' + str(sample_count + s) + '.jpg',
                    gray[y:y + h, x:x + w])
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.waitKey(100)
        cv2.imshow('My Face', img)
        cv2.waitKey(1)
        if sample_count >= 20:
            break
    cam.release()
    cv2.destroyAllWindows()
def train():
    # LBPH parameters: radius=2, neighbors=2, grid_x=7, grid_y=7, threshold=15
    recognizer = cv2.createLBPHFaceRecognizer(2, 2, 7, 7, 15)
    path = 'Training'
    userIDs, faces = helper.get_faces_with_username(path)
    recognizer.train(faces, userIDs)
    directory = 'Recognizer'
    helper.ensure_dir(directory)
    recognizer.save('Recognizer/trainingData.yaml')
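# cv2.createLBPHFaceRecognizer is the OpenCV 2.4 API; on OpenCV 3+ the recognizer
# moved to the contrib "face" module (pip install opencv-contrib-python), and
# persistence uses write()/read() instead of save()/load(). A version-tolerant
# factory could look like this sketch (same positional parameters as above):
import cv2


def create_lbph_recognizer(radius=2, neighbors=2, grid_x=7, grid_y=7, threshold=15):
    if hasattr(cv2, 'createLBPHFaceRecognizer'):  # OpenCV 2.4
        return cv2.createLBPHFaceRecognizer(radius, neighbors, grid_x, grid_y, threshold)
    # OpenCV 3+/4 (opencv-contrib)
    return cv2.face.LBPHFaceRecognizer_create(radius, neighbors, grid_x, grid_y, threshold)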
def get_lyrics_by_tracks(artist_tracks_id_object):
    artist_tracks_object = {}
    counter = 1
    # this will delete **** This Lyrics is NOT... *** at the end of the string
    musixmatch_regex = re.compile(r'\*.*\*\s*$')
    is_limit_reached = False

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_JSON)

    if VERBOSE:
        helper.log_highlight('Fetching lyrics of tracks')

    for artist_id, tracks in artist_tracks_id_object.items():
        artist_tracks = {}
        artist_tracks[artist_id] = []

        if VERBOSE:
            print 'Fetching tracks of artist ' + str(artist_id) + ' [' + str(counter) + ' of ' + str(len(artist_tracks_id_object)) + ']'

        if os.path.exists(OUTPUT_DIR_MUSIXMATCH_JSON + artist_id + '.json') and SKIP_EXISTING_LYRICS:
            if VERBOSE:
                print " Tracks of artist already fetched: " + OUTPUT_DIR_MUSIXMATCH_JSON + str(artist_id) + '.json'
            counter += 1
            continue

        for index, track_id in enumerate(tracks, start=1):
            response = fetch_lyrics_by_track_id(track_id)
            header = response['message']['header']
            status_code = header['status_code']

            if VERBOSE:
                print ' Fetching lyrics of track ' + str(track_id) + ' [' + str(index) + ' of ' + str(len(tracks)) + ']'

            # use == for value comparison; "is" only checks object identity
            if status_code == 200:
                lyrics = response['message']['body']['lyrics']['lyrics_body']
                lyrics_replaced = musixmatch_regex.sub('', lyrics)

                artist_tracks[artist_id].append(lyrics_replaced)

                try:
                    artist_tracks_object[artist_id] += lyrics_replaced
                except KeyError:
                    artist_tracks_object[artist_id] = lyrics_replaced

            if status_code == 402:
                is_limit_reached = True

        counter += 1

        if not is_limit_reached:
            if VERBOSE:
                print '\n Save JSON with lyrics\n'
            save_json(artist_tracks, OUTPUT_DIR_MUSIXMATCH_JSON + artist_id + '.json')

    return artist_tracks_object
def add(self):
    id = self.label_7.text()
    fname = self.firstName.text()
    lname = self.lastName.text()
    title = self.titleDrop.currentText()
    design = self.designationDrop.currentText()
    gender = None
    if self.fGender.isChecked():
        gender = 'F'
    elif self.mGender.isChecked():
        gender = 'M'
    db.add_teacher(id, title, fname, lname, gender, design)
    helper.ensure_dir('Training/' + id + '/')
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    wv_file = args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)  # load sentence token with entity being padding?
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
def build_env(args, env_name=EXP_NAME):
    env = gym.make("CartPole-v0")
    env = Monitor(env,
                  helper.ensure_dir(path.join(args.monitor_path, env_name)),
                  allow_early_resets=True)
    env = Reset(env)
    return env
def write_files(fdata, data_dir):
    """
    Expects a list of tuples, containing (case_id, case_data)
    Writes the {case_data} into {case_id}.txt
    All the files will be written into {data_dir} directory
    """
    helper.ensure_dir(data_dir)
    for case_id, case_data in fdata:
        if (case_id is None) or (case_data is None):
            continue
        with open(os.path.join(data_dir, case_id + '.txt'), 'w') as f:
            f.write(case_data.encode('utf8'))
def add(self, MainWindow):
    id = self.label_7.text()
    fname = self.firstName.text()
    lname = self.lastName.text()
    title = self.titleDrop.currentText()
    design = self.designationDrop.currentText()
    gender = None
    if self.fGender.isChecked():
        gender = 'F'
    elif self.mGender.isChecked():
        gender = 'M'
    result, comment = db.add_teacher(id, title, fname, lname, gender, design)
    if result:
        helper.ensure_dir('Training/' + id + '/')
        MainWindow.close()
        buttonReply = QtWidgets.QMessageBox.question(
            MainWindow, 'Teacher Added', comment,
            QtWidgets.QMessageBox.Ok, QtWidgets.QMessageBox.Ok)
def get_html_by_tracks(artist_tracks_id_object):
    artist_tracks_object = {}
    counter = 1
    # this will delete **** This Lyrics is NOT... *** at the end of the string
    musixmatch_regex = re.compile(r'\*.*\*\s*$')

    if VERBOSE:
        helper.log_highlight('Fetching lyrics HTML of tracks')

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_HTML)

    for artist_id, tracks in artist_tracks_id_object.items():
        if VERBOSE:
            print 'Fetching tracks of artist ' + str(artist_id) + ' [' + str(counter) + ' of ' + str(len(artist_tracks_id_object)) + ']'

        for index, track_id in enumerate(tracks, start=1):
            response = fetch_html_lyrics_by_track_id(track_id)
            header = response['message']['header']
            status_code = header['status_code']
            has_lyrics = response['message']['body']['track']['has_lyrics']

            if VERBOSE:
                print ' Fetching lyrics of track ' + str(track_id) + ' [' + str(index) + ' of ' + str(len(tracks)) + ']'

            # use == for value comparison; "is" only checks object identity
            if status_code == 200 and int(has_lyrics) > 0:
                track_url = response['message']['body']['track']['track_share_url']
                filename = OUTPUT_DIR_MUSIXMATCH_HTML + str(artist_id) + '_' + str(track_id) + '.html'

                try:
                    if VERBOSE:
                        print ' Storing and retrieving data from ' + track_url
                    content = urllib.urlopen(track_url).read()
                    with open(filename, 'w') as f:
                        f.write(content)
                except IOError:
                    # skip the track in case some IO / socket error occurred
                    if VERBOSE:
                        print ' Cannot retrieve data from ' + track_url
def generate_data(f, reviews):
    count = 1
    dir_name = helper.get_name_without_extension(os.path.basename(f))
    helper.ensure_dir(dir_name)
    for review in reviews:
        # Without labels
        target_path = os.path.join(dir_name, 'no_label_' + str(count) + '.txt')
        helper.save_list_to_file(target_path, [x[2] for x in review.samples])
        # With labels
        target_path = os.path.join(dir_name, str(count) + '.txt')
        data = [
            ','.join(x[0]) + ' | ' + ','.join(x[1]) + ' | ' + x[2]
            for x in review.samples
        ]
        helper.save_list_to_file(target_path, data)
        count += 1
def save_lfmb_c1ku_combined_file(c1ku_file, lfmb1_file, output_file, header_string):
    helper.log_highlight('save ' + output_file)

    LFM1b_file = mf.read_txt(lfmb1_file)
    sorted_string = header_string + "\n"

    with open(c1ku_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')  # create reader
        headers = reader.next()  # skip header

        for index, row in enumerate(reader, start=1):
            the_id = row[0]
            sorted_string += LFM1b_file[the_id] + "\n"

    helper.ensure_dir(OUTPUT_DIR)

    with open(output_file, 'w') as text_file:
        text_file.write(sorted_string)
def fix_station_data(stations, ref_station_data):
    for station_idx, station_id in enumerate(stations.index):
        station_data = read_station_data(station_id)
        print(station_idx, station_id)
        # skip stations that have already been fixed
        if path.isfile(
                path.join(BASE_DIR, "pems", "fix", "{}.csv".format(station_id))):
            continue

        # flow columns read as strings contain thousands separators
        if station_data["Flow (Veh/5 Minutes)"].dtype == 'object':
            station_data["Flow (Veh/5 Minutes)"] = station_data[
                "Flow (Veh/5 Minutes)"].map(
                    lambda x: float(x.replace(",", "")))

        if "Speed (mph)" not in station_data.columns:
            station_data["Speed (mph)"] = pd.Series(np.zeros(
                station_data.index.size), index=station_data.index)

        station_data = station_data[[
            "Flow (Veh/5 Minutes)", "Speed (mph)", "# Lane Points", "% Observed"
        ]]

        # align stations with missing rows to the reference time index
        if station_data.shape[0] != ref_station_data.shape[0]:
            station_data = station_data.reindex_like(ref_station_data,
                                                     method='ffill')
            if pd.isnull(station_data).any().any():
                print(station_id)
                raise Exception("bad stations")

        station_data = resample_dataframe(station_data)
        station_data = station_data.apply(pd.to_numeric)
        station_data.to_csv(ensure_dir(
            path.join(BASE_DIR, "pems", "fix", "{}.csv".format(station_id))),
                            date_format="%Y-%m-%d %H:%M:%S")
def generate_wikipedia_AAM():
    ps = PorterStemmer()
    html_contents = {}
    # dictionary to hold document frequency of each term in corpus
    terms_df = {}
    # list of all terms
    term_list = []

    # read artist names from file,
    # using functions and parameters defined in o1_Wikipedia_Fetcher.py
    artists = Wikipedia_Fetcher.read_file(Wikipedia_Fetcher.ARTISTS_FILE)

    helper.ensure_dir(WIKIPEDIA_OUTPUT)

    # for all artists
    for i in range(0, len(artists)):
        # construct file name to fetched HTML page for current artist, depending on parameter settings in Wikipedia_Fetcher.py
        if Wikipedia_Fetcher.USE_INDEX_IN_OUTPUT_FILE:
            html_fn = Wikipedia_Fetcher.OUTPUT_DIRECTORY + "/" + str(i) + ".html"  # target file name
        else:
            html_fn = Wikipedia_Fetcher.OUTPUT_DIRECTORY + "/" + urllib.quote(artists[i]) + ".html"  # target file name

        # Load fetched HTML content if target file exists
        if os.path.exists(html_fn):
            # Read entire file
            html_content = open(html_fn, 'r').read()

            # Next we perform some text processing:
            # Strip content off HTML tags
            content_tags_removed = remove_html_markup(html_content)
            # remove numbers
            content_no_numbers = re.sub(r'[0-9]+', ' ', content_tags_removed)
            # Perform case-folding, i.e., convert to lower case
            content_casefolded = content_no_numbers.lower()
            # remove Wikipedia-specific boilerplate words
            content_no_specific_words = re.sub(r'[\w]*wiki|article|pedia|privacy|policy[\w]*', ' ', content_casefolded)
            # Tokenize stripped content at white space characters
            tokens = content_no_specific_words.split()
            # Remove all tokens containing non-alphanumeric characters; using a simple lambda function (i.e., anonymous function, can be used as parameter to other function)
            tokens_filtered = filter(lambda t: t.isalnum(), tokens)
            # Remove words in the stop word list
            tokens_filtered_stopped = filter(lambda t: t not in STOP_WORDS, tokens_filtered)
            # stem the remaining words
            tokens_stemmed = []
            for w in tokens_filtered_stopped:
                tokens_stemmed.append(ps.stem(w))

            # Store remaining (stemmed) tokens of current artist in dictionary for further processing
            if len(tokens_stemmed) > 0:
                html_contents[i] = tokens_stemmed  # was tokens_filtered_stopped, which silently discarded the stemming step
            print "File " + html_fn + " --- total tokens: " + str(len(tokens)) + "; after filtering and stopping: " + str(len(tokens_filtered_stopped))
        else:
            # Inform user if target file does not exist
            print "Target file " + html_fn + " does not exist!"
            html_contents[i] = ''

    # Start computing term weights, in particular, document frequencies and term frequencies.

    # Iterate over all (key, value) tuples from dictionary just created to determine document frequency (DF) of all terms
    for aid, terms in html_contents.items():
        # convert list of terms to set of terms ("uniquify" words for each artist/document)
        for t in set(terms):  # and iterate over all terms in this set
            # update number of artists/documents in which current term t occurs
            if t not in terms_df:
                terms_df[t] = 1
            else:
                terms_df[t] += 1

    # remove all values which are one
    #terms_df = dict((k, v) for k, v in terms_df.iteritems() if v != 1)

    # Compute number of artists/documents and terms
    no_artists = len(html_contents.items())
    no_terms = len(terms_df)
    print "Number of artists in corpus: " + str(no_artists)
    print "Number of terms in corpus: " + str(no_terms)

    # You may want (or need) to perform some kind of dimensionality reduction here, e.g., filtering all terms
    # with a very small document frequency.
    # ...
    # Dictionary is unordered, so we store all terms in a list to fix their order, before computing the TF-IDF matrix
    for t in terms_df.keys():
        term_list.append(t)

    # Create IDF vector using logarithmic IDF formulation
    idf = np.zeros(no_terms, dtype=np.float32)
    for i in range(0, no_terms):
        # use float division: under Python 2, no_artists / terms_df[...] would floor
        idf[i] = np.log(float(no_artists) / terms_df[term_list[i]])
        # print term_list[i] + ": " + str(idf[i])

    # Initialize matrix to hold term frequencies (and eventually TF-IDF weights) for all artists for which we fetched HTML content
    tfidf = np.zeros(shape=(no_artists, no_terms), dtype=np.float32)

    # Iterate over all (artist, terms) tuples to determine all term frequencies TF_{artist,term}
    terms_index_lookup = {}  # lookup table for indices (for higher efficiency)
    for a_idx, terms in html_contents.items():
        print "Computing term weights for artist " + str(a_idx)
        # You may want (or need) to make the following more efficient.
        for t in terms:  # iterate over all terms of current artist
            if t in terms_index_lookup:
                t_idx = terms_index_lookup[t]
            else:
                t_idx = term_list.index(t)  # get index of term t in (ordered) list of terms
                terms_index_lookup[t] = t_idx
            tfidf[a_idx, t_idx] += 1  # increase TF value for every encounter of a term t within a document of the current artist

    # Replace TF values in tfidf by TF-IDF values:
    # copy and reshape IDF vector and point-wise multiply it with the TF values
    tfidf = np.log1p(tfidf) * np.tile(idf, no_artists).reshape(no_artists, no_terms)

    # Storing TF-IDF weights and term list
    print "Saving TF-IDF matrix to " + WIKIPEDIA_TFIDFS + "."
    np.savetxt(WIKIPEDIA_TFIDFS, tfidf, fmt='%0.6f', delimiter='\t', newline='\n')

    print "Saving term list to " + WIKIPEDIA_TERMS + "."
    with open(WIKIPEDIA_TERMS, 'w') as f:
        f.write("terms\n")
        for t in term_list:
            f.write(t + "\n")

    # Compute cosine similarities and store them
    # Initialize similarity matrix
    sims = np.zeros(shape=(no_artists, no_artists), dtype=np.float32)
    # Compute pairwise similarities between artists
    for i in range(0, no_artists):
        print "Computing similarities for artist " + str(i)
        for j in range(i, no_artists):
            cossim = 1.0 - scidist.cosine(tfidf[i], tfidf[j])
            # If either TF-IDF vector (of i or j) only contains zeros, cosine similarity is not defined (NaN: not a number).
            # In this case, similarity between i and j is set to zero (or left at zero, in our case).
            if not np.isnan(cossim):
                sims[i, j] = cossim
                sims[j, i] = cossim

    print "Saving cosine similarities to " + WIKIPEDIA_AAM + "."
    np.savetxt(WIKIPEDIA_AAM, sims, fmt='%0.6f', delimiter='\t', newline='\n')
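# The TF-IDF weighting and cosine-similarity loops above are hand-rolled. If
# scikit-learn is available, the same artist-artist matrix can be sketched in a
# few lines. Note this is a compact alternative, not the script's actual method:
# sklearn's smoothed IDF formula differs slightly from the log formulation above,
# and html_contents is assumed to map artist index -> token list as built above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [' '.join(html_contents[i]) for i in sorted(html_contents.keys())]
tfidf_matrix = TfidfVectorizer().fit_transform(docs)
sims_alt = cosine_similarity(tfidf_matrix)  # all-zero rows yield 0, not NaN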
]

DATE_START = DT.datetime.strptime(
    "{} 08:00:00".format(DT.datetime.now().strftime('%Y-%m-%d')),
    '%Y-%m-%d %H:%M:%S')
DATE_END = DATE_START - DT.timedelta(days=7)
LIMIT = 2000


def get_data(container, time):
    df, time_to = fetch_data_by_exchange(FSYM, TSYM, market, time,
                                         time_frame="minute")
    container.append(df)
    return df.shape[0], time_to


for market in MARKETS:
    print("{}".format(market), end="")
    dfs = []
    # page backwards through the minute data until a short (last) page is returned
    num_row, time = get_data(dfs, T.mktime(DATE_START.timetuple()))
    while num_row > LIMIT:
        num_row, time = get_data(dfs, time)
    data = pd.concat(dfs).sort_index().drop_duplicates("time")
    data.to_csv(ensure_dir(path_join("./data", "{}_minute.csv".format(market))))
    print("\tdownloaded")
no_users = UAM.shape[0]
no_artists = UAM.shape[1]

# np.tile: take sum_pc_user no_artists times (results in an array of length no_artists*no_users)
# np.reshape: reshape the array to a matrix
# np.transpose: transpose the reshaped matrix
artist_sum_copy = np.tile(sum_pc_user, no_artists).reshape(no_artists, no_users).transpose()

# Perform sum-to-1 normalization
UAM = UAM / artist_sum_copy

# Inform user
print "UAM created. Users: " + str(UAM.shape[0]) + ", Artists: " + str(UAM.shape[1])

helper.ensure_dir(OUTPUT_DIR)

# Write everything to text file (artist names, user names, UAM)
# Write artists to text file
with open(ARTISTS_FILE, 'w') as outfile:
    outfile.write('artist\n')
    for key in artists.keys():  # for all artists listened to by any user
        outfile.write(key + "\n")

# Write users to text file
with open(USERS_FILE, 'w') as outfile:
    outfile.write('user\n')
    for key in users.keys():  # for all users
        outfile.write(key + "\n")

# Write UAM
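# The tile/reshape/transpose construction above only builds a matrix whose row u
# repeats sum_pc_user[u]; NumPy broadcasting achieves the same sum-to-1
# normalization in one line: UAM = UAM / sum_pc_user[:, np.newaxis]. A
# self-contained check of the equivalence (assumes sum_pc_user is 1-D of length
# no_users; the demo matrix below is hypothetical):
import numpy as np

UAM_demo = np.array([[2., 2.], [1., 3.]])
sums = UAM_demo.sum(axis=1)  # per-user play-count sums
tiled = np.tile(sums, 2).reshape(2, 2).transpose()
assert np.allclose(UAM_demo / tiled, UAM_demo / sums[:, np.newaxis])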
def run_recommender(run_function, run_method, neighbors=[1, 2, 5, 10, 20, 50],
                    recommender_artists=[1, 3, 5, 7, 10, 20, 30, 50, 100, 200]):
    """
    runs the given run function automatically; this function must be passed in via the parameters.
    it also automatically saves a json string with the parameters - the file name is as follows:
    K(K_number)_R(Recommended_artists).json

    :param run_function: the run function of the single recommender
    :param run_method: the string which describes the current recommender
    :param neighbors: a list of different neighbor counts
    :param recommender_artists: a list of different numbers of artists to recommend
    """
    # for threading
    global NUM_THREADS, THREAD_STARTED, LOCK
    LOCK.acquire()
    NUM_THREADS += 1
    THREAD_STARTED = True
    LOCK.release()
    # for threading

    k_sorted = {}
    r_sorted = {}
    data_to_append = {}
    all_files = {}
    output_filedir = OUTPUT_DIR + run_method + '/'
    all_files_path = output_filedir + 'all.json'

    helper.ensure_dir(output_filedir + 'recommended/')

    for neighbor in neighbors:
        k_sorted['K' + str(neighbor)] = []

        for recommender_artist in recommender_artists:
            r_sorted['R' + str(recommender_artist)] = []  # was k_sorted, which mixed both key sets into one dict

            file_path = output_filedir + 'K' + str(neighbor) + '_R' + str(recommender_artist) + '.json'
            file_path_reco = output_filedir + 'recommended/' + 'K' + str(neighbor) + '_R' + str(recommender_artist) + '.json'
            data_to_append = {'neighbors': neighbor, 'recommended_artists': recommender_artist}
            data = run_function(neighbor, recommender_artist)
            recommended = data['recommended']
            formated_recommended = {}

            # delete this
            # 1. not valid json
            # 2. not necessary for the specific files
            del data['recommended']
            data_to_append.update(data)

            if type(recommended) is not bool:
                for key, value in recommended.iteritems():
                    # convert everything to strings,
                    # otherwise it is not valid json
                    formated_recommended[key] = {}

                    if len(value) == 0:
                        continue

                    for kf, fold_recommended in value.iteritems():
                        formated_recommended[key][kf] = {}
                        formated_recommended[key][kf]['recommended'] = {}
                        formated_recommended[key][kf]['order'] = []

                        for artist, ranking in fold_recommended.iteritems():
                            formated_recommended[key][kf]['recommended'][str(artist)] = str(ranking)
                            formated_recommended[key][kf]['order'].append(artist)

                # write json file for hybrids
                content = json.dumps(formated_recommended, indent=4, sort_keys=True)
                with open(file_path_reco, 'w') as f:
                    f.write(content)

            # write json file for csv
            content = json.dumps(data_to_append, indent=4, sort_keys=True)
            with open(file_path, 'w') as f:
                f.write(content)

    # for threading
    LOCK.acquire()
    NUM_THREADS -= 1
    LOCK.release()
def work(rank, args, master_net, cc, optimizer=None):
    torch.manual_seed(args.seed + rank)
    summary_file = path_join(args.model_path, EXP_NAME + "_{}".format(rank))
    summary = cc.create_experiment(helper.ensure_dir(summary_file))
    summary.to_zip(summary_file)
    exp_buff = helper.ExperienceBuffer()
    episodes = master_net.episodes
    episode_deliveries = []
    episode_lengths = []
    # episode_mean_values = []

    # Create the local copy of the network
    env = gameEnv(args.partial, args.env_size, args.action_space)
    local_net = DFP_Network(
        (args.env_size**2) * 3,  # observation_size = (env_size*env_size) * 3 color channels
        num_offset=len(args.offset),
        a_size=args.action_space,
        num_measurements=args.num_measurements)
    assert args.num_measurements == len(env.measurements)

    if optimizer is None:
        optimizer = optim.Adam(master_net.parameters(), lr=args.learning_rate)

    print("Starting work on worker-{}".format(rank))
    while not master_net.should_stop():
        # Copy parameters from global to local network
        local_net.load_state_dict(master_net.state_dict())
        episode_buffer = []
        episode_frames = []
        done = False
        step = 0
        temp = 0.25  # How spread out we want our action distribution to be

        observation, o_big, measurements, delivery_pos, drone_pos = env.reset()
        the_measurements = measurements  # measurements = [number of deliveries, battery life]

        while not done:
            # Here is where our goal-switching takes place:
            # When the battery charge is below 0.3, we set the goal to optimize battery
            # When the charge is above that value we set the goal to optimize deliveries
            if measurements[1] <= .3:
                goal = np.array([[0., 1.]])
            else:
                goal = np.array([[1., 0.]])  # goal = [go for delivery, go for battery]

            action_dist = local_net.forward(np.expand_dims(observation, 0),
                                            np.expand_dims(measurements, 0),
                                            goal, temp)

            b = np.squeeze(goal, axis=0) * np.squeeze(action_dist.data.numpy(), axis=0).T
            c = np.sum(b, axis=1)
            c /= c.sum()
            # Sample an action from the goal-weighted distribution
            action = np.random.choice(c, p=c)
            action = np.argmax(c == action)

            observation_new, o_new_big, measurements_new, delivery_pos_new, drone_pos_new, done = env.step(action)
            episode_buffer.append([
                observation, action,
                np.array(measurements), goal,
                np.zeros(len(args.offset))
            ])

            if rank == 0 and master_net.episodes % 150 == 0:
                episode_frames.append(
                    helper.set_image_gridworld(o_new_big, measurements_new,
                                               step + 1, delivery_pos_new,
                                               drone_pos_new))

            observation = np.copy(observation_new)
            measurements = measurements_new[:]
            delivery_pos = delivery_pos_new[:]
            drone_pos = drone_pos_new
            step += 1

            # End the episode after 100 steps
            if step > 100:
                done = True

        episode_deliveries.append(measurements[0])
        episode_lengths.append(step)

        # Update the network using the experience buffer at the end of the episode.
        if args.train:
            loss, entropy = train(episode_buffer,
                                  exp_buff,
                                  local_net=local_net,
                                  master_net=master_net,
                                  action_space=args.action_space,
                                  offsets=args.offset,
                                  optimizer=optimizer,
                                  batch_size=args.batch_size,
                                  max_grad_norm=args.max_grad_norm)

        # Periodically save gifs of episodes, model parameters, and summary statistics.
        if episodes % 50 == 0 and episodes != 0:
            # was "and train:", which tested the train function object itself (always truthy)
            if master_net.episodes % 2000 == 0 and rank == 0 and args.train:
                model_file = path_join(args.model_path,
                                       'model-{}.cptk'.format(episodes))
                torch.save(master_net.state_dict(),
                           helper.ensure_dir(model_file))
                print("Saved Model")

            if rank == 0 and master_net.episodes % 150 == 0:
                time_per_step = 0.25
                images = np.array(episode_frames)
                image_file = path_join(args.gif_path,
                                       'image-{}.gif'.format(episodes))
                imageio.mimsave(helper.ensure_dir(image_file),
                                images,
                                duration=time_per_step)

            mean_deliveries = np.mean(episode_deliveries[-50:])
            mean_length = np.mean(episode_lengths[-50:])
            # mean_value = np.mean(episode_mean_values[-50:])

            summary.add_scalar_value('Performance/Deliveries_{}'.format(rank),
                                     float(mean_deliveries))
            summary.add_scalar_value('Performance/Length_{}'.format(rank),
                                     float(mean_length))
            # summary.add_scalar_value('Performance/Mean-{}'.format(rank), float(mean_value))
            summary.add_scalar_value('Check/episode_{}'.format(rank), episodes)
            summary.add_scalar_value('Check/master_episode_{}'.format(rank),
                                     master_net.episodes)

            if args.train:
                summary.add_scalar_value('Losses/Loss_{}'.format(rank),
                                         float(loss.data.numpy()))
                summary.add_scalar_value('Losses/Entropy_{}'.format(rank),
                                         float(entropy.data.numpy()))

            summary.to_zip(summary_file)

        episodes += 1
        master_net.episodes += 1
def plot(experiment, output_dir="evaluation/multi_scenario", input_dir="results/"): # setup directories for this plot input_dir = os.path.join(experiment, input_dir) output_dir = os.path.join(experiment, output_dir) ensure_dir(output_dir, rm=True) print input_dir print output_dir # load data ed = data.ExperimentData(path=input_dir) ed.normalize_times() df = ed.get_combined_df() cdelays = sorted(df["controldelay"].drop_duplicates().tolist()) print cdelays lambdas = sorted(df["srclambda"].drop_duplicates().tolist()) print lambdas middleboxes = sorted(df["numbermb"].drop_duplicates().tolist()) # middleboxes.remove(16) print middleboxes dummystatesizes = sorted(df["dummystatesize"].drop_duplicates().tolist()) print dummystatesizes """ Plots: xaxis = numbermb yaxis = pps, request times, global pcount layout: one plot line per backend """ for delay in cdelays[:2]: for lmb in lambdas: for dss in dummystatesizes: multi_scenario_plot( output_dir, ed, xfield="numbermb", yfield=["pps_global", "pps_local"], destinction_field="backend", rowfilter={ "controldelay": delay, "srclambda": lmb, "dummystatesize": dss}, xname="number of replicated VNF instances", yname="avg. processed pkt/s", name_pre="", name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="numbermb", yfield=["pps_global"], destinction_field="backend", rowfilter={ "controldelay": delay, "srclambda": lmb, "dummystatesize": dss}, xname="number of replicated VNF instances", yname="avg. processed pkt/s", name_pre="", name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="numbermb", yfield=["t_request_global", "t_request_local"], destinction_field="backend", rowfilter={ "controldelay": delay, "srclambda": lmb, "dummystatesize": dss}, xname="number of replicated VNF instances", yname="avg. state request delay [s]", name_pre="", name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="numbermb", yfield=["t_request_global"], destinction_field="backend", rowfilter={ "controldelay": delay, "srclambda": lmb, "dummystatesize": dss}, xname="number of replicated VNF instances", yname="avg. state request delay [s]", name_pre="", name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss), ymax=0.9 ) multi_scenario_plot( output_dir, ed, xfield="numbermb", yfield=["pcount_global", "pcount_local"], destinction_field="backend", rowfilter={ "controldelay": delay, "srclambda": lmb, "dummystatesize": dss}, xname="number of replicated VNF instances", yname="number of processed packets", name_pre="", name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss) ) """ Plots: xaxis = controldelay yaxis = pps, request times, global pcount layout: one plot line per backend """ for nmb in middleboxes: for lmb in lambdas: for dss in dummystatesizes[:2]: multi_scenario_plot( output_dir, ed, xfield="controldelay", yfield=["pps_global", "pps_local"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "dummystatesize": dss}, xname="control plane latency [ms]", yname="avg. processed pkt/s", name_pre="", name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="controldelay", yfield=["pps_global"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "dummystatesize": dss}, xname="control plane latency [ms]", yname="avg. 
processed pkt/s", name_pre="", name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="controldelay", yfield=["t_request_global", "t_request_local"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "dummystatesize": dss}, xname="control plane latency [ms]", yname="avg. state request delay [s]", name_pre="", name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="controldelay", yfield=["t_request_global"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "dummystatesize": dss}, xname="control plane latency [ms]", yname="avg. state request delay [s]", name_pre="", name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss) ) multi_scenario_plot( output_dir, ed, xfield="controldelay", yfield=["pcount_global"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "dummystatesize": dss}, xname="control plane latency [ms]", yname="number of processed packets", name_pre="", name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss) ) """ Plots: xaxis = dummystatesize yaxis = pps, request times, global pcount layout: one plot line per backend """ for nmb in middleboxes: for lmb in lambdas: for delay in cdelays[:2]: multi_scenario_plot( output_dir, ed, xfield="dummystatesize", yfield=["pps_global"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "controldelay": delay}, xname="state size [byte]", yname="avg. processed pkt/s", name_pre="", name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay), xlogscale=True ) multi_scenario_plot( output_dir, ed, xfield="dummystatesize", yfield=["t_request_global", "t_request_local"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "controldelay": delay}, xname="state item size [byte]", yname="avg. processed request delay [s]", name_pre="", name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay), xlogscale=True ) multi_scenario_plot( output_dir, ed, xfield="dummystatesize", yfield=["t_request_global"], destinction_field="backend", rowfilter={ "numbermb": nmb, "srclambda": lmb, "controldelay": delay}, xname="state item size [byte]", yname="avg. state request delay [s]", name_pre="", name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay), xlogscale=True )
def process_dir(data_dir, MIN_N_GRAM, MAX_N_GRAM, b_verbose=False, b_size=None):
    """
    Processes a directory containing a set of case documents and generates
    n-grams. The n-grams thus generated shall be stored in {data_dir}/n_grams/
    """
    target_dir = os.path.join(data_dir, 'n_grams')

    # Make sure the target directory exists
    helper.ensure_dir(target_dir)

    # Get the case file list
    case_files = helper.get_files(data_dir)
    if b_size is not None:
        case_files = case_files[:b_size]
    total_count = len(case_files)
    progress = 0

    for case_file in case_files:
        # Compute the path to save the file
        target_file_name = os.path.basename(case_file)
        target_path = os.path.join(target_dir, target_file_name)

        # Read the case data from the string
        case_data = helper.read_file_to_string(case_file)
        valid_n_grams = {}

        # Go over every sentence in the document
        for sentence in get_sentences(case_data):
            pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence))

            # Update the grammar if required and get the POS tags
            pos_tags = get_pos_tags(pos_tuples)

            # Generate N-Grams of tags
            n_grams = []
            for n in range(MIN_N_GRAM, MAX_N_GRAM + 1):
                n_grams.extend([list(grams) for grams in ngrams(range(len(pos_tuples)), n)])

            # Get only the n-grams that match the defined grammar
            for i in range(len(n_grams)):
                # Generate n-gram list and check validity
                if parse([pos_tags[j] for j in n_grams[i]]):
                    # Append words to overall list
                    elements = ' '.join([pos_tuples[k][0] for k in n_grams[i]])
                    if elements in valid_n_grams:
                        valid_n_grams[elements] += 1
                    else:
                        valid_n_grams[elements] = 1

        # Save n-grams to file
        helper.save_dict_to_file(target_path, valid_n_grams)
        progress += 1
        if b_verbose:
            print(progress / (0.01 * total_count), ' % Complete')

    return target_dir
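# process_dir builds n-grams over token *positions* rather than tokens, so one
# window can index both the word list and the POS-tag list in parallel. A tiny
# illustration of nltk.util.ngrams used this way (the example tokens below are
# hypothetical):
from nltk.util import ngrams

tokens = ['the', 'data', 'protection', 'act']
print(list(ngrams(range(len(tokens)), 2)))
# -> [(0, 1), (1, 2), (2, 3)]
print([' '.join(tokens[j] for j in g) for g in ngrams(range(len(tokens)), 2)])
# -> ['the data', 'data protection', 'protection act']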
        if i_iter % args.eval_step == 0:
            iter_eval, saved_weights = eval(eval_dataloader, input_embeddings,
                                            target_embeddings, neighbor_embeddings,
                                            edge_types, mask_neighbor, device)
            eval_loss.append(iter_eval)
            vis.line(
                Y=np.array(eval_loss),
                X=np.array(range(0, i_iter + 1, args.eval_step)),
                opts=dict(legend=["RMSE"],
                          title=model.name + " eval loss",
                          showlegend=True),
                win="win:eval-{}".format(EXP_NAME))

            # print("dump example")
            # torch.save(saved_weights, ensure_dir(path.join(path.join("..", "data", args.data_dir), model.name, "saved_eval_iter_{}_drop_{}.pt".format(int(i_iter/args.eval_step), args.drop_prob))))
            # print("dump done")

            if best_model > iter_eval:
                print("save best model")
                best_model = iter_eval
                torch.save(model,
                           path.join(path.join("..", "data", args.data_dir),
                                     "{}.pt".format(model.name)))

    # test performance
    model = torch.load(path.join(path.join("..", "data", args.data_dir),
                                 "{}.pt".format(model.name)))
    test = setup_model(model, args.eval_batch_size, args, is_training=False)
    iter_test, saved_weights = test(test_dataloader, input_embeddings,
                                    target_embeddings, neighbor_embeddings,
                                    edge_types, mask_neighbor, device)
    print("test RMSE: {}".format(iter_test))
    torch.save(saved_weights,
               ensure_dir(path.join(path.join("..", "data", args.data_dir),
                                    model.name,
                                    "saved_test_drop_{}.pt".format(args.drop_prob))))
    test_rmse.append(iter_test)

print("execution_mean: {}".format(np.mean(test_rmse)))
    stats['len_limited'] = len_limited
    stats['best_five'] = terms_df[:5]
    stats['all'] = terms_df
    stats['found_artists'] = found_artists

    return stats
# /count_wiki_terms


if __name__ == '__main__':
    loop_me = {}
    #loop_me['wiki'] = count_wiki_terms()
    loop_me['mm'] = count_mm_terms()

    helper.ensure_dir(OUTPUT)

    for key, terms in loop_me.iteritems():
        filename = 'novalue.json'

        if key == 'wiki':
            filename = 'wiki_term_stats.json'
        elif key == 'mm':
            filename = 'mm_term_stats.json'

        content = json.dumps(terms, indent=4, sort_keys=True)
        # use the per-key filename; this was hard-coded to 'mm_term_stats.json',
        # which would overwrite the wiki stats with the mm stats
        json_file = open(OUTPUT + filename, 'w')
        json_file.write(content)
        json_file.close()
def download_traffic(stations_id,
                     auth_data,
                     auth_base='http://pems.dot.ca.gov/',
                     time_interval=[(1522627200, 1523059200),
                                    (1523232000, 1523664000),
                                    (1523836800, 1524268800),
                                    (1524441600, 1524873600)]):
    base_url = "http://pems.dot.ca.gov/?report_form=1&dnode=VDS&content=loops&tab=det_timeseries&export=text&station_id={}&s_time_id={}&e_time_id={}&tod=all&tod_from=0&tod_to=0&dow_1=on&dow_2=on&dow_3=on&dow_4=on&dow_5=on&q=flow&q2=speed&gn=5min&agg=on"
    with session() as c:
        c.post(auth_base, data=auth_data)
        for i, station_id in enumerate(stations_id):
            ts = 10  # Default time to sleep
            print("Iteration: {}".format(i))
            print('initial time to sleep {}'.format(ts))
            for j, (start_time, end_time) in enumerate(time_interval):
                url = base_url.format(station_id, start_time, end_time)
                ts_small = 2  # small sleep interval
                while True:
                    try:
                        # Download with 10-second sleep time breaks
                        print('try to download file: {}-{}'.format(station_id, j))
                        print('time to sleep {}'.format(ts_small))
                        # Make the request and download attached file
                        r = c.get(url)
                        if r.status_code == 200:
                            with open(
                                    ensure_dir(
                                        path.join(BASE_DIR,
                                                  "{}".format(station_id),
                                                  "part-{}.csv".format(j))),
                                    "w") as file:
                                file.write(r.text)
                        else:
                            raise ConnectionError("Data not obtained")
                        # save file
                        time.sleep(
                            np.random.random_integers(ts_small,
                                                      int(1.2 * ts_small)))
                    except ConnectionError:
                        print('ConnectionError')
                        ts_small = ts_small * 2  # exponential backoff
                        time.sleep(ts)
                        # Sleep and login again
                        c.post(auth_base, data=auth_data)
                        continue
                    break

            # merge the downloaded parts into one time-indexed file per station
            dt = [
                pd.read_csv(path.join(BASE_DIR, "{}".format(station_id),
                                      "part-{}.csv".format(i)),
                            sep="\t") for i in range(len(time_interval))
            ]
            dt = pd.concat(dt, axis=0)
            dt["5 Minutes"] = pd.to_datetime(dt["5 Minutes"],
                                             format="%m/%d/%Y %H:%M")
            dt = dt.set_index("5 Minutes")
            dt.to_csv(
                ensure_dir(
                    path.join(BASE_DIR, "stations",
                              "{}.csv".format(station_id))))
            # sleep for a longer interval between stations
            time.sleep(np.random.random_integers(ts, int(1.2 * ts)))
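# The retry loop above hand-rolls exponential backoff (ts_small doubles on each
# ConnectionError) plus a re-login. Where re-authentication is not needed, the
# backoff part can be delegated to urllib3's Retry via requests; this is a
# sketch of that alternative, not the script's method, and the PeMS re-login
# step above cannot be expressed this way:
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retrying = Session()
retry = Retry(total=5, backoff_factor=2, status_forcelist=[500, 502, 503, 504])
retrying.mount('http://', HTTPAdapter(max_retries=retry))
retrying.mount('https://', HTTPAdapter(max_retries=retry))
# retrying.get(url) now retries with exponential backoff before raising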
        _ids.remove(s_idx)

    eval_dataset = random.sample(_ids, e_t_size)
    for s_idx in eval_dataset:
        _ids.remove(s_idx)

    return _ids, eval_dataset, test_dataset


if __name__ == "__main__":
    input_embeddings, target_embeddings, neighbor_embeddings, edge_type, mask_neigh, prefix = generate_triangular_embedding(
        (12000, 10), 4)

    torch.save(
        input_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_input_embeddings.pt")))
    torch.save(
        target_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_target_embeddings.pt")))
    torch.save(
        neighbor_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_neighbor_embeddings.pt")))
    torch.save(edge_type,
               ensure_dir(path.join(BASE_DIR, prefix + "_edge_type.pt")))
    torch.save(mask_neigh,
               ensure_dir(path.join(BASE_DIR, prefix + "_mask_neighbor.pt")))

    train_dataset, eval_dataset, test_dataset = split_training_test_dataset(
        list(range(input_embeddings.size(0))), e_t_size=1000)
    torch.save(train_dataset,
               path.join(BASE_DIR, prefix + "_train_dataset.pt"))
artists = artists[:NUMBER_OF_MAX_ARTISTS]
number_of_fetches = NUMBER_OF_MAX_ARTISTS * 2 + (NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * (1 + NUMBER_OF_MAX_TRACKS)

if VERBOSE:
    helper.log_highlight('You will have ' + str(number_of_fetches) + ' queries to the musixmatch api')
    print ''
    print 'Artist queries: ' + str(NUMBER_OF_MAX_ARTISTS)
    print 'Album queries: ' + str(NUMBER_OF_MAX_ARTISTS)
    print 'Track queries: ' + str(NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS)
    print 'Lyrics queries: ' + str((NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * NUMBER_OF_MAX_TRACKS)
    print ''
    print 'These numbers can vary if an artist has fewer albums, tracks or tracks with lyrics'
    print ''

helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH)

# live fetching
# fetched_artist_ids = get_artist_ids(artists)
# save_txt(fetched_artist_ids, 'artist_ids.txt')
#
# fetched_artist_album_ids = get_artist_albums(fetched_artist_ids, NUMBER_OF_ALBUMS)
# save_txt(fetched_artist_album_ids, 'album_ids.txt')
#
# fetched_artist_album_tracks = get_artist_album_tracks(fetched_artist_album_ids, NUMBER_OF_MAX_TRACKS)
# save_txt(fetched_artist_album_tracks, 'album_tracks.txt')
#
# fetched_lyrics = get_lyrics_by_tracks(fetched_artist_album_tracks)

# fetching with stored data
# fetched_artist_ids = read_txt(GENERATED_ARTISTS_FILE)
# fetched_artist_album_ids = read_txt(GENERATED_ALBUM_IDS_FILE, True)
for excange in dic["Data"]["Exchanges"] ] market = sorted(market, key=lambda d: -d.volume) for excange_info in market: print("{}\t{}".format(excange_info.market, excange_info.volume)) # download daily OHLC price-series for ETH/USD for a given 'market' # extract close-price (cp) print("{}/{}".format(fsym, tsym)) good_market_name = [] data = pd.DataFrame() for market in map(lambda m: m.market, market): print("{}".format(market), end="") df = fetch_data_hour_by_exchange(fsym, tsym, market) df = df[(df.index > "2017-06-01") & (df.index <= "2017-11-05")] if df.shape[0] != 0: df.name = market df.to_csv( ensure_dir(path_join("./data", "{}_hourly.csv".format(market)))) data = pd.concat([data, df], axis=1, ignore_index=False) print("\tdownloaded") good_market_name.append(market) if len(good_market_name) == 10: break else: print("\tskipp") print(good_market_name) print(data.head(10)) print(data.tail(10))
    for _id in eval_stations:
        eval_dataset.extend(site_to_exp_idx.d[_id])

    for _id in test_stations:
        test_dataset.extend(site_to_exp_idx.d[_id])

    print("train len: {}\neval len: {}\ntest len: {}".format(
        len(train_dataset), len(eval_dataset), len(test_dataset)))
    return train_dataset, eval_dataset, test_dataset


if __name__ == "__main__":
    stations = read_stations()
    G, stations_distances = compute_graph(stations)
    torch.save(G, ensure_dir(path.join(BASE_DIR, "pems", "temp", "graph.pt")))
    torch.save(stations_distances,
               ensure_dir(path.join(BASE_DIR, "pems", "temp", "station_dist.pt")))
    # G = torch.load(path.join(BASE_DIR, "pems", "temp", "graph.pt"))
    #
    # input_embeddings, target_embeddings, neighbor_embeddings, edge_type, neigh_mask, station_id_to_idx, station_id_to_exp_idx = generate_embedding(stations, G)
    #
    # torch.save(input_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "utility_input_embeddings.pt")))
    # torch.save(target_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "target_embeddings.pt")))
    # torch.save(neighbor_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "neighbor_embeddings.pt")))
    # torch.save(edge_type, ensure_dir(path.join(BASE_DIR, "pems", "edge_type.pt")))
    # torch.save(neigh_mask, ensure_dir(path.join(BASE_DIR, "pems", "mask_neighbor.pt")))
    # torch.save(station_id_to_idx, ensure_dir(path.join(BASE_DIR, "pems", "station_id_to_idx.pt")))
    # torch.save(station_id_to_exp_idx, ensure_dir(path.join(BASE_DIR, "pems", "station_id_to_exp_idx.pt")))
    #
    # station_id_to_idx = torch.load(path.join(BASE_DIR, "pems", "station_id_to_idx.pt"))
sites_correlation = pickle.load(
    open(path.join(BASE_DIR, "utility", "temp", "neighbors.bin"), "rb"))
tz_onehot = pickle.load(
    open(path.join(BASE_DIR, "utility", "temp", "tz_onehot.bin"), "rb"))

input_embeddings, target_embeddings, neighbor_embeddings, edge_types, neigh_mask, site_to_idx, site_to_exp_idx = generate_embedding(
    sites_normalized_dataframe,
    sites_info,
    sites_correlation,
    days_onehot,
    tz_onehot,
    seq_len=16)

torch.save(
    input_embeddings,
    ensure_dir(path.join(BASE_DIR, "utility", "utility_input_embeddings.pt")))
torch.save(
    target_embeddings,
    ensure_dir(path.join(BASE_DIR, "utility", "target_embeddings.pt")))
torch.save(
    neighbor_embeddings,
    ensure_dir(path.join(BASE_DIR, "utility", "neighbor_embeddings.pt")))
torch.save(edge_types,
           ensure_dir(path.join(BASE_DIR, "utility", "edge_type.pt")))
torch.save(neigh_mask,
           ensure_dir(path.join(BASE_DIR, "utility", "mask_neighbor.pt")))
torch.save(site_to_idx,
           ensure_dir(path.join(BASE_DIR, "utility", "site_to_idx.pt")))
torch.save(
    site_to_exp_idx,
    ensure_dir(path.join(BASE_DIR, "utility", "site_to_exp_idx.pt")))
            eval_dataloader, input_embeddings, target_embeddings,
            neighbor_embeddings, edge_types, mask_neighbor, device)
        eval_loss.append(iter_eval)
        vis.line(Y=np.array(eval_loss),
                 X=np.array(range(0, i_iter + 1, args.eval_step)),
                 opts=dict(legend=["RMSE"],
                           title=model.name + " eval loss",
                           showlegend=True),
                 win="win:eval-{}".format(EXP_NAME))

        torch.save(
            saved_weights,
            ensure_dir(
                path.join(
                    "data", args.data_dir, model.name,
                    "{}_new_saved_eval_iter-{}_temp-{}.bin".format(
                        args.dataset_prefix, int(i_iter / args.eval_step),
                        args.temp))))
        # pickle.dump(saved_weights, open(ensure_dir(path.join(args.data_dir, model.name, "{}saved_eval_iter-{}_temp-{}.bin".format(args.dataset_prefix, int(i_iter/args.eval_step), args.temp))), "wb"))

        if best_model > iter_eval:
            print("save best model")
            best_model = iter_eval
            torch.save(
                model,
                path.join("data", args.data_dir, "{}.pt".format(model.name)))

# test performance
model = torch.load(
    print('Completed extracting case data from ' + f_name)

    # Create n_grams
    n_gram_dir = generate_n_grams.process_dir(case_data_dir, 2, 4,
                                              b_verbose=True, b_size=2)
    print('Completed generating n_grams from ' + f_name)

    helper.move_dir(n_gram_dir, os.path.join(save_dir, f_name))
    print('Completed processing ' + str(f_name) + ' in ' +
          str(time.time() - start) + ' (s)')
    helper.delete_dir(f_name)


def main():
    files = [x for x in helper.get_files('.') if x.endswith('_complete.zip')]
    num_cores = multiprocessing.cpu_count()
    Parallel(n_jobs=num_cores)(delayed(process_file)(f) for f in files)


if __name__ == '__main__':
    save_dir = 'data'
    helper.ensure_dir(save_dir)
    main()