def train(model, iterator, optimizer, criterion, train_loader, visual_path, trg2idx, savepath):
    """Train ``model`` and checkpoint it whenever a batch reaches a new lowest loss.

    Args:
        model: Network called with ``src``, ``trg`` and ``teacher_force``
            keyword arguments; its ``state_dict()`` is what gets saved.
        iterator: Number of epochs to run.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Loss applied to the flattened outputs and targets.
        train_loader: Iterable of ``(src_batch, trg_batch)`` pairs that
            also supports ``len()``.
        visual_path: Forwarded unchanged to ``visualize``.
        trg2idx: Forwarded unchanged to ``visualize``.
        savepath: Destination handed to ``torch.save``.
    """
    # Start from +inf so the first batch always yields a checkpoint; the
    # previous fixed threshold of 100 never saved anything when every loss
    # stayed above it.
    best_loss = float("inf")
    num_batches = len(train_loader)
    for epoch in range(iterator):
        for k, (src_batch, trg_batch) in enumerate(train_loader):
            src_tensor = torch.LongTensor(src_batch).to(device)
            trg_tensor = torch.LongTensor(trg_batch).to(device)

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered hooks run.
            outputs = model(src=src_tensor, trg=trg_tensor, teacher_force=True)

            # Drop the first position along dim 0 of both tensors, then
            # flatten so the criterion sees one row per target entry.
            outputs = outputs[1:].contiguous().view(-1, outputs.shape[-1])
            trg_tensor = trg_tensor[1:].contiguous().view(-1)
            loss = criterion(outputs, trg_tensor)

            visualize(loss, epoch, visual_path, model, src_tensor, trg_tensor, trg2idx=trg2idx)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            if loss_value < best_loss:
                best_loss = loss_value
                torch.save(model.state_dict(), savepath)
                print("save model at Epoch {:d}".format(epoch + 1))
            print("Epoch: {:d} batch step: [{:d}/{:d}] Loss: {:.4f}".format(
                epoch + 1, k + 1, num_batches, loss_value))
def try_two_objects_interaction():
    """Load the sample point clouds and display the "violet thor" model.

    The orange sphere and the grey plane are still loaded and transformed,
    but the interaction experiments that consumed them are disabled, so only
    the last model reaches the viewer.
    """
    load = download_point_cloud.download_to_object

    sphere = load("models/orange sphere.ply", 1000)
    sphere.scale(0.3)
    sphere.shift([0, 0.18, 0])

    plane = load("models/grey plane.ply", 6000)
    plane.scale(0.1)
    x_axis = [1, 0, 0]
    plane.rotate(x_axis, math.radians(90))

    thor = load("models/violet thor.ply")
    visualization.visualize(objects=[thor])
def get_focus_of_attention(videopath, frames):
    """Run the whole focus-of-attention pipeline for one video.

    The stages run in a fixed order (frame extraction, optical flow,
    segmentation, prediction, visualization, video generation) and all work
    inside an ``output`` folder created next to the video.

    Args:
        videopath: Path of the input video file.
        frames: Forwarded unchanged to every stage.
    """
    def announce(stage):
        # Three-line banner printed before each stage.
        print("///////////////////")
        print(stage + " Begin")
        print("///////////////////")

    # os.path.dirname also copes with a bare file name; slicing at
    # rfind('/') == -1 silently chopped off the last character instead.
    path = os.path.dirname(videopath)
    mean_path = join(path, "mean_frame.png")
    outputfolder = join(path, "output")
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(outputfolder, exist_ok=True)

    announce("getFrames")
    get_frames(frames, videopath, join(outputfolder, 'Frames'))

    announce("OpticalFlow")
    optical_flow(outputfolder, frames)

    announce("Segmantention")
    segment(outputfolder, frames)

    announce("Prediction")
    predict(outputfolder, mean_path, frames)

    announce("Visualization")
    visualize(outputfolder, frames)

    announce("generateVideo")
    generate_video(outputfolder, frames)
def scatter_plot_molecular_weight(x2d, data_matrix, visualization_options):
    """Plots the 2D PCA with the molecules colored by column 2 of the data matrix

    NOTE(review): the function name says "molecular weight" while the title
    and colorbar label say "molecular volume"; confirm which quantity
    column 2 actually holds.

    Args:
        x2d (numpy array): 2d matrix containing the points given by a 2D PCA on
            the data matrix
        data_matrix (numpy array): non normalized data matrix of all the
            features
        visualization_options (dict): contains the options for visualization
            (plotting)
    """
    molecular_weight_fig = plt.figure(
        figsize=viz.get_option(visualization_options, 'figsize'))
    ax = molecular_weight_fig.add_subplot(111)
    ax.set_title(r"2D PCA with molecules colored by their molecular volume",
                 fontsize=viz.get_option(visualization_options, 'fontsize'))

    scatter = ax.scatter(x2d[:, 0], x2d[:, 1], c=data_matrix[:, 2],
                         alpha=0.05, s=10,
                         cmap=viz.get_option(visualization_options, 'colormap'))
    colorbar = plt.colorbar(scatter, label=r"Molecular volume [m$^3$/mol]")

    # The points are nearly transparent; give the colorbar its own alpha and
    # force a redraw so it shows up.
    colorbar.set_alpha(0.5)
    if hasattr(colorbar, "draw_all"):
        colorbar.draw_all()
    else:
        # Colorbar.draw_all() was deprecated in Matplotlib 3.6 and later
        # removed; the documented replacement is a render-free figure draw.
        molecular_weight_fig.draw_without_rendering()

    molecular_weight_fig.tight_layout()
    if viz.do_plot(visualization_options):
        viz.visualize(molecular_weight_fig, 'molecular_weights',
                      visualization_options)
def test_linear_classifier_moons(self):
    """Fit LogisticRegressionCV on the moons data and pass it to visualize."""
    label_column = 'label'
    frame = moons_data()

    # Every column except the label is a feature.
    feature_columns = frame.columns != label_column
    features = frame.loc[:, feature_columns].values
    labels = frame[label_column].values

    classifier = linear_model.LogisticRegressionCV()
    classifier.fit(features, labels)

    def classify(samples):
        return classifier.predict(samples)

    visualize(features, labels, classify)
def test_linear_classifier_iris_all(self):
    """Fit LogisticRegressionCV on two iris sepal features and visualize it."""
    label_column = 'species'
    frame = iris_data()
    # The returned mapping is not used here; the call is kept because it may
    # rewrite the frame in place (presumably string columns; verify).
    _mappings = enumerate_strings(frame)

    sepal_columns = ['sepal_length', 'sepal_width']
    features = frame[sepal_columns].values
    labels = frame[label_column].values

    classifier = linear_model.LogisticRegressionCV()
    classifier.fit(features, labels)

    def classify(samples):
        return classifier.predict(samples)

    visualize(features, labels, classify)
def test_backprop_standard_moons(self):
    """
    Check moons data with backprop classifier
    Every column except 'label' is used as a feature
    """
    label_column = 'label'
    frame = normalize_data(moons_data(), label_column)

    feature_columns = frame.columns != label_column
    features = frame.loc[:, feature_columns].values
    labels = frame[label_column].values

    model = build_model(features, labels, 3)

    def classify(samples):
        return predict(model, samples)

    visualize(features, labels, classify)
def test_backprop_standard_flights_months(self):
    """
    Check flights data with backprop classifier
    The 'month' column is the target; every other column is a feature
    """
    label_column = 'month'
    frame = normalize_data(flights_data(), label_column)
    # The returned mapping is not used here; the call is kept because it may
    # rewrite the frame in place (presumably string columns; verify).
    _mappings = enumerate_strings(frame)

    feature_columns = frame.columns != label_column
    features = frame.loc[:, feature_columns].values
    labels = frame[label_column].values

    model = build_model(features, labels, 3)

    def classify(samples):
        return predict(model, samples)

    visualize(features, labels, classify)
def test_backprop_standard_iris_all_species(self):
    """
    Check iris data with backprop classifier
    Uses the two sepal features with 'species' as the target
    """
    label_column = 'species'
    frame = normalize_data(iris_data(), label_column)
    # The returned mapping is not used here; the call is kept because it may
    # rewrite the frame in place (presumably string columns; verify).
    _mappings = enumerate_strings(frame)

    sepal_columns = ['sepal_length', 'sepal_width']
    features = frame[sepal_columns].values
    labels = frame[label_column].values

    model = build_model(features, labels, 3)

    def classify(samples):
        return predict(model, samples)

    visualize(features, labels, classify)
def main():
    """Train on the training set, label the test set with the model, show both."""
    train_set = Dataset(TRAIN_DATASET_PATH, batch_size=10)
    train_set.shuffle()

    print("Training model...")
    trained: Model = train(Model(), train_set)
    print("Done!")

    test_set = Dataset(TEST_DATASET_PATH, batch_size=500)
    # Store the model's predictions on the test set as its labels.
    predicted: List[int] = label_test_dataset(trained, test_set)
    test_set.y = predicted

    visualize(train_set, test_set)
def check_data_generation():
    """Capture images, turn them into a point cloud, save it and display it."""
    map_dir = "3d_map/"
    data_generation.save_images_from_VREP()

    depth_image = image_processing.load_image(map_dir, "room_depth0.png", "depth")
    color_image = image_processing.load_image(map_dir, "room_rgb0.png")

    # Both images are divided by 255 before the point cloud is computed.
    scaled_color = color_image / 255
    scaled_depth = depth_image / 255
    xyz, rgb = image_processing.calculate_point_cloud(scaled_color, scaled_depth)

    room = PointsObject()
    room.add_points(xyz, rgb)
    room.save_all_points(map_dir, "room")
    visualization.visualize([room])
def BattleFortune(turns, max_threads, game, province, dom_path, game_path, temp_path, dump_log=False):
    """
    Runs BattleFortune, simulate battles, and return results.
    :param temp_path: OS path to dominions temporary files
    :param turns: Number of Turns to be simulated.
    :param max_threads: Maximum number of simultaneous threads.
    :param game: Game to be simulated.
    :param province: Province where battle occurs.
    :param dom_path: dominions OS path.
    :param game_path: game OS path.
    :param dump_log: If true, the win and battle logs are written as YAML
        and JSON files under ``./battlefortune/logs/<game>/``.
    :return: True when simulation is completed.
    """
    setup(dom_path=dom_path, game_path=game_path,
          max_threads=max_threads, temp_path=temp_path)
    logs = batchrun(turns, game, province)
    n = logs['nations']
    w = logs['winners']
    b = logs['battles']

    if dump_log:
        logpath = './battlefortune/logs/' + game + '/'
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(logpath, exist_ok=True)

        # Every file is opened in a with-block so the handle is closed (and
        # flushed) deterministically instead of being leaked.
        with open(logpath + 'winlog.yaml', 'w') as outfile:
            yaml.dump(data=w, stream=outfile)
        with open(logpath + 'battlelog.yaml', 'w') as outfile:
            yaml.dump(data=b, stream=outfile)
        with open(logpath + 'battlelog.json', 'w') as outfile:
            json.dump(b, outfile)
        with open(logpath + 'winlog.json', 'w') as outfile:
            json.dump(w, outfile)

    visualize(nations=n, win_log=w, battle_log=b, rounds=turns)
    return True
def load_many_objects():
    """Load three sample models, show the first one's points, then all objects."""
    model_files = (
        "models/blue conus.ply",
        "models/grey plane.ply",
        "models/red cube.ply",
    )
    models_list = [
        download_point_cloud.download_to_object(model_file)
        for model_file in model_files
    ]

    first_model = models_list[0]
    first_model.scale(0.1)
    first_model.clear()

    # get_points() is queried once per argument, as before.
    visualization.visualize(first_model.get_points()[0],
                            first_model.get_points()[1])
    visualization.visualize_object(models_list)
def check_normals_estimation():
    """Show the room cloud with its normals, then the same after point reduction.

    In both views each normal is drawn as a second cloud: the point shifted by
    its normal divided by 100.
    """
    room = download_point_cloud.download_to_object("3d_map/room.pcd")

    # First view: the full cloud next to the shifted "normal tip" cloud.
    room_points = room.get_points()[0]
    scaled_normals = room.get_normals() / 100
    normal_tips = PointsObject(room_points + scaled_normals)
    visualization.visualize([room, normal_tips])

    # Second view: the same idea on the reduced set of points.
    d_x = 0.1
    reduced_points, reduced_normals, _ = data_generation.reduce_environment_points(
        room.get_points()[0], room.get_normals(), d_x)
    reduced_colors = np.full(reduced_points.shape, 0.3)
    reduced_cloud = PointsObject(reduced_points, reduced_colors)
    reduced_tips = PointsObject(reduced_points + reduced_normals / 100)
    visualization.visualize([reduced_cloud, reduced_tips])
def run(self, frame, EM):
    """Process one frame through parts 1-3 and visualize the result.

    Args:
        frame: Input wrapped in a new ``FrameContainer`` for this step.
        EM: Passed to ``run_part3``; part 3 is skipped when it is ``None``.
    """
    self.current_container = FrameContainer(frame)
    candidates, auxiliary = self.run_part1()
    self.current_container.traffic_light, self.current_container.auxiliary = \
        self.run_part2(candidates, auxiliary)

    # Sanity: part 2 must not return more entries than part 1 proposed; fall
    # back to the raw candidates otherwise. This is an explicit comparison
    # rather than assert/except so the fallback still works under `python -O`,
    # which strips assert statements.
    if len(self.current_container.traffic_light) > len(candidates):
        self.current_container.traffic_light = candidates
        self.current_container.auxiliary = auxiliary

    if EM is not None:
        self.run_part3(EM)
        visualize(candidates, auxiliary, self.prev_container,
                  self.current_container, self.focal, self.pp)

    # The finished frame becomes the reference for the next call.
    self.prev_container = self.current_container
    self.current_container = None
def run():
    """Run all training episodes, reporting and checkpointing at intervals."""
    args = argparser()
    path = utils.create_log_dir(sys.argv)
    utils.start(args.http_port)

    env = Env(args)
    agents = [Agent(args) for _ in range(args.n_agent)]
    master = Master(args)
    for agent in agents:
        master.add_agent(agent)
    master.add_env(env)

    divider = '=' * 80
    success_list = []
    time_list = []
    for episode in range(1, args.n_episode + 1):
        print(divider)
        print("Episode {}".format(episode))

        # Reset the server's stack and timer
        print("서버를 초기화하는중...")
        master.reset(path)

        # Start the episode
        master.start()

        # Train the agents
        master.train()
        print(divider)

        infos = master.infos
        success_list.append(infos["is_success"])
        time_list.append(infos["end_time"] - infos["start_time"])

        # Periodic report; the running statistics restart after each report.
        if episode % args.print_interval == 0:
            print(divider)
            print("EPISODE {}: Avg. Success Rate / Time: {:.2} / {:.2}".format(
                episode, np.mean(success_list), np.mean(time_list)))
            success_list.clear()
            time_list.clear()
            print(divider)

        if episode % args.checkpoint_interval == 0:
            utils.save_checkpoints(path, agents, episode)

    if args.visual:
        visualize(path, args)

    print("끝")
    utils.close()
def plot_combination_colors(x2d, data_matrix, visualization_options):
    """Tries to color code the last four features in the data set and apply it
    to the each point of the 2D PCA.

    Columns 4-7 of the data matrix are read as cyan, magenta, yellow and key.
    They are combined per channel as ``(1 - ink) * (1 - key)`` and each
    channel is then min-max normalized.

    Args:
        x2d (numpy array): 2d matrix containing the points given by a 2D PCA on
            the data matrix
        data_matrix (numpy array): data matrix of all the features
        visualization_options (dict): contains the options for visualization
            (plotting)
    """
    combination_colors_fig = plt.figure(
        figsize=viz.get_option(visualization_options, 'figsize'))
    ax = combination_colors_fig.add_subplot(111)
    ax.set_title(
        r"Coloring each point with the combination of the last four features",
        fontsize=20)

    # === Generate color space from data ===
    cyan = data_matrix[:, 4]
    magenta = data_matrix[:, 5]
    yellow = data_matrix[:, 6]
    key = data_matrix[:, 7]

    colors = np.ones((x2d.shape[0], 4))
    for channel, ink in enumerate((cyan, magenta, yellow)):
        colors[:, channel] = (1 - ink) * (1 - key)
        lowest = np.min(colors[:, channel])
        highest = np.max(colors[:, channel])
        span = highest - lowest
        if span == 0:
            # A constant channel would divide by zero and turn the whole
            # column into NaN; map it to 0 instead.
            colors[:, channel] = 0.0
        else:
            colors[:, channel] = (colors[:, channel] - lowest) / span
    colors[:, 3] = 0.7  # fixed alpha for every point

    ax.scatter(x2d[:, 0], x2d[:, 1], c=colors, s=10)
    combination_colors_fig.tight_layout()
    if viz.do_plot(visualization_options):
        viz.visualize(combination_colors_fig, 'combination_colors',
                      visualization_options)
def run_program():
    """
    Driver function for the text collection, processing, topic modeling,
    and visualization scripts.
    """
    era = input("Select '19th' or '20th' as era for analysis: ")

    # One entry per pipeline stage, in execution order; driving them from a
    # counted loop is what feeds the progress bar.
    stages = (
        source_and_split,
        lambda: gather_text(era),
        generate_corpus,
        utility_year,
        # n_topics is 8 by default, n_iterations is 300 by default
        lambda: model_topics(era=era),
        # visualization
        lambda: visualize(era),
    )
    for i in tqdm(range(len(stages)), desc="Generating and visualizing topics..."):
        stages[i]()

    print("\nProcess complete.")
def run(initial_lettice, rules, max_t):
    """
    Run cellular automaton

    The lattice is updated in place, cell by cell, so later cells in a sweep
    already see the new state of earlier ones.

    Parameters
    ----------
    initial_lettice: list
        a two dimensional array of states (0, 1); it is modified in place
    rules: list
        the first row states the probabilities for getting sick, indexed by
        the value ``__find`` returns for a cell. The second row (probabilities
        for becoming healthy) is currently ignored: recovery is disabled.
    max_t: int
        only used to derive the checkpoint interval (``max_t / 40``)

    Returns
    -------
    percentage_of_dead_list: fraction of dead cells at each checkpoint
    """
    lettice = initial_lettice
    infected_list = rules[0]
    # Recovery is switched off: every "become healthy" probability is zero
    # instead of coming from rules[1].
    healthy_list = [0] * 9

    # At least 1, so that max_t < 40 no longer makes `t % interval` raise
    # ZeroDivisionError.
    interval = max(1, int(max_t / 40))

    # NOTE(review): the number of simulated steps is hard-coded to 100 and
    # overrides the max_t argument - confirm whether this is a debugging
    # leftover before changing it, since results depend on it.
    num_steps = 100

    num_dead_list = []
    for t in range(num_steps):
        for i in range(len(lettice)):
            for j in range(len(lettice[0])):
                ret = __find(lettice, i, j)
                if lettice[i][j] == 0:  # Healthy
                    if random.random() < infected_list[ret]:
                        lettice[i][j] = 1
                else:  # Infected
                    if random.random() < healthy_list[ret]:
                        lettice[i][j] = 0
        if t % interval == interval - 1:
            # Checkpoint: save a picture named after the step and count dead
            visualization.visualize(lettice)
            plt.savefig("{0}".format(t))
            plt.close()
            num_dead_list.append(__cnt_dead(lettice))
    return [p / (len(lettice) * len(lettice[0])) for p in num_dead_list]
def validate(net, loader, writer):
    """Evaluate ``net`` on ``loader``, log images and metrics, return the F1 score.

    Everything is logged to ``writer`` under the module-level ``global_step``.
    """
    global global_step

    net.eval()
    loader.reset()
    result = evaluate(net, loader, max_aabbs=1000)

    # One image per evaluated sample, moved from HWC to CHW for the writer.
    for index, pair in enumerate(zip(result.batch_imgs, result.batch_aabbs)):
        image, boxes = pair
        rendered = visualize(image, boxes)
        writer.add_image(f'img{index}', rendered.transpose((2, 0, 1)), global_step)

    # Each metric is computed right before it is written, in this order.
    scalars = (
        ('val_loss', lambda: result.loss),
        ('val_recall', lambda: result.metrics.recall()),
        ('val_precision', lambda: result.metrics.precision()),
        ('val_f1', lambda: result.metrics.f1()),
    )
    for tag, compute in scalars:
        writer.add_scalar(tag, compute(), global_step)

    return result.metrics.f1()
def tab_content(identifier, annotation_types, view, text):
    """Build the hidden tab container for one view.

    The container gets a metadata sub-tab, an annotations sub-tab and one more
    sub-tab per entry in ``annotation_types``; each sub-tab is a button plus a
    matching text panel.
    """
    meta_id = "%s:Metadata" % identifier
    anno_id = "%s:Annotations" % identifier

    container_attrs = {
        'id': identifier,
        'class': 'tab_c1',
        'style': "display: none;"
    }
    content = div(container_attrs, [])

    # Button row for the two fixed sub-tabs.
    fixed_buttons = [tab_button_sub(meta_id), tab_button_sub(anno_id)]
    sub_tabs = content.add(div({'class': 'tab2'}, fixed_buttons))

    # Text panels for the two fixed sub-tabs.
    meta_panel = tab_text_sub(meta_id, dump(view.get('metadata')))
    anno_panel = tab_text_sub(anno_id, dump(view.get('annotations')))
    content.add_all([meta_panel, anno_panel])

    # One extra button and panel per annotation type.
    for annotation_type in annotation_types:
        id_sub = identifier + ':' + annotation_type
        sub_tabs.add(tab_button_sub(id_sub))
        panel = tab_text_sub(id_sub, visualize(id_sub, view, text))
        content.add(panel)

    return content
from data.make_clusters import *
from visualization.visualize import *
from models.recommender import *
import pandas as pd

# Pipeline script: load the raw CSV tables, cluster the orders, plot the data
# and run the product recommender.

# Raw input tables, in the order they are read.
RAW_FILES = (
    "../data/raw/aisles.csv",
    "../data/raw/orders.csv",
    "../data/raw/products.csv",
    "../data/raw/departments.csv",
    "../data/raw/order_products__prior.csv",
    "../data/raw/order_products__train.csv",
)

print("Loading datasets...")
(
    df_aisles,
    df_orders,
    df_products,
    df_departments,
    df_order_products__prior,
    df_order_products__train,
) = (pd.read_csv(raw_file) for raw_file in RAW_FILES)

# createClusters returns the orders frame that replaces the raw one below.
df_orders = createClusters(
    df_aisles, df_orders, df_products, df_order_products__prior)

visualize(
    df_aisles, df_departments, df_orders, df_products,
    df_order_products__train, df_order_products__prior)

product_recommender(
    df_order_products__prior, df_order_products__train, df_orders, df_products)
class DatasetIAMSplit:
    """wrapper which provides a dataset interface for a split of the original dataset

    The split covers the half-open index range ``[start_idx, end_idx)`` of the
    wrapped dataset and behaves like a sequence of that length.
    """

    def __init__(self, dataset, start_idx, end_idx):
        assert start_idx >= 0 and end_idx <= len(dataset)

        self.dataset = dataset
        self.start_idx = start_idx
        self.end_idx = end_idx

    def __getitem__(self, idx):
        """Return item ``idx`` of the split; negative indices count from its end.

        Raises:
            IndexError: if ``idx`` lies outside the split. Without this check,
                iterating the split with a plain ``for`` loop ran past
                ``end_idx`` into the rest of the wrapped dataset, and negative
                indices read items from before ``start_idx``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError('split index out of range')
        return self.dataset[self.start_idx + idx]

    def __len__(self):
        return self.end_idx - self.start_idx


if __name__ == '__main__':
    # Small visual check: encode and decode the ground truth of the first
    # sample and show the result on top of the image.
    from visualization import visualize
    from coding import encode, decode
    import matplotlib.pyplot as plt

    dataset = DatasetIAM(Path('../data'), (350, 350), (350, 350), caching=False)
    img, gt = dataset[0]
    gt_map = encode(img.shape, gt)
    gt = decode(gt_map)

    plt.imshow(visualize(img / 255 - 0.5, gt))
    plt.show()
stats.reset() agent.play(args.play_games, args) stats.write(0, "play") if args.visualization_file: from visualization import visualize # use states recorded during gameplay. NB! Check buffer size, that it can accomodate one game! states = [ agent.mem.getState(i) for i in range(agent.history_length, agent.mem.current - agent.random_starts) ] logger.info("Collected %d game states" % len(states)) import numpy as np states = np.array(states) states = states / 255. visualize(net.model, states, args.visualization_filters, args.visualization_file) sys.exit() if args.random_steps: # populate replay memory with random steps logger.info("Populating replay memory with %d random moves" % args.random_steps) # Set env mode test so that loss of life is considered as terminal env.setMode('train') stats.reset() agent.play_random(args.random_steps, args) stats.write(0, "random") # loop over epochs for epoch in range(args.start_epoch, args.epochs): logger.info("Epoch #%d" % (epoch + 1))
model.eval() if args.cuda: model.cuda() # read the image img = cv2.imread('examples/' + args.img) if args.model_type == 'inception': # the input image's size is different img = cv2.resize(img, (299, 299)) img = img.astype(np.float32) img = img[:, :, (2, 1, 0)] # calculate the gradient and the label index gradients, label_index = calculate_outputs_and_gradients([img], model, None, args.cuda) gradients = np.transpose(gradients[0], (1, 2, 0)) img_gradient_overlay = visualize(gradients, img, clip_above_percentile=99, clip_below_percentile=0, overlay=True, mask_mode=True) img_gradient = visualize(gradients, img, clip_above_percentile=99, clip_below_percentile=0, overlay=False) # calculae the integrated gradients attributions = random_baseline_integrated_gradients(img, model, label_index, calculate_outputs_and_gradients, \ steps=100, num_random_trials=25, cuda=args.cuda) img_integrated_gradient_overlay = visualize(attributions, img, clip_above_percentile=99, clip_below_percentile=0, \ overlay=True, mask_mode=True) img_integrated_gradient = visualize(attributions, img, clip_above_percentile=99, clip_below_percentile=0, overlay=False) output_img = generate_entrie_images(img, img_gradient, img_gradient_overlay, img_integrated_gradient, \ img_integrated_gradient_overlay) cv2.imwrite('results/' + args.model_type + '/' + os.path.splitext(args.img)[0] + "_img.jpg", np.uint8(img)[:, :, (2, 1, 0)]) cv2.imwrite('results/' + args.model_type + '/' + os.path.splitext(args.img)[0] + "_exp.jpg", np.uint8(img_integrated_gradient[:, :, (2, 1, 0)])) cv2.imwrite('results/' + args.model_type + '/' + args.img, np.uint8(output_img)) print(np.uint8(np.max(img_integrated_gradient, 2)))
d_map[i][j] = 2 for (i, j) in [(0, 1), (1, 0), (n - 2, 0), (n - 1, 1), (0, n - 2), (1, n - 1), (n - 2, n - 1), (n - 1, n - 2)]: d_map[i][j] = 3 for (i, j) in [(1, 1), (1, n - 2), (n - 2, 1), (n - 2, n - 2)]: d_map[i][j] = 4 dic_position_degree = {(x, y): d_map[x][y] for x in range(n) for y in range(n)} #key=position,value=degree dic_position_steps = {} #key=position,value=which step next = (start_x, start_y) for step in range(n * n): (x, y) = next #stand at (x,y) now dic_position_steps[(x, y)] = step #update dic_position_steps update_dic_position_degree(x, y, dic_position_degree) tmp = 999999 #find the next move for (i, j) in [(1, -2), (2, -1), (2, 1), (1, 2), (-1, 2), (-2, 1), (-2, -1), (-1, -2)]: if (x + i) >= 0 and (x + i) < n and (y + j) >= 0 and ( y + j) < n and dic_position_degree[(x + i), (y + j)] < tmp: tmp = dic_position_degree[(x + i), (y + j)] next = ((x + i), (y + j)) if dic_position_degree[ next] > 8 and step != n * n - 1: #The KnightTour couldn't be finished return n, dic_position_steps, 2 return n, dic_position_steps, 0 (N, dic, errCode) = knightTour(20, 4, 5) #20*20 Matrix ,start at (4,5) visualize(N, dic, errCode)
net.load_weights(args.load_weights) if args.play_games: logger.info("Playing for %d game(s)" % args.play_games) stats.reset() agent.play(args.play_games) stats.write(0, "play") if args.visualization_file: from visualization import visualize # use states recorded during gameplay. NB! Check buffer size, that it can accomodate one game! states = [agent.mem.getState(i) for i in xrange(agent.history_length, agent.mem.current - agent.random_starts)] logger.info("Collected %d game states" % len(states)) import numpy as np states = np.array(states) states = states / 255. visualize(net.model, states, args.visualization_filters, args.visualization_file) sys.exit() if args.random_steps: # populate replay memory with random steps logger.info("Populating replay memory with %d random moves" % args.random_steps) stats.reset() agent.play_random(args.random_steps) stats.write(0, "random") # loop over epochs for epoch in xrange(args.epochs): logger.info("Epoch #%d" % (epoch + 1)) if args.train_steps: logger.info(" Training for %d steps" % args.train_steps)
def generate_chunk(start_time, end_time, raw_audio, truth_labels, boundary_mask_vec, spectrogram_info):
    '''
        Generate a data chunk around a given elephant call. The data
        chunk is of size "chunk_length" seconds and has the call
        of interest randomly placed inside the window

        Parameters:
        - start_time and end_time in seconds
        - raw_audio: audio samples; sliced by .wav frame index
        - truth_labels: Gives the ground truth elephant call labelings
        - boundary_mask_vec: Gives the location of the "fuzzy" boundary regions
        around each call where we want to allow for flexibility in prediction
        - spectrogram_info: dict read for the keys 'window',
        'oversize_windows', 'samplerate', 'NFFT', 'hop', 'max_freq', 'pad_to'

        Returns:
        (spectrum, data_labels, boundary_mask) with the spectrum transposed
        to time x freq, or (None, None, None) when the call is skipped
    '''
    window_size = spectrogram_info['window']
    # Flag for whether to sample oversized windows
    oversize = spectrogram_info['oversize_windows']
    # Convert the times to .wav frames to help ensure
    # robustness of approach
    start_frame = int(math.floor(start_time * spectrogram_info['samplerate']))
    end_frame = int(math.ceil(end_time * spectrogram_info['samplerate']))

    # Generate oversized windows to allow for random location sampling of the calls.
    if oversize:
        # Formula is: spect_frames = floor((wav_frames - overlap) / hop)
        len_call_spect_frames = math.floor(((end_frame - start_frame) - (spectrogram_info['NFFT'] - spectrogram_info['hop'])) / spectrogram_info['hop'])
        window_size = 2 * window_size - len_call_spect_frames
        # Calculate the start first based on true window size
        # NOTE(review): window_size has already been enlarged on the line
        # above, so true_chunk_size equals chunk_size below and the chunk
        # always ends exactly at end_frame - confirm this is intended.
        true_chunk_size = (window_size - 1) * spectrogram_info['hop'] + spectrogram_info['NFFT']
        chunk_start = end_frame - true_chunk_size
        # Add the size of the new window
        chunk_size = ((window_size - 1) * spectrogram_info['hop'] + spectrogram_info['NFFT'])
        chunk_end = chunk_start + chunk_size
        #chunk_end = start_frame + true_chunk_size # Somehow off by one?
        # For now skip if at edges
        if chunk_start < 0 or (chunk_end >= raw_audio.shape[0]):
            print ("skipping too long of call") # Maybe don't need this let us see
            return None, None, None
    else:
        # Convert from window size in spectrogram frames to raw audio size
        # Note we use the -1 term to force the correct number of frames
        # wav = frames * hop - hop + window ==> wav = frames * hop + overlap
        chunk_size = (window_size - 1) * spectrogram_info['hop'] + spectrogram_info['NFFT']

        # Padding to call
        call_length = end_frame - start_frame # In .wav frames
        padding_length = chunk_size - call_length
        # if padding_length is negative the call is longer than the chunk:
        # skip it (the caller still moves on to the next call)
        if padding_length < 0:
            print ("skipping too long of call") # Maybe don't need this let us see
            return None, None, None

        # Randomly split the pad to before and after
        pad_front = np.random.randint(0, padding_length + 1)

        # Place the chunk so the call sits pad_front frames after its start
        chunk_start = start_frame - pad_front
        chunk_end = start_frame + call_length + (padding_length - pad_front)

        # Clamp at the front of the file - assumes the sound file is at
        # least one chunk long
        if (chunk_start < 0):
            # Shift the whole chunk so that it starts at frame 0
            chunk_start = 0
            chunk_end = chunk_size

        # See if we have passed the end of the sound file; if so shift the
        # whole chunk back so that it ends at the last frame
        if (chunk_end >= raw_audio.shape[0]):
            chunk_end = raw_audio.shape[0]
            chunk_start = raw_audio.shape[0] - chunk_size

        assert(chunk_end - chunk_start == chunk_size)
        # Make sure the call is fully in the region
        assert(chunk_start <= start_frame and chunk_end >= end_frame)

    NFFT = spectrogram_info['NFFT']
    samplerate = spectrogram_info['samplerate']
    hop = spectrogram_info['hop']
    max_freq = spectrogram_info['max_freq']
    pad_to = spectrogram_info['pad_to']

    # Extract the spectrogram
    [spectrum, freqs, t] = ml.specgram(raw_audio[chunk_start: chunk_end],
            NFFT=NFFT, Fs=samplerate, noverlap=(NFFT - hop), window=ml.window_hanning, pad_to=pad_to)

    # Check our math: the chunk must yield exactly window_size time frames
    assert(spectrum.shape[1] == window_size)

    # Cutout the high frequencies that are not of interest
    spectrum = spectrum[(freqs <= max_freq)]

    # Get the corresponding labels
    # Calculate the relative start time w/r
    # to the entire spectrogram for the given chunk
    start_spec = max(math.ceil((chunk_start - spectrogram_info['NFFT'] / 2.) / spectrogram_info['hop']), 0)
    end_spec = start_spec + spectrum.shape[1]

    data_labels = truth_labels[start_spec: end_spec]
    boundary_mask = boundary_mask_vec[start_spec: end_spec]

    if VERBOSE:
        # Plot the spectrum on a dB scale together with labels and boundaries
        new_features = 10*np.log10(spectrum)
        visualize(new_features.T, labels=data_labels, boundaries=boundary_mask)

    # We want spectrograms to be time x freq
    spectrum = spectrum.T
    return spectrum, data_labels, boundary_mask
def generate_empty_chunks(n, raw_audio, label_vec, boundary_mask_vec, spectrogram_info):
    """
        Generate n empty data chunks by uniformly sampling
        time sections with no elephant calls present

        Returns three parallel lists of length n: the spectrograms
        (time x freq), their label slices and their boundary-mask slices.
    """
    window_size = spectrogram_info['window']
    # Labels and boundary masks are summed so that uncertain boundary regions
    # are excluded from the negative samples as well.
    updated_labels = label_vec + boundary_mask_vec
    # Oversized windows are twice the configured window length.
    if spectrogram_info['oversize_windows']:
        window_size *= 2

    # Walk the labels from the end towards the start, counting the frames
    # since the last call or boundary. Once the count reaches the window
    # length, a window starting at the current index is call-free.
    valid_starts = []
    frames_since_call = 0
    for index in reversed(range(label_vec.shape[0])):
        # Note: >= 1 covers the case where boundary + label = 2
        if updated_labels[index] >= 1:
            frames_since_call = 0
        else:
            frames_since_call += 1
        if frames_since_call >= window_size:
            valid_starts.append(index)

    NFFT = spectrogram_info['NFFT']
    samplerate = spectrogram_info['samplerate']
    hop = spectrogram_info['hop']
    max_freq = spectrogram_info['max_freq']
    pad_to = spectrogram_info['pad_to']

    empty_features = []
    empty_labels = []
    empty_boundary_masks = []
    for _ in range(n):
        # Pick a call-free start uniformly at random (spectrogram frames).
        start = np.random.choice(valid_starts)

        # Convert back to raw audio frames: the number of hops marks the
        # first raw audio frame of the chunk.
        chunk_start = start * spectrogram_info['hop']
        chunk_size = (window_size - 1) * spectrogram_info['hop'] + spectrogram_info['NFFT']
        chunk_end = int(chunk_start + chunk_size)

        # Extract the spectrogram of the chunk
        audio_chunk = raw_audio[chunk_start: chunk_end]
        [spectrum, freqs, t] = ml.specgram(
            audio_chunk, NFFT=NFFT, Fs=samplerate, noverlap=(NFFT - hop),
            window=ml.window_hanning, pad_to=pad_to)
        # Cutout the high frequencies that are not of interest
        spectrum = spectrum[(freqs <= max_freq)]
        assert(spectrum.shape[1] == window_size)

        num_frames = spectrum.shape[1]
        data_labels = label_vec[start : start + num_frames]
        # Make sure that no call exists in the chunk
        assert(np.sum(data_labels) == 0)
        boundary_mask = boundary_mask_vec[start : start + num_frames]
        assert(np.sum(boundary_mask) == 0)

        if VERBOSE:
            new_features = 10*np.log10(spectrum)
            visualize(new_features.T, labels=data_labels)

        # We want spectrograms to be time x freq
        empty_features.append(spectrum.T)
        empty_labels.append(data_labels)
        empty_boundary_masks.append(boundary_mask)

    return empty_features, empty_labels, empty_boundary_masks
centers = kmeans.cluster_centers_ sse[k] = kmeans.inertia_ wcss.append(kmeans.inertia_) print(sse[k]) plt.figure() plt.plot(list(sse.keys()), list(sse.values())) plt.xlabel('Cluster') plt.ylabel('Sum of squared Errors of prediction') outfile = 'results/elbow-plot/kmeans-elbowmethod-result' + '-' + file_code + '.jpg' plt.savefig(outfile) kneedle = KneeLocator(range_n_clusters, wcss, S=1.0, invert=False, direction='decreasing') print('Optimal number of clusters: ', kneedle.knee) # optimal = int(input('Enter optimal number of clusters: ')) kmeans = KMeans(kneedle.knee, random_state=42) labels = kmeans.fit_predict(X.values) visualize(df, labels, file_code) cluster_labels = pd.DataFrame(labels, index=X.index, columns=['Cluster_Labels']) cluster_labels.to_csv('results/labels/labels' + '-' + file_code + '.csv', sep=',', encoding='utf-8', index='True')
def demo():
    """Run the complete demo pipeline, step by step.

    The steps are executed in this order, each printing its own banner:

        1. get a new dataset from Twitter and save it
        2. add locations to the new tweets and save them again
        3. train and validate the classifiers, keep the best one
        4. make predictions on the stored test dataset
        5. show the available filter / sort / group helpers
        6. visualize the number of tweets per country and continent

    Returns:
        None. All results are printed, plotted or written to disk.
    """
    print()
    new_dataset = _demo_get_new_dataset()
    _demo_add_locations(new_dataset)
    vectorizer, best_model = _demo_train_classifiers()
    test_dataset = _demo_make_predictions(vectorizer, best_model)
    _demo_filter_sort_group(test_dataset)
    _demo_visualize(test_dataset)


def _demo_get_new_dataset() -> 'List[Tweet]':
    """Step 1: download fresh tweets for a few keywords and save them."""
    print('\n1. GET NEW DATASET')

    # read Twitter tokens
    consumer_key, consumer_secret, access_token, access_token_secret = \
        read_twitter_tokens('tokens/twitter_tokens.txt')

    # connect with the Twitter API
    twitter_api: tweepy.API = connect_to_twitter_api(consumer_key, consumer_secret,
                                                     access_token, access_token_secret)

    # define keywords
    # Candidate keyword lists, kept for reference:
    # COVID_KEYWORDS: List[str] = [
    #     'corona', 'covid', 'quaranteen', 'home', 'stay', 'inside', 'virology', 'doctor', 'nurse', 'virus', 'grandma',
    #     'vaccin', 'sars', 'alone', 'strongtogether', 'elbow', 'mouth mask', 'protective equipment', 'hospitalization',
    #     'increas', 'death', 'dead', 'impact', 'ICU', 'intensive care', 'applause', 'stay healthy', 'take care', 'risk',
    #     'risk group', 'environment',
    #     'U+1F637',  # Medical Mask Emoji
    #     'U+1F691',  # Ambulance Emoji
    #     'U+1F92E',  # Vomiting Emoji
    #     'U+1F912',  # Thermometer Emoji
    # ]
    # COVID_FAKE_KEYWORDS: List[str] = [
    #     'coronascam', 'fakecorona', 'fake', 'coronahoax', 'hoaxcorona', 'gooutside', 'donotstayhome', 'fuckvirology',
    #     'donttrustvirologists', 'coronadoesntexist', 'chinesevirushoax',
    # ]
    keywords: Dict[str, int] = {
        'covid': 100,       # get 100 tweets with 'covid' in it
        'corona': 100,      # get 100 tweets with 'corona' in it
        'coronahoax': 100,  # get 100 tweets with 'coronahoax' in it
    }

    # get new dataset
    new_dataset: List[Tweet] = get_new_tweets(twitter_api, keywords)
    # The API may return nothing at all; do not crash on new_dataset[0].
    if new_dataset:
        print(f'First tweet:\n{new_dataset[0]}')
    else:
        print('No tweets were retrieved.')

    # save new dataset
    save_tweets(new_dataset, 'tweets/new_dataset.pickle')
    return new_dataset


def _demo_add_locations(new_dataset: 'List[Tweet]') -> None:
    """Step 2: geocode the tweets in place and save the enriched dataset."""
    print('\n2. ADD LOCATION TO THOSE TWEETS')

    # read Google token
    geocoding_api_key: str = read_google_token('tokens/google_token.txt')

    # initialize Google API
    google_api: GoogleV3 = GoogleV3(api_key=geocoding_api_key)

    # add location to tweets when possible, counting how many tweets have a
    # complete location before and after the lookup
    num_tweets_with_location_before: int = 0
    num_tweets_with_location_after: int = 0
    for tweet in new_dataset:
        if tweet.country_code is not None and tweet.continent is not None:
            num_tweets_with_location_before += 1
        tweet.add_location(google_api)
        if tweet.country_code is not None and tweet.continent is not None:
            num_tweets_with_location_after += 1
    print(f'Number of tweets with location before: {num_tweets_with_location_before}')
    print(f'Number of tweets with location after: {num_tweets_with_location_after}')

    # save new dataset with locations included
    save_tweets(new_dataset, 'tweets/new_dataset.pickle')


def _demo_train_classifiers():
    """Step 3: train both classifiers, then refit the better one on all data.

    Returns:
        Tuple ``(vectorizer, best_model)``: the CountVectorizer fitted on the
        whole training corpus and the classifier trained on its output.
    """
    print('\n3. TRAIN CLASSIFIERS')

    # load train dataset
    train_dataset = load_tweets('tweets/train_dataset.pickle')

    # pre-process train dataset
    corpus: List[str] = preprocess_corpus([tweet.text for tweet in train_dataset])
    labels: List[bool] = [tweet.denier for tweet in train_dataset]

    # train on part of the data: train, validation split
    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2)

    # vectorize (fit on the training part only)
    vectorizer: CountVectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # create, train, validate and save the Complement Naive Bayes classifier
    naive_bayes_classifier = ComplementNB()
    naive_bayes_classifier = naive_bayes_classifier.fit(X_train, y_train)
    naive_bayes_accuracy: float = naive_bayes_classifier.score(X_test, y_test)
    print(f'Naive Bayes accuracy:\t{naive_bayes_accuracy * 100:>3.2f}%')
    save_model(naive_bayes_classifier, 'models/naive_bayes.pickle')

    # create, train, validate and save the Decision Tree classifier
    decision_tree_classifier = DecisionTreeClassifier()
    decision_tree_classifier = decision_tree_classifier.fit(X_train, y_train)
    decision_tree_accuracy: float = decision_tree_classifier.score(X_test, y_test)
    print(f'Decision Tree accuracy:\t{decision_tree_accuracy * 100:>3.2f}%')
    save_model(decision_tree_classifier, 'models/decision_tree.pickle')

    # retrain best model on all of the data, with a vectorizer fitted on the
    # full corpus (this is the vectorizer used for the predictions later on)
    vectorizer = CountVectorizer()
    X_all = vectorizer.fit_transform(corpus)
    best_model = ComplementNB().fit(X_all, labels) \
        if naive_bayes_accuracy >= decision_tree_accuracy \
        else DecisionTreeClassifier().fit(X_all, labels)

    # save best model
    save_model(best_model, 'models/best_model.pickle')
    return vectorizer, best_model


def _demo_make_predictions(vectorizer, best_model):
    """Step 4: label every tweet of the stored test dataset.

    Returns:
        The test dataset, with ``tweet.denier`` set to the predicted label.
    """
    print('\n4. USE CLASSIFIERS')

    # load test dataset
    test_dataset = load_tweets('tweets/test_dataset.pickle')

    # pre-processing
    corpus: List[str] = preprocess_corpus([tweet.text for tweet in test_dataset])

    # vectorize
    features = vectorizer.transform(corpus)

    # make predictions
    predictions = best_model.predict(features)

    # add predictions to tweet
    for tweet, label in zip(test_dataset, predictions):
        tweet.denier = label
    return test_dataset


def _demo_filter_sort_group(test_dataset) -> None:
    """Step 5: call every filter, sort and group helper once.

    The results are intentionally not used; the calls only demonstrate the
    available helpers.
    """
    print('\n5. USE VARIOUS FILTERS')

    # use filters
    tweets_filtered_by_hashtag: List[Tweet] = filter_by_hashtag(test_dataset, '#coronahoax')
    tweets_filtered_by_hashtags_all: List[Tweet] = filter_by_hashtags_all(
        test_dataset, ['#corona', '#coronahoax'])
    tweets_filtered_by_hashtags_any: List[Tweet] = filter_by_hashtags_any(
        test_dataset, ['#corona', '#coronahoax', '#coronavirus', '#covid19'])
    tweets_filtered_before: List[Tweet] = filter_before(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_at: List[Tweet] = filter_at(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_after: List[Tweet] = filter_after(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_between: List[Tweet] = filter_between(
        test_dataset, datetime(2020, 4, 19, 18, 0, 0), datetime(2020, 4, 19, 19, 0, 0))
    tweets_filtered_by_country_code: List[Tweet] = filter_by_country_code(test_dataset, 'US')
    tweets_filtered_by_country_codes: List[Tweet] = filter_by_country_codes(test_dataset, ['US', 'GB'])
    tweets_filtered_by_continent: List[Tweet] = filter_by_continent(test_dataset, 'Europe')
    tweets_filtered_by_continents: List[Tweet] = filter_by_continents(
        test_dataset, ['Europe', 'North America'])

    # use sorting
    tweets_sorted_by_date_ascending: List[Tweet] = sort_by_date_ascending(test_dataset)
    tweets_sorted_by_date_descending: List[Tweet] = sort_by_date_descending(test_dataset)

    # use grouping
    tweets_grouped_by_country_code: defaultdict = group_by_country_code(test_dataset)
    tweets_grouped_by_continent: defaultdict = group_by_continent(test_dataset)


def _demo_visualize(test_dataset) -> None:
    """Step 6: plot the absolute number of located tweets in three ways."""
    print('\n6. VISUALIZE')

    # continents: name stored on the tweet -> key used in the plotted series
    # (spellings kept exactly as in the original mapping)
    CONTINENTS: Dict[str, str] = {
        'Asia': 'asia',
        'Europe': 'europe',
        'Africa': 'africa',
        'North America': 'north_america',
        'South America': 'south_america',
        'Oceania': 'oceania',
        'Antarctica': 'antartica',
    }

    # create series to plot
    num_tweets_per_country_per_continent_absolute = defaultdict(lambda: defaultdict(int))
    num_tweets_per_country_absolute = defaultdict(lambda: defaultdict(int))
    num_tweets_per_continent_absolute = defaultdict(lambda: defaultdict(int))
    for tweet in test_dataset:
        if tweet.has_location():
            country_code: str = tweet.country_code.lower()
            continent: str = CONTINENTS[tweet.continent]
            num_tweets_per_country_per_continent_absolute[tweet.continent][country_code] += 1
            num_tweets_per_country_absolute['World'][country_code] += 1
            num_tweets_per_continent_absolute['World'][continent] += 1

    # visualize plots: (title, series, filename, per_continent)
    # NOTE(review): the first plot keeps per_continent=False from the original
    # although its series is keyed by continent -- confirm against visualize().
    plots = [
        ('Absolute number of tweets per country and per continent',
         num_tweets_per_country_per_continent_absolute,
         'num_tweets_per_country_per_continent_absolute',
         False),
        ('Absolute number of tweets per country',
         num_tweets_per_country_absolute,
         'num_tweets_per_country_absolute',
         False),
        ('Absolute number of tweets per continent',
         num_tweets_per_continent_absolute,
         'num_tweets_per_continent_absolute',
         True),
    ]
    for title, series, filename, per_continent in plots:
        visualize(title, series, filename, per_continent=per_continent)
#print constellationNames, len(constellationNames) #print len(starsNeedClustering) # if the user runs kmeans if args.algorithm == 'Kmeans': K = args.K # running K means for 1000 times with 20 centroids standardKMeans = algorithms.KMeansPlusPlus(starsNeedClustering,K) #standardKMeans.randInitCentroid() #standardKMeans.decisiveInitCentroid() #standardKMeans.runStandardKmeansWithIter(2000) #standardKMeans.runStandardKmeansWithoutIter() standardKMeans.runKmeansPlusPlus() # output the stars that belong to centroid 1 # cluster_1 = algorithms.getCluster(1, assignments) visualization.visualize(standardKMeans.assignments, 'Kmeans') #print len(assignments), len(cluster_1), cluster_1 # print centroids, assignments # if the user runs DBSCAN elif args.algorithm == 'DBSCAN': Eps = args.Eps minDist = args.minDist #print Eps, minDist, len(starsNeedClustering) standardDBS = algorithms.densityBasedClustering(starsNeedClustering, Eps, minDist) standardDBS.runDBA() #print standardDBS.getNumOfClusters() noise = standardDBS.getNoise() #print 'Number of noise stars: ', len(noise) visualization.visualize(standardDBS.assignments, 'DBSCAN')