def main():
    fem = 0
    male = 0
    for year in range(1975, 2016):
        print('Parsing movies in %d...' % (year))
        # Creates year folders if they don't exist; clears them if they do
        clear_dir('%s/fem/%d' % (DESTFOL, year))
        clear_dir('%s/male/%d' % (DESTFOL, year))
        # Preprocesses data by gender, year, and movie
        females = read_folder_dict('%s/fem/%d' % (SRCFOL, year), year)
        for movie in females:
            fem += preprocess(movie, 'fem')
        males = read_folder_dict('%s/male/%d' % (SRCFOL, year), year)
        for movie in males:
            male += preprocess(movie, 'male')
    print('-----------------------------')
    print('NUMBER OF WORDS SPOKEN')
    print('\tMale:\t\t%d' % (male))
    print('\tFemale:\t\t%d' % (fem))
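# clear_dir and read_folder_dict are not shown in this excerpt. A minimal
# sketch matching the call sites above, clear_dir(path) and
# read_folder_dict(path, year); the bodies are assumptions, not the
# original helpers:
import os
import shutil

def clear_dir(path):
    """Create `path` if missing; clear its contents if it exists."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def read_folder_dict(path, year):
    """Read every file in `path` into a dict of title, year, and raw text."""
    movies = []
    for filename in sorted(os.listdir(path)):
        with open(os.path.join(path, filename), encoding='ISO-8859-1') as f:
            movies.append({'title': filename, 'year': year, 'text': f.read()})
    return movies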
def main():
    # Creates year folders if they don't exist; clears them if they do
    for year in range(1975, 2016):
        clear_dir('%s/fem/%d' % (DESTFOL, year))
        clear_dir('%s/male/%d' % (DESTFOL, year))
    cornell()
    imsdb()
def convert(youtube_url, download_directory, downloads_list):
    """Converts Youtube Video to MP4 File."""
    clear_dir(download_directory, downloads_list)
    yt = YouTube(youtube_url)
    stream = yt.streams.first()
    stream.download(download_directory)
    return send_from_directory(download_directory, stream.default_filename,
                               as_attachment=True)
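# Note: yt.streams.first() is not guaranteed to return a progressive MP4.
# A sketch of a more explicit pick using pytube's documented stream filters;
# this helper is not part of the original code:
def pick_mp4_stream(yt):
    stream = (yt.streams
                .filter(progressive=True, file_extension='mp4')
                .order_by('resolution')
                .desc()
                .first())
    return stream or yt.streams.first()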
def main():
    # Creates year folders if they don't exist; clears them if they do
    for year in range(1975, 2016):
        clear_dir('%s/%d' % (DESTFOL, year))
    genderize = Genderize(user_agent='GenderizeDocs/0.0',
                          api_key=config.api_key, timeout=60)
    femCount = 0
    maleCount = 0
    unkCount = 0
    for year in range(1975, 2016):
        print('Gendering movies in %d...' % (year))
        movies = read_folder_dict('%s/%d' % (SRCFOL, year), year)
        for movie in movies:
            characters = {}
            title = movie['title']
            lines = movie['text'].split('\n')
            outFile = open('%s/%d/%s' % (DESTFOL, year, title),
                           mode='w', encoding='ISO-8859-1')
            # Skip the two header lines, then tag each speaker with a gender
            for i in range(2, len(lines)):
                fields = lines[i].split('\t')
                if len(fields) < 2:
                    continue
                name = fields.pop(0)
                gender = getGender(name, characters, genderize)
                outFile.write('%s\t%s\t%s\n' % (name, gender, ' '.join(fields)))
            for gender in characters.values():
                if gender == '?':
                    unkCount += 1
                elif gender == 'f':
                    femCount += 1
                elif gender == 'm':
                    maleCount += 1
            outFile.close()
            print('Finished %s...' % (title))
    print('----------------------------------------')
    print('NUMBER OF CHARACTERS')
    print('\tMale:\t\t%d' % (maleCount))
    print('\tFemale:\t\t%d' % (femCount))
    print('\tUnknown:\t%d' % (unkCount))
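# getGender is not shown in this excerpt. A minimal sketch, assuming it
# caches each character's gender code in `characters` and queries the
# Genderize client on a cache miss; only the name and signature come from
# the call above, and the 'f'/'m'/'?' codes match the counts in main().
# The body and the GenderizeException fallback are assumptions:
from genderize import GenderizeException

def getGender(name, characters, genderize):
    if name not in characters:
        first = name.split()[0].capitalize()
        try:
            result = genderize.get([first])[0]
            # Genderize returns 'male', 'female', or None; keep the first letter
            characters[name] = result['gender'][0] if result['gender'] else '?'
        except GenderizeException:
            characters[name] = '?'
    return characters[name]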
def actor():
    """Returns an MP4 file for a valid YouTube URL; otherwise redirects home."""
    val_str = "youtube.com/watch?v="
    youtube_url = str(request.form['youtube_url'])
    val_str_res = validate_str(youtube_url, val_str)
    try:
        if val_str_res:
            clear_dir(downloads, downloads_list)
            return convert(youtube_url, downloads, downloads_list)
        # Invalid URL: fall through to the home page instead of returning None
        return redirect('/')
    except Exception:
        clear_dir(downloads, downloads_list)
        return redirect('/')
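# validate_str is not shown here; given the call above, it presumably just
# checks that the submitted URL contains the expected YouTube prefix.
# A one-line sketch (the body is an assumption):
def validate_str(youtube_url, val_str):
    return val_str in youtube_url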
def main():
    for year in range(1975, 2016):
        # Creates year folders if they don't exist; clears them if they do
        clear_dir('%s/txt/%d' % (FOL, year))
        print('Parsing movies in %d...' % (year))
        # Parses the downloaded HTML scripts by year and movie
        movies = read_folder_dict('%s/html/%d' % (FOL, year), year)
        for movie in movies:
            parse(movie)
    print('-----------------------------')
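# parse is defined elsewhere. A minimal sketch, assuming the scripts were
# saved as prettified HTML (see the download step below) and the screenplay
# text sits in a <pre> tag, as on IMSDb pages; the <pre> heuristic and the
# output path are assumptions:
from bs4 import BeautifulSoup

def parse(movie):
    soup = BeautifulSoup(movie['text'], 'html.parser')
    pre = soup.find('pre')
    text = pre.get_text() if pre else soup.get_text()
    write_file('%s/txt/%d/%s' % (FOL, movie['year'], movie['title']), text)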
def main():
    # Clears directory
    for year in range(1975, 2020):
        clear_dir('%s/html/%d' % (FOL, year))
    inFile = open('%s/movie_script_urls.txt' % (FOL), mode='r',
                  encoding='ISO-8859-1')
    outFile = open('%s/movie_metadata.txt' % (FOL), mode='w',
                   encoding='ISO-8859-1')
    counter = 0
    # Strips punctuation from titles to build safe filenames
    regex = re.compile(r'([^\s\w]|_)+')
    for line in inFile:
        fields = line.split('\t')
        title = fields[0]
        year = fields[1]
        url = fields[2]
        filename = regex.sub('', title.lower()).replace(' ', '_')
        try:
            page = urlopen(url)
            soup = BeautifulSoup(page, 'html.parser')
            text = soup.prettify()
            write_file('%s/html/%s/%s.html' % (FOL, year, filename), text)
        except Exception:
            print('Could not download %s.' % (title))
            continue
        outFile.write('%d\t%s' % (counter, line))
        print('Finished downloading %s...' % (title))
        counter += 1
    outFile.close()
    inFile.close()
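# write_file is assumed to be a small helper along these lines; the name
# and (path, text) signature come from the calls above, while the body is
# a guess chosen to match the encoding used elsewhere in this code:
import os

def write_file(path, text):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, mode='w', encoding='ISO-8859-1') as f:
        f.write(text)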
def index():
    """Returns Index Page."""
    clear_dir(downloads, downloads_list)
    return render_template("index.html")
def train_model(X, model_params, n_max_epochs, early_stop, model_name,
                random_sample_size=10, n_folds=5):
    """
    Trains a model for each cross validation fold and saves all models
    along with CBTs to ./output/<model_name>
        Args:
            X (np array): dataset (train+test) with shape [N_Subjects, N_ROIs, N_ROIs, N_Views]
            n_max_epochs (int): number of training epochs (if early_stop == True this is the maximum epoch limit)
            early_stop (bool): if set to True, the model stops training when overfitting starts
            model_name (string): name for saving the model
            random_sample_size (int): random subset size for the SNL function
            n_folds (int): number of cross validation folds
        Return:
            models: trained models
    """
    models = []
    save_path = MODEL_WEIGHT_BACKUP_PATH + "/" + model_name + "/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    model_id = str(uuid.uuid4())
    with open(save_path + "model_params.txt", 'w') as f:
        print(model_params, file=f)

    CBTs = []
    scores = []
    for i in range(n_folds):
        torch.cuda.empty_cache()
        print("********* FOLD {} *********".format(i))
        train_data, test_data, train_mean, train_std = helper.preprocess_data_array(
            X, number_of_folds=n_folds, current_fold_id=i)
        test_casted = [d.to(device) for d in helper.cast_data(test_data)]
        loss_weights = torch.tensor(
            np.array(list((1 / train_mean) / np.max(1 / train_mean)) * len(train_data)),
            dtype=torch.float32)
        loss_weights = loss_weights.to(device)
        train_casted = [d.to(device) for d in helper.cast_data(train_data)]
        model = DGN(model_params)
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=model_params["learning_rate"],
                                      weight_decay=0.00)
        targets = [torch.tensor(tensor, dtype=torch.float32).to(device)
                   for tensor in train_data]
        test_errors = []
        tick = time.time()

        for epoch in range(n_max_epochs):
            model.train()
            losses = []
            for data in train_casted:
                # Compose dissimilarity matrix from network outputs
                cbt = model(data)
                views_sampled = random.sample(targets, random_sample_size)
                sampled_targets = torch.cat(views_sampled, axis=2).permute((2, 1, 0))
                expanded_cbt = cbt.expand((sampled_targets.shape[0],
                                           model_params["N_ROIs"],
                                           model_params["N_ROIs"]))
                diff = torch.abs(expanded_cbt - sampled_targets)    # Absolute difference
                sum_of_all = torch.mul(diff, diff).sum(axis=(1, 2)) # Sum of squares
                l = torch.sqrt(sum_of_all)                          # Square root of the sum
                losses.append((l * loss_weights[:random_sample_size * model_params["n_attr"]]).sum())
            # Backprop
            optimizer.zero_grad()
            loss = torch.mean(torch.stack(losses))
            loss.backward()
            optimizer.step()

            # Track the loss
            if epoch % 10 == 0:
                cbt = DGN.generate_cbt_median(model, train_casted)
                rep_loss = DGN.mean_frobenious_distance(cbt, test_casted)
                tock = time.time()
                time_elapsed = tock - tick
                tick = tock
                rep_loss = float(rep_loss)
                test_errors.append(rep_loss)
                print("Epoch: {} | Test Rep: {:.2f} | Time Elapsed: {:.2f} |".format(
                    epoch, rep_loss, time_elapsed))
                # Early stopping control: stop if the test error has risen
                # across the last six checkpoints
                if len(test_errors) > 6 and early_stop:
                    torch.save(model.state_dict(),
                               TEMP_FOLDER + "/weight_" + model_id + "_" + str(rep_loss)[:5] + ".model")
                    last_6 = test_errors[-6:]
                    if all(last_6[i] < last_6[i + 1] for i in range(5)):
                        print("Early Stopping")
                        break
        # Restore best model so far
        try:
            restore = TEMP_FOLDER + "/weight_" + model_id + "_" + str(min(test_errors))[:5] + ".model"
            model.load_state_dict(torch.load(restore))
        except Exception:
            # No checkpoint was saved yet; keep the current weights
            pass
        torch.save(model.state_dict(), save_path + "fold" + str(i) + ".model")
        models.append(model)

        # Generate and save refined CBT
        cbt = DGN.generate_cbt_median(model, train_casted)
        rep_loss = DGN.mean_frobenious_distance(cbt, test_casted)
        cbt = cbt.cpu().numpy()
        CBTs.append(cbt)
        np.save(save_path + "fold" + str(i) + "_cbt", cbt)
        # Save all subject biased CBTs
        all_cbts = DGN.generate_subject_biased_cbts(model, train_casted)
        np.save(save_path + "fold" + str(i) + "_all_cbts", all_cbts)
        scores.append(float(rep_loss))
        print("FINAL RESULTS REP: {}".format(rep_loss))
        # Clean interim model weights
        helper.clear_dir(TEMP_FOLDER)

    for i, cbt in enumerate(CBTs):
        show_image(cbt, i, scores[i])
    return models
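# A hedged usage sketch for train_model. Only N_ROIs, n_attr, and
# learning_rate are keys the function itself reads; the file path, the
# values, and any extra keys the DGN constructor may need are assumptions:
if __name__ == "__main__":
    X = np.load("dataset.npy")  # hypothetical path; shape [N_Subjects, N_ROIs, N_ROIs, N_Views]
    model_params = {
        "N_ROIs": X.shape[1],
        "n_attr": X.shape[3],
        "learning_rate": 0.0005,
    }
    models = train_model(X, model_params, n_max_epochs=500, early_stop=True,
                         model_name="dgn_demo")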