Example #1
def main():
    fem = 0
    male = 0

    for year in range(1975, 2016):
        print('Parsing movies in %d...' % (year))

        # Creates year folders if they don't exist; clears them if they do
        clear_dir('%s/fem/%d' % (DESTFOL, year))
        clear_dir('%s/male/%d' % (DESTFOL, year))

        # Preprocesses data by gender, year, and movie
        females = read_folder_dict('%s/fem/%d' % (SRCFOL, year), year)
        for movie in females:
            fem += preprocess(movie, 'fem')

        males = read_folder_dict('%s/male/%d' % (SRCFOL, year), year)
        for movie in males:
            male += preprocess(movie, 'male')

        print('-----------------------------')

    print('NUMBER OF WORDS SPOKEN')
    print('\tMale:\t\t%d' % (male))
    print('\tFemale:\t\t%d' % (fem))
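Several of these examples call helpers defined elsewhere in the project (clear_dir, read_folder_dict, preprocess) and module-level constants (SRCFOL, DESTFOL). A minimal sketch of what clear_dir and read_folder_dict might look like, assuming the movie texts are plain files named by title; the real project may differ:

import os
import shutil

def clear_dir(path):
    """Create the folder if it is missing; empty it if it already exists."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def read_folder_dict(path, year):
    """Read every file in `path` into a list of title/year/text dicts."""
    movies = []
    for filename in sorted(os.listdir(path)):
        with open(os.path.join(path, filename), encoding='ISO-8859-1') as f:
            movies.append({'title': filename, 'year': year, 'text': f.read()})
    return movies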
Example #2
def main():
    # Creates year folders if they don't exist; clears them if they do
    for year in range(1975, 2016):
        clear_dir('%s/fem/%d' % (DESTFOL, year))
        clear_dir('%s/male/%d' % (DESTFOL, year))

    cornell()
    imsdb()
Example #3
def convert(youtube_url, download_directory, downloads_list):
    """Converts a YouTube video to an MP4 file and serves it as a download."""
    clear_dir(download_directory, downloads_list)
    yt = YouTube(youtube_url)
    stream = yt.streams.first()  # first available stream
    stream.download(download_directory)
    return send_from_directory(download_directory, stream.default_filename,
                               as_attachment=True)
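convert() relies on pytube for the download and Flask for serving the file, and the YouTube examples use a clear_dir variant that takes the folder plus a list of tracked downloads. A hypothetical sketch of the imports and that variant; the real helper may differ:

import os

from flask import request, redirect, render_template, send_from_directory
from pytube import YouTube

def clear_dir(download_directory, downloads_list):
    """Remove previously downloaded files so only the new one is served."""
    for filename in downloads_list:
        filepath = os.path.join(download_directory, filename)
        if os.path.exists(filepath):
            os.remove(filepath)
    downloads_list.clear()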
Example #4
def main():
    # Creates year folders if they don't exist; clears them if they do
    for year in range(1975, 2016):
        clear_dir('%s/%d' % (DESTFOL, year))

    genderize = Genderize(user_agent='GenderizeDocs/0.0',
                          api_key=config.api_key,
                          timeout=60)

    femCount = 0
    maleCount = 0
    unkCount = 0

    for year in range(1975, 2016):

        print('Gendering movies in %d...' % (year))

        movies = read_folder_dict('%s/%d' % (SRCFOL, year), year)

        for movie in movies:
            characters = {}
            title = movie['title']
            lines = movie['text'].split('\n')

            outFile = open('%s/%d/%s' % (DESTFOL, year, title),
                           mode='w',
                           encoding='ISO-8859-1')

            for i in range(2, len(lines)):
                fields = lines[i].split('\t')

                if len(fields) < 2:
                    continue

                name = fields.pop(0)
                gender = getGender(name, characters, genderize)
                outFile.write('%s\t%s\t%s\n' %
                              (name, gender, ' '.join(fields)))

            for gender in characters.values():
                if gender == '?':
                    unkCount += 1
                elif gender == 'f':
                    femCount += 1
                elif gender == 'm':
                    maleCount += 1

            outFile.close()
            print('Finished %s...' % (title))

        print('----------------------------------------')

    print('NUMBER OF CHARACTERS')
    print('\tMale:\t\t%d' % (maleCount))
    print('\tFemale:\t\t%d' % (femCount))
    print('\tUnknown:\t%d' % (unkCount))
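getGender() is not shown in the listing. Judging by the counting loop, it returns 'f', 'm', or '?' and fills the characters cache so each name hits the Genderize API at most once. A hedged sketch, assuming the genderize package's Genderize.get() client:

from genderize import GenderizeException

def getGender(name, characters, genderize):
    """Return 'f', 'm', or '?' for a character, caching per-script results."""
    if name not in characters:
        try:
            first_name = name.strip().split()[0] if name.strip() else name
            result = genderize.get([first_name])[0]
            gender = result['gender']  # 'male', 'female', or None
            characters[name] = gender[0] if gender else '?'
        except GenderizeException:
            characters[name] = '?'
    return characters[name]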
Example #5
def actor():
    """Returns an MP4 file for a valid YouTube URL, else redirects home."""
    val_str = "youtube.com/watch?v="
    youtube_url = str(request.form['youtube_url'])

    try:
        if validate_str(youtube_url, val_str):
            clear_dir(downloads, downloads_list)
            return convert(youtube_url, downloads, downloads_list)
    except Exception:
        clear_dir(downloads, downloads_list)
    # Invalid URL or failed download: send the user back to the index page
    return redirect('/')
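validate_str() isn't shown; judging by its use above, it only needs to confirm that the submitted text contains the YouTube watch pattern. A minimal sketch, assuming a plain substring check:

def validate_str(value, required_substring):
    """True if the expected pattern appears in the submitted URL."""
    return required_substring in value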
Example #6
def main():
    for year in range(1975, 2016):
        # Creates year folders if they don't exist; clears them if they do
        clear_dir('%s/txt/%d' % (FOL, year))

        print('Parsing movies in %d...' % (year))

        # Preprocesses data by gender, year, and movie
        movies = read_folder_dict('%s/html/%d' % (FOL, year), year)
        for movie in movies:
            parse(movie)

        print('-----------------------------')
Example #7
def main():
    # Clears the per-year HTML folders
    for year in range(1975, 2020):
        clear_dir('%s/html/%d' % (FOL, year))

    inFile = open('%s/movie_script_urls.txt' % (FOL), mode='r',
                  encoding='ISO-8859-1')
    outFile = open('%s/movie_metadata.txt' % (FOL), mode='w',
                   encoding='ISO-8859-1')

    counter = 0

    # Strip punctuation from titles to build safe filenames
    regex = re.compile(r'([^\s\w]|_)+')

    for line in inFile:
        fields = line.split('\t')
        title = fields[0]
        year = fields[1]
        url = fields[2]

        filename = regex.sub('', title.lower()).replace(' ', '_')

        try:
            page = urlopen(url)
            soup = BeautifulSoup(page, 'html.parser')
            text = soup.prettify()
            write_file('%s/html/%s/%s.html' % (FOL, year, filename), text)
        except Exception:
            print('Could not download %s.' % (title))
            continue

        outFile.write('%d\t%s' % (counter, line))
        print('Finished downloading %s...' % (title))

        counter += 1

    outFile.close()
    inFile.close()
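write_file() is the counterpart of the open() calls above; a sketch assuming it creates parent folders on demand and writes with the same encoding the rest of the pipeline uses:

import os

def write_file(path, text):
    """Write text to `path`, creating parent folders as needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, mode='w', encoding='ISO-8859-1') as f:
        f.write(text)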
Example #8
def index():
    """Returns Index Page."""
    clear_dir(downloads, downloads_list)
    return render_template("index.html")
Example #9
    def train_model(X, model_params, n_max_epochs, early_stop, model_name, random_sample_size=10, n_folds=5):
        """
            Trains a model for each cross validation fold and
            saves all models along with CBTs to ./output/<model_name>
            Args:
                X (np array): dataset (train+test) with shape [N_Subjects, N_ROIs, N_ROIs, N_Views]
                model_params (dict): model hyperparameters (learning_rate, N_ROIs, n_attr, ...)
                n_max_epochs (int): number of training epochs (if early_stop == True this is the maximum epoch limit)
                early_stop (bool): if set true, model will stop training when overfitting starts.
                model_name (string): name for saving the model
                random_sample_size (int): random subset size for the SNL function
                n_folds (int): number of cross validation folds
            Return:
                models: trained models
        """
        models = []

        save_path = MODEL_WEIGHT_BACKUP_PATH + "/" + model_name + "/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
            
        model_id = str(uuid.uuid4())
        with open(save_path + "model_params.txt", 'w') as f:
            print(model_params, file=f)
        
        CBTs = []
        scores = []
        for i in range(n_folds):
            torch.cuda.empty_cache() 
            print("********* FOLD {} *********".format(i))
            train_data, test_data, train_mean, train_std = helper.preprocess_data_array(X, number_of_folds=n_folds, current_fold_id=i)
            
            test_casted = [d.to(device) for d in helper.cast_data(test_data)]
            loss_weights = torch.tensor(np.array(list((1 / train_mean) / np.max(1 / train_mean)) * len(train_data)), dtype=torch.float32)
            loss_weights = loss_weights.to(device)
            train_casted = [d.to(device) for d in helper.cast_data(train_data)]

            model = DGN(model_params)
            model = model.to(device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=model_params["learning_rate"], weight_decay=0.00)
            targets = [torch.tensor(tensor, dtype=torch.float32).to(device) for tensor in train_data]
            test_errors = []
            tick = time.time()
            
            for epoch in range(n_max_epochs):
                model.train()
                losses = []
                for data in train_casted:
                    # Compose dissimilarity matrix from network outputs
                    cbt = model(data)
                    views_sampled = random.sample(targets, random_sample_size)
                    sampled_targets = torch.cat(views_sampled, axis=2).permute((2, 1, 0))
                    expanded_cbt = cbt.expand((sampled_targets.shape[0], model_params["N_ROIs"], model_params["N_ROIs"]))
                    diff = torch.abs(expanded_cbt - sampled_targets)  # Absolute difference
                    sum_of_all = torch.mul(diff, diff).sum(axis=(1, 2))  # Sum of squares
                    l = torch.sqrt(sum_of_all)  # Square root of the sum
                    losses.append((l * loss_weights[:random_sample_size * model_params["n_attr"]]).sum())
                # Backprop
                optimizer.zero_grad()
                loss = torch.mean(torch.stack(losses))
                loss.backward()
                optimizer.step()
                
                # Track the loss
                if epoch % 10 == 0:
                    cbt = DGN.generate_cbt_median(model, train_casted)
                    rep_loss = DGN.mean_frobenious_distance(cbt, test_casted)
                    tock = time.time()
                    time_elapsed = tock - tick
                    tick = tock
                    rep_loss = float(rep_loss)
                    test_errors.append(rep_loss)
                    print("Epoch: {}  |  Test Rep: {:.2f}  |  Time Elapsed: {:.2f}  |".format(epoch, rep_loss, time_elapsed))
                    # Early stopping control: stop when the test error has
                    # increased over five consecutive evaluations
                    if len(test_errors) > 6 and early_stop:
                        torch.save(model.state_dict(), TEMP_FOLDER + "/weight_" + model_id + "_" + str(rep_loss)[:5] + ".model")
                        last_6 = test_errors[-6:]
                        if all(last_6[i] < last_6[i + 1] for i in range(5)):
                            print("Early Stopping")
                            break
            # Restore best model so far
            try:
                restore = TEMP_FOLDER + "/weight_" + model_id + "_" + str(min(test_errors))[:5] + ".model"
                model.load_state_dict(torch.load(restore))
            except Exception:
                pass
            torch.save(model.state_dict(), save_path + "fold" + str(i) + ".model")
            models.append(model)
            #Generate and save refined CBT
            cbt = DGN.generate_cbt_median(model, train_casted)
            rep_loss = DGN.mean_frobenious_distance(cbt, test_casted)
            cbt = cbt.cpu().numpy()
            CBTs.append(cbt)
            np.save(save_path + "fold" + str(i) + "_cbt", cbt)
            #Save all subject biased CBTs
            all_cbts = DGN.generate_subject_biased_cbts(model, train_casted)
            np.save(save_path + "fold" + str(i) + "_all_cbts", all_cbts)
            scores.append(float(rep_loss))
            print("FINAL RESULTS  REP: {}".format(rep_loss))
            #Clean interim model weights
            helper.clear_dir(TEMP_FOLDER)
        for i, cbt in enumerate(CBTs):
            show_image(cbt, i, scores[i])
        return models
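A minimal, hypothetical invocation of train_model(), assuming it is available at module level; the dataset path and hyperparameter values below are placeholders, not settings from the original project, and DGN may require further keys in model_params:

import numpy as np

X = np.load("dataset.npy")  # shape [N_Subjects, N_ROIs, N_ROIs, N_Views]
model_params = {
    "learning_rate": 0.0005,  # placeholder value
    "N_ROIs": X.shape[1],
    "n_attr": X.shape[3],
}
models = train_model(X, model_params, n_max_epochs=500, early_stop=True,
                     model_name="dgn_demo")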