Example #1
import os
from collections import defaultdict


def dtw_dictionary():
    '''
    Takes the .npy files in the arrays folder,
    uses load_plotline to produce a smooth plot (x, y values),
    and computes the pairwise distances with Dynamic Time Warping.

    returns:
    --------
    nested dictionary mapping each pair of filenames to their DTW distance
    '''
    # Loop through the script files
    path_to_file = '../data/emotions/arrays'
    files = os.listdir(path_to_file)
    legit_files = [
        filename[:-4] for filename in files if filename[-3:] == 'npy'
    ]
    Ntot = len(legit_files)
    # prepare all the arrays
    print("Preparing all the files")
    list_smooth_arrays = []
    index = 0
    for filename in legit_files:
        arr = prepare_smooth_array(filename)
        list_smooth_arrays.append(arr)
        index += 1
        progression_bar(index, Ntot, Nbars=60, char='-')

    # looking at the similarity between the plots
    print("\n")
    print("Computing all the distances")
    full_dictionary = defaultdict(dict)
    index = 0
    for index1 in range(len(legit_files)):
        filename1 = legit_files[index1]
        arr1 = list_smooth_arrays[index1]
        for index2 in range(index1 + 1, len(legit_files)):
            filename2 = legit_files[index2]
            arr2 = list_smooth_arrays[index2]
            min_distance = get_distance(arr1, arr2)
            full_dictionary[filename1][filename2] = min_distance
            full_dictionary[filename2][filename1] = min_distance
        index += 1
        progression_bar(index, Ntot, Nbars=60, char='-')
    return full_dictionary
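
The get_distance helper is defined elsewhere in the project and is not shown
here. As a point of reference only, a minimal sketch of a DTW distance between
two 1-D numpy arrays (a hypothetical stand-in, not the project's actual
implementation) could look like this:

import numpy as np

def dtw_distance(arr1, arr2):
    # classic O(n*m) dynamic-programming DTW on two 1-D sequences
    n, m = len(arr1), len(arr2)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(arr1[i - 1] - arr2[j - 1])
            cost[i, j] = d + min(cost[i - 1, j],      # step in arr1 only
                                 cost[i, j - 1],      # step in arr2 only
                                 cost[i - 1, j - 1])  # step in both
    return cost[n, m]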
Example #2
    if not os.path.exists('../../data/scraping/texts'):
        os.mkdir('../../data/scraping/texts')
        print('making ../../data/scraping/texts folder')

    # List all the available movies, and the corresponding URL links
    movies = get_all_movies()
    print(check_movie_info(movies))

    # Write all the scripts (in the texts folder) and the summary of the movies
    # in .csv format (in the scraping folder)
    browser = webdriver.Chrome(ChromeDriverManager().install())
    for i, movie in enumerate(movies):
        try:
            handle_movie(movie, browser)
        except IndexError:
            continue
        progression_bar(i, len(movies))
'''comments on the scraping results
movies that were not scraped successfully:
-------
3 texts were manually removed:
Apollo 13 & Scary Movie 2 (no actual script available on the website)
(zero-byte file - the result of findAll is a little different than
    for other movies: as only one movie was in this situation, it is ignored)

list of pdfs:
--------------
8 Mile was pdf
A.I. was pdf
Back to the Future was pdf
Back to the Future II & III was pdf
Batman and Robin was pdf
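
The snippet assumes the module-level imports of the project (get_all_movies,
check_movie_info, handle_movie and progression_bar are project helpers defined
elsewhere). The third-party imports would be something along these lines; both
packages expose exactly these entry points:

import os
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager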
Example #3
        list_emotions.append(emotion_counts(window, emotion_dict, vocabulary))
        if index % 10 == 0 and verbose:
            print(index)
    array_emotions = np.array(list_emotions)
    if print_to_file:
        path_to_file = "../data/emotions/arrays/"+filename
        np.save(path_to_file, array_emotions)
    return array_emotions

if __name__ == '__main__':
    NRC_emotions_file = '../data/emotions/NRC_emotions.txt'
    print('Loading the NRC emotions database, please wait.')
    emotion_dictionary, vocabulary = \
                            load_dictionary_and_vocabulary(NRC_emotions_file)

    # Create the proper directories
    if not os.path.exists('../data/emotions/arrays'):
        os.mkdir('../data/emotions/arrays')

    # Loop through the script files
    path_to_file = '../data/scraping/texts'
    files = os.listdir(path_to_file)
    index = 1
    legit_files = [filename for filename in files if filename[-3:] == 'txt']
    Ntot = len(legit_files)
    for filename in legit_files:
        get_emotions(filename[:-4], path_to_file, emotion_dictionary,
                     vocabulary, print_to_file=True, verbose=False)
        progression_bar(index, Ntot, Nbars=60, char='-')
        index += 1
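
emotion_counts and load_dictionary_and_vocabulary are project helpers defined
elsewhere. As a rough sketch only, assuming emotion_dict maps each word to its
ten 0/1 NRC flags (eight emotions plus positive/negative), emotion_counts
could look like:

import numpy as np

def emotion_counts(window, emotion_dict, vocabulary):
    # hypothetical sketch: sum the NRC flags of every word the lexicon knows
    counts = np.zeros(10)
    for word in window:
        if word in vocabulary:
            counts += np.array(emotion_dict[word])
    return counts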
Example #4
        display(hbox5)


################
if __name__ == '__main__':
    # Saving pngs
    user_input = input("Do you want to save plots as png (y/n) > ")
    save_png = (user_input == 'y')

    # Create the directory for pngs
    if not os.path.exists('../data/emotions/graphs') and save_png:
        os.mkdir('../data/emotions/graphs')

    # Loop through the script files
    path_to_file = '../data/emotions/arrays'
    files = os.listdir(path_to_file)
    index = 1
    legit_files = [
        filename[:-4] for filename in files if filename[-3:] == 'npy'
    ]
    Ntot = len(legit_files)
    chosen_emotions = list(range(5)) + list(range(7, 10))
    # all emotions except positive and negative
    for filename in legit_files:
        plotline = LoadPlotLine(filename)
        plotline.load_emotions()
        plotline.visualisation_for_emotions(list_emotions=chosen_emotions,
                                            save_png=save_png)
        progression_bar(index, Ntot, Nbars=60, char='-')
        index += 1
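
progression_bar is the small helper shared by all of these scripts but is not
shown in the examples. A minimal sketch matching the call signature used above
(a plausible reconstruction, not the project's actual code) could be:

import sys

def progression_bar(index, Ntot, Nbars=60, char='-'):
    # redraw a one-line text progress bar in place
    done = int(Nbars * index / float(Ntot))
    bar = char * done + ' ' * (Nbars - done)
    sys.stdout.write('\r[{}] {}/{}'.format(bar, index, Ntot))
    sys.stdout.flush()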
Example #5
    if not os.path.exists('../data/scraping'):
        os.mkdir('../data/scraping')
        print('making ../data/scraping folder')
    if not os.path.exists('../data/scraping/texts'):
        os.mkdir('../data/scraping/texts')
        print('making ../data/scraping/texts folder')

    # List all the available movies, and the corresponding URL links
    movies = get_all_movies()
    print(check_movie_info(movies))

    # Write all the scripts (in texts folder) and the summary of the movies
    # in .csv format (in scraping folder)
    browser = webdriver.Firefox()
    for i, movie in enumerate(movies):
        handle_movie(movie, browser)
        progression_bar(i, len(movies))

'''comments on the scraping results
movies that were not scraped successfully:
-------
3 texts were manually removed:
Apollo 13 & Scary Movie 2 (no actual script available on the website)
(zero-byte file - the result of findAll is a little different than
    for other movies: as only one movie was in this situation, it is ignored)

list of pdfs:
--------------
8 Mile was pdf
A.I. was pdf
Back to the Future was pdf
Back to the Future II & III was pdf
Example #6
from collections import defaultdict


def investigate_stability(movies, distances, times_run, k):
    '''
    Explores the stability of the clusters: the clustering (with 100
    initializations to improve the chance of reaching the global minimum)
    is run multiple times.

    parameters:
    -----------
    movies: list of movies
    distances: square array of pairwise distances
    times_run: number of times the clustering is run (for instance 6 or 10)
    k: number of clusters

    returns:
    --------
    dictionary with the stable sets, keyed by medoid
    '''
    list_runs = []  # will collect the clusters (movies as filenames) of every run
                    # for instance, with k = 3:
                    # [[set1, set2, set3], [set1, set2, set3], ...]
    print('Progression of clustering')
    for i in range(times_run):
        progression_bar(i, times_run - 1, Nbars=times_run - 1, char='-')
        # for this run, get the clusters
        m1, m2, c, d1, d2 = chosen_num_cluster(movies, k, distances)
        list_sets = [] # will receive the sets: [set1, set2, set3]
        for key in d2.keys():
            s1 = set(d2[key])
            s1.add(key)
            list_sets.append(s1)
        list_runs.append(list_sets)

    medoids = m2  # the medoids can vary between runs, but they should stay in
                  # the same clusters; this lets us match up clusters across runs

    # figure out the corresponding clusters from the various runs and bring them together
    d = defaultdict(list)
    # dictionary: d[medoid] = [set1_from_run1, set3_from_run2, ...] containing the medoid
    for run in list_runs:
        for set_ in run:
            for medoid in medoids:
                if medoid in set_: #identify the right cluster
                    d[medoid].append(set_)

    # finally, intersect the sets collected across all the runs
    d_intersection = {}
    for medoid, list_sets in d.items():
        d_intersection[medoid] = set.intersection(*list_sets)

    # print the stability: how many movies are always together in the same cluster
    num_movies = 0
    total_num_movies = len(movies)
    print('\n')
    print('*' * 50)
    print('**' + ' ' * 12 + 'stability of clusters' + ' ' * 13 + '**')
    print('*' * 50)
    for intersect_set in d_intersection.values():
        num_movies += len(intersect_set)
        print('number of movies always in the cluster: ' + str(len(intersect_set)))
    print('-' * 50)
    print('movies that are always in the same cluster: ')
    print('in numbers: {} | in percent: {:.1f}%'.format(
        num_movies, num_movies * 100.0 / total_num_movies))
    print('*' * 50)

    return d_intersection
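
A typical call, assuming the pairwise distances have already been assembled
into a square array (movies, distances and chosen_num_cluster come from the
project; the parameter values here are purely illustrative):

stable_clusters = investigate_stability(movies, distances, times_run=10, k=3)
for medoid, members in stable_clusters.items():
    print(medoid, '->', sorted(members))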