def dtw_dictionary():
    '''
    takes in the .npy files in arrays
    uses the load_plotline module to produce a smooth plot (x, y values)
    computes the distance thanks to Dynamic Time Warping

    returns:
    --------
    full_dictionary: dict of dicts holding the pairwise DTW distance
    between every two movies
    '''
    # Loop through the script files
    path_to_file = '../data/emotions/arrays'
    files = os.listdir(path_to_file)
    legit_files = [filename[:-4] for filename in files
                   if filename[-3:] == 'npy']
    Ntot = len(legit_files)

    # prepare all arrays
    print "Preparing all the files"
    list_smooth_arrays = []
    index = 0
    for filename in legit_files:
        arr = prepare_smooth_array(filename)
        list_smooth_arrays.append(arr)
        index += 1
        progression_bar(index, Ntot, Nbars=60, char='-')

    # looking at the similarity in the plots
    print "\n"
    print "Computing all the distances"
    full_dictionary = defaultdict(dict)
    index = 0
    for index1 in xrange(len(legit_files)):
        filename1 = legit_files[index1]
        arr1 = list_smooth_arrays[index1]
        for index2 in xrange(index1 + 1, len(legit_files)):
            filename2 = legit_files[index2]
            arr2 = list_smooth_arrays[index2]
            min_distance = get_distance(arr1, arr2)
            # store the distance symmetrically
            full_dictionary[filename1][filename2] = min_distance
            full_dictionary[filename2][filename1] = min_distance
        index += 1
        progression_bar(index, Ntot, Nbars=60, char='-')
    return full_dictionary
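# get_distance is defined elsewhere in this project. For reference, below is
# a minimal sketch of the classic dynamic-time-warping distance it is assumed
# to compute; the name dtw_distance_sketch is hypothetical, the inputs are
# assumed to be 1-D arrays, and numpy is assumed imported as np.
def dtw_distance_sketch(a, b):
    n, m = len(a), len(b)
    # cost[i, j] = best alignment cost between a[:i] and b[:j]
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.
    for i in xrange(1, n + 1):
        for j in xrange(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j],      # insertion
                                 cost[i, j - 1],      # deletion
                                 cost[i - 1, j - 1])  # match
    return cost[n, m]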
os.mkdir('../../data/scraping/texts')
print('making ../../data/scraping/texts folder')

# List all the available movies, and the corresponding URL links
movies = get_all_movies()
print(check_movie_info(movies))

# Write all the scripts (in texts folder) and the summary of the movies
# in .csv format (in scraping folder)
browser = webdriver.Chrome(ChromeDriverManager().install())
for i, movie in enumerate(movies):
    try:
        handle_movie(movie, browser)
    except IndexError:
        continue
    progression_bar(i, len(movies))

'''comments on the scraping results

movies that were not scraped successfully:
------------------------------------------
3 texts are manually removed:
Apollo 13 & Scary Movie 2 (no actual script available on the website)
(zero-octet file - the result of the findAll is a little different than
for other movies: as only one movie was in this situation, it is ignored)

list of pdfs:
-------------
8 Mile was pdf
A.I. was pdf
Back to the Future was pdf
Back to the Future II & III was pdf
Batman and Robin was pdf
        list_emotions.append(emotion_counts(window, emotion_dict, vocabulary))
        if index % 10 == 0 and verbose:
            print index
    array_emotions = np.array(list_emotions)
    if print_to_file:
        path_to_file = "../data/emotions/arrays/" + filename
        np.save(path_to_file, array_emotions)
    return array_emotions


if __name__ == '__main__':
    NRC_emotions_file = '../data/emotions/NRC_emotions.txt'
    print 'Loading the NRC emotions database, please wait.'
    emotion_dictionary, vocabulary = \
        load_dictionary_and_vocabulary(NRC_emotions_file)

    # Create the proper directories
    if not os.path.exists('../data/emotions/arrays'):
        os.mkdir('../data/emotions/arrays')

    # Loop through the script files
    path_to_file = '../data/scraping/texts'
    files = os.listdir(path_to_file)
    index = 1
    legit_files = [filename for filename in files if filename[-3:] == 'txt']
    Ntot = len(legit_files)
    for filename in legit_files:
        get_emotions(filename[:-4], path_to_file, emotion_dictionary,
                     vocabulary, print_to_file=True, verbose=False)
        progression_bar(index, Ntot, Nbars=60, char='-')
        index += 1
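# emotion_counts is defined earlier in this module; below is a minimal sketch
# of the behaviour assumed above (counting, for every NRC category, how many
# words of the window carry that emotion). The layout
# emotion_dict[word] = list of ten 0/1 flags is an assumption, as is the
# name emotion_counts_sketch.
def emotion_counts_sketch(window, emotion_dict, vocabulary):
    counts = np.zeros(10)  # one slot per NRC emotion category
    for word in window:
        if word in vocabulary:
            counts += np.array(emotion_dict[word])
    return counts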
    display(hbox5)

################

if __name__ == '__main__':
    # Saving pngs
    user_input = raw_input("Do you want to save plots as png (y/n) > ")
    save_png = (user_input == 'y')

    # Create the directory for pngs
    if not os.path.exists('../data/emotions/graphs') and save_png:
        os.mkdir('../data/emotions/graphs')

    # Loop through the script files
    path_to_file = '../data/emotions/arrays'
    files = os.listdir(path_to_file)
    index = 1
    legit_files = [filename[:-4] for filename in files
                   if filename[-3:] == 'npy']
    Ntot = len(legit_files)
    # all emotions except positive and negative
    # (indices 5 and 6 in the NRC ordering)
    chosen_emotions = range(5) + range(7, 10)
    for filename in legit_files:
        plotline = LoadPlotLine(filename)
        plotline.load_emotions()
        plotline.visualisation_for_emotions(list_emotions=chosen_emotions,
                                            save_png=save_png)
        progression_bar(index, Ntot, Nbars=60, char='-')
        index += 1
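# progression_bar is imported from a helper module of this project and is
# used throughout these scripts; a minimal sketch of the behaviour assumed
# here (an in-place text progress bar on stdout) could look like this. The
# name progression_bar_sketch is hypothetical.
import sys

def progression_bar_sketch(index, Ntot, Nbars=60, char='-'):
    filled = Nbars * index / Ntot  # integer division under Python 2
    bar = char * filled + ' ' * (Nbars - filled)
    sys.stdout.write('\r[%s] %d/%d' % (bar, index, Ntot))
    sys.stdout.flush()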
if not os.path.exists('../data/scraping'):
    os.mkdir('../data/scraping')
    print 'making ../data/scraping folder'
if not os.path.exists('../data/scraping/texts'):
    os.mkdir('../data/scraping/texts')
    print 'making ../data/scraping/texts folder'

# List all the available movies, and the corresponding URL links
movies = get_all_movies()
print check_movie_info(movies)

# Write all the scripts (in texts folder) and the summary of the movies
# in .csv format (in scraping folder)
browser = webdriver.Firefox()
for i, movie in enumerate(movies):
    handle_movie(movie, browser)
    progression_bar(i, len(movies))

'''comments on the scraping results

movies that were not scraped successfully:
------------------------------------------
3 texts are manually removed:
Apollo 13 & Scary Movie 2 (no actual script available on the website)
(zero-octet file - the result of the findAll is a little different than
for other movies: as only one movie was in this situation, it is ignored)

list of pdfs:
-------------
8 Mile was pdf
A.I. was pdf
Back to the Future was pdf
Back to the Future II & III was pdf
def investigate_stability(movies, distances, times_run, k):
    '''
    exploring the stability of the clusters
    the clustering (with 100 initialisations to improve the chance of
    reaching the global minimum) is run multiple times

    parameters:
    -----------
    movies: list of movies
    distances: square array of pairwise distances
    times_run: number of times the clustering is run (for instance 6 or 10)
    k: number of clusters

    returns:
    --------
    dictionary with the stable sets
    '''
    list_runs = []  # will get all the movies as filename in the clusters
    # for instance k = 3:
    # [[set1, set2, set3], [set1, set2, set3]]
    print 'Progression of clustering'
    for i in xrange(times_run):
        progression_bar(i, times_run - 1, Nbars=times_run - 1, char='-')
        # for a run, let's get the clusters
        m1, m2, c, d1, d2 = chosen_num_cluster(movies, k, distances)
        list_sets = []  # will receive the sets: [set1, set2, set3]
        for key in d2.keys():
            s1 = set(d2[key])
            s1.add(key)
            list_sets.append(s1)
        list_runs.append(list_sets)
    medoids = m2
    # the medoids can vary between runs, but they should stay in the same
    # clusters; this lets us identify matching clusters across runs

    # figure out corresponding clusters from the various runs,
    # and bring them together
    d = defaultdict(list)
    # dictionary: d[medoid] = [set1_from_run1, set3_from_run2, ...],
    # each set containing the medoid
    for run in list_runs:
        for set_ in run:
            for medoid in medoids:
                if medoid in set_:  # identify the right cluster
                    d[medoid].append(set_)

    # finally, intersect the sets
    d_intersection = {}
    for medoid in d.keys():
        list_sets = d[medoid]
        for i in range(len(list_sets)):
            if i == 0:
                set_intersect = list_sets[i]
            else:
                set_intersect = set_intersect.intersection(list_sets[i])
        d_intersection[medoid] = set_intersect

    # print the stability: how many movies are always together
    # in the same cluster
    num_movies = 0
    total_num_movies = len(movies)
    print '\n'
    print '*' * 50
    print '**' + ' ' * 12 + 'stability of clusters' + ' ' * 13 + '**'
    print '*' * 50
    for intersect_set in d_intersection.values():
        num_movies += len(intersect_set)
        print 'number of movies always in the cluster: ' + str(len(intersect_set))
        print '-' * 50
    print 'movies that are always in the same cluster: '
    print 'in numbers: ' + str(num_movies) + ' | in percent: ' + \
        str(num_movies * 1. / total_num_movies * 100) + '%'
    print '*' * 50
    return d_intersection
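# Hypothetical end-to-end usage sketch: build the DTW distance dictionary,
# turn it into a square array, then check which movies always cluster
# together across 10 runs. This assumes dtw_dictionary and
# investigate_stability live in the same module; the variable names are
# illustrative only.
if __name__ == '__main__':
    full_dictionary = dtw_dictionary()
    movies = sorted(full_dictionary.keys())
    # symmetric distance matrix; the diagonal (self-distance) defaults to 0
    distances = np.array([[full_dictionary[m1].get(m2, 0.) for m2 in movies]
                          for m1 in movies])
    stable_sets = investigate_stability(movies, distances, times_run=10, k=3)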