nltk.data.path.append('/home/maxim/bin/nltk')
os.chdir(os.path.dirname(os.path.realpath(__file__)))

########################################################################################################################
# Data
########################################################################################################################
# Declare model parameters
embedding_size = 200
vocabulary_size = 2000
batch_size = 100
max_words = 100

texts, target = text_helpers.load_movie_data()

# Normalize text
stops = stopwords.words('english')
texts = text_helpers.normalize_text(texts, stops)

# Keep only texts with more than 2 words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

# Split up data set into train/test
train_indices = np.random.choice(len(target), round(0.8 * len(target)), replace=False)
test_indices = np.array(list(set(range(len(target))) - set(train_indices)))
texts_train = [x for ix, x in enumerate(texts) if ix in train_indices]
texts_test = [x for ix, x in enumerate(texts) if ix in test_indices]
target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
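# Hypothetical sketch of what text_helpers.normalize_text is assumed to do above:
# lower-case each review, strip punctuation and digits, and drop the supplied stop
# words. The real helper may differ; this only documents the expected contract.
import string

def normalize_text_sketch(texts, stops):
    normalized = []
    for text in texts:
        text = text.lower()
        text = ''.join(c for c in text if c not in string.punctuation)  # remove punctuation
        text = ''.join(c for c in text if not c.isdigit())              # remove digits
        words = [w for w in text.split() if w not in stops]             # remove stop words
        normalized.append(' '.join(words))
    return normalized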
# Add checkpoints to training
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# Declare stop words
stops = stopwords.words('english')

# We pick some test words. We are expecting synonyms to appear
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']
# Later we will have to transform these into indices

# Load the movie review data
print('Loading Data')
texts, target = text_helpers.load_movie_data()

# Normalize text
print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)

# Texts must contain at least 3 words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

# Build our data set and dictionaries
print('Creating Dictionary')
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_helpers.text_to_numbers(texts, word_dictionary)
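# Hypothetical sketch of the dictionary helpers used above, assuming the common
# word2vec convention: index 0 is reserved for rare/unknown words and the remaining
# indices cover the vocabulary_size - 1 most frequent words. The real text_helpers
# implementation may differ in details.
import collections

def build_dictionary_sketch(sentences, vocabulary_size):
    words = [w for sentence in sentences for w in sentence.split()]
    counts = [('RARE', -1)]
    counts.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    return {word: ix for ix, (word, _) in enumerate(counts)}

def text_to_numbers_sketch(sentences, word_dict):
    # Unknown words map to index 0 ('RARE').
    return [[word_dict.get(w, 0) for w in sentence.split()] for sentence in sentences]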
# Add checkpoints to training
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# Declare stop words
#stops = stopwords.words('english')
stops = []

# We pick a few test words for validation.
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']
# Later we will have to transform these into indices

# Load the movie review data
print('Loading Data')
texts, target = text_helpers.load_movie_data(data_folder_name)

# Normalize text
print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)

# Texts must contain more than window_size words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > window_size]
texts = [x for x in texts if len(x.split()) > window_size]
assert(len(target) == len(texts))

# Build our data set and dictionaries
print('Creating Dictionary')
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_helpers.text_to_numbers(texts, word_dictionary)
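# Hypothetical sketch of why sentences with at most window_size words are dropped
# above: a fixed-width (context, target) pair needs window_size context words plus
# one target word. The batch generator in text_helpers may build windows differently;
# this only illustrates the filtering rationale.
def context_target_pairs_sketch(token_ids, window_size):
    """Yield (context, target) pairs of fixed width from one sentence."""
    pairs = []
    for i in range(window_size, len(token_ids)):
        context = token_ids[i - window_size:i]   # the window_size preceding words
        target = token_ids[i]                    # the word to predict
        pairs.append((context, target))
    return pairs

# A sentence with exactly window_size tokens yields no pairs, hence the filter.
# context_target_pairs_sketch([1, 2, 3], 3)        -> []
# context_target_pairs_sketch([1, 2, 3, 4, 5], 3)  -> [([1, 2, 3], 4), ([2, 3, 4], 5)]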
# Add checkpoints to training
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# Declare stop words
stops = stopwords.words('english')

# We pick some test words. We are expecting synonyms to appear
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']
# Later we will have to transform these into indices

# Load the movie review data:
# - texts is a list of strings (sentences)
# - target is a list of 0 or 1 (unused in this example).
texts, _ = text_helpers.load_movie_data()

# Normalize text
texts = text_helpers.normalize_text(texts, stops)
texts = [x for x in texts if len(x.split()) > 2]

# Build our data set and dictionaries
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_helpers.text_to_numbers(texts, word_dictionary)

# Get validation word keys
valid_examples = [word_dictionary[x] for x in valid_words]

########################################################################################################################
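# Hypothetical sketch of how the valid_examples indices are typically used during
# training: print the nearest neighbours of each validation word by cosine
# similarity. `embeddings` is assumed to be a (vocabulary_size, embedding_size)
# numpy array produced by the model; the actual validation code may differ.
import numpy as np

def print_nearest_sketch(embeddings, valid_examples, word_dictionary_rev, top_k=5):
    normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    for idx in valid_examples:
        sims = normalized @ normalized[idx]        # cosine similarity to every word
        nearest = (-sims).argsort()[1:top_k + 1]   # skip the word itself
        neighbours = ', '.join(word_dictionary_rev[k] for k in nearest)
        print('Nearest to {}: {}'.format(word_dictionary_rev[idx], neighbours))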