def main():
    # These listings assume definitions from earlier in this chapter's code:
    # data, sentences, Vocabulary, TextEmbedding, SimpleRnn, cosine_similarity,
    # and sample_from.
    import random
    import tqdm
    from typing import List
    from matplotlib import pyplot as plt

    def text_size(total: int) -> float:
        """equals 8 if total is 0, 28 if total is 200"""
        return 8 + total / 200 * 20

    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,
                 ha='center', va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularity on Job Postings")
    plt.ylabel("Popularity on Resumes")
    plt.axis([0, 100, 0, 100])
    plt.xticks([])
    plt.yticks([])
    # plt.show()
    plt.close()

    import re

    # This is not a great regex, but it works on our data.
    tokenized_sentences = [re.findall("[a-z]+|[.]", sentence.lower())
                           for sentence in sentences]

    # Create a vocabulary (that is, a mapping word -> word_id) based on our text.
    vocab = Vocabulary(word
                       for sentence_words in tokenized_sentences
                       for word in sentence_words)

    from scratch.deep_learning import Tensor, one_hot_encode

    inputs: List[int] = []
    targets: List[Tensor] = []

    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):          # For each word
            for j in [i - 2, i - 1, i + 1, i + 2]:   # take the nearby locations
                if 0 <= j < len(sentence):           # that aren't out of bounds
                    nearby_word = sentence[j]        # and get those words.

                    # Add an input that's the original word_id
                    inputs.append(vocab.get_id(word))

                    # Add a target that's the one-hot-encoded nearby word
                    targets.append(vocab.one_hot_encode(nearby_word))

    # Model for learning word vectors
    from scratch.deep_learning import Sequential, Linear

    random.seed(0)
    EMBEDDING_DIM = 5  # seems like a good size

    # Define the embedding layer separately, so we can reference it.
    embedding = TextEmbedding(vocab=vocab, embedding_dim=EMBEDDING_DIM)

    model = Sequential([
        # Given a word (as a vector of word_ids), look up its embedding.
        embedding,
        # And use a linear layer to compute scores for "nearby words".
        Linear(input_dim=EMBEDDING_DIM, output_dim=vocab.size)
    ])

    # Train the word vector model
    from scratch.deep_learning import SoftmaxCrossEntropy, Momentum, GradientDescent

    loss = SoftmaxCrossEntropy()
    optimizer = GradientDescent(learning_rate=0.01)

    for epoch in range(100):
        epoch_loss = 0.0
        for input, target in zip(inputs, targets):
            predicted = model.forward(input)
            epoch_loss += loss.loss(predicted, target)
            gradient = loss.gradient(predicted, target)
            model.backward(gradient)
            optimizer.step(model)
        print(epoch, epoch_loss)            # Print the loss
        print(embedding.closest("black"))   # and also a few nearest words
        print(embedding.closest("slow"))    # so we can see what's being
        print(embedding.closest("car"))     # learned.

    # Explore most similar words
    pairs = [(cosine_similarity(embedding[w1], embedding[w2]), w1, w2)
             for w1 in vocab.w2i
             for w2 in vocab.w2i
             if w1 < w2]
    pairs.sort(reverse=True)
    print(pairs[:5])

    # Plot word vectors
    plt.close()

    from scratch.working_with_data import pca, transform
    import matplotlib.pyplot as plt

    # Extract the first two principal components and transform the word vectors
    components = pca(embedding.embeddings, 2)
    transformed = transform(embedding.embeddings, components)

    # Scatter the points (and make them white so they're "invisible")
    fig, ax = plt.subplots()
    ax.scatter(*zip(*transformed), marker='.', color='w')

    # Add annotations for each word at its transformed location
    for word, idx in vocab.w2i.items():
        ax.annotate(word, transformed[idx])

    # And hide the axes
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    # plt.show()
    plt.savefig('im/word_vectors')
    plt.gca().clear()
    plt.close()

    from bs4 import BeautifulSoup
    import requests

    url = "https://www.ycombinator.com/topcompanies/"
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')

    # We get the companies twice, so use a set comprehension to deduplicate.
    companies = list({b.text
                      for b in soup("b")
                      if "h4" in b.get("class", ())})
    assert len(companies) == 101

    vocab = Vocabulary([c for company in companies for c in company])

    START = "^"
    STOP = "$"

    # We need to add them to the vocabulary too.
    vocab.add(START)
    vocab.add(STOP)

    HIDDEN_DIM = 32  # You should experiment with different sizes!

    rnn1 = SimpleRnn(input_dim=vocab.size, hidden_dim=HIDDEN_DIM)
    rnn2 = SimpleRnn(input_dim=HIDDEN_DIM, hidden_dim=HIDDEN_DIM)
    linear = Linear(input_dim=HIDDEN_DIM, output_dim=vocab.size)

    model = Sequential([rnn1, rnn2, linear])

    from scratch.deep_learning import softmax

    def generate(seed: str = START, max_len: int = 50) -> str:
        rnn1.reset_hidden_state()  # Reset both hidden states.
        rnn2.reset_hidden_state()
        output = [seed]            # Start the output with the specified seed.

        # Keep going until we produce the STOP character or reach the max length
        while output[-1] != STOP and len(output) < max_len:
            # Use the last character as the input
            input = vocab.one_hot_encode(output[-1])

            # Generate scores using the model
            predicted = model.forward(input)

            # Convert them to probabilities and draw a random char_id
            probabilities = softmax(predicted)
            next_char_id = sample_from(probabilities)

            # Add the corresponding char to our output
            output.append(vocab.get_word(next_char_id))

        # Get rid of START and STOP characters and return the word.
        return ''.join(output[1:-1])

    loss = SoftmaxCrossEntropy()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9)

    for epoch in range(300):
        random.shuffle(companies)    # Train in a different order each epoch.
        epoch_loss = 0               # Track the loss.
        for company in tqdm.tqdm(companies):
            rnn1.reset_hidden_state()  # Reset both hidden states.
            rnn2.reset_hidden_state()
            company = START + company + STOP  # Add START and STOP characters.

            # The rest is just our usual training loop, except that the inputs
            # and target are the one-hot-encoded previous and next characters.
            for prev, next in zip(company, company[1:]):
                input = vocab.one_hot_encode(prev)
                target = vocab.one_hot_encode(next)
                predicted = model.forward(input)
                epoch_loss += loss.loss(predicted, target)
                gradient = loss.gradient(predicted, target)
                model.backward(gradient)
                optimizer.step(model)

        # Each epoch print the loss and also generate a name
        print(epoch, epoch_loss, generate())

        # Turn down the learning rate for the last 100 epochs.
        # There's no principled reason for this, but it seems to work.
        if epoch == 200:
            optimizer.lr *= 0.1
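
# The generate() function above calls a sample_from helper that is not defined
# in this listing (it is presumably defined elsewhere in the accompanying code).
# As an illustrative sketch only, a weighted-sampling helper consistent with how
# it is used there, returning index i with probability weights[i] / sum(weights),
# might look like this:
import random
from typing import List

def sample_from(weights: List[float]) -> int:
    """Sketch: returns i with probability weights[i] / sum(weights)."""
    total = sum(weights)
    rnd = total * random.random()      # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                       # return the smallest i such that
        if rnd <= 0:                   # weights[0] + ... + weights[i] >= rnd
            return i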
def main():
    # Replace this with the locations of your files.
    # This points to the current directory; modify if your files are elsewhere.
    MOVIES = "u.item"   # pipe-delimited: movie_id|title|...
    RATINGS = "u.data"  # tab-delimited: user_id, movie_id, rating, timestamp

    from typing import NamedTuple

    class Rating(NamedTuple):
        user_id: str
        movie_id: str
        rating: float

    import csv
    # We specify this encoding to avoid a UnicodeDecodeError.
    # See: https://stackoverflow.com/a/53136168/1076346
    with open(MOVIES, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="|")
        movies = {movie_id: title for movie_id, title, *_ in reader}

    # Create a list of [Rating]
    with open(RATINGS, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="\t")
        ratings = [Rating(user_id, movie_id, float(rating))
                   for user_id, movie_id, rating, _ in reader]

    # 1682 movies rated by 943 users
    assert len(movies) == 1682
    assert len(list({rating.user_id for rating in ratings})) == 943

    import re

    # Data structure for accumulating ratings by movie_id
    star_wars_ratings = {movie_id: []
                         for movie_id, title in movies.items()
                         if re.search("Star Wars|Empire Strikes|Jedi", title)}

    # Iterate over ratings, accumulating the Star Wars ones
    for rating in ratings:
        if rating.movie_id in star_wars_ratings:
            star_wars_ratings[rating.movie_id].append(rating.rating)

    # Compute the average rating for each movie
    avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
                   for movie_id, title_ratings in star_wars_ratings.items()]

    # And then print them in order
    for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
        print(f"{avg_rating:.2f} {movies[movie_id]}")

    import random
    random.seed(0)
    random.shuffle(ratings)

    split1 = int(len(ratings) * 0.7)
    split2 = int(len(ratings) * 0.85)

    train = ratings[:split1]              # 70% of the data
    validation = ratings[split1:split2]   # 15% of the data
    test = ratings[split2:]               # 15% of the data

    avg_rating = sum(rating.rating for rating in train) / len(train)
    baseline_error = sum((rating.rating - avg_rating) ** 2
                         for rating in test) / len(test)

    # This is what we hope to do better than
    assert 1.26 < baseline_error < 1.27

    # Embedding vectors for matrix factorization model
    from scratch.deep_learning import random_tensor

    EMBEDDING_DIM = 2

    # Find unique ids
    user_ids = {rating.user_id for rating in ratings}
    movie_ids = {rating.movie_id for rating in ratings}

    # Then create a random vector per id
    user_vectors = {user_id: random_tensor(EMBEDDING_DIM)
                    for user_id in user_ids}
    movie_vectors = {movie_id: random_tensor(EMBEDDING_DIM)
                     for movie_id in movie_ids}

    # Training loop for matrix factorization model
    from typing import List
    import tqdm
    from scratch.linear_algebra import dot

    def loop(dataset: List[Rating], learning_rate: float = None) -> None:
        with tqdm.tqdm(dataset) as t:
            loss = 0.0
            for i, rating in enumerate(t):
                movie_vector = movie_vectors[rating.movie_id]
                user_vector = user_vectors[rating.user_id]
                predicted = dot(user_vector, movie_vector)
                error = predicted - rating.rating
                loss += error ** 2

                if learning_rate is not None:
                    # predicted = m_0 * u_0 + ... + m_k * u_k
                    # So each u_j enters output with coefficient m_j
                    # and each m_j enters output with coefficient u_j
                    user_gradient = [error * m_j for m_j in movie_vector]
                    movie_gradient = [error * u_j for u_j in user_vector]

                    # Take gradient steps
                    for j in range(EMBEDDING_DIM):
                        user_vector[j] -= learning_rate * user_gradient[j]
                        movie_vector[j] -= learning_rate * movie_gradient[j]

                t.set_description(f"avg loss: {loss / (i + 1)}")

    learning_rate = 0.05
    for epoch in range(20):
        learning_rate *= 0.9
        print(epoch, learning_rate)
        loop(train, learning_rate=learning_rate)
    loop(validation)
    loop(test)

    from collections import defaultdict
    from scratch.working_with_data import pca, transform

    original_vectors = [vector for vector in movie_vectors.values()]
    components = pca(original_vectors, 2)

    ratings_by_movie = defaultdict(list)
    for rating in ratings:
        ratings_by_movie[rating.movie_id].append(rating.rating)

    vectors = [
        (movie_id,
         sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
         movies[movie_id],
         vector)
        for movie_id, vector in zip(movie_vectors.keys(),
                                    transform(original_vectors, components))
    ]

    # Print top 25 and bottom 25 by first principal component
    print(sorted(vectors, key=lambda v: v[-1][0])[:25])
    print(sorted(vectors, key=lambda v: v[-1][0])[-25:])
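
# The gradient step inside loop() uses the fact that, for
# predicted = dot(user_vector, movie_vector) and squared error,
# d(error^2)/d(user_vector[j]) = 2 * error * movie_vector[j]; the constant 2 is
# simply absorbed into the learning rate.  As an illustrative sanity check only
# (not part of the original pipeline), a finite-difference comparison on a tiny
# hypothetical example:
def _gradient_check_sketch() -> None:
    from scratch.linear_algebra import dot

    user, movie, rating = [0.1, -0.2], [0.3, 0.5], 4.0
    error = dot(user, movie) - rating
    analytic = [error * m_j for m_j in movie]   # gradient as used above (factor 2 dropped)

    eps = 1e-6
    for j in range(len(user)):
        bumped = user[:]
        bumped[j] += eps
        # Finite-difference estimate of d(error^2)/d(user[j]), halved to match
        # the dropped factor of 2.
        numeric = ((dot(bumped, movie) - rating) ** 2 - error ** 2) / eps / 2
        assert abs(numeric - analytic[j]) < 1e-4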