Example #1
def main():
    import random
    from typing import List

    import tqdm
    from matplotlib import pyplot as plt

    # NOTE: `data`, `sentences`, `Vocabulary`, `TextEmbedding`, `SimpleRnn`,
    # `sample_from`, and `cosine_similarity` are defined elsewhere in the
    # module this snippet comes from and are assumed to already be in scope.

    def text_size(total: int) -> float:
        """equals 8 if total is 0, 28 if total is 200"""
        return 8 + total / 200 * 20

    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity,
                 resume_popularity,
                 word,
                 ha='center',
                 va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularity on Job Postings")
    plt.ylabel("Popularity on Resumes")
    plt.axis([0, 100, 0, 100])
    plt.xticks([])
    plt.yticks([])
    # plt.show()

    plt.close()

    import re

    # This is not a great regex, but it works on our data.
    tokenized_sentences = [
        re.findall("[a-z]+|[.]", sentence.lower()) for sentence in sentences
    ]

    # Create a vocabulary (that is, a mapping word -> word_id) based on our text.
    vocab = Vocabulary(word for sentence_words in tokenized_sentences
                       for word in sentence_words)

    from scratch.deep_learning import Tensor, one_hot_encode

    inputs: List[int] = []
    targets: List[Tensor] = []

    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):  # For each word
            for j in [i - 2, i - 1, i + 1, i + 2]:  # take the nearby locations
                if 0 <= j < len(sentence):  # that aren't out of bounds
                    nearby_word = sentence[j]  # and get those words.

                    # Add an input that's the original word_id
                    inputs.append(vocab.get_id(word))

                    # Add a target that's the one-hot-encoded nearby word
                    targets.append(vocab.one_hot_encode(nearby_word))
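
    # Sanity check: inputs and targets are built in lockstep, one pair per
    # (word, nearby word) combination. For a tokenized sentence like
    # ["the", "quick", "fox"] (a made-up example), that's six pairs:
    # (the, quick), (the, fox), (quick, the), (quick, fox), (fox, the), (fox, quick).
    assert len(inputs) == len(targets)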

    # Model for learning word vectors

    from scratch.deep_learning import Sequential, Linear

    random.seed(0)
    EMBEDDING_DIM = 5  # seems like a good size

    # Define the embedding layer separately, so we can reference it.
    embedding = TextEmbedding(vocab=vocab, embedding_dim=EMBEDDING_DIM)

    model = Sequential([
        # Given a word (as a vector of word_ids), look up its embedding.
        embedding,
        # And use a linear layer to compute scores for "nearby words".
        Linear(input_dim=EMBEDDING_DIM, output_dim=vocab.size)
    ])
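
    # Shapes, for reference: `embedding` maps a word_id to a vector of length
    # EMBEDDING_DIM, and the Linear layer maps that vector to vocab.size
    # scores, one per candidate "nearby" word.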

    # Train the word vector model

    from scratch.deep_learning import SoftmaxCrossEntropy, Momentum, GradientDescent

    loss = SoftmaxCrossEntropy()
    optimizer = GradientDescent(learning_rate=0.01)

    for epoch in range(100):
        epoch_loss = 0.0
        for input, target in zip(inputs, targets):
            predicted = model.forward(input)
            epoch_loss += loss.loss(predicted, target)
            gradient = loss.gradient(predicted, target)
            model.backward(gradient)
            optimizer.step(model)
        print(epoch, epoch_loss)  # Print the loss
        print(embedding.closest("black"))  # and also a few nearest words
        print(embedding.closest("slow"))  # so we can see what's being
        print(embedding.closest("car"))  # learned.

    # Explore most similar words

    pairs = [(cosine_similarity(embedding[w1], embedding[w2]), w1, w2)
             for w1 in vocab.w2i for w2 in vocab.w2i if w1 < w2]
    pairs.sort(reverse=True)
    print(pairs[:5])
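
    # A quick cross-check of the similarity measure itself, as a self-contained
    # sketch: the vectors here are made up, not taken from the model, and
    # cosine similarity is dot(v, w) / (|v| * |w|).
    import math
    from scratch.linear_algebra import dot
    v, w = [1.0, 0.0], [1.0, 1.0]
    assert abs(dot(v, w) / (math.sqrt(dot(v, v)) * math.sqrt(dot(w, w)))
               - 1 / math.sqrt(2)) < 1e-6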

    # Plot word vectors
    plt.close()

    from scratch.working_with_data import pca, transform
    import matplotlib.pyplot as plt

    # Extract the first two principal components and transform the word vectors
    components = pca(embedding.embeddings, 2)
    transformed = transform(embedding.embeddings, components)

    # Scatter the points (and make them white so they're "invisible")
    fig, ax = plt.subplots()
    ax.scatter(*zip(*transformed), marker='.', color='w')

    # Add annotations for each word at its transformed location
    for word, idx in vocab.w2i.items():
        ax.annotate(word, transformed[idx])

    # And hide the axes
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    # plt.show()

    plt.savefig('im/word_vectors')
    plt.gca().clear()
    plt.close()

    from bs4 import BeautifulSoup
    import requests

    url = "https://www.ycombinator.com/topcompanies/"
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')

    # We get the companies twice, so use a set comprehension to deduplicate.
    companies = list({b.text for b in soup("b") if "h4" in b.get("class", ())})
    assert len(companies) == 101
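
    # (If the page layout has changed since this was written, the scrape and
    # the assert above will fail; everything below only needs a list of
    # company-name strings, however you obtain it.)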

    vocab = Vocabulary([c for company in companies for c in company])

    START = "^"
    STOP = "$"

    # We need to add them to the vocabulary too.
    vocab.add(START)
    vocab.add(STOP)

    HIDDEN_DIM = 32  # You should experiment with different sizes!

    rnn1 = SimpleRnn(input_dim=vocab.size, hidden_dim=HIDDEN_DIM)
    rnn2 = SimpleRnn(input_dim=HIDDEN_DIM, hidden_dim=HIDDEN_DIM)
    linear = Linear(input_dim=HIDDEN_DIM, output_dim=vocab.size)

    model = Sequential([rnn1, rnn2, linear])

    from scratch.deep_learning import softmax

    def generate(seed: str = START, max_len: int = 50) -> str:
        rnn1.reset_hidden_state()  # Reset both hidden states.
        rnn2.reset_hidden_state()
        output = [seed]  # Start the output with the specified seed.

        # Keep going until we produce the STOP character or reach the max length
        while output[-1] != STOP and len(output) < max_len:
            # Use the last character as the input
            input = vocab.one_hot_encode(output[-1])

            # Generate scores using the model
            predicted = model.forward(input)

            # Convert them to probabilities and draw a random char_id
            probabilities = softmax(predicted)
            next_char_id = sample_from(probabilities)

            # Add the corresponding char to our output
            output.append(vocab.get_word(next_char_id))

        # Get rid of START and END characters and return the word.
        return ''.join(output[1:-1])
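
    # `sample_from` is assumed to draw an index i with probability
    # proportional to probabilities[i]; a minimal sketch of that behavior
    # (not necessarily the actual implementation) would be:
    #
    #     def sample_from(weights):
    #         rnd = random.uniform(0, sum(weights))  # pick a point in [0, total]
    #         for i, w in enumerate(weights):
    #             rnd -= w                           # find the weight that covers it
    #             if rnd <= 0:
    #                 return i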

    loss = SoftmaxCrossEntropy()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9)

    for epoch in range(300):
        random.shuffle(companies)  # Train in a different order each epoch.
        epoch_loss = 0  # Track the loss.
        for company in tqdm.tqdm(companies):
            rnn1.reset_hidden_state()  # Reset both hidden states.
            rnn2.reset_hidden_state()
            company = START + company + STOP  # Add START and STOP characters.

            # The rest is just our usual training loop, except that the inputs
            # and target are the one-hot-encoded previous and next characters.
            for prev, next in zip(company, company[1:]):
                input = vocab.one_hot_encode(prev)
                target = vocab.one_hot_encode(next)
                predicted = model.forward(input)
                epoch_loss += loss.loss(predicted, target)
                gradient = loss.gradient(predicted, target)
                model.backward(gradient)
                optimizer.step(model)

        # Each epoch print the loss and also generate a name
        print(epoch, epoch_loss, generate())

        # Turn down the learning rate for the last 100 epochs.
        # There's no principled reason for this, but it seems to work.
        if epoch == 200:
            optimizer.lr *= 0.1
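
# The code above leans on a Vocabulary helper (a word <-> id mapping that can
# also one-hot encode). A minimal sketch of the interface it assumes -- an
# illustrative stand-in, not necessarily the exact implementation used above:
class Vocabulary:
    def __init__(self, words=None):
        self.w2i = {}  # word -> word_id
        self.i2w = {}  # word_id -> word
        for word in (words or []):
            self.add(word)

    @property
    def size(self) -> int:
        return len(self.w2i)

    def add(self, word: str) -> None:
        if word not in self.w2i:
            word_id = len(self.w2i)
            self.w2i[word] = word_id
            self.i2w[word_id] = word

    def get_id(self, word: str) -> int:
        return self.w2i[word]

    def get_word(self, word_id: int) -> str:
        return self.i2w[word_id]

    def one_hot_encode(self, word: str) -> list:
        word_id = self.get_id(word)
        return [1.0 if i == word_id else 0.0 for i in range(self.size)]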
Example #2
def main():

    # Replace this with the locations of your files

    # This points to the current directory, modify if your files are elsewhere.
    MOVIES = "u.item"  # pipe-delimited: movie_id|title|...
    RATINGS = "u.data"  # tab-delimited: user_id, movie_id, rating, timestamp

    from typing import NamedTuple

    class Rating(NamedTuple):
        user_id: str
        movie_id: str
        rating: float

    import csv
    # We specify this encoding to avoid a UnicodeDecodeError.
    # see: https://stackoverflow.com/a/53136168/1076346
    with open(MOVIES, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="|")
        movies = {movie_id: title for movie_id, title, *_ in reader}

    # Create a list of [Rating]
    with open(RATINGS, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="\t")
        ratings = [
            Rating(user_id, movie_id, float(rating))
            for user_id, movie_id, rating, _ in reader
        ]

    # 1682 movies rated by 943 users
    assert len(movies) == 1682
    assert len(list({rating.user_id for rating in ratings})) == 943

    import re

    # Data structure for accumulating ratings by movie_id
    star_wars_ratings = {
        movie_id: []
        for movie_id, title in movies.items()
        if re.search("Star Wars|Empire Strikes|Jedi", title)
    }

    # Iterate over ratings, accumulating the Star Wars ones
    for rating in ratings:
        if rating.movie_id in star_wars_ratings:
            star_wars_ratings[rating.movie_id].append(rating.rating)

    # Compute the average rating for each movie
    avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
                   for movie_id, title_ratings in star_wars_ratings.items()]

    # And then print them in order
    for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
        print(f"{avg_rating:.2f} {movies[movie_id]}")

    import random
    random.seed(0)
    random.shuffle(ratings)

    split1 = int(len(ratings) * 0.7)
    split2 = int(len(ratings) * 0.85)

    train = ratings[:split1]  # 70% of the data
    validation = ratings[split1:split2]  # 15% of the data
    test = ratings[split2:]  # 15% of the data

    avg_rating = sum(rating.rating for rating in train) / len(train)
    baseline_error = sum(
        (rating.rating - avg_rating)**2 for rating in test) / len(test)

    # This is what we hope to do better than
    assert 1.26 < baseline_error < 1.27
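
    # Toy numbers to make the baseline concrete (made up, not MovieLens data):
    # if the train ratings were [4, 2], avg_rating would be 3.0, and a test
    # rating of 5 would contribute (5 - 3.0) ** 2 == 4.0 to the error sum.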

    # Embedding vectors for matrix factorization model

    from scratch.deep_learning import random_tensor

    EMBEDDING_DIM = 2

    # Find unique ids
    user_ids = {rating.user_id for rating in ratings}
    movie_ids = {rating.movie_id for rating in ratings}

    # Then create a random vector per id
    user_vectors = {
        user_id: random_tensor(EMBEDDING_DIM)
        for user_id in user_ids
    }
    movie_vectors = {
        movie_id: random_tensor(EMBEDDING_DIM)
        for movie_id in movie_ids
    }

    # Training loop for matrix factorization model

    from typing import List
    import tqdm
    from scratch.linear_algebra import dot

    def loop(dataset: List[Rating], learning_rate: float = None) -> None:
        with tqdm.tqdm(dataset) as t:
            loss = 0.0
            for i, rating in enumerate(t):
                movie_vector = movie_vectors[rating.movie_id]
                user_vector = user_vectors[rating.user_id]
                predicted = dot(user_vector, movie_vector)
                error = predicted - rating.rating
                loss += error**2

                if learning_rate is not None:
                    #     predicted = m_0 * u_0 + ... + m_k * u_k
                    # So each u_j enters output with coefficient m_j
                    # and each m_j enters output with coefficient u_j
                    user_gradient = [error * m_j for m_j in movie_vector]
                    movie_gradient = [error * u_j for u_j in user_vector]

                    # Take gradient steps
                    for j in range(EMBEDDING_DIM):
                        user_vector[j] -= learning_rate * user_gradient[j]
                        movie_vector[j] -= learning_rate * movie_gradient[j]

                t.set_description(f"avg loss: {loss / (i + 1)}")
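
    # Where those gradients come from: with predicted = dot(user_vector,
    # movie_vector), the squared error (predicted - rating)^2 has derivative
    # 2 * error * m_j with respect to u_j (and 2 * error * u_j with respect
    # to m_j); the constant factor of 2 is absorbed into the learning rate.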

    learning_rate = 0.05
    for epoch in range(20):
        learning_rate *= 0.9
        print(epoch, learning_rate)
        loop(train, learning_rate=learning_rate)
        loop(validation)
    loop(test)
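
    # Usage sketch: once trained, the predicted rating for any (user, movie)
    # pair seen in the data is just the dot product of its two learned vectors.
    # (train[0] is simply whichever rating happens to come first after shuffling.)
    example = train[0]
    example_prediction = dot(user_vectors[example.user_id],
                             movie_vectors[example.movie_id])
    print(f"predicted {example_prediction:.2f} vs. actual {example.rating}")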

    from scratch.working_with_data import pca, transform

    original_vectors = [vector for vector in movie_vectors.values()]
    components = pca(original_vectors, 2)

    from collections import defaultdict
    ratings_by_movie = defaultdict(list)
    for rating in ratings:
        ratings_by_movie[rating.movie_id].append(rating.rating)

    vectors = [
        (movie_id,
         sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
         movies[movie_id], vector) for movie_id, vector in zip(
             movie_vectors.keys(), transform(original_vectors, components))
    ]

    # Print top 25 and bottom 25 by first principal component
    print(sorted(vectors, key=lambda v: v[-1][0])[:25])
    print(sorted(vectors, key=lambda v: v[-1][0])[-25:])
Example #3
def main():
    
    # Replace this with the locations of your files
    
    # This points to the current directory, modify if your files are elsewhere.
    MOVIES = "u.item"   # pipe-delimited: movie_id|title|...
    RATINGS = "u.data"  # tab-delimited: user_id, movie_id, rating, timestamp
    
    from typing import NamedTuple
    
    class Rating(NamedTuple):
        user_id: str
        movie_id: str
        rating: float
    
    import csv
    # We specify this encoding to avoid a UnicodeDecodeError.
    # see: https://stackoverflow.com/a/53136168/1076346
    with open(MOVIES, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="|")
        movies = {movie_id: title for movie_id, title, *_ in reader}
    
    # Create a list of [Rating]
    with open(RATINGS, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="\t")
        ratings = [Rating(user_id, movie_id, float(rating))
                   for user_id, movie_id, rating, _ in reader]
    
    # 1682 movies rated by 943 users
    assert len(movies) == 1682
    assert len(list({rating.user_id for rating in ratings})) == 943
    
    import re
    
    # Data structure for accumulating ratings by movie_id
    star_wars_ratings = {movie_id: []
                         for movie_id, title in movies.items()
                         if re.search("Star Wars|Empire Strikes|Jedi", title)}
    
    # Iterate over ratings, accumulating the Star Wars ones
    for rating in ratings:
        if rating.movie_id in star_wars_ratings:
            star_wars_ratings[rating.movie_id].append(rating.rating)
    
    # Compute the average rating for each movie
    avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
                   for movie_id, title_ratings in star_wars_ratings.items()]
    
    # And then print them in order
    for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
        print(f"{avg_rating:.2f} {movies[movie_id]}")
    
    import random
    random.seed(0)
    random.shuffle(ratings)
    
    split1 = int(len(ratings) * 0.7)
    split2 = int(len(ratings) * 0.85)
    
    train = ratings[:split1]              # 70% of the data
    validation = ratings[split1:split2]   # 15% of the data
    test = ratings[split2:]               # 15% of the data
    
    avg_rating = sum(rating.rating for rating in train) / len(train)
    baseline_error = sum((rating.rating - avg_rating) ** 2
                         for rating in test) / len(test)
    
    # This is what we hope to do better than
    assert 1.26 < baseline_error < 1.27
    
    
    # Embedding vectors for matrix factorization model
    
    from scratch.deep_learning import random_tensor
    
    EMBEDDING_DIM = 2
    
    # Find unique ids
    user_ids = {rating.user_id for rating in ratings}
    movie_ids = {rating.movie_id for rating in ratings}
    
    # Then create a random vector per id
    user_vectors = {user_id: random_tensor(EMBEDDING_DIM)
                    for user_id in user_ids}
    movie_vectors = {movie_id: random_tensor(EMBEDDING_DIM)
                     for movie_id in movie_ids}
    
    
    # Training loop for matrix factorization model
    
    from typing import List
    import tqdm
    from scratch.linear_algebra import dot
    
    def loop(dataset: List[Rating],
             learning_rate: float = None) -> None:
        with tqdm.tqdm(dataset) as t:
            loss = 0.0
            for i, rating in enumerate(t):
                movie_vector = movie_vectors[rating.movie_id]
                user_vector = user_vectors[rating.user_id]
                predicted = dot(user_vector, movie_vector)
                error = predicted - rating.rating
                loss += error ** 2
    
                if learning_rate is not None:
                    #     predicted = m_0 * u_0 + ... + m_k * u_k
                    # So each u_j enters output with coefficient m_j
                    # and each m_j enters output with coefficient u_j
                    user_gradient = [error * m_j for m_j in movie_vector]
                    movie_gradient = [error * u_j for u_j in user_vector]
    
                    # Take gradient steps
                    for j in range(EMBEDDING_DIM):
                        user_vector[j] -= learning_rate * user_gradient[j]
                        movie_vector[j] -= learning_rate * movie_gradient[j]
    
                t.set_description(f"avg loss: {loss / (i + 1)}")
    
    learning_rate = 0.05
    for epoch in range(20):
        learning_rate *= 0.9
        print(epoch, learning_rate)
        loop(train, learning_rate=learning_rate)
        loop(validation)
    loop(test)
    
    
    from scratch.working_with_data import pca, transform
    
    original_vectors = [vector for vector in movie_vectors.values()]
    components = pca(original_vectors, 2)
    
    from collections import defaultdict
    ratings_by_movie = defaultdict(list)
    for rating in ratings:
        ratings_by_movie[rating.movie_id].append(rating.rating)
    
    vectors = [
        (movie_id,
         sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
         movies[movie_id],
         vector)
        for movie_id, vector in zip(movie_vectors.keys(),
                                    transform(original_vectors, components))
    ]
    
    # Print top 25 and bottom 25 by first principal component
    print(sorted(vectors, key=lambda v: v[-1][0])[:25])
    print(sorted(vectors, key=lambda v: v[-1][0])[-25:])
Example #4
def main():
    
    # These paths point to the current directory; change them if you keep your files elsewhere.
    MOVIES = "u.item"   # pipe-delimited: movie_id|title|...
    RATINGS = "u.data"  # tab-delimited: user_id, movie_id, rating, timestamp
    
    from typing import NamedTuple
    
    class Rating(NamedTuple):
        user_id: str
        movie_id: str
        rating: float
    
    import csv
    # We specify this encoding to avoid a UnicodeDecodeError.
    # See: https://stackoverflow.com/a/53136168/1076346.
    with open(MOVIES, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="|")
        movies = {movie_id: title for movie_id, title, *_ in reader}
    
    # Create a list of ratings
    with open(RATINGS, encoding="iso-8859-1") as f:
        reader = csv.reader(f, delimiter="\t")
        ratings = [Rating(user_id, movie_id, float(rating))
                   for user_id, movie_id, rating, _ in reader]
    
    # 1682 movies rated by 943 users
    assert len(movies) == 1682
    assert len(list({rating.user_id for rating in ratings})) == 943
    
    import re
    
    # Data structure for accumulating ratings by movie_id
    star_wars_ratings = {movie_id: []
                         for movie_id, title in movies.items()
                         if re.search("Star Wars|Empire Strikes|Jedi", title)}
    
    # Iterate over the ratings, accumulating the Star Wars ones
    for rating in ratings:
        if rating.movie_id in star_wars_ratings:
            star_wars_ratings[rating.movie_id].append(rating.rating)
    
    # Compute the average rating for each movie
    avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
                   for movie_id, title_ratings in star_wars_ratings.items()]
    
    # And then print them in order
    for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
        print(f"{avg_rating:.2f} {movies[movie_id]}")
    
    import random
    random.seed(0)
    random.shuffle(ratings)
    
    split1 = int(len(ratings) * 0.7)
    split2 = int(len(ratings) * 0.85)
    
    train = ratings[:split1]              # 70% of the data
    validation = ratings[split1:split2]   # 15% of the data
    test = ratings[split2:]               # 15% of the data
    
    avg_rating = sum(rating.rating for rating in train) / len(train)
    baseline_error = sum((rating.rating - avg_rating) ** 2
                         for rating in test) / len(test)
    
    # This is the baseline we hope to do better than
    assert 1.26 < baseline_error < 1.27
    
    
    # Embedding vectors for the matrix factorization model
    
    from scratch.deep_learning import random_tensor
    
    EMBEDDING_DIM = 2
    
    # Find the unique ids
    user_ids = {rating.user_id for rating in ratings}
    movie_ids = {rating.movie_id for rating in ratings}
    
    # Then create a random vector per id
    user_vectors = {user_id: random_tensor(EMBEDDING_DIM)
                    for user_id in user_ids}
    movie_vectors = {movie_id: random_tensor(EMBEDDING_DIM)
                     for movie_id in movie_ids}
    
    
    # Training loop for the matrix factorization model
    
    from typing import List
    import tqdm
    from scratch.linear_algebra import dot
    
    def loop(dataset: List[Rating],
             learning_rate: float = None) -> None:
        with tqdm.tqdm(dataset) as t:
            loss = 0.0
            for i, rating in enumerate(t):
                movie_vector = movie_vectors[rating.movie_id]
                user_vector = user_vectors[rating.user_id]
                predicted = dot(user_vector, movie_vector)
                error = predicted - rating.rating
                loss += error ** 2
    
                if learning_rate is not None:
                    #     predicted = m_0 * u_0 + ... + m_k * u_k
                    # So each u_j enters output with coefficient m_j
                    # and each m_j enters output with coefficient u_j
                    user_gradient = [error * m_j for m_j in movie_vector]
                    movie_gradient = [error * u_j for u_j in user_vector]
    
                    # Take gradient steps
                    for j in range(EMBEDDING_DIM):
                        user_vector[j] -= learning_rate * user_gradient[j]
                        movie_vector[j] -= learning_rate * movie_gradient[j]
    
                t.set_description(f"avg loss: {loss / (i + 1)}")
    
    learning_rate = 0.05
    for epoch in range(20):
        learning_rate *= 0.9
        print(epoch, learning_rate)
        loop(train, learning_rate=learning_rate)
        loop(validation)
    loop(test)
    
    
    from scratch.working_with_data import pca, transform
    
    original_vectors = [vector for vector in movie_vectors.values()]
    components = pca(original_vectors, 2)
    
    from collections import defaultdict
    ratings_by_movie = defaultdict(list)
    for rating in ratings:
        ratings_by_movie[rating.movie_id].append(rating.rating)
    
    vectors = [
        (movie_id,
         sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
         movies[movie_id],
         vector)
        for movie_id, vector in zip(movie_vectors.keys(),
                                    transform(original_vectors, components))
    ]
    
    # Print the top 25 and bottom 25 by the first principal component
    print(sorted(vectors, key=lambda v: v[-1][0])[:25])
    print(sorted(vectors, key=lambda v: v[-1][0])[-25:])
Example #5
def main():
    import random
    from typing import List

    import tqdm
    from matplotlib import pyplot as plt
    
    def text_size(total: int) -> float:
        """
        Wynosi 8, jeżeli liczba wystąpień jest równa 0
        lub 28, jeżeli liczba wystąpień jest równa 200.
        """
        return 8 + total / 200 * 20
    
    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,
                 ha='center', va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularnosc w ofertach pracy")
    plt.ylabel("Popularnosc w CV")
    plt.axis([0, 100, 0, 100])
    plt.xticks([])
    plt.yticks([])
    plt.show()
    
    
    plt.close()
    
    import re
    
    # This regex is not particularly sophisticated, but it works for our data.
    tokenized_sentences = [re.findall("[a-z]+|[.]", sentence.lower())
                           for sentence in sentences]
    
    # Create a vocabulary (that is, a mapping word -> word_id) based on our text.
    vocab = Vocabulary(word
                       for sentence_words in tokenized_sentences
                       for word in sentence_words)
    
    from scratch.deep_learning import Tensor, one_hot_encode
    
    inputs: List[int] = []
    targets: List[Tensor] = []
    
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):          # For each word
            for j in [i - 2, i - 1, i + 1, i + 2]:   # take the nearby locations
                if 0 <= j < len(sentence):           # that aren't out of bounds
                    nearby_word = sentence[j]        # and get those words.
    
                    # Add an input that's the original word_id
                    inputs.append(vocab.get_id(word))
    
                    # Add a target that's the one-hot-encoded nearby word
                    targets.append(vocab.one_hot_encode(nearby_word))
    
    
    # Model for learning word vectors
    
    from scratch.deep_learning import Sequential, Linear
    
    random.seed(0)
    EMBEDDING_DIM = 5  # seems like a good size
    
    # Define the embedding layer separately, so we can reference it.
    embedding = TextEmbedding(vocab=vocab, embedding_dim=EMBEDDING_DIM)
    
    model = Sequential([
        # Given a word (as a vector of word_ids), look up its embedding.
        embedding,
        # And use a linear layer to compute scores for the "nearby words".
        Linear(input_dim=EMBEDDING_DIM, output_dim=vocab.size)
    ])
    
    
    # Train the word vector model
    
    from scratch.deep_learning import SoftmaxCrossEntropy, Momentum, GradientDescent
    
    loss = SoftmaxCrossEntropy()
    optimizer = GradientDescent(learning_rate=0.01)
    
    for epoch in range(100):
        epoch_loss = 0.0
        for input, target in zip(inputs, targets):
            predicted = model.forward(input)
            epoch_loss += loss.loss(predicted, target)
            gradient = loss.gradient(predicted, target)
            model.backward(gradient)
            optimizer.step(model)
        print(epoch, epoch_loss)            # Print the loss
        print(embedding.closest("black"))   # and also a few nearest words
        print(embedding.closest("slow"))    # so we can see what's being
        print(embedding.closest("car"))     # learned.
    
    
    
    # Explore most similar words
    
    pairs = [(cosine_similarity(embedding[w1], embedding[w2]), w1, w2)
             for w1 in vocab.w2i
             for w2 in vocab.w2i
             if w1 < w2]
    pairs.sort(reverse=True)
    print(pairs[:5])
    
    
    # Plot the word vectors
    plt.close()
    
    from scratch.working_with_data import pca, transform
    import matplotlib.pyplot as plt
    
    # Extract the first two principal components and transform the word vectors.
    components = pca(embedding.embeddings, 2)
    transformed = transform(embedding.embeddings, components)
    
    # Scatter the points (and make them white so they're "invisible").
    fig, ax = plt.subplots()
    ax.scatter(*zip(*transformed), marker='.', color='w')
    
    # Add an annotation for each word at its transformed location.
    for word, idx in vocab.w2i.items():
        ax.annotate(word, transformed[idx])
    
    # And hide the axes.
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    
    plt.show()
    
    
    
    plt.savefig('im/word_vectors')
    plt.gca().clear()
    plt.close()
    
    from bs4 import BeautifulSoup
    import requests
    
    url = "https://www.ycombinator.com/topcompanies/"
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')
    
    # We get the company names twice, so use a set comprehension to deduplicate.
    companies = list({b.text
                      for b in soup("b")
                      if "h4" in b.get("class", ())})
    assert len(companies) == 101
    
    vocab = Vocabulary([c for company in companies for c in company])
    
    START = "^"
    STOP = "$"
    
    # We need to add them to the vocabulary too.
    vocab.add(START)
    vocab.add(STOP)
    
    HIDDEN_DIM = 32  # You should experiment with different sizes!
    
    rnn1 =  SimpleRnn(input_dim=vocab.size, hidden_dim=HIDDEN_DIM)
    rnn2 =  SimpleRnn(input_dim=HIDDEN_DIM, hidden_dim=HIDDEN_DIM)
    linear = Linear(input_dim=HIDDEN_DIM, output_dim=vocab.size)
    
    model = Sequential([
        rnn1,
        rnn2,
        linear
    ])
    
    from scratch.deep_learning import softmax
    
    def generate(seed: str = START, max_len: int = 50) -> str:
        rnn1.reset_hidden_state()  # Reset both hidden states.
        rnn2.reset_hidden_state()
        output = [seed]            # Start the output with the specified seed.
    
        # Keep going until we produce the STOP character or reach the max length
        while output[-1] != STOP and len(output) < max_len:
            # Use the last character as the input
            input = vocab.one_hot_encode(output[-1])
    
            # Generate scores using the model
            predicted = model.forward(input)
    
            # Convert them to probabilities and draw a random char_id
            probabilities = softmax(predicted)
            next_char_id = sample_from(probabilities)
    
            # Add the corresponding character to the output
            output.append(vocab.get_word(next_char_id))
    
        # Strip the START and STOP characters and return the generated word.
        return ''.join(output[1:-1])
    
    loss = SoftmaxCrossEntropy()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9)
    
    for epoch in range(300):
        random.shuffle(companies)  # Train in a different order each epoch.
        epoch_loss = 0             # Track the loss.
        for company in tqdm.tqdm(companies):
            rnn1.reset_hidden_state()  # Reset both hidden states.
            rnn2.reset_hidden_state()
            company = START + company + STOP   # Add START and STOP characters.
    
            # The rest is just our usual training loop, except that the inputs
            # and targets are the one-hot-encoded previous and next characters.
            for prev, next in zip(company, company[1:]):
                input = vocab.one_hot_encode(prev)
                target = vocab.one_hot_encode(next)
                predicted = model.forward(input)
                epoch_loss += loss.loss(predicted, target)
                gradient = loss.gradient(predicted, target)
                model.backward(gradient)
                optimizer.step(model)
    
        # Each epoch, print the loss and also generate a name.
        print(epoch, epoch_loss, generate())
    
        # Turn down the learning rate for the last 100 epochs.
        # There's no principled reason for this, but it seems to work.
        if epoch == 200:
            optimizer.lr *= 0.1
Example #6
def main():
    import random
    from typing import List

    import tqdm
    from matplotlib import pyplot as plt
    
    def text_size(total: int) -> float:
        """equals 8 if total is 0, 28 if total is 200"""
        return 8 + total / 200 * 20
    
    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,
                 ha='center', va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularity on Job Postings")
    plt.ylabel("Popularity on Resumes")
    plt.axis([0, 100, 0, 100])
    plt.xticks([])
    plt.yticks([])
    # plt.show()
    
    
    plt.close()
    
    import re
    
    # This is not a great regex, but it works on our data.
    tokenized_sentences = [re.findall("[a-z]+|[.]", sentence.lower())
                           for sentence in sentences]
    
    # Create a vocabulary (that is, a mapping word -> word_id) based on our text.
    vocab = Vocabulary(word
                       for sentence_words in tokenized_sentences
                       for word in sentence_words)
    
    from scratch.deep_learning import Tensor, one_hot_encode
    
    inputs: List[int] = []
    targets: List[Tensor] = []
    
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):          # For each word
            for j in [i - 2, i - 1, i + 1, i + 2]:   # take the nearby locations
                if 0 <= j < len(sentence):           # that aren't out of bounds
                    nearby_word = sentence[j]        # and get those words.
    
                    # Add an input that's the original word_id
                    inputs.append(vocab.get_id(word))
    
                    # Add a target that's the one-hot-encoded nearby word
                    targets.append(vocab.one_hot_encode(nearby_word))
    
    
    # Model for learning word vectors
    
    from scratch.deep_learning import Sequential, Linear
    
    random.seed(0)
    EMBEDDING_DIM = 5  # seems like a good size
    
    # Define the embedding layer separately, so we can reference it.
    embedding = TextEmbedding(vocab=vocab, embedding_dim=EMBEDDING_DIM)
    
    model = Sequential([
        # Given a word (as a vector of word_ids), look up its embedding.
        embedding,
        # And use a linear layer to compute scores for "nearby words".
        Linear(input_dim=EMBEDDING_DIM, output_dim=vocab.size)
    ])
    
    
    # Train the word vector model
    
    from scratch.deep_learning import SoftmaxCrossEntropy, Momentum, GradientDescent
    
    loss = SoftmaxCrossEntropy()
    optimizer = GradientDescent(learning_rate=0.01)
    
    for epoch in range(100):
        epoch_loss = 0.0
        for input, target in zip(inputs, targets):
            predicted = model.forward(input)
            epoch_loss += loss.loss(predicted, target)
            gradient = loss.gradient(predicted, target)
            model.backward(gradient)
            optimizer.step(model)
        print(epoch, epoch_loss)            # Print the loss
        print(embedding.closest("black"))   # and also a few nearest words
        print(embedding.closest("slow"))    # so we can see what's being
        print(embedding.closest("car"))     # learned.
    
    
    
    # Explore most similar words
    
    pairs = [(cosine_similarity(embedding[w1], embedding[w2]), w1, w2)
             for w1 in vocab.w2i
             for w2 in vocab.w2i
             if w1 < w2]
    pairs.sort(reverse=True)
    print(pairs[:5])
    
    
    # Plot word vectors
    plt.close()
    
    from scratch.working_with_data import pca, transform
    import matplotlib.pyplot as plt
    
    # Extract the first two principal components and transform the word vectors
    components = pca(embedding.embeddings, 2)
    transformed = transform(embedding.embeddings, components)
    
    # Scatter the points (and make them white so they're "invisible")
    fig, ax = plt.subplots()
    ax.scatter(*zip(*transformed), marker='.', color='w')
    
    # Add annotations for each word at its transformed location
    for word, idx in vocab.w2i.items():
        ax.annotate(word, transformed[idx])
    
    # And hide the axes
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    
    # plt.show()
    
    
    
    plt.savefig('im/word_vectors')
    plt.gca().clear()
    plt.close()
    
    from bs4 import BeautifulSoup
    import requests
    
    url = "https://www.ycombinator.com/topcompanies/"
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')
    
    # We get the companies twice, so use a set comprehension to deduplicate.
    companies = list({b.text
                      for b in soup("b")
                      if "h4" in b.get("class", ())})
    assert len(companies) == 101
    
    vocab = Vocabulary([c for company in companies for c in company])
    
    START = "^"
    STOP = "$"
    
    # We need to add them to the vocabulary too.
    vocab.add(START)
    vocab.add(STOP)
    
    HIDDEN_DIM = 32  # You should experiment with different sizes!
    
    rnn1 =  SimpleRnn(input_dim=vocab.size, hidden_dim=HIDDEN_DIM)
    rnn2 =  SimpleRnn(input_dim=HIDDEN_DIM, hidden_dim=HIDDEN_DIM)
    linear = Linear(input_dim=HIDDEN_DIM, output_dim=vocab.size)
    
    model = Sequential([
        rnn1,
        rnn2,
        linear
    ])
    
    from scratch.deep_learning import softmax
    
    def generate(seed: str = START, max_len: int = 50) -> str:
        rnn1.reset_hidden_state()  # Reset both hidden states.
        rnn2.reset_hidden_state()
        output = [seed]            # Start the output with the specified seed.
    
        # Keep going until we produce the STOP character or reach the max length
        while output[-1] != STOP and len(output) < max_len:
            # Use the last character as the input
            input = vocab.one_hot_encode(output[-1])
    
            # Generate scores using the model
            predicted = model.forward(input)
    
            # Convert them to probabilities and draw a random char_id
            probabilities = softmax(predicted)
            next_char_id = sample_from(probabilities)
    
            # Add the corresponding char to our output
            output.append(vocab.get_word(next_char_id))
    
        # Get rid of START and END characters and return the word.
        return ''.join(output[1:-1])
    
    loss = SoftmaxCrossEntropy()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9)
    
    for epoch in range(300):
        random.shuffle(companies)  # Train in a different order each epoch.
        epoch_loss = 0             # Track the loss.
        for company in tqdm.tqdm(companies):
            rnn1.reset_hidden_state()  # Reset both hidden states.
            rnn2.reset_hidden_state()
            company = START + company + STOP   # Add START and STOP characters.
    
            # The rest is just our usual training loop, except that the inputs
            # and target are the one-hot-encoded previous and next characters.
            for prev, next in zip(company, company[1:]):
                input = vocab.one_hot_encode(prev)
                target = vocab.one_hot_encode(next)
                predicted = model.forward(input)
                epoch_loss += loss.loss(predicted, target)
                gradient = loss.gradient(predicted, target)
                model.backward(gradient)
                optimizer.step(model)
    
        # Each epoch print the loss and also generate a name
        print(epoch, epoch_loss, generate())
    
        # Turn down the learning rate for the last 100 epochs.
        # There's no principled reason for this, but it seems to work.
        if epoch == 200:
            optimizer.lr *= 0.1
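
    # For reference, the SimpleRnn layers used above are assumed to be stateful
    # recurrent layers with roughly this behavior (a sketch of the assumed
    # interface, not the actual implementation):
    #
    #     class SimpleRnn(Layer):
    #         def __init__(self, input_dim, hidden_dim):
    #             ...                        # weight matrices w, u and a bias b
    #         def reset_hidden_state(self):
    #             self.hidden = [0.0] * hidden_dim
    #         def forward(self, input):
    #             # combine the input with the previous hidden state, squash
    #             # with tanh, store the result, and return it as the output
    #             ...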