def compare(self):
        # Split the corpus into two groups by quarter. Note: the original
        # applied the same "< 2" filter to both groups, making them identical;
        # group B is assumed here to take the remaining quarters (>= 2).
        groupA = self.df[self.df['quarter'].astype(int) < 2].dropna()
        groupB = self.df[self.df['quarter'].astype(int) >= 2].dropna()

        # Create DTM
        for count, group in enumerate([groupA, groupB]):
            cv2 = CountVectorizer(ngram_range=(1, 1))
            dtm = cv2.fit_transform(group['lemmatized_text'])
            words = cv2.get_feature_names_out()
            dtm_df = pd.DataFrame.from_records(dtm.A, columns=words)

            # Analyze frequency
            freqs = dtm.sum(axis=0).A.flatten()
            index = np.argsort(freqs)[-50:]
            WordFreq = pd.DataFrame.from_records(
                (list(zip(words[index], freqs[index]))))
            WordFreq.columns = ['Word', 'Freq']

            # Plot bar graph
            fig, ax = plt.subplots(figsize=(8, 8))
            WordFreq.sort_values(by='Freq').plot.barh(x='Word',
                                                      y='Freq',
                                                      ax=ax,
                                                      color='gray')
            plt.title('Group ' + str(count))
            plt.show()
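
# A hedged usage sketch (not from the original source): the enclosing class is
# not shown in this excerpt, so a stand-in instance carries the required df;
# the imports below are the ones the method relies on, and the toy data is
# made up for illustration.
import types
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

stand_in = types.SimpleNamespace(df=pd.DataFrame({
    'quarter': ['0', '1', '2', '3'],
    'lemmatized_text': ['cat dog', 'dog bird', 'fish cat', 'bird fish'],
}))
compare(stand_in)  # plots the top words for each quarter group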
Example 2
def plot_conf_matrix(y_true, y_pred, normed=True, heatmap_color ='Blues', **kwargs):

    # If y_true contains plain ints, make sure y_pred does too
    true_int_check = all(isinstance(a,int) for a in y_true)
    pred_int_check = all(isinstance(a,int) for a in y_pred)
    if true_int_check and not pred_int_check: # convert the y_pred values to integers
        if isinstance(y_pred, pd.Series):
            y_pred = y_pred.astype(int)

    my_c = metrics.confusion_matrix(y_true, y_pred)

    print('Matthews correlation coefficient:',
          metrics.matthews_corrcoef(y_true, y_pred))
    if normed:
        cm_normalized = my_c.astype('float') / my_c.sum(axis=1)[:, np.newaxis]
        my_c = cm_normalized
        plt.title('Normalized RF Classifier Confusion Matrix')
    else:
        plt.title('Random Forest Classifier Confusion Matrix')

    sns.heatmap(my_c, annot=True, fmt='.2g', cmap=heatmap_color, **kwargs)
    plt.ylabel('True')
    plt.xlabel('Assigned')
    plt.show()

    return
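
# A minimal usage sketch (not from the original source), assuming the imports
# the function relies on; the toy labels below are made up for illustration.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

y_true = [0, 0, 1, 1, 1, 0]
y_pred = pd.Series([0.0, 1.0, 1.0, 1.0, 0.0, 0.0])  # floats are cast to int inside
plot_conf_matrix(y_true, y_pred, normed=True)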
def plot_losses(inputs,
                outputs,
                losses,
                val_losses,
                title,
                nfe=None,
                net=None):
    # Record the current function-evaluation counts (nfe) of each ODE block
    if nfe is not None:
        blocks = [net.odeblock_down1, net.odeblock_down2, net.odeblock_down3,
                  net.odeblock_down4, net.odeblock_embedding,
                  net.odeblock_up1, net.odeblock_up2, net.odeblock_up3,
                  net.odeblock_up4]
        for i, block in enumerate(blocks):
            nfe[i].append(block.odefunc.nfe)

    cols = 4 if nfe is not None else 3
    fig, ax = plt.subplots(nrows=1, ncols=cols, figsize=(15, 5))

    fig.suptitle(title, fontsize=16)

    ax[0].plot(np.arange(len(losses)), losses, label="loss")
    ax[0].plot(np.arange(len(val_losses)), val_losses, label="val_loss")

    if nfe is not None:
        labels = ["down1", "down2", "down3", "down4", "embed",
                  "up1", "up2", "up3", "up4"]
        for series, label in zip(nfe, labels):
            ax[3].plot(np.arange(len(series)), series, label=label)
        ax[3].legend()

    # Collapse logits to a per-pixel class map for display
    outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1)[0]
    outputs = outputs.detach().cpu().numpy()

    ax[0].legend()
    ax[1].imshow(outputs)
    ax[2].imshow(inputs.detach().cpu()[0].numpy().transpose(1, 2, 0))

    plt.show()
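
# A minimal smoke test (not from the original source): with nfe=None the net
# argument is unused, so toy tensors suffice. Shapes below are assumptions:
# one RGB image and per-pixel class logits.
import numpy as np
import torch
import matplotlib.pyplot as plt

inputs = torch.rand(1, 3, 64, 64)     # (batch, channels, H, W) in [0, 1]
outputs = torch.randn(1, 5, 64, 64)   # (batch, classes, H, W) logits
losses = [1.0, 0.7, 0.5, 0.4]
val_losses = [1.1, 0.8, 0.6, 0.55]
plot_losses(inputs, outputs, losses, val_losses, "toy run")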
Example 4
    epochs=100,
    validation_data=validation_generator,
    validation_steps=50)

model.save('cats_and_dogs_small_2.h5')




acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)


plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
Example 5
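# The next snippet integrates a first-order ODE with scipy but references
# fode without defining it. A minimal sketch, assuming simple exponential
# decay di/dt = -cpi * i; the actual right-hand side in the original source
# may differ.
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint

def fode(i, t, cpi):
    # Hypothetical right-hand side of di/dt (placeholder only)
    return -cpi * i
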
# Initial condition
io = 0.1

# Create an array of time points
timePoints = np.linspace(0, 10, 11)
print(timePoints)

# Solve the ODE
cpi = 0.3
solution = odeint(fode,io,timePoints,args=(cpi,))
print(solution)

# Plot the solution
plt.plot(timePoints, solution)

# Label the x- and y-axes
plt.xlabel('time')
plt.ylabel('i(t)')

# Position and display annotation text
yMax = solution.max()
xMax = timePoints.max()

plt.text(0.8*xMax, 0.8*yMax, "cpi=" + str(cpi))
plt.text(0.8*xMax, 0.7*yMax, "i(o)=" + str(io))

# Display the plot
plt.show()
    def process_text_data(self):
        processed_text = {}
        lemmatizer = WordNetLemmatizer()
        stopwords = nltk.corpus.stopwords.words('english')
        newStopWords = []  # add any domain-specific stopwords here
        stopwords.extend(newStopWords)

        tokenizer = RegexpTokenizer(r'\w+')  # word tokens only; strips punctuation
        for iteration, entry in enumerate(self.df['text']):
            # Tokenize and shift to lower case
            processed_text[iteration] = tokenizer.tokenize(entry.lower())
            # Remove numbers
            processed_text[iteration] = [
                item for item in processed_text[iteration]
                if not item.isdigit()
            ]
            # Remove stopwords
            processed_text[iteration] = [
                word for word in processed_text[iteration]
                if word not in stopwords
            ]
            # Lemmatize
            processed_text[iteration] = " ".join(
                [lemmatizer.lemmatize(i) for i in processed_text[iteration]])
        self.df['lemmatized_text'] = list(
            processed_text.values())  # append to dataframe

        # Create DTM
        cv = CountVectorizer(ngram_range=(1, 1))
        self.dtm = cv.fit_transform(self.df['lemmatized_text'])
        words = cv.get_feature_names_out()
        print(pd.DataFrame.from_records(self.dtm[:5, :5].A, columns=words[:5]))
        self.dtm_df = pd.DataFrame.from_records(self.dtm.A, columns=words)

        # Analyze frequency
        freqs = self.dtm.sum(axis=0).A.flatten()
        index = np.argsort(freqs)[-50:]
        print(list(zip(words[index], freqs[index])))
        WordFreq = pd.DataFrame.from_records(
            (list(zip(words[index], freqs[index]))))
        WordFreq.columns = ['Word', 'Freq']

        # Plot bar graph
        fig, ax = plt.subplots(figsize=(8, 8))
        WordFreq.sort_values(by='Freq').plot.barh(x='Word',
                                                  y='Freq',
                                                  ax=ax,
                                                  color='gray')

        # Look at metadata (word count over time)
        self.df['year'] = self.df['year'].astype(str).astype(int)
        self.df.plot.line(x='year', y='word_count')

        # Generate word cloud
        data = dict(zip(WordFreq['Word'].tolist(), WordFreq['Freq'].tolist()))
        wordcloud = WordCloud().generate_from_frequencies(data)

        # Plotting
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

        # LDA
        def print_topics(model, count_vectorizer, n_top_words):
            words = count_vectorizer.get_feature_names_out()
            for topic_idx, topic in enumerate(model.components_):
                print('\nTopic #%d: ' % topic_idx)
                print(' '.join(
                    [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

        # Set parameters
        number_topics = 5
        number_words = 5

        # Create and fit LDA model
        lda = LDA(n_components=number_topics, n_jobs=-1)
        lda.fit(self.dtm)
        print('Topics found via the LDA:')
        print_topics(lda, cv, number_words)