def compare(self):
    # Group A: rows whose quarter is below 2
    groupA = self.df.copy()
    groupA['quarter'] = groupA[groupA['quarter'].astype(int) < 2]['quarter']
    groupA = groupA.dropna()

    # Group B: rows whose quarter is 2 or above
    # NOTE: the original applied the same "< 2" filter to both groups, which made
    # them identical; ">= 2" is the assumed intent here.
    groupB = self.df.copy()
    groupB['quarter'] = groupB[groupB['quarter'].astype(int) >= 2]['quarter']
    groupB = groupB.dropna()

    # Create DTM (document-term matrix) for each group
    for count, group in enumerate([groupA, groupB]):
        cv2 = CountVectorizer(ngram_range=(1, 1))
        dtm = cv2.fit_transform(group['lemmatized_text'])
        words = np.array(cv2.get_feature_names())
        dtm_df = pd.DataFrame.from_records(dtm.A, columns=words)

        # Analyze frequency: the 50 most frequent words
        freqs = dtm.sum(axis=0).A.flatten()
        index = np.argsort(freqs)[-50:]
        WordFreq = pd.DataFrame.from_records(list(zip(words[index], freqs[index])))
        WordFreq.columns = ['Word', 'Freq']

        # Plot bar graph
        fig, ax = plt.subplots(figsize=(8, 8))
        WordFreq.sort_values(by='Freq').plot.barh(x='Word', y='Freq', ax=ax,
                                                  color='gray')
        ax.set_title('Group ' + str(count))
        plt.show()
def plot_conf_matrix(y_true, y_pred, normed=True, heatmap_color='Blues', **kwargs):
    # Check that y_pred is an array of integers if y_true contains integers
    true_int_check = all(isinstance(a, int) for a in y_true)
    pred_int_check = all(isinstance(a, int) for a in y_pred)
    if true_int_check and not pred_int_check:
        # convert the y_pred values to integers
        if isinstance(y_pred, pd.Series):
            y_pred = y_pred.astype(int)

    my_c = metrics.confusion_matrix(y_true, y_pred)
    print(metrics.matthews_corrcoef(y_true, y_pred))

    if normed:
        # normalize each row so it sums to 1
        cm_normalized = my_c.astype('float') / my_c.sum(axis=1)[:, np.newaxis]
        my_c = cm_normalized
        plt.title('Normalized RF Classifier Confusion Matrix')
    else:
        plt.title('Random Forest Classifier Confusion Matrix')

    sns.heatmap(my_c, annot=True, fmt='', cmap=heatmap_color, **kwargs)
    plt.ylabel('True')
    plt.xlabel('Assigned')
    plt.show()
    return
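# Illustrative usage sketch for plot_conf_matrix (not from the source): the
# dataset, RandomForestClassifier, and variable names below are assumptions
# chosen only to show the call; any fitted scikit-learn classifier would do.
# The imports also cover the globals plot_conf_matrix itself relies on.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
plot_conf_matrix(y_test, clf.predict(X_test), normed=True)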
def plot_losses(inputs, outputs, losses, val_losses, title, nfe=None, net=None):
    # plot statistics
    if nfe is not None:
        # record the number of function evaluations (NFE) of each ODE block
        nfe[0].append(net.odeblock_down1.odefunc.nfe)
        nfe[1].append(net.odeblock_down2.odefunc.nfe)
        nfe[2].append(net.odeblock_down3.odefunc.nfe)
        nfe[3].append(net.odeblock_down4.odefunc.nfe)
        nfe[4].append(net.odeblock_embedding.odefunc.nfe)
        nfe[5].append(net.odeblock_up1.odefunc.nfe)
        nfe[6].append(net.odeblock_up2.odefunc.nfe)
        nfe[7].append(net.odeblock_up3.odefunc.nfe)
        nfe[8].append(net.odeblock_up4.odefunc.nfe)
        cols = 4
    else:
        cols = 3

    fig, ax = plt.subplots(nrows=1, ncols=cols, figsize=(15, 5))
    fig.suptitle(title, fontsize=16)

    # training and validation loss curves
    ax[0].plot(np.arange(len(losses)), losses, label="loss")
    ax[0].plot(np.arange(len(val_losses)), val_losses, label="val_loss")

    if nfe is not None:
        # NFE history per ODE block
        labels = ["down1", "down2", "down3", "down4", "embed",
                  "up1", "up2", "up3", "up4"]
        for series, label in zip(nfe, labels):
            ax[3].plot(np.arange(len(nfe[0])), series, label=label)
        ax[3].legend()

    # predicted segmentation: class with the highest probability per pixel
    outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1)[0]
    outputs = outputs.detach().cpu().numpy()

    ax[0].legend()
    ax[1].imshow(outputs)
    ax[2].imshow(inputs.detach().cpu()[0].numpy().transpose(1, 2, 0))
    plt.show()
# (the opening of the fit call that produces `history` is not shown here)
                    epochs=100,
                    validation_data=validation_generator,
                    validation_steps=50)
model.save('cats_and_dogs_small_2.h5')

# Plot training curves
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# initial condition
io = 0.1

# create array of time points
timePoints = np.linspace(0, 10, 11)
print(timePoints)

# solve ODE
cpi = 0.3
solution = odeint(fode, io, timePoints, args=(cpi,))
print(solution)

# set data points on plot
plt.plot(timePoints, solution)

# label x- and y-axes
plt.xlabel('time')
plt.ylabel('i(t)')

# set the position of text and display desired text
yMax = max(solution)
xMax = max(timePoints)
plt.text(0.8 * xMax, 0.8 * yMax, "cpi=" + str(cpi))
plt.text(0.8 * xMax, 0.7 * yMax, "i(o)=" + str(io))

# display the plot
plt.show()
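# `fode` above is the ODE right-hand side passed to odeint, but it is not
# defined in this section. A minimal sketch, assuming a simple first-order
# decay di/dt = -cpi * i(t) (the actual model may differ), together with the
# imports the script above relies on:
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint

def fode(i, t, cpi):
    # assumed rate equation: di/dt = -cpi * i
    return -cpi * i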
def process_text_data(self):
    processed_text = {}
    lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = []
    stopwords.extend(newStopWords)

    for iteration, entry in enumerate(self.df['text']):
        # Tokenize (removes punctuation and shifts to all lower case)
        tokenizer = RegexpTokenizer(r'\w+')
        processed_text[iteration] = tokenizer.tokenize(entry.lower())

        # Remove numbers
        processed_text[iteration] = [
            item for item in processed_text[iteration] if not item.isdigit()
        ]

        # Remove stopwords
        processed_text[iteration] = [
            word for word in processed_text[iteration] if word not in stopwords
        ]

        # Lemmatize
        processed_text[iteration] = " ".join(
            [lemmatizer.lemmatize(i) for i in processed_text[iteration]])

    self.df['lemmatized_text'] = list(
        processed_text.values())  # append to dataframe

    # Create DTM
    cv = CountVectorizer(ngram_range=(1, 1))
    self.dtm = cv.fit_transform(self.df['lemmatized_text'])
    words = np.array(cv.get_feature_names())
    print(pd.DataFrame.from_records(self.dtm[:5, :5].A, columns=words[:5]))
    self.dtm_df = pd.DataFrame.from_records(self.dtm.A, columns=words)

    # Analyze frequency: the 50 most frequent words
    freqs = self.dtm.sum(axis=0).A.flatten()
    index = np.argsort(freqs)[-50:]
    print(list(zip(words[index], freqs[index])))
    WordFreq = pd.DataFrame.from_records(list(zip(words[index], freqs[index])))
    WordFreq.columns = ['Word', 'Freq']

    # Plot bar graph
    fig, ax = plt.subplots(figsize=(8, 8))
    WordFreq.sort_values(by='Freq').plot.barh(x='Word', y='Freq', ax=ax,
                                              color='gray')

    # Look at metadata (word count over time)
    self.df['year'] = self.df['year'].astype(str).astype(int)
    self.df.plot.line(x='year', y='word_count')

    # Generate word cloud
    data = dict(zip(WordFreq['Word'].tolist(), WordFreq['Freq'].tolist()))
    wordcloud = WordCloud().generate_from_frequencies(data)

    # Plotting
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # LDA
    def print_topics(model, count_vectorizer, n_top_words):
        words = count_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(model.components_):
            print('\nTopic #%d: ' % topic_idx)
            # indices of the n_top_words highest-weighted words for this topic
            # (the original slice [:n_top_words - 1:-1] returned nearly the whole
            # vocabulary; the leading minus sign is assumed to be the intent)
            print(' '.join(
                [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    # Set parameters
    number_topics = 5
    number_words = 5

    # Create and fit LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(self.dtm)
    print('Topics found via the LDA:')
    print_topics(lda, cv, number_words)