import matplotlib.pyplot as plt

from my_tools import get_bill_data

plt.style.use('ggplot')

# Retrieve the bill data from Mongo; the second value returned by
# get_bill_data() is not needed for this plot.
data, _ = get_bill_data()

# Subsets that get overlaid on the full data set: rows labelled 1, and rows
# whose status is anything other than 'Introduced'.
passed_df = data[data['labels'] == 1]
beyond_intro = data[data['bill_status'] != 'Introduced']

# Overlay histograms of the introduction date to show the proportion of bills
# that passed vs. those that failed.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title(
    'Number of Bills Introduced (yellow), Beyond Introduced (red), and Passed (green) vs. Time',
    fontdict={'fontsize': 16})

# Drawn back to front: all bills, then those beyond introduction, then those
# labelled 1. Only the first two layers are translucent.
layers = (
    (data, {'alpha': .35, 'color': 'orange'}),
    (beyond_intro, {'alpha': .5, 'color': 'r'}),
    (passed_df, {'color': 'g'}),
)
for frame, style in layers:
    ax.hist(frame['intro_date'], bins=500, **style)

# Cap the y-axis at 400 so taller bars are clipped rather than setting the scale.
ax.set_ylim(0, 400)

plt.show()
import os

import matplotlib.pyplot as plt
import pandas as pd  # required by pd.DataFrame below; was missing (NameError)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Try the original import first so older environments behave exactly as
# before, then fall back to the standalone package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

from my_tools import get_bill_data, process_corpus, read_jsonl_file

plt.style.use('ggplot')

# Location of the preprocessed, labelled corpus (one JSON record per line).
CORPUS_PATH = '/home/ubuntu/galvanize_capstone/data/nlp/corpus_with_labels.jsonl'

# get bill data
print('-------------------')
print('Loading original and preprocessed data for vectorizing and modeling...')
data, in_progress = get_bill_data()

# Materialise the records into a list before building the frame, as the
# original did; the return type of read_jsonl_file is not visible here.
corpus_with_labels = read_jsonl_file(CORPUS_PATH)
corpus_df = pd.DataFrame(list(corpus_with_labels))

# Features are the 'document' column; the target is the 'label' column cast
# to int.
X = corpus_df['document']
y = corpus_df['label'].astype(int)

# create stratified train-test split
print('-------------------')
print('Doing train-test split...')
# stratify=y keeps the class proportions of y in both splits. No random_state
# is set, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)  #, random_state = 123)

# Already vectorized using pickle_nlp_boosting_model.py
import pprint
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.util import ngrams, skipgrams
from pymongo import MongoClient
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  #, ComplementNB unreleased as of 12/14
from sklearn.preprocessing import normalize

from my_tools import get_bill_data, process_corpus

plt.style.use('ggplot')

# The other call sites in this file unpack get_bill_data() as a 2-tuple.
# Binding the whole return value would leave `data` as a tuple, so keep only
# the first element here.
# NOTE(review): confirm against my_tools that the first element is the bill
# table this script expects.
data, _ = get_bill_data()