def get_main_corpus(self):
    DB = ConnectToDB()
    sql_query = """
        SELECT doc_id, doc_date, line_of_doc, speaker_text
        FROM corpus_table
        ORDER BY doc_date DESC, line_of_doc ASC;
        """
    # Get entire corpus from database
    corpus_df = DB.pull_from_db(sql_query)
    docs = [sent for sent in corpus_df['speaker_text']]
    texts = [[str(word).lower() for word in self.TV.get_tokens(doc)]
             for doc in docs]
    # Count token frequencies and keep only tokens that appear more than once.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    return texts
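# A minimal downstream sketch for the token lists returned by get_main_corpus().
# The gensim Dictionary/doc2bow step is an assumption about how these texts are
# consumed; `builder` is a hypothetical instance of the class this method lives on.
from gensim import corpora

texts = builder.get_main_corpus()
dictionary = corpora.Dictionary(texts)                      # token -> integer id
bow_corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words vectors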
def getSessionData(sessionID=None):
    # RESPONSE_TABLE columns:
    #   session_id, question_num, date, time, question,
    #   trump_text, trump_sim, trump_isbot, trump_userguess, trump_answer,
    #   clinton_text, clinton_sim, clinton_isbot, clinton_userguess, clinton_answer
    sqlQuery = """
        SELECT session_id, question_num, question,
               trump_text, trump_userguess,
               clinton_text, clinton_userguess
        FROM response_table
        WHERE session_id = '%s'
        ORDER BY question_num ASC;
        """ % (sessionID)
    DB = ConnectToDB()
    queryresults = DB.pull_from_db(sqlQuery)
    return pd.DataFrame.to_dict(queryresults, 'records')
class PoliBot(object):
    # Relies on module-level imports not shown in this excerpt: os, glob, pickle,
    # string, datetime, spacy, collections.deque, and gensim's similarities.

    def __init__(self, candidate, nlp=None):
        """ Prepare the bot for the input candidate."""
        #############################
        # Set some variables
        #############################
        # Path to files/dependencies
        self.path = "/".join([os.getcwd(), "dependencies/"])
        self.candidate = candidate.lower()

        #############################
        # Connect to the SQL database
        #############################
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.response_table = 'response_table'

        ############################
        # Initialize the vectorizer
        ###########################
        if nlp is None:
            self.nlp = spacy.en.English(tagger=True, parser=False,
                                        entity=False, matcher=False)
        else:
            self.nlp = nlp
        self.TV = TokenVectorizer(nlp=self.nlp)

        ##################################
        # Set up TopicModeling
        ##################################
        self.TP = TopicModeling(TV=self.TV)
        # Check to see if the candidate-specific index files are there.
        if os.path.isfile("".join([self.path, self.candidate, "_lsi_index.index"])):
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.index = similarities.MatrixSimilarity.load(
                "".join([self.path, self.candidate, '_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                "".join([self.path, self.candidate, '_lsi_index.index']))
        else:
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            # Build the tf-idf and LSI indices the first time through.
            self.TP.prepare_candidate_corpus(self.corpus, self.candidate)
            self.index = similarities.MatrixSimilarity.load(
                "".join([self.path, self.candidate, '_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                "".join([self.path, self.candidate, '_lsi_index.index']))

        #############################
        # Initialize the markov chain
        #############################
        # Get the most recent markov chain.
        markov_models = glob.glob("".join([self.path, self.candidate,
                                           "_markov_models/*"]))
        if len(markov_models) == 0:
            # No saved model yet: train one and pickle it with a week-of-year stamp.
            corpus = self.get_corpus()
            sorin = MarkovChain(state_size=3)
            sorin.train_model(corpus)
            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year * 100 + wk_of_yr
            fname = "".join([self.path, self.candidate, "_markov_models/",
                             str(timestamp), "_markov_model.pkl"])
            with open(fname, "wb") as f:
                pickle.dump(sorin, f)
            self.markov_model = sorin
        else:
            # Sort so the model with the largest timestamp prefix is loaded.
            markov_fname = sorted(markov_models)[-1]
            with open(markov_fname, 'rb') as pkl_file:
                sorin = pickle.load(pkl_file)
            self.markov_model = sorin

        # Log dictionary for questions and responses
        self.idnum = 0

    def update_model(self, text, sim):
        new_sorin = self.markov_model.update(text, sim)
        dt = datetime.datetime.now()
        wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
        timestamp = dt.year * 100 + wk_of_yr
        fname = "".join([self.path, self.candidate, "_markov_models/",
                         str(timestamp), "_markov_model.pkl"])
        with open(fname, "wb") as f:
            pickle.dump(new_sorin, f)
        self.markov_model = new_sorin
        return self

    def question_getbest_responses(self, question, nsent=100):
        responses = self.get_responses(num_sent=nsent)
        response_text = [" ".join(response[0].split()) for response in responses]
        best_responses = self.response_tfidf_matches(response_text, question)
        return best_responses

    def ask_question(self, question=None):
        token_list = self.TV.get_tokens(question)
        if token_list is None:
            return None
        else:
            return (question, token_list)

    def get_responses(self, num_sent=100, tries=1, save_to_db=False):
        responses = self.markov_model.make_response(n_sentences=num_sent)
        response_list = []
        for i, response in enumerate(responses):
            token_list = self.TV.get_tokens(response)
            response_list.append((response, token_list))
        return response_list

    def response_tfidf_matches(self, sentences, question, n_return=None):
        index = self.TP.get_corpus_index(sentences)
        sims = index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        if n_return is None:
            return [(sims[i][0], sentences[sims[i][0]], sims[i][1])
                    for i in range(len(sentences))]
        else:
            return [(sims[i][0], sentences[sims[i][0]], sims[i][1])
                    for i in range(n_return)]

    def text_tfidf_matches(self, question, n_return=5):
        sims = self.index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        if n_return is None:
            return [(sims[i][0], self.corpus[sims[i][0]], sims[i][1])
                    for i in range(len(self.corpus))]
        else:
            return [(sims[i][0], self.corpus[sims[i][0]], sims[i][1])
                    for i in range(n_return)]

    def response_lsi_matches(self, sentences, question, n_return=5):
        index = self.TP.get_corpus_index(sentences, lsi=True)
        sims = index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        if n_return is None:
            return [(sims[i][0], sentences[sims[i][0]], sims[i][1])
                    for i in range(len(sentences))]
        else:
            return [(sims[i][0], sentences[sims[i][0]], sims[i][1])
                    for i in range(n_return)]

    def text_lsi_matches(self, question, n_return=5):
        sims = self.lsi_index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        if n_return is None:
            return [(sims[i][0], self.corpus[sims[i][0]], sims[i][1])
                    for i in range(len(self.corpus))]
        else:
            return [(sims[i][0], self.corpus[sims[i][0]], sims[i][1])
                    for i in range(n_return)]

    def get_corpus(self):
        df = self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
        corpus = deque()
        for doc in self.nlp.pipe(df['speaker_text'], batch_size=50, n_threads=1):
            pos_tagged_sentence = list()
            for tok in doc:
                if tok.like_url or tok.like_email:
                    continue
                if tok.is_alpha:
                    pos_tagged_sentence.append('::'.join([tok.orth_, tok.pos_]))
                elif tok.is_punct and tok.text in [',', '.', '?', '!']:
                    pos_tagged_sentence.append(tok.text)
                elif "\'" in tok.text and tok.lemma_ not in string.punctuation:
                    pos_tagged_sentence.append(tok.lemma_)
            corpus.append(pos_tagged_sentence)
        return list(corpus)
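# Rough usage sketch for the PoliBot class above. The candidate name and question
# are illustrative; a reachable database, the spaCy English model, and the
# dependencies/ index files are assumed to exist.
bot = PoliBot('trump')
question = "What will you do about immigration?"

# Generate markov-chain sentences and rank them against the question with tf-idf.
best = bot.question_getbest_responses(question, nsent=100)

# Alternatively, rank the question against the stored corpus with the LSI index.
closest_lines = bot.text_lsi_matches(question, n_return=5)
print(best[0], closest_lines[0])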
class PoliBot(object):

    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'
        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()
        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)
        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))
        self.ID = str(self.idnum) + '_' + str(ts)
        self.idnum += 1
        try:
            tokens = self.TV.tokenize_full(question)
        except:
            tokens = []
        try:
            word_string = [str(t) for t in tokens]
        except:
            word_string = ""
        try:
            tokens_vect = self.TV.make_vector(word_string)
        except:
            tokens_vect = []
        # Average the token vectors into a single question vector.
        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect) / len(tokens_vect)
        else:
            question_vect = tokens_vect
        if len(question_vect) == 1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect
        self.question_log = {
            'question_id': [self.ID],
            'question_date': [self.date],
            'question_time': [self.time],
            'question_sent': [question],
            'question_tokens': [tokens]}
        self.response_log = {
            'response_id': [self.ID],
            'response_date': [self.date],
            'response_time': [self.time],
            'response_candidate': [self.candidate],
            'response_sent': [],
            'response_tokens': [],
            'cosine_sim': [0],
            'question_id': [self.question_log['question_id'][0]]}
        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)
        cosine_sims = [0]
        all_tokens = []
        for i, sent in enumerate(generated_sentences):
            if sent is None:
                continue
            tokens = self.TV.tokenize_full(sent)
            if tokens is None:
                continue
            word_string = [str(t) for t in tokens]
            tokens_vect = self.TV.make_vector(word_string)
            if len(tokens_vect) > 1:
                response_vect = sum(tokens_vect) / len(tokens_vect)
            else:
                response_vect = tokens_vect
            # Cosine similarity
            try:
                cosine_sim_0 = cosine(response_vect, self.question_vect)
            except:
                continue
            if cosine_sim_0 > np.max(cosine_sims):
                # Keep the best-scoring sentence seen so far.
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]
            cosine_sims.append(cosine_sim_0)
            all_tokens.append(tokens)
        if (self.responseLOOPcounter < tries) and \
                (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter += 1
            self.response(num_sent=num_sent, tries=tries)
        else:
            self.response_log['cosine_sim_dist'] = \
                [(np.mean(cosine_sims), np.std(cosine_sims))]
            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")
        return self.response_log['response_sent'][0]

    def get_corpus(self):
        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
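# Sketch of the question/response cycle for this earlier PoliBot version; the
# question text is illustrative and a reachable database is assumed.
bot = PoliBot('clinton')
bot.ask_question("How would you handle health care?")
reply = bot.response(num_sent=100, tries=10, save_to_db=False)
print(reply)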
class BuildSaveCorpus(object):

    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        # Database connection and table to write into
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):
        for key, vals in self.html_links.items():
            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key
            for html_link in vals:
                line_of_doc_cter = 0
                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)
                if len(speakers) == 0:
                    speakers = [str(key_speaker)] * len(speakers_text)
                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)
                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                              str(self.doc_date),
                                              str(line_of_doc_cter)])
                        html_link_dict = {
                            "corpus_id": [corpus_ID],
                            "link": [html_link],
                            "doc_id": ["_".join([key, str(self.doc_date)])],
                            "doc_type": [doc_type],
                            "doc_date": [int(self.doc_date)],
                            "speaker": [speaker.lower()],
                            "speaker_text": [s],
                            "line_of_doc": [line_of_doc_cter]
                        }
                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)
                        line_of_doc_cter += 1
                print("Done with %s - %s" % (key, html_link))

    def save_corpus(self, out_dict):
        ''' Save dictionary to SQL database '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
        else:
            response = urllib.request.urlopen(url)
            return response.read()

    def clean_html(self, htmltext):
        '''Uses BeautifulSoup to parse the html file and clean it a bit.

        Returns two arrays:
            speaker      -- who was talking
            speaker_text -- what the speaker said

        Useful specifically for debates: the lists give, in chronological
        order, each speaker and what they said.
        '''
        soupy = BS(htmltext, 'lxml')
        # Get the document date. Save "March 16, 2015" as "20150316".
        date = str(soupy.find_all('span', class_='docdate'))
        date_str = " ".join(re.split(' |, ', re.sub(r"<.+?>|", "", date)[1:-1]))
        stime = time.strptime(date_str, "%B %d %Y")
        self.doc_date = str((stime[0] * 10000) + (stime[1] * 100) + (stime[2]))
        text_only = str(soupy.find_all('span', class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]", "", each)
            clean_each_split = clean_each.split(':')
            # print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                # Continuation of the previous speaker's paragraph.
                try:
                    speaker_text[-1] = speaker_text[-1] + ' ' + clean_each_split[0]
                except:
                    speaker_text.append(clean_each_split[0])
        return speaker, speaker_text

    def get_html_links(self):
        link_dict = {}
        link_dict['democratic_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',  # Brooklyn, New York; April 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',  # Miami, Florida; March 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',  # Flint, Michigan; March 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',  # Milwaukee, Wisconsin; February 11, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',  # Durham, New Hampshire; February 4, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',  # Charleston, South Carolina; January 17, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',  # Manchester, New Hampshire; December 19, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',  # Des Moines, Iowa; November 14, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903',  # Las Vegas, Nevada; October 13, 2015
        ]
        link_dict['republican_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',  # Miami, Florida; March 10, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',  # Detroit, Michigan; March 3, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',  # Houston, Texas; February 25, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',  # Greenville, South Carolina; February 13, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',  # Manchester, New Hampshire; February 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',  # Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',  # North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',  # Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',  # Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',  # Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',  # Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',  # Cleveland, Ohio; August 6, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',  # Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',  # North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',  # Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',  # Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',  # Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',  # Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757'   # Cleveland, Ohio; August 6, 2015
        ]
        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'
        ]
        link_dict['speech_kaisch'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116599',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=113069',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116546'
        ]
        link_dict['speech_cruz'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117232',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116598',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114768',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110030',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=109774'
        ]
        link_dict['speech_malley'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112703',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112702',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112696',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112699',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112704',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112700',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112716'
        ]
        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111586',  # Rachel Maddow of MSNBC; February 8, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111589',  # Jake Tapper of CNN's "State of the Union"; February 7, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111587',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111585',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111591',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111595',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111592',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111439',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111593',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111594',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111414',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111436',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111435',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111434',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111433',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111432',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111431',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111430',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111429',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111428',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111416',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111425',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111424',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111423',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111422',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111417',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111418',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111421',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110267',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110269',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110268',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110270',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110271',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111420'
        ]
        link_dict['speech_sanders'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117513',  # Steve Inskeep of National Public Radio; May 5, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117194',  # Conference Hosted by the Pontifical Academy of Social Sciences in Vatican City; April 15, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116694',  # Remarks on Policy in the Middle East in Salt Lake City, Utah; March 21, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117516',  # Remarks in Essex Junction, Vermont Following the "Super Tuesday" Primaries; March 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117511',  # Remarks in Concord Following the New Hampshire Primary; February 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111440',  # Remarks in Des Moines Following the Iowa Caucus; February 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117514',  # Remarks in a Meeting with Steelworkers in Des Moines, Iowa; January 26, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114487',  # Remarks at the New Hampshire Democratic Party Jefferson-Jackson Dinner in Manchester; November 29, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117517',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114493',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114491',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114486',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114488',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114494',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114490',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114495',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114492',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110222',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110221',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110125',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110124'
        ]
        return link_dict

    # DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))
        speaker_dict = {}
        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]
            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]', '', mt.tolist())
                giant_text_str += re_mt
            speaker_dict[us] = giant_text_str
        self.speaker_dict = speaker_dict
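# Worked example of the date encoding used in clean_html:
# "March 16, 2015" is stored as 20150316.
import time

stime = time.strptime("March 16 2015", "%B %d %Y")
assert (stime[0] * 10000) + (stime[1] * 100) + stime[2] == 20150316

# One-call entry point for the scraper (assumes TokenVectorizer and a reachable
# database behind ConnectToDB).
bsc = BuildSaveCorpus()
bsc.build_corpus()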
class BuildSaveCorpus(object):

    def __init__(self):
        # Get the html links
        self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        # Database connection and table to write into
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):
        for key, vals in self.html_links.items():
            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key
            line_of_doc_cter = 0
            for html_link in vals:
                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)
                if len(speakers) == 0:
                    speakers = [str(key_speaker)] * len(speakers_text)
                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)
                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                              str(self.doc_date),
                                              str(line_of_doc_cter)])
                        html_link_dict = {
                            "corpus_id": [corpus_ID],
                            "link": [html_link],
                            "doc_id": ["_".join([key, str(self.doc_date)])],
                            "doc_type": [doc_type],
                            "doc_date": [int(self.doc_date)],
                            "speaker": [speaker.lower()],
                            "speaker_text": [s],
                            "line_of_doc": [line_of_doc_cter]
                        }
                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)
                        line_of_doc_cter += 1
                print("Done with %s - %s" % (key, html_link))

    def save_corpus(self, out_dict):
        ''' Save dictionary to SQL database '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
        else:
            response = urllib.request.urlopen(url)
            return response.read()

    def clean_html(self, htmltext):
        '''Uses BeautifulSoup to parse the html file and clean it a bit.

        Returns two arrays:
            speaker      -- who was talking
            speaker_text -- what the speaker said

        Useful specifically for debates: the lists give, in chronological
        order, each speaker and what they said.
        '''
        soupy = BS(htmltext, 'lxml')
        # Get the document date. Save "March 16, 2015" as "20150316".
        date = str(soupy.find_all('span', class_='docdate'))
        date_str = " ".join(re.split(' |, ', re.sub(r"<.+?>|", "", date)[1:-1]))
        stime = time.strptime(date_str, "%B %d %Y")
        self.doc_date = str((stime[0] * 10000) + (stime[1] * 100) + (stime[2]))
        text_only = str(soupy.find_all('span', class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]", "", each)
            clean_each_split = clean_each.split(':')
            # print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                # Continuation of the previous speaker's paragraph.
                try:
                    speaker_text[-1] = speaker_text[-1] + ' ' + clean_each_split[0]
                except:
                    speaker_text.append(clean_each_split[0])
        return speaker, speaker_text

    def get_html_links(self):
        link_dict = {}
        link_dict['debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903'
        ]
        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'
        ]
        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596'
        ]
        self.html_links = link_dict

    # DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))
        speaker_dict = {}
        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]
            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]', '', mt.tolist())
                giant_text_str += re_mt
            speaker_dict[us] = giant_text_str
        self.speaker_dict = speaker_dict
def logData():
    error = ""
    if request.method == 'POST':
        trumpResp = request.form.get('Trump_Answer')
        clintonResp = request.form.get('Clinton_Answer')
        if trumpResp is not None and clintonResp is not None:
            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year * 100 + wk_of_yr
            # If the user guessed "bot" for the Trump response...
            if trumpResp == "Tbot":
                SD[session['id']]['trump_userguess'] = ["bot"]
                # ...and the response really was the bot,
                if (SD[session['id']]['trump_isbot'][0]) == 1:
                    # the answer is correct (1),
                    SD[session['id']]['trump_answer'] = [1]
                else:
                    # otherwise it is wrong (0).
                    SD[session['id']]['trump_answer'] = [0]
            # Else the user guessed "not a bot".
            else:
                SD[session['id']]['trump_userguess'] = ["not"]
                # If the response was the bot, the answer is wrong,
                if (SD[session['id']]['trump_isbot'][0]) == 1:
                    SD[session['id']]['trump_answer'] = [0]
                else:
                    # else the answer is right.
                    SD[session['id']]['trump_answer'] = [1]
            # Same bookkeeping for the Clinton response.
            if clintonResp == "Cbot":
                SD[session['id']]['clinton_userguess'] = ["bot"]
                if (SD[session['id']]['clinton_isbot'][0]) == 1:
                    SD[session['id']]['clinton_answer'] = [1]
                else:
                    SD[session['id']]['clinton_answer'] = [0]
            else:
                SD[session['id']]['clinton_userguess'] = ["not"]
                if (SD[session['id']]['clinton_isbot'][0]) == 1:
                    SD[session['id']]['clinton_answer'] = [0]
                else:
                    SD[session['id']]['clinton_answer'] = [1]
            # Set some question-specific variables: date/time of the question.
            SD[session['id']]['question_num'][0] += 1
            SD[session['id']]['date'] = [
                dt.year * 10000 + 100 * dt.month + dt.day
            ]
            SD[session['id']]['time'] = [
                dt.hour * 1e4 + dt.minute * 1e2 + round(dt.microsecond / 1e6, 3)
            ]
            """
            # Update markov chain
            if (SD[session['id']]['trump_userguess'][0] == "bot"
                    and SD[session['id']]['trump_answer'][0] == 0):
                # global trump
                trump = \
                    trump.update_model(
                        [SD[session['id']]["trump_text"]],
                        SD[session['id']]["trump_sim"][0])
            elif (SD[session['id']]['clinton_userguess'][0] == "bot"
                    and SD[session['id']]['clinton_answer'][0] == 0):
                # global clinton
                clinton = \
                    clinton.update_model(
                        [SD[session['id']]["clinton_text"]],
                        SD[session['id']]["clinton_sim"][0])
            """
            DB = ConnectToDB()
            DB.save_to_db('response_table', SD[session['id']])
            # You can redirect to the home page on a successful commit, or anywhere else.
            return render_template("input.html",
                                   error_msg="",
                                   responses=getSessionData(session['id']))
        else:
            error = ("Error, your log is incomplete! "
                     "Please check and submit it again!")
def results():
    # Pull all data from the response_table and count isbot, userguess:
    #   count(clinton_isbot), count(clinton_userguess), count(clinton_answer)
    allQuery = """
        SELECT count(clinton_isbot) as all_total,
               sum(clinton_isbot) as all_clinton_bot,
               sum(clinton_answer) as all_clinton_correct,
               sum(trump_isbot) as all_trump_bot,
               sum(trump_answer) as all_trump_correct,
               (SELECT count(clinton_isbot) FROM response_table
                WHERE session_id = '%s') as session_total,
               (SELECT sum(clinton_isbot) FROM response_table
                WHERE session_id = '%s') as session_clinton_bot,
               (SELECT sum(clinton_answer) FROM response_table
                WHERE session_id = '%s') as session_clinton_correct,
               (SELECT sum(trump_isbot) FROM response_table
                WHERE session_id = '%s') as session_trump_bot,
               (SELECT sum(trump_answer) FROM response_table
                WHERE session_id = '%s') as session_trump_correct
        FROM response_table;
        """ % (session['id'], session['id'], session['id'],
               session['id'], session['id'])
    DB = ConnectToDB()
    qr = DB.pull_from_db(allQuery)
    all_results = [
        {
            "name": "Trump",
            "data": [int(qr['all_trump_correct']),
                     int(qr['session_trump_correct'])],
            "stack": "trump"
        },
        {
            "name": "Trump Bot",
            "data": [int(qr['all_total']) - int(qr['all_trump_correct']),
                     int(qr['session_total']) - int(qr['session_trump_correct'])],
            "stack": "trump"
        },
        {
            "name": "Clinton",
            "data": [int(qr['all_clinton_correct']),
                     int(qr['session_clinton_correct'])],
            "stack": "clinton"
        },
        {
            "name": "Clinton Bot",
            "data": [int(qr['all_total']) - int(qr['all_clinton_correct']),
                     int(qr['session_total']) - int(qr['session_clinton_correct'])],
            "stack": "clinton"
        }
    ]
    trump_score = "/".join([str(int(qr['session_trump_correct'])),
                            str(int(qr['session_total']))])
    clinton_score = "/".join([str(int(qr['session_clinton_correct'])),
                              str(int(qr['session_total']))])
    return render_template("results.html",
                           results_data=json.dumps(all_results),
                           trump_score=trump_score,
                           clinton_score=clinton_score)
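# Illustrative shape of the JSON handed to results.html (numbers are made up).
# Each series carries [all-sessions count, this-session count] plus a stack key,
# so correct guesses and missed bots can be stacked per candidate.
example_results = [
    {"name": "Trump",       "data": [42, 3], "stack": "trump"},
    {"name": "Trump Bot",   "data": [18, 2], "stack": "trump"},
    {"name": "Clinton",     "data": [40, 4], "stack": "clinton"},
    {"name": "Clinton Bot", "data": [20, 1], "stack": "clinton"},
]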