Example #1
    def get_main_corpus(self):
        DB = ConnectToDB()

        sql_query = """
        SELECT doc_id, doc_date, line_of_doc, speaker_text
        FROM corpus_table
        ORDER BY doc_date DESC, line_of_doc ASC;
        """

        # Get entire corpus from database
        corpus_df = DB.pull_from_db(sql_query)
        docs = [sent for sent in corpus_df['speaker_text']]

        texts = [[str(word).lower() for word in self.TV.get_tokens(doc)]
                 for doc in docs]

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1]
                  for text in texts]

        return texts
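The token lists returned above are the usual preprocessing step before building a gensim dictionary and bag-of-words corpus (the later examples already lean on gensim's similarities module). A minimal, self-contained sketch of that next step, assuming gensim is installed; the literal token lists only illustrate the shape get_main_corpus() returns:

from gensim import corpora, models

# Illustrative token lists in the same shape get_main_corpus() returns.
texts = [['economy', 'jobs', 'growth'],
         ['healthcare', 'jobs'],
         ['economy', 'healthcare']]

dictionary = corpora.Dictionary(texts)                     # token -> integer id
bow_corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors
tfidf = models.TfidfModel(bow_corpus)                      # tf-idf weighting over the corpus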
Example #2
 def __init__(self):
     # Get the html links
     self.html_links = self.get_html_links()
     # Tokenizer!
     self.TV = TokenVectorizer()
     #
     self.DB = ConnectToDB()
     self.table_name = 'corpus_table'
Example #3
 def __init__(self):
     # Get the html links
     self.get_html_links()
     # Tokenizer!
     self.TV = TokenVectorizer()
     #
     self.DB = ConnectToDB()
     self.table_name = 'corpus_table'
Example #4
def getSessionData(sessionID=None):

    # RESPONSE_TABLE session_id, question_num, date, time, question,
    #               trump_text, trump_sim, trump_isbot, trump_userguess, trump_answer
    #               clinton_text, clinton_sim, clinton_isbot, clinton_userguess, clinton_answer

    sqlQuery = """
        SELECT session_id, question_num, question,
               trump_text, trump_userguess,
               clinton_text, clinton_userguess
        FROM response_table
        WHERE session_id = '%s'
        ORDER BY question_num ASC;
    """ % (sessionID)

    DB = ConnectToDB()

    queryresults = DB.pull_from_db(sqlQuery)

    return queryresults.to_dict('records')
Example #5
    def __init__(
        self,
        candidate,
    ):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0
Example #6
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0
Example #7
    def __init__(self, candidate, nlp=None):
        """ Prepare the bot for the input candidate."""
        #############################
        # Set some variables
        #############################
        # Path to files/dependencies
        self.path = "/".join([os.getcwd(),"dependencies/"])
        self.candidate = candidate.lower()

        #############################
        # Connect to the SQL database
        #############################
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.response_table = 'response_table'

        ############################
        # Initialize the vectorizer
        ###########################
        if nlp is None:
            self.nlp = spacy.en.English(tagger=True, parser=False,
                    entity=False, matcher=False)
        else:
            self.nlp = nlp

        self.TV = TokenVectorizer(nlp=self.nlp)

        ##################################
        # Set up TopicModeling
        ##################################
        self.TP = TopicModeling(TV=self.TV)

        # Check to see if the candidate specific files are there.
        if os.path.isfile("".join([self.path, self.candidate, "_lsi_index.index"])):
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))
        else:
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.TP.prepare_candidate_corpus(self.corpus, self.candidate)
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))

        #############################
        # Initialize the markov chain
        #############################
        # Get the most recent markov chain.
        markov_models = sorted(glob.glob("".join([self.path,
                                                  self.candidate,
                                                  "_markov_models/*"])))
        if len(markov_models)==0:
            corpus = self.get_corpus()
            sorin = MarkovChain(state_size=3)
            sorin.train_model(corpus)

            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year*100+wk_of_yr
            fname = "".join([self.path,
                             self.candidate,
                             "_markov_models/",
                             str(timestamp),
                             "_markov_model.pkl"])

            with open(fname, "wb") as f:
                pickle.dump(sorin, f)

            self.markov_model = sorin

        else:
            markov_fname = markov_models[-1]
            with open(markov_fname, 'rb') as pkl_file:
                sorin = pickle.load(pkl_file)
            self.markov_model = sorin

        # Log dictionary for questions and responses
        self.idnum = 0
Example #8
class PoliBot(object):
    def __init__(self, candidate, nlp=None):
        """ Prepare the bot for the input candidate."""
        #############################
        # Set some variables
        #############################
        # Path to files/dependencies
        self.path = "/".join([os.getcwd(),"dependencies/"])
        self.candidate = candidate.lower()

        #############################
        # Connect to the SQL database
        #############################
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.response_table = 'response_table'

        ############################
        # Initialize the vectorizer
        ###########################
        if nlp is None:
            self.nlp = spacy.en.English(tagger=True, parser=False,
                    entity=False, matcher=False)
        else:
            self.nlp = nlp

        self.TV = TokenVectorizer(nlp=self.nlp)

        ##################################
        # Set up TopicModeling
        ##################################
        self.TP = TopicModeling(TV=self.TV)

        # Check to see if the candidate specific files are there.
        if os.path.isfile("".join([self.path, self.candidate, "_lsi_index.index"])):
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))
        else:
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.TP.prepare_candidate_corpus(self.corpus, self.candidate)
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))

        #############################
        # Initialize the markov chain
        #############################
        # Get the most recent markov chain.
        markov_models = sorted(glob.glob("".join([self.path,
                                                  self.candidate,
                                                  "_markov_models/*"])))
        if len(markov_models)==0:
            corpus = self.get_corpus()
            sorin = MarkovChain(state_size=3)
            sorin.train_model(corpus)

            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year*100+wk_of_yr
            fname = "".join([self.path,
                             self.candidate,
                             "_markov_models/",
                             str(timestamp),
                             "_markov_model.pkl"])

            with open(fname, "wb") as f:
                pickle.dump(sorin, f)

            self.markov_model = sorin

        else:
            markov_fname = markov_models[-1]
            with open(markov_fname, 'rb') as pkl_file:
                sorin = pickle.load(pkl_file)
            self.markov_model = sorin

        # Log dictionary for questions and responses
        self.idnum = 0

    def update_model(self, text, sim):

        new_sorin = self.markov_model.update(text, sim)

        dt = datetime.datetime.now()
        wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
        timestamp = dt.year*100+wk_of_yr
        fname = "".join([self.path, self.candidate, "_markov_models/",
                         str(timestamp),"_markov_model.pkl"])

        with open(fname, "wb") as f:
            pickle.dump(new_sorin, f)

        self.markov_model = new_sorin

        return self

    def question_getbest_responses(self, question, nsent=100):

        responses = self.get_responses(num_sent=nsent)
        response_text = [" ".join(response[0].split()) for response in responses]
        best_responses = self.response_tfidf_matches(response_text, question)

        return best_responses

    def ask_question(self, question=None):

        token_list = self.TV.get_tokens(question)

        if token_list is None:
            return None
        else:
            return (question, token_list)

    def get_responses(self, num_sent=100, tries=1, save_to_db=False):

        responses = self.markov_model.make_response(n_sentences=num_sent)

        response_list = []
        for i, response in enumerate(responses):
            token_list = self.TV.get_tokens(response)
            response_list.append((response, token_list))

        return response_list

    def response_tfidf_matches(self, sentences, question, n_return=None):

        index = self.TP.get_corpus_index(sentences)
        sims = index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(len(sentences))]
        else:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def text_tfidf_matches(self, question, n_return=5):

        sims = self.index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(len(sims))]
        else:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def response_lsi_matches(self, sentences, question, n_return=5):

        index = self.TP.get_corpus_index(sentences, lsi=True)
        sims = index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(len(sentences))]
        else:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def text_lsi_matches(self, question, n_return=5):

        sims = self.lsi_index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(len(sims))]
        else:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def get_corpus(self):

        df = self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)

        corpus = deque()
        for doc in self.nlp.pipe(df['speaker_text'], batch_size=50, n_threads=1):
            pos_tagged_sentence = list()
            for tok in doc:
                if tok.like_url or tok.like_email:
                    continue
                if tok.is_alpha:
                    pos_tagged_sentence.append('::'.join([tok.orth_, tok.pos_]))
                elif tok.is_punct and tok.text in [',', '.', '?', '!']:
                    pos_tagged_sentence.append(tok.text)
                elif "\'" in tok.text and tok.lemma_ not in string.punctuation:
                    pos_tagged_sentence.append(tok.lemma_)
            corpus.append(pos_tagged_sentence)

        return list(corpus)
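A short usage sketch for the class above. The method names come straight from Example #8, but the import path, the candidate name, and the surrounding setup (database, dependencies/ files, spaCy model) are assumptions:

from polibot import PoliBot  # hypothetical module path

bot = PoliBot('trump')

question = "What is your plan for the economy?"
responses = bot.get_responses(num_sent=50)            # list of (text, tokens) pairs
response_text = [text for text, tokens in responses]

# Rank the generated responses against the question with tf-idf similarity.
best = bot.response_tfidf_matches(response_text, question, n_return=5)
for idx, text, score in best:
    print(round(float(score), 3), text)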
Example #9
class PoliBot(object):
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))

        self.ID = str(self.idnum)+'_'+str(ts)
        self.idnum+=1

        try:
            tokens = self.TV.tokenize_full(question)
        except:
            tokens = []

        try:
            word_string = [str(t) for t in tokens]
        except:
            word_string = ""

        try:
            tokens_vect = self.TV.make_vector(word_string)
        except:
            tokens_vect=[]

        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect)/len(tokens_vect)
        else:
            question_vect = tokens_vect

        if len(question_vect)==1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
                    'question_id':[self.ID],
                    'question_date':[self.date],
                    'question_time':[self.time],
                    'question_sent':[question],
                    'question_tokens':[tokens]}

        self.response_log = {
                    'response_id':[self.ID],
                    'response_date':[self.date],
                    'response_time':[self.time],
                    'response_candidate':[self.candidate],
                    'response_sent':[],
                    'response_tokens':[],
                    'cosine_sim':[0],
                    'question_id':[self.question_log['question_id'][0]]
                    }

        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)

        cosine_sims = [0]
        all_tokens = []
        for i, sent in enumerate(generated_sentences):
            if sent is None:
                continue
            else:
                tokens = self.TV.tokenize_full(sent)
                if tokens is None:
                    continue
                else:
                    word_string = [str(t) for t in tokens]
                    tokens_vect = self.TV.make_vector(word_string)

                if len(tokens_vect) > 1:
                    response_vect = sum(tokens_vect)/len(tokens_vect)
                else:
                    response_vect = tokens_vect

            # Cosine similarity
            try:
                cosine_sim_0 = cosine(response_vect,self.question_vect)
            except:
                continue

            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]

                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)
            else:
                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)

        if (self.responseLOOPcounter < tries) and (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter+=1
            self.response(num_sent=num_sent, tries=tries)
        else:
            self.response_log['cosine_sim_dist'] = \
                    [(np.mean(cosine_sims),np.std(cosine_sims))]

            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")

        return self.response_log['response_sent'][0]

    def get_corpus(self):

        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
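For the simpler version in Example #9 the flow is ask_question() followed by response(). A minimal sketch, again assuming the project dependencies (ConnectToDB, TokenVectorizer, MarkovChain) and an importable module are in place:

from polibot import PoliBot  # hypothetical module path

bot = PoliBot('clinton')
bot.ask_question("How would you handle healthcare?")

# Generates markov-chain sentences and returns the one closest to the question vector.
answer = bot.response(num_sent=50, tries=5, save_to_db=False)
print(answer)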
Example #10
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            for html_link in vals:
                line_of_doc_cter = 0

                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers) == 0:
                    speakers = [str(key_speaker)] * len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([
                            str(doc_type),
                            str(self.doc_date),
                            str(line_of_doc_cter)
                        ])

                        html_link_dict = {
                            "corpus_id": [corpus_ID],
                            "link": [html_link],
                            "doc_id": ["_".join([key, str(self.doc_date)])],
                            "doc_type": [doc_type],
                            "doc_date": [int(self.doc_date)],
                            "speaker": [speaker.lower()],
                            "speaker_text": [s],
                            "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter += 1

            print("Done with %s - %s" % (key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- that the speaker said.

           Useful, specificially for debates. Clean_text will provide, in
           chronological order, the speaker:what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span', class_='docdate'))
        date_str = " ".join(re.split(' |, ',
                                     re.sub(r"<.+?>|", "", date)[1:-1]))
        stime = time.strptime(date_str, "%B %d %Y")
        self.doc_date = str((stime[0] * 10000) + (stime[1] * 100) + (stime[2]))

        text_only = str(soupy.find_all('span', class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]", "", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[-1] = speaker_text[-1] + ' ' + clean_each_split[0]
                except IndexError:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['democratic_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',  #Brooklyn, New York; April 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',  #Miami, Florida; March 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',  #Flint, Michigan; March 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',  #Milwaukee, Wisconsin; February 11, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',  #Durham, New Hampshire; February 4, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',  #Charleston, South Carolina; January 17, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',  #Manchester, New Hampshire; December 19, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',  #Des Moines, Iowa; November 14, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903',  #Las Vegas, Nevada; October 13, 2015
        ]

        link_dict['republican_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',  #Miami, Florida; March 10, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',  #Detroit, Michigan; March 3, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',  #Houston, Texas; February 25, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',  #Greenville, South Carolina; February 13, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',  #Manchester, New Hampshire; February 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',  #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',  #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',  #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',  #Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',  #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',  #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',  #Cleveland, Ohio; August 6, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',  #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',  #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',  #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',  #Milwaukee, Wisconsin: November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',  #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',  #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757'  #Cleveland, Ohio; August 6, 2015
        ]

        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'
        ]

        link_dict['speech_kaisch'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116599',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=113069',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116546'
        ]

        link_dict['speech_cruz'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117232',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116598',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114768',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110030',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=109774'
        ]

        link_dict['speech_malley'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112703',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112702',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112696',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112699',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112704',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112700',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112716'
        ]

        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111586',  #Rachel Maddow of MSNBC; February 8, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111589',  #Jake Tapper of CNN's "State of the Union"; February 7, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111587',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111585',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111591',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111595',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111592',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111439',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111593',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111594',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111414',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111436',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111435',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111434',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111433',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111432',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111431',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111430',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111429',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111428',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111416',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111425',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111424',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111423',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111422',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111417',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111418',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111421',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110267',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110269',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110268',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110270',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110271',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111420'
        ]

        link_dict['speech_sanders'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117513',  #Steve Inskeep of National Public Radio; May 5, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117194',  #Conference Hosted by the Pontifical Academy of Social Sciences in Vatican City; April 15, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116694',  #Remarks on Policy in the Middle East in Salt Lake City, Utah; March 21, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117516',  #Remarks in Essex Junction, Vermont Following the "Super Tuesday" Primaries, March 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117511',  #Remarks in Concord Following the New Hampshire Primary; February 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111440',  #Remarks in Des Moines Following the Iowa Caucus; February 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117514',  #Remarks in a Meeting with Steelworkers in Des Moines, Iowa; January 26, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114487',  #Remarks at the New Hampshire Democratic Party Jefferson-Jackson Dinner in Manchester; November 29, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117517',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114493',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114491',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114486',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114488',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114494',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114490',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114495',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114492',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110222',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110221',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110125',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110124'
        ]

        return link_dict

    #DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]', '', mt.tolist())
                giant_text_str += re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
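Driving the scraper in Example #10 is a two-step affair; a sketch, assuming the database connection and TokenVectorizer dependencies are configured and the class is importable:

from build_corpus import BuildSaveCorpus  # hypothetical module path

# Fetches every linked transcript, splits it into per-speaker sentences,
# and writes each line to corpus_table through ConnectToDB.
builder = BuildSaveCorpus()
builder.build_corpus()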
Example #11
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            line_of_doc_cter = 0
            for html_link in vals:
                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers)==0:
                    speakers = [str(key_speaker)]*len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                             str(self.doc_date),
                                             str(line_of_doc_cter)])

                        html_link_dict = {
                                "corpus_id": [corpus_ID],
                                "link": [html_link],
                                "doc_id": ["_".join([key,str(self.doc_date)])],
                                "doc_type": [doc_type],
                                "doc_date": [int(self.doc_date)],
                                "speaker": [speaker.lower()],
                                "speaker_text": [s],
                                "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter+=1

            print("Done with %s - %s" %(key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- that the speaker said.

           Useful, specificially for debates. Clean_text will provide, in
           chronological order, the speaker:what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span',class_='docdate'))
        date_str = " ".join(re.split(' |, ',re.sub(r"<.+?>|","", date)[1:-1]))
        stime = time.strptime(date_str,"%B %d %Y")
        self.doc_date = str((stime[0]*10000)+(stime[1]*100)+(stime[2]))

        text_only = str(soupy.find_all('span',class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]","", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[-1] = speaker_text[-1]+' '+clean_each_split[0]
                except IndexError:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['debate'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110757',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110903'
        ]

        link_dict['speech_trump'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'

        ]

        link_dict['speech_clinton'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111596'
        ]

        self.html_links = link_dict

    #DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]','', mt.tolist())
                giant_text_str+=re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
Example #12
def logData():
    error = ""
    if request.method == 'POST':
        trumpResp = request.form.get('Trump_Answer')
        clintonResp = request.form.get('Clinton_Answer')

        if trumpResp is not None and clintonResp is not None:
            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month,
                                     dt.day).isocalendar()[1]
            timestamp = dt.year * 100 + wk_of_yr

            # if user selected "bot"
            if trumpResp == "Tbot":
                SD[session['id']]['trump_userguess'] = ["bot"]
                # if response is "bot"
                if (SD[session['id']]['trump_isbot'][0]) == 1:
                    # answer = 1 (correct)
                    SD[session['id']]['trump_answer'] = [1]
                else:
                    # answer = 0 (wrong)
                    SD[session['id']]['trump_answer'] = [0]
            # else if user selected "not"
            else:
                SD[session['id']]['trump_userguess'] = ["not"]
                # if response is "bot"
                if (SD[session['id']]['trump_isbot'][0]) == 1:
                    #answer is wrong
                    SD[session['id']]['trump_answer'] = [0]
                else:
                    # else answer is right
                    SD[session['id']]['trump_answer'] = [1]

            if clintonResp == "Cbot":
                SD[session['id']]['clinton_userguess'] = ["bot"]
                if (SD[session['id']]['clinton_isbot'][0]) == 1:
                    SD[session['id']]['clinton_answer'] = [1]
                else:
                    SD[session['id']]['clinton_answer'] = [0]
            else:
                SD[session['id']]['clinton_userguess'] = ["not"]
                if (SD[session['id']]['clinton_isbot'][0]) == 1:
                    SD[session['id']]['clinton_answer'] = [0]
                else:
                    SD[session['id']]['clinton_answer'] = [1]

            # Set some question-specific variables: the date/time of the question
            SD[session['id']]['question_num'][0] += 1
            SD[session['id']]['date'] = [
                dt.year * 10000 + 100 * dt.month + dt.day
            ]
            SD[session['id']]['time'] = [
                dt.hour * 1e4 + dt.minute * 1e2 +
                round(dt.microsecond / 1e6, 3)
            ]
            """
            # Update markov chain
            if (SD[session['id']]['trump_userguess'][0] == "bot" and
                SD[session['id']]['trump_answer'][0] == 0):

                #global trump
                trump = \
                trump.update_model(
                        [SD[session['id']]["trump_text"]],
                        SD[session['id']]["trump_sim"][0])

            elif (SD[session['id']]['clinton_userguess'][0] == "bot" and
                  SD[session['id']]['clinton_answer'][0] == 0):

                #global clinton
                clinton = \
                clinton.update_model(
                        [SD[session['id']]["clinton_text"]],
                        SD[session['id']]["clinton_sim"][0])
            """

            DB = ConnectToDB()
            DB.save_to_db('response_table', SD[session['id']])

            # You can redirect to the home page on a successful commit, or anywhere else.
            return render_template("input.html",
                                   error_msg="",
                                   responses=getSessionData(session['id']))
        else:
            error = (
                "Error, your log is incomplete! Please check and submit it again!"
            )
Example #13
def results():

    # Want to pull all data from the response_table and count isbot, userguess
    # count(clinton_isbot)
    # count(clinton_userguess)
    # count(clinton_answer)
    allQuery = """
        SELECT
               count(clinton_isbot) as all_total,
               sum(clinton_isbot) as all_clinton_bot,
               sum(clinton_answer) as all_clinton_correct,
               sum(trump_isbot) as all_trump_bot,
               sum(trump_answer) as all_trump_correct,
               (SELECT count(clinton_isbot)
                FROM response_table
                WHERE session_id = '%s') as session_total,
               (SELECT sum(clinton_isbot)
                FROM response_table
                WHERE session_id = '%s') as session_clinton_bot,
               (SELECT sum(clinton_answer)
                FROM response_table
                WHERE session_id = '%s') as session_clinton_correct,
               (SELECT sum(trump_isbot)
                FROM response_table
                WHERE session_id = '%s') as session_trump_bot,
               (SELECT sum(trump_answer)
                FROM response_table
                WHERE session_id = '%s') as session_trump_correct
        FROM response_table;
    """ % (session['id'], session['id'], session['id'], session['id'],
           session['id'])

    DB = ConnectToDB()
    qr = DB.pull_from_db(allQuery)

    all_results = [
        {
            "name": "Trump",
            "data": [int(qr['all_trump_correct']),
                     int(qr['session_trump_correct'])],
            "stack": "trump"
        },
        {
            "name": "Trump Bot",
            "data": [int(qr['all_total']) - int(qr['all_trump_correct']),
                     int(qr['session_total']) - int(qr['session_trump_correct'])],
            "stack": "trump"
        },
        {
            "name": "Clinton",
            "data": [int(qr['all_clinton_correct']),
                     int(qr['session_clinton_correct'])],
            "stack": "clinton"
        },
        {
            "name": "Clinton Bot",
            "data": [int(qr['all_total']) - int(qr['all_clinton_correct']),
                     int(qr['session_total']) - int(qr['session_clinton_correct'])],
            "stack": "clinton"
        }
    ]

    trump_score = "/".join(
        [str(int(qr['session_trump_correct'])),
         str(int(qr['session_total']))])
    clinton_score = "/".join([
        str(int(qr['session_clinton_correct'])),
        str(int(qr['session_total']))
    ])

    return render_template("results.html",
                           results_data=json.dumps(all_results),
                           trump_score=trump_score,
                           clinton_score=clinton_score)
Example #14
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            for html_link in vals:
                line_of_doc_cter = 0

                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers)==0:
                    speakers = [str(key_speaker)]*len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                             str(self.doc_date),
                                             str(line_of_doc_cter)])

                        html_link_dict = {
                                "corpus_id": [corpus_ID],
                                "link": [html_link],
                                "doc_id": ["_".join([key,str(self.doc_date)])],
                                "doc_type": [doc_type],
                                "doc_date": [int(self.doc_date)],
                                "speaker": [speaker.lower()],
                                "speaker_text": [s],
                                "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter+=1

            print("Done with %s - %s" %(key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- that the speaker said.

           Useful, specificially for debates. Clean_text will provide, in
           chronological order, the speaker:what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span',class_='docdate'))
        date_str = " ".join(re.split(' |, ',re.sub(r"<.+?>|","", date)[1:-1]))
        stime = time.strptime(date_str,"%B %d %Y")
        self.doc_date = str((stime[0]*10000)+(stime[1]*100)+(stime[2]))

        text_only = str(soupy.find_all('span',class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]","", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[-1] = speaker_text[-1]+' '+clean_each_split[0]
                except IndexError:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['democratic_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995', #Brooklyn, New York; April 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719', #Miami, Florida; March 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718', #Flint, Michigan; March 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520', #Milwaukee, Wisconsin; February 11, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471', #Durham, New Hampshire; February 4, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409', #Charleston, South Carolina; January 17, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178', #Manchester, New Hampshire; December 19, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910', #Des Moines, Iowa; November 14, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903', #Las Vegas, Nevada; October 13, 2015
        ]

        link_dict['republican_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148', #Miami, Florida; March 10, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711', #Detroit, Michigan; March 3, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634', #Houston, Texas; February 25, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500', #Greenville, South Carolina; February 13, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472', #Manchester, New Hampshire; February 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412', #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395', #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177', #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908', #Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906', #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756', #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489', #Cleveland, Ohio; August 6, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413', #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394', #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176', #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909', #Milwaukee, Wisconsin: November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907', #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758', #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757'  #Cleveland, Ohio; August 6, 2015
        ]

        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'

        ]

        link_dict['speech_kaisch'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116599',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=113069',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116546']

        link_dict['speech_cruz'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117232',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116598',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114768',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110030',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=109774']

        link_dict['speech_malley'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112703',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112702',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112696',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112699',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112704',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112700',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112716'
        ]

        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111586', #Rachel Maddow of MSNBC; February 8, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111589', #Jake Tapper of CNN's "State of the Union"; February 7, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111587',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111585',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111591',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111595',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111592',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111439',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111593',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111594',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111414',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111436',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111435',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111434',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111433',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111432',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111431',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111430',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111429',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111428',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111416',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111425',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111424',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111423',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111422',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111417',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111418',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111421',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110267',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110269',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110268',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110270',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110271',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111420'
        ]

        link_dict['speech_sanders'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117513', #Steve Inskeep of National Public Radio; May 5, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117194', #Conference Hosted by the Pontifical Academy of Social Sciences in Vatican City; April 15, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116694', #Remarks on Policy in the Middle East in Salt Lake City, Utah; March 21, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117516', #Remarks in Essex Junction, Vermont Following the "Super Tuesday" Primaries, March 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117511', #Remarks in Concord Following the New Hampshire Primary; February 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111440', #Remarks in Des Moines Following the Iowa Caucus; February 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117514', #Remarks in a Meeting with Steelworkers in Des Moines, Iowa; January 26, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114487', #Remarks at the New Hampshire Democratic Party Jefferson-Jackson Dinner in Manchester; November 29, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117517',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114493',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114491',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114486',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114488',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114494',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114490',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114495',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114492',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110222',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110221',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110125',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110124'
        ]

        return link_dict

    #DEFUNCT
    def combine_speakers(self):
        """Concatenate every line attributed to a speaker into one string per speaker."""
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            # Indices of all lines spoken by this speaker
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                # Strip bracketed stage directions such as "[applause]"
                re_mt = re.sub(r'\[.+?\]', '', str(mt))
                giant_text_str += re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
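
A minimal sketch of how the link_dict returned by get_html_links() above might be consumed. The fetch-and-parse step is hypothetical: the project's actual scraping code is not part of this excerpt, and the helper name fetch_speaker_lines is invented for illustration.

# Hypothetical sketch, not part of the original listing.
import urllib.request
from bs4 import BeautifulSoup

def fetch_speaker_lines(url):
    """Download one transcript page and yield the text of each paragraph."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    for p in soup.find_all('p'):
        yield p.get_text()

# for doc_type, urls in link_dict.items():
#     for url in urls:
#         for line in fetch_speaker_lines(url):
#             ...  # tokenize the line and store it in corpus_table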
Example #15
0
class PoliBot(object):
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))

        self.ID = str(self.idnum) + '_' + str(ts)
        self.idnum += 1

        # Tokenize and vectorize the question, falling back to empty containers on failure
        try:
            tokens = self.TV.tokenize_full(question)
        except Exception:
            tokens = []

        try:
            word_string = [str(t) for t in tokens]
        except Exception:
            word_string = []

        try:
            tokens_vect = self.TV.make_vector(word_string)
        except Exception:
            tokens_vect = []

        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect) / len(tokens_vect)
        else:
            question_vect = tokens_vect

        if len(question_vect) == 1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
            'question_id': [self.ID],
            'question_date': [self.date],
            'question_time': [self.time],
            'question_sent': [question],
            'question_tokens': [tokens]
        }

        self.response_log = {
            'response_id': [self.ID],
            'response_date': [self.date],
            'response_time': [self.time],
            'response_candidate': [self.candidate],
            'response_sent': [],
            'response_tokens': [],
            'cosine_sim': [0],
            'question_id': [self.question_log['question_id'][0]]
        }

        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)

        cosine_sims = [0]
        all_tokens = []
        for i, sent in enumerate(generated_sentences):
            if sent is None:
                continue
            else:
                tokens = self.TV.tokenize_full(sent)
                if tokens is None:
                    continue
                else:
                    word_string = [str(t) for t in tokens]
                    tokens_vect = self.TV.make_vector(word_string)

                if len(tokens_vect) > 1:
                    response_vect = sum(tokens_vect) / len(tokens_vect)
                else:
                    response_vect = tokens_vect

            # Cosine similarity between this candidate response and the question vector
            try:
                cosine_sim_0 = cosine(response_vect, self.question_vect)
            except Exception:
                continue

            # Keep the highest-scoring sentence seen so far in the response log
            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]

            cosine_sims.append(cosine_sim_0)
            all_tokens.append(tokens)

        if self.responseLOOPcounter < tries and self.response_log['cosine_sim'][0] < 0.70:
            self.responseLOOPcounter += 1
            # Pass save_to_db through so the deepest retry can still write the logs to the DB
            self.response(num_sent=num_sent, tries=tries, save_to_db=save_to_db)
        else:
            self.response_log['cosine_sim_dist'] = \
                    [(np.mean(cosine_sims),np.std(cosine_sims))]

            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")

        return self.response_log['response_sent'][0]

    def get_corpus(self):

        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
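
A minimal usage sketch for the PoliBot class above, assuming its dependencies (ConnectToDB, TokenVectorizer, MarkovChain) are importable and corpus_table is already populated; the candidate name and question below are illustrative values only.

# Hypothetical usage sketch, not part of the original listing.
bot = PoliBot('trump')                          # loads the candidate's corpus from corpus_table
bot.ask_question('What is your plan for the economy?')
answer = bot.response(num_sent=100, tries=10, save_to_db=False)
print(answer)                                   # best-matching generated sentence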