Example #1
    def __init__(self, TV=None):
        self.path = "/".join([os.getcwd(),"dependencies/"])

        if TV is None:
            self.TV = TokenVectorizer()
        else:
            self.TV = TV

        # Check to see if dictionary exists
        if os.path.isfile(self.path+'topic_dict'):
            self.topic_dict = corpora.Dictionary.load(self.path+'topic_dict')
        else:
            self.topic_dict = self.get_topic_dict()

        if os.path.isfile(self.path+'tfidf_model'):
            self.tfidf = models.TfidfModel.load(self.path+'tfidf_model')
        else:
            self.tfidf = self.train_tfidf()

        #if os.path.isfile(self.path+'lda_model'):
        #    self.lda = models.ldamodel.LdaModel.load(self.path+'lda_model')
        #else:
        #    self.lda = self.train_lda_model()

        if os.path.isfile(self.path+'lsi_model'):
            self.lsi = models.LsiModel.load(self.path+'lsi_model')
        else:
            self.lsi = self.train_lsi_model()
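
A minimal usage sketch, assuming the fragment above is the constructor of the TopicModeling class shown in full in Example #10: it reuses a shared TokenVectorizer and any models already saved under dependencies/, training and saving them otherwise.

# Hypothetical usage sketch (names and caching behavior as assumed above).
tv = TokenVectorizer()
tm = TopicModeling(TV=tv)   # loads topic_dict / tfidf_model / lsi_model if the files exist,
                            # otherwise trains them from the corpus and saves them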
Example #2
    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'
Example #3
    def __init__(self):
        # Get the html links
        self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'
Example #4
    def __init__(
        self,
        candidate,
    ):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0
Example #5
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0
Example #6
    def __init__(self, candidate, nlp=None):
        """ Prepare the bot for the input candidate."""
        #############################
        # Set some variables
        #############################
        # Path to files/dependencies
        self.path = "/".join([os.getcwd(),"dependencies/"])
        self.candidate = candidate.lower()

        #############################
        # Connect to the SQL database
        #############################
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.response_table = 'response_table'

        ############################
        # Initialize the vectorizer
        ###########################
        if nlp is None:
            self.nlp = spacy.en.English(tagger=True, parser=False,
                    entity=False, matcher=False)
        else:
            self.nlp = nlp

        self.TV = TokenVectorizer(nlp=self.nlp)

        ##################################
        # Set up TopicModeling
        ##################################
        self.TP = TopicModeling(TV=self.TV)

        # Check to see if the candidate specific files are there.
        if os.path.isfile("".join([self.path,self.candidate,"_lsi_index.index"])):
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))
        else:
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.TP.prepare_candidate_corpus(self.corpus, self.candidate)
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))

        #############################
        # Initialize the markov chain
        #############################
        # Get the most recent markov chain (sort so the newest timestamped file is last).
        markov_models = sorted(glob.glob("".join([self.path,
                                                  self.candidate,
                                                  "_markov_models/*"])))
        if len(markov_models)==0:
            corpus = self.get_corpus()
            sorin = MarkovChain(state_size=3)
            sorin.train_model(corpus)

            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year*100+wk_of_yr
            fname = "".join([self.path,
                             self.candidate,
                             "_markov_models/",
                             str(timestamp),
                             "_markov_model.pkl"])

            with open(fname, "wb") as f:
                pickle.dump(sorin, f)

            self.markov_model = sorin

        else:
            markov_fname = markov_models[-1]
            with open(markov_fname, 'rb') as pkl_file:
                sorin = pickle.load(pkl_file)
            self.markov_model = sorin

        # Log dictionary for questions and responses
        self.idnum = 0
Example #7
class PoliBot(object):
    def __init__(self, candidate, nlp=None):
        """ Prepare the bot for the input candidate."""
        #############################
        # Set some variables
        #############################
        # Path to files/dependencies
        self.path = "/".join([os.getcwd(),"dependencies/"])
        self.candidate = candidate.lower()

        #############################
        # Connect to the SQL database
        #############################
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.response_table = 'response_table'

        ############################
        # Initialize the vectorizer
        ###########################
        if nlp is None:
            self.nlp = spacy.en.English(tagger=True, parser=False,
                    entity=False, matcher=False)
        else:
            self.nlp = nlp

        self.TV = TokenVectorizer(nlp=self.nlp)

        ##################################
        # Set up TopicModeling
        ##################################
        self.TP = TopicModeling(TV=self.TV)

        # Check to see if the candidate specific files are there.
        if os.path.isfile("".join([self.path,self.candidate,"_lsi_index.index"])):
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))
        else:
            corpus_df = self.DB.pull_candidate_corpus('corpus_table', self.candidate)
            self.corpus = corpus_df['speaker_text'].values.tolist()
            self.TP.prepare_candidate_corpus(self.corpus, self.candidate)
            self.index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_tfidf_index.index']))
            self.lsi_index = similarities.MatrixSimilarity.load(
                    "".join([self.path,self.candidate,'_lsi_index.index']))

        #############################
        # Initialize the markov chain
        #############################
        # Get the most recent markov chain (sort so the newest timestamped file is last).
        markov_models = sorted(glob.glob("".join([self.path,
                                                  self.candidate,
                                                  "_markov_models/*"])))
        if len(markov_models)==0:
            corpus = self.get_corpus()
            sorin = MarkovChain(state_size=3)
            sorin.train_model(corpus)

            dt = datetime.datetime.now()
            wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
            timestamp = dt.year*100+wk_of_yr
            fname = "".join([self.path,
                             self.candidate,
                             "_markov_models/",
                             str(timestamp),
                             "_markov_model.pkl"])

            with open(fname, "wb") as f:
                pickle.dump(sorin, f)

            self.markov_model = sorin

        else:
            markov_fname = markov_models[-1]
            with open(markov_fname, 'rb') as pkl_file:
                sorin = pickle.load(pkl_file)
            self.markov_model = sorin

        # Log dictionary for questions and responses
        self.idnum = 0

    def update_model(self, text, sim):

        new_sorin = self.markov_model.update(text, sim)

        dt = datetime.datetime.now()
        wk_of_yr = datetime.date(dt.year, dt.month, dt.day).isocalendar()[1]
        timestamp = dt.year*100+wk_of_yr
        fname = "".join([self.path, self.candidate, "_markov_models/",
                         str(timestamp),"_markov_model.pkl"])

        with open(fname, "wb") as f:
            pickle.dump(new_sorin, f)

        self.markov_model = new_sorin

        return self

    def question_getbest_responses(self, question, nsent=100):

        responses = self.get_responses(num_sent=nsent)
        response_text = [" ".join(response[0].split()) for response in responses]
        best_responses = self.response_tfidf_matches(response_text, question)

        return best_responses

    def ask_question(self, question=None):

        token_list = self.TV.get_tokens(question)

        if token_list is None:
            return None
        else:
            return (question, token_list)

    def get_responses(self, num_sent=100, tries=1, save_to_db=False):

        responses = self.markov_model.make_response(n_sentences=num_sent)

        response_list = []
        for i, response in enumerate(responses):
            token_list = self.TV.get_tokens(response)
            response_list.append((response, token_list))

        return response_list

    def response_tfidf_matches(self, sentences, question, n_return=None):

        index = self.TP.get_corpus_index(sentences)
        sims = index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(len(sentences))]
        else:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def text_tfidf_matches(self, question, n_return=5):

        sims = self.index[self.TP.tfidf[self.TP.get_doc2bow(question)]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(len(self.corpus))]
        else:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def response_lsi_matches(self, sentences, question, n_return=5):

        index = self.TP.get_corpus_index(sentences, lsi=True)
        sims = index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(len(sentences))]
        else:
            return [(sims[i][0],sentences[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def text_lsi_matches(self, question, n_return=5):

        sims = self.lsi_index[self.TP.lsi[self.TP.tfidf[self.TP.get_doc2bow(question)]]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        if n_return is None:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(len(self.corpus))]
        else:
            return [(sims[i][0],self.corpus[sims[i][0]],sims[i][1]) for i in range(n_return)]

    def get_corpus(self):

        df = self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)

        corpus = deque()
        for doc in self.nlp.pipe(df['speaker_text'], batch_size=50, n_threads=1):
            pos_tagged_sentence = list()
            for tok in doc:
                if tok.like_url or tok.like_email:
                    continue
                if tok.is_alpha:
                    pos_tagged_sentence.append('::'.join([tok.orth_, tok.pos_]))
                elif tok.is_punct and tok.text in [',', '.', '?', '!']:
                    pos_tagged_sentence.append(tok.text)
                elif "\'" in tok.text and tok.lemma_ not in string.punctuation:
                    pos_tagged_sentence.append(tok.lemma_)
            corpus.append(pos_tagged_sentence)

        return list(corpus)
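
A short usage sketch for the PoliBot class above, assuming the dependencies/ directory already holds the shared gensim models and that a candidate corpus is available in corpus_table; the candidate name, question text, and printed output are illustrative only.

# Hypothetical usage sketch for the PoliBot class above.
bot = PoliBot('sanders')                                      # loads or builds per-candidate indexes
q = bot.ask_question("What is your plan for health care?")
if q is not None:
    responses = bot.get_responses(num_sent=50)                # Markov-generated candidate sentences
    texts = [r[0] for r in responses]
    best = bot.response_lsi_matches(texts, q[0], n_return=3)  # (index, sentence, similarity) tuples
    for _, sentence, score in best:
        print(round(float(score), 3), sentence)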
Example #8
class PoliBot(object):
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))

        self.ID = str(self.idnum)+'_'+str(ts)
        self.idnum+=1

        try:
            tokens = self.TV.tokenize_full(question)
        except:
            tokens = []

        try:
            word_string = [str(t) for t in tokens]
        except:
            word_string = ""

        try:
            tokens_vect = self.TV.make_vector(word_string)
        except:
            tokens_vect=[]

        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect)/len(tokens_vect)
        else:
            question_vect = tokens_vect

        if len(question_vect)==1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
                    'question_id':[self.ID],
                    'question_date':[self.date],
                    'question_time':[self.time],
                    'question_sent':[question],
                    'question_tokens':[tokens]}

        self.response_log = {
                    'response_id':[self.ID],
                    'response_date':[self.date],
                    'response_time':[self.time],
                    'response_candidate':[self.candidate],
                    'response_sent':[],
                    'response_tokens':[],
                    'cosine_sim':[0],
                    'question_id':[self.question_log['question_id'][0]]
                    }

        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)

        cosine_sims = [0]
        all_tokens = []
        for i, sent in enumerate(generated_sentences):
            if sent is None:
                continue
            else:
                tokens = self.TV.tokenize_full(sent)
                if tokens is None:
                    continue
                else:
                    word_string = [str(t) for t in tokens]
                    tokens_vect = self.TV.make_vector(word_string)

                if len(tokens_vect) > 1:
                    response_vect = sum(tokens_vect)/len(tokens_vect)
                else:
                    response_vect = tokens_vect

            # Cosine similarity
            try:
                cosine_sim_0 = cosine(response_vect,self.question_vect)
            except:
                continue

            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]

                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)
            else:
                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)

        if (self.responseLOOPcounter < tries) and (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter+=1
            self.response(num_sent=num_sent, tries=tries)
        else:
            self.response_log['cosine_sim_dist'] = \
                    [(np.mean(cosine_sims),np.std(cosine_sims))]

            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")

        return self.response_log['response_sent'][0]

    def get_corpus(self):

        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
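
A hedged usage sketch for this earlier PoliBot variant, which scores Markov-generated sentences by cosine similarity against the question vector; the candidate and question are assumptions.

# Hypothetical usage sketch for the earlier PoliBot variant above.
bot = PoliBot('clinton')
bot.ask_question("How would you address climate change?")      # prepares the question/response logs
answer = bot.response(num_sent=100, tries=5, save_to_db=False) # best-scoring generated sentence
print(answer)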
Example #9
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            for html_link in vals:
                line_of_doc_cter = 0

                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers) == 0:
                    speakers = [str(key_speaker)] * len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([
                            str(doc_type),
                            str(self.doc_date),
                            str(line_of_doc_cter)
                        ])

                        html_link_dict = {
                            "corpus_id": [corpus_ID],
                            "link": [html_link],
                            "doc_id": ["_".join([key, str(self.doc_date)])],
                            "doc_type": [doc_type],
                            "doc_date": [int(self.doc_date)],
                            "speaker": [speaker.lower()],
                            "speaker_text": [s],
                            "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter += 1

            print("Done with %s - %s" % (key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- what the speaker said.

           Useful, specifically for debates. clean_html will provide, in
           chronological order, the speaker: what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span', class_='docdate'))
        date_str = " ".join(re.split(' |, ',
                                     re.sub(r"<.+?>|", "", date)[1:-1]))
        stime = time.strptime(date_str, "%B %d %Y")
        self.doc_date = str((stime[0] * 10000) + (stime[1] * 100) + (stime[2]))

        text_only = str(soupy.find_all('span', class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]", "", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[
                        -1] = speaker_text[-1] + ' ' + clean_each_split[0]
                except:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['democratic_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',  #Brooklyn, New York; April 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',  #Miami, Florida; March 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',  #Flint, Michigan; March 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',  #Milwaukee, Wisconsin; February 11, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',  #Durham, New Hampshire; February 4, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',  #Charleston, South Carolina; January 17, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',  #Manchester, New Hampshire; December 19, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',  #Des Moines, Iowa; November 14, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903',  #Las Vegas, Nevada; October 13, 2015
        ]

        link_dict['republican_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',  #Miami, Florida; March 10, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',  #Detroit, Michigan; March 3, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',  #Houston, Texas; February 25, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',  #Greenville, South Carolina; February 13, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',  #Manchester, New Hampshire; February 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',  #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',  #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',  #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',  #Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',  #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',  #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',  #Cleveland, Ohio; August 6, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',  #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',  #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',  #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',  #Milwaukee, Wisconsin: November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',  #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',  #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757'  #Cleveland, Ohio; August 6, 2015
        ]

        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'
        ]

        link_dict['speech_kaisch'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116599',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=113069',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116546'
        ]

        link_dict['speech_cruz'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117232',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116598',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114768',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110030',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=109774'
        ]

        link_dict['speech_malley'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112703',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112702',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112696',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112699',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112704',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112700',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112716'
        ]

        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111586',  #Rachel Maddow of MSNBC; February 8, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111589',  #Jake Tapper of CNN's "State of the Union"; February 7, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111587',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111585',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111591',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111595',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111592',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111439',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111593',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111594',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111414',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111436',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111435',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111434',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111433',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111432',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111431',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111430',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111429',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111428',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111416',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111425',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111424',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111423',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111422',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111417',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111418',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111421',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110267',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110269',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110268',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110270',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110271',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111420'
        ]

        link_dict['speech_sanders'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117513',  #Steve Inskeep of National Public Radio; May 5, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117194',  #Conference Hosted by the Pontifical Academy of Social Sciences in Vatican City; April 15, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116694',  #Remarks on Policy in the Middle East in Salt Lake City, Utah; March 21, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117516',  #Remarks in Essex Junction, Vermont Following the "Super Tuesday" Primaries, March 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117511',  #Remarks in Concord Following the New Hampshire Primary; February 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111440',  #Remarks in Des Moines Following the Iowa Caucus; February 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117514',  #Remarks in a Meeting with Steelworkers in Des Moines, Iowa; January 26, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114487',  #Remarks at the New Hampshire Democratic Party Jefferson-Jackson Dinner in Manchester; November 29, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117517',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114493',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114491',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114486',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114488',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114494',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114490',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114495',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114492',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110222',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110221',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110125',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110124'
        ]

        return link_dict

    #DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]', '', mt.tolist())
                giant_text_str += re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
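
Assuming the corpus_table exists in the database and the UCSB links above are reachable, the scraper might be driven end to end with two calls; this is a sketch, not part of the original listing.

# Hypothetical usage sketch: scrape, sentence-split, and persist the corpus.
builder = BuildSaveCorpus()   # collects the link dictionary and opens the DB connection
builder.build_corpus()        # downloads each page, splits by speaker and sentence, saves rows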
Example #10
class TopicModeling(object):
    def __init__(self, TV=None):
        self.path = "/".join([os.getcwd(),"dependencies/"])

        if TV is None:
            self.TV = TokenVectorizer()
        else:
            self.TV = TV

        # Check to see if dictionary exists
        if os.path.isfile(self.path+'topic_dict'):
            self.topic_dict = corpora.Dictionary.load(self.path+'topic_dict')
        else:
            self.topic_dict = self.get_topic_dict()

        if os.path.isfile(self.path+'tfidf_model'):
            self.tfidf = models.TfidfModel.load(self.path+'tfidf_model')
        else:
            self.tfidf = self.train_tfidf()

        #if os.path.isfile(self.path+'lda_model'):
        #    self.lda = models.ldamodel.LdaModel.load(self.path+'lda_model')
        #else:
        #    self.lda = self.train_lda_model()

        if os.path.isfile(self.path+'lsi_model'):
            self.lsi = models.LsiModel.load(self.path+'lsi_model')
        else:
            self.lsi = self.train_lsi_model()

    def prepare_candidate_corpus(self, input_text, candidate):
        fname = "".join([self.path,candidate])

        texts = [[str(word).lower() for word in self.TV.get_tokens(doc)]
                 for doc in input_text]

        corpus = [self.topic_dict.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize("".join([fname,"_corpus.mm"]), corpus)
        corpora.MmCorpus.serialize("".join([fname,"_tfidf_corpus.mm"]),self.tfidf[corpus])
        corpora.MmCorpus.serialize("".join([fname,"_lsi_corpus.mm"]),self.lsi[self.tfidf[corpus]])

        mm_lsi = corpora.MmCorpus("".join([fname,"_lsi_corpus.mm"]))
        mm_tfidf = corpora.MmCorpus("".join([fname,"_tfidf_corpus.mm"]))

        tfidf_index = similarities.SparseMatrixSimilarity(mm_tfidf, num_features=mm_tfidf.num_terms)
        tfidf_index.save("".join([fname,"_tfidf_index.index"]))

        lsi_index = similarities.MatrixSimilarity(mm_lsi, num_features=mm_lsi.num_terms)
        lsi_index.save("".join([fname,"_lsi_index.index"]))

        #lda_index = similarities.MatrixSimilarity(self.lda[mm_tfidf])
        #lda_index.save("".join([fname,"_lda_index.index"]))

    def get_cosinesim(self, vec1, vec2):

        return matutils.cossim(vec1, vec2)

    def get_helldist(self, vec1, vec2, lda_topics):

        dense1 = matutils.sparse2full(vec1, lda_topics)
        dense2 = matutils.sparse2full(vec2, lda_topics)

        return np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2))**2).sum())

    def get_doc2bow(self, string):

        new_vec = self.topic_dict.doc2bow(self.TV.get_tokens(string))

        return new_vec

    def get_sparse_index(self, corpus=None):

        if corpus is None:
            mm = corpora.MmCorpus(self.path+'corpus_tfidf.mm')
        else:
            mm = self.tfidf[corpus]

        index = similarities.SparseMatrixSimilarity(mm,
                num_features=mm.num_terms)

        return index

    def get_dense_index(self,corpus=None):

        if corpus is None:
            mm = corpora.MmCorpus(self.path+'corpus_lsi.mm')
        else:
            mm = corpus

        index = similarities.MatrixSimilarity(mm,
                num_features=mm.num_terms)

        return index

    def get_corpus_index(self, input_text, lsi=False):

        corpus = [self.get_doc2bow(text) for text in input_text]
        if lsi is False:
            index = similarities.MatrixSimilarity(self.tfidf[corpus])
        else:
            index = similarities.MatrixSimilarity(
                    self.lsi[self.tfidf[corpus]])
        #corpora.MmCorpus.serialize('/tmp/corpus.mm', self.tfidf[corpus])
        #mm = corpora.MmCorpus('/tmp/corpus.mm')

        return index

    def get_corpus_tfidf_index(self, input_text):

        corpus = [self.get_doc2bow(text) for text in input_text]
        index = similarities.MatrixSimilarity(self.tfidf[corpus])
        #corpora.MmCorpus.serialize('/tmp/corpus.mm', self.tfidf[corpus])
        #mm = corpora.MmCorpus('/tmp/corpus.mm')

        return index

    def get_main_corpus(self):
        DB = ConnectToDB()

        sql_query = """
        SELECT doc_id, doc_date, line_of_doc, speaker_text
        FROM corpus_table
        ORDER BY doc_date DESC, line_of_doc ASC;
        """

        # Get entire corpus from database
        corpus_df = DB.pull_from_db(sql_query)
        docs = [sent for sent in corpus_df['speaker_text']]

        texts = [[str(word).lower() for word in self.TV.get_tokens(doc)]
                 for doc in docs]

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1]
                  for text in texts]

        return texts

    def get_topic_dict(self):

        texts = self.get_main_corpus()

        dictionary = corpora.Dictionary(texts)
        dictionary.save(self.path+'topic_dict')

        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize(self.path+'corpus.mm', corpus)

        return dictionary

    def train_tfidf(self):

        corpus = corpora.MmCorpus(self.path+'corpus.mm')

        tfidf = models.TfidfModel(corpus, id2word=self.topic_dict)
        tfidf.save(self.path+'tfidf_model')

        corpora.MmCorpus.serialize(self.path+'corpus_tfidf.mm', tfidf[corpus])

        return tfidf

    def train_lda_model(self):

        mm = corpora.MmCorpus(self.path+'corpus.mm')
        lda = models.ldamodel.LdaModel(corpus=self.tfidf[mm], id2word=self.topic_dict,
                       num_topics=10, update_every=1, chunksize=512, passes=10)

        lda.save(self.path+'lda_model')

        return lda

    def train_lsi_model(self):

        corpus = corpora.MmCorpus(self.path+'corpus.mm')

        lsi = models.LsiModel(self.tfidf[corpus], id2word=self.topic_dict,
                num_topics=200)

        lsi.save(self.path+'lsi_model')
        corpora.MmCorpus.serialize(self.path+'corpus_lsi.mm', lsi[self.tfidf[corpus]])

        return lsi
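
A brief sketch of how the trained models might be queried once the class above has loaded or built them; the example documents and query string are assumptions.

# Hypothetical usage sketch for the TopicModeling class above.
tm = TopicModeling()                                   # loads or trains dictionary, TF-IDF, LSI
docs = ["We will rebuild the economy.",
        "Health care is a right, not a privilege."]
index = tm.get_corpus_index(docs, lsi=True)            # dense LSI similarity index over docs
query = tm.lsi[tm.tfidf[tm.get_doc2bow("jobs and the economy")]]
print(list(index[query]))                              # similarity of the query against each doc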
Example #11
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            line_of_doc_cter = 0
            for html_link in vals:
                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers)==0:
                    speakers = [str(key_speaker)]*len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                             str(self.doc_date),
                                             str(line_of_doc_cter)])

                        html_link_dict = {
                                "corpus_id": [corpus_ID],
                                "link": [html_link],
                                "doc_id": ["_".join([key,str(self.doc_date)])],
                                "doc_type": [doc_type],
                                "doc_date": [int(self.doc_date)],
                                "speaker": [speaker.lower()],
                                "speaker_text": [s],
                                "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter+=1

            print("Done with %s - %s" %(key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- what the speaker said.

           Useful, specifically for debates. clean_html will provide, in
           chronological order, the speaker: what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span',class_='docdate'))
        date_str = " ".join(re.split(' |, ',re.sub(r"<.+?>|","", date)[1:-1]))
        stime = time.strptime(date_str,"%B %d %Y")
        self.doc_date = str((stime[0]*10000)+(stime[1]*100)+(stime[2]))

        text_only = str(soupy.find_all('span',class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]","", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[-1] = speaker_text[-1]+' '+clean_each_split[0]
                except:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['debate'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111177',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110908',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110906',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110756',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110489',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111176',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110909',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110907',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110758',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110757',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116995',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110910',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110903'
        ]

        link_dict['speech_trump'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'

        ]

        link_dict['speech_clinton'] = [
                'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
                'http://www.presidency.ucsb.edu/ws/index.php?pid=111596'
        ]

        self.html_links = link_dict

    #DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                re_mt = re.sub(r'\[.+?\]','', mt.tolist())
                giant_text_str+=re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
Example #12
class BuildSaveCorpus(object):
    def __init__(self):
        # Get the html links
        self.html_links = self.get_html_links()
        # Tokenizer!
        self.TV = TokenVectorizer()
        #
        self.DB = ConnectToDB()
        self.table_name = 'corpus_table'

    def build_corpus(self):

        for key, vals in self.html_links.items():

            if 'speech' in key:
                key_speaker = key.split('_')[1]
                doc_type = key.split('_')[0]
            else:
                doc_type = key

            for html_link in vals:
                line_of_doc_cter = 0

                html_text = self.fetch_data(html_link)
                speakers, speakers_text = self.clean_html(html_text)

                if len(speakers)==0:
                    speakers = [str(key_speaker)]*len(speakers_text)

                for speaker, speaker_text in zip(speakers, speakers_text):
                    sentences = self.TV.tokenize_tosentence(speaker_text)

                    for s in sentences:
                        corpus_ID = "_".join([str(doc_type),
                                             str(self.doc_date),
                                             str(line_of_doc_cter)])

                        html_link_dict = {
                                "corpus_id": [corpus_ID],
                                "link": [html_link],
                                "doc_id": ["_".join([key,str(self.doc_date)])],
                                "doc_type": [doc_type],
                                "doc_date": [int(self.doc_date)],
                                "speaker": [speaker.lower()],
                                "speaker_text": [s],
                                "line_of_doc": [line_of_doc_cter]
                        }

                        self.save_corpus(html_link_dict)
                        print(line_of_doc_cter, s)

                        line_of_doc_cter+=1

            print("Done with %s - %s" %(key, html_link))

    def save_corpus(self, out_dict):
        '''
        Save dictionary to SQL database
        '''
        self.DB.save_to_db(self.table_name, out_dict)

    def fetch_data(self, htmlfile):
        '''Grabs and opens the html file given the url address'''
        url = htmlfile
        if url is None:
            print("No URL Provided")
            return None

        response = urllib.request.urlopen(url)
        return response.read()

    def clean_html(self, htmltext):
        '''Uses beautifulsoup to parse the html file and clean it a bit

           Returns two different arrays:
                speaker -- who was talking
                speaker_text -- what the speaker said.

           Useful, specifically for debates. clean_html will provide, in
           chronological order, the speaker: what they said.
        '''
        soupy = BS(htmltext, 'lxml')

        # Get the document date. Save "March 16, 2015" as "20150316"
        date = str(soupy.find_all('span',class_='docdate'))
        date_str = " ".join(re.split(' |, ',re.sub(r"<.+?>|","", date)[1:-1]))
        stime = time.strptime(date_str,"%B %d %Y")
        self.doc_date = str((stime[0]*10000)+(stime[1]*100)+(stime[2]))

        text_only = str(soupy.find_all('span',class_='displaytext'))
        speaker = []
        speaker_text = []
        for each in text_only[1:-1].split('<p>'):
            clean_each = re.sub(r"<.+?>|\[.+?\]","", each)

            clean_each_split = clean_each.split(':')
            #print(clean_each_split)
            if len(clean_each_split) > 1:
                speaker.append(clean_each_split[0])
                try:
                    speaker_text.append(clean_each_split[1])
                except (AttributeError, TypeError):
                    pdb.set_trace()
            else:
                try:
                    speaker_text[-1] = speaker_text[-1]+' '+clean_each_split[0]
                except:
                    speaker_text.append(clean_each_split[0])

        return speaker, speaker_text

    def get_html_links(self):

        link_dict = {}

        link_dict['democratic_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116995', #Brooklyn, New York; April 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112719', #Miami, Florida; March 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112718', #Flint, Michigan; March 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111520', #Milwaukee, Wisconsin; February 11, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111471', #Durham, New Hampshire; February 4, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111409', #Charleston, South Carolina; January 17, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111178', #Manchester, New Hampshire; December 19, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110910', #Des Moines, Iowa; November 14, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110903', #Las Vegas, Nevada; October 13, 2015
        ]

        link_dict['republican_debate'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=115148', #Miami, Florida; March 10, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111711', #Detroit, Michigan; March 3, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111634', #Houston, Texas; February 25, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111500', #Greenville, South Carolina; February 13, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111472', #Manchester, New Hampshire; February 6, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111412', #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111395', #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111177', #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110908', #Milwaukee, Wisconsin; November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110906', #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110756', #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110489', #Cleveland, Ohio; August 6, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111413', #Des Moines, Iowa; January 28, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111394', #North Charleston, South Carolina; January 14, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111176', #Las Vegas, Nevada; December 15, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110909', #Milwaukee, Wisconsin: November 10, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110907', #Boulder, Colorado; October 28, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110758', #Simi Valley, California; September 16, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110757'  #Cleveland, Ohio; August 6, 2015
        ]

        link_dict['speech_trump'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110306',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116597'

        ]

        link_dict['speech_kaisch'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116599',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=113069',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116546']

        link_dict['speech_cruz'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117232',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116598',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114768',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110030',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=109774']

        link_dict['speech_malley'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112703',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112702',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112696',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112699',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112704',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112700',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=112716'
        ]

        link_dict['speech_clinton'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116600',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111596',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111586', #Rachel Maddow of MSNBC; February 8, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111589', #Jake Tapper of CNN's "State of the Union"; February 7, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111587',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111585',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111591',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111595',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111592',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111439',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111593',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111594',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111414',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111436',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111435',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111434',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111433',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111432',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111431',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111430',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111429',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111428',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111416',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111426',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111425',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111424',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111423',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111422',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111417',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111418',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111421',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111419',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110267',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110269',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110268',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110270',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110271',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111420'
        ]

        link_dict['speech_sanders'] = [
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117513', #Steve Inskeep of National Public Radio; May 5, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117194', #Conference Hosted by the Pontifical Academy of Social Sciences in Vatican City; April 15, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=116694', #Remarks on Policy in the Middle East in Salt Lake City, Utah; March 21, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117516', #Remarks in Essex Junction, Vermont Following the "Super Tuesday" Primaries, March 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117511', #Remarks in Concord Following the New Hampshire Primary; February 9, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=111440', #Remarks in Des Moines Following the Iowa Caucus; February 1, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117514', #Remarks in a Meeting with Steelworkers in Des Moines, Iowa; January 26, 2016
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114487', #Remarks at the New Hampshire Democratic Party Jefferson-Jackson Dinner in Manchester; November 29, 2015
            'http://www.presidency.ucsb.edu/ws/index.php?pid=117517',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114493',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114491',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114486',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114488',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114494',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114489',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114490',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114495',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=114492',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110222',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110221',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110125',
            'http://www.presidency.ucsb.edu/ws/index.php?pid=110124'
        ]

        return link_dict

    #DEFUNCT
    def combine_speakers(self):
        speakers = self.speakers
        text = self.speakers_text
        uniq_speakers = list(set(speakers))

        speaker_dict = {}

        speakerarr = np.array(speakers)
        textarr = np.array(text)
        for us in uniq_speakers:
            match = np.where(speakerarr == us)[0]
            matched_text = textarr[match]

            giant_text_str = ''
            for mt in matched_text:
                # Strip bracketed stage directions (e.g. [applause]) before concatenating
                re_mt = re.sub(r'\[.+?\]', '', str(mt))
                giant_text_str += re_mt

            speaker_dict[us] = giant_text_str

        self.speaker_dict = speaker_dict
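
For reference, a minimal standalone sketch (not from the original source) of the grouping step above, using plain Python lists and hypothetical sample data in place of the class attributes and numpy arrays:

import re

# Hypothetical sample data standing in for self.speakers and self.speakers_text
speakers = ['clinton', 'sanders', 'clinton']
texts = ["Thank you. [applause]", "We need real change.", "Let's get started. [laughter]"]

speaker_dict = {}
for speaker, text in zip(speakers, texts):
    # Strip bracketed stage directions such as [applause] before concatenating
    cleaned = re.sub(r'\[.+?\]', '', text)
    speaker_dict[speaker] = speaker_dict.get(speaker, '') + cleaned

# speaker_dict now maps each unique speaker to one concatenated text string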
Exemplo n.º 13
0
class PoliBot(object):
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))

        self.ID = str(self.idnum) + '_' + str(ts)
        self.idnum += 1

        # Tokenize the question; fall back to an empty token list on failure
        try:
            tokens = self.TV.tokenize_full(question)
        except Exception:
            tokens = []

        try:
            word_string = [str(t) for t in tokens]
        except Exception:
            word_string = []

        # Vectorize the tokens; fall back to an empty vector list on failure
        try:
            tokens_vect = self.TV.make_vector(word_string)
        except Exception:
            tokens_vect = []

        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect) / len(tokens_vect)
        else:
            question_vect = tokens_vect

        if len(question_vect) == 1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
            'question_id': [self.ID],
            'question_date': [self.date],
            'question_time': [self.time],
            'question_sent': [question],
            'question_tokens': [tokens]
        }

        self.response_log = {
            'response_id': [self.ID],
            'response_date': [self.date],
            'response_time': [self.time],
            'response_candidate': [self.candidate],
            'response_sent': [],
            'response_tokens': [],
            'cosine_sim': [0],
            'question_id': [self.question_log['question_id'][0]]
        }

        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)

        cosine_sims = [0]
        all_tokens = []
        for sent in generated_sentences:
            if sent is None:
                continue
            tokens = self.TV.tokenize_full(sent)
            if tokens is None:
                continue
            word_string = [str(t) for t in tokens]
            tokens_vect = self.TV.make_vector(word_string)

            # Average the token vectors into a single response vector
            if len(tokens_vect) > 1:
                response_vect = sum(tokens_vect) / len(tokens_vect)
            else:
                response_vect = tokens_vect

            # Cosine similarity between the candidate response and the question
            try:
                cosine_sim_0 = cosine(response_vect, self.question_vect)
            except Exception:
                continue

            # Keep the best-matching sentence seen so far
            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]

            cosine_sims.append(cosine_sim_0)
            all_tokens.append(tokens)

        if (self.responseLOOPcounter < tries) and (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter += 1
            # Retry with a fresh batch of sentences; pass save_to_db through
            self.response(num_sent=num_sent, tries=tries, save_to_db=save_to_db)
        else:
            self.response_log['cosine_sim_dist'] = \
                    [(np.mean(cosine_sims),np.std(cosine_sims))]

            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")

        return self.response_log['response_sent'][0]

    def get_corpus(self):
        # Pull this candidate's full speech corpus from the SQL corpus table
        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
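
A minimal usage sketch (not part of the original source), assuming the dependencies shown earlier (ConnectToDB, TokenVectorizer, MarkovChain) are importable and corpus_table has already been populated by the scraper classes:

# Instantiate the bot for one candidate, ask a question, and generate a reply.
bot = PoliBot('sanders')
bot.ask_question("What is your plan for health care?")
# response() returns the generated sentence most similar to the question vector
answer = bot.response(num_sent=100, tries=10, save_to_db=False)
print(answer)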