import random

# The Database and Tokeniser imports below assume one module per class;
# adjust the paths to match the project's actual layout.
from database import Database
from tokeniser import Tokeniser


class Markov:
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()
        self.tokeniser = Tokeniser(database, ngram_size)
        self.original_titles = self.tokeniser.all_article_titles()
        self.memory = {}
        self.learn(self.tokeniser.title_ngrams())

    def learn(self, ngrams):
        """
        Accept a dictionary of {article_id: [list of title ngrams]} and
        modify self.memory to record the words that follow each word in
        the lexicon. Assumes bigrams: each ngram unpacks into an
        (origin_word, target_word) pair.
        """
        for list_of_ngrams in ngrams.values():
            for origin_word, target_word in list_of_ngrams:
                if origin_word not in self.memory:
                    self.memory[origin_word] = []
                self.memory[origin_word].append(target_word)

    def generate_title(self, number=1):
        """
        Generate a title, returning a dictionary of the form:
        {probability: ('title', article_id, [individual token probabilities])}
        where article_id is None, or an integer if the generated title is
        overfitted and already exists on the webpage as a title.
        Optionally pass a number to generate many titles; defaults to one.
        """
        output = dict()  # Stub: generation is not yet implemented.
        return output

    def next_word(self, current_word):
        """
        Return a tuple consisting of a randomly chosen next word given the
        current word, and the probability of that word being chosen.
        """
        next_possible_words = self.memory.get(current_word)
        if not next_possible_words:
            # Fall back to the whole lexicon. list() is required because
            # random.choice() and .count() do not accept a dict view.
            next_possible_words = list(self.memory.keys())
        next_word = random.choice(next_possible_words)
        next_word_probability = next_possible_words.count(next_word) / len(
            next_possible_words)
        return next_word, next_word_probability

    def main(self):
        pass
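# A minimal usage sketch. 'articles.db' is a hypothetical SQLite file that
# the Scraper is assumed to have populated already; ngram_size=2 matches
# the bigram pairs that learn() expects.
if __name__ == '__main__':
    markov = Markov('articles.db', ngram_size=2)
    print(markov.next_word('_^'))  # e.g. ('Hello', 0.02)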
import re
import time
from datetime import datetime
from random import randint

import requests
from bs4 import BeautifulSoup

# The Database and Helper imports below assume one module per class;
# adjust the paths to match the project's actual layout.
from database import Database
from helper import Helper


class Scraper:
    def __init__(self, homepage, database):
        self.database = Database(database)
        self.connection = self.database.connect()
        self.homepage = homepage

    def scrape_all_articles(self):
        """ Scrape and create a database record for each article. """
        all_months = self.months_with_articles_to_scrape()
        article_columns = self.database.table_columns('articles')
        sql_article_where_url = " SELECT * FROM articles WHERE url = ? "
        for month in all_months:
            for article in self.scrape_articles_in_month(month):
                existing_article = Database.execute(
                    self.connection, sql_article_where_url,
                    (article[0], )).fetchone()
                # Skip the article if it already exists in the database.
                if existing_article:
                    existing_article = dict(
                        zip(article_columns,
                            existing_article))  # {column: data, ...}
                    date_published = datetime.fromtimestamp(
                        int(existing_article['date_published'])).strftime(
                            '%Y/%m/%d')
                    print(
                        "| An entry for {} ({}) already exists in the database."
                        .format(existing_article['title'], date_published))
                else:
                    # Relies on the scrape_article() payload order
                    # (url, date_published, title, paragraphs) aligning
                    # with the 'articles' table's column order.
                    article = dict(zip(article_columns, article))
                    sql_insert_article = " INSERT INTO articles(title, url, paragraphs, date_published) VALUES(?,?,?,?) "
                    Database.execute(
                        self.connection, sql_insert_article,
                        (article['title'], article['url'],
                         # sqlite3 cannot bind a list directly, so store the
                         # paragraph list as its string representation.
                         str(article['paragraphs']),
                         article['date_published']))
                    self.connection.commit()
                    date_published = datetime.fromtimestamp(
                        article['date_published']).strftime('%Y/%m/%d')
                    print("+ Added article {} ({}) to the database.".format(
                        article['title'], date_published))

    def months_with_articles_to_scrape(self):
        """
        Return a list of months ['YYYY/MM', ...] for which articles have
        not yet been scraped.
        """
        months = self.update_months_table()
        if not months:
            print("The list of months to scrape is empty. "
                  "update_months_table() returned no months.")
            return []
        return ['/'.join(month) for month in months]

    def update_months_table(self):
        """
        Populate the 'months' table with (year, month) records. Return a
        list containing what was previously the last month in the database,
        followed by all newly inserted months.
        """
        months = []
        sql_get_last_month = " SELECT * FROM months ORDER BY year DESC, month DESC LIMIT 1 "
        last_month = Database.execute(self.connection, sql_get_last_month,
                                      ()).fetchone()
        # If there are no entries in the 'months' table, default to 1970/01/01.
        if last_month is None:
            last_month = (0, '1970', '01')
        last_month_timestamp = Helper.to_timestamp('/'.join(last_month[1:]),
                                                   '%Y/%m')
        # Scrape the homepage's sidebar for links to month subdirectories.
        page = requests.get(self.homepage).text
        for link in BeautifulSoup(page, 'html.parser').find_all(
                'a', attrs={'href': re.compile(r"/\d{4}/\d{2}$")}):
            url = link.get('href')
            human_readable = url[-7:]
            link_timestamp = Helper.to_timestamp(human_readable, '%Y/%m')
            # Build a list of months not currently held in the database.
            if link_timestamp > last_month_timestamp:
                human_readable = human_readable.split('/')
                months.append((human_readable[0], human_readable[1]))
        months.reverse()  # List months in chronological order.
        if months:
            sql_insert_months = " INSERT INTO months(year, month) VALUES(?,?) "
            Database.execute_many(self.connection, sql_insert_months, months)
            self.connection.commit()
            print("\nAdded {} rows:\n{}\nto the table 'months'.\n".format(
                len(months), months))
        # Prepend what was previously the last entry in the 'months' table.
        if last_month[0] != 0:
            months.insert(0, last_month[1:])
        return months

    def scrape_articles_in_month(self, month):
        """
        Return a list of tuples of all articles (and their metadata) for a
        given month.
        """
        urls, articles = [], []
        page = requests.get(self.homepage + month).text
        # Collect the archive links, then reverse them so that the articles
        # are visited in chronological order.
        for link in BeautifulSoup(page, "html.parser").find_all(
                'a', attrs={'class': 'archive'}):
            urls.append(link.get('href'))
        urls.reverse()
        for url in urls:
            articles.append(self.scrape_article(url))
        return articles

    @staticmethod
    def scrape_article(url):
        """
        Return a payload containing article metadata and a list of the
        article's paragraphs.
        """
        time.sleep(randint(120, 260) / 100)  # Wait between requests.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        page = requests.get(url, headers=headers).text
        markup = BeautifulSoup(page, "html.parser")
        entry_date = markup.find('time',
                                 attrs={'class': 'entry-date'})['datetime']
        timestamp = Helper.to_timestamp(entry_date, '%Y-%m-%dT%H:%M:%S+00:00')
        title = markup.find('h1', attrs={
            'class': 'entry-title'
        }).find('a').text
        entry_content = markup.find('div', attrs={
            'class': 'entry-content'
        }).find_all('p')
        # Remove non-breaking spaces and newlines, then drop any paragraphs
        # left empty.
        paragraphs = [
            re.sub(r'[\xa0\n]', '', tag.text) for tag in entry_content
        ]
        paragraphs = [paragraph for paragraph in paragraphs if paragraph != '']
        payload = (url, timestamp, title, paragraphs)
        return payload

    def main(self):
        self.scrape_all_articles()
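# A minimal usage sketch. The homepage URL is a hypothetical WordPress-style
# blog whose sidebar links to '/YYYY/MM' archive pages, and 'articles.db' is
# a local SQLite file that Database() is assumed to manage.
if __name__ == '__main__':
    scraper = Scraper('https://example-blog.com/', 'articles.db')
    scraper.main()  # Scrape every unseen month and store each article.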
# The Database import below assumes one module per class; adjust the path
# to match the project's actual layout.
from database import Database


class Tokeniser:
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()
        self.ngram_size = ngram_size

    def title_ngrams(self, tokenised_titles=None):
        """
        Return a dictionary of the form {article_id: [list of title ngrams]}.
        """
        if tokenised_titles is None:
            tokenised_titles = self.tokenise_titles()
        title_ngrams = dict()
        # Discard utterances shorter than the ngram_size, then print the
        # number of titles used to generate ngrams.
        for (id_number, title) in tokenised_titles.items():
            if len(title) - 2 >= self.ngram_size:
                title_ngrams[id_number] = self.ngram(title, self.ngram_size)
        message = "\n{} titles out of a total of {} were converted into ngrams."
        print(message.format(len(title_ngrams), len(tokenised_titles)))
        return title_ngrams

    @staticmethod
    def ngram(tokens, ngram_size):
        """
        Return a list of ngrams of size 'ngram_size' for the given list of
        tokens.
        """
        number_of_tokens = len(tokens)
        # Halt program execution for invalid values of n. Subtract 2 to
        # account for the inserted utterance start/end tokens.
        if ngram_size == 0 or ngram_size > (number_of_tokens - 2):
            raise ValueError(
                "The value of ngram_size cannot be 0 or larger than the "
                "number of tokens in the shortest utterance.")
        # Slide a window of ngram_size across the token list.
        return [
            tuple(tokens[i:i + ngram_size])
            for i in range(number_of_tokens - ngram_size + 1)
        ]

    def tokenise_titles(self, article_titles=None):
        """
        Return a dictionary of the form
        {article_id: ['_^', 'word1', ..., 'word_n', '$_'], ...}.
        """
        if article_titles is None:
            article_titles = self.all_article_titles()
        tokenised_titles = {
            id_number: title.split(' ')
            for (id_number, title) in article_titles
        }
        for title in tokenised_titles.values():
            title.insert(0, '_^')  # Utterance start marker.
            title.append('$_')  # Utterance end marker.
        return tokenised_titles

    def all_article_titles(self):
        """
        Return a list of all the titles in the 'articles' table, of the
        form [(id, 'title'), ...].
        """
        sql_all_article_titles = " SELECT id, title FROM articles WHERE title IS NOT NULL "
        all_article_titles = Database.execute(self.connection,
                                              sql_all_article_titles,
                                              ()).fetchall()
        # Add a space before punctuation so that it can be tokenised.
        all_article_titles = [(title[0], title[1].replace('?', ' ?'))
                              for title in all_article_titles]
        return all_article_titles

    def main(self):
        self.all_article_titles()
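# A minimal sketch of Tokeniser.ngram() on a hand-built token list; the
# '_^'/'$_' markers are the utterance boundaries tokenise_titles() inserts.
if __name__ == '__main__':
    tokens = ['_^', 'Hello', 'world', '?', '$_']
    print(Tokeniser.ngram(tokens, 2))
    # -> [('_^', 'Hello'), ('Hello', 'world'), ('world', '?'), ('?', '$_')]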