Example #1
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.tokeniser = Tokeniser(database, ngram_size)
        self.original_titles = self.tokeniser.all_article_titles()

        self.memory = {}
        self.learn(self.tokeniser.title_ngrams())
Example #2
import random

class Markov:
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.tokeniser = Tokeniser(database, ngram_size)
        self.original_titles = self.tokeniser.all_article_titles()

        self.memory = {}
        self.learn(self.tokeniser.title_ngrams())

    """
    Accepts a dictionary of {article_id: [list of title ngrams]} and modifies self.memory
    to learn the most likely word to follow each word in the lexicon.
    """

    def learn(self, ngrams):
        for list_of_ngrams in ngrams.values():
            for origin_word, target_word in list_of_ngrams:
                if origin_word not in self.memory:
                    self.memory[origin_word] = []
                self.memory[origin_word].append(target_word)

    """
    Generate a title, returning a dictionary of the form:
        {probability: ('title', article_id, [individual token probabilities])}
    where article_id is None or an integer if the generated title is overfitted and
    already exists on the webpage as a title.
    Can optionally pass a number to generate many titles. Defaults to one title.
    """

    def generate_title(self):
        output = dict()

        return output

    """
    Return a tuple consisting of a randomly chosen next word given the current word, 
    and the probability of the next word being chosen.
    """

    def next_word(self, current_word):
        next_possible_words = self.memory.get(current_word)

        if not next_possible_words:
            next_possible_words = self.memory.keys()

        next_word = random.sample(next_possible_words, 1)[0]
        next_word_probability = next_possible_words.count(next_word) / len(
            next_possible_words)

        return next_word, next_word_probability

    def main(self):
        pass
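
generate_title above is left as a stub. Below is a minimal sketch of one way the method could be completed, assuming bigrams and the '_^'/'$_' utterance markers produced by Tokeniser, and eliding the overfitting lookup against self.original_titles. This is a hypothetical completion, not the author's implementation.

    # Hypothetical sketch of Markov.generate_title; requires 'import math'.
    def generate_title(self, number=1):
        output = dict()

        for _ in range(number):
            tokens, probabilities = [], []
            current_word = '_^'  # Begin at the utterance start marker.

            # Walk the chain until the end marker, with a length guard.
            while current_word != '$_' and len(tokens) < 20:
                current_word, probability = self.next_word(current_word)
                probabilities.append(probability)
                if current_word != '$_':
                    tokens.append(current_word)

            # The lookup of article_id in self.original_titles (to flag
            # overfitted titles) is elided here; None is a placeholder.
            output[math.prod(probabilities)] = (' '.join(tokens), None,
                                                probabilities)

        return output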
Example #3
    def all_article_titles(self):
        sql_all_article_titles = " SELECT id, title FROM articles WHERE title IS NOT NULL "
        all_article_titles = Database.execute(self.connection,
                                              sql_all_article_titles,
                                              ()).fetchall()

        # Add a space before '?' so that it is split off as its own token.
        all_article_titles = [(article_id, title.replace('?', ' ?'))
                              for (article_id, title) in all_article_titles]

        return all_article_titles
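
The replace above handles only '?'. If titles contain other punctuation, a regular expression could pad every mark in one pass; pad_punctuation below is a hypothetical helper, not part of the original code.

import re

# Hypothetical helper: pad each of ? ! . , with a leading space so that
# str.split(' ') treats the mark as its own token.
def pad_punctuation(title):
    return re.sub(r'([?!.,])', r' \1', title)

print(pad_punctuation('Why scrape at all?'))  # 'Why scrape at all ?'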
Example #4
    def update_months_table(self):
        months = []

        sql_get_last_month = " SELECT * FROM months ORDER BY year DESC, month DESC LIMIT 1 "
        last_month = Database.execute(self.connection, sql_get_last_month,
                                      ()).fetchone()
        # If there are no entries in the 'months' table, default to 1970/01.
        if last_month is None:
            last_month = (0, '1970', '01')
        last_month_timestamp = Helper.to_timestamp('/'.join(last_month[1:]),
                                                   '%Y/%m')

        # Scrape the homepage's sidebar for links to month subdirectories.
        page = requests.get(self.homepage).text
        for link in BeautifulSoup(page, 'html.parser').find_all(
                'a', attrs={'href': re.compile(r"/\d{4}/\d{2}$")}):
            url = link.get('href')

            human_readable = url[-7:]
            link_timestamp = Helper.to_timestamp(human_readable, '%Y/%m')

            # Build a list of months not currently held in the database.
            if link_timestamp > last_month_timestamp:
                human_readable = human_readable.split('/')
                months.append((human_readable[0], human_readable[1]))

        months.reverse()  # List months in chronological order.

        if months:
            sql_insert_months = " INSERT INTO months(year, month) VALUES(?,?) "
            Database.execute_many(self.connection, sql_insert_months, months)
            self.connection.commit()
            print("\nAdded {} rows:\n{}\nto the table 'months'.\n".format(
                len(months), months))

        # Prepend what was previously the last entry in the 'months' table.
        if last_month[0] != 0:
            months.insert(0, last_month[1:])

        return months
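
Helper.to_timestamp is called here but never shown in these examples. Below is a minimal sketch of what it plausibly does, assuming it parses a date string with the given format and returns a Unix timestamp (UTC); the real implementation may differ.

from datetime import datetime, timezone

# A guessed stand-in for the unshown Helper class; only to_timestamp is
# sketched here.
class Helper:
    @staticmethod
    def to_timestamp(value, date_format):
        parsed = datetime.strptime(value, date_format)
        return parsed.replace(tzinfo=timezone.utc).timestamp()

print(Helper.to_timestamp('2017/05', '%Y/%m'))  # 1493596800.0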
Example #5
    def scrape_all_articles(self):
        all_months = self.months_with_articles_to_scrape()
        article_columns = self.database.table_columns('articles')
        sql_article_where_url = " SELECT * FROM articles WHERE url = ? "

        for month in all_months:
            for article in self.scrape_articles_in_month(month):
                existing_article = Database.execute(self.connection,
                                                    sql_article_where_url,
                                                    (article[0], )).fetchone()

                # Skip the article if it already exists in the database.
                if existing_article:
                    existing_article = dict(
                        zip(article_columns,
                            existing_article))  # {column: data, ...}

                    date_published = datetime.fromtimestamp(
                        int(existing_article['date_published'])).strftime(
                            '%Y/%m/%d')
                    print(
                        "|  An entry for {} ({}) already exists in the database."
                        .format(existing_article['title'], date_published))
                else:
                    # Map the scraped tuple (url, date_published, title,
                    # paragraphs) onto the table's column names.
                    article = dict(zip(article_columns, article))

                    sql_insert_article = " INSERT INTO articles(title, url, paragraphs, date_published) VALUES(?,?,?,?) "
                    Database.execute(
                        self.connection, sql_insert_article,
                        (article['title'], article['url'],
                         str(article['paragraphs']),
                         article['date_published']))
                    self.connection.commit()

                    date_published = datetime.fromtimestamp(
                        article['date_published']).strftime('%Y/%m/%d')
                    print("+  Added article {} ({}) to the database.".format(
                        article['title'], date_published))
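
The select-then-insert round trip above is one way to avoid duplicates. If the 'articles' table declared url as UNIQUE, SQLite's INSERT OR IGNORE could do the same in a single statement. The snippet below is a design alternative under that assumed constraint, not what the code does.

import sqlite3

# Assumes a UNIQUE constraint on url; duplicate inserts are silently skipped.
connection = sqlite3.connect(':memory:')
connection.execute(" CREATE TABLE articles (title TEXT, url TEXT UNIQUE, "
                   "paragraphs TEXT, date_published INTEGER) ")
sql = " INSERT OR IGNORE INTO articles(title, url, paragraphs, date_published) VALUES(?,?,?,?) "
connection.execute(sql, ('A title', 'https://example.com/a', '[]', 0))
connection.execute(sql, ('A title', 'https://example.com/a', '[]', 0))  # Skipped.
print(connection.execute(" SELECT COUNT(*) FROM articles ").fetchone()[0])  # 1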
Example #6
    def __init__(self, homepage, database):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.homepage = homepage
Example #7
import re
import time
from datetime import datetime
from random import randint

import requests
from bs4 import BeautifulSoup

# Database and Helper are this project's own modules (not shown in full here).

class Scraper:
    def __init__(self, homepage, database):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.homepage = homepage

    """
    Scrape and create a database record for each article.
    """

    def scrape_all_articles(self):
        all_months = self.months_with_articles_to_scrape()
        article_columns = self.database.table_columns('articles')
        sql_article_where_url = " SELECT * FROM articles WHERE url = ? "

        for month in all_months:
            for article in self.scrape_articles_in_month(month):
                existing_article = Database.execute(self.connection,
                                                    sql_article_where_url,
                                                    (article[0], )).fetchone()

                # Skip the article if it already exists in the database.
                if existing_article:
                    existing_article = dict(
                        zip(article_columns,
                            existing_article))  # {column: data, ...}

                    date_published = datetime.fromtimestamp(
                        int(existing_article['date_published'])).strftime(
                            '%Y/%m/%d')
                    print(
                        "|  An entry for {} ({}) already exists in the database."
                        .format(existing_article['title'], date_published))
                else:
                    # Map the scraped tuple (url, date_published, title,
                    # paragraphs) onto the table's column names.
                    article = dict(zip(article_columns, article))

                    sql_insert_article = " INSERT INTO articles(title, url, paragraphs, date_published) VALUES(?,?,?,?) "
                    Database.execute(
                        self.connection, sql_insert_article,
                        (article['title'], article['url'],
                         str(article['paragraphs']),
                         article['date_published']))
                    self.connection.commit()

                    date_published = datetime.fromtimestamp(
                        article['date_published']).strftime('%Y/%m/%d')
                    print("+  Added article {} ({}) to the database.".format(
                        article['title'], date_published))

    """
    Return a list of months ['YYYY/MM', ...] for which articles have not yet been scraped.
    """

    def months_with_articles_to_scrape(self):
        months = self.update_months_table()

        if months is None or len(months) is 0:
            print(
                "The list of months to scrape is empty. scrape_all_months() returned None."
            )
            return

        return ['/'.join(month) for month in months]

    """
    Populate the 'months' table with 'year, month' records.
    Return a list including the last month in the database before
    the function was called and all new months that were inserted.
    """

    def update_months_table(self):
        months = []

        sql_get_last_month = " SELECT * FROM months ORDER BY year DESC, month DESC LIMIT 1 "
        last_month = Database.execute(self.connection, sql_get_last_month,
                                      ()).fetchone()
        # If there are no entries in the 'months' table, default to 1970/01.
        if last_month is None:
            last_month = (0, '1970', '01')
        last_month_timestamp = Helper.to_timestamp('/'.join(last_month[1:]),
                                                   '%Y/%m')

        # Scrape the homepage's sidebar for links to month subdirectories.
        page = requests.get(self.homepage).text
        for link in BeautifulSoup(page, 'html.parser').find_all(
                'a', attrs={'href': re.compile(r"/\d{4}/\d{2}$")}):
            url = link.get('href')

            human_readable = url[-7:]
            link_timestamp = Helper.to_timestamp(human_readable, '%Y/%m')

            # Build a list of months not currently held in the database.
            if link_timestamp > last_month_timestamp:
                human_readable = human_readable.split('/')
                months.append((human_readable[0], human_readable[1]))

        months.reverse()  # List months in chronological order.

        if months:
            sql_insert_months = " INSERT INTO months(year, month) VALUES(?,?) "
            Database.execute_many(self.connection, sql_insert_months, months)
            self.connection.commit()
            print("\nAdded {} rows:\n{}\nto the table 'months'.\n".format(
                len(months), months))

        # Prepend what was previously the last entry in the 'months' table.
        if last_month[0] != 0:
            months.insert(0, last_month[1:])

        return months

    """
    Return a list of tuples of all articles (and their metadata) for a given month.
    """

    def scrape_articles_in_month(self, month):
        urls, articles = [], []
        page = requests.get(self.homepage + month).text

        # Collect the article links, then reverse them so that they are in
        # chronological order.
        for link in BeautifulSoup(page, "html.parser").find_all(
                'a', attrs={'class': 'archive'}):
            urls.append(link.get('href'))
        urls.reverse()

        for url in urls:
            articles.append(self.scrape_article(url))

        return articles

    """
    Return a payload containing article metadata and a list of the article's paragraphs.
    """

    @staticmethod
    def scrape_article(url):
        time.sleep(randint(120, 260) / 100)  # Wait between requests.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        page = requests.get(url, headers=headers).text
        markup = BeautifulSoup(page, "html.parser")

        entry_date = markup.find('time', attrs={'class':
                                                'entry-date'})['datetime']
        timestamp = Helper.to_timestamp(entry_date, '%Y-%m-%dT%H:%M:%S+00:00')
        title = markup.find('h1', attrs={
            'class': 'entry-title'
        }).find('a').text
        entry_content = markup.find('div', attrs={
            'class': 'entry-content'
        }).findAll('p')

        # Remove non-breaking spaces and newlines, then drop empty paragraphs.
        paragraphs = [
            re.sub(u'[\xa0\n]', u'', tag.text) for tag in entry_content
        ]
        paragraphs = [
            paragraph for paragraph in paragraphs if paragraph != ''
        ]

        payload = (url, timestamp, title, paragraphs)

        return payload

    def main(self):
        self.scrape_all_articles()
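
Putting the Scraper to work might look like the following; the homepage URL and database filename are placeholders, not values from the original project.

# Hypothetical usage; both arguments are placeholders.
scraper = Scraper('https://example.com/', 'articles.db')
scraper.main()  # Scrape every month that has not yet been scraped.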
Example #8
class Tokeniser:
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.ngram_size = ngram_size

    """
    Return a dictionary of the form {article_id: [list of title ngrams]}
    """

    def title_ngrams(self, tokenised_titles=None):
        if tokenised_titles is None:
            tokenised_titles = self.tokenise_titles()

        title_ngrams = dict()

        # Discard titles too short for the ngram size (the subtraction of 2
        # accounts for the start/end markers) and report how many were used.
        for (id_number, title) in tokenised_titles.items():
            if len(title) - 2 >= self.ngram_size:
                title_ngrams[id_number] = self.ngram(title, self.ngram_size)

        message = "\n{} titles out of a total of {} were converted into ngrams."
        print(message.format(len(title_ngrams), len(tokenised_titles)))
        return title_ngrams

    """
    Return a list of ngrams of size 'self.ngram_size' for the given list of tokens.
    """

    @staticmethod
    def ngram(tokens, ngram_size):
        number_of_tokens = len(tokens)
        ngrams = []

        # Halt program execution for invalid values of n.
        if ngram_size == 0 or ngram_size > (
                number_of_tokens - 2
        ):  # Subtract 2 to account for the insertion of utterance start/end tokens.
            raise ValueError(
                "The value of ngram_size cannot be 0 or larger than the number of tokens in the shortest utterance."
            )

        for token_number in range(number_of_tokens):
            first_ngram = last_ngram = list()

            if token_number == 0:
                # Handle first ngram.
                for token in range(0, ngram_size):
                    first_ngram.append(tokens[token])
                ngrams.append(tuple(first_ngram))

            elif token_number == number_of_tokens - 1:
                # Handle last ngram.
                for token in range(number_of_tokens - ngram_size,
                                   number_of_tokens):
                    last_ngram.append(tokens[token])
                ngrams.append(tuple(last_ngram))

            elif token_number < number_of_tokens - ngram_size:
                # Handle all other ngrams in between.
                new_ngram = list()
                for token in range(token_number, token_number + ngram_size):
                    new_ngram.append(tokens[token])
                ngrams.append(tuple(new_ngram))

        return ngrams

    """
    Return a dictionary of the form {article_id: ['_^', 'word1', ..., 'word_n','$_'], ...}
    """

    def tokenise_titles(self, article_titles=None):
        if article_titles is None:
            article_titles = self.all_article_titles()

        tokenised_titles = dict((id_number, title.split(' '))
                                for (id_number, title) in article_titles)

        for title in tokenised_titles.values():
            title.insert(0, '_^')  # Utterance start marker.
            title.append('$_')  # Utterance end marker.

        return tokenised_titles

    """
    Return a list of all the titles in the 'articles' table of the form [(id, 'title'), ...].
    """

    def all_article_titles(self):
        sql_all_article_titles = " SELECT id, title FROM articles WHERE title IS NOT NULL "
        all_article_titles = Database.execute(self.connection,
                                              sql_all_article_titles,
                                              ()).fetchall()

        # Add a space before '?' so that it is split off as its own token.
        all_article_titles = [(article_id, title.replace('?', ' ?'))
                              for (article_id, title) in all_article_titles]

        return all_article_titles

    def main(self):
        self.all_article_titles()
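
To make the sliding window concrete, here is what Tokeniser.ngram returns for a short tokenised title with an ngram_size of 2:

tokens = ['_^', 'hello', 'world', '?', '$_']
print(Tokeniser.ngram(tokens, 2))
# [('_^', 'hello'), ('hello', 'world'), ('world', '?'), ('?', '$_')]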
Example #9
    def __init__(self, database, ngram_size):
        self.database = Database(database)
        self.connection = self.database.connect()

        self.ngram_size = ngram_size
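
Database is used throughout these examples but never shown. The sketch below is a guessed, sqlite3-backed stand-in that satisfies only the calls the examples make (connect, execute, execute_many, table_columns); the real class may differ.

import sqlite3

# A guessed stand-in for the unshown Database class.
class Database:
    def __init__(self, database):
        self.database = database  # Path to the SQLite database file.
        self.connection = None

    def connect(self):
        self.connection = sqlite3.connect(self.database)
        return self.connection

    def table_columns(self, table):
        # Run a zero-row SELECT purely to read the column names.
        cursor = self.connection.execute(
            " SELECT * FROM {} LIMIT 0 ".format(table))
        return [description[0] for description in cursor.description]

    @staticmethod
    def execute(connection, sql, parameters):
        return connection.execute(sql, parameters)

    @staticmethod
    def execute_many(connection, sql, parameters):
        return connection.executemany(sql, parameters)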