# Imports assumed by the plotting snippets below. `results`, `counts`,
# `publication_name` and `auc` are project-local helpers not shown here; the
# scraper snippets further down additionally rely on scrapy and selenium.
import datetime as dt
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def plot_number_of_articles_a_month(json_file_name: str, from_year: int = 0):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles_in_period = counts(json_path, from_year)

    xs = []
    ys = []
    for year in sorted(raw_data):
        if int(year) < from_year:
            continue
        for month in sorted(raw_data[year]):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            monthly_articles_count = sum(raw_data[year][month].values())
            ys.append(monthly_articles_count)

    # Make the plot
    pub_name = publication_name(json_file_name)
    colour = sns.color_palette("Set1")[3]

    plt.stackplot(xs, ys, labels=['articles'], colors=[colour])
    plt.margins(0, 0)
    plt.title(f'Monthly {pub_name} articles distribution (total articles={number_of_articles_in_period})')
    save_image(plt, Path(json_file_name).stem, 'articles')
    plt.show()
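
# Usage sketch (hypothetical file name): assumes a matching JSON results file,
# e.g. one produced by the times_live scraper below.
#
#   plot_number_of_articles_a_month('times_live.json', from_year=2015)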


def plot_stacked_area(json_file_name: str, from_year: int):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles = counts(json_path, from_year)

    xs = []
    anc, da, eff = [], [], []
    for year in sorted(raw_data):
        for month in sorted(raw_data[year]):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            anc.append(raw_data[year][month]['anc'])
            da.append(raw_data[year][month]['da'])
            eff.append(raw_data[year][month]['eff'])

    data = pd.DataFrame({'anc': anc, 'da': da, 'eff': eff}, index=xs)

    # Convert the raw monthly counts to fractions of each month's total
    data_perc = data.divide(data.sum(axis=1), axis=0)

    # Make the plot
    pub_name = publication_name(json_file_name)
    pal = sns.color_palette("Set1")[0:3]
    axes = plt.gca()
    axes.set_ylim([0, 1])

    plt.stackplot(xs, data_perc["eff"], data_perc["da"], data_perc["anc"], labels=['EFF', 'DA', 'ANC'], colors=pal)
    plt.legend(loc='upper left')
    plt.margins(0, 0)
    plt.title(f'{pub_name} Political Focus (num articles={number_of_articles})')
    save_image(plt, Path(json_file_name).stem, 'stacked_area')
    plt.show()
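
# A minimal, self-contained sketch (illustrative data only) of the divide()
# trick used above: dividing each row by its row sum turns raw counts into
# row-wise fractions, which is what the 100%-stacked plot needs.
def _demo_row_normalisation():
    demo = pd.DataFrame({'anc': [6, 2], 'da': [3, 2], 'eff': [1, 4]})
    demo_perc = demo.divide(demo.sum(axis=1), axis=0)
    print(demo_perc)  # row 0 -> anc 0.6, da 0.3, eff 0.1; each row sums to 1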


def plot_separate_area(json_file_name: str, from_year: int):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles_in_period = counts(json_path, from_year)

    xs = []
    anc, da, eff = [], [], []
    for year in sorted(raw_data):
        for month in sorted(raw_data[year]):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            anc.append(raw_data[year][month]['anc'])
            da.append(raw_data[year][month]['da'])
            eff.append(raw_data[year][month]['eff'])

    data = pd.DataFrame({'anc': anc, 'da': da, 'eff': eff}, index=xs)

    # National Assembly seats (ANC, DA, EFF) won in the 2014 and 2019 elections.
    # Each result is duplicated so the vote-share line plots as a step function
    # that changes on election day (8 May 2019).
    election_2014 = [249, 89, 25]
    election_2019 = [230, 84, 44]
    elections = np.array([election_2014, election_2014, election_2019, election_2019])
    seats_xs = [xs[0], dt.datetime.fromisoformat('2019-05-07'), dt.datetime.fromisoformat('2019-05-08'), xs[-1]]
    seats = pd.DataFrame({'anc': elections[:, 0], 'da': elections[:, 1], 'eff': elections[:, 2]}, index=seats_xs)
    seats_perc = seats.divide(seats.sum(axis=1), axis=0)

    # Convert the raw monthly counts to fractions of each month's total
    data_perc = data.divide(data.sum(axis=1), axis=0)

    # Make the plot
    pub_name = publication_name(json_file_name)
    pal = sns.color_palette("Set1")
    pal = pal[0:3]
    for party_perc, party_seats, colour, name in zip(
            [data_perc["eff"], data_perc["da"], data_perc["anc"]],
            [seats_perc["eff"], seats_perc["da"], seats_perc["anc"]],
            pal, ['EFF', 'DA', 'ANC']):
        ratio, _, _ = auc(party_perc, party_seats)
        ratio = int(round(ratio * 100))

        plt.plot(seats_xs, party_seats, label='Vote share', color='k')
        axes = plt.gca()
        axes.set_ylim([0, 1])

        # format the ticks
        years = mdates.YearLocator()  # every year
        months = mdates.MonthLocator()  # every month
        years_fmt = mdates.DateFormatter('%Y')
        axes.xaxis.set_major_locator(years)
        axes.xaxis.set_major_formatter(years_fmt)
        axes.xaxis.set_minor_locator(months)

        plt.stackplot(xs, party_perc, labels=[f'% {name} representation'], colors=[colour])
        plt.legend(loc='upper left', title=f'{ratio}% Focus')
        plt.margins(0, 0)
        plt.title(f'{pub_name} Political Focus on {name} (num articles={number_of_articles_in_period})')
        save_image(plt, f'{Path(json_file_name).stem}_{name.lower()}', 'results')
        plt.show()


    @classmethod
    def run(cls):
        """Runs the scraper. This can be considered the main() method."""

        from scrapy.crawler import CrawlerProcess

        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'FEED_FORMAT': 'json',
            'FEED_URI': results.absolute_path(f'{cls.name}.json'),
        })

        process.crawl(cls)
        process.start()

    def __init__(self, name: str, politics_page_url: str, domain_url: str,
                 politics_url_regex: str, **kwargs):
        """
        :param name: The file name stem used for this scraper's output files, e.g. 'times_live'
        :param politics_page_url: e.g. 'https://www.timeslive.co.za/politics/'
        :param domain_url: e.g. 'https://www.timeslive.co.za'
        :param politics_url_regex: e.g. r'https://www\.timeslive\.co\.za/politics/(.*)/'
        :param kwargs: Passed through to the scrapy.Spider base class.
        """
        self.politics_page_url = politics_page_url
        self.domain_url = domain_url
        self.politics_url_regex = politics_url_regex
        # Resume from previously collected article URLs if a .urls file exists;
        # otherwise start crawling from the politics landing page.
        urls_file_path = results.absolute_path(f'{name}.urls')
        if Path(urls_file_path).is_file():
            with open(urls_file_path, 'r') as f:
                self.start_urls = f.read().splitlines()
        else:
            self.start_urls = [politics_page_url]
        super().__init__(**kwargs)
        # Write crawl stats to disk when the spider finishes (see on_spider_closed).
        dispatcher.connect(self.on_spider_closed, signals.spider_closed)
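

# Usage sketch (hypothetical subclass name; assumes the methods above belong to
# a scrapy.Spider base class, here called PoliticsSpider for illustration):
#
#   class TimesLiveSpider(PoliticsSpider):
#       name = 'times_live'
#
#       def __init__(self, **kwargs):
#           super().__init__('times_live',
#                            'https://www.timeslive.co.za/politics/',
#                            'https://www.timeslive.co.za',
#                            r'https://www\.timeslive\.co\.za/politics/(.*)/',
#                            **kwargs)
#
#   TimesLiveSpider.run()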
# The IOL fragment below begins mid-loop in the source; the initialisation and
# the action that loads the next batch of articles are elided, so this opening
# is a sketch.
page_num = 0
outerHTML = ''
try:
    while True:
        # (elided in source: trigger the next page load here)
        page_num += 1
        print(f'getting page number {page_num}')
        time.sleep(1)
        outerHTML_new = driver.execute_script(
            "return document.documentElement.outerHTML")
        if outerHTML == outerHTML_new:
            break
        outerHTML = outerHTML_new
except Exception as e:
    print('Aborting page load:', e)
print('Done loading page')

elems = driver.find_elements_by_xpath("//a[@href]")
# Get politics links
links = []
for elem in elems:
    href = elem.get_attribute("href")
    # An example news page is 'https://www.iol.co.za/news/politics/jacob-zuma-will-finally-have-his-day-in-court-38238221'
    if href.startswith('https://www.iol.co.za/news/politics/'):  # and href.split('-')[-1].isnumeric():
        links.append(href)
print(f'Found {len(links)} links')
# Write unique links to file
with open(results.absolute_path('iol.urls'), 'w') as f:
    links = list(set(links))
    for elem in links:
        f.write(f'{elem}\n')
    print(f'Found {len(links)} unique links')

html = driver.page_source.encode('utf-8')


def get_links_on_mobile_website(driver):
    # The source fragment begins mid-function; the navigation to the mobile
    # site and the loop initialisation are elided, so this opening is a sketch.
    outer_html = ''
    try:
        while True:
            # (elided in source: trigger the next page load here)
            outer_html_new = driver.execute_script("return document.documentElement.outerHTML")
            if outer_html == outer_html_new:
                break
            outer_html = outer_html_new
    except Exception as e:
        print('Aborting page load:', e)
    new_links = get_links_from_driver(driver)
    print(f'Found {len(new_links)} links on mobile')
    return new_links


if __name__ == '__main__':
    options = Options()
    options.headless = True  # Enable to run over SSH
    driver = webdriver.Firefox(options=options)

    desktop_links = get_links_on_deskop_website(driver)
    mobile_links = get_links_on_mobile_website(driver)
    all_links = desktop_links + mobile_links

    driver.close()

    print(f'Found {len(all_links)} links in total')

    # Write unique links to file
    with open(results.absolute_path('news24.urls'), 'w') as f:
        unique_links = list(set(all_links))
        for elem in unique_links:
            f.write(f'{elem}\n')
        print(f'Found {len(unique_links)} unique links')


def save_image(plt, result_name: str, prepend: str):
    """Save the current figure as '{prepend}_{result_name}.png' in the results directory."""
    name = results.absolute_path(f'{prepend}_{result_name}.png')
    plt.savefig(name, bbox_inches='tight', dpi=200)
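
# Usage sketch, mirroring the call in plot_number_of_articles_a_month above:
#
#   save_image(plt, 'times_live', 'articles')  # -> 'articles_times_live.png' in the results directory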
    def on_spider_closed(self, spider: scrapy.Spider):
        """Dump the crawl stats to a .stats file when the spider closes."""
        stats = self.crawler.stats.get_stats()
        with open(results.absolute_path(f'{self.name}.stats'), 'w') as f:
            pprint(stats, stream=f)
# The times_live fragment below begins at the loop; page_num and outerHTML are
# assumed to be initialised just before it.
page_num = 0
outerHTML = ''
while True:
    driver.find_element_by_css_selector(
        '.load-more').find_element_by_css_selector('.featured').click()
    page_num += 1
    print(f'getting page number {page_num}')
    time.sleep(2)  # There seems to be some form of rate limiting. It was '1'.
    outerHTML_new = driver.execute_script(
        "return document.documentElement.outerHTML")
    if outerHTML == outerHTML_new:
        break
    outerHTML = outerHTML_new
print('Done loading page')

elems = driver.find_elements_by_xpath("//a[@href]")
# Get politics links
links = []
for elem in elems:
    href = elem.get_attribute("href")
    if href.startswith('https://www.timeslive.co.za/politics/'):
        links.append(href)
print(f'Found {len(links)} links')
# Write unique links to file
with open(results.absolute_path('times_live.urls'), 'w') as f:
    links = list(set(links))
    for elem in links:
        f.write(f'{elem}\n')
    print(f'Found {len(links)} unique links')

html = driver.page_source.encode('utf-8')
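

# The "write unique links to file" block appears three times above (iol,
# news24, times_live). A small helper could consolidate it; a sketch
# (hypothetical function, not in the source):
def write_unique_links(file_name: str, links):
    unique_links = list(set(links))
    with open(results.absolute_path(file_name), 'w') as f:
        for link in unique_links:
            f.write(f'{link}\n')
    print(f'Found {len(unique_links)} unique links')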