def plot_number_of_articles_a_month(json_file_name: str, from_year: int = 0):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles_in_period = counts(json_path, from_year)
    xs = []
    ys = []
    for year in sorted(raw_data.keys()):
        if int(year) < from_year:
            continue
        for month in sorted(raw_data[year].keys()):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            # Total articles in the month, summed over all parties.
            monthly_articles_count = sum(raw_data[year][month].values())
            ys.append(monthly_articles_count)

    # Make the plot
    pub_name = publication_name(json_file_name)
    colour = sns.color_palette("Set1")[3]
    plt.stackplot(xs, ys, labels=['articles'], colors=[colour])
    plt.margins(0, 0)
    plt.title(f'Monthly {pub_name} article distribution (total articles={number_of_articles_in_period})')
    save_image(plt, Path(json_file_name).stem, 'articles')
    plt.show()
def plot_stacked_area(json_file_name: str, from_year: int):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles = counts(json_path, from_year)
    xs = []
    anc, da, eff = [], [], []
    for year in sorted(raw_data.keys()):
        for month in sorted(raw_data[year].keys()):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            anc.append(raw_data[year][month]['anc'])
            da.append(raw_data[year][month]['da'])
            eff.append(raw_data[year][month]['eff'])
    data = pd.DataFrame({'anc': anc, 'da': da, 'eff': eff}, index=xs)

    # We need to transform the data from raw counts to percentages (fractions),
    # so each monthly stack sums to 1.
    data_perc = data.divide(data.sum(axis=1), axis=0)

    # Make the plot
    pal = sns.color_palette("Set1")[0:3]
    axes = plt.gca()
    axes.set_ylim([0, 1])
    plt.stackplot(xs, data_perc["eff"], data_perc["da"], data_perc["anc"],
                  labels=['EFF', 'DA', 'ANC'], colors=pal)
    plt.legend(loc='upper left')
    plt.margins(0, 0)
    plt.title(f'Times Live Political Focus (num articles={number_of_articles})')
    save_image(plt, Path(json_file_name).stem, 'stacked_area')
    plt.show()
def plot_separate_area(json_file_name: str, from_year: int):
    # Get data
    json_path = results.absolute_path(json_file_name)
    raw_data, number_of_articles_in_period = counts(json_path, from_year)
    xs = []
    anc, da, eff = [], [], []
    for year in sorted(raw_data.keys()):
        for month in sorted(raw_data[year].keys()):
            xs.append(dt.datetime.fromisoformat(f'{year}-{month}-01'))
            anc.append(raw_data[year][month]['anc'])
            da.append(raw_data[year][month]['da'])
            eff.append(raw_data[year][month]['eff'])
    data = pd.DataFrame({'anc': anc, 'da': da, 'eff': eff}, index=xs)

    # Seats data: National Assembly seats for the ANC, DA and EFF after the
    # 2014 and 2019 elections. Each result is repeated so the vote share
    # appears as a step function over the plotted period.
    election_2014 = [249, 89, 25]
    election_2019 = [230, 84, 44]
    elections = np.array([election_2014, election_2014, election_2019, election_2019])
    seats_xs = [xs[0],
                dt.datetime.fromisoformat('2019-05-07'),
                dt.datetime.fromisoformat('2019-05-08'),
                xs[-1]]
    seats = pd.DataFrame({'anc': elections[:, 0],
                          'da': elections[:, 1],
                          'eff': elections[:, 2]}, index=seats_xs)
    seats_perc = seats.divide(seats.sum(axis=1), axis=0)

    # We need to transform the data from raw counts to percentages (fractions).
    data_perc = data.divide(data.sum(axis=1), axis=0)

    # Make one plot per party.
    pub_name = publication_name(json_file_name)
    pal = sns.color_palette("Set1")[0:3]
    for party_data, party_seats, colour, name in zip(
            [data_perc["eff"], data_perc["da"], data_perc["anc"]],
            [seats_perc["eff"], seats_perc["da"], seats_perc["anc"]],
            pal, ['EFF', 'DA', 'ANC']):
        # Only the first value returned by auc() (the ratio) is used, for the legend title.
        ratio, _, _ = auc(party_data, party_seats)
        ratio = int(round(ratio * 100))

        plt.plot(seats_xs, party_seats, label='Vote share', color='k')
        axes = plt.gca()
        axes.set_ylim([0, 1])

        # Format the x-axis ticks: major ticks every year, minor ticks every month.
        years = mdates.YearLocator()
        months = mdates.MonthLocator()
        years_fmt = mdates.DateFormatter('%Y')
        axes.xaxis.set_major_locator(years)
        axes.xaxis.set_major_formatter(years_fmt)
        axes.xaxis.set_minor_locator(months)

        plt.stackplot(xs, party_data, labels=[f'% {name} representation'], colors=[colour])
        plt.legend(loc='upper left', title=f'{ratio}% Focus')
        plt.margins(0, 0)
        plt.title(f'{pub_name} Political Focus on {name} (num articles={number_of_articles_in_period})')
        save_image(plt, f'{Path(json_file_name).stem}_{name.lower()}', 'results')
        plt.show()
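# Hedged usage sketch (not from the original module): how the plotting helpers
# above might be invoked. The results file name 'times_live.json' and the
# from_year value are illustrative assumptions; any JSON produced by the
# scrapers below would work.
if __name__ == '__main__':
    plot_number_of_articles_a_month('times_live.json', from_year=2014)
    plot_stacked_area('times_live.json', from_year=2014)
    plot_separate_area('times_live.json', from_year=2014)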
def run(cls):
    """Runs the scraper. This can be considered the main() method."""
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_FORMAT': 'json',
        'FEED_URI': results.absolute_path(f'{cls.name}.json'),
    })
    process.crawl(cls)
    process.start()
def __init__(self, name: str, politics_page_url: str, domain_url: str, politics_url_regex: str, **kwargs):
    """
    :param name: The file name stem used for this scraper's output files, e.g. 'times_live'.
    :param politics_page_url: e.g. 'https://www.timeslive.co.za/politics/'
    :param domain_url: e.g. 'https://www.timeslive.co.za'
    :param politics_url_regex: e.g. r'https://www\.timeslive\.co\.za/politics/(.*)/'
    :param kwargs: Passed through to the scrapy.Spider constructor.
    """
    self.politics_page_url = politics_page_url
    self.domain_url = domain_url
    self.politics_url_regex = politics_url_regex

    # If a pre-scraped list of article URLs exists, start from those;
    # otherwise fall back to crawling from the politics landing page.
    urls_file_path = results.absolute_path(f'{name}.urls')
    if Path(urls_file_path).is_file():
        with open(urls_file_path, 'r') as f:
            self.start_urls = f.read().splitlines()
    else:
        self.start_urls = [politics_page_url]

    super().__init__(**kwargs)  # python3
    dispatcher.connect(self.on_spider_closed, signals.spider_closed)
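# Hedged sketch (assumed names): a concrete spider built on the base class
# above. 'PoliticsSpider' stands in for the actual base class name, which is
# not shown in this excerpt; the Times Live URLs and regex are taken from the
# docstring examples.
class TimesLiveSpider(PoliticsSpider):
    name = 'times_live'

    def __init__(self, **kwargs):
        super().__init__(
            name='times_live',
            politics_page_url='https://www.timeslive.co.za/politics/',
            domain_url='https://www.timeslive.co.za',
            politics_url_regex=r'https://www\.timeslive\.co\.za/politics/(.*)/',
            **kwargs)

# Assuming run() is a classmethod on the base class, TimesLiveSpider.run()
# would crawl the site and write times_live.json into the results directory.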
        page_num += 1
        print("getting page number " + str(page_num))
        time.sleep(1)
        outerHTML_new = driver.execute_script(
            "return document.documentElement.outerHTML")
        # Stop once loading more pages no longer changes the document.
        if outerHTML == outerHTML_new:
            break
        outerHTML = outerHTML_new
except Exception as e:
    print('Aborting page load:', e)

print('Done loading page')
elems = driver.find_elements_by_xpath("//a[@href]")

# Get politics links
links = []
for elem in elems:
    href = elem.get_attribute("href")
    # An example news page is
    # 'https://www.iol.co.za/news/politics/jacob-zuma-will-finally-have-his-day-in-court-38238221'
    if href.startswith('https://www.iol.co.za/news/politics/'):  # and href.split('-')[-1].isnumeric():
        links.append(href)
print(f'Found {len(links)} links')

# Write unique links to file
with open(results.absolute_path('iol.urls'), 'w') as f:
    links = list(set(links))
    for elem in links:
        f.write(f'{elem}\n')
print(f'Found {len(links)} unique links')

html = driver.page_source.encode('utf-8')
            outer_html_new = driver.execute_script("return document.documentElement.outerHTML")
            # Stop once loading more pages no longer changes the document.
            if outer_html == outer_html_new:
                break
            outer_html = outer_html_new
    except Exception as e:
        print('Aborting page load:', e)

    new_links = get_links_from_driver(driver)
    print(f'Found {len(new_links)} links on mobile')
    return new_links


if __name__ == '__main__':
    options = Options()
    options.headless = True  # Run headless so the script also works over SSH (no display needed).
    driver = webdriver.Firefox(options=options)

    desktop_links = get_links_on_deskop_website(driver)
    mobile_links = get_links_on_mobile_website(driver)
    all_links = desktop_links + mobile_links
    driver.close()
    print(f'Found {len(all_links)} links in total')

    # Write unique links to file
    with open(results.absolute_path('news24.urls'), 'w') as f:
        unique_links = list(set(all_links))
        for elem in unique_links:
            f.write(f'{elem}\n')
    print(f'Found {len(unique_links)} unique links')
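# Hedged sketch (assumed implementation): get_links_from_driver is called above
# but not defined in this excerpt. Judging from the equivalent IOL and Times
# Live scripts, it likely collects every href on the current page and keeps the
# ones that look like politics articles; the URL prefix below is an assumption.
def get_links_from_driver(driver):
    links = []
    for elem in driver.find_elements_by_xpath("//a[@href]"):
        href = elem.get_attribute("href")
        if href.startswith('https://www.news24.com/news24/politics/'):
            links.append(href)
    return links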
def save_image(plt, result_name: str, prepend: str):
    name = results.absolute_path(f'{prepend}_{result_name}.png')
    plt.savefig(name, bbox_inches='tight', dpi=200)
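# Hedged usage note: the plotting helpers above call this as, for example,
# save_image(plt, 'times_live', 'stacked_area'), which writes
# 'stacked_area_times_live.png' (200 dpi, tight bounding box) into the
# results directory.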
def on_spider_closed(self, spider: scrapy.Spider):
    stats = self.crawler.stats.get_stats()
    with open(results.absolute_path(f'{self.name}.stats'), 'w') as f:
        pprint(stats, stream=f)
while True:
    driver.find_element_by_css_selector('.load-more').find_element_by_css_selector('.featured').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(2)  # There seems to be some form of rate limiting; a 1-second sleep was too short.
    outerHTML_new = driver.execute_script(
        "return document.documentElement.outerHTML")
    # Stop once loading more pages no longer changes the document.
    if outerHTML == outerHTML_new:
        break
    outerHTML = outerHTML_new

print('Done loading page')
elems = driver.find_elements_by_xpath("//a[@href]")

# Get politics links
links = []
for elem in elems:
    href = elem.get_attribute("href")
    if href.startswith('https://www.timeslive.co.za/politics/'):
        links.append(href)
print(f'Found {len(links)} links')

# Write unique links to file
with open(results.absolute_path('times_live.urls'), 'w') as f:
    links = list(set(links))
    for elem in links:
        f.write(f'{elem}\n')
print(f'Found {len(links)} unique links')

html = driver.page_source.encode('utf-8')