def get_pageviews(site_name, *args, **kwargs):
    if site_name.lower() == 'wikipedia':
        start = ''
        end = ''
        granularity = 'monthly'
        if kwargs.get('article_name') is not None:
            article_name = kwargs['article_name']
            # article_name = self.get_article_name(article_name)
        if kwargs.get('start') is not None:
            start = kwargs['start'].replace('-', '')
        if kwargs.get('end') is not None:
            end = kwargs['end'].replace('-', '')
        if kwargs.get('granularity') is not None:
            granularity = kwargs['granularity']
        p = PageviewsClient(user_agent="<*****@*****.**>")
        if start == '':
            # no dates given: let the client use its defaults
            return p.article_views('en.wikipedia', article_name, granularity=granularity)
        elif end == '':
            # only a start date given: query that single day
            return p.article_views('en.wikipedia', article_name, granularity=granularity,
                                   start=start, end=start)
        else:
            return p.article_views('en.wikipedia', article_name, granularity=granularity,
                                   start=start, end=end)
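# A minimal usage sketch for get_pageviews above (the article name and dates are
# hypothetical; assumes PageviewsClient is imported from mwviews.api):
monthly = get_pageviews('wikipedia', article_name='Cat',
                        start='2023-01-01', end='2023-03-01')
# article_views returns {datetime: {'Cat': views}}, so a simple total is:
total = sum(day['Cat'] or 0 for day in monthly.values())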
def run(self):
    viewer = PageviewsClient(
        user_agent="<*****@*****.**> Selfie, Cat, and Dog analysis"
    )
    self.logger.info('[%s] Starting Wiki thread' % self.Name)
    try:
        for ticker, article in self.Tickers.items():
            end_date = time.strftime('%Y%m%d')
            data = viewer.article_views('en.wikipedia', article, granularity='daily',
                                        start=self.Start_date, end=end_date)
            for row in data:
                if data[row][article]:
                    wikid = {
                        'date': row.strftime('%m/%d/%Y'),
                        'symbol': ticker,
                        'article': article,
                        'wiki_views': int(data[row][article]),
                    }
                    queueDoc(wikid)
            self.logger.info('[%s] Collected info on %s' % (self.Name, ticker))
    except Exception as e:
        self.logger.error('[%s] Error: %s' % (self.Name, e))
    self.logger.info('[%s] Exiting' % self.Name)
    self.Fin = True
def _get_snp500_wiki_views(conn, start, end):
    """
    Builds rows of wiki page views for the daily_views table.

    Parameters:
        start (str): YYYYMMDD
        end (str): YYYYMMDD

    Returns:
        List[tuple]: (id, date, views, now, now)
    """
    pvc = PageviewsClient()
    symbol_ids_and_titles = _get_symbol_ids_and_wiki_titles(conn)
    title_to_id = {title: id_ for id_, title in symbol_ids_and_titles}
    articles = [title for _, title in symbol_ids_and_titles]
    project = 'en.wikipedia'
    now = datetime.datetime.utcnow()

    # API call
    views_dict = pvc.article_views(project, articles, start=start, end=end)

    # transform the API response into rows (a list of tuples)
    daily_views = []
    for date in views_dict:
        for title in views_dict[date]:
            id_, views = title_to_id[title], views_dict[date][title]
            daily_views.append((id_, date, views, now, now))
    return daily_views
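# A sketch of how the returned rows might be persisted. The daily_views table name
# comes from the docstring above, but the column names and DB-API cursor style here
# are assumptions, not taken from the original code.
rows = _get_snp500_wiki_views(conn, start='20200101', end='20200131')
with conn.cursor() as cur:
    cur.executemany(
        "INSERT INTO daily_views (symbol_id, date, views, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s)",
        rows,
    )
conn.commit()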
def get_wiki_pageviews(title):
    p = PageviewsClient(user_agent="<*****@*****.**> multiple movie titles")
    today = datetime.datetime.now().strftime("%Y%m%d")
    try:
        return p.article_views('en.wikipedia', title, start='20130101', end=today)
    except Exception:
        # swallow API errors and fall back to an empty result
        return {}
def top_articles_by_views(articles, top_x):
    """
    Returns the top x of the given list of articles based on page views
    for the previous month.
    output: [(article1, views), (article2, views)]
    """
    p = PageviewsClient()

    # create date strings based on the previous month
    now = datetime.datetime.now()
    year = now.year
    previous_month = str(now.month - 1).zfill(2)
    if previous_month == "00":  # January: previous month is December of last year
        previous_month = "12"
        year -= 1
    start_date = str(year) + previous_month + "0100"
    end_date = str(year) + previous_month + "2800"

    # get views
    result = p.article_views('en.wikipedia', articles,
                             granularity='monthly', start=start_date, end=end_date)
    # clean results (six is used for backwards compatibility with Python 2)
    result = six.next(six.itervalues(result))
    sorted_articles = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_articles[:top_x]
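# Example call for top_articles_by_views (hypothetical article list; assumes
# datetime, operator, six, and PageviewsClient are already imported):
top_two = top_articles_by_views(['Selfie', 'Cat', 'Dog'], 2)
# -> e.g. [('Cat', 123456), ('Dog', 98765)]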
def getViewsPerDay(self):
    """Gets a time series dataframe: date (as index), views (column is article name).
    Uses the 'mwviews' package."""
    # note: in recent mwviews versions the first positional argument is the
    # client's user-agent string, not a language code
    p = PageviewsClient('en')
    print(self.article)
    data = p.article_views('en.wikipedia', [self.article], granularity='daily',
                           start=self.getPublishDate(),
                           end=datetime.datetime.now().strftime('%Y%m%d'))
    df = pd.DataFrame.from_dict(data, orient='index').dropna()
    return df
def wiki_api(keyword, start, end, agent='user'):
    output_list = []
    # the argument must be a string (the client's user agent); any value works
    p = PageviewsClient('what is it..?')
    output_dict = dict(
        p.article_views('en.wikipedia.org', [keyword], start=start, end=end, agent=agent))
    for key, val in output_dict.items():
        tem_dict = {
            'date': key.strftime("%Y%m%d"),
            'view_count': val[keyword.replace(" ", "_")],
        }
        output_list.append(tem_dict)
    result = json.dumps(output_list)
    return result
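# Sketched call for wiki_api (keyword and dates are hypothetical):
# wiki_api('Deep learning', start='20200101', end='20200107')
# returns a JSON string like:
# '[{"date": "20200101", "view_count": 1234}, ...]'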
def get_page_views_dict(links):
    p = PageviewsClient()
    # today = datetime.datetime.today().strftime('%Y%m%d')
    # p.article_views('{}.wikipedia'.format(lang), title, granularity='monthly', start='20160201', end=today)
    my_dico = p.article_views('{}.wikipedia'.format(lang), links)
    my_dico_by_article = {article: 0 for article in links}
    for key_date, sub_dico_value in my_dico.items():
        for article, number in sub_dico_value.items():
            if number is not None:
                my_dico_by_article[article.replace('_', ' ')] += number
    my_dico_by_article = dict(
        sorted(my_dico_by_article.items(), key=operator.itemgetter(1), reverse=True))
    # TODO: define a selection based on title approximation
    return my_dico_by_article
def get_page_views(article_names, output_path):
    """Query the Wikipedia page views API for the relevant pages.

    Arguments:
    article_names -- array of article names to query
    output_path -- output path for the csv file output
    """
    p = PageviewsClient(user_agent="[email protected] Selfie, Cat, and Dog analysis")
    values = p.article_views('en.wikipedia', article_names,
                             granularity='monthly', start='20150101', end='20200401')
    all_keys = sorted(values.keys())
    rows = []
    for name in article_names:
        for key in all_keys:
            rows.append({"article_title": name, "timestamp": key, "views": values[key][name]})
    df = pd.DataFrame(rows)
    df = df.fillna(0)
    print("Writing page view data to -- " + output_path + " -- for "
          + str(len(df.article_title.unique())) + " articles")
    df.to_csv(output_path, mode='w', index=False)
    return df
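# Hypothetical invocation of get_page_views; produces one row per (article, month)
# and writes the frame to the given CSV path:
df = get_page_views(['Selfie', 'Cat', 'Dog'], 'pageviews.csv')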
def download_pageviews(entities=None, start='20150701', end=None, access='desktop', agent='user', limit=10000):
    """
    Download pageviews from Wikipedia

    :param entities: A list of entities (Wikipedia pages) to get pageview data for
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for, if no entities are specified in the call
    :return: A DataFrame of entities x pageviews by day
    """
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')

    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    # str -> list
    if isinstance(entities, str):
        entities = [entities]

    # if entities aren't passed in, get the daily top entities for the period
    if entities is None:
        df_pvs = None
        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia', year=d.year, month=d.month,
                                                 day=d.day, limit=limit, access=access))
            except Exception:
                continue
            df = df.set_index('article').rename(columns={'views': d})[[d]]
            if df_pvs is None:
                df_pvs = df
            else:
                df_pvs = df_pvs.join(df, how='outer')
        entities = df_pvs.index.values.tolist()

    # resolve entity names to canonical Wikipedia titles
    for i in range(len(entities)):
        try:
            entities[i] = unidecode(wikipedia.page(entities[i]).title)
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            return pd.DataFrame(columns=['NONE'])

    search = p.article_views('en.wikipedia', entities, start=start, end=end, access=access, agent=agent)
    df = pd.DataFrame.from_dict(search, orient='index')
    return df
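# Two sketched ways to call download_pageviews (entity names are hypothetical):
# 1) explicit entities over a fixed window
df = download_pageviews(['Barack Obama', 'Donald Trump'], start='20160101', end='20161231')
# 2) let the function discover the top `limit` entities for the period
df_top = download_pageviews(limit=100)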
import datetime
import operator
import urllib

import six
from mwviews.api import PageviewsClient

articles = ['cat', 'dog', 'New York']  # immediately overridden below
articles = [urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')]
top_x = 2
# NOTE: recent mwviews versions expect a user-agent string as the first argument;
# older releases took a parallelism count here
p = PageviewsClient(10)

# create date strings based on the previous month
now = datetime.datetime.now()
year = now.year
previous_month = str(now.month - 1).zfill(2)
if previous_month == "00":  # January: previous month is December of last year
    previous_month = "12"
    year -= 1
start_date = str(year) + previous_month + "0100"
end_date = str(year) + previous_month + "2800"

# encode in ascii for compatibility with the page views API
articles = [article.encode("ascii", 'ignore') for article in articles]

# get views
result = p.article_views('en.wikipedia', articles, granularity='monthly',
                         start=start_date, end=end_date)
# clean results (six is used for backwards compatibility with Python 2)
result = six.next(six.itervalues(result))
sorted_articles = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_articles[:top_x])
# assumes: from neo4j import GraphDatabase; import csv;
# and a PageviewsClient instance `p` created earlier in the script
driver = GraphDatabase.driver("bolt://localhost", auth=("neo4j", "neo"))

# people = [
#     "Boris Johnson", "Theresa May", "Jacob Rees-Mogg"
# ]

with driver.session() as session:
    result = session.run("""
        MATCH (p:Person)
        RETURN p.name AS person
    """)
    people = [row["person"] for row in result]

# p.article_views("en.wikipedia", people, start="20190325", end="20190330")
views = p.article_views("en.wikipedia", people, start="20160624", end="20190330")

votes = {person: 0 for person in people}
for key in views.keys():
    for person_key in views[key].keys():
        person = person_key.replace("_", " ")
        if views[key][person_key]:
            votes[person] += views[key][person_key]

with open("data/pageviews.csv", "w") as pageviews_file:
    writer = csv.writer(pageviews_file, delimiter=",")
    writer.writerow(["person", "pageviews"])
    for person, pageviews in votes.items():
        writer.writerow([person, pageviews])
name_correction = {
    'M.J. Stewart': 'M. J. Stewart',
    'P.J. Hall': 'P. J. Hall',
    'R.J. McIntosh': 'R. J. McIntosh',
}
df_2018 = df_2018.replace(name_correction)

# The 2018 NFL draft took place from April 26 to April 28.
# Collect more data than needed at the beginning; dates will be pared down after exploratory analysis.

# build dataframe format
# wiki_views_t = pd.DataFrame.from_dict(p.article_views('en.wikipedia', df_2018.at[0, 'Player'], granularity='daily', start='20170101', end='20181231'))

# Use Jonathan Taylor as a control.
# He is a top-performing freshman from Wisconsin who is not eligible for the draft until 2019.
# Considered using a combination of other college players as a control, but most do not have a Wiki page.
wiki_views_c = pd.DataFrame.from_dict(
    p.article_views('en.wikipedia', 'Jonathan Taylor (American football)',
                    granularity='daily', start='20170101', end='20181231'))

# remove data, keeping only the column structure
wiki_views_t = wiki_views_c[0:0]

# populate the table with Wikipedia stats for each player
for i in df_2018.index:
    wiki_views_t = wiki_views_t.append(pd.DataFrame.from_dict(
        p.article_views('en.wikipedia', df_2018.at[i, 'Player'],
                        granularity='daily', start='20170101', end='20181231')))

# set column name for players
wiki_views_t['Player'] = wiki_views_t.index
wiki_views_c['Player'] = wiki_views_c.index
wiki_views_t = pd.melt(wiki_views_t, id_vars=["Player"], var_name="Date", value_name="Views")
wiki_views_c = pd.melt(wiki_views_c, id_vars=["Player"], var_name="Date", value_name="Views")
current_stat_read_line += 1
current_date = data_dic['webPublicationDate'].split('T')[0].replace('-', '')
if previous_date is None or previous_date == current_date:
    if previous_date is None:
        previous_date = current_date
    data_out.append(data_dic)
    stat_out.append(stat_dic)
else:
    articles = [dic['wiki_link'].split('/')[-1] for dic in stat_out]
    views = p.article_views('en.wikipedia', articles, granularity='daily',
                            start=previous_date, end=previous_date)
    for d_dic, s_dic in zip(data_out, stat_out):
        d_dic['views'] = views[datetime.strptime(previous_date, "%Y%m%d")][
            s_dic['wiki_link'].split('/')[-1]]
        if d_dic['views'] is None:
            d_dic['views'] = 0
        i += 1
        print('Writing file: ', i)
        semi_final_dataset.write(json.dumps(d_dic) + '\n')
    data_out = [data_dic]
    stat_out = [stat_dic]
    previous_date = current_date

if len(stat_out) > 0:
# 20150901 is the start of the tool
sd = '20150901'
ed = today.strftime('%Y%m%d')
# remove for wikipedia views per language version
l = '/'
# remove for wikipedia views per language version
l = checkempty(l, c)
try:
    # remove +l for wikipedia views per language version
    views = p.article_views(w, [t + l], access='all-access', start=sd, end=ed)
    dates = sorted(views.keys())
    graphlist(dates)
    a = int()
    # add up all subpage views
    view_total += addtotalviews(views, a)
except Exception:
    a = 0
    pass
import datetime

from mwviews.api import PageviewsClient
import pandas
import pandas_datareader as web
import ystockquote
from pprint import pprint

# note: despite the name, this is two days ago
yesterday = str((datetime.datetime.now() - datetime.timedelta(2)).strftime("%Y%m%d"))
print('Yesterday was', yesterday)

pageViews = PageviewsClient('shivansh')

# for iPhone
pv = pageViews.article_views('en.wikipedia', 'IPhone', granularity='daily',
                             start='20150701', end=yesterday)
print(pv)
print('Data points for IPhone: ', len(pv))

rawIphone = list(pv.items())
t = sorted(rawIphone)
with open('Iphone.csv', 'w') as out:
    for i in t:
        d = datetime.datetime.strptime(str(i[0]), '%Y-%m-%d %H:%M:%S')
        row = d.strftime('%Y/%m/%d') + ',' + str(i[1]['IPhone']) + '\n'
        out.write(row)
if last_dt is None:
    last_dt = start  # datetime.datetime.now() - datetime.timedelta(365 * 10, 0)
else:
    last_dt = datetime.datetime.combine(row['last_dt'], datetime.time(0))
if first_dt is not None and first_dt > start.to_datetime().date():
    last_dt = start.to_datetime()
current = last_dt
st = str(last_dt.year) + str(last_dt.month).zfill(2) + str(last_dt.day).zfill(2)
end = str(nnow.year) + str(nnow.month).zfill(2) + str(nnow.day).zfill(2)
articles = [title.encode('utf8')]
print('Getting', articles, project, 'from', st, 'to', end)
try:
    resp = p.article_views(project + '.wikipedia', articles, start=st, end=end, agent='user')
    pageid = row.pageid
    for dv in resp.keys():
        pv = resp[dv][title.encode('utf8').replace(b' ', b'_')]
        if pv is None:
            # fall back to any alternate title key in the response
            rdt = resp[dv]
            alts = list(rdt.keys())
            if articles[0] in alts:
                alts.remove(articles[0])
            i = 0
            while i < len(alts) and pv is None:
                pv = rdt[alts[i]]
                i += 1
            if pv is not None:
from mwviews.api import PageviewsClient
import pandas as pd

p = PageviewsClient(user_agent='all-agents')
x = p.article_views('en.wikipedia', ['Reliance Industries'],
                    granularity='daily', start='20150701', end='20180318')

df = pd.DataFrame()
df1 = pd.DataFrame()
Y = pd.read_csv("FinalData.csv")

# collect the per-date view counts into a list of {date: views} dicts
timeslot = []
for date in x:
    items = x[date].items()
    timeslot.append({date: value for (_, value) in items})

Date = []
PageViews = []
for entry in timeslot:
    for d, v in entry.items():
        Date.append(d.date())
        PageViews.append(v)

df = pd.DataFrame(Date, columns=['Date'])
df1 = pd.DataFrame(PageViews, columns=['WikiPageViews'])
df = df.merge(df1, left_index=True, right_index=True)