Example #1
def get_pageviews(site_name, *args, **kwargs):
    if site_name.lower() == 'wikipedia':
        start = ''
        end = ''
        granularity = 'monthly'
        if kwargs.get('article_name') is not None:
            article_name = kwargs['article_name']
        # article_name = self.get_article_name(article_name)
        if kwargs.get('start') is not None:
            start = kwargs['start'].replace('-', '')

        if kwargs.get('end') is not None:
            end = kwargs['end'].replace('-', '')

        if kwargs.get('granularity') is not None:
            granularity = kwargs['granularity']

        p = PageviewsClient(user_agent="<*****@*****.**>")

        if start == '':
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity)
        elif end == '':
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity,
                                   start=start,
                                   end=start)
        else:
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity,
                                   start=start,
                                   end=end)
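
A minimal usage sketch (assuming from mwviews.api import PageviewsClient is in scope; the article name, dates, and granularity below are illustrative). article_views returns a dict keyed by datetime, mapping each article name to its view count:

# hypothetical call: monthly views for one article over early 2021
views = get_pageviews('wikipedia',
                      article_name='Python_(programming_language)',
                      start='2021-01-01',
                      end='2021-03-31',
                      granularity='monthly')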
Example #2
def run(self):
    viewer = PageviewsClient(
        user_agent="<*****@*****.**> Selfie, Cat, and Dog analysis"
    )
    self.logger.info('[%s] Starting Wiki thread' % self.Name)
    try:
        for ticker, article in self.Tickers.items():
            End_date = time.strftime('%Y%m%d')
            data = viewer.article_views('en.wikipedia',
                                        article,
                                        granularity='daily',
                                        start=self.Start_date,
                                        end=End_date)
            for row in data:
                if data[row][article]:
                    wikid = {}
                    wikid['date'] = row.strftime('%m/%d/%Y')
                    wikid['symbol'] = ticker
                    wikid['article'] = article
                    wikid['wiki_views'] = int(data[row][article])
                    queueDoc(wikid)
            self.logger.info('[%s] Collected Info on %s' %
                             (self.Name, ticker))
    except Exception as e:
        self.logger.error('[%s] Error: %s' % (self.Name, e))
    self.logger.info('[%s] Exiting' % self.Name)
    self.Fin = True
Example #3
def _get_snp500_wiki_views(conn, start, end):
    """
    Inserts wiki page views into the daily_views table

    Parameters:
        start (str) : YYYYMMDD
        end   (str) : YYYYMMDD

    Returns:
        List[tuple] : (id, date, views, now, now)
    """
    pvc = PageviewsClient()
    symbol_ids_and_titles = _get_symbol_ids_and_wiki_titles(conn)
    title_to_id = {title: id for id, title in symbol_ids_and_titles}
    articles = [title for _, title in symbol_ids_and_titles]
    project = 'en.wikipedia'
    now = datetime.datetime.utcnow()

    # API call
    views_dict = pvc.article_views(project, articles, start=start, end=end)
    # transforming API call to rows (a list of tuples)
    daily_views = []
    for date in views_dict:
        for title in views_dict[date]:
            id, views = title_to_id[title], views_dict[date][title]
            daily_views.append((id, date, views, now, now))

    return daily_views
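
A hedged usage sketch (assuming a live database connection and that the _get_symbol_ids_and_wiki_titles helper referenced above exists):

# hypothetical: build one month of rows, ready for a bulk INSERT into daily_views
rows = _get_snp500_wiki_views(conn, start='20200101', end='20200131')
# each row: (symbol_id, date, views, now, now)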
Example #4
def get_wiki_pageviews(title):
	p = PageviewsClient(user_agent="<*****@*****.**> multiple movie titles")
	today = datetime.datetime.now().strftime("%Y%m%d")
	try:
		return p.article_views('en.wikipedia', title, start='20130101', end=today)
	except Exception:
		return {}
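
Usage is a one-liner (the title is illustrative); on any API error the function falls back to an empty dict:

views_by_day = get_wiki_pageviews('Inception')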
Example #5
def top_articles_by_views(articles, top_x):
    """
    returns the top x of the given list of articles
        based on page views for the previous month
        output:
            [(article1, views), (article2, views)]
    """
    p = PageviewsClient()  # note: recent mwviews releases may warn or fail without a user_agent

    # create date string based on the previous month (rolling the year back in January)
    now = datetime.datetime.now()
    last_month = now.replace(day=1) - datetime.timedelta(days=1)
    start_date = last_month.strftime('%Y%m') + '0100'
    end_date = last_month.strftime('%Y%m') + '2800'

    # get views
    result = p.article_views('en.wikipedia',
                             articles,
                             granularity='monthly',
                             start=start_date,
                             end=end_date)
    # clean results (six is used for backwards compatibility with Python 2)
    result = six.next(six.itervalues(result))
    sorted_articles = sorted(result.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    return sorted_articles[:top_x]
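
A quick hedged example (the article names, and especially the counts, are illustrative):

top = top_articles_by_views(['Cat', 'Dog', 'New York City'], 2)
# e.g. [('New York City', 512345), ('Cat', 98765)]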
Example #6
def getViewsPerDay(self):  # pageviews own thing
    """Gets a time series dataframe: date (as index), views (column is article name)
    Will be using 'mwviews' package"""
    # the string passed here serves as the client's user_agent
    p = PageviewsClient('en')
    print(self.article)
    data = p.article_views('en.wikipedia', [self.article],
                           granularity='daily',
                           start=self.getPublishDate(),
                           end=datetime.datetime.now().strftime('%Y%m%d'))
    df = pd.DataFrame.from_dict(data, orient='index').dropna()
    return df
Example #7
def wiki_api(keyword, start, end, agent='user'):
    output_list = []

    p = PageviewsClient('what is it..?')  # the argument must be a string (used as the user agent); any value is accepted
    output_dict = dict(
        p.article_views('en.wikipedia.org', [keyword],
                        start=start,
                        end=end,
                        agent=agent))

    for key, val in output_dict.items():
        tem_dict = {}
        tem_dict['date'] = key.strftime("%Y%m%d")
        tem_dict['view_count'] = val[keyword.replace(" ", "_")]
        output_list.append(tem_dict)

    result = json.dumps(output_list)
    return result
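
A hedged usage sketch (keyword and dates are illustrative); the function returns a JSON string of {date, view_count} records:

result_json = wiki_api('Machine learning', start='20210101', end='20210131')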
Example #8
def get_page_views_dict(links):
    p = PageviewsClient()
    #today = datetime.datetime.today()
    #today = today.strftime('%Y%m%d')
    #p.article_views('{}.wikipedia'.format(lang), title, granularity='monthly', start='20160201', end=today)
    my_dico = p.article_views('{}.wikipedia'.format(lang), links)
    my_dico_by_article = {}
    for article in links:
        my_dico_by_article[article] = 0

    for key_date, sub_dico_value in my_dico.items():
        for article, number in sub_dico_value.items():
            if number is not None:
                my_dico_by_article[article.replace('_', ' ')] += number
    my_dico_by_article = dict(
        sorted(my_dico_by_article.items(),
               key=operator.itemgetter(1),
               reverse=True))
    # need to define a selection method based on title approximation
    return my_dico_by_article
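
A hedged sketch (the snippet relies on a module-level lang global, assumed here to be 'en'; article names and counts are illustrative):

lang = 'en'
views_per_article = get_page_views_dict(['Cat', 'Dog'])
# e.g. {'Dog': 123456, 'Cat': 98765} -- summed over the default date range, sorted descending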
Example #9
def get_page_views(article_names, output_path):
    """Query the Wikipedia page views api for the relevant pages

    Keyword arguments:
    article_names -- array of article names to query
    output_path -- output path for the csv file output
    """
    p = PageviewsClient(user_agent="[email protected] Selfie, Cat, and Dog analysis")
            
    values = p.article_views('en.wikipedia', article_names,
                             granularity='monthly', start='20150101', end='20200401')
    all_keys = sorted(values.keys())
    val_dict = []
    for x in article_names:
        for key in all_keys:
            val_dict.append({"article_title": x, "timestamp": key, "views": values[key][x]})
    df = pd.DataFrame(val_dict)
    df = df.fillna(0)
    
    print("Writing Page View Data to -- " + output_path + " -- for " + str(len(df.article_title.unique())) + " articles")
    
    df.to_csv(output_path, mode='w', index=False)
    
    return df
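
Hedged usage (the article names and the output path are illustrative):

df = get_page_views(['Cat', 'Dog'], 'page_views.csv')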
Example #11
def download_pageviews(entities=None, start='20150701', end=None, access='desktop', agent='user', limit=10000):

    """
    Download pageviews from Wikipedia

    :param entities: A list of entities (Wikipedia pages) to get pageview data for
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for, if no entities are specified in the call
    :return: A DataFrame of entities x pageviews by day
    """
    
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')
    
    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    # coerce a single entity string into a list
    if isinstance(entities, str):
        entities = [entities]
    
    # if entities aren't passed in, get the daily top entities for the period
    if entities is None:
        df_pvs = None
    
        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia', year=d.year, month=d.month,\
                                                 day=d.day, limit=limit, access=access))
            except Exception:
                continue

            df = df.set_index('article').rename(columns={'views': d})[[d]]

            if df_pvs is None:
                df_pvs = df
            else:
                df_pvs = df_pvs.join(df, how='outer')

        entities = df_pvs.index.values.tolist()
    
    for i in range(len(entities)):
        try:
            entities[i] = unidecode(wikipedia.page(entities[i]).title)
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError) as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            return pd.DataFrame(columns=['NONE'])
        
    search = p.article_views('en.wikipedia', entities, start=start, end=end, access=access, agent=agent)
    df = pd.DataFrame.from_dict(search, orient='index')
    
    return df
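
Hedged usage (entities and dates are illustrative; requires the wikipedia, unidecode, pandas, and mwviews packages):

df = download_pageviews(['Cat', 'Dog'], start='20150701', end='20150801')
# df: one row per day, one column per entity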
Example #12
import datetime
import operator
import urllib.parse

import six
from mwviews.api import PageviewsClient

articles = ['cat', 'dog', 'New York', ]
# the list above is immediately replaced by a single percent-encoded title
articles = [urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')]

top_x = 2

p = PageviewsClient(10)  # 10 is the first positional argument: parallelism or user_agent, depending on the mwviews version

# create date string based on the previous month (rolling the year back in January)
now = datetime.datetime.now()
last_month = now.replace(day=1) - datetime.timedelta(days=1)
start_date = last_month.strftime('%Y%m') + '0100'
end_date = last_month.strftime('%Y%m') + '2800'

# encode in ascii for compatibility with page views api 
articles = [article.encode("ascii", 'ignore') for article in articles]
# get views
result = p.article_views('en.wikipedia', articles, 
        granularity='monthly', start=start_date, end=end_date)
# clean results (six is used for backwards compatibility with Python 2)
result = six.next(six.itervalues(result))
sorted_articles = sorted(result.items(), 
        key=operator.itemgetter(1), reverse=True)
# print sorted_articles[:top_x]
print(sorted_articles[:top_x])
    

Example #13
import csv

from mwviews.api import PageviewsClient
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost", auth=("neo4j", "neo"))

# the client itself is not created in the original snippet; a placeholder user agent is assumed
p = PageviewsClient(user_agent="pageviews-analysis")

# people = [
#     "Boris Johnson", "Theresa May", "Jacob Rees-Mogg"
# ]

with driver.session() as session:
    result = session.run("""
  MATCH (p:Person)
  RETURN p.name AS person
  """)
    people = [row["person"] for row in result]

# p.article_views("en.wikipedia", people,  start="20190325", end="20190330")
views = p.article_views("en.wikipedia",
                        people,
                        start="20160624",
                        end="20190330")
votes = {person: 0 for person in people}

for key in views.keys():
    for person_key in views[key].keys():
        person = person_key.replace("_", " ")
        if views[key][person_key]:
            votes[person] += views[key][person_key]

with open("data/pageviews.csv", "w") as pageviews_file:
    writer = csv.writer(pageviews_file, delimiter=",")
    writer.writerow(["person", "pageviews"])

    for vote in votes:
        writer.writerow([vote, votes[vote]])
Example #14
articles = [
    urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')
]

top_x = 2

p = PageviewsClient(10)  # 10 is the first positional argument: parallelism or user_agent, depending on the mwviews version

# create date string based on the previous month (rolling the year back in January)
now = datetime.datetime.now()
last_month = now.replace(day=1) - datetime.timedelta(days=1)
start_date = last_month.strftime('%Y%m') + '0100'
end_date = last_month.strftime('%Y%m') + '2800'

# encode in ascii for compatibility with page views api
articles = [article.encode("ascii", 'ignore') for article in articles]
# get views
result = p.article_views('en.wikipedia',
                         articles,
                         granularity='monthly',
                         start=start_date,
                         end=end_date)
# clean results (six is used for backwards compatibility with Python 2)
result = six.next(six.itervalues(result))
sorted_articles = sorted(result.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
# print sorted_articles[:top_x]
print(sorted_articles[:top_x])
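Example #15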
name_correction = {'M.J. Stewart':'M. J. Stewart',
                   'P.J. Hall':'P. J. Hall',
                   'R.J. McIntosh':'R. J. McIntosh'
                  }
df_2018 = df_2018.replace(name_correction)

#2018 NFL draft took place from April 26 to April 28
#Collect more data than needed at beginning. Dates will be pared down after exploratory analysis

#build dataframe format
#wiki_views_t = pd.DataFrame.from_dict(p.article_views('en.wikipedia', df_2018.at[0,'Player'], granularity='daily', start='20170101', end='20181231'))
#Use Jonathan Taylor as a control. 
#He is a top performing freshman from Wisconsin who is not eligible for the draft until 2019
#Considered using a combination of other college players as control, but most do not have a Wiki page
wiki_views_c = pd.DataFrame.from_dict(
    p.article_views('en.wikipedia', 'Jonathan Taylor (American football)',
                    granularity='daily', start='20170101', end='20181231'))

#empty copy that keeps the column structure
wiki_views_t = wiki_views_c[0:0]

#populate the table with wikipedia stats for each player
#DataFrame.append was removed in pandas 2.0, so pd.concat is used instead
for i in df_2018.index:
    player_views = pd.DataFrame.from_dict(
        p.article_views('en.wikipedia', df_2018.at[i, 'Player'],
                        granularity='daily', start='20170101', end='20181231'))
    wiki_views_t = pd.concat([wiki_views_t, player_views])

#set column name for players
wiki_views_t['Player'] = wiki_views_t.index
wiki_views_c['Player'] = wiki_views_c.index
  
wiki_views_t = pd.melt(wiki_views_t, id_vars=["Player"], var_name="Date", value_name="Views")
wiki_views_c = pd.melt(wiki_views_c, id_vars=["Player"], var_name="Date", value_name="Views")
Example #16
                current_stat_read_line += 1
                current_date = data_dic['webPublicationDate'].split(
                    'T')[0].replace('-', '')

                if previous_date is None or previous_date == current_date:
                    if previous_date is None:
                        previous_date = current_date
                    data_out.append(data_dic)
                    stat_out.append(stat_dic)
                else:
                    articles = [
                        dic['wiki_link'].split('/')[-1] for dic in stat_out
                    ]
                    views = p.article_views('en.wikipedia',
                                            articles,
                                            granularity='daily',
                                            start=previous_date,
                                            end=previous_date)
                    for d_dic, s_dic in zip(data_out, stat_out):
                        d_dic['views'] = views[datetime.strptime(
                            previous_date,
                            "%Y%m%d")][s_dic['wiki_link'].split('/')[-1]]
                        if d_dic['views'] is None:
                            d_dic['views'] = 0
                        i += 1
                        print('Writing file: ', i)
                        semi_final_dataset.write(json.dumps(d_dic) + '\n')
                    data_out = [data_dic]
                    stat_out = [stat_dic]
                    previous_date = current_date
            if len(stat_out) > 0:
Example #17
	# 20150901 is start of tool
	sd = '20150901'

	ed = today.strftime('%Y%m%d') 

	# remove for wikipedia views per language version
	l = '/'

	# remove for wikipedia views per language version
	l = checkempty(l, c)

	try:

		# remove +l for wikipedia views per language version 
		views = p.article_views(w, [t + l], access='all-access', start=sd , end=ed)

		dates = sorted(views.keys())
		
		graphlist(dates)

		a = 0

		# add up all subpage views
		view_total += addtotalviews(views, a)

	except Exception:

		a = 0
Example #18
import datetime

from mwviews.api import PageviewsClient
import pandas
import pandas_datareader as web
import ystockquote
from pprint import pprint

# note: timedelta(2) goes back two days, despite the variable name
yesterday = str(
    (datetime.datetime.now() - datetime.timedelta(2)).strftime("%Y%m%d"))

print('Yesterday was', yesterday)
pageViews = PageviewsClient('shivansh')

#FOR Iphone
pv = pageViews.article_views('en.wikipedia',
                             'IPhone',
                             granularity='daily',
                             start='20150701',
                             end=yesterday)
print(pv)
print('Data points for IPhone: ', len(pv))

rawIphone = list(pv.items())

t = sorted(rawIphone)

with open('Iphone.csv', 'w') as out:
    for i in t:
        d = datetime.datetime.strptime(str(i[0]), '%Y-%m-%d %H:%M:%S')
        row = d.strftime('%Y/%m/%d') + ',' + str(i[1]['IPhone']) + '\n'
        out.write(row)
Example #19
 if last_dt is None:
     last_dt = start  #datetime.datetime.now() - datetime.timedelta(365*10,0)
 else:
     last_dt = datetime.datetime.combine(row['last_dt'], datetime.time(0))
 if first_dt is not None and first_dt > start.to_datetime().date():
     last_dt = start.to_datetime()
 current = last_dt
 st = last_dt.strftime('%Y%m%d')
 end = nnow.strftime('%Y%m%d')
 articles = [title.encode('utf8')]
 print('Getting', articles, project, 'from', st, 'to', end)
 try:
     resp = p.article_views(project + '.wikipedia',
                            articles,
                            start=st,
                            end=end,
                            agent='user')
     pageid = row.pageid
     for dv in resp.keys():
         pv = resp[dv][title.encode('utf8').replace(b' ', b'_')]
         if pv is None:
             rdt = resp[dv]
             alts = list(rdt.keys())
             if articles[0] in alts:
                 alts.remove(articles[0])
             i = 0
             while i < len(alts) and pv is None:
                 pv = rdt[alts[i]]
                 i += 1
         if pv is not None:
Example #20
from mwviews.api import PageviewsClient
import pandas as pd

p = PageviewsClient(user_agent='all-agents')

x = (p.article_views('en.wikipedia', ['Reliance Industries'],
                     granularity='daily',
                     start='20150701',
                     end='20180318'))

df = pd.DataFrame()
df1 = pd.DataFrame()
Y = pd.read_csv("FinalData.csv")

timeslot = []
for date in x:
    items = x[date].items()
    # a single article was queried, so each per-date dict collapses to {date: views}
    timeslot.append({date: views for (_article, views) in items})
Date = []
PageViews = []
for i in timeslot:
    for x, y in i.items():
        Date.append(x.date())
        PageViews.append(y)
#print(Date)
#print(PageViews)

df = pd.DataFrame(Date, columns=['Date'])
df1 = pd.DataFrame(PageViews, columns=['WikiPageViews'])

df = df.merge(df1, left_index=True, right_index=True)