def wiki_scraper(workbook, sheet1, object_list):
    """Fetch page views for each element, falling back through its alternative titles."""
    iteration = 0
    for element in object_list:
        titel2 = element.titel2
        titel3 = element.titel3
        print(titel2, titel3)
        # per_article expects (project, page, start, end); here element.lastdate
        # is passed as the start of the range and element.firstdate as its end.
        try:
            views = pageviewapi.per_article('en.wikipedia', element.titel1,
                                            element.lastdate, element.firstdate,
                                            access='all-access', agent='all-agents',
                                            granularity='daily')
            views = select_views(views)
        except Exception:
            try:
                print('Falling back to second title:', titel2)
                views = pageviewapi.per_article('en.wikipedia', element.titel2,
                                                element.lastdate, element.firstdate,
                                                access='all-access', agent='all-agents',
                                                granularity='daily')
                views = select_views(views)
            except Exception:
                try:
                    print('Falling back to third title:', titel3)
                    views = pageviewapi.per_article('en.wikipedia', element.titel3,
                                                    element.lastdate, element.firstdate,
                                                    access='all-access', agent='all-agents',
                                                    granularity='daily')
                    views = select_views(views)
                except Exception:
                    views = 'N/A'
        write_to_file(sheet1, views, iteration, element)
        iteration += 1
    workbook.save('Wiki_workbook.xls')
    print('hola!')
def get_daily_wiki_data(TICKER, start='2017-01-01',
                        end=date.today().strftime("%Y-%m-%d")):
    """Take a ticker (the firm's article name) and return daily Wikipedia
    page views as a pandas DataFrame."""
    # Note: the default 'end' is evaluated once at import time, and it is
    # unclear whether the API range includes the final day.
    Name = TICKER
    out_df = pd.DataFrame()
    # Wikipedia data: the API wants compact YYYYMMDD dates.
    wikistart = start.replace('-', '')
    wikiend = end.replace('-', '')
    try:
        wikidata = wik.per_article('en.wikipedia', Name, wikistart, wikiend,
                                   access='all-access', agent='all-agents',
                                   granularity='daily')
        df_wiki = pd.DataFrame(wikidata['items'])
        df_wiki['Date'] = pd.to_datetime(df_wiki['timestamp'], format='%Y%m%d%H')
        df_wiki.set_index('Date', inplace=True)
        out_df['wikiviews'] = df_wiki['views']
    except Exception:
        print('request failed')
    # Drop NaN rows: Williams %R is computed with a 14-day rolling window,
    # so at least the first 14 days are NaN.
    return out_df.dropna()
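# A minimal usage sketch for get_daily_wiki_data above, assuming pandas and
# pageviewapi (imported as 'wik') are available in the module; 'Apple_Inc.'
# is an arbitrary example article, not necessarily a ticker symbol.
df = get_daily_wiki_data('Apple_Inc.', start='2020-01-01', end='2020-03-01')
print(df.head())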
def fillMissingValues(dataframe):
    pageTitle, accessDetails = extractTitleAndAccessDetails(dataframe)
    dataIndex = pd.concat([pageTitle, accessDetails], join='outer', axis=1)
    beforeFilling = dataframe.isnull().sum()
    dates = list(dataframe.columns)[1:]  # skip the page-title column
    for date in dates:
        nullValues = dataframe[date].isnull().to_numpy().nonzero()
        for index in nullValues[0]:
            try:
                dataframe.at[dataIndex['Page Title'][index], date] = pageviewapi.per_article(
                    dataIndex['Domain'][index],
                    dataIndex['Page Title'][index],
                    date.replace('-', ''),
                    date.replace('-', ''),
                    dataIndex['Access'][index],
                    dataIndex['Viewer Type'][index],
                    granularity='daily')['items'][0]['views']
                print("Filled", dataIndex['Page Title'][index], date)
            except pageviewapi.client.ZeroOrDataNotLoadedException:
                print("No data available.")
                continue
    afterFilling = dataframe.isnull().sum()
    print('Before filling the missing values: ', beforeFilling)
    print('After filling the missing values: ', afterFilling)
    return dataframe
def get_number_of_pageviews(article_name):
    # The Wikimedia Pageviews API only has data from July 2015 onward, so the
    # early start date simply requests the full available range.
    pageview_data = pageviewapi.per_article(
        'fr.wikipedia', article_name,
        start='20000101', end='20201020', granularity='monthly')
    pageview_count = 0
    for item in pageview_data['items']:
        pageview_count += item['views']
    return pageview_count
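# A minimal usage sketch for get_number_of_pageviews above, assuming the
# pageviewapi package is installed and the function is in scope; the article
# name is an arbitrary example.
if __name__ == '__main__':
    total = get_number_of_pageviews('Paris')  # hypothetical example article
    print('Total fr.wikipedia views for Paris:', total)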
def get_pageviews(start_date, end_date, page):
    views = 0
    try:
        out = pageviewapi.per_article(page.site.code + '.wikipedia', page.title(),
                                      start_date, end_date,
                                      access='all-access', agent='all-agents',
                                      granularity='daily')
        for item in out["items"]:
            views += item["views"]
        return views
    except pageviewapi.client.ZeroOrDataNotLoadedException:
        return 0
def get_page_views(start_date, end_date, page_name, granularity):
    """Get number of views of a page for a specific food using PageView API from Wikipedia."""
    page_views = pv.per_article(page=page_name, project='en.wikipedia',
                                start=start_date, end=end_date,
                                granularity=granularity).get('items')
    views = sum(item.get('views') for item in page_views)
    return page_name, views
def get_popularity_score(page_name):
    """Get the total views of a page over the last year using PageView API from Wikipedia."""
    today = datetime.date.today().strftime('%Y%m%d')
    last_year_today = (datetime.date.today() - datetime.timedelta(days=365)).strftime('%Y%m%d')
    page_views = pv.per_article(page=page_name, project='en.wikipedia',
                                start=last_year_today, end=today,
                                granularity='monthly').get('items')
    views = sum(item.get('views') for item in page_views)
    return views
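# A minimal usage sketch for get_popularity_score above, assuming the
# surrounding module imports datetime and pageviewapi as 'pv'; the page names
# are arbitrary examples, compared by their one-year view totals.
for name in ('Pizza', 'Sushi'):  # hypothetical example pages
    print(name, get_popularity_score(name))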
def parse(row):
    days_before = 15
    days_after = 15
    print(row[0])
    index = row[0]
    row = row[1]
    topic = row[0]
    start_date = (row[1] - pd.Timedelta(days=days_before)).strftime('%Y%m%d')
    end_date = (row[1] + pd.Timedelta(days=days_after)).strftime('%Y%m%d')

    def nan_row(interpreted):
        # Placeholder row when the topic cannot be resolved or lacks enough data.
        return [index, topic, interpreted, row[1]] + list(np.full(days_before + days_after + 1, np.nan))

    try:
        suggestion = wiki.search(topic)
        if len(suggestion) > 0:
            interpreted_topic = suggestion[0]
        else:
            return nan_row(topic)
        topic_views = pgviews.per_article('en.wikipedia', interpreted_topic,
                                          start_date, end_date, agent='user',
                                          access='all-access',
                                          granularity='daily')['items']
        if len(topic_views) < 16:
            return nan_row(interpreted_topic)
    except Exception:
        return nan_row(topic)
    views_list = np.array([day['views'] for day in topic_views])
    views_list = interpolate_zeros(views_list)
    # Day-over-day percentage change; require at least a 10% jump around the event day.
    pct_change = np.diff(views_list) / views_list[0:-1] * 100
    if np.max(pct_change[13:16]) < 10:
        return nan_row(interpreted_topic)
    return [index, topic, interpreted_topic, row[1]] + list(views_list) + [np.mean(views_list)]
def views(article_name, start_date, end_date):
    '''Input: a wiki article name and the desired date range for daily pageview data.
    Output: a list containing daily pageviews of the article for the specified range.
    '''
    d = pageviewapi.per_article('en.wikipedia', article_name, start_date, end_date,
                                access='all-access', agent='all-agents',
                                granularity='daily')
    page_views = [item['views'] for item in d['items']]
    return page_views
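# A minimal usage sketch for views() above, assuming pageviewapi is installed;
# the article and date range are arbitrary examples. The daily counts can then
# be summed or plotted.
daily = views('Python_(programming_language)', '20200101', '20200131')
print(len(daily), 'days,', sum(daily), 'total views')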
def getPageviews(articleName, lang):
    '''
    Get pageviews for a given article and language version of Wikipedia,
    converted to a dataframe.
    ---
    Parameters: article name; language abbreviation ('en', 'hi', 'ur', ...)
    Returns: dataframe with one row per day; columns are project, article, views, etc.
    ---
    '''
    # 'start' and 'end' are assumed to be module-level globals (YYYYMMDD strings).
    dailyPageViewsDict = pageviewapi.per_article(lang + '.wikipedia', articleName,
                                                 start, end,
                                                 agent='all-agents', granularity='daily')
    return pd.DataFrame.from_dict(dailyPageViewsDict['items'])
def get_wiki_views(self, article: str, start_date, end_date):
    """Return a dict of datetime -> views for the period between
    start and end date (both inclusive)."""
    _st = self._to_wiki_date_string(start_date)
    _nd = self._to_wiki_date_string(end_date)
    try:
        full_result = pageviewapi.per_article('en.wikipedia', article, _st, _nd,
                                              access='all-access', agent='all-agents',
                                              granularity='daily')
    except pageviewapi.client.ZeroOrDataNotLoadedException:
        full_result = dict(items=[])
    result_subset = {}
    for item in full_result["items"]:
        py_dt = self._to_datetime(item["timestamp"])
        result_subset[py_dt] = item["views"]
    return result_subset
def get_pageviews(self, country, current_date):
    logging.info('getting pageviews for ' + country)
    project = config.PREFIX[config.LANGUAGE[country]] + '.wikipedia'
    filepath = config.keywords_path / (country + '.txt')
    # Use the seven full days ending yesterday.
    current_date = current_date - timedelta(days=1)
    end = current_date.strftime('%Y%m%d')
    start_date = current_date - timedelta(days=6)
    start = start_date.strftime('%Y%m%d')
    features = pd.Series(dtype='int64')
    logging.info('from ' + str(start) + ' to ' + str(end))
    with open(str(filepath.absolute()), 'r') as file:
        for line in file:
            line = line.strip()
            count = 0
            try:
                res = pageviewapi.per_article(project, line, start, end,
                                              access='all-access', agent='all-agents',
                                              granularity='daily')
                for item in res['items']:
                    count += int(item['views'])
            except ZeroOrDataNotLoadedException:
                logging.info('ZeroOrDataNotLoadedException returned, saving pageviews as 0.')
                count = 0
            except ThrottlingException:
                logging.info('ThrottlingException returned, saving pageviews as 0.')
                count = 0
            features[line] = count
            logging.info('\tpageviews for ' + line + ' are ' + str(count))
    return features
def fetchwikiData(dataset, dataframe, path, startDate, endDate, granularity):
    print("Checking for Duplicates")
    dropDuplicates(dataset, path)
    count = 0
    fileCount = 0
    prefetchedFiles = os.listdir(path)
    if len(prefetchedFiles) != 0:
        fileCount = len(prefetchedFiles)
        prefetchedFiles = pd.DataFrame(prefetchedFiles, columns=['Page'])
        prefetchedFiles1 = prefetchedFiles.merge(dataset['Page'])
        # Remove stale files that are no longer part of the dataset.
        df = prefetchedFiles.set_index('Page').drop(prefetchedFiles1['Page']).reset_index()
        for name in list(df['Page']):
            os.remove(os.path.join(path, str(name)))
        # Skip pages whose files were already fetched.
        dataset = dataset.set_index('Page').drop(prefetchedFiles1['Page']).reset_index()
    dataframe = extractTitleAndAccessDetails(dataset)
    dataframe['Domain'] = dataframe['Domain'].str.replace('.org', '', regex=False)
    print("Beginning Download.\n")
    for domain, pageTitle, accessDetails, viewerType in zip(
            dataframe['Domain'], dataframe['Page'],
            dataframe['Access'], dataframe['Viewer Type']):
        try:
            jsonResults = pageviewapi.per_article(domain, pageTitle.replace('%0%0%0', '/'),
                                                  startDate, endDate,
                                                  accessDetails, viewerType, granularity)
            try:
                writeJSONFile(path, pageTitle, domain, accessDetails, viewerType, jsonResults)
                fileCount = fileCount + 1
                print("Saved", jsonResults['items'][0]['article'], fileCount)
            except OSError:
                fileCount, count = errorHandling(fileCount, count)
                continue
        except pageviewapi.client.ZeroOrDataNotLoadedException:
            # No data: write an empty placeholder file so the page is not retried.
            try:
                with open(path + "/" + str(pageTitle) + "_" + domain + ".org" + "_" +
                          accessDetails + "_" + viewerType, "w") as write_file:
                    json.dump('', write_file)
            except OSError:
                fileCount, count = errorHandling(fileCount, count)
                continue
            fileCount, count = errorHandling(fileCount, count)
            continue
    print("Download Completed.\n")
def fetch_timeseries_wikipedia(keyword, save_csv=True):
    try:
        interest_over_time = pageviewapi.per_article('en.wikipedia', keyword,
                                                     '20151101', '20191101',
                                                     access='all-access', agent='all-agents',
                                                     granularity='daily')
    except Exception:
        print("The chosen article doesn't exist")
        return None
    # Save to a csv if needed
    if save_csv:
        # Csv naming and path
        data_path = 'data/wikipedia/'
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        file_name = data_path + keyword.lower() + '_wikipedia_interest.csv'
        # Build the frame from the 'items' list (one record per day).
        interest_over_time_df = pd.DataFrame(interest_over_time['items'])
        interest_over_time_df.to_csv(file_name, index=False, encoding='utf-8')
    return interest_over_time
def average_page_view(entity):
    """Return the average daily page views of the entity's Wikipedia page
    over the interval 2020-01-01 to 2020-03-26 (86 days).
    :param entity: article title
    :return: average daily views (0 if no data)
    """
    start_date = '20200101'  # January 1, 2020
    end_date = '20200326'    # March 26, 2020
    n_of_page_views = 0
    try:
        page_views = pageviewapi.per_article('en.wikipedia', entity, start_date, end_date,
                                             access='all-access', agent='all-agents',
                                             granularity='daily')
        for article in page_views['items']:
            n_of_page_views += article['views']
    except Exception:
        n_of_page_views = 0
    return n_of_page_views / 86  # 86 days in the interval
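# A minimal usage sketch for average_page_view above, assuming pageviewapi is
# installed; the entity is an arbitrary example. Articles with no data in the
# interval simply average to 0.
print(average_page_view('Coronavirus'))  # hypothetical example entity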
import csv
import pageviewapi

with open('party_name.csv', 'r', encoding='utf-8') as inp, open('qwer.csv', 'w') as out:
    writer = csv.writer(out)
    s = 0
    for row in csv.reader(inp):
        if row:
            try:
                s += 1
                v = pageviewapi.per_article('en.wikipedia', row[0],
                                            '20151106', '20191120',
                                            access='all-access', agent='all-agents',
                                            granularity='daily')
                for i in v['items']:
                    list1 = [row[0], i['timestamp'], i['views']]
                    writer.writerow(list1)
                    print(list1)
            except Exception as e:
                print(e)
print("done")
def get_wiki_pageviews(twitter_file: str, wiki_file: str):
    """
    Get Wikipedia pageviews for each Twitter trend and save them in a csv
    :param twitter_file: path to csv with Twitter trends
    :param wiki_file: path to output csv where Wiki pageviews will be written
    """
    timeStamps = pd.read_csv(twitter_file, header=0, index_col=0)
    day_cols = [f't-{i}' for i in range(15, 0, -1)] + ['t']
    ts_per_article = pd.DataFrame(columns=['trend'] + day_cols)
    for index, row in timeStamps.iterrows():
        days_before = 15
        topic = row.name
        start_date = (datetime.today() - timedelta(days=days_before + 1)).strftime('%Y%m%d')
        end_date = datetime.today().strftime('%Y%m%d')
        try:
            suggestion = wiki.search(topic)
            if len(suggestion) > 0:
                interpreted_topic = suggestion[0]
            else:
                interpreted_topic = 'NA'
            if interpreted_topic != 'NA':  # '!=' rather than 'is not' for string comparison
                topic_views = pgviews.per_article('en.wikipedia', interpreted_topic,
                                                  start_date, end_date, agent='user',
                                                  access='all-access',
                                                  granularity='daily')['items']
                views_list = np.array([day['views'] for day in topic_views])
                views_list = interpolate_zeros(views_list)
                views_list = np.array(views_list).astype(int)
                new_row = {'trend': interpreted_topic}
                new_row.update(dict(zip(day_cols, views_list)))
                # DataFrame.append is deprecated; build a one-row frame and concat.
                ts_per_article = pd.concat([ts_per_article, pd.DataFrame([new_row])],
                                           ignore_index=True)
        except Exception:
            # Skip trends whose pageview lookup fails.
            interpreted_topic = topic
    ts_per_article[day_cols] = ts_per_article[day_cols].astype(int)
    ts_per_article.to_csv(wiki_file, index=False, header=True, encoding='utf-8')
def getSum(project, startdate, enddate, limit=1000, thumbsize=1000):
    # Define stopwords (namespace prefixes and special pages).
    stopwords = ['Progetto:', 'Pagina_principale', 'Wikipedia:', 'Aiuto:',
                 'Speciale:', 'Special:', 'File:', 'Categoria:', 'load.php']
    print(stopwords)
    maxvalue = 0
    data = dict()
    for date in daterange(startdate, enddate):
        print(date.strftime("%Y-%m-%d"))
        try:
            results = pageviewapi.top(project, date.year, date.strftime('%m'),
                                      date.strftime('%d'), access='all-access')
            for item in results['items'][0]['articles']:
                if item['article'] in data:
                    data[item['article']] += item['views']
                else:
                    data[item['article']] = item['views']
        except Exception:
            print('impossible to fetch', date.strftime("%Y-%m-%d"))
    data = sorted(data.items(), key=operator.itemgetter(1), reverse=True)
    articles = []
    # Create an object for each article, skipping stopwords.
    # w_stopwords and found_stopwords are module-level globals.
    rank = 1
    for elm in data:
        stop = False
        for stopword in stopwords:
            if stopword in elm[0]:
                stop = True
                print('stopped ' + elm[0])
                break
        if elm[0] in w_stopwords:
            stop = True
            found_stopwords.append(elm[0].replace('_', ' '))
            print('\tfound custom sw: ' + elm[0])
        if not stop:
            articles.append({'title': elm[0], 'pageviews': elm[1], 'rank': rank})
            rank = rank + 1
    # Add images and snippets (use the requested project rather than
    # hard-coding it.wikipedia).
    for article in articles[:limit]:
        article['image'] = getImage(project, article['title'], thumbsize)
        article['snippet'] = getSnippet(project, article['title'])
    # Add daily pageviews.
    for article in articles[:limit]:
        print('loading stats for', article['title'], 'from',
              startdate.strftime('%Y%m%d'), 'to', enddate.strftime('%Y%m%d'))
        raw_stats = pageviewapi.per_article(
            project, urllib.parse.quote(article['title'].encode('utf8')),
            startdate.strftime('%Y%m%d'), enddate.strftime('%Y%m%d'),
            access='all-access', agent='all-agents', granularity='daily')
        # Parse raw stats; for now optimized for the Vega code, quite messy.
        stats = [{'name': 'table', 'values': []}]
        for item in raw_stats['items']:
            # Timestamps look like '2017120600', i.e. %Y%m%d%H (not %M).
            item_result = {
                'x': datetime.strptime(item['timestamp'], "%Y%m%d%H").strftime("%m/%d/%Y"),
                'y': item['views'],
            }
            if int(item['views']) > maxvalue:
                maxvalue = int(item['views'])
            stats[0]['values'].append(item_result)
        print(json.dumps(stats, indent=4, sort_keys=True))
        article['stats'] = stats
    results = {
        'maxvalue': maxvalue,
        'project': project,
        'startdate': startdate.strftime("%Y-%m-%d"),
        'enddate': enddate.strftime("%Y-%m-%d"),
        'articles': articles[:limit],
    }
    return results
""" import pageviewapi tot=pageviewapi.per_article('sv.wikipedia', 'Lady Bird', '20180211', '20180212', access='all-access', agent='all-agents', granularity='daily') import pageviewapi.period pageviewapi.period.sum_last('sv.wikipedia', 'Paris', last=30, access='all-access', agent='all-agents') tot=pageviewapi.period.avg_last('sv.wikipedia', 'Paris', last=30) """ import pageviewapi tot=pageviewapi.per_article('sv.wikipedia', 'Baywatch', '20170602', '20170702', access='all-access', agent='all-agents', granularity='daily') print tot.items()[0][1][1]['views'] """ AttrDict({u'items': [{u'access': u'all-access', u'views': 13, u'timestamp': u'2017120600', u'agent': u'all-agents', u'project': u'sv.wikipedia', u'granularity': u'daily', u'article': u'The_Ring_(film)'}, {u'access': u'all-access', u'views': 17, u'timestamp': u'2017120700', u'agent': u'all-agents', u'project': u'sv.wikipedia', u'granularity': u'daily', u'article': u'The_Ring_(film)'}, {u'access': u'all-access', u'views': 30, u'timestamp': u'2017120800', u'agent': u'all-agents', u'project': u'sv.wikipedia', u'granularity': u'daily', u'article': u'The_Ring_(film)'}, {u'access': u'all-access', u'views': 227, u'timestamp': u'2017120900', u'agent': u'all-agents', u'project': u'sv.wikipedia', u'granularity': u'daily', u'article': u'The_Ring_(film)'}, {u'access': u'all-access', u'views': 46, u'timestamp': u'2017121000', u'agent': u'all-agents',
import wikipedia
import pageviewapi
import pymongo
import os
from pymongo import MongoClient

# print(wikipedia.search("Barack"))
# ny = wikipedia.page("New York")
# print(ny.title)

# Data: daily pageviews for the 'Barack' article.
notice = pageviewapi.per_article('en.wikipedia', 'Barack', '20151106', '20151120',
                                 access='all-access', agent='all-agents',
                                 granularity='daily')
# print(notice)

# Connect to MongoDB and create the database.
myclient = MongoClient(host=os.environ['MONGO_HOST'], port=int(os.environ['MONGO_PORT']))
db = myclient.wiki

# Collection
datos = db.datos
result = datos.insert_one(notice)
print('Inserted object ' + str(result.inserted_id))
def page_view(read_link, to_link):
    directory = "output"
    parent_dir = "./test/"  # Parent directory path
    path = os.path.join(parent_dir, directory)
    os.makedirs(path, exist_ok=True)
    result = pd.read_csv(read_link)
    result = result.drop(columns=['Unnamed: 0'])
    result['title'] = result['title'].str.replace("_", " ")
    result.columns = ['date', 'revert', 'edit', 'commentor', 'title', 'comment']
    # Earliest comment date per title; used as the pageview start date.
    page_view_df = result.groupby(['title']).agg({'date': [np.min]})
    page_view_df.columns = ['min_date']
    page_view_df['title'] = page_view_df.index
    page_view_df['title'] = page_view_df['title'].str.replace("_", " ")
    page_view_df = page_view_df.reset_index(drop=True)
    page_view_df['min_date'] = pd.to_datetime(page_view_df['min_date'])
    page_view_df['min_date'] = page_view_df.min_date.map(lambda x: x.strftime('%Y%m%d'))
    dataframe = pd.DataFrame()
    dictionary_other = {}
    for i in np.arange(page_view_df.shape[0]):
        title = page_view_df.iloc[i]['title']
        start_date = page_view_df.iloc[i]['min_date']
        try:
            page_v_dict = pageviewapi.per_article('en.wikipedia', title, start_date,
                                                  '20210101', access='all-access',
                                                  agent='all-agents', granularity='daily')
            new_dictionary = {}
            for item in page_v_dict['items']:
                new_dictionary['title'] = item['article'].replace('_', ' ')
                new_dictionary['timestamp'] = item['timestamp']
                # Accumulate total views across the whole range.
                new_dictionary['views'] = new_dictionary.get('views', 0) + item['views']
            new_dataf = pd.DataFrame(new_dictionary, index=[0])
            dataframe = pd.concat([new_dataf, dataframe])
        except Exception:
            dictionary_other[title] = np.nan
            continue
    page_view = dataframe.drop_duplicates()
    page_view.reset_index(drop=True, inplace=True)
    # Titles with no pageview data get NaN views and a sentinel timestamp.
    non = pd.DataFrame.from_dict(dictionary_other, orient='index')
    non['title'] = non.index
    non = non.rename(columns={0: 'views'})
    non = non.reset_index(drop=True)
    non['timestamp'] = '2021010100'
    page_view = pd.concat([page_view, non])
    page_view = page_view[['title', 'views']]
    last_dataf = result.merge(page_view, how='left', on='title')
    result = last_dataf.to_csv(to_link, index=False)
    return result