def run(arguments):
    """Validate *arguments* and download each one as a URL.

    Prints a usage line and does nothing else when validation fails.
    """
    if not check_args(arguments):
        # Guard clause: bail out early instead of nesting the happy path.
        print('Usage: python your_script.py arg_1 [arg_2 ...]')
        return
    for idx, argument in enumerate(arguments):
        logging.info('{} argument is {}'.format(idx, argument))
        webget.download(argument.rstrip())
def io_heavy_time_it(list_url):
    """Download every URL in *list_url*, echoing each one first.

    Raises:
        NotFoundException: when any single download fails.
    """
    for url in list_url:
        print(url)
        try:
            webget.download(url)
        except Exception as exc:
            # Was a bare ``except:`` — that also swallowed KeyboardInterrupt
            # and SystemExit and discarded the original traceback. Catch
            # Exception and chain so the real cause survives.
            raise NotFoundException('File not found!') from exc
def download_facebook_file():
    """Fetch the SNAP facebook_combined dataset unless it is already on disk.

    NOTE(review): relies on a module-level ``filename`` that is not defined
    in this block — confirm it exists at module scope.
    """
    download_link = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
    # Removed dead local ``shouldDownload`` — it was assigned but never read.
    if os.path.isfile(filename):
        print("File exists")
    else:
        download(download_link, filename)
def _get_sheet(url):
    """Download the workbook at *url* and return its first worksheet."""
    workbook = xlrd.open_workbook(webget.download(url))
    first_sheet_name = workbook.sheet_names()[0]
    return workbook.sheet_by_name(first_sheet_name)
def scrape_videos(self, url):
    """Scrape <video> tags from *url* and download every 'base.mp4' source.

    Returns:
        list[str]: full paths of the files (already present or downloaded).
    """
    savedir = self.create_video_dir(url)
    file_list = []
    data = webget.download(url)
    soup = BeautifulSoup(data, 'html.parser')
    video_tags = soup.findAll('video')
    video_counter = 0
    for tag in video_tags:
        video_urls = tag.findAll('source')
        video_counter += 1
        print("set video_counter to : {0}".format(video_counter))
        # BUG FIX: ``res`` was initialised once before the loop, so a tag
        # without a matching source reused the previous tag's URL — or was
        # still None on the first tag, crashing urlretrieve. Reset per tag.
        res = None
        for video_url in video_urls:
            if 'base.mp4' in video_url.get('src'):
                res = video_url.get('src')
        if res is None:
            # No base.mp4 source in this tag: skip instead of downloading
            # a stale/None URL.
            continue
        print("creating filname with number: {0}".format(video_counter))
        file_name = self.create_file_name(url, video_counter)
        full_path = os.path.join(savedir, file_name)
        file_list.append(full_path)
        print('downloading file number {0} : {1} to {2}'.format(video_counter, res, full_path))
        if os.path.isfile(full_path):
            # Already on disk: keep the path in the result, skip the fetch.
            continue
        else:
            urllib.request.urlretrieve(res, full_path)
    return file_list
import webget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

# Setting up useable csv file
webget.download(
    "https://github.com/mathiasjepsen/PythonDatasetAssignment/raw/master/ks-projects-201801.csv"
)

ks_df = pd.read_csv("ks-projects-201801.csv")
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
# documented replacement and returns the same ndarray.
ks_matrix = ks_df.to_numpy()

# See columns for later use
# print(ks_df.dtypes)


def question_1():
    """Plot the share of successful projects per main category (column 3).

    Column 9 holds the project state; "successful" rows are counted against
    the per-category totals.
    """
    _, count = np.unique(ks_matrix[:, 3], return_counts=True)
    mask = (ks_matrix[:, 9] == "successful")
    successful_ks_projects = ks_matrix[mask]
    main_success, success_count = np.unique(successful_ks_projects[:, 3],
                                            return_counts=True)
    print(main_success, success_count)
    success_rate = (success_count / count) * 100
    plt.figure("Question 1")
    plt.title("Successful kickstarters")
    plt.xlabel("Main Category")
import webget

# Population by year, district, age and citizenship.
# http://data.kk.dk/dataset/befolkningen-efter-ar-bydel-alder-og-statsborgerskab
url = 'http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv'
filename = './befkbhalderstatkode.csv'

webget.download(url, filename)

# Read every row of the downloaded csv into memory.
with open(filename) as f_obj:
    content = f_obj.readlines()
def download_csv(url):
    """Download *url* and return the file name implied by its path component.

    NOTE(review): the returned name is derived from the URL, not from where
    webget actually saved the file — confirm the two agree.
    """
    # The return value of webget.download was bound to an unused local
    # (``file``, which also shadows a legacy builtin name); dropped.
    webget.download(url)
    return os.path.basename(urlparse(url).path)
import numpy as np
import matplotlib.pyplot as plt
import webget
import pandas as pd
import collections

# One-off download of the aviation dataset. (The original comment mentioned
# vgsales.csv, but the URL actually fetches AviationDataset.csv.)
webget.download(
    'https://raw.githubusercontent.com/edipetres/Depressed_Year/master/Dataset_Assignment/AviationDataset.csv'
)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import webget as wg
from collections import OrderedDict
import operator  # used to sort our dictionary!

file_link = "https://raw.githubusercontent.com/INFINITE-KH/Python-Dataset/master/complete.csv"
file_name = "fifaplayers.csv"

wg.download(file_link, file_name)
FifaPlayers = pd.read_csv(file_name)
# as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
fp = FifaPlayers.to_numpy()


def question1_dict_builder(club_list, value_list):
    """Return {club: summed value} aggregated pairwise over the two sequences.

    ``club_list`` and ``value_list`` are parallel; each value is added to the
    running total of its club.
    """
    clubsDict = {}
    # zip + dict.get replaces the original manual index counter and the
    # duplicated in/not-in branches — same totals, one pass.
    for club, value in zip(club_list, value_list):
        clubsDict[club] = clubsDict.get(club, 0) + value
    return clubsDict


def question1():
    """Aggregate total player value (eur_value) per club."""
    df = pd.read_csv(file_name)
    result = question1_dict_builder(df.club, df.eur_value)
import webget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime

trump_csv = pd.read_csv(webget.download(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/realDonaldTrump.csv"
), encoding="ISO-8859-1")
obama_csv = pd.read_csv(webget.download(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv"
), encoding="ISO-8859-1")


#Question 1: How many tweets did Trump make weekly during the years 2016-2017?
def question_1():
    """Bar-plot Trump's weekly tweet counts for tweets after 2016-01-01."""
    trump_csv["created_at"] = pd.to_datetime(trump_csv["created_at"])
    # Series.dt.week was deprecated and removed in pandas 2.0;
    # dt.isocalendar().week yields the same ISO week number.
    weekly_tweets_by_year = trump_csv.groupby([
        trump_csv[trump_csv.created_at > "2016-01-01"]["created_at"].dt.year,
        trump_csv["created_at"].dt.isocalendar().week
    ]).size()
    plot = weekly_tweets_by_year.plot.bar()
    # Thin the x axis: hide every tick label, then re-show every 4th.
    for label in plot.xaxis.get_ticklabels():
        label.set_visible(False)
    for label in plot.xaxis.get_ticklabels()[::4]:
        label.set_visible(True)
# NOTE(review): the code down to the print() looks like the tail of a
# question function — ``file_path`` is not defined at this level; confirm
# whether a ``def question_5(file_path):`` line was lost above this block.
adult_runtime = {'runtime': 0, 'amount_of_movies': 0}
# Reads the tsv in chunks; chunksize=1024 is 1024 *rows* per chunk
# (the original comment said "1 mb", which pandas does not support here).
for chunk in pd.read_table(file_path, sep='\t', chunksize=1024):
    # as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
    dd = chunk.to_numpy()
    mask = (dd[:, 4] == 1)  # column 4: isAdult flag
    for i in dd[mask][:, 7]:  # column 7: runtimeMinutes
        # only if the runtime is not \N (IMDb's null marker)
        if i != '\\N':
            adult_runtime['runtime'] += int(i)
            adult_runtime['amount_of_movies'] += 1
print("The average runtime of adult movies is: ",
      int(adult_runtime['runtime'] / adult_runtime['amount_of_movies']),
      " min")


def main(file_path):
    """Run all five dataset questions against *file_path*."""
    question_1(file_path)
    question_2(file_path)
    question_3(file_path)
    question_4(file_path)
    question_5(file_path)


# BUG FIX: was ``if __name__ in "__main__":`` — a substring test that only
# happened to work; ``==`` is the intended comparison.
if __name__ == "__main__":
    try:
        # Read the sys argument at index 1
        main(sys.argv[1])
    except IndexError:
        # If no argument is given we download the file ourselves.
        main(webget.download("https://raw.githubusercontent.com/PatrickFenger/pythonAssignments/master/Assignment_4/title.basics.tsv"))
import webget
import os
import csv
import pprint

if os.path.isfile("befkbhalderstatkode.csv"):
    print("File found.")
else:
    print("File not found; downloading...")
    url = "http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv"
    webget.download(url)


def dictMaker(tempDict, values, index):
    """Descend/build one level of a nested dict keyed by values[index].

    At the second-to-last position the final value is stored as a leaf and
    None is returned; otherwise the (possibly new) child dict is returned so
    the caller can recurse into it.
    """
    keyToInsert = values[index]
    # Base case: store the last value as a leaf under the current key.
    if index >= len(values) - 2:
        tempDict[keyToInsert] = values[index + 1]
        return
    # setdefault replaces the original's redundant in/not-in branches —
    # both of them returned tempDict[keyToInsert], creating it first
    # when absent.
    return tempDict.setdefault(keyToInsert, {})


dictn = {}
filename = "befkbhalderstatkode.csv"

with open(filename) as data_file:
    pass  # NOTE(review): the body of this ``with`` is truncated in this view
# Group Success Impossible - Airlines_Dataset - Arkadiusz
# API-Key: 8FE60437-FDD3-4198-8238-599FE867730B
import webget as wg
import pandas as pd

url = 'https://raw.githubusercontent.com/PeterL93/PythonProject/master/trades_march_to_april_2018.csv'
response = wg.download(url)
print(response + 'downloaded')

# sep=';' because the dataset packs multiple values per line.
dataFrame = pd.read_csv('trades_march_to_april_2018.csv', sep=';')
# Removed ``pd.DataFrame(dataFrame)`` — its result was discarded (a no-op).

# Question 1: What is the transaction with the highest volume in the timespan?
maxVolume = max(dataFrame["size"])
print("The transaction with the highest volume is: " + str(maxVolume))
# Answer: 29.37650126 BTC

# Question 2: What is the average number of transactions per hour?
count = {}
for row in dataFrame["time_exchange"]:
    # Timestamps look like <date>T<hour>:<min>:<sec>; key the tally by hour.
    Split1 = row.split(':')
    Split2 = Split1[0].split('T')
    count.setdefault(Split2[1], 0)
    count[Split2[1]] += 1

add = 0
sum = 0  # NOTE(review): shadows the builtin ``sum``; kept because the
         # continuation of this script is not visible here.
import webget
import pandas as pd
from glob import glob

# Solutions to TEAM: Naughty Solution | Ali Khazendar, Casper Emde Christensen, Stephan Pedersen, Nicklas Vikke

# Fetch the dataset, then locate the csv it produced on disk.
webget.download(
    'https://raw.githubusercontent.com/INFINITE-KH/Python-Dataset/master/complete.csv'
)
filed = glob('.\\complete.csv*')
complete_csv = filed[0]
data = pd.read_csv(complete_csv)

# Show the column labels/dtypes for later reference.
print(data.dtypes)

##
# Question 1: The 3 most expensive teams and the 3 cheapest teams according to player value.
##
df = data.groupby('club')['eur_value'].sum()
ranked_clubs = df.sort_values(ascending=False)
print(ranked_clubs)

##
# Question 2: Which nationality is the most frequent amongst all players
##
sf = data.groupby('nationality').count()
mpl.use(
    'TkAgg'
)  # Code ran fine on Windows; backend had to change to _macosx on the laptop
import pandas as pd
import webget as wg
import re
from collections import Counter

url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/realDonaldTrump.csv'
url1 = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv'

# Datasets start with the lines: created_at,text,url,replies,retweets,favorites,user.
# Dataset columns used are 1 and 6. 1 being Text, 6 being Users.
response = wg.download(url)
response1 = wg.download(url1)
print(response + response1, "downloaded")

# NOTE(review): encoding='UTF-16' — raw GitHub csvs are commonly UTF-8;
# confirm these files really are UTF-16 on disk.
dataFrame = pd.read_csv('realDonaldTrump.csv', encoding='UTF-16')
dataFrame2 = pd.read_csv('BarackObama.csv', encoding='UTF-16')

# as_matrix() was removed in pandas 1.0; to_numpy() converts the frame to a
# numpy array the same way.
obama_matrix = dataFrame2.to_numpy()
donald_matrix = dataFrame.to_numpy()


def question_1():
    """Print the total number of tweets (rows) in the Trump dataset."""
    count = Counter(donald_matrix[:, 6])
    print("Question 1:")
    print(sum(count.values()))
def download_urls(urls, filenames):
    """Download each URL to ``todelete/<matching filename>``.

    *urls* and *filenames* are parallel sequences.

    NOTE(review): the original indexed ``filenames`` via enumerate and raised
    IndexError on a shorter ``filenames``; zip stops at the shorter sequence
    instead — confirm callers always pass equal-length sequences.
    """
    # zip pairs the sequences directly instead of enumerate-and-index.
    for url, target_name in zip(urls, filenames):
        webget.download(url, to=f'todelete/{target_name}')
def set_timeA(self,time_a):
    # Set the fixed comparison time: slice the timestamp string at the
    # configured field offsets (ts_y, ts_M, ts_d, ts_h, ts_m, ts_s) into a
    # 9-tuple accepted by time.mktime, and cache the epoch seconds in mk_a.
    # NOTE(review): looks like a method of the ``time_comp`` class used
    # below — the class header is not visible in this view.
    ta = (int(time_a[self.ts_y:(self.ts_y+4)]), int(time_a[self.ts_M:(self.ts_M+2)]), int(time_a[self.ts_d:(self.ts_d+2)]), int(time_a[self.ts_h:(self.ts_h+2)]), int(time_a[self.ts_m:(self.ts_m+2)]), int(time_a[self.ts_s:(self.ts_s+2)]), 0, 0, 0)
    self.mk_a = time.mktime(ta)

def set_timeB(self,time_b):
    # Set the time to compare against mk_a; returns mk_a < mk_b.
    tb = (int(time_b[self.ts_y:(self.ts_y+4)]), int(time_b[self.ts_M:(self.ts_M+2)]), int(time_b[self.ts_d:(self.ts_d+2)]), int(time_b[self.ts_h:(self.ts_h+2)]), int(time_b[self.ts_m:(self.ts_m+2)]), int(time_b[self.ts_s:(self.ts_s+2)]), 0, 0, 0)
    self.mk_b = time.mktime(tb)
    try:
        self.comp = self.mk_a < self.mk_b
        return self.comp
    except:
        # NOTE(review): if mk_a was never set, this except fires and the
        # ``return self.comp`` below raises AttributeError itself — it only
        # works when a previous comparison already stored self.comp.
        # Bare except also catches KeyboardInterrupt/SystemExit; confirm.
        return self.comp

download("https://github.com/HawkDon/Python_Assignment1/raw/master/BobRoss.txt", "BobRoss.txt") #download the BobRoss.txt if it does not already exist
datafile = "BobRoss.txt"
timings = time_comp("YYYY-MM-DDThh:mm:ss.") # declare the time format used in the chatlog
timings.set_timeA("2015-10-30T03:00:00.") #fixed time for time comparison
users = set() # users set to add users when found - contains only unique users
worddict = {} # worddict dictionary for holding all words from the chatlog as a key, and the number of appearances as value
with open(datafile) as fp: #open the BobRoss chatlog
    l = 0
    i = 0
    for line in fp: #iterate the lines in the chatlog
        l += 1 #count the lines
import webget

# Fetch a sample text file from textfiles.com.
target_url = "http://textfiles.com/anarchy/001.txt"
webget.download(target_url)
def download(link):
    """Download *link* and return the filename implied by the URL path.

    NOTE(review): the returned name is derived from the URL, not from where
    webget actually saved the file — confirm the two agree.
    """
    # The download result was bound to an unused local ``file``; dropped.
    webget.download(link)
    return os.path.basename(urlparse(link).path)
def run(arguments):
    """Download every URL in *arguments*; complain if validation fails."""
    if not check_args(arguments):
        # Guard clause keeps the download loop unindented.
        print('No arguments!')
        return
    for argument in arguments:
        webget.download(argument)
import webget
import sys

# Every CLI argument after the script name is treated as the download input.
# NOTE(review): the whole list is handed to webget.download in one call —
# confirm it accepts a sequence rather than a single URL string.
urls = sys.argv[1:]
webget.download(urls)
def download_file():
    """Download the SNAP facebook dataset unless it is already on disk.

    NOTE(review): ``file`` is not defined in this block — presumably a
    module-level path; confirm it exists at module scope.
    """
    download_link = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
    if not os.path.isfile(file):
        download(download_link, file)
    else:
        print("file exists")
from webget import download
import sys

# Forward every CLI argument (as one list) to download.
# NOTE(review): confirm download accepts a list of URLs.
cli_args = sys.argv[1:]
download(cli_args)
import webget
import sys

# Hand all CLI arguments (as one list) to webget.
# NOTE(review): confirm webget.download accepts a list of URLs.
requested = sys.argv[1:]
webget.download(requested)
import webget
import pprint

urllist = [
    "http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv"
]
# NOTE(review): a list is passed to webget.download — confirm it accepts one.
webget.download(urllist)

with open('befkbhalderstatkode.csv') as file_object:
    STATISTICS = {}
    # making a list of strings
    lines = file_object.readlines()
    # removing first string (the headers)
    lines.pop(0)
    # converting the list of strings to a list of int lists
    listofintarrays = []
    for line in lines:
        strarr = line.split(",")
        strarr[-1] = strarr[-1].rstrip()
        # Comprehension replaces the original index-assignment loop over
        # range(len(strarr)) — same int conversion, one pass.
        listofintarrays.append([int(field) for field in strarr])
    # using loops to fill the STATISTICS dictionary
    for item in listofintarrays:
        if item[0] in STATISTICS:
            continue
        else:
            pass  # NOTE(review): branch body truncated in this view of the source
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import webget as wg
import gzip

file_link = "https://datasets.imdbws.com/title.basics.tsv.gz"
file_name = "imdb_titles.tsv.gz"

zipped_file = wg.download(file_link, file_name)
file = gzip.GzipFile(zipped_file)
imdb_titles = pd.read_table(file)
# as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
imdb_titles_matrix = imdb_titles.to_numpy()

# 0 tconst 1 titleType 2 primaryTitle 3 originalTitle 4 isAdult
# 5 startYear 6 endYear 7 runtimeMinutes 8 genres


def fileopener(path_to_file):
    """Yield *path_to_file* line by line (lazy reader)."""
    with open(path_to_file) as f:
        for line in f:
            yield line


def question1():
    """Count titles per start year (column 5), skipping \\N null years.

    NOTE(review): the filter keeps rows whose titleType is NOT "movie",
    although the variable name suggests movies — confirm whether ``==``
    was intended.
    """
    imdb_titles_movies = imdb_titles_matrix[
        imdb_titles_matrix[:, 1] != "movie"][:, 5]
    imdb_titles_movies = imdb_titles_movies[imdb_titles_movies != "\\N"]
    years, count = np.unique(imdb_titles_movies.astype(int),
                             return_counts=True)
    limit = 10
def download(url):
    """Fetch *url* via webget and return the URL path's base filename."""
    webget.download(url)
    url_path = urlparse(url).path
    return os.path.basename(url_path)
def download(self, url, filename=None):
    """Download *url*, optionally saving it as *filename*.

    Raises:
        NotFoundException: when the underlying download fails.
    """
    try:
        webget.download(url, filename)
    except Exception as exc:
        # Was a bare ``except:`` — that also swallowed KeyboardInterrupt and
        # SystemExit and discarded the traceback. Catch Exception and chain
        # so the original cause is preserved.
        raise NotFoundException('File not found!') from exc
import webget
import sys

if __name__ == '__main__':
    # URLs come straight from the command line (a stdin variant existed in
    # the original as commented-out code).
    data = sys.argv[1:]
    for link in data:
        webget.download(link, "./downloads/")