Example #1
import logging
import webget

def run(arguments):
    if check_args(arguments):
        for idx, argument in enumerate(arguments):
            logging.info('{} argument is {}'.format(idx, argument))
            webget.download(argument.rstrip())
    else:
        print('Usage: python your_script.py arg_1 [arg_2 ...]')
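A minimal sketch of what this webget.download helper might look like, inferred only from how it is called throughout these examples (optional target name, returns the local path); the real course module may differ.

import os
from urllib.parse import urlparse
from urllib.request import urlretrieve

def download(url, to=None):
    # Hypothetical reimplementation: derive a local name from the URL unless one is given.
    filename = to or os.path.basename(urlparse(url).path)
    if not os.path.isfile(filename):  # skip files that already exist
        urlretrieve(url, filename)
    return filename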
Example #2
def io_heavy_time_it(list_url):
    for url in list_url:
        print(url)
        try:
            webget.download(url)
        except Exception:
            # Surface any download failure as the project's own exception type.
            raise NotFoundException('File not found!')
Example #3
import os
from webget import download

def download_facebook_file():
    download_link = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
    # Local name assumed from the URL's basename.
    filename = "facebook_combined.txt.gz"

    if os.path.isfile(filename):
        print("File exists")
    else:
        download(download_link, filename)
Example #4
import webget
import xlrd

def _get_sheet(url):
    file_name = webget.download(url)
    xl_workbook = xlrd.open_workbook(file_name)
    sheet_names = xl_workbook.sheet_names()

    # Use the first sheet in the workbook.
    xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
    return xl_sheet
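A hypothetical call, assuming the URL points at an .xls workbook:

sheet = _get_sheet("https://example.com/data.xls")
print(sheet.nrows, sheet.ncols)  # dimensions of the first sheet
print(sheet.cell_value(0, 0))    # value of the top-left cell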
Example #5
def scrape_videos(self, url):
    res = None
    savedir = self.create_video_dir(url)
    file_list = []
    data = webget.download(url)
    soup = BeautifulSoup(data, 'html.parser')
    video_tags = soup.findAll('video')
    video_counter = 0
    for tag in video_tags:
        video_urls = tag.findAll('source')
        video_counter += 1
        print("set video_counter to : {0}".format(video_counter))
        for video_url in video_urls:
            if 'base.mp4' in video_url.get('src'):
                res = video_url.get('src')
                print("creating filename with number: {0}".format(video_counter))
                file_name = self.create_file_name(url, video_counter)
                full_path = os.path.join(savedir, file_name)
                file_list.append(full_path)
                print('downloading file number {0} : {1} to {2}'.format(video_counter, res, full_path))
                if os.path.isfile(full_path):
                    # print('i already have that file')
                    continue
                else:
                    urllib.request.urlretrieve(res, full_path)
    return file_list
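A hypothetical call, assuming the surrounding class (with the create_video_dir and create_file_name helpers used above) is named VideoScraper:

scraper = VideoScraper()
files = scraper.scrape_videos("https://example.com/videos")
print(files)  # local paths of the saved base.mp4 files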
Example #6
import webget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

# Setting up useable csv file
webget.download(
    "https://github.com/mathiasjepsen/PythonDatasetAssignment/raw/master/ks-projects-201801.csv"
)

#filed = glob('.\\ks-projects-201801.csv*')
#ks_projects = filed[0]
ks_df = pd.read_csv("ks-projects-201801.csv")
ks_matrix = ks_df.to_numpy()  # as_matrix() was removed in newer pandas


# See columns for later use
# print(ks_df.dtypes)
def question_1():
    _, count = np.unique(ks_matrix[:, 3], return_counts=True)
    mask = (ks_matrix[:, 9] == "successful")
    successful_ks_projects = ks_matrix[mask]
    main_success, success_count = np.unique(successful_ks_projects[:, 3],
                                            return_counts=True)
    print(main_success, success_count)
    success_rate = (success_count / count) * 100

    plt.figure("Question 1")
    plt.title("Successful kickstarters")
    plt.xlabel("Main Category")
Example #7
import webget

# Population by year, district, age and citizenship (befolkningen efter år, bydel, alder og statsborgerskab)
# http://data.kk.dk/dataset/befolkningen-efter-ar-bydel-alder-og-statsborgerskab

url = 'http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv'

filename = './befkbhalderstatkode.csv'
webget.download(url, filename)

with open(filename) as f_obj:
    content = f_obj.readlines()

# for line in content[:20]:
#     print(line.strip().split(','))
Example #8
import os
import webget
from urllib.parse import urlparse

def download_csv(url):
    webget.download(url)
    return os.path.basename(urlparse(url).path)
Example #9
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import webget

# Use webget only once, to download AviationDataset.csv.
webget.download(
    'https://raw.githubusercontent.com/edipetres/Depressed_Year/master/Dataset_Assignment/AviationDataset.csv'
)
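To honour the "only once" note programmatically, a guard like the one in Example #13 can skip the download when the file is already on disk; a sketch, assuming webget saves under the URL's basename:

import os
if not os.path.isfile('AviationDataset.csv'):
    webget.download('https://raw.githubusercontent.com/edipetres/Depressed_Year/master/Dataset_Assignment/AviationDataset.csv')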
Example #10
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import webget as wg
from collections import OrderedDict
import operator  # used to sort our dictionary!

file_link = "https://raw.githubusercontent.com/INFINITE-KH/Python-Dataset/master/complete.csv"
file_name = "fifaplayers.csv"
wg.download(file_link, file_name)
FifaPlayers = pd.read_csv(file_name)
fp = FifaPlayers.to_numpy()  # as_matrix() was removed in newer pandas


def question1_dict_builder(club_list, value_list):
    # Sum each club's player values into one dict entry per club.
    clubsDict = {}
    for club, value in zip(club_list, value_list):
        if club in clubsDict:
            clubsDict[club] += value
        else:
            clubsDict[club] = value
    return clubsDict


def question1():
    df = pd.read_csv(file_name)

    result = question1_dict_builder(df.club, df.eur_value)
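The same per-club sum can come straight from pandas, as Example #15 below does with this dataset:

result = df.groupby('club')['eur_value'].sum()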
Example #11
import webget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime

trump_csv = pd.read_csv(webget.download(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/realDonaldTrump.csv"
),
                        encoding="ISO-8859-1")
obama_csv = pd.read_csv(webget.download(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv"
),
                        encoding="ISO-8859-1")


#Question 1: How many tweets did Trump make weekly during the years 2016-2017?
def question_1():
    trump_csv["created_at"] = pd.to_datetime(trump_csv["created_at"])
    # Filter to the 2016-2017 span first, then count tweets per (year, week).
    tweets = trump_csv[trump_csv.created_at > "2016-01-01"]
    weekly_tweets_by_year = tweets.groupby([
        tweets["created_at"].dt.year,
        tweets["created_at"].dt.isocalendar().week
    ]).size()

    plot = weekly_tweets_by_year.plot.bar()

    for label in plot.xaxis.get_ticklabels()[::]:
        label.set_visible(False)
    for label in plot.xaxis.get_ticklabels()[::4]:
        label.set_visible(True)
Example #12
    adult_runtime = {'runtime': 0, 'amount_of_movies': 0}
    # Read the TSV file chunk by chunk - chunksize counts rows, so each chunk is 1024 rows
    for chunk in pd.read_table(file_path, sep='\t', chunksize=1024):
        dd = chunk.to_numpy()  # as_matrix() was removed in newer pandas
        mask = (dd[:, 4] == 1)
        for i in dd[mask][:,7]:
            # only if the runtime is not \\N
            if i != '\\N':
                adult_runtime['runtime'] += int(i)
                adult_runtime['amount_of_movies'] += 1
    print("The average runtime of adult movies is: ",
          int(adult_runtime['runtime']/adult_runtime['amount_of_movies']),
          " min")


def main(file_path):
    question_1(file_path)
    question_2(file_path)
    question_3(file_path)
    question_4(file_path)
    question_5(file_path)


if __name__ == "__main__":
    try:
        # Read the sys argument at index 1
        main(sys.argv[1])
    except IndexError:
        # If no argument was given, download the file instead.
        main(webget.download("https://raw.githubusercontent.com/PatrickFenger/pythonAssignments/master/Assignment_4/title.basics.tsv"))
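A hypothetical invocation (script name assumed):

# python imdb_questions.py title.basics.tsv
# ...or run it with no argument to fall back to the download above.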
Example #13
import webget
import os
import csv
import pprint

if os.path.isfile("befkbhalderstatkode.csv"):
    print("File found.")
else:
    print("File not found; downloading...")
    url = "http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv"
    webget.download(url)


# Nest each value in `values` one level deeper, keyed by values[index];
# the last two values become a key: value leaf pair.
def dictMaker(tempDict, values, index):
    keyToInsert = values[index]
    # Sort of a base case:
    if index >= len(values) - 2:
        tempDict[keyToInsert] = values[index + 1]
        return
    if keyToInsert not in tempDict:
        tempDict[keyToInsert] = {}
        return tempDict[keyToInsert]
    else:
        return tempDict[keyToInsert]
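A hypothetical driver for dictMaker, assuming rows of the population CSV shaped like [year, district, age, statkode, persons]:

root = {}
row = [2015, 1, 0, 5100, 614]  # sample values, not taken from the file
node = root
for idx in range(len(row) - 1):
    nxt = dictMaker(node, row, idx)
    if nxt is None:  # base case reached: leaf value stored
        break
    node = nxt
# root is now {2015: {1: {0: {5100: 614}}}}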


dictn = {}
filename = "befkbhalderstatkode.csv"
# filename = "test.csv"
with open(filename) as data_file:
Example #14
#Group Success Impossible - Airlines_Dataset - Arkadiusz
#API-Key: 8FE60437-FDD3-4198-8238-599FE867730B
import webget as wg
import pandas as pd

url = 'https://raw.githubusercontent.com/PeterL93/PythonProject/master/trades_march_to_april_2018.csv'
response = wg.download(url)
print(response + ' downloaded')

# The dataset packs multiple values per line, so read it with ';' as the separator.
dataFrame = pd.read_csv('trades_march_to_april_2018.csv', sep=';')

# Question 1: What is the transaction with the highest volume in the timespan?
maxVolume = max(dataFrame["size"])
print("The transaction with the highest volume is: " + str(maxVolume))
# Answer: 29.37650126 BTC

# Question 2: What is the average number of transactions per hour?
count = {}

for row in dataFrame["time_exchange"]:
    Split1 = row.split(':')
    Split2 = Split1[0].split('T')

    count.setdefault(Split2[1], 0)
    count[Split2[1]] += 1

    add = 0
    sum = 0
Example #15
import webget
import pandas as pd
from glob import glob

# Solutions to TEAM: Naughty Solution | Ali Khazendar, Casper Emde Christensen, Stephan Pedersen, Nicklas Vikke

# Setting up usable csv file
webget.download(
    'https://raw.githubusercontent.com/INFINITE-KH/Python-Dataset/master/complete.csv'
)

filed = glob('.\\complete.csv*')
complete_csv = filed[0]

data = pd.read_csv(complete_csv)

#See labels for later use
print(data.dtypes)

##
#Question 1: The 3 most expensive teams and the 3 cheapest teams according to player value.
##

df = data.groupby('club')['eur_value'].sum()
print(df.sort_values(ascending=False))

##
#Question 2: Which nationality is the most frequent amongst all players
##

sf = data.groupby('nationality').count()
Example #16
import matplotlib as mpl
mpl.use(
    'TkAgg'
)  # Code ran fine on my Windows pc but had to change framework settings to _macosx to run on my laptop
import pandas as pd
import webget as wg
import re
from collections import Counter

url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/realDonaldTrump.csv'
url1 = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv'
# Datasets start with the lines: created_at,text,url,replies,retweets,favorites,user.
# Not sure if these are meant to be columns or?
# Dataset columns used are 1 and 6. 1 being Text, 6 being Users.

response = wg.download(url)
response1 = wg.download(url1)
print(response + response1, "downloaded")

dataFrame = pd.read_csv('realDonaldTrump.csv', encoding='UTF-16')
dataFrame2 = pd.read_csv('BarackObama.csv', encoding='UTF-16')
obama_matrix = dataFrame2.to_numpy()  # Converting frame to numpy array; as_matrix() was removed in newer pandas.
donald_matrix = dataFrame.to_numpy()


def question_1():
    count = Counter(donald_matrix[:, 6])
    print("Question 1:")
    print(sum(count.values()))
Example #17
import webget

def download_urls(urls, filenames):
    # Pair each URL with its target filename.
    for url, name in zip(urls, filenames):
        webget.download(url, to=f'todelete/{name}')
Example #18
    def set_timeA(self, time_a):  # set a comparison time, and calculate a mktime
        ta = (int(time_a[self.ts_y:self.ts_y + 4]), int(time_a[self.ts_M:self.ts_M + 2]),
              int(time_a[self.ts_d:self.ts_d + 2]), int(time_a[self.ts_h:self.ts_h + 2]),
              int(time_a[self.ts_m:self.ts_m + 2]), int(time_a[self.ts_s:self.ts_s + 2]),
              0, 0, 0)
        self.mk_a = time.mktime(ta)

    def set_timeB(self, time_b):  # set a time to compare, calculate its mktime and return the mk_a < mk_b evaluation
        tb = (int(time_b[self.ts_y:self.ts_y + 4]), int(time_b[self.ts_M:self.ts_M + 2]),
              int(time_b[self.ts_d:self.ts_d + 2]), int(time_b[self.ts_h:self.ts_h + 2]),
              int(time_b[self.ts_m:self.ts_m + 2]), int(time_b[self.ts_s:self.ts_s + 2]),
              0, 0, 0)
        self.mk_b = time.mktime(tb)

        try:
            self.comp = self.mk_a < self.mk_b
            return self.comp
        except AttributeError:
            # set_timeA was never called; fall back to the last stored comparison.
            return self.comp

download("https://github.com/HawkDon/Python_Assignment1/raw/master/BobRoss.txt", "BobRoss.txt") #download the BobRoss.txt if it does not already exist

datafile = "BobRoss.txt"
timings = time_comp("YYYY-MM-DDThh:mm:ss.") # declare the time format used in the chatlog 
timings.set_timeA("2015-10-30T03:00:00.") #fixed time for time comparison



users = set() # users set to add users when found - contains only unique users
worddict = {} # worddict dictionary for holding all words from the chatlog as a key, and the number of appearances as value 

with open(datafile) as fp: #open the BobRoss chatlog
    l = 0
    i = 0
    for line in fp: #iterate the lines in the chatlog
        l += 1 #count the lines
Example #19
import webget

webget.download("http://textfiles.com/anarchy/001.txt")



Example #20
import os
import webget
from urllib.parse import urlparse

def download(link):
    webget.download(link)
    return os.path.basename(urlparse(link).path)
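A hypothetical call (URL borrowed from Example #3):

name = download("https://snap.stanford.edu/data/facebook_combined.txt.gz")
print(name)  # 'facebook_combined.txt.gz' - the basename of the URL path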
Example #21
import webget

def run(arguments):
    if check_args(arguments):
        for argument in arguments:
            webget.download(argument)
    else:
        print('No arguments!')
Example #22
import webget
import sys

# Download each command-line argument individually; the other examples call webget.download with a single URL.
urls = sys.argv[1:]
for url in urls:
    webget.download(url)
Example #23
import os
from webget import download

def download_file():
    download_link = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
    file = "facebook_combined.txt.gz"  # local name assumed from the URL's basename
    if os.path.isfile(file):
        print("file exists")
    else:
        download(download_link, file)
Example #24
from webget import download
import sys

# Download each command-line URL one at a time.
for url in sys.argv[1:]:
    download(url)
Example #25
import webget
import sys

# Download each command-line URL one at a time.
for url in sys.argv[1:]:
    webget.download(url)
Example #26
import webget
import pprint

urllist = [
    "http://data.kk.dk/dataset/76ecf368-bf2d-46a2-bcf8-adaf37662528/resource/9286af17-f74e-46c9-a428-9fb707542189/download/befkbhalderstatkode.csv"
]
# Download each URL individually; the other examples call webget.download with a single URL.
for url in urllist:
    webget.download(url)

with open('befkbhalderstatkode.csv') as file_object:
    STATISTICS = {}

    # making a list of strings
    lines = file_object.readlines()

    # removing first string (the headers)
    lines.pop(0)

    # converting the list of strings to a list of int lists
    listofintarrays = []
    for line in lines:
        listofintarrays.append([int(value) for value in line.rstrip().split(",")])

    # using loops to fill the STATISTICS dictionary
    for item in listofintarrays:
        if item[0] in STATISTICS:
            continue
        else:
Example #27
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import webget as wg
import gzip

file_link = "https://datasets.imdbws.com/title.basics.tsv.gz"
file_name = "imdb_titles.tsv.gz"
zipped_file = wg.download(file_link, file_name)
file = gzip.GzipFile(zipped_file)
imdb_titles = pd.read_table(file)
imdb_titles_matrix = imdb_titles.to_numpy()  # as_matrix() was removed in newer pandas
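As a side note, pandas can read the gzip directly (compression is inferred from the .gz suffix), so the explicit GzipFile step is optional:

imdb_titles = pd.read_table(file_name)  # equivalent shortcut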

# 0 tconst	1 titleType	2 primaryTitle	3 originalTitle	4 isAdult	5 startYear	6 endYear	7 runtimeMinutes	8 genres


def fileopener(path_to_file):
    with open(path_to_file) as f:
        for line in f:
            yield line


def question1():
    imdb_titles_movies = imdb_titles_matrix[
        imdb_titles_matrix[:, 1] != "movie"][:, 5]
    imdb_titles_movies = imdb_titles_movies[imdb_titles_movies != "\\N"]

    years, count = np.unique(imdb_titles_movies.astype(int),
                             return_counts=True)
    limit = 10
Example #28
import os
import webget
from urllib.parse import urlparse

def download(url):
    webget.download(url)
    return os.path.basename(urlparse(url).path)
Example #29
def download(self, url, filename=None):
    try:
        webget.download(url, filename)
        # wget.download(url, filename)
    except Exception:
        raise NotFoundException('File not found!')
Example #30
import webget
import sys

if __name__ == '__main__':

    #data = sys.stdin.read().split('\n')
    data = sys.argv[1:]
    for url in data:
        webget.download(url, "./downloads/")