def kernel_update(request):
    """Pull the kernel from Kaggle, push it back, and copy it to the bucket.

    ``request`` is accepted but never read by this function.

    The kernel ``USERNAME/KERNEL_SLUG`` is pulled (with its metadata file)
    into ``PATH``, pushed again from ``PATH``, and finally both
    ``kernel-metadata.json`` and the notebook file are uploaded to ``BUCKET``.
    """
    kaggle = KaggleApi()
    kaggle.authenticate()

    workdir = f"{PATH}"
    kernel_ref = f"{USERNAME}/{KERNEL_SLUG}"
    notebook_name = f"{KERNEL_SLUG}.{KERNEL_EXTENSION}"

    # Pull the most recent version of the kernel, metadata included.
    kaggle.kernels_pull_cli(kernel_ref, path=workdir, metadata=True)

    # Push our notebook.
    kaggle.kernels_push_cli(workdir)

    # Save a copy of our notebook in our bucket (if you would prefer not to
    # save a copy, delete everything from here to the end of the function).
    target_bucket = storage.bucket(BUCKET)
    uploads = [
        (target_bucket.blob("kernel-metadata.json"),
         f"{PATH}/kernel-metadata.json"),
        (target_bucket.blob(notebook_name),
         f"{PATH}/{notebook_name}"),
    ]
    for blob, local_file in uploads:
        blob.upload_from_filename(local_file)
# Elo bands ordered from strongest to weakest: (inclusive lower bound, label).
# Because np.select takes the first matching condition, testing
# ``rating >= lower_bound`` in this order reproduces the half-open bands
# [2700, inf), [2500, 2700), ..., [0, 1200).
_ELO_BANDS = (
    (2700, 'Super GM'),
    (2500, 'GM'),
    (2400, 'GM/IM'),
    (2300, 'FM/IM'),
    (2200, 'CM/NM'),
    (2000, 'Experts'),
    (1800, 'Class A'),
    (1600, 'Class B'),
    (1400, 'Class C'),
    (1200, 'Class D'),
    (0, 'Novices'),
)

# PGN result code -> human readable outcome.
_RESULT_LABELS = (
    ("1-0", 'White Wins'),
    ("0-1", 'Black Wins'),
    ("1/2-1/2", 'Draw'),
    ("*", 'No Result'),
)

# Label used for rows that match none of the conditions (e.g. missing Elo).
_UNCLASSIFIED = 'Unknown'

# (plot title, event name as it appears in the ``Event`` column).
_GAME_TYPES = (
    ('Classical', 'Classical'),
    ('Classical Tournament', 'Classical tournament'),
    ('Blitz', 'Blitz'),
    ('Blitz Tournament', 'Blitz tournament'),
    ('Bullet', 'Bullet'),
    ('Bullet Tournament', 'Bullet tournament'),
    ('Correspondence', 'Correspondence'),
)


def _elo_rank(ratings):
    """Map a numeric Elo series to its band label from ``_ELO_BANDS``."""
    conditions = [ratings >= lower_bound for lower_bound, _ in _ELO_BANDS]
    labels = [label for _, label in _ELO_BANDS]
    # An explicit string default keeps choices and default in one dtype.
    return np.select(conditions, labels, default=_UNCLASSIFIED)


def _games_of_type(chess_df, event_name):
    """Return the rows whose ``Event`` is ``' <name> '`` or ``'<name> '``."""
    spellings = [f' {event_name} ', f'{event_name} ']
    return chess_df[chess_df['Event'].isin(spellings)]


def _plot_row(games, title, columns, draw, **adjust):
    """Draw one subplot per column in a single row, then show and clear."""
    for position, column in enumerate(columns, start=1):
        plt.subplot(1, len(columns), position)
        plt.title(str(title))
        plt.xlabel(column)
        plt.subplots_adjust(bottom=0.095, top=0.97, hspace=1, **adjust)
        draw(games[column])
        plt.xticks(rotation=30)
    plt.show()
    plt.clf()


def chess_analysis():
    """Download the 'arevel/chess-games' dataset, enrich it and plot it.

    Steps: download the archive through the Kaggle API, load the CSV in
    chunks, drop repeated White/Black pairings and games whose move text
    contains ``{`` or ``}``, derive Elo rank / outcome / opening-move
    columns, then show three groups of plots for each game type. The
    elapsed time is printed at the end. Nothing is returned.
    """
    # Start time count to gauge process run time
    start = time.time()

    api = KaggleApi()
    api.authenticate()
    # downloading datasets for Chess games
    api.dataset_download_files('arevel/chess-games')

    # Read data in chunks of 100000 rows and concatenate them into one
    # dataframe; the context managers make sure the archive is closed.
    with zipfile.ZipFile('chess-games.zip') as archive, \
            archive.open('chess_games.csv') as handle:
        chess_df = pd.concat(pd.read_csv(handle, chunksize=100000))

    # Keep only the first game for every White/Black pairing
    chess_df = chess_df.drop_duplicates(subset=['White', 'Black'])

    # remove any rows with stockfish evaluation as this clogs up the data at
    # a later stage (rows with a missing move text are kept)
    has_evaluation = chess_df['AN'].str.contains(r'[{}]', na=False)
    chess_df = chess_df[~has_evaluation]

    # use iterrows to print out data
    for index, row in chess_df.head(1000).iterrows():
        print(index, row)

    # reset index after dropping duplicate users and removing stockfish
    # evaluations (the old labels are kept in an 'index' column)
    chess_df = chess_df.reset_index()

    # Make sure the ratings are numeric before any arithmetic/comparison
    chess_df[['WhiteElo', 'BlackElo']] = \
        chess_df[['WhiteElo', 'BlackElo']].apply(pd.to_numeric)

    # Define average elo rank per game
    chess_df['AverageElo'] = (chess_df['WhiteElo'] + chess_df['BlackElo']) / 2

    # Turn numeric ratings and result codes into grouped categories.
    # All three rank columns share the same bands, so a rating of exactly
    # 2700 is 'Super GM' for White as well as for Black and the average.
    chess_df['WhiteEloRank'] = _elo_rank(chess_df['WhiteElo'])
    chess_df['BlackEloRank'] = _elo_rank(chess_df['BlackElo'])
    chess_df['AverageEloRank'] = _elo_rank(chess_df['AverageElo'])
    chess_df['Outcome'] = np.select(
        [chess_df['Result'] == code for code, _ in _RESULT_LABELS],
        [label for _, label in _RESULT_LABELS],
        default=_UNCLASSIFIED,
    )

    # create dataframe for moves: split the move text into at most 31
    # tokens and drop every third one starting at 0 (columns 0, 3, ..., 30),
    # which leaves columns 1, 2, 4, 5, ...
    moves_df = chess_df['AN'].str.split(' ', n=30, expand=True)
    moves_df = moves_df.drop(columns=moves_df.columns[0:31:3])

    # append moves dataframe to chess dataframe
    chess_df = pd.concat([chess_df, moves_df], axis=1)

    # sort data from highest average elo to lowest average elo
    chess_df = chess_df.sort_values(by='AverageElo', ascending=False)

    # Plot results
    categorical_plots = ['Termination', 'Outcome', 'AverageEloRank']
    distribution_plots = ['AverageElo']
    opening_plots = [1, 2]  # first two columns kept from the move split

    def count_plot(series):
        sns.countplot(x=series)

    def hist_plot(series):
        sns.histplot(x=series, kde=True, bins=25)

    for title, event_name in _GAME_TYPES:
        games = _games_of_type(chess_df, event_name)
        _plot_row(games, title, categorical_plots, count_plot, wspace=0.45)
        _plot_row(games, title, distribution_plots, hist_plot)
        _plot_row(games, title, opening_plots, count_plot)

    end = time.time()
    print("Run Time: ", (end - start), 'Seconds')
def update_dataset(folder, note):
    """Create a new version of the dataset stored in ``folder``.

    ``note`` is passed along as the second argument and the call asks
    Kaggle to delete old versions (``delete_old_versions=True``).
    Returns whatever ``dataset_create_version`` returns.
    """
    client = KaggleApi()
    client.authenticate()
    outcome = client.dataset_create_version(
        folder,
        note,
        delete_old_versions=True,
    )
    return outcome
from kaggle.api.kaggle_api_extended import KaggleApi

# Authentication requires your computer to have a JSON file with your API keys.
api = KaggleApi()
api.authenticate()

# The competition files are downloaded as a zip; you will need to unzip them.
api.competition_download_files(
    'coleridgeinitiative-show-us-the-data',
)

print("Done")
class CompetitionUpdater(object):
    """Track newly enabled Kaggle competitions and report competition state.

    The instance remembers the start of the next query interval
    (``from_date``); every call to :meth:`get_new_competitions` moves it
    forward to the ``query_date`` that was passed in.
    """

    def __init__(self, from_date):
        self.__api = KaggleApi()
        self.__api.authenticate()
        self.__from_date = from_date

    def get_new_competitions(self, query_date):
        """Return all competitions enabled in ``[from_date, query_date)``.

        Pages of the 'recentlyCreated' listing are fetched until the last
        entry of a page is older than ``from_date`` or a page comes back
        empty. Consecutive requests are spaced one second apart.

        Returns a list of dicts with the keys ``id``, ``title``,
        ``category``, ``days_before_deadline``, ``tags`` and ``url``.
        Afterwards ``from_date`` is set to ``query_date``.
        """
        def parse_competition(c):
            return {
                'id': c.ref,
                'title': c.title,
                'category': c.category,
                'days_before_deadline': (c.deadline - datetime.today()).days,
                'tags': c.tags,
                'url': c.url
            }

        results = []
        page = 1
        last_enabled_date = query_date
        while last_enabled_date >= self.__from_date:
            if page > 1:
                # Throttle between requests only; no need to wait after
                # the final page.
                time.sleep(1)
            competitions = self.__api.competitions_list(
                sort_by='recentlyCreated', page=page)
            if not competitions:
                # Ran out of pages before reaching from_date; indexing
                # competitions[-1] below would raise IndexError.
                break
            results.extend(
                parse_competition(c) for c in competitions
                if query_date > c.enabledDate >= self.__from_date)
            last_enabled_date = competitions[-1].enabledDate
            page += 1

        self.__from_date = query_date
        return results

    def get_leaderboard_info(self, competition_ref):
        """Return a short leaderboard summary for ``competition_ref``.

        The result is a list of ``(position, {'score', 'date'})`` tuples
        for the first entry, the 10th entry (only when there are more than
        ten) and the last entry. A single-entry leaderboard yields one
        tuple, an empty one yields ``[]``. Any error while fetching or
        parsing also yields ``[]`` (best effort).
        """
        def parse_submission(submission):
            return {
                'score': submission['score'],
                # keep the first 10 characters of the submission date
                'date': submission['submissionDate'][:10]
            }

        try:
            leaderboard = self.__api.competition_view_leaderboard(
                competition_ref)
            submissions = leaderboard['submissions']
            n = len(submissions)
            result = []
            if n == 0:
                return result
            result.append((1, parse_submission(submissions[0])))
            if n == 1:
                return result
            if n > 10:
                result.append((10, parse_submission(submissions[9])))
            result.append((n, parse_submission(submissions[-1])))
            return result
        except Exception:
            return []

    def get_state(self, competition_ref):
        """Return the current state of one competition.

        ``competition_ref`` must equal the id string or the title of the
        single competition found by the search; otherwise an explanatory
        message string is returned instead of the state dict.
        """
        competitions = self.__api.competitions_list(search=competition_ref,
                                                    page=1)
        if len(competitions) != 1:
            return 'Use the id string or the title of the competition (several competitions were found)'

        competition = competitions[0]
        if competition_ref not in [competition.ref, competition.title]:
            return 'Use the id string or the title of the competition (requested competition doesn\'t equal to any competitions)'

        # Space out the listing request and the leaderboard request.
        time.sleep(1)
        leaderboard = self.get_leaderboard_info(competition.ref)
        return {
            'title': competition.title,
            'reward': competition.reward,
            'teams': competition.teamCount,
            'days_before_deadline':
            (competition.deadline - datetime.today()).days,
            'evaluation_metric': competition.evaluationMetric,
            'leaderboard': leaderboard,
            'url': competition.url
        }
def push(self) -> None:
    """Authenticate against Kaggle and push the kernel found in ``./``."""
    client = KaggleApi()
    client.authenticate()
    # push to Kaggle
    client.kernels_push("./")
def kaggle_api():
    """Return a freshly created and authenticated ``KaggleApi`` client."""
    client = KaggleApi()
    client.authenticate()
    return client
def trainModel(model_name):
    """Train a selfie/non-selfie classifier on top of Xception and save it.

    The dataset 'jigrubhatt/selfieimagedetectiondataset' is downloaded and
    unzipped into ``data/``. A frozen, ImageNet-pretrained Xception base is
    trained for 20 epochs with a new classification head, then the whole
    model is fine-tuned for 10 epochs at a low learning rate. Both training
    histories are pickled, the model is saved under ``model_name`` and the
    extracted dataset directory is removed.

    Args:
        model_name: target passed to ``model.save``.

    Returns:
        The trained ``keras.Model``; it outputs sigmoid probabilities.
    """
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('jigrubhatt/selfieimagedetectiondataset',
                               'data/',
                               unzip=True)

    # ``image_size`` and ``batch_size`` are read from the enclosing module.
    train_ds = tf.keras.preprocessing.image_dataset_from_directory(
        "data/Selfie-Image-Detection-Dataset/Training_data",
        labels="inferred",
        label_mode="int",
        seed=1337,
        image_size=image_size,
        batch_size=batch_size,
    )
    val_ds = tf.keras.preprocessing.image_dataset_from_directory(
        "data/Selfie-Image-Detection-Dataset/Validation_data",
        labels="inferred",
        label_mode="int",
        seed=1337,
        image_size=image_size,
        batch_size=batch_size,
    )

    base_model = keras.applications.Xception(
        weights='imagenet',  # Load weights pre-trained on ImageNet.
        input_shape=(150, 150, 3),
        include_top=False)
    base_model.trainable = False

    data_augmentation = keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.1),
        layers.experimental.preprocessing.RandomZoom(0.3, 0.3),
        layers.experimental.preprocessing.RandomTranslation(0.2, 0.2),
    ])

    inputs = keras.Input(shape=(150, 150, 3))
    x = data_augmentation(inputs)  # Apply random data augmentation

    norm_layer = keras.layers.experimental.preprocessing.Normalization()
    mean = np.array([127.5] * 3)
    var = mean**2
    # Scale inputs to [-1, +1]; the layer has to be called once (built)
    # before its weights can be set.
    x = norm_layer(x)
    norm_layer.set_weights([mean, var])

    # We make sure that the base_model is running in inference mode here,
    # by passing `training=False`.
    x = base_model(x, training=False)
    # Convert features of shape `base_model.output_shape[1:]` to vectors
    x = keras.layers.GlobalAveragePooling2D()(x)
    x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout
    # A Dense classifier with a single unit (binary classification)
    outputs = keras.layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    # The head already applies a sigmoid, so the loss receives
    # probabilities, not logits: from_logits must be False. With
    # from_logits=True a second sigmoid would be applied inside the loss.
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=[keras.metrics.BinaryAccuracy()])
    train_history = model.fit(train_ds, epochs=20, validation_data=val_ds)

    # Save training history as a dictionary
    with open('/trainHistoryDict', 'wb') as file_pi:
        pickle.dump(train_history.history, file_pi)

    # Fine-tuning model: unfreeze the base and recompile so the change of
    # ``trainable`` takes effect.
    base_model.trainable = True
    model.summary()
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[keras.metrics.BinaryAccuracy()],
    )
    epochs = 10
    fine_tuning_history = model.fit(train_ds,
                                    epochs=epochs,
                                    validation_data=val_ds)

    # Save fine tuning history as dictionary
    with open('/fineTuningHistoryDict', 'wb') as file_pi:
        pickle.dump(fine_tuning_history.history, file_pi)

    # Creates a SavedModel
    model.save(model_name)

    # Remove used data
    shutil.rmtree("data/Selfie-Image-Detection-Dataset")
    return model
def download(self):
    """Print the file listing of the 'titanic' competition.

    Despite its name this method only lists the files; nothing is
    downloaded and nothing is returned.
    """
    client = KaggleApi()
    client.authenticate()
    print(client.competition_list_files('titanic'))
def download_kaggle(kaggle_dataset, kaggle_file_name, files_dir):
    """Download one file of a Kaggle dataset into ``files_dir``.

    ``kaggle_dataset`` and ``kaggle_file_name`` are forwarded unchanged to
    ``dataset_download_file``; ``files_dir`` is passed as its ``path``.
    """
    client = KaggleApi()
    client.authenticate()
    client.dataset_download_file(
        kaggle_dataset,
        kaggle_file_name,
        path=files_dir,
    )
class myKaggleAPI:
    """Thin convenience wrapper around an authenticated ``KaggleApi``."""

    def __init__(self):
        self.api = KaggleApi()
        self.api.authenticate()

    def search_competition(self,
                           name: str = None,
                           category: str = None,
                           page: int = 1,
                           detail: bool = False) -> list:
        """
        Returns a list of competitions available on Kaggle

        Parameters:
            name: str, optional
                text you want to search in the title of the competition
            category: str, optional
                specific category you want to search
            page: int, optional
                returns the competitions on a specific page
            detail: bool, optional, default=False
                toggle it to true to get competition details printed

        Returns:
            list of competitions fitting the search (as strings)

        Raises:
            ValueError: if ``category`` is given but is not a valid name
        """
        category_list = [
            'all', 'featured', 'research', 'recruitment', 'gettingStarted',
            'masters', 'playground'
        ]
        if category is not None and category not in category_list:
            raise ValueError(
                "Invalid Catgory Name!\nValid Options are: 'all', 'featured', 'research', 'recruitment', 'gettingStarted', 'masters', 'playground'"
            )
        if detail:
            self.api.competitions_list_cli(search=name,
                                           category=category,
                                           page=page)
        # Forward ``page`` here as well, so the returned list covers the
        # same page that the detailed output above describes.
        comp_list = self.api.competitions_list(search=name,
                                               category=category,
                                               page=page)
        return [str(comp) for comp in comp_list]

    def get_list_of_files(self,
                          competition: str,
                          details: bool = False) -> list:
        """
        Get list of all the files available for the competition

        Parameters:
            competition: str
                name of the competition
            details: bool, optional, default=False
                print the full file entries, one per line

        Returns:
            list of the file names
        """
        file_details = self.api.competitions_data_list_files(competition)
        files = [file['name'] for file in file_details]
        if details:
            print(*file_details, sep='\n')
        return files

    def download_all_files(self, competition: str, path: str = None):
        """
        Downloads all files from competition

        Parameters:
            competition: str
                name of the competition
            path: str, optional
                path where you want to save the file
                *if the path is invalid, it will create a folder with the given name in the base folder
                *it will create a folder if not present in the path

        Returns:
            None; success or failure is reported with a printed message
        """
        try:
            self.api.competition_download_files(competition,
                                                path,
                                                force=True,
                                                quiet=True)
            if path is None:
                print(
                    'All files were successfully downloaded to the base folder'
                )
            else:
                print(f'All files were successfully downlaoded to {path}')
        except Exception:
            # Best effort: report the failure instead of raising, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(
                'Unable to download file\nPlease check the Competition Name or File Name'
            )

    def download_specific_file(self,
                               competition: str,
                               file_name: str,
                               path: str = None):
        """
        Downloads a specific file from competition

        Parameters:
            competition: str
                name of the competition
            file_name: str
                name of the file you want to download
            path: str, optional
                path where you want to save the file
                *if the path is invalid, it will create a folder with the given name in the base folder
                *it will create a folder if not present in the path

        Returns:
            None; success or failure is reported with a printed message
        """
        try:
            self.api.competition_download_file(competition,
                                               file_name,
                                               path=path,
                                               force=True,
                                               quiet=True)
            if path is None:
                print('File was successfully downloaded to the base folder')
            else:
                print(f'File was successfully downlaoded to {path}')
        except Exception:
            # Best effort: report the failure instead of raising, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(
                'Unable to download file\nPlease check the Competition Name or File Name'
            )
def kaggle_actions(contest="titanic", dataset='titanic data'):
    """Download all files of the competition ``contest``.

    ``dataset`` is forwarded as the second positional argument of
    ``competition_download_files``.
    NOTE(review): in the Kaggle client that position appears to be the
    download path - confirm that a directory name is what is intended.
    """
    from kaggle.api.kaggle_api_extended import KaggleApi

    client = KaggleApi()
    client.authenticate()
    client.competition_download_files(contest, dataset)
    return None
#!/usr/bin/python
#
# Copyright 2018 Kaggle Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle.api_client import ApiClient

# Build the module-level client on top of a default ApiClient and
# authenticate it right away.
_client = ApiClient()
api = KaggleApi(_client)
api.authenticate()
from kaggle.api_client import ApiClient
from kaggle.api.kaggle_api_extended import KaggleApi

# Module-level Kaggle client, built on a default ApiClient and
# authenticated on import.
_client = ApiClient()
kaggle = KaggleApi(_client)
kaggle.authenticate()
def download_data(competition_name, current_dir):
    """Download a competition archive and extract it to ``<current_dir>/data``.

    The archive is downloaded into ``current_dir`` explicitly, so that the
    ``<competition_name>.zip`` opened afterwards is the file that was just
    fetched even when ``current_dir`` is not the process working directory.

    Args:
        competition_name: competition whose files are downloaded; also the
            stem of the archive file name.
        current_dir: directory that receives the archive and the ``data``
            sub-directory with the extracted files.
    """
    import os

    api = KaggleApi(login_info[0])
    api.authenticate()
    api.competition_download_files(competition_name, path=current_dir)

    archive_path = os.path.join(current_dir, competition_name + '.zip')
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(current_dir, 'data'))
#Authorize API's #Authorize Twitter API f = open('/home/pi/twitter_api_creds.json') creds = json.load(f) consumer_key = creds['consumer_key'] consumer_secret = creds['consumer_secret'] access_token = creds['access_token'] access_token_secret = creds['access_token_secret'] f.close() auth = tw.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tw.API(auth, wait_on_rate_limit=True) #Authorize Kaggle's API kapi = KaggleApi() kapi.authenticate() # In[11]: #Twitter Functions #Calls on clean() for text cleanup then removes the URL def remove_url(txt): #Call on clean to clean text first txt = clean(txt) #removes URL return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split()) #Cleans up the text like newlines before url is removed. def clean(text):
class Kaggle(object):
    """Download, describe and re-upload a Kaggle dataset of CSV exports.

    ``config`` is the path of a JSON template whose ``description`` entry
    is a format string with three placeholders (summary lines, column
    lines, timestamp); see :meth:`update`.
    """

    def __init__(self, config):
        self.config = config
        self.kaggle = KaggleApi()
        self.kaggle.authenticate()
        # Column descriptions, keyed by file stem and then by column name.
        self.descriptions = {
            'submission': {
                'submission': 'The id of the submission',
                'subreddit': 'The subreddit name',
                'author': 'The redditors username',
                'created': 'Time the submission was created',
                'retrieved': 'Time the submission was retrieved',
                'edited': 'Time the submission was modified',
                'pinned': 'Whether or not the submission is pinned',
                'archived': 'Whether or not the submission is archived',
                'locked': 'Whether or not the submission is locked',
                'removed': 'Whether or not the submission is mod removed',
                'deleted': 'Whether or not the submission is user deleted',
                'is_self': 'Whether or not the submission is a text',
                'is_video': 'Whether or not the submission is a video',
                'is_original_content': 'Whether or not the submission has been set as original content',
                'title': 'The title of the submission',
                'link_flair_text': 'The submission link flairs text content',
                'upvote_ratio': 'The percentage of upvotes from all votes on the submission',
                'score': 'The number of upvotes for the submission',
                'gilded': 'The number of gilded awards on the submission',
                'total_awards_received': 'The number of awards on the submission',
                'num_comments': 'The number of comments on the submission',
                'num_crossposts': 'The number of crossposts on the submission',
                'selftext': 'The submission selftext on text posts',
                'thumbnail': 'The submission thumbnail on image posts',
                'shortlink': 'The submission short url'
            },
            'comment': {
                # TODO fetch comments
            }
        }
        # pandas dtype name -> type name written to the datapackage schema.
        self.datatypes = {
            'object': 'string',
            'int64': 'integer',
            'float64': 'number',
            'datetime64[ns]': 'datetime'
        }
        self.timer = Timer()

    def download(self, dataset, local=False):
        """Return a directory containing the files of ``dataset``.

        With ``local=True`` and a truthy ``Env.VSCODE_WORKSPACE()`` the
        workspace folder ``data/export`` is returned and nothing is
        downloaded; otherwise the dataset is downloaded and unzipped into a
        new temporary directory, whose path is returned.
        """
        # use local folder
        if local and Env.VSCODE_WORKSPACE():
            return os.path.join(Env.VSCODE_WORKSPACE(), 'data', 'export')

        # download dataset
        path = tempfile.mkdtemp()
        self.kaggle.dataset_download_files(dataset,
                                           path=path,
                                           quiet=False,
                                           force=True,
                                           unzip=True)

        # return dataset path
        return path

    def update(self, root):
        """Regenerate ``README.md`` and ``datapackage.json`` inside ``root``.

        Every CSV matching ``<root>/**/*.csv`` is read; empty files are
        ignored. When no non-empty CSV is found, both files are still
        written, with an empty column section and no resources.

        Returns:
            The update message ``'<UTC timestamp> - <total row count>'``.
        """
        summary = {}
        resources = []

        # read metadata from files
        for file_path in sorted(gb.glob(os.path.join(root, '**', '*.csv'))):
            df = pd.read_csv(file_path,
                             doublequote=True,
                             quoting=csv.QUOTE_NONNUMERIC,
                             sep=',',
                             encoding='utf-8')
            count = df.shape[0]
            name = os.path.basename(file_path)
            # The first two components of the split path are dropped.
            # NOTE(review): this depends on how deep ``root`` is - confirm.
            path = os.path.join(*(file_path.split(os.path.sep)[2:]))
            link = f'r/{os.path.dirname(path)}'

            # ignore empty files
            if count == 0:
                continue

            # build description
            time_from = datetime.fromtimestamp(
                df['created'].min()).strftime('%Y-%m-%d %H:%M:%S')
            time_to = datetime.fromtimestamp(
                df['created'].max()).strftime('%Y-%m-%d %H:%M:%S')
            description = f'[{link}](https://reddit.com/{link}) | {time_from} | {time_to} | *{count}*'

            # build resources
            dtype_names = df.dtypes.astype(str)
            resources.append({
                'name': name,
                'path': path,
                'description': description,
                'schema': {
                    'fields': [{
                        'name': f'{column}',
                        'title': f'{column}',
                        'description': self.descriptions[name.split('.')[0]][column],
                        'type': self.datatypes[dtype_names[column]]
                    } for column in df.columns]
                }
            })

            # save number of entries
            summary[description] = count

        # read kaggle template
        with open(self.config) as f:
            template = json.load(f)

        # build readme
        md = template['description']
        md_description = [
            f'{x[0]}' for x in sorted(
                summary.items(), key=lambda x: x[1], reverse=True)
        ]
        # The column table is taken from the first resource; without any
        # resource there is nothing to list (indexing [0] would fail).
        first_fields = resources[0]['schema']['fields'] if resources else []
        md_data = [
            f'`{x["name"]}` | {x["description"]} | *{x["type"]}*'
            for x in first_fields
        ]
        md_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        # export readme file
        readme = md.format('\n'.join(md_description), '\n'.join(md_data),
                           md_date)
        with open(os.path.join(root, 'README.md'), 'w') as f:
            f.write(readme)

        # export datapackage file
        template['description'] = readme
        template['resources'] = resources
        with open(os.path.join(root, 'datapackage.json'), 'w') as f:
            json.dump(template, f, indent=4)

        # update message
        return f'{md_date} - {sum(summary.values())}'

    def upload(self, path):
        """Refresh the metadata in ``path`` and upload it as a new version.

        The message returned by :meth:`update` is used as version notes.
        """
        return self.kaggle.dataset_create_version(
            path, version_notes=self.update(root=path), dir_mode='zip')
if os.environ.get("PRODUCTION") is None: load_dotenv(verbose=True) consumer_key = os.environ.get('TWITTER_CONSUMER_KEY') consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET') access_token = os.environ.get('TWITTER_ACCESS_TOKEN') access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET') auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) twitter_api = tweepy.API(auth) kaggle_api = KaggleApi() kaggle_api.authenticate() now = dt.now() now = now.astimezone(timezone('UTC')) competitions = pd.DataFrame([], columns=['Title', 'enabledDate', 'deadline']) competitions_list = kaggle_api.competitions_list() for competition in competitions_list: if getattr(competition, 'awardsPoints') and not getattr(competition, 'submissionsDisabled'): deadline = getattr(competition, 'deadline') deadline = deadline.astimezone(timezone('UTC')) diff = deadline - now if diff.days >= 0: competitions = competitions.append(pd.Series([ getattr(competition, 'title'), getattr(competition, 'enabledDate'), getattr(competition, 'deadline')], index=['Title', 'enabledDate', 'deadline']),
def download(self):
    """Fetch the TrackML data from Kaggle and unpack it into ``self.raw_dir``.

    With ``self.full_dataset`` set, the complete competition archive is
    downloaded into ``self.root`` and every training zipball inside it is
    extracted and unpacked; otherwise only ``train_sample.zip`` is
    downloaded and unpacked. Afterwards ``self.events`` is rebuilt from the
    ``event*-hits.csv`` files under ``<root>/raw`` and, if
    ``self.n_events`` is positive, cut to that many entries.

    Raises:
        RuntimeError: if the kaggle package cannot be imported.
        Exception: if a file to unpack already exists (full dataset only).
    """
    import os
    from zipfile import ZipFile
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except ImportError:
        raise RuntimeError(
            'please install and setup the kaggle '
            'competition api: https://github.com/Kaggle/kaggle-api')

    client = KaggleApi()
    client.authenticate()

    competition = 'trackml-particle-identification'
    sample_archive = 'train_sample.zip'

    if not self.full_dataset:
        print(
            'Downloading training example from TrackML dataset, only 100 training events...'
        )
        client.competition_download_file(competition,
                                         sample_archive,
                                         path=self.root,
                                         quiet=False,
                                         force=False)
        with ZipFile(os.path.join(self.root, sample_archive), 'r') as zf:
            for member in tqdm(zf.namelist()):
                # skip the directory entry itself
                if member == 'train_100_events/':
                    continue
                target = os.path.join(self.raw_dir, os.path.basename(member))
                with open(target, 'wb') as fout:
                    fout.write(zf.read(member))
    else:
        full_archive = 'trackml-particle-identification.zip'
        print(
            'Downloading full TrackML dataset (~80GB), this may take a while...'
        )
        client.competition_download_files(competition,
                                          path=self.root,
                                          quiet=False,
                                          force=False)

        # Pull the training zipballs out of the competition archive.
        with ZipFile(os.path.join(self.root, full_archive), 'r') as zf:
            training_samples = [
                member for member in zf.namelist()
                if 'train' in member and 'sample' not in member
                and 'blacklist' not in member
            ]
            for member in tqdm(training_samples, desc='extracting zipballs'):
                if not os.path.exists(os.path.join(self.root, member)):
                    zf.extract(member, path=self.root)

        # Unpack every training zipball, flattening it into raw_dir.
        for sample in training_samples:
            top_level_dir = sample.split('.')[0] + '/'
            with ZipFile(os.path.join(self.root, sample), 'r') as zf:
                members = zf.namelist()
                for member in tqdm(members, desc=f'unpacking {sample}'):
                    if member == top_level_dir:
                        continue
                    outname = os.path.join(self.raw_dir,
                                           os.path.basename(member))
                    if os.path.exists(outname):
                        raise Exception(f'{outname} already exists!')
                    with open(outname, 'wb') as fout:
                        fout.write(zf.read(member))

    # Event ids are the part of 'event<id>-hits.csv' after 'event'.
    hit_files = glob.glob(osp.join(self.root, 'raw', 'event*-hits.csv'))
    event_ids = [
        hit_file.split(osp.sep)[-1].split('-')[0][5:]
        for hit_file in hit_files
    ]
    self.events = sorted(event_ids)
    if self.n_events > 0:
        self.events = self.events[:self.n_events]
class Competition(commands.Cog):
    # Discord cog with the `comp` command group: list Kaggle competitions,
    # create/archive a competition category on the server, manage the team
    # and submit files. Command docstrings double as the bot's help text.

    def __init__(self, bot):
        self.bot = bot
        self.init_api_client()

    def init_api_client(self):
        """Create and authenticate the Kaggle client used by the commands."""
        self.api = KaggleApi()
        self.api.authenticate()

    @staticmethod
    def _cause(error):
        """Return the exception wrapped by ``error``, or ``error`` itself.

        Error handlers may receive a wrapper exposing the real exception as
        ``.original`` or the exception directly; this handles both.
        """
        return getattr(error, "original", error)

    @commands.group()
    async def comp(self, ctx):
        """
        Collection of commands for interacting with Kaggle
        """
        self.guild = ctx.guild
        self.gid = ctx.guild.id
        if ctx.invoked_subcommand is None:
            await ctx.send("Invalid command passed. Use !help.")

    @comp.command()
    async def list(self, ctx, cat="featured", num=5):
        """
        List competitions sorted by latest deadline.
        Returns competitions from the given category (featured by default) and only lists the first 5 by default.

        Parameters:
            cat (str): category to filter by [featured|research|recruitment|gettingStarted|masters|playground]
            num (int): how many results to display
        """
        if cat.lower() not in CATEGORIES:
            raise InvalidCategoryException
        # Use the requested category and the documented sort order.
        comps = self.api.competitions_list(category=cat, sort_by="latestDeadline")
        latest_comps = [comp.__dict__ for comp in comps[:num]]
        for latest_comp in latest_comps:
            emb = get_competition_embed(latest_comp, ["description", "reward", "deadline"])
            await ctx.channel.send(embed=emb)

    @comp.error
    async def comps_error(self, ctx, error):
        if isinstance(self._cause(error), InvalidCategoryException):
            await ctx.channel.send("The specified category is not supported")

    @comp.command()
    async def create(self, ctx, comp_name, team_name: Optional[str] = " "):
        """
        Create a competition given a name.

        Parameters:
            comp_name (str): name of the competition to find and create in lowercase and slug, eg comp-name
            team_name (str): optional string to add as team name
        """
        logger.info(comp_name)
        # Always act on the guild of this invocation rather than on the
        # attribute shared between invocations.
        guild = ctx.guild
        comps = self.api.competitions_list(sort_by="latestDeadline")
        latest_comps = [comp.__dict__ for comp in comps]

        # Pick the competition whose ref shares the longest common
        # substring with the requested name.
        max_longest_match = 0
        matched_comp = None
        for latest_comp in latest_comps:
            matcher = SequenceMatcher(None, comp_name, latest_comp["ref"])
            longest_match = matcher.find_longest_match(0, len(comp_name), 0, len(latest_comp["ref"])).size
            if longest_match > max_longest_match:
                max_longest_match = longest_match
                matched_comp = latest_comp

        if not matched_comp:
            await ctx.channel.send("Νομίζω έφηε σου το όνομα! Use !comp list to see a list.")
            return

        category = discord.utils.get(guild.categories, name=matched_comp["ref"])
        if category is not None:
            raise CompetitionAlreadyExistsException

        comp_role = await guild.create_role(name=f"Comp-{matched_comp['ref']}", mentionable=True)
        await ctx.message.author.add_roles(comp_role)
        overwrites = {
            # Everyone (the role looked up with the guild's own id)
            guild.get_role(guild.id): discord.PermissionOverwrite(read_messages=False),
            self.bot.user: discord.PermissionOverwrite(read_messages=True),
            comp_role: discord.PermissionOverwrite(read_messages=True),
        }
        category = await guild.create_category(name=matched_comp["ref"], overwrites=overwrites)
        general_channel = await guild.create_text_channel(name="general", category=category)
        # NOTE(review): ``name`` receives the category object while the other
        # commands look competitions up by the category *name* - confirm the
        # model stores it as the name string.
        Comp(
            name=category,
            description=matched_comp["description"],
            url=matched_comp["url"],
            created_at=datetime.datetime.now(),
            deadline=matched_comp["deadline"],
            team_name=team_name,
            max_team_size=matched_comp["maxTeamSize"],
            max_daily_subs=matched_comp["maxDailySubmissions"],
            merger_deadline=matched_comp["mergerDeadline"],
            team_members=[ctx.author.name],
        ).save()
        await general_channel.send("@here New competition created! @here Άτε κοπέλια..!")

    @create.error
    async def create_error(self, ctx, error):
        if isinstance(self._cause(error), CompetitionAlreadyExistsException):
            await ctx.channel.send("Ρεεεε τούτο το κομπετίσιον υπάρχει!!")

    @comp.command(aliases=["ranking"])
    async def show_ranking(self, ctx):
        """
        Shows team's current ranking on Kaggle Competition
        (should be run within competition category)
        """
        category = ctx.channel.category.name
        # Raises Comp.DoesNotExist outside of a competition category.
        comp = Comp.objects.get({"name": category})
        leaderboard_results = self.api.competition_leaderboard_view(category)
        if leaderboard_results:
            team_ranking = get_team_entry_from_leaderboard(leaderboard_results, comp.team_name)
            await ctx.channel.send("Πάμε καλά;;")
            team_ranking_vals = {key: getattr(team_ranking, key) for key in FIELDS}
            # The dict is keyed by the FIELDS entries, so every lookup has
            # to go through FIELDS (a bare index would be a missing key).
            await ctx.channel.send(
                f"Place: {team_ranking_vals[FIELDS[0]]}, Last submission date: {team_ranking_vals[FIELDS[2]]}, Score: {team_ranking_vals[FIELDS[3]]}"
            )

    @show_ranking.error
    async def show_ranking_error(self, ctx, error):
        if isinstance(self._cause(error), Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")

    @comp.command()
    async def addteammate(self, ctx, team_mate: str):
        """
        Adds teammate to competition.
        (should be run within competition category)

        Parameters:
            team_mate (str): discord username of teammate to add
        """
        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})
        if team_mate in comp.team_members:
            await ctx.channel.send(f"{team_mate} is already a member of the competition's team")
            return
        if len(comp.team_members) >= comp.max_team_size:
            await ctx.channel.send("Team is already at maximum capacity... :worried:")
            return

        general = discord.utils.get(ctx.channel.category.channels, name="general")
        # Look the user up by name in every guild the bot is in; when the
        # name occurs in several guilds the last guild's match is used.
        user = None
        for guild in self.bot.guilds:
            for member in guild.members:
                if member.name == team_mate:
                    user = member
                    break
        if user:
            comp.team_members.append(team_mate)
            comp.save()
            comp_role = discord.utils.get(ctx.guild.roles, name=f"Comp-{category}")
            await user.add_roles(comp_role)
            await general.send(
                f"{user.mention} you have been added to {comp.name} by {ctx.message.author.mention}. Good luck!"
            )
        else:
            await ctx.channel.send("Ένηβρα έτσι παίχτη... User not found!")

    @addteammate.error
    async def addteammate_error(self, ctx, error):
        if isinstance(self._cause(error), Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")

    @comp.command(aliases=["teamname"])
    async def set_team_name(self, ctx, team_name: str):
        """
        Sets the team name.

        Parameters:
            team_name (str)
        """
        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})
        if len(comp.team_name) == 1:  # default value is " "
            comp.team_name = team_name
            comp.save()
            general = discord.utils.get(ctx.channel.category.channels, name="general")
            await general.send(f"Let's go {team_name}!!!")
        else:
            raise TeamAlreadyHasNameException(comp.team_name)

    @set_team_name.error
    async def set_team_name_error(self, ctx, error):
        cause = self._cause(error)
        if isinstance(cause, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")
        elif isinstance(cause, TeamAlreadyHasNameException):
            await ctx.channel.send(
                f"Άρκησες ρε φίλε! This team already has a name and its... drum roll please {cause.team_name}:tada:"
            )

    @comp.command()
    @commands.has_permissions(manage_channels=True, manage_roles=True)
    async def archive(self, ctx, comp_name):
        """
        Marks a competition as finished, archives it in the server and deletes all associated channels
        and categories.

        Parameters:
            comp_name (str): name of the competition to mark as finished
        """
        comp = Comp.objects.get({"name": comp_name})
        category = discord.utils.get(ctx.guild.categories, name=comp_name)
        comp_role = discord.utils.get(ctx.guild.roles, name=f"Comp-{comp_name}")
        if comp.finished_on is not None:
            raise CompetitionAlreadyArchivedException
        comp.name = f"__ARCHIVED__ {comp.name}"
        comp.save()
        if comp_role is not None:
            await comp_role.delete()
        # The category may already have been removed by hand.
        if category is not None:
            for c in category.channels:
                await c.delete()
            await category.delete()
        await ctx.channel.send("Good job on the competition everyone! Επήαμε τα καλά;")

    @archive.error
    async def archive_error(self, ctx, error):
        cause = self._cause(error)
        if isinstance(cause, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")
        if isinstance(cause, CompetitionAlreadyArchivedException):
            await ctx.channel.send("This competition has already finished.")

    @comp.command()
    async def submit(self, ctx, desc: Optional[str] = ""):
        """
        Makes a submission to the category's competition.

        Parameters:
            description (str): optional description of submission made
        """
        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})
        if comp.subs_today == comp.max_daily_subs:
            raise MaxSubmissionsReachedException
        await ctx.channel.send("Please upload a submission file...")

        async def wait_for_file():
            """
            Coroutine that waits for a file to be uploaded
            """
            url = None
            filename = None
            try:
                message = await self.bot.wait_for("message", timeout=60.0)
            except asyncio.TimeoutError:
                await ctx.channel.send("Time out... Try submitting again.")
            else:
                if message.attachments:
                    await ctx.channel.send("Submission succesfull!")
                    url = message.attachments[0].url
                    filename = message.attachments[0].filename
                else:
                    # A message without a file cannot be submitted.
                    await ctx.channel.send("No file attached... Try submitting again.")
            return url, filename

        sub_file_url, filename = await wait_for_file()
        if sub_file_url is None:
            # Timed out or nothing attached: there is nothing to download.
            return

        # NOTE(review): requests.get blocks the event loop while downloading.
        sub_file_content = requests.get(sub_file_url).content
        # Keep only the base name so the file always lands inside /tmp.
        local_file = os.path.join("/tmp", os.path.basename(filename))
        with open(local_file, "wb") as outfile:
            outfile.write(sub_file_content)
        submit_result = self.api.competition_submit(local_file, desc, comp.name)
        await ctx.channel.send(repr(submit_result))

    @submit.error
    async def submit_error(self, ctx, error):
        cause = self._cause(error)
        if isinstance(cause, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")
        elif isinstance(cause, MaxSubmissionsReachedException):
            await ctx.channel.send("Πάππαλα τα σαμπμίσσιονς... No more submissions left for today.")

    @tasks.loop(hours=24)
    async def update_subs(self):
        """Reset the daily submission counter of every competition."""
        comps = Comp.objects.all()
        for comp in comps:
            comp.subs_today = 0
            comp.save()

    @tasks.loop(hours=24)
    async def finish_comps(self):
        # NOTE(review): this selects competitions whose ``finished_on`` is
        # already set and nothing is saved in the visible code - presumably
        # the filter should be ``is None``; confirm.
        comps = Comp.objects.all()
        comps = list(filter(lambda x: x.finished_on is not None, comps))
        for comp in comps:
            if comp.deadline <= datetime.datetime.now():
                comp.finished_on = datetime.datetime.now()
def download_kaggle_files(dataset='netflix-inc/netflix-prize-data',
                          path='/Users/pcc33/Downloads/',
                          unzip=True):
    """Authenticate against Kaggle and download a dataset.

    Called without arguments this behaves as before: it fetches the Netflix
    Prize dataset into the original download folder and unzips it.

    Parameters:
        dataset (str): Kaggle dataset slug in ``owner/name`` form.
        path (str): directory the files are downloaded into.
        unzip (bool): whether the downloaded archive is extracted.
    """
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(dataset, path=path, unzip=unzip)
# -*- coding: utf-8 -*-
"""
pull_data.py (5 points)
When this is called using python pull_data.py in the command line, this will
go to the 2 Kaggle urls provided below, authenticate using your own Kaggle
sign on, pull the two datasets, and save as .csv files in the current local
directory. The authentication login details (aka secrets) need to be in a
hidden folder (hint: use .gitignore). There must be a data check step to
ensure the data has been pulled correctly and clear commenting and
documentation for each step inside the .py file.

Training dataset url: https://www.kaggle.com/c/titanic/download/train.csv
Scoring dataset url: https://www.kaggle.com/c/titanic/download/test.csv

@author: Ada
"""
#
# The Kaggle client must be imported for the calls below to resolve.
from kaggle.api.kaggle_api_extended import KaggleApi
import os

# NOTE(review): download_path is defined but never passed to the API call
# below, so the download uses the API's default destination instead.
download_path ="C:\\Users\\Ada\\Desktop\\CUNY_SPS_DA\\622 ML\\HW1 Titanic"

# Step 1: create the client and authenticate.
api = KaggleApi()
api.authenticate()

# Step 2: download the files of the 'titanic' competition.
api.competition_download_files('titanic')
def _unpack_and_remove(archive_path, output_path):
    """Extract ``archive_path`` into ``output_path`` and delete the archive.

    Files that are not a recognised archive format are left untouched.
    """
    try:
        shutil.unpack_archive(archive_path, output_path)
    except shutil.ReadError:
        # Not an archive: keep the downloaded file as it is.
        return
    os.remove(archive_path)


def fetch_dataset(project_dir, download_from_kaggle=False, kaggle_dataset=None,
                  kaggle_competition=None, download_from_s3=False, s3_bucket=None,
                  download_from_url=False, data_url=None):
    """Download raw data into ``<project_dir>/data/raw``.

    Each source is optional and is only used when its flag is set. Downloaded
    S3 and URL files that are archives are extracted and then removed.

    Parameters:
        project_dir (str): project root; ``data/raw`` is created beneath it.
        download_from_kaggle (bool): fetch from Kaggle. Credentials are read
            from the ``username`` and ``kaggle_key`` environment variables.
        kaggle_dataset (str): Kaggle dataset slug to download and unzip.
        kaggle_competition (str): Kaggle competition whose files to download.
        download_from_s3 (bool): fetch from S3. Credentials are read from the
            ``aws_access_key_id`` and ``aws_secret_access_key`` environment
            variables.
        s3_bucket (str): bucket whose objects are all downloaded.
        download_from_url (bool): fetch a single file over HTTP.
        data_url (str): URL of that file; its last path segment is the
            local file name.

    Raises:
        KeyError: if a required credential environment variable is missing.
    """
    # Configure logging before the first message, otherwise that message is
    # emitted while no handler is installed and never shows up.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    logger.info('project directory {}'.format(project_dir))
    output_path = os.path.join(project_dir, 'data', 'raw')
    os.makedirs(output_path, exist_ok=True)
    logger.info('output path {}'.format(output_path))

    if download_from_kaggle:
        # The Kaggle client reads its credentials from these two variables.
        os.environ['KAGGLE_USERNAME'] = os.environ['username']
        os.environ['KAGGLE_KEY'] = os.environ['kaggle_key']
        # Imported lazily so the dependency is only needed for this source.
        from kaggle.api.kaggle_api_extended import KaggleApi
        kaggle_api = KaggleApi()
        kaggle_api.authenticate()
        if kaggle_dataset:
            kaggle_api.dataset_download_files(kaggle_dataset, output_path, unzip=True)
        if kaggle_competition:
            kaggle_api.competition_download_files(kaggle_competition, path=output_path)

    if download_from_s3:
        import boto3
        session = boto3.Session(
            aws_access_key_id=os.environ['aws_access_key_id'],
            aws_secret_access_key=os.environ['aws_secret_access_key']
        )
        s3 = session.resource('s3')
        if s3_bucket:
            my_bucket = s3.Bucket(s3_bucket)
            for s3_object in tqdm(my_bucket.objects.all()):
                filename = s3_object.key
                if filename.endswith('/'):
                    # Folder placeholder key: there is no file to download.
                    continue
                logger.info("downloading {}".format(filename))
                target = os.path.join(output_path, filename)
                # Keys may contain prefixes; make sure the parent folder exists.
                os.makedirs(os.path.dirname(target), exist_ok=True)
                my_bucket.download_file(filename, target)
                _unpack_and_remove(target, output_path)

    if download_from_url and data_url:
        import urllib3
        http = urllib3.PoolManager()
        filename = data_url.split('/')[-1]
        logger.info("downloading {}".format(filename))
        target = os.path.join(output_path, filename)
        r = http.request('GET', data_url, preload_content=False)
        try:
            with open(target, 'wb') as out:
                shutil.copyfileobj(r, out)
        finally:
            # A streamed response must be released back to the pool.
            r.release_conn()
        # Unpack only after the file is closed so it can be removed safely.
        _unpack_and_remove(target, output_path)

    logger.info('download complete')