Example No. 1
def kernel_update(request):
    # pull the most recent version of the kernel
    api = KaggleApi()
    api.authenticate()
    api.kernels_pull_cli("{}/{}".format(USERNAME, KERNEL_SLUG), path="{}".format(PATH), metadata=True)

    # push our notebook
    api.kernels_push_cli("{}".format(PATH))

    # save a copy of our notebook in our bucket (if you would prefer
    # not to save a copy, delete all lines from here to the end of the file).
    bucket = storage.bucket(BUCKET)
    metadata_blob = bucket.blob("kernel-metadata.json")
    notebook_blob = bucket.blob("{}.{}".format(KERNEL_SLUG, KERNEL_EXTENSION))

    metadata_blob.upload_from_filename("{}/kernel-metadata.json".format(PATH))
    notebook_blob.upload_from_filename("{}/{}.{}".format(PATH, KERNEL_SLUG, KERNEL_EXTENSION))
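
kernel_update assumes several module-level names that the snippet does not show. A minimal sketch of that configuration, with every value hypothetical (and storage assumed to be a client library whose bucket(name) helper returns a bucket by name, e.g. firebase_admin.storage):

from kaggle.api.kaggle_api_extended import KaggleApi

USERNAME = "my-kaggle-username"   # hypothetical: Kaggle account that owns the kernel
KERNEL_SLUG = "my-notebook"       # hypothetical: slug from the kernel's Kaggle URL
KERNEL_EXTENSION = "ipynb"        # hypothetical: extension of the pulled notebook file
PATH = "/tmp"                     # hypothetical: writable scratch directory in the function
BUCKET = "my-project-bucket"      # hypothetical: Cloud Storage bucket for the backup copy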
Example No. 2
def chess_analysis():
    # Start time count to gauge process run time
    start = time.time()
    api = KaggleApi()
    api.authenticate()

    # downloading datasets for Chess games
    api.dataset_download_files('arevel/chess-games')

    # Read the CSV in chunks of 100,000 rows and concatenate the chunks into one dataframe to keep memory use manageable
    zf = zipfile.ZipFile('chess-games.zip')
    csv = pd.read_csv(zf.open('chess_games.csv'), chunksize=100000)
    chess_df = pd.concat(csv)

    # Keep only the first game for each White/Black player pairing
    chess_df = chess_df.drop_duplicates(subset=['White', 'Black'])

    # remove any rows with stockfish evaluation as this clogs up the data at a later stage
    chess_df = chess_df.drop(chess_df[chess_df.AN.str.contains(r'[{}]')].index)

    # use iterrows to print out data
    for index, row in chess_df.head(1000).iterrows():
        print(index, row)

    # reset index after dropping duplicate users and removing stockfish evaluations
    chess_df = chess_df.reset_index()

    # Define average elo rank per game
    chess_df['AverageElo'] = (chess_df['WhiteElo'] + chess_df['BlackElo']) / 2

    # create lists of conditions for np.select to map numeric Elo values into grouped categories
    white_conditions = [
        (chess_df['WhiteElo'] >= 2700),
        (chess_df['WhiteElo'] < 2700) & (chess_df['WhiteElo'] >= 2500),
        (chess_df['WhiteElo'] < 2500) & (chess_df['WhiteElo'] >= 2400),
        (chess_df['WhiteElo'] < 2400) & (chess_df['WhiteElo'] >= 2300),
        (chess_df['WhiteElo'] < 2300) & (chess_df['WhiteElo'] >= 2200),
        (chess_df['WhiteElo'] < 2200) & (chess_df['WhiteElo'] >= 2000),
        (chess_df['WhiteElo'] < 2000) & (chess_df['WhiteElo'] >= 1800),
        (chess_df['WhiteElo'] < 1800) & (chess_df['WhiteElo'] >= 1600),
        (chess_df['WhiteElo'] < 1600) & (chess_df['WhiteElo'] >= 1400),
        (chess_df['WhiteElo'] < 1400) & (chess_df['WhiteElo'] >= 1200),
        (chess_df['WhiteElo'] < 1200) & (chess_df['WhiteElo'] >= 0)
    ]

    black_conditions = [
        (chess_df['BlackElo'] >= 2700),
        (chess_df['BlackElo'] < 2700) & (chess_df['BlackElo'] >= 2500),
        (chess_df['BlackElo'] < 2500) & (chess_df['BlackElo'] >= 2400),
        (chess_df['BlackElo'] < 2400) & (chess_df['BlackElo'] >= 2300),
        (chess_df['BlackElo'] < 2300) & (chess_df['BlackElo'] >= 2200),
        (chess_df['BlackElo'] < 2200) & (chess_df['BlackElo'] >= 2000),
        (chess_df['BlackElo'] < 2000) & (chess_df['BlackElo'] >= 1800),
        (chess_df['BlackElo'] < 1800) & (chess_df['BlackElo'] >= 1600),
        (chess_df['BlackElo'] < 1600) & (chess_df['BlackElo'] >= 1400),
        (chess_df['BlackElo'] < 1400) & (chess_df['BlackElo'] >= 1200),
        (chess_df['BlackElo'] < 1200) & (chess_df['BlackElo'] >= 0)
    ]

    average_conditions = [
        (chess_df['AverageElo'] >= 2700),
        (chess_df['AverageElo'] < 2700) & (chess_df['AverageElo'] >= 2500),
        (chess_df['AverageElo'] < 2500) & (chess_df['AverageElo'] >= 2400),
        (chess_df['AverageElo'] < 2400) & (chess_df['AverageElo'] >= 2300),
        (chess_df['AverageElo'] < 2300) & (chess_df['AverageElo'] >= 2200),
        (chess_df['AverageElo'] < 2200) & (chess_df['AverageElo'] >= 2000),
        (chess_df['AverageElo'] < 2000) & (chess_df['AverageElo'] >= 1800),
        (chess_df['AverageElo'] < 1800) & (chess_df['AverageElo'] >= 1600),
        (chess_df['AverageElo'] < 1600) & (chess_df['AverageElo'] >= 1400),
        (chess_df['AverageElo'] < 1400) & (chess_df['AverageElo'] >= 1200),
        (chess_df['AverageElo'] < 1200) & (chess_df['AverageElo'] >= 0)
    ]

    outcome_conditions = [(chess_df['Result']) == "1-0",
                          (chess_df['Result']) == "0-1",
                          (chess_df['Result']) == "1/2-1/2",
                          (chess_df['Result']) == "*"]

    # create a list of the values to assign for each condition
    elo = [
        'Super GM', 'GM', 'GM/IM', 'FM/IM', 'CM/NM', 'Experts', 'Class A',
        'Class B', 'Class C', 'Class D', 'Novices'
    ]
    outcome = ['White Wins', 'Black Wins', 'Draw', 'No Result']

    # create new columns and use np.select to assign values to it using the lists as arguments
    chess_df['WhiteEloRank'] = np.select(white_conditions, elo)
    chess_df['BlackEloRank'] = np.select(black_conditions, elo)
    chess_df['AverageEloRank'] = np.select(average_conditions, elo)
    chess_df['Outcome'] = np.select(outcome_conditions, outcome)

    # create dataframe for moves
    moves_df = chess_df["AN"].str.split(" ", n=30, expand=True)
    moves_df = moves_df.drop(moves_df.iloc[:, 0:31:3], axis=1)

    # append moves dataframe to chess dataframe
    chess_df = pd.concat([chess_df, moves_df], axis=1)
    chess_df.reset_index(inplace=True)

    # sort data from highest average elo to lowest average elo
    chess_df = chess_df.sort_values(by='AverageElo', ascending=False)

    # change data type from object to numeric values
    chess_df[["WhiteElo", "BlackElo", "AverageElo"]] = chess_df[["WhiteElo", "BlackElo", "AverageElo"]].\
        apply(pd.to_numeric)

    classical_df1 = chess_df[chess_df.Event == ' Classical ']
    classical_df2 = chess_df[chess_df.Event == 'Classical ']
    classical = pd.merge(classical_df1, classical_df2, how='outer')

    classical_tournament_df1 = chess_df[chess_df.Event ==
                                        ' Classical tournament ']
    classical_tournament_df2 = chess_df[chess_df.Event ==
                                        'Classical tournament ']
    classical_tournament = pd.merge(classical_tournament_df1,
                                    classical_tournament_df2,
                                    how='outer')

    blitz_df1 = chess_df[chess_df.Event == ' Blitz ']
    blitz_df2 = chess_df[chess_df.Event == 'Blitz ']
    blitz = pd.merge(blitz_df1, blitz_df2, how='outer')

    blitz_tournament_df1 = chess_df[chess_df.Event == ' Blitz tournament ']
    blitz_tournament_df2 = chess_df[chess_df.Event == 'Blitz tournament ']
    blitz_tournament = pd.merge(blitz_tournament_df1,
                                blitz_tournament_df2,
                                how='outer')

    bullet_df1 = chess_df[chess_df.Event == ' Bullet ']
    bullet_df2 = chess_df[chess_df.Event == 'Bullet ']
    bullet = pd.merge(bullet_df1, bullet_df2, how='outer')

    bullet_tournament_df1 = chess_df[chess_df.Event == ' Bullet tournament ']
    bullet_tournament_df2 = chess_df[chess_df.Event == 'Bullet tournament ']
    bullet_tournament = pd.merge(bullet_tournament_df1,
                                 bullet_tournament_df2,
                                 how='outer')

    correspondence_df1 = chess_df[chess_df.Event == ' Correspondence ']
    correspondence_df2 = chess_df[chess_df.Event == 'Correspondence ']
    correspondence = pd.merge(correspondence_df1,
                              correspondence_df2,
                              how='outer')

    # Plot results
    #  Categorical Data
    plots = ['Termination', 'Outcome', 'AverageEloRank']
    plots_1 = ['AverageElo']
    plots_2 = [1, 2]
    game_types = [
        classical, classical_tournament, blitz, blitz_tournament, bullet,
        bullet_tournament, correspondence
    ]
    game_types_str = [
        'Classical', 'Classical Tournament', 'Blitz', 'Blitz Tournament',
        'Bullet', 'Bullet Tournament', 'Correspondence'
    ]

    z = 0
    y = 0
    w = 0
    for x in game_types:
        a = 1  # number of rows, set to 1 to retrieve individual graph groups based on game type
        b = int(len(plots))  # number of columns
        c = 1  # initialize plot counter
        d = 1  # number of rows, set to 1 to retrieve individual graph groups based on game type
        e = int(len(plots_1))  # number of columns
        f = 1  # initialize plot counter
        g = 1  # number of rows, set to 1 to retrieve individual graph groups based on game type
        h = int(len(plots_2))  # number of columns
        k = 1  # initialize plot counter
        for i in plots:
            plt.subplot(a, b, c)
            plt.title(str(game_types_str[z]))
            plt.xlabel(i)
            plt.subplots_adjust(bottom=0.095, top=0.97, hspace=1, wspace=0.45)
            sns.countplot(x=x[i])
            plt.xticks(rotation=30)
            c = c + 1
        z = z + 1
        plt.show()
        plt.clf()

        for i in plots_1:
            plt.subplot(d, e, f)
            plt.title(str(game_types_str[y]))
            plt.xlabel(i)
            plt.subplots_adjust(bottom=0.095, top=0.97, hspace=1)
            sns.histplot(x=x[i], kde=True, bins=25)
            plt.xticks(rotation=30)
            f = f + 1
        y = y + 1
        plt.show()
        plt.clf()
        for i in plots_2:
            plt.subplot(g, h, k)
            plt.title(str(game_types_str[w]))
            plt.xlabel(i)
            plt.subplots_adjust(bottom=0.095, top=0.97, hspace=1)
            sns.countplot(x=x[i])
            plt.xticks(rotation=30)
            k = k + 1
        w = w + 1
        plt.show()
        plt.clf()

    end = time.time()

    print("Run Time: ", (end - start), 'Seconds')
Example No. 3
def update_dataset(folder, note):
    api = KaggleApi()
    api.authenticate()

    return api.dataset_create_version(folder, note, delete_old_versions=True)
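
A hypothetical call, pointing at a folder that holds the dataset files and their dataset-metadata.json, plus a version note:

update_dataset('path/to/dataset-folder', 'monthly refresh')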
Example No. 4
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()  # requires your computer to have a JSON file with your API keys

api.competition_download_files('coleridgeinitiative-show-us-the-data')  # downloads as a zip; you will need to unzip

print("Done")
Example No. 5
class CompetitionUpdater(object):
    def __init__(self, from_date):
        self.__api = KaggleApi()
        self.__api.authenticate()
        self.__from_date = from_date

    # return all competitions in [from_date, query_date) interval
    def get_new_competitions(self, query_date):
        def parse_competition(c):
            return {
                'id': c.ref,
                'title': c.title,
                'category': c.category,
                'days_before_deadline': (c.deadline - datetime.today()).days,
                'tags': c.tags,
                'url': c.url
            }

        results = []
        page = 1
        last_enabled_date = query_date

        while last_enabled_date >= self.__from_date:
            competitions = self.__api.competitions_list(
                sort_by='recentlyCreated', page=page)
            for c in competitions:
                current_date = c.enabledDate
                if query_date > current_date >= self.__from_date:
                    results.append(parse_competition(c))
            last_enabled_date = competitions[-1].enabledDate
            page += 1
            time.sleep(1)

        self.__from_date = query_date

        return results

    def get_leaderboard_info(self, competition_ref):
        def parse_submission(submission):
            return {
                'score': submission['score'],
                'date': submission['submissionDate'][:10]
            }

        try:
            leaderboard = self.__api.competition_view_leaderboard(
                competition_ref)
            submissions = leaderboard['submissions']
            n = len(submissions)
            result = []
            if n == 0:
                return result
            result.append((1, parse_submission(submissions[0])))
            if n == 1:
                return result
            if n > 10:
                result.append((10, parse_submission(submissions[9])))
            result.append((n, parse_submission(submissions[-1])))
            return result
        except Exception:
            return []

    def get_state(self, competition_ref):
        competitions = self.__api.competitions_list(search=competition_ref,
                                                    page=1)
        if len(competitions) != 1:
            return 'Use the id string or the title of the competition (several competitions were found)'
        competition = competitions[0]
        if competition_ref not in [competition.ref, competition.title]:
            return 'Use the id string or the title of the competition (the requested competition doesn\'t match any competition found)'

        time.sleep(1)
        leaderboard = self.get_leaderboard_info(competition.ref)

        return {
            'title': competition.title,
            'reward': competition.reward,
            'teams': competition.teamCount,
            'days_before_deadline':
            (competition.deadline - datetime.today()).days,
            'evaluation_metric': competition.evaluationMetric,
            'leaderboard': leaderboard,
            'url': competition.url
        }
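
A hypothetical session with the class above, polling for competitions created during the last week:

from datetime import datetime, timedelta

updater = CompetitionUpdater(from_date=datetime.today() - timedelta(days=7))
for comp in updater.get_new_competitions(datetime.today()):
    print(comp['title'], comp['days_before_deadline'], comp['url'])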
Example No. 6
    def push(self) -> None:

        # push to Kaggle
        api = KaggleApi()
        api.authenticate()
        api.kernels_push("./")
Example No. 7
def kaggle_api():
    api = KaggleApi()
    api.authenticate()
    return api
Example No. 8
def trainModel(model_name):
    # image_size and batch_size are assumed to be defined at module level,
    # e.g. image_size = (150, 150) (matching the Xception input below) and batch_size = 32
    api = KaggleApi()
    api.authenticate()

    api.dataset_download_files('jigrubhatt/selfieimagedetectiondataset',
                               'data/',
                               unzip=True)

    train_ds = tf.keras.preprocessing.image_dataset_from_directory(
        "data/Selfie-Image-Detection-Dataset/Training_data",
        labels="inferred",
        label_mode="int",
        seed=1337,
        image_size=image_size,
        batch_size=batch_size,
    )
    val_ds = tf.keras.preprocessing.image_dataset_from_directory(
        "data/Selfie-Image-Detection-Dataset/Validation_data",
        labels="inferred",
        label_mode="int",
        seed=1337,
        image_size=image_size,
        batch_size=batch_size,
    )

    base_model = keras.applications.Xception(
        weights='imagenet',  # Load weights pre-trained on ImageNet.
        input_shape=(150, 150, 3),
        include_top=False)

    base_model.trainable = False

    data_augmentation = keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.1),
        layers.experimental.preprocessing.RandomZoom(0.3, 0.3),
        layers.experimental.preprocessing.RandomTranslation(0.2, 0.2),
    ])

    inputs = keras.Input(shape=(150, 150, 3))
    x = data_augmentation(inputs)  # Apply random data augmentation

    norm_layer = keras.layers.experimental.preprocessing.Normalization()
    mean = np.array([127.5] * 3)
    var = mean**2

    # Scale inputs to [-1, +1]
    x = norm_layer(x)
    norm_layer.set_weights([mean, var])

    # We make sure that the base_model is running in inference mode here,
    # by passing `training=False`.
    x = base_model(x, training=False)

    # Convert features of shape `base_model.output_shape[1:]` to vectors
    x = keras.layers.GlobalAveragePooling2D()(x)

    # A Dense classifier with a single unit (binary classification)
    x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout
    outputs = keras.layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.BinaryCrossentropy(),  # outputs are sigmoid probabilities, not logits
                  metrics=[keras.metrics.BinaryAccuracy()])
    train_history = model.fit(train_ds, epochs=20, validation_data=val_ds)

    # Save training history as a dictionary
    with open('/trainHistoryDict', 'wb') as file_pi:
        pickle.dump(train_history.history, file_pi)

    # Fine-tuning model
    base_model.trainable = True
    model.summary()

    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
        loss=keras.losses.BinaryCrossentropy(),  # outputs are sigmoid probabilities, not logits
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    epochs = 10
    fine_tuning_history = model.fit(train_ds,
                                    epochs=epochs,
                                    validation_data=val_ds)

    # Save fine tuning history as dictionary
    with open('/fineTuningHistoryDict', 'wb') as file_pi:
        pickle.dump(fine_tuning_history.history, file_pi)

    # Creates a SavedModel
    model.save(model_name)

    # Remove used data
    shutil.rmtree("data/Selfie-Image-Detection-Dataset")

    return model
Example No. 9
    def download(self):
        api = KaggleApi()
        api.authenticate()
        x = api.competition_list_files('titanic')
        print(x)
Example No. 10
def download_kaggle(kaggle_dataset, kaggle_file_name, files_dir):
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(kaggle_dataset, kaggle_file_name, path=files_dir)
Example No. 11
class myKaggleAPI:
    def __init__(self):
        self.api = KaggleApi()
        self.api.authenticate()

    def search_competition(self,
                           name: str = None,
                           category: str = None,
                           page: int = 1,
                           detail: bool = False) -> list:
        """ 

        Returns a list of competitons available on Kaggle
        
        Parameters:
            name: str, optional
                text you want to search in the title of the competiton
            category: str, optional
                specific category you want to search
            page: int, optional
                returns the competition on a specific page
            detail: bool, optional, default=False
                toggle it to true to get competition details printed
        Retuns:
            list of competions fiiting the search
        
        """
        category_list = [
            'all', 'featured', 'research', 'recruitment', 'gettingStarted',
            'masters', 'playground'
        ]
        if category is not None and category not in category_list:
            raise ValueError(
                "Invalid Category Name!\nValid Options are: 'all', 'featured', 'research', 'recruitment', 'gettingStarted', 'masters', 'playground'"
            )

        if detail:
            self.api.competitions_list_cli(search=name,
                                           category=category,
                                           page=page)

        comp_list = self.api.competitions_list(search=name, category=category)
        return [str(comp) for comp in comp_list]

    def get_list_of_files(self,
                          competition: str,
                          details: bool = False) -> list:
        """
        Get list of all the files available for the competion
        
        Parameter:
            competions: str
                name of the competion
        
        Return:
            list of all the files
        """
        file_details = self.api.competitions_data_list_files(competition)
        files = [file['name'] for file in file_details]
        if details:
            print(*file_details, sep='\n')
        return files

    def download_all_files(self, competition: str, path: str = None):
        """
        Downloads all files from competion
        Parameters:
            competion: str
                name of the competition
            path: str, optional
                path where you want to save the file
                *if the path is invalid, it will create a folder with the given name in the base folder
                *it will create a folder if not present in the path
        Return:
            downloads all the file in the specified location
        """
        try:
            self.api.competition_download_files(competition,
                                                path,
                                                force=True,
                                                quiet=True)
            if path is None:
                print(
                    'All files were successfully downloaded to the base folder'
                )
            else:
                print(f'All files were successfully downloaded to {path}')
        except Exception:
            print(
                'Unable to download file\nPlease check the Competition Name or File Name'
            )

    def download_specific_file(self,
                               competition: str,
                               file_name: str,
                               path: str = None):
        """
        Downloads a specific file from competion
        Parameters:
            competion: str
                name of the competition
            file_name: str
                name of the file you want to download
            path: str, optional
                path where you want to save the file
                *if the path is invalid, it will create a folder with the given name in the base folder
                *it will create a folder if not present in the path
        Return:
            downloads the file in the specified location
        """
        try:
            self.api.competition_download_file(competition,
                                               file_name,
                                               path=path,
                                               force=True,
                                               quiet=True)
            if path is None:
                print('File was successfully downloaded to the base folder')
            else:
                print(f'File was successfully downloaded to {path}')
        except Exception:
            print(
                'Unable to download file\nPlease check the Competition Name or File Name'
            )
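
A hypothetical session with the wrapper above:

api = myKaggleAPI()
comps = api.search_competition(name='titanic', category='gettingStarted')
files = api.get_list_of_files('titanic')
api.download_specific_file('titanic', 'train.csv', path='data')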
Example No. 12
def kaggle_actions(contest = "titanic", dataset = 'titanic data'):
        from kaggle.api.kaggle_api_extended import KaggleApi
        api = KaggleApi()
        api.authenticate()
        api.competition_download_files(contest, dataset)
        return 
Example No. 13
#!/usr/bin/python
#
# Copyright 2018 Kaggle Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle.api_client import ApiClient

api = KaggleApi(ApiClient())
api.authenticate()
Example No. 14
from kaggle.api_client import ApiClient
from kaggle.api.kaggle_api_extended import KaggleApi

kaggle = KaggleApi(ApiClient())
kaggle.authenticate()
Example No. 15
def download_data(competition_name,current_dir):
    api = KaggleApi(login_info[0])  # login_info is assumed to be defined elsewhere in the module
    api.authenticate()
    api.competition_download_files(competition_name)
    with zipfile.ZipFile(current_dir + '/' + competition_name + '.zip', 'r') as zip_ref:
        zip_ref.extractall(current_dir + '/' + 'data')
Example No. 16
# Authorize APIs
# Authorize the Twitter API
f = open('/home/pi/twitter_api_creds.json')
creds = json.load(f)
consumer_key = creds['consumer_key']
consumer_secret = creds['consumer_secret']
access_token = creds['access_token']
access_token_secret = creds['access_token_secret']
f.close()

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)
# Authorize the Kaggle API
kapi = KaggleApi()
kapi.authenticate()

# Twitter Functions
# Calls clean() for text cleanup, then removes URLs
def remove_url(txt):
    # clean the text first
    txt = clean(txt)
    # remove URLs (raw string avoids invalid escape warnings)
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+://\S+)", "", txt).split())


# Cleans up the text (e.g. newlines) before the URL is removed.
def clean(text):
    # assumed minimal implementation implied by the comment above: collapse newlines into spaces
    return text.replace('\n', ' ')
Example No. 17
class Kaggle(object):
    def __init__(self, config):
        self.config = config

        self.kaggle = KaggleApi()
        self.kaggle.authenticate()

        self.descriptions = {
            'submission': {
                'submission': 'The id of the submission',
                'subreddit': 'The subreddit name',
                'author': 'The redditors username',
                'created': 'Time the submission was created',
                'retrieved': 'Time the submission was retrieved',
                'edited': 'Time the submission was modified',
                'pinned': 'Whether or not the submission is pinned',
                'archived': 'Whether or not the submission is archived',
                'locked': 'Whether or not the submission is locked',
                'removed': 'Whether or not the submission is mod removed',
                'deleted': 'Whether or not the submission is user deleted',
                'is_self': 'Whether or not the submission is a text',
                'is_video': 'Whether or not the submission is a video',
                'is_original_content': 'Whether or not the submission has been set as original content',
                'title': 'The title of the submission',
                'link_flair_text': 'The submission link flairs text content',
                'upvote_ratio': 'The percentage of upvotes from all votes on the submission',
                'score': 'The number of upvotes for the submission',
                'gilded': 'The number of gilded awards on the submission',
                'total_awards_received': 'The number of awards on the submission',
                'num_comments': 'The number of comments on the submission',
                'num_crossposts': 'The number of crossposts on the submission',
                'selftext': 'The submission selftext on text posts',
                'thumbnail': 'The submission thumbnail on image posts',
                'shortlink': 'The submission short url'
            },
            'comment': {
                # TODO fetch comments
            }
        }

        self.datatypes = {
            'object': 'string',
            'int64': 'integer',
            'float64': 'number',
            'datetime64[ns]': 'datetime'
        }

        self.timer = Timer()

    def download(self, dataset, local=False):

        # use local folder
        if local and Env.VSCODE_WORKSPACE():
            return os.path.join(Env.VSCODE_WORKSPACE(), 'data', 'export')

        # download dataset
        path = tempfile.mkdtemp()
        self.kaggle.dataset_download_files(dataset, path=path, quiet=False, force=True, unzip=True)

        # return dataset path
        return path

    def update(self, root):
        summary = {}
        resources = []

        # read metadata from files
        for file_path in sorted(gb.glob(os.path.join(root, '**', '*.csv'))):
            df = pd.read_csv(file_path, doublequote=True, quoting=csv.QUOTE_NONNUMERIC, sep=',', encoding='utf-8')

            count = df.shape[0]
            name = os.path.basename(file_path)
            path = os.path.join(*(file_path.split(os.path.sep)[2:]))
            link = f'r/{os.path.dirname(path)}'

            # ignore empty files
            if count == 0:
                continue

            # build description
            time_from = datetime.fromtimestamp(df['created'].min()).strftime('%Y-%m-%d %H:%M:%S')
            time_to = datetime.fromtimestamp(df['created'].max()).strftime('%Y-%m-%d %H:%M:%S')
            description = f'[{link}](https://reddit.com/{link}) | {time_from} | {time_to} | *{df.shape[0]}*'

            # build resources
            resources.append({
                'name': name,
                'path': path,
                'description': description,
                'schema': {
                    'fields': [{
                        'name': f'{column}',
                        'title': f'{column}',
                        'description': self.descriptions[name.split('.')[0]][column],
                        'type': self.datatypes[df.dtypes.astype(str)[column]]
                    } for column in df.columns]
                }
            })

            # save number of entries
            summary[description] = count

        # read kaggle template
        template = {}
        with open(self.config) as f:
            template = json.load(f)

        # build readme
        md = template['description']
        md_description = [f'{x[0]}' for x in sorted(summary.items(), key=lambda x: x[1], reverse=True)]
        md_data = [f'`{x["name"]}` | {x["description"]} | *{x["type"]}*' for x in resources[0]['schema']['fields']]
        md_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        # export readme file
        readme = md.format('\n'.join(md_description), '\n'.join(md_data), md_date)
        with open(os.path.join(root, 'README.md'), 'w') as f:
            f.write(readme)

        # export datapackage file
        template['description'] = readme
        template['resources'] = resources
        with open(os.path.join(root, 'datapackage.json'), 'w') as f:
            json.dump(template, f, indent=4)

        # update message
        return f'{md_date} - {sum(summary.values())}'

    def upload(self, path):
        return self.kaggle.dataset_create_version(path, version_notes=self.update(root=path), dir_mode='zip')
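
A hypothetical run of the class above, assuming a datapackage-style template at the given path and a Reddit dataset slug you own:

kaggle = Kaggle(config='datapackage-template.json')  # hypothetical template path
path = kaggle.download('user/reddit-dataset')        # hypothetical dataset slug
print(kaggle.upload(path))  # rebuilds README.md and datapackage.json, then pushes a new version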
Example No. 18
if os.environ.get("PRODUCTION") is None:
    load_dotenv(verbose=True)

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

twitter_api = tweepy.API(auth)

kaggle_api = KaggleApi()
kaggle_api.authenticate()

now = dt.now()
now = now.astimezone(timezone('UTC'))
competitions = pd.DataFrame([], columns=['Title', 'enabledDate', 'deadline'])
competitions_list = kaggle_api.competitions_list()
for competition in competitions_list:
    if getattr(competition, 'awardsPoints') and not getattr(competition, 'submissionsDisabled'):
        deadline = getattr(competition, 'deadline')
        deadline = deadline.astimezone(timezone('UTC'))
        diff = deadline - now
        if diff.days >= 0:
            competitions = competitions.append(pd.Series([
                getattr(competition, 'title'),
                getattr(competition, 'enabledDate'),
                getattr(competition, 'deadline')], index=['Title', 'enabledDate', 'deadline']),
                ignore_index=True)  # assumed completion of the truncated snippet; appending a Series requires ignore_index
Example No. 19
    def download(self):
        import os
        from zipfile import ZipFile
        try:
            from kaggle.api.kaggle_api_extended import KaggleApi
        except ImportError:
            raise RuntimeError(
                'please install and setup the kaggle '
                'competition api: https://github.com/Kaggle/kaggle-api')

        api = KaggleApi()
        api.authenticate()

        kgl_comp = 'trackml-particle-identification'
        test_file = 'train_sample.zip'

        if self.full_dataset:
            kgl_file = 'trackml-particle-identification.zip'
            print(
                'Downloading full TrackML dataset (~80GB), this may take a while...'
            )
            api.competition_download_files(kgl_comp,
                                           path=self.root,
                                           quiet=False,
                                           force=False)
            training_samples = None
            with ZipFile(os.path.join(self.root, kgl_file), 'r') as zf:
                training_samples = [fname for fname in zf.namelist()
                                    if 'train' in fname
                                    and 'sample' not in fname
                                    and 'blacklist' not in fname]

                for name in tqdm(training_samples, desc='extracting zipballs'):
                    if not os.path.exists(os.path.join(self.root, name)):
                        zf.extract(name, path=self.root)

            for sample in training_samples:
                with ZipFile(os.path.join(self.root, sample), 'r') as zf:
                    fnames = zf.namelist()
                    action = f'unpacking {sample}'
                    for name in tqdm(fnames, desc=action):
                        sample_dir = sample.split('.')[0] + '/'
                        if name == sample_dir:
                            continue
                        outname = os.path.join(self.raw_dir,
                                               os.path.basename(name))
                        if os.path.exists(outname):
                            raise Exception(f'{outname} already exists!')
                        with open(outname, 'wb') as fout:
                            fout.write(zf.read(name))

        else:
            kgl_file = test_file
            print(
                'Downloading training example from TrackML dataset, only 100 training events...'
            )
            api.competition_download_file(kgl_comp,
                                          test_file,
                                          path=self.root,
                                          quiet=False,
                                          force=False)
            with ZipFile(os.path.join(self.root, kgl_file), 'r') as zf:
                fnames = zf.namelist()
                for name in tqdm(fnames):
                    if name == 'train_100_events/':
                        continue
                    with open(
                            os.path.join(self.raw_dir, os.path.basename(name)),
                            'wb') as fout:
                        fout.write(zf.read(name))

        events = glob.glob(
            osp.join(osp.join(self.root, 'raw'), 'event*-hits.csv'))
        events = [e.split(osp.sep)[-1].split('-')[0][5:] for e in events]
        self.events = sorted(events)
        if (self.n_events > 0):
            self.events = self.events[:self.n_events]
Example No. 20
class Competition(commands.Cog):
    def __init__(self, bot):
        self.bot = bot
        self.init_api_client()

    def init_api_client(self):
        self.api = KaggleApi()
        self.api.authenticate()

    @commands.group()
    async def comp(self, ctx):
        """
        Collection of commands for interacting with Kaggle
        """
        self.guild = ctx.guild
        self.gid = ctx.guild.id

        if ctx.invoked_subcommand is None:
            await ctx.send("Invalid command passed. Use !help.")

    @comp.command()
    async def list(self, ctx, cat="featured", num=5):
        """
        List competitions sorted by latest deadline. Returns
        competitions from all categories and only lists the first 5
        by default.

        Parameters:
            cat (str): category to filter by [featured|research|recruitment|gettingStarted|masters|playground]
            num (int): how many results to display
        """
        if cat.lower() not in CATEGORIES:
            raise InvalidCategoryException

        comps = self.api.competitions_list(category="featured")
        latest_comps = [comp.__dict__ for comp in comps[:num]]
        for latest_comp in latest_comps:
            emb = get_competition_embed(latest_comp, ["description", "reward", "deadline"])
            await ctx.channel.send(embed=emb)

    @comp.error
    async def comps_error(self, ctx, error):
        if isinstance(error.original, InvalidCategoryException):
            await ctx.channel.send("The specified category is not supported")

    @comp.command()
    async def create(self, ctx, comp_name, team_name: Optional[str] = " "):
        """
        Create a competition given a name.

        Parameters:
            comp_name (str): name of the competition to find and create in lowercase and slug, eg comp-name
            team_name (str): optional string to add as team name
        """

        logger.info(comp_name)
        comps = self.api.competitions_list(sort_by="latestDeadline")
        latest_comps = [comp.__dict__ for comp in comps]

        max_longest_match = 0
        matched_comp = None
        for latest_comp in latest_comps:
            matcher = SequenceMatcher(None, comp_name, latest_comp["ref"])
            longest_match = matcher.find_longest_match(0, len(comp_name), 0, len(latest_comp["ref"])).size
            if longest_match > max_longest_match:
                max_longest_match = longest_match
                matched_comp = latest_comp

        if matched_comp:
            category = discord.utils.get(ctx.guild.categories, name=matched_comp["ref"])

            if category is not None:
                raise CompetitionAlreadyExistsException

            comp_role = await self.guild.create_role(name=f"Comp-{matched_comp['ref']}", mentionable=True)
            await ctx.message.author.add_roles(comp_role)
            overwrites = {
                # Everyone
                self.guild.get_role(self.gid): discord.PermissionOverwrite(read_messages=False),
                self.bot.user: discord.PermissionOverwrite(read_messages=True),
                comp_role: discord.PermissionOverwrite(read_messages=True),
            }

            category = await self.guild.create_category(name=matched_comp["ref"], overwrites=overwrites)
            general_channel = await self.guild.create_text_channel(name="general", category=category)

            Comp(
                name=category,
                description=matched_comp["description"],
                url=matched_comp["url"],
                created_at=datetime.datetime.now(),
                deadline=matched_comp["deadline"],
                team_name=team_name,
                max_team_size=matched_comp["maxTeamSize"],
                max_daily_subs=matched_comp["maxDailySubmissions"],
                merger_deadline=matched_comp["mergerDeadline"],
                team_members=[ctx.author.name],
            ).save()

            await general_channel.send("@here New competition created! @here Άτε κοπέλια..!")
        else:
            await ctx.channel.send("Νομίζω έφηε σου το όνομα! Use !comp list to see a list.")

    @create.error
    async def create_error(self, ctx, error):
        if isinstance(error.original, CompetitionAlreadyExistsException):
            await ctx.channel.send("Ρεεεε τούτο το κομπετίσιον υπάρχει!!")

    @comp.command(aliases=["ranking"])
    async def show_ranking(self, ctx):
        """
        Shows team's current ranking on Kaggle Competition (should be run within competition category)
        """
        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})
        leaderboard_results = self.api.competition_leaderboard_view(category)
        if leaderboard_results:
            comp = Comp.objects.get({"name": category})
            team_ranking = get_team_entry_from_leaderboard(leaderboard_results, comp.team_name)
            await ctx.channel.send("Πάμε καλά;;")
            team_ranking_vals = {key: getattr(team_ranking, key) for key in FIELDS}
            await ctx.channel.send(
                f"Place: {team_ranking_vals[FIELDS[0]]}, Last submission date: {team_ranking_vals[2]}, Score: {team_ranking_vals[FIELDS[3]]}"
            )

    @show_ranking.error
    async def show_ranking_error(self, ctx, error):
        if isinstance(error.original, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")

    @comp.command()
    async def addteammate(self, ctx, team_mate: str):
        """
        Adds teammate to competition. (should be run within competition category)

        Parameters:
            team_mate (str): discord username of teammate to add
        """

        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})
        if team_mate in comp.team_members:
            await ctx.channel.send(f"{team_mate} is already a member of the competition's team")
        else:
            if len(comp.team_members) < comp.max_team_size:
                general = discord.utils.get(ctx.channel.category.channels, name="general")
                user = None
                for guild in self.bot.guilds:
                    for member in guild.members:
                        if member.name == team_mate:
                            user = member
                            break
                if user:
                    comp.team_members.append(team_mate)
                    comp.save()
                    comp_role = discord.utils.get(ctx.guild.roles, name=f"Comp-{category}")
                    await user.add_roles(comp_role)
                    await general.send(
                        f"{user.mention} you have been added to {comp.name} by {ctx.message.author.mention}. Good luck!"
                    )
                else:
                    await ctx.channel.send("Ένηβρα έτσι παίχτη... User not found!")
            else:
                await ctx.channel.send("Team is already at maximum capacity... :worried:")

    @addteammate.error
    async def addteammate_error(self, ctx, error):
        if isinstance(error.original, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")

    @comp.command(aliases=["teamname"])
    async def set_team_name(self, ctx, team_name: str):
        """
        Sets the team name.

        Parameters:
            team_name (str)
        """
        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})

        if len(comp.team_name) == 1:  # default value is " "
            comp.team_name = team_name
            comp.save()
            general = discord.utils.get(ctx.channel.category.channels, name="general")
            await general.send(f"Let's go {team_name}!!!")
        else:
            raise TeamAlreadyHasNameException(comp.team_name)

    @set_team_name.error
    async def set_team_name_error(self, ctx, error):
        if isinstance(error.original, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")
        elif isinstance(error.original, TeamAlreadyHasNameException):
            await ctx.channel.send(
                f"Άρκησες ρε φίλε! This team already has a name and its... drum roll please {error.original.team_name}:tada:"
            )

    @comp.command()
    @commands.has_permissions(manage_channels=True, manage_roles=True)
    async def archive(self, ctx, comp_name):
        """
        Marks a competition as finished, archives it in the server and deletes all associated channels and categories.

        Parameters:
            comp_name (str): name of the competition to mark as finished
        """
        comp = Comp.objects.get({"name": comp_name})
        category = discord.utils.get(ctx.guild.categories, name=comp_name)
        comp_role = discord.utils.get(ctx.guild.roles, name=f"Comp-{comp_name}")

        if comp.finished_on is not None:
            raise CompetitionAlreadyArchivedException

        comp.name = f"__ARCHIVED__ {comp.name}"
        comp.save()

        if comp_role is not None:
            await comp_role.delete()
        for c in category.channels:
            await c.delete()

        await category.delete()

        await ctx.channel.send("Good job on the competition everyone! Επήαμε τα καλά;")

    @archive.error
    async def archive_error(self, ctx, error):
        if isinstance(error.original, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")

        if isinstance(error.original, CompetitionAlreadyArchivedException):
            await ctx.channel.send("This competition has already finished.")

    @comp.command()
    async def submit(self, ctx, desc: Optional[str] = ""):
        """
        Makes a submission to the category's competition.

        Parameters:
            description (str): optional description of submission made
        """

        category = ctx.channel.category.name
        comp = Comp.objects.get({"name": category})

        if comp.subs_today == comp.max_daily_subs:
            raise MaxSubmissionsReachedException
        await ctx.channel.send("Please upload a submission file...")

        async def wait_for_file():
            """
            Coroutine that waits for a file to be uploaded
            """
            url = None
            filename = None
            try:
                message = await self.bot.wait_for("message", timeout=60.0)
            except asyncio.TimeoutError:
                await ctx.channel.send("Time out... Try submitting again.")
            else:
                await ctx.channel.send("Submission succesfull!")
                url = message.attachments[0].url
                filename = message.attachments[0].filename
            return url, filename

        sub_file_url, filename = await self.bot.loop.create_task(wait_for_file())
        sub_file_content = requests.get(sub_file_url).content
        local_file = os.path.join("/tmp", filename)
        with open(local_file, "wb") as outfile:
            outfile.write(sub_file_content)
        submit_result = self.api.competition_submit(local_file, desc, comp.name)

        await ctx.channel.send(repr(submit_result))

    @submit.error
    async def submit_error(self, ctx, error):
        if isinstance(error.original, Comp.DoesNotExist):
            await ctx.channel.send("Πάενε μες το κομπετίσιον ρεεε. Run this command in the competition category.")
        elif isinstance(error.original, MaxSubmissionsReachedException):
            await ctx.channel.send("Πάππαλα τα σαμπμίσσιονς... No more submissions left for today.")

    @tasks.loop(hours=24)
    async def update_subs(self):

        comps = Comp.objects.all()
        for comp in comps:
            comp.subs_today = 0
            comp.save()

    @tasks.loop(hours=24)
    async def finish_comps(self):

        comps = Comp.objects.all()
        # only consider competitions that have not been marked finished yet
        comps = list(filter(lambda x: x.finished_on is None, comps))

        for comp in comps:
            if comp.deadline <= datetime.datetime.now():
                comp.finished_on = datetime.datetime.now()
                comp.save()
Example No. 21
def download_kaggle_files():
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('netflix-inc/netflix-prize-data',
                               path='/Users/pcc33/Downloads/',
                               unzip=True)
Example No. 22
# -*- coding: utf-8 -*-
"""
pull_data.py (5 points)
When this is called using python pull_data.py in the command line,
this will go to the 2 Kaggle urls provided below, authenticate using
your own Kaggle sign on, pull the two datasets, and save as .csv files
in the current local directory. The authentication login details (aka secrets)
need to be in a hidden folder (hint: use .gitignore). There must be a data
check step to ensure the data has been pulled correctly, and clear commenting
and documentation for each step inside the .py file.

Training dataset url: https://www.kaggle.com/c/titanic/download/train.csv 
Scoring dataset url: https://www.kaggle.com/c/titanic/download/test.csv

@author: Ada
"""
from kaggle.api.kaggle_api_extended import KaggleApi

import os

download_path ="C:\\Users\\Ada\\Desktop\\CUNY_SPS_DA\\622 ML\\HW1 Titanic"

api = KaggleApi()

api.authenticate()

api.competition_download_files('titanic')

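The script above stops at the download, while its docstring also calls for saving the .csv files and a data check. A minimal sketch of that missing step, assuming the zip lands in the working directory under the default name and contains the standard train.csv and test.csv:

import zipfile
import pandas as pd

with zipfile.ZipFile('titanic.zip') as zf:
    zf.extractall('.')

# data check: confirm both files load and are non-empty
for name in ('train.csv', 'test.csv'):
    df = pd.read_csv(name)
    assert not df.empty, f'{name} failed the data check'
    print(f'{name}: {df.shape[0]} rows, {df.shape[1]} columns')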
Example No. 23
def fetch_dataset(project_dir, download_from_kaggle=False, kaggle_dataset=None,
                  kaggle_competition=None, download_from_s3=False, s3_bucket=None,
                  download_from_url=False, data_url=None):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    logger.info('project directory {}'.format(project_dir))

    output_path = os.path.join(project_dir, 'data', 'raw')

    try:
        os.makedirs(output_path)
    except OSError:
        pass

    logger.info('output path {}'.format(output_path))

    if download_from_kaggle:
        os.environ['KAGGLE_USERNAME'] = os.environ['username']
        os.environ['KAGGLE_KEY'] = os.environ['kaggle_key']

        from kaggle.api.kaggle_api_extended import KaggleApi
        kaggle_api = KaggleApi()
        kaggle_api.authenticate()

        if kaggle_dataset:
            kaggle_api.dataset_download_files(kaggle_dataset,output_path,unzip=True)

        if kaggle_competition:
            kaggle_api.competition_download_files(kaggle_competition,path=output_path)

    if download_from_s3:
        import boto3
        session = boto3.Session(
                    aws_access_key_id=os.environ['aws_access_key_id'],
                    aws_secret_access_key=os.environ['aws_secret_access_key']
                    )
        s3 = session.resource('s3')

        if s3_bucket:
            my_bucket = s3.Bucket(s3_bucket)
            # download file into current directory
            for s3_object in tqdm(my_bucket.objects.all()):
                filename = s3_object.key
                logger.info("downloading {}".format(filename))
                my_bucket.download_file(s3_object.key, os.path.join(output_path,filename))

                try:
                    #zf = zipfile.ZipFile(os.path.join(output_path,filename), 'r')
                    #zf.extractall(output_path)
                    #zf.close()
                    shutil.unpack_archive(os.path.join(output_path,filename),output_path)

                    os.remove(os.path.join(output_path,filename))
                except shutil.ReadError:
                    pass

    if download_from_url and data_url:
        import urllib3

        http = urllib3.PoolManager()
        filename = data_url.split('/')[-1]
        logger.info("downloading {}".format(filename))

        with open(os.path.join(output_path,filename), 'wb') as out:
            r = http.request('GET', data_url, preload_content=False)
            shutil.copyfileobj(r, out)

        try:
            #zf = zipfile.ZipFile(os.path.join(output_path, filename), 'r')
            #zf.extractall(output_path)
            #zf.close()

            shutil.unpack_archive(os.path.join(output_path, filename), output_path)

            os.remove(os.path.join(output_path, filename))

        except shutil.ReadError:
            pass

    logger.info('download complete')
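
A hypothetical call covering the Kaggle branch, assuming the username and kaggle_key environment variables the function reads are already set:

fetch_dataset('.', download_from_kaggle=True, kaggle_dataset='arevel/chess-games')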