Example #1
import os
import time
from urllib.request import urlretrieve

import numpy as np

# Logger and train are assumed to be provided by the surrounding project.

# Download each array if it is not present locally yet
if not os.path.isfile(X_train_path):
    urlretrieve(args.blob_path + "/X_train.npy", X_train_path)
if not os.path.isfile(y_train_path):
    urlretrieve(args.blob_path + "/y_train.npy", y_train_path)
if not os.path.isfile(X_valid_path):
    urlretrieve(args.blob_path + "/X_valid.npy", X_valid_path)
if not os.path.isfile(y_valid_path):
    urlretrieve(args.blob_path + "/y_valid.npy", y_valid_path)

X_t = np.load(X_train_path)
y_t = np.load(y_train_path)
X_v = np.load(X_valid_path)
y_v = np.load(y_valid_path)

params = vars(args)

mnt_path = os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                        'tensorflow')  # azurefile mount path
ts = int(round(time.time() * 1000))
params['model_dir'] = os.path.join(mnt_path, '{}_model'.format(ts))
params['log_dir'] = os.path.join(mnt_path, '{}_logs'.format(ts))

logger = Logger(None, 'katib')
# Hack: store the model id as a metric so that it gets recorded
logger.log('model_id', ts)

train(X_t, y_t, X_v, y_v, logger=logger, **params)
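For context, a hedged sketch of the setup this snippet assumes: only `blob_path` appears in the code above; the argument parser and the local file layout below are assumptions.

import argparse
import os

# Hypothetical preamble for the snippet above; only --blob_path is taken
# from the original code, the rest is an assumption.
parser = argparse.ArgumentParser()
parser.add_argument('--blob_path', type=str,
                    help='Base URL of the blob container holding the .npy files')
args = parser.parse_args()

data_dir = os.getenv('TEST_TMPDIR', '/tmp')
X_train_path = os.path.join(data_dir, 'X_train.npy')
y_train_path = os.path.join(data_dir, 'y_train.npy')
X_valid_path = os.path.join(data_dir, 'X_valid.npy')
y_valid_path = os.path.join(data_dir, 'y_valid.npy')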
Example #2
# Instantiate the Transformer with an Adam optimizer and the (MSE-based) OZE loss
net = Transformer(d_input,
                  d_model,
                  d_output,
                  q,
                  v,
                  h,
                  N,
                  attention_size=attention_size,
                  dropout=dropout,
                  chunk_mode=chunk_mode,
                  pe=pe).to(device)
optimizer = optim.Adam(net.parameters(), lr=LR)
loss_function = OZELoss(alpha=0.3)

logger = Logger('logs/training.csv', params=['loss'])

with tqdm(total=EPOCHS) as pbar:
    # Fit model
    loss = fit(net,
               optimizer,
               loss_function,
               dataloader_train,
               dataloader_val,
               epochs=EPOCHS,
               pbar=pbar,
               device=device)

    # Log
    logger.log(loss=loss)
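The `Transformer`, `OZELoss`, `fit`, and `Logger` names here come from the surrounding project (note that Example #1 uses a logger with a different constructor signature). A minimal sketch of a CSV logger compatible with the `log` calls in Examples #2 through #5 follows; it is a hypothetical stand-in, not the original implementation.

import csv
import os


class Logger:
    """Minimal CSV logger: one column per tracked param, one row per log().

    Hypothetical stand-in inferred from how Logger is called in the
    examples on this page; the original implementation is not shown here.
    """

    def __init__(self, csv_path, params=('loss',)):
        self._csv_path = csv_path
        self._params = list(params)
        os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
        with open(self._csv_path, 'w', newline='') as f:
            csv.writer(f).writerow(self._params)

    def log(self, params=None, **kwargs):
        # Accept both a positional dict (Example #4) and keyword
        # arguments (Examples #2, #3, #5).
        row = dict(params or {}, **kwargs)
        with open(self._csv_path, 'a', newline='') as f:
            csv.writer(f).writerow([row.get(p) for p in self._params])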
Example #3
with tqdm(total=n_steps * EPOCHS) as pbar:
    for values in itertools.product(*search_params.values()):
        # Rebuild the parameter dict for this grid-search step
        params = dict(zip(search_params.keys(), values))
        pbar.set_postfix(params)

        # Instantiate a Transformer for this parameter combination with an Adam optimizer
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
                          **params).to(device)
        optimizer = optim.Adam(net.parameters(), lr=LR)

        # Fit model
        loss = fit(net,
                   optimizer,
                   loss_function,
                   dataloader_train,
                   dataloader_val,
                   epochs=EPOCHS,
                   pbar=pbar,
                   device=device)

        # Log
        logger.log(loss=loss, **params)
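The grid itself is defined outside the snippet. A plausible sketch follows: the keys match the Transformer keyword arguments used via `**params` above, but the value grids are made up.

import math

# Hypothetical search grid; the actual values are not in the snippet.
search_params = {
    'd_model': [32, 64],
    'q': [4, 8],
    'v': [4, 8],
    'h': [4, 8],
    'N': [2, 4],
    'attention_size': [12, 24],
}

# One grid-search step per parameter combination
n_steps = math.prod(len(v) for v in search_params.values())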
Example #4
with tqdm(total=n_steps * EPOCHS) as pbar:
    for values in itertools.product(*search_params.values()):
        # Rebuild the parameter dict for this grid-search step
        params = dict(zip(search_params.keys(), values))
        pbar.set_postfix(params)

        # Instantiate a Transformer for this parameter combination with an Adam optimizer
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
                          **params).to(device)
        optimizer = optim.Adam(net.parameters(), lr=LR)

        # Fit model
        loss = fit(net,
                   optimizer,
                   loss_function,
                   dataloader_train,
                   dataloader_val,
                   epochs=EPOCHS,
                   pbar=pbar,
                   device=device)

        # Log
        logger.log(params, loss=loss)
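This example differs from the previous one only in the final logging call: the parameter dictionary is passed to `Logger.log` positionally instead of being unpacked into keyword arguments.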
Example #5
# Compute predictions
predictions = torch.empty(len(dataloader_test.dataset), 168, 8)  # (n_samples, seq_len, n_outputs) for this dataset
idx_prediction = 0
with torch.no_grad():
    for x, y in tqdm(dataloader_test, total=len(dataloader_test)):
        netout = net(x.to(device)).cpu()
        predictions[idx_prediction:idx_prediction + x.shape[0]] = netout
        idx_prediction += x.shape[0]

# Compute occupation times
occupation = ozeDataset._x[dataloader_test.dataset.indices, :,
                           ozeDataset.labels['Z'].index('occupancy')]

# For each metric, record the mean and standard deviation of its per-sample
# values on the test set (y_true is assumed to be defined earlier).
results_metrics = {}
for key, func in metrics.items():
    values = func(y_true, predictions)
    results_metrics[key] = values.mean()
    results_metrics[key + '_std'] = values.std()

# Log
logger.log(**results_metrics)

# Save model
torch.save(
    net.state_dict(),
    f'models/{net.name}_{datetime.datetime.now().strftime("%Y_%m_%d__%H%M%S")}.pth'
)
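The `metrics` mapping is defined outside this snippet. A minimal sketch of the shape it needs follows; the metric names and formulas are assumptions.

# Hypothetical metrics dict: each entry maps a name to a function that
# returns per-sample errors, so that .mean() and .std() apply above.
metrics = {
    'mse': lambda y_true, y_pred: ((y_true - y_pred) ** 2).mean(dim=(1, 2)).numpy(),
    'mae': lambda y_true, y_pred: (y_true - y_pred).abs().mean(dim=(1, 2)).numpy(),
}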
Example #6
        # Total loss
        loss_D_B = (loss_D_real + loss_D_fake) * 0.5
        loss_D_B.backward()

        optimizer_D_B.step()
        ###################################
        # Cap the number of images processed per epoch
        if (i + 1) % im_per_epoch == 0:
            break
        # Progress report (http://localhost:8097)
        logger.log(
            {
                'loss_G': loss_G,
                'loss_G_identity': (loss_identity_A + loss_identity_B),
                'loss_G_GAN': (loss_GAN_A2B + loss_GAN_B2A),
                'loss_G_cycle': (loss_cycle_ABA + loss_cycle_BAB),
                'loss_D': (loss_D_A + loss_D_B)
            },
            images={
                'real_A': real_A,
                'real_B': real_B,
                'fake_A': fake_A,
                'fake_B': fake_B
            })

    # Update learning rates
    lr_scheduler_G.step()
    lr_scheduler_D_A.step()
    lr_scheduler_D_B.step()

    # Save models checkpoints
    dataset_name = opt.dataroot.split('/')[-1]
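The snippet cuts off where checkpoints are written. A plausible continuation follows, with network variable names inferred from the loss names above; they are assumptions, not the original code.

import os
import torch

# Hypothetical checkpoint step: netG_A2B/netG_B2A/netD_A/netD_B are
# inferred from the loss names above, not taken from the original code.
output_dir = os.path.join('output', dataset_name)
os.makedirs(output_dir, exist_ok=True)
torch.save(netG_A2B.state_dict(), os.path.join(output_dir, 'netG_A2B.pth'))
torch.save(netG_B2A.state_dict(), os.path.join(output_dir, 'netG_B2A.pth'))
torch.save(netD_A.state_dict(), os.path.join(output_dir, 'netD_A.pth'))
torch.save(netD_B.state_dict(), os.path.join(output_dir, 'netD_B.pth'))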
Example #7
from __future__ import annotations

import itertools
import re
from typing import List

import spacy
from sklearn.base import BaseEstimator

# BaseEstimator is presumably scikit-learn's; Logger and the
# Tweet/Tweets/Token/Tokens/Tags type aliases are assumed to be defined
# elsewhere in the project.


class Preprocessor(BaseEstimator):
    """
    Class for cleaning and tokenizing tweet's raw text

    Steps:
        1. remove ``@anonymized_account`` tag
        2. remove chars other than letters and spaces
        3. remove duplicate spaces
        4. apply lowercase
        5. lemmatizes tokens with ``pl_spacy_model``
        6. convert polish diacritics to latin letters
        7. drop adjacent equals letters
        8. collapse words exploded with spaces
        9. remove zero/one letter tokens
    """
    def __init__(self, min_tok_len: int = 2):
        self._min_tok_len = min_tok_len
        self._logger = Logger('preproc')
        self._nlp = None

    def fit(self, tweets: Tweets, tags: Tags = None) -> Preprocessor:
        return self

    def transform_tweet(self, tweet: Tweet) -> Tokens:

        tweet: Tweet = self._base_cleanup(tweet)
        tokens: Tokens = self._tokenizer(tweet)
        tokens = [Preprocessor._latinize_diacritics(tok) for tok in tokens]
        tokens = [Preprocessor._drop_adjacent_equals(tok) for tok in tokens]
        tokens = [Preprocessor._collapse_exploded(tok) for tok in tokens]
        tokens = [tok for tok in tokens if len(tok) >= self._min_tok_len]

        return tokens

    def transform(self, tweets: Tweets, tags: Tags = None) -> List[Tokens]:
        tokens = [self.transform_tweet(tweet) for tweet in tweets]

        return tokens

    @staticmethod
    def _base_cleanup(tweet: Tweet) -> Tweet:
        """Keep only letters and spaces, apply to lower, remove ``@anonymized_account`` and extra spaces"""
        tweet = tweet.strip()
        tweet = re.sub(r'@anonymized_account', '', tweet)
        tweet = re.sub(r'[^\w\s]', '', tweet)
        tweet = re.sub(r'[0-9]', '', tweet)
        tweet = re.sub(r' +', ' ', tweet)
        tweet = tweet.lower()
        tweet = tweet.strip()

        return tweet

    def load_spacy_model(self) -> None:
        """Load the spaCy model on first use."""
        if self._nlp is None:
            self._logger.log('loading spacy model')
            self._nlp = spacy.load('pl_spacy_model')

    def _tokenizer(self, tweet: Tweet) -> Tokens:
        """Tokenize tweet"""
        self.load_spacy_model()
        tokens = [tok.lemma_ for tok in self._nlp(tweet)]

        return tokens

    @staticmethod
    def _drop_adjacent_equals(tok: Token) -> Token:
        """
        Remove adjacent duplicate characters.

        Examples
        --------
        >>> _drop_adjacent_equals('kkk')
        'k'

        >>> _drop_adjacent_equals('lekkie pióórko')
        'lekie piórko'
        """
        return ''.join(c[0] for c in itertools.groupby(tok))

    @staticmethod
    def _collapse_exploded(tok: Token, separators: str = ' .-_') -> Token:
        """
        Collapse a word that was exploded with ``separators``.

        Example
        --------
        >>> _collapse_exploded('jesteś b r z y d k i')
        'jesteś brzydki'
        """
        if len(tok) < 5:
            return tok

        remove = set()
        # A separator at position i marks an "exploded" word when
        # separators also sit two positions away on each side with single
        # letters in between; drop this separator and the one after it,
        # keeping the run's leading separator as the word boundary.
        for i in range(2, len(tok) - 2):
            if tok[i] in separators:
                if (tok[i - 2] in separators) and (tok[i + 2] in separators):
                    if tok[i - 1].isalpha() and tok[i + 1].isalpha():
                        remove.update((i, i + 2))

        return ''.join(c for i, c in enumerate(tok) if i not in remove)

    @staticmethod
    def _latinize_diacritics(tok: Token) -> Token:
        """
        Convert Polish diacritics to Latin letters.

        Example
        --------
        >>> _latinize_diacritics('gęśl')
        'gesl'
        """
        letters_diac = 'ąćęłńóśżźĄĆĘŁŃÓŚŻŹ'
        letters_latin = 'acelnoszzACELNOSZZ'
        table = str.maketrans(letters_diac, letters_latin)
        return tok.translate(table)
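A quick usage sketch; the sample tweet is made up, and the ``pl_spacy_model`` package must be installed.

# Hypothetical usage of the Preprocessor above.
pre = Preprocessor(min_tok_len=2)
tweets = ['@anonymized_account Tooo jest b r z y d k i tweet!!!']
print(pre.transform(tweets))
# e.g. [['to', 'jest', 'brzydki', 'tweet']] (exact lemmas depend on the model)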
Example #8
import os
import pickle

import numpy as np
import pandas as pd

# Logger and Preprocessor are the classes from the surrounding project
# (Preprocessor is shown in Example #7).


class DataReader:
    """
    Class for loading and processing raw tweets.

    Attributes
    ----------
    df : pd.DataFrame
        Data frame with raw text and cleared tokens.
        Columns:
            Name: raw_tweets, dtype: str
            Name: tokens, dtype: List[str]
            Name: tokens_count, dtype: int
            Name: tag, dtype: int
    """
    def __init__(self,
                 text_file: str,
                 tags_file: str = None,
                 force_reload: bool = False) -> None:
        self._logger = Logger('io')
        self._preprocessor = Preprocessor()
        self.df = self._load_data(text_file, tags_file, force_reload)
        self._stats = None
        self.stats  # compute and print dataset statistics on load

    def _load_data(self,
                   tweets_path: str,
                   tags_path: str,
                   force_reload: bool = False) -> pd.DataFrame:
        """
        Load dataframe with cleared and tokenized tweets.

        First tries to load processed data from a pickle.
        If no pickle is found, or ``force_reload`` is True, it reads the raw data and runs the processing.

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.
        force_reload : bool
            If True, rebuilds from raw data even if a pickle is found.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        pickle_path = tweets_path.replace('.txt',
                                          '.pkl').replace('raw', 'processed')
        pickle_folder, pickle_name = os.path.split(pickle_path)

        if (pickle_name in os.listdir(pickle_folder)) and not force_reload:
            self._logger.log('reading from pickle')
            with open(pickle_path, "rb") as f:
                df = pickle.load(f)
        else:
            self._logger.log('processing raw data')
            df = self._build_dataframe(tweets_path, tags_path)

        self._logger.log('data ready')
        return df

    def _build_dataframe(self, tweets_path: str,
                         tags_path: str) -> pd.DataFrame:
        """
        Clear and tokenize raw texts.
        Pickle processed data

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        with open(tweets_path) as f:
            raw_tweets = f.readlines()

            df = pd.DataFrame(raw_tweets, columns=['raw_tweets'])
            df['tokens'] = self._preprocessor.transform(raw_tweets)
            df['tokens_count'] = df['tokens'].apply(len)

            if tags_path is not None:
                df['tag'] = pd.read_fwf(tags_path, header=None)[0]
            else:
                df['tag'] = np.nan

            pickle_path = tweets_path.replace('.txt', '.pkl').replace(
                'raw', 'processed')
            with open(pickle_path, "wb") as p:
                pickle.dump(df, p)

            return df

    @property
    def stats(self):
        """Compute, print, and return basic dataset statistics."""
        self._stats = dict()
        self._stats['tweets count'] = self.df.shape[0]
        self._stats['tokens in tweet distribution'] = self.df[
            'tokens_count'].describe([.25, .5, .75, .95, .99])
        self._stats['unique tokens'] = len(
            {tok
             for tweet_toks in self.df['tokens'] for tok in tweet_toks})
        self._stats['tags count'] = self.df['tag'].value_counts().sort_index()

        print("-------- stats --------")
        for stat, value in self._stats.items():
            print(f"=======================\n{stat}:\n{value}")

        return self._stats