def __init__(self, dataset_name='', traindata=None, testdata=None, batch_size=8, sequence_length=64,
             mode='tuple', **kwargs):
    self.__name__ = dataset_name
    self.sequence_length = sequence_length
    self.uuid = uuid.uuid4().node
    if mode in ['tuple', 'dict']:
        self.mode = mode
    else:
        raise ValueError("Valid mode should be 'tuple' or 'dict'.")
    self.traindata = traindata
    self.testdata = testdata
    self.annotations = {}
    self.scenario = 'train'
    self._class_names = {}
    self._batch_size = batch_size

    # Pick the default language from the current locale when localized class names exist:
    # an exact locale match is taken first, then any key sharing the language prefix.
    self.__default_language__ = 'en-us'
    if len(self._class_names) > 0:
        if ctx.locale in self._class_names:
            self.__default_language__ = ctx.locale
        for k in self._class_names.keys():
            if ctx.locale.split('-')[0] in k:
                self.__default_language__ = k
                break

    # Propagate the sequence length to every wrapped TextSequenceDataset.
    if isinstance(self.traindata, Iterator):
        for ds in self.traindata.get_datasets():
            if isinstance(ds, TextSequenceDataset):
                ds.sequence_length = self.sequence_length
    if isinstance(self.testdata, Iterator):
        for ds in self.testdata.get_datasets():
            if isinstance(ds, TextSequenceDataset):
                ds.sequence_length = self.sequence_length

    self._idx2lab = {}
    self._lab2idx = {}
    self.tot_minibatch = 0
    self.tot_records = 0
    self.tot_epochs = 0
    self._text_transform_funcs = []
    self._label_transform_funcs = []
    self._paired_transform_funcs = []
    self._batch_transform_funcs = []

    # Register this provider with the global context so other components
    # (e.g. losses with auto_balance) can discover it.
    cxt = context._context()
    cxt.regist_data_provider(self)
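# The inline locale lookup above can be hard to follow, so the sketch below
# isolates the same rule as a standalone function. `pick_default_language` is
# a hypothetical helper written only for illustration; it returns early on an
# exact match, which is presumably the intent of the code above.
def pick_default_language(locale_tag, class_names, default='en-us'):
    """Return the best-matching language key in `class_names` for `locale_tag`."""
    if not class_names:
        return default
    if locale_tag in class_names:        # exact match, e.g. 'zh-tw'
        return locale_tag
    prefix = locale_tag.split('-')[0]    # language prefix, e.g. 'zh'
    for key in class_names:
        if prefix in key:                # first key sharing the prefix wins
            return key
    return default

assert pick_default_language('zh-tw', {'zh-cn': [], 'en-us': []}) == 'zh-cn'
assert pick_default_language('fr-fr', {'en-us': []}) == 'en-us'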
def __init__(self, axis=-1, sample_weight=None, auto_balance=False, from_logits=False, ignore_index=-100,
             cutoff=None, label_smooth=False, reduction='mean', enable_ohem=False, ohem_ratio=3.5,
             name=None, **kwargs):
    """
    Args:
        axis (int): the axis along which the class dimension lies.
        sample_weight (Tensor): per-class weights; should be a 1D tensor whose length equals the
            number of classes.
        from_logits (bool): whether the output tensor is already normalized as a probability
            distribution (sums to 1).
        ignore_index (int or list of int): class indices to ignore in the loss calculation.
        cutoff (None or float): the probability cutoff for classification; should be None or a
            number between 0 and 1.
        label_smooth (bool): whether to apply label smoothing.
        reduction (string): how to aggregate the loss. None means no aggregation, 'mean' means the
            average loss, 'sum' means the summation of losses, and 'batch_mean' means averaging
            across the batch axis and then summing.

    Attributes:
        need_target_onehot (bool): If True, the target must be transformed into one-hot format
            before the loss calculation (e.g. for label smoothing).
        is_multiselection (bool): If True, the classification model is multi-selection, so no
            softmax can be used; use sigmoid and binary cross-entropy instead.
        is_target_onehot (bool): If True, we have confirmed (not just declared) that the target is
            in one-hot format.
        reduction (str): The aggregation function for the loss; available options are 'sum',
            'mean' and 'batch_mean'. Default is 'mean'.
        axis (None or int): The axis used for the loss calculation. Default is -1.
        from_logits (bool): If True, all probabilities sum to 1.
        is_logsoftmax (bool): If True, the model uses SoftMax as its last layer or an equivalent
            calculation.
        sample_weight (1D tensor): The loss weight for each class.
        ignore_index (int, list, tuple): The classes to ignore in the loss calculation.
        cutoff (float): The decision boundary of this classification model. Default is 0.5.
        num_classes (int): The number of classes.
        label_smooth (bool): If True, label smoothing is applied in the loss calculation.
    """
    super(_ClassificationLoss, self).__init__(reduction=reduction, sample_weight=sample_weight, axis=axis,
                                              enable_ohem=enable_ohem, ohem_ratio=ohem_ratio, name=name)
    self._set_name_scope()
    self.need_target_onehot = True
    self.is_multiselection = False
    self.is_target_onehot = False
    self.from_logits = from_logits
    self.is_logsoftmax = False
    self.ignore_index = ignore_index
    self.ignore_index_weight = None
    self.auto_balance = auto_balance
    if self.auto_balance:
        self.label_statistics = None
        ctx = context._context()
        if hasattr(ctx._thread_local_info, 'data_providers') and len(ctx._thread_local_info.data_providers) > 0:
            dp = list(ctx._thread_local_info.data_providers.values())[0]
            if dp.traindata.label.__class__.__name__ == 'LabelDataset':
                # Derive inverse-frequency class weights from the training labels.
                unique, counts = np.unique(np.array(dp.traindata.label.items), return_counts=True)
                reweights = np.clip(counts, 1, np.inf) / np.sum(counts).astype(np.float32)
                self.label_statistics = np.max(reweights) / reweights
    if cutoff is not None and not 0 < cutoff < 1:
        raise ValueError('cutoff should be between 0 and 1')
    self.cutoff = cutoff
    self.num_classes = None
    self.label_smooth = label_smooth
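# The auto_balance branch above derives inverse-frequency class weights: each
# class frequency is divided into the maximum frequency, so the rarest class
# receives the largest weight. The toy example below mirrors that arithmetic
# directly with numpy rather than calling the loss class itself.
import numpy as np

labels = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 2])        # imbalanced toy labels
unique, counts = np.unique(labels, return_counts=True)   # counts = [6, 3, 1]
freqs = np.clip(counts, 1, np.inf) / np.sum(counts).astype(np.float32)
weights = np.max(freqs) / freqs
print(weights)  # [1. 2. 6.] -> the rarest class (2) is weighted 6x class 0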
import copy
import datetime
import locale
import os

from tqdm import tqdm
from collections import *
from typing import Optional, List, Tuple

from trident.backend.common import *
from trident.backend.pytorch_ops import *
from trident.backend.pytorch_backend import to_tensor, get_device, load, fix_layer, set_device
from trident.data.utils import download_model_from_google_drive, download_file_from_google_drive
from trident.layers.pytorch_layers import *
from trident import context

ctx = context._context()

__all__ = ['Word2Vec', 'ChineseWord2Vec']

_trident_dir = get_trident_dir()
dirname = os.path.join(_trident_dir, 'models')
if not os.path.exists(dirname):
    try:
        os.makedirs(dirname)
    except OSError:
        # Ignore permission-denied errors and potential race conditions
        # in multi-threaded environments.
        pass

download_path = os.path.join(_trident_dir, 'download', 'vocabs_tw.txt')
make_dir_if_need(download_path)