Example #1
    @classmethod
    def load(cls):
        # Load the model from Google Drive
        st = datetime.datetime.now()
        download_model_from_google_drive('13XZPWh8QhEsC8EdIp1niLtZz0ipatSGC',
                                         dirname, 'word2vec_chinese.pth')
        # fix_layer plus the module-level load() helper (not this classmethod)
        # restore the checkpoint into a usable model object.
        recovery_model = fix_layer(
            load(os.path.join(dirname, 'word2vec_chinese.pth')))

        recovery_model.locale = locale.getdefaultlocale()[0].lower()
        recovery_model.to(get_device())
        download_file_from_google_drive(
            file_id='16yDlJJ4-O9pHF-ZbXy7XPZZk6vo3aw4e',
            dirname=os.path.join(_trident_dir, 'download'),
            filename='vocabs_tw.txt')
        if not hasattr(recovery_model,
                       'tw2cn') or recovery_model.tw2cn is None:
            # This path mirrors the dirname/filename passed to the download call above.
            download_path = os.path.join(_trident_dir, 'download', 'vocabs_tw.txt')
            with open(download_path, 'r', encoding='utf-8-sig') as f:
                vocabs_tw = f.readlines()
                vocabs_tw = [
                    s.replace('\n', '') for s in vocabs_tw if s != '\n'
                ]
                recovery_model.tw2cn = OrderedDict()
                recovery_model.cn2tw = OrderedDict()

                for i, (w, w_cn) in tqdm(
                        enumerate(zip(vocabs_tw,
                                      recovery_model._vocabs.keys()))):
                    if w not in recovery_model.tw2cn:
                        recovery_model.tw2cn[w] = w_cn
                    recovery_model.cn2tw[w_cn] = w

        et = datetime.datetime.now()
        print('total loading time:{0}'.format(et - st))
        return recovery_model
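
The download helpers above presumably cache each file locally and skip the network round-trip on later calls. A minimal standalone sketch of that download-once pattern, using a plain URL instead of a Google Drive file id (the function name and signature are mine, not trident's):

import os
import urllib.request

def download_once(url, dirname, filename):
    """Fetch url into dirname/filename unless a cached copy already exists."""
    os.makedirs(dirname, exist_ok=True)
    path = os.path.join(dirname, filename)
    if not os.path.exists(path):
        urllib.request.urlretrieve(url, path)
    return path

Returning the destination path is a small design choice that avoids the undefined download_path bug fixed above: the caller never has to reconstruct the download location on its own.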
Example #2
    def __init__(self,
                 pretrained=False,
                 locale=None,
                 embedding_dim: Optional[int] = None,
                 num_embeddings: Optional[int] = None,
                 vocabs: Optional[List[str]] = None,
                 padding_idx: Optional[int] = None,
                 max_norm: Optional[float] = None,
                 norm_type: float = 2.,
                 scale_grad_by_freq: bool = False,
                 sparse: bool = False,
                 _weight: Optional[Tensor] = None,
                 filter_index=-1,
                 keep_output: bool = False,
                 name: Optional[str] = None) -> None:
        """
        Py Word2vec structure
        """
        super().__init__(num_embeddings=num_embeddings,
                         embedding_dim=embedding_dim,
                         max_norm=max_norm,
                         norm_type=norm_type,
                         scale_grad_by_freq=scale_grad_by_freq,
                         sparse=sparse,
                         _weight=_weight,
                         filter_index=filter_index,
                         keep_output=keep_output,
                         name=name)
        # The locale argument shadows the stdlib locale module, so resolve the
        # system default through a local alias when no locale is given.
        if locale is None:
            import locale as _locale_mod
            locale = _locale_mod.getdefaultlocale()[0].lower()
        self.locale = locale
        print('locale:', self.locale)

        self._vocabs = OrderedDict()

        download_file_from_google_drive(
            file_id='16yDlJJ4-O9pHF-ZbXy7XPZZk6vo3aw4e',
            dirname=os.path.join(_trident_dir, 'download'),
            filename='vocabs_tw.txt')
        # This path mirrors the dirname/filename passed to the download call above.
        download_path = os.path.join(_trident_dir, 'download', 'vocabs_tw.txt')
        with open(download_path, 'r', encoding='utf-8-sig') as f:
            vocabs_tw = f.readlines()
            vocabs_tw = [s.replace('\n', '') for s in vocabs_tw if s != '\n']
            if vocabs_tw is not None:
                for k in range(len(vocabs_tw)):
                    self._vocabs[vocabs_tw[k]] = k

            if not hasattr(self, 'tw2cn') or self.tw2cn is None:

                self.tw2cn = OrderedDict()
                self.cn2tw = OrderedDict()

                for i, (w, w_cn) in tqdm(
                        enumerate(zip(vocabs_tw, self._vocabs.keys()))):
                    if w not in self.tw2cn:
                        self.tw2cn[w] = w_cn
                    self.cn2tw[w_cn] = w
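
Both constructors build the Traditional-to-Simplified mapping by zipping the downloaded vocabs_tw.txt lines against the ordered keys of _vocabs, so the two sequences must be index-aligned. A self-contained toy version of that pairing (the three-word vocabularies are made up for illustration):

from collections import OrderedDict

# Index-aligned toy vocabularies: position i in both lists names the same word.
vocabs_tw = ['電腦', '軟體', '網路']   # Traditional Chinese
vocabs_cn = ['电脑', '软件', '网络']   # Simplified Chinese

tw2cn = OrderedDict()
cn2tw = OrderedDict()
for w, w_cn in zip(vocabs_tw, vocabs_cn):
    if w not in tw2cn:       # keep the first pairing for duplicated forms,
        tw2cn[w] = w_cn      # mirroring the guard in the snippets above
    cn2tw[w_cn] = w

assert tw2cn['電腦'] == '电脑' and cn2tw['电脑'] == '電腦'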
Example #3
    def __init__(self, convert_ratio=0.5, name='random_homomorphic_typo', **kwargs):
        super().__init__()
        self.convert_ratio = convert_ratio
        # Download (cached) the pickled character dictionary, then stack the
        # per-character vectors into a single CPU tensor.
        download_file_from_google_drive('1MDk7eH7nORa16SyzNzqv7fYzBofzxGRI',
                                        dirname=os.path.join(get_trident_dir(), 'download'),
                                        filename='chardict.pkl')
        self.chardict = unpickle(os.path.join(get_trident_dir(), 'download', 'chardict.pkl'))
        self.all_embedding = to_tensor(np.stack(self.chardict.value_list, 0)).to('cpu')
        self.name = name

        # Parse the character-frequency table once and cache it as a session
        # resource so later instances can reuse it.
        if not get_session().get_resources('char_freq'):
            char_freq = get_session().regist_resources('char_freq', OrderedDict())
            with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'char_freq.txt'), 'r', encoding='utf-8-sig') as f:
                for line in f.readlines():
                    cols = line.strip().split('\t')
                    char_freq[cols[0]] = float(cols[1])
                self.char_freq = char_freq

        else:
            self.char_freq = get_session().get_resources('char_freq')
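
The constructor loads per-character vectors (all_embedding) alongside a character-frequency table, which suggests the transform replaces characters with visually similar ones found by nearest-neighbor search over those vectors, applied with probability convert_ratio. A toy sketch of that lookup under those assumptions (the two-dimensional vectors and the helper name are mine):

import numpy as np

# Toy per-character vectors: visually similar glyphs sit close together.
chars = ['日', '曰', '月']
vectors = np.array([[0.90, 0.10],
                    [0.88, 0.12],
                    [0.10, 0.95]])

def random_homoglyph(ch, convert_ratio=0.5, rng=None):
    """With probability convert_ratio, replace ch by its nearest neighbor."""
    if rng is None:
        rng = np.random.default_rng()
    if ch not in chars or rng.random() > convert_ratio:
        return ch
    i = chars.index(ch)
    sims = vectors @ vectors[i]
    sims = sims / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(vectors[i]))
    sims[i] = -np.inf                 # never return the character itself
    return chars[int(np.argmax(sims))]

print(random_homoglyph('日', convert_ratio=1.0))  # -> '曰'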
Example #4
    def __init__(self,
                 pretrained=False,
                 locale=None,
                 embedding_dim: Optional[int] = None,
                 num_embeddings: Optional[int] = None,
                 vocabs: Optional[List[str]] = None,
                 padding_idx: Optional[int] = None,
                 max_norm: Optional[float] = None,
                 norm_type: float = 2.,
                 scale_grad_by_freq: bool = False,
                 sparse: bool = False,
                 _weight: Optional[Tensor] = None,
                 filter_index=-1,
                 keep_output: bool = False,
                 name: Optional[str] = None) -> None:
        """
        Py Word2vec structure
        """
        super().__init__(num_embeddings=num_embeddings,
                         embedding_dim=embedding_dim,
                         max_norm=max_norm,
                         norm_type=norm_type,
                         scale_grad_by_freq=scale_grad_by_freq,
                         sparse=sparse,
                         _weight=_weight,
                         filter_index=filter_index,
                         keep_output=keep_output,
                         name=name)
        # ctx is trident's module-level context object; its locale field holds
        # the detected system locale.
        self.locale = ctx.locale
        print('locale:', self.locale)

        self._vocabs = OrderedDict()
        if vocabs is not None:
            for k in range(len(vocabs)):
                self._vocabs[vocabs[k]] = k
        download_file_from_google_drive(
            file_id='16yDlJJ4-O9pHF-ZbXy7XPZZk6vo3aw4e',
            dirname=os.path.join(_trident_dir, 'download'),
            filename='vocabs_tw.txt')
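
This variant takes the vocabulary directly as an argument and maps each surface form to its row index in _vocabs, which is how a word lookup reaches the underlying embedding matrix. A minimal sketch of the same lookup chain, with a plain torch.nn.Embedding standing in for the trident superclass (the vocabulary and dimensions are made up):

import torch
import torch.nn as nn
from collections import OrderedDict

vocabs = ['<pad>', '電腦', '軟體', '網路']
_vocabs = OrderedDict((w, i) for i, w in enumerate(vocabs))  # word -> row index

emb = nn.Embedding(num_embeddings=len(vocabs), embedding_dim=8, padding_idx=0)

# Surface form -> index -> vector: the chain the class above wires up.
idx = torch.tensor([_vocabs['電腦']])
print(emb(idx).shape)  # torch.Size([1, 8])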