Example #1
 def _load_archive(self, path, extract_path):
     from zipfile import ZipFile, ZIP_DEFLATED
     try:
         zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
         allfile = zfile.namelist()
         # validate extract_path
         if not os.path.isdir(extract_path):
             raise ValueError('Extract path must be a folder, but path'
                              '={} is a file'.format(extract_path))
         extract_path = os.path.join(
             extract_path,
             os.path.basename(path).replace('.zip', ''))
         # found the extracted dir, use it
         if os.path.isdir(extract_path) and \
            set(os.listdir(extract_path)) == set(allfile):
             self._set_path(extract_path)
             return
         # decompress everything
         if not os.path.exists(extract_path):
             os.mkdir(extract_path)
         maxlen = max([len(i) for i in allfile])
         progbar = Progbar(len(allfile))
         for i, f in enumerate(allfile):
             zfile.extract(f, path=extract_path)
             progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
             progbar.update(i + 1)
         # ====== finally set path ====== #
         self._set_path(extract_path)
     except IOError as e:
         raise IOError('Error loading archived dataset, path:{}, error:{}'
                       '.'.format(path, e))
     return None
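
For reference, the extract-and-report pattern above can be reproduced with only the standard library. The sketch below replaces odin's Progbar with a plain print and uses hypothetical paths; it is an illustration, not the library's implementation.

import os
from zipfile import ZipFile

def extract_with_progress(archive_path, extract_path):
    """Unpack archive_path into extract_path, reporting progress per member."""
    with ZipFile(archive_path, mode='r') as zfile:
        members = zfile.namelist()
        for i, member in enumerate(members, start=1):
            zfile.extract(member, path=extract_path)
            # simple textual progress instead of odin's Progbar
            print('Unarchiving %s (%d/%d)' % (member, i, len(members)))
    return extract_path

# hypothetical usage
# extract_with_progress('/tmp/dataset.zip', '/tmp/extracted')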
Example #2
 def fit(self, texts, vocabulary=None):
     """q
     Parameters
     ----------
     texts: iterator of unicode
         iterator, generator or list of unicode string.
     """
     texts = self._validate_texts(texts)
     word_counts = self._word_counts
     word_docs = self._word_docs
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== start processing ====== #
     prog = Progbar(target=1208)
     start_time = timeit.default_timer()
     for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
         total_docs_tokens = 0
         seen_words = {}
         # update words->count
         for token in doc:
             total_docs_tokens += 1
             word_counts[token] += 1
             # update words->doc
             if token not in seen_words:
                 seen_words[token] = 1
                 word_docs[token] += 1
         # save longest docs
         if total_docs_tokens > self.__longest_document[-1]:
             self.__longest_document = [doc, total_docs_tokens]
         # print progress
         if self.print_progress:
             prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs,
                                                         len(word_counts))
             prog.add(1)
             if prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # ====== print summary of the process ====== #
     if self.print_progress:
         prog.target = nb_docs
         prog.update(nb_docs)
     processing_time = timeit.default_timer() - start_time
     print('Processed %d docs, %d tokens in %f seconds.' %
           (nb_docs, len(word_counts), processing_time))
     self.nb_docs += nb_docs
     # ====== sorting ====== #
     self._refresh_dictionary()
     return self
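
The core bookkeeping in fit (total token counts plus the number of documents each token occurs in) can be sketched with collections.Counter alone. The tokenized documents below are hypothetical and the helper is illustrative only.

from collections import Counter

def count_tokens(tokenized_docs):
    """Return (word_counts, word_docs): total token counts and the number of
    documents each token appears in, mirroring the bookkeeping in fit()."""
    word_counts = Counter()
    word_docs = Counter()
    for doc in tokenized_docs:
        word_counts.update(doc)
        # each token is counted at most once per document
        word_docs.update(set(doc))
    return word_counts, word_docs

# hypothetical usage with already-tokenized documents
docs = [['a', 'b', 'a'], ['b', 'c']]
counts, doc_freq = count_tokens(docs)
# counts == {'a': 2, 'b': 2, 'c': 1}; doc_freq == {'a': 1, 'b': 2, 'c': 1}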
Example #3
    def archive(self):
        from zipfile import ZipFile, ZIP_DEFLATED
        path = self.archive_path
        zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)

        files = set([_[-1] for _ in self._data_map.itervalues()])

        progbar = Progbar(len(files), title='Archiving:')
        maxlen = max([len(os.path.basename(i)) for i in files])
        for i, f in enumerate(files):
            zfile.write(f, os.path.basename(f))
            progbar.title = ('Archiving: %-' + str(maxlen) +
                             's') % os.path.basename(f)
            progbar.update(i + 1)
        zfile.close()
        return path
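
Since archive() stores only basenames, namelist() of the resulting zip is flat, which is exactly what the extracted-directory check in Example #1 compares against os.listdir. A minimal round-trip sketch, with hypothetical paths:

import os
from zipfile import ZipFile, ZIP_DEFLATED

# hypothetical paths for illustration
src_files = ['/tmp/data/indices', '/tmp/data/X_train']
archive_path = '/tmp/dataset.zip'

with ZipFile(archive_path, mode='w', compression=ZIP_DEFLATED) as zf:
    for f in src_files:
        # flat archive: only the basename is stored, directories are dropped
        zf.write(f, os.path.basename(f))

with ZipFile(archive_path, mode='r') as zf:
    # namelist() returns basenames only, e.g. ['indices', 'X_train'],
    # which is what the extracted-directory comparison in Example #1 relies on
    print(zf.namelist())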
Example #4
File: features.py  Project: liqin123/odin
    def run(self):
        if self.pca:
            from odin.ml import MiniBatchPCA
        if not hasattr(self, 'jobs'):
            raise Exception(
                'the Processor must have a "jobs" attribute, which is '
                'the list of all jobs.')
        njobs = len(self.jobs) if self.njobs == 0 else self.njobs
        prog = Progbar(target=njobs)
        dataset = self.dataset
        datatype = self.datatype
        if self.ncpu is None:  # auto select number of CPU
            ncpu = min(njobs, int(1.2 * cpu_count()))
        else:
            ncpu = self.ncpu
        # ====== indices ====== #
        indices = defaultdict(list)
        # ====== MmapDict ====== #
        dicts = {}
        for name, dtype, stats in self.features_properties:
            if 'dict' in str(dtype).lower():
                dicts[name] = MmapDict(os.path.join(dataset.path, name))
        # ====== statistic ====== #
        statistic_able = {i[0]: i[-1] for i in self.features_properties}
        sum1 = defaultdict(int)
        sum2 = defaultdict(int)
        # init PCA
        pca = defaultdict(lambda *args, **kwargs: MiniBatchPCA(
            n_components=None,
            whiten=self.pca_whiten,
            copy=True,
            batch_size=None) if self.pca else None)
        # all data are cached and periodically flushed
        cache = defaultdict(list)
        if self.ncache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.ncache)
        ref_vars = {'start': defaultdict(int), 'processed_count': 0}

        # ====== helper ====== #
        def flush_feature(name, cache_data):
            if len(cache_data) > 0:
                cache_data = np.concatenate(cache_data, 0)
                # NOTE: if nb_samples < nb_features, fitting PCA
                # will cause an error
                if self.pca and statistic_able[name]:
                    pca[name].partial_fit(cache_data)
                # flush data
                if name in dataset:
                    dataset[name].append(cache_data)
                else:
                    dataset[(name, datatype)] = cache_data

        def wrapped_reduce(result):
            name, data = result
            ref_vars['processed_count'] += 1
            # check data
            if not isinstance(data, (tuple, list)):
                data = (data, )
            length = []  # store length of all data for validation
            # processing
            for prop, d in zip(self.features_properties, data):
                n, t, s = prop  # name, dtype, stats-flag
                # mmapdict type:
                if 'dict' in str(t).lower():
                    dicts[n][name] = d.tolist() if isinstance(
                        d, np.ndarray) else d
                    del d
                    continue
                # auto-create new indices
                if len(d) not in length:
                    length.append(len(d))
                    indices[n].append([
                        name, ref_vars['start'][n],
                        ref_vars['start'][n] + len(d)
                    ])
                    ref_vars['start'][n] += len(d)
                # cache data, only if we have more than 0 sample
                if len(d) > 0:
                    cache[n].append(d.astype(t))
                    if self.save_stats and s:  # save stats
                        sum1[n] += np.sum(d, axis=0, dtype='float64')
                        sum2[n] += np.sum(np.power(d, 2),
                                          axis=0,
                                          dtype='float64')
                del d
            # ====== flush cache ====== #
            if ref_vars['processed_count'] % cache_limit == 0:  # 12 + 8
                for i, j in cache.iteritems():
                    flush_feature(i, j)
                cache.clear()
            # ====== update progress ====== #
            return name

        # ====== processing ====== #
        mpi = MPI(self.jobs,
                  self.map,
                  wrapped_reduce,
                  ncpu=ncpu,
                  buffer_size=1,
                  maximum_queue_size=ncpu * 3)
        for name in mpi:
            prog.title = '%-20s' % name
            prog.add(1)
        # ====== end, flush the last time ====== #
        for i, j in cache.iteritems():
            flush_feature(i, j)
        cache = None
        dataset.flush()
        # ====== saving indices ====== #
        for n, ids in indices.iteritems():
            outpath = os.path.join(
                dataset.path,
                'indices' if n in self.primary_indices else 'indices_%s' % n)
            _ = MmapDict(outpath)
            for name, start, end in ids:
                _[name] = (int(start), int(end))
            _.flush()
            _.close()

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, pca, name, dataset):
            N = dataset[name].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - mean**2)
            if self.substitute_nan is not None:
                mean = np.where(np.isnan(mean), self.substitute_nan, mean)
                std = np.where(np.isnan(std), self.substitute_nan, std)
            else:
                assert not np.any(
                    np.isnan(mean)), 'Mean contains NaN, %s' % name
                assert not np.any(np.isnan(std)), 'Std contains NaN, %s' % name
            dataset[name + '_sum1'] = sum1
            dataset[name + '_sum2'] = sum2
            dataset[name + '_mean'] = mean
            dataset[name + '_std'] = std
            dataset[name + '_pca'] = pca

        # save all stats
        if self.save_stats:
            print('Saving statistics of each data ...')
            for n, d, s in self.features_properties:
                if s:  # save stats
                    print(' * Name:', n)
                    s1, s2, pca_ = sum1[n], sum2[n], pca[n]
                    save_mean_std(s1, s2, pca_, n, dataset)
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== all MmapDict flush() ====== #
        for d in dicts.itervalues():
            d.flush()
            d.close()
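
save_mean_std derives the statistics from the running sums sum1 and sum2 rather than from the concatenated data. A small numpy sketch of that streaming computation, on hypothetical chunks, shows it matches the direct calculation:

import numpy as np

# accumulate first- and second-order sums over chunks of features,
# as run() does with sum1 and sum2
chunks = [np.random.rand(100, 13), np.random.rand(50, 13)]  # hypothetical data
sum1 = sum(np.sum(c, axis=0, dtype='float64') for c in chunks)
sum2 = sum(np.sum(np.power(c, 2), axis=0, dtype='float64') for c in chunks)
N = sum(len(c) for c in chunks)

mean = sum1 / N
std = np.sqrt(sum2 / N - mean ** 2)

# same result as computing the statistics over the concatenated data directly
full = np.concatenate(chunks, axis=0)
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(std, full.std(axis=0))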
Example #5
 def transform(self,
               texts,
               mode='seq',
               dtype='int32',
               padding='pre',
               truncating='pre',
               value=0.,
               end_document=None,
               maxlen=None,
               token_not_found='ignore'):
     """
     Parameters
     ----------
     mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
         'binary', abc
         'tfidf', abc
         'count', abc
         'freq', abc
         'seq', abc
     token_not_found: 'ignore', 'raise', a token string, an integer
         pass
     """
     # ====== check arguments ====== #
     texts = self._validate_texts(texts)
     # ====== check mode ====== #
     mode = str(mode)
     if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
         raise ValueError('The "mode" argument must be: "seq", "binary", '
                          '"count", "freq", or "tfidf".')
     # ====== check token_not_found ====== #
     if not isinstance(token_not_found, Number) and \
             not is_string(token_not_found) and \
             token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise", '
                          'an integer token index, or a string '
                          'representing a token.')
     if isinstance(token_not_found, Number):
         token_not_found = int(token_not_found)
     elif token_not_found not in ('ignore', 'raise'):
         token_not_found = int(self.dictionary[token_not_found])
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== Initialize variables ====== #
     dictionary = self.dictionary
     results = []
     # ====== preprocess arguments ====== #
     if isinstance(end_document, str):
         end_document = dictionary.index(end_document)
     elif isinstance(end_document, Number):
         end_document = int(end_document)
     # ====== processing ====== #
     if hasattr(texts, '__len__'):
         target_len = len(texts)
         auto_adjust_len = False
     else:
         target_len = 1208
         auto_adjust_len = True
     prog = Progbar(target=target_len)
     for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
         # found the word in dictionary
         vec = []
         for x in doc:
             idx = dictionary.get(x, -1)
             if idx >= 0:
                 vec.append(idx)
             # token not found in the dictionary
             elif token_not_found == 'ignore':
                 continue
             elif token_not_found == 'raise':
                 raise RuntimeError(
                     'Cannot find token: "%s" in dictionary' % x)
             elif isinstance(token_not_found, int):
                 vec.append(token_not_found)
         # append ending document token
         if end_document is not None:
             vec.append(end_document)
         # add the final results
         results.append(vec)
         # print progress
         if self.print_progress:
             prog.title = "[Transforming] %d docs" % nb_docs
             prog.add(1)
             if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # end the process
     if self.print_progress and auto_adjust_len:
         prog.target = nb_docs
         prog.update(nb_docs)
     # ====== pad the sequence ====== #
     # just transform into sequence of tokens
     if mode == 'seq':
         maxlen = self.longest_document_length if maxlen is None \
             else int(maxlen)
         results = pad_sequences(results,
                                 maxlen=maxlen,
                                 dtype=dtype,
                                 padding=padding,
                                 truncating=truncating,
                                 value=value)
     # transform into one-hot matrix
     else:
         X = np.zeros(shape=(len(results), self.nb_words))
         for i, seq in enumerate(results):
             if mode == 'binary':
                 X[i, seq] = 1
             elif mode == 'freq':
                 length = len(seq)
                 count = freqcount(seq)
                 for tok, n in count.iteritems():
                     X[i, tok] = n / float(length)
             elif mode == 'count':
                 count = freqcount(seq)
                 for tok, n in count.iteritems():
                     X[i, tok] = n
             elif mode == 'tfidf':
                 count = freqcount(seq)
                 for tok, n in count.iteritems():
                     tf = 1 + np.log(n)
                     docs_freq = self._word_dictionary_info.get(
                         tok, (0, 0))[-1]
                     idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                     X[i, tok] = tf * idf
         results = X
     return results
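
In the 'tfidf' branch each token is weighted by tf = 1 + log(count) and idf = log(1 + nb_docs / (1 + docs_freq)). The sketch below rebuilds one such row with plain numpy; the counts, document frequencies, and helper name are hypothetical.

import numpy as np

def tfidf_row(token_counts, docs_freq, nb_docs, nb_words):
    """Build one tf-idf row the way the 'tfidf' branch of transform() does.

    token_counts: dict token_index -> count within the document (hypothetical)
    docs_freq:    dict token_index -> number of documents containing the token
    """
    row = np.zeros(nb_words)
    for tok, n in token_counts.items():
        tf = 1 + np.log(n)
        idf = np.log(1 + nb_docs / (1 + docs_freq.get(tok, 0)))
        row[tok] = tf * idf
    return row

# hypothetical example: 3 tokens out of a 10-word vocabulary, 100 docs in corpus
print(tfidf_row({2: 3, 5: 1, 7: 2}, {2: 40, 5: 90, 7: 5}, nb_docs=100, nb_words=10))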