Example #1
 def writeFile(self, filename=None, password=None):
     if len(self.passwords) < 1:
         print("no data to write to file")
         return False
     if filename is None:
         if self.lastOpened is not None:
             filename = self.lastOpened
         else:
             print("no specified file to write to")
             return False
     if password is None:
         if self.lastPass is not None:
             password = self.lastPass
         else:
             print("no specified password for encryption")
             return False
     try:
         fobj = open(filename, 'w')
         binData = pickle.dumps(self.passwords)
         encData = self.encryptor.encrypt(binData, password)
         #test decryption
         decData = self.encryptor.decrypt(
             Binary(encData.toHex()).resized(len(encData), True), password)
         loadObj = pickle.loads(bytes(decData))
         if not isinstance(loadObj, dict):
             raise pickle.PickleError(
                 "Decryption Test expected encoded dictionary object")
         for key, val in loadObj.items():
             if not isinstance(key, str):
                 raise pickle.PickleError(
                     "Decryption Test detected non-string key")
             if not isinstance(val, str):
                 raise pickle.PickleError(
                     "Decryption Test detected non-string password")
         if loadObj != self.passwords:
             raise Exception(
                 "Decryption Test failed to reproduce password data")
         fobj.write(str(len(encData)))
         fobj.write(encData.toHex())
         fobj.close()
         self.modified = False
         self.lastOpened = filename
         self.lastPass = password
         return True
     except Exception as error:
         print("error writing to file: " + str(filename) + ": " +
               str(error))
         return False
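
Example #1 writes only after a self-check: it encrypts, immediately decrypts, and compares the result with the in-memory dictionary before committing anything to disk. A stripped-down sketch of that verify-before-write idea without the custom encryptor/Binary API (write_verified is a hypothetical helper, not part of the code above):

import pickle

def write_verified(path, passwords):
    # Serialize, verify the round trip, then commit to disk.
    blob = pickle.dumps(passwords)
    restored = pickle.loads(blob)   # stands in for the encrypt/decrypt test above
    if restored != passwords:
        raise pickle.PickleError("round-trip check failed to reproduce password data")
    with open(path, "wb") as fobj:
        fobj.write(blob)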
Example #2
 def __getstate__(self):
     if not self._picklable_safe:
         raise pickle.PickleError("The instance of the workflow engine cannot be serialized, "
         "because it was constructed with custom, user-supplied callbacks. Either use"
         "PickableWorkflowEngine or provide your own __getstate__ method.")
     return {'_store':self._store, '_objects': self._objects,
             '_i': self._i, '_callbacks': {}, 'log': self.log}
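
Example #2 above and Example #6 further down are the two halves of one pattern: __getstate__ whitelists what gets serialized and raises pickle.PickleError when serialization would be unsafe, while __setstate__ validates the restored state. A minimal, self-contained sketch of that pattern (the Engine class and its callbacks attribute are hypothetical, not taken from the workflow engine above):

import pickle

class Engine:
    def __init__(self, callbacks=None):
        self.state = {"i": 0}
        self.callbacks = callbacks or {}   # user-supplied callbacks are often unpicklable

    def __getstate__(self):
        if self.callbacks:
            raise pickle.PickleError("cannot serialize an Engine with custom callbacks")
        return {"state": self.state}       # whitelist only the picklable part

    def __setstate__(self, state):
        self.state = state["state"]
        self.callbacks = {}

pickle.loads(pickle.dumps(Engine()))                   # round-trips fine
try:
    pickle.dumps(Engine(callbacks={"on_run": print}))
except pickle.PickleError as err:
    print("refused to pickle:", err)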
Example #3
        def got_all(results):

            logger.debug("Transfers for job %s finished" % tick)

            values = []
            for success, result in results:
                if not success:
                    if result.check(pickle.PickleError):
                        raise pickle.PickleError(
                            "Failed to unpickle input of %r.%r: %s" %
                            (tick, port, result))
                    else:
                        result.raiseException()
                else:
                    values.append(result)

            inputs = dict(zip(ports, values))

            evalresult = yield threads.deferToThread(run, inputs)

            if not isinstance(evalresult.result, dict) and not isinstance(
                    evalresult.result, failure.Failure):
                raise ValueError(
                    "Evaluation of task %r did not produce a dict or a failure. Got %r."
                    % (task, evalresult.result))

            defer.returnValue(evalresult)
Example #4
    def load(self, filename=""):
        if filename == "":
            filename = "%d.acc" % (self.__accountNumber)
        fh = None
        try:
            fh = open(filename, "rb")
            data = pickle.load(fh)
            all_data = data.strip().split("\n")
            self.__accountNumber = int(all_data[0])
            self.__accountName = all_data[1]
            self.__transacationList = []
            for d in all_data[2:]:
                item = d.strip().split()
                amount = float(item[0])
                date = item[1]
                currency = item[2]
                usd_conversion_rate = float(item[3])
                description = None
                if len(item) >= 5:
                    description = item[4]
                self.__transacationList.append(
                    Transaction(amount, date, currency, usd_conversion_rate,
                                description))

        except (EnvironmentError, pickle.UnpicklingError) as err:
            raise pickle.PickleError(str(err))
        finally:
            if fh is not None:
                fh.close()
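
Examples #4 and #9 funnel both I/O failures (EnvironmentError) and pickle failures into a single pickle.PickleError, so callers only have to handle one exception type. A compact sketch of the same idea with a hypothetical load_account helper, using a with-block instead of manual close (OSError is the modern name for EnvironmentError):

import pickle

def load_account(filename):
    try:
        with open(filename, "rb") as fh:   # closes fh even if pickle.load fails
            return pickle.load(fh)
    except (OSError, pickle.UnpicklingError) as err:
        raise pickle.PickleError(str(err)) from err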
Example #5
 def __getstate__(self):
     raise pickle.PickleError(
         "ScriptModules cannot be deepcopied using copy.deepcopy or saved using torch.save. "
         +
         "Mixed serialization of script and non-script modules is not supported. "
         +
         "For purely script modules use my_script_module.save(<filename>) instead."
     )
Example #6
 def __setstate__(self, state):
     self._store = state['_store']
     self._objects = state['_objects']
     self._i = state['_i']
     self._callbacks = state['_callbacks']
     self.log = state['log']
     if len(self._objects) < self._i[0]:
         raise pickle.PickleError("The workflow instance inconsistent state, too few objects")
     self._unpickled = True
Example #7
 def readFile(self, filename=None, password=None):
     if filename is None:
         if self.lastOpened is not None:
             filename = self.lastOpened
         else:
             print("no specified file to read")
             return False
     if password is None:
         if self.lastPass is not None:
             password = self.lastPass
         else:
             print("no specified password for encryption")
             return False
     try:
         fobj = open(filename, 'r')
         length, binData = fobj.read().split('0x')
         if not length.isdigit():
             raise TypeError("Expected integer length precursing the data")
         decData = self.encryptor.decrypt(
             Binary("0x" + binData).resized(int(length), True), password)
         loadObj = pickle.loads(bytes(decData))
         if not isinstance(loadObj, dict):
             raise pickle.PickleError("expected encoded dictionary object")
         for key, val in loadObj.items():
             if not isinstance(key, str):
                 raise pickle.PickleError("non-string key detected")
             if not isinstance(val, str):
                 raise pickle.PickleError("non-string password detected")
         for key, val in loadObj.items():
             self.passwords[key] = val
         fobj.close()
         self.lastOpened = filename
         self.lastPass = password
         return True
     except pickle.PickleError as error:
         if DEBUG:
             print("error decrypting file: " + str(error))
         else:
             print("Incorrect Password!")
         return False
     except Exception as error:
         print("error loading file: " + str(filename) + ": " + str(error))
         return False
Example #8
    def __getstate__(self):
        """
		Pickles a ZvitWriter.
		
		A ZvitWriter may not be pickled. It is inherently a transient, local
		object. Moreover, pickling directly or indirectly the path to which it
		logs ties a snapshot to a particular path. This is highly undesirable;
		Snapshots should be path-independent.
		
		The ZvitWriter should be considered a write-only interface to the true
		state, which is the .zvit file in the log directory.
		"""
        self.flush()
        raise pkl.PickleError("Cannot pickle a ZvitWriter!")
Example #9
 def save(self, filename=""):
     if filename == "":
         filename = "%d.acc" % (self.__accountNumber)
     fh = None
     try:
         data = str(self.__accountNumber) + "\n" + str(
             self.__accountName) + "\n"
         for t in self.__transacationList:
             data += str(t)
             data += "\n"
         fh = open(filename, "wb")
         pickle.dump(data, fh, pickle.HIGHEST_PROTOCOL)
     except (EnvironmentError, pickle.PicklingError) as err:
         raise pickle.PickleError(str(err))
     finally:
         if fh is not None:
             fh.close()
Example #10
    def dumps(self):
        try:
            return self._dumps(self.err)
        except settings.PICKLE_ERRORS as exc:
            logger.warning(exc, exc_info=exc)

            return self._dumps(exc)
        except BaseException as exc:
            logger.critical(exc, exc_info=exc)

            try:
                return self._dumps(exc)
            except BaseException as exc:
                logger.critical(exc, exc_info=exc)

                exc_tb = traceback.format_exception(
                    etype=type(exc),
                    value=exc,
                    tb=exc.__traceback__,
                )

                return self._dumps(pickle.PickleError(''.join(exc_tb)))
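
Example #10 layers its fallbacks: serialize the stored error, then the exception that interrupted serialization, and finally a plain pickle.PickleError carrying the formatted traceback, which always pickles because it is just text. A condensed standalone sketch of that last-resort step (dumps_exception and Unpicklable are hypothetical names, not from the code above):

import pickle
import traceback

def dumps_exception(exc):
    try:
        return pickle.dumps(exc)
    except BaseException:
        # Fall back to a PickleError that carries the traceback as plain text.
        tb_text = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
        return pickle.dumps(pickle.PickleError(tb_text))

class Unpicklable(Exception):
    def __reduce__(self):
        raise TypeError("refusing to be pickled")

payload = dumps_exception(Unpicklable("boom"))
print(pickle.loads(payload))   # prints the fallback PickleError with the traceback text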
Example #11
    def __reduce__(self):
        """Used by Pickle to serialize the PmmlBinding.
        
        This serialization includes the associated ModelLoader's
        schema and tag-to-class mapping, so that the same classes are
        loaded when the pickled object is reconstituted.  This is not
        guaranteed by XML or JSON serialization.  The ModelLoader is
        only stored once per Pickle string.

        The serialization format for the PMML itself is compressed
        XML, with Gzip compression level 1 (fastest).

        @raise PickleError: If this PmmlBinding does not have a reference to the
            ModelLoader that made it (e.g. if a Python reference to the object was
            lost and the object was reconstituted by lxml from its C extension's
            internal data structure), then it cannot be pickled.  To resolve this
            problem, simply set C{pmmlBinding.modelLoader} to the desired
            ModelLoader object.
        """

        try:
            modelLoader = self.modelLoader
        except AttributeError:
            raise pickle.PickleError(
                "PmmlBinding instances can only be pickled if they have a .modelLoader attribute pointing back to the ModelLoader that would reconstitute them"
            )
        buff = StringIO()
        ElementTree(self).write(buff, compression=defs.PICKLE_XML_COMPRESSION)
        return _PmmlBinding_unserialize, (modelLoader, buff.getvalue())
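
Example #11's __reduce__ returns a module-level reconstructor plus an argument tuple, which is how pickle re-creates objects that cannot be rebuilt from a plain state dict. A tiny self-contained sketch of that protocol (Point and _rebuild_point are hypothetical stand-ins for PmmlBinding and _PmmlBinding_unserialize):

import pickle

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

    def __reduce__(self):
        # pickle stores the callable plus its argument tuple and calls it on load
        return _rebuild_point, (self.x, self.y)

def _rebuild_point(x, y):
    # module-level reconstructor, analogous to _PmmlBinding_unserialize above
    return Point(x, y)

p = pickle.loads(pickle.dumps(Point(1, 2)))
print(p.x, p.y)   # 1 2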
Example #12
    def __init__(self,
                 sentences=None,
                 min_count=5,
                 threshold=10.0,
                 max_vocab_size=40000000,
                 delimiter=b'_',
                 progress_per=10000,
                 scoring='default',
                 common_terms=frozenset(),
                 doc2vec=False):
        """
        Parameters
        ----------
        sentences : iterable of list of str, optional
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        min_count : float, optional
            Ignore all words and bigrams with total collected count lower than this value.
        threshold : float, optional
            Represent a score threshold for forming the phrases (higher means fewer phrases).
            A phrase of words `a` followed by `b` is accepted if the score of the phrase is greater than threshold.
            Heavily depends on concrete scoring-function, see the `scoring` parameter.
        max_vocab_size : int, optional
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
            Two built-in scoring functions are available by setting `scoring` to a string:
            #. "default" - :func:`~gensim.models.phrases.original_scorer`.
            #. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
        common_terms : set of str, optional
            List of "stop words" that won't affect frequency count of expressions containing them.
            Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".
        Notes
        -----
        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring
        as described by `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
        <https://arxiv.org/abs/1310.4546>`_.
        To use a custom scoring function, pass in a function with the following signature:
        * worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored
        * wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored
        * bigram_count - number of occurrences in `sentences` of the whole bigram
        * len_vocab - the number of unique tokens in `sentences`
        * min_count - the `min_count` setting of the Phrases class
        * corpus_word_count - the total number of tokens (non-unique) in `sentences`
        The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
        The scoring function **must be pickleable**.
        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError(
                "threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError(
                "threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' %
                                 (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab',
            'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0]
                   for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError(
                    'scoring function missing expected parameters')

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(
            int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)
        self.doc2vec = doc2vec

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError(
                'unable to pickle custom Phrases scoring function')
        finally:
            del test_pickle
            del load_pickle

        if sentences is not None:
            self.add_vocab(sentences)
Example #13
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
                 scoring='default', common_terms=frozenset(),custom_bigrams=None,ignore_list = None):
        """
        Initialize the model from an iterable of `sentences`. Each sentence must be
        a list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        without storing everything in RAM. See :class:`BrownCorpus`,
        :class:`Text8Corpus` or :class:`LineSentence` in the :mod:`gensim.models.word2vec`
        module for such examples.

        `min_count` ignore all words and bigrams with total collected count lower
        than this.

        `threshold` represents a score threshold for forming the phrases (higher means
        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
        phrase is greater than threshold. see the `scoring` setting.

        `max_vocab_size` is the maximum size of the vocabulary. Used to control
        pruning of less common words, to keep memory under control. The default
        of 40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
        on how much available memory you have.

        `delimiter` is the glue character used to join collocation tokens, and
        should be a byte string (e.g. b'_').

        `scoring` specifies how potential phrases are scored for comparison to the `threshold`
        setting. `scoring` can be set with either a string that refers to a built-in scoring function,
        or with a function with the expected parameter names. Two built-in scoring functions are available
        by setting `scoring` to a string:

        'default': from "Efficient Estimaton of Word Representations in Vector Space" by
                   Mikolov, et. al.:
                   (count(worda followed by wordb) - min_count) * N /
                   (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
                Information in Collocation Extraction" by Gerlof Bouma:
                ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) /
                - ln(prop(worda followed by wordb)
                where prop(n) is the count of n / the count of everything in the entire corpus.

        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default.

        To use a custom scoring function, create a function with the following parameters and set the `scoring`
        parameter to the custom function. You must use all the parameters in your function call, even if the
        function does not require all the parameters.

            worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
            wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
            bigram_count: number of occurrences in `sentences` of the phrase being scored
            len_vocab: the number of unique tokens in `sentences`
            min_count: the `min_count` setting of the Phrases class
            corpus_word_count: the total number of (non-unique) tokens in `sentences`

        A scoring function without any of these parameters (even if the parameters are not used) will
        raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.

        `common_terms` is an optional list of "stop words" that won't affect frequency count
        of expressions containing them.
        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError("threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError("threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' % (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError('scoring function missing expected parameters')

        self.min_count = min_count
        self.ignore_list = ignore_list
        self.custom_bigrams = custom_bigrams
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError('unable to pickle custom Phrases scoring function')
        finally:
            del test_pickle
            del load_pickle

        if sentences is not None:
            self.add_vocab(sentences)
Example #14
    def __init__(self,
                 shots=None,
                 time_windows=None,
                 device="DIIID",
                 probes="DIIID_toroidal_mag",
                 fft_settings=None,
                 datamining_settings=None,
                 n_cpus=1,
                 _from_pickle=False,
                 _pickle_data=None):
        if not _from_pickle:
            # This executes the 'normal' construction of an Analysis object.
            if shots is None:
                shots = 159243
            if type(shots) == int:
                shots = [shots]
            if time_windows is None:
                time_windows = list(itertools.repeat([300, 1400], len(shots)))
            elif type(time_windows) is list:
                time_windows = list(itertools.repeat(time_windows, len(shots)))
            elif type(time_windows[0]) is not list:
                time_windows = [time_windows]
            self.shot_info = {
                "shots": shots,
                "time_windows": time_windows,
                "device": device,
                "probes": probes
            }
            self.fft_settings = fft_settings if fft_settings is not None else \
                {"n_pts": 8, "lower_freq": 10, "upper_freq": 250, "cutoff_by": "sigma_eq",
                 "ave_kappa_cutoff": 25, "filter_item": "EM_VMM_kappas"}
            self.datamining_settings = datamining_settings if datamining_settings is not None else \
                {'n_clusters': 16, 'n_iterations': 20, 'start': 'k_means',
                 'verbose': 0, 'method': 'EM_VMM', "seeds": None}
            self.n_cpus = n_cpus

            mag_iter = itertools.izip(
                itertools.repeat(self), self.shot_info["shots"],
                self.shot_info["time_windows"],
                itertools.repeat(self.shot_info["probes"]))
            self.mags = self.mp_acquire(func=mag_pickle_workaround,
                                        iter=mag_iter)
            fft_iter = itertools.izip(itertools.repeat(self),
                                      self.shot_info["shots"])

            self.raw_ffts = self.mp_acquire(func=fft_pickle_workaround,
                                            iter=fft_iter)
            raw_mirnov_iter = itertools.izip(
                itertools.repeat(self),
                self.shot_info["shots"])  # Add shots to avoid endless loops
            self.raw_mirnov_datas = self.mp_acquire(
                func=mirnov_pickle_workaround, iter=raw_mirnov_iter)
            raw_times_iter = itertools.izip(itertools.repeat(self),
                                            self.shot_info["shots"])
            self.raw_times = self.mp_acquire(times_pickle_workaround,
                                             iter=raw_times_iter)
            # self.mags = self.return_mags()
            # self.raw_ffts = self.return_raw_ffts()
            # self.raw_mirnov_datas = self.return_raw_mirnov_datas()
            # self.raw_times = self.return_raw_times()
        else:
            # This executes the construction of an Analysis object from a previously saved
            # pickle file. That file must have been saved using the save method.
            try:
                self.shot_info = _pickle_data["shot_info"]
                self.fft_settings = _pickle_data["fft_settings"]
                self.datamining_settings = _pickle_data["datamining_settings"]
                self.n_cpus = _pickle_data["n_cpus"]
                self.mags = _pickle_data["mags"]
                self.raw_ffts = _pickle_data["raw_ffts"]
                self.raw_mirnov_datas = _pickle_data["raw_mirnov_datas"]
                self.raw_times = _pickle_data["raw_times"]
            except Exception:
                raise pickle.PickleError("Incorrect pickle file format.")
        return
Example #15
    def __init__(
            self, sentences=None, min_count=5, threshold=10.0,
            max_vocab_size=40000000, delimiter='_', progress_per=10000,
            scoring='default', connector_words=frozenset(),
        ):
        """

        Parameters
        ----------
        sentences : iterable of list of str, optional
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        min_count : float, optional
            Ignore all words and bigrams with total collected count lower than this value.
        threshold : float, optional
            Represent a score threshold for forming the phrases (higher means fewer phrases).
            A phrase of words `a` followed by `b` is accepted if the score of the phrase is greater than threshold.
            Heavily depends on concrete scoring-function, see the `scoring` parameter.
        max_vocab_size : int, optional
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens.
        scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
            Two built-in scoring functions are available by setting `scoring` to a string:

            #. "default" - :func:`~gensim.models.phrases.original_scorer`.
            #. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
        connector_words : set of str, optional
            Set of words that may be included within a phrase, without affecting its scoring.
            No phrase can start nor end with a connector word; a phrase may contain any number of
            connector words in the middle.

            **If your texts are in English, set** ``connector_words=phrases.ENGLISH_CONNECTOR_WORDS``.

            This will cause phrases to include common English articles, prepositions and
            conjunctions, such as `bank_of_america` or `eye_of_the_beholder`.

            For other languages or specific applications domains, use custom ``connector_words``
            that make sense there: ``connector_words=frozenset("der die das".split())`` etc.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> # Load corpus and train a model.
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
            >>>
            >>> # Use the model to detect phrases in a new sentence.
            >>> sent = [u'trees', u'graph', u'minors']
            >>> print(phrases[sent])
            [u'trees_graph', u'minors']
            >>>
            >>> # Or transform multiple sentences at once.
            >>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
            >>> for phrase in phrases[sents]:
            ...     print(phrase)
            [u'trees_graph', u'minors']
            [u'graph_minors']
            >>>
            >>> # Export a FrozenPhrases object that is more efficient but doesn't allow any more training.
            >>> frozen_phrases = phrases.freeze()
            >>> print(frozen_phrases[sent])
            [u'trees_graph', u'minors']

        Notes
        -----

        The ``scoring="npmi"`` is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default ``scoring="default"``.
        The default is the PMI-like scoring as described in `Mikolov, et. al: "Distributed
        Representations of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.

        To use your own custom ``scoring`` function, pass in a function with the following signature:

        * ``worda_count`` - number of corpus occurrences in `sentences` of the first token in the bigram being scored
        * ``wordb_count`` - number of corpus occurrences in `sentences` of the second token in the bigram being scored
        * ``bigram_count`` - number of occurrences in `sentences` of the whole bigram
        * ``len_vocab`` - the number of unique tokens in `sentences`
        * ``min_count`` - the `min_count` setting of the Phrases class
        * ``corpus_word_count`` - the total number of tokens (non-unique) in `sentences`

        The scoring function must accept all these parameters, even if it doesn't use them in its scoring.

        The scoring function **must be pickleable**.

        """
        super().__init__(connector_words=connector_words)
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError("threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError("threshold should be between -1 and 1 for npmi scoring")

        # Set scoring based on string.
        # Intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block.
        if isinstance(scoring, str):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError(f'unknown scoring method string {scoring} specified')

        scoring_params = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count',
        ]
        if callable(scoring):
            missing = [param for param in scoring_params if param not in getargspec(scoring)[0]]
            if not missing:
                self.scoring = scoring
            else:
                raise ValueError(f'scoring function missing expected parameters {missing}')

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0

        # Ensure picklability of the scorer.
        try:
            pickle.loads(pickle.dumps(self.scoring))
        except pickle.PickleError:
            raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able')

        if sentences is not None:
            self.add_vocab(sentences)
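
The pickle.loads(pickle.dumps(self.scoring)) round trip above is a picklability probe: a module-level function pickles by reference, while a lambda or local closure does not, and the probe surfaces that as a PicklingError (a subclass of PickleError) at construction time rather than when the model is saved. A standalone illustration (original_scorer_stub is a hypothetical stand-in, not gensim's actual scorer):

import pickle

def original_scorer_stub(worda_count, wordb_count, bigram_count,
                         len_vocab, min_count, corpus_word_count):
    return (bigram_count - min_count) / (worda_count * wordb_count) * len_vocab

pickle.loads(pickle.dumps(original_scorer_stub))   # fine: pickled by module + name

try:
    pickle.dumps(lambda worda_count, **rest: worda_count)
except pickle.PickleError as err:                  # PicklingError subclasses PickleError
    print("custom scorer rejected:", err)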
Example #16
 def __getstate__(self):
     raise pickle.PickleError()
Example #17
	def __getstate__(self):
		self.flush()
		raise pkl.PickleError("Cannot pickle a ZvitWriter!")