    def transform(self, X, y=None):
        """ Transforms the list of documents and returns tokens with their features.
            Each document should represent a sentence.
        """
        log.info("Generating features for {} documents...".format(len(X)))
        features = []
        for doc in X:
            doc_features = []
            for token in document_to_tokens(doc):
                if token in self.model.wv:
                    doc_features.append((token, self.model.wv[token]))
            features.append(doc_features)
        return features
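# A minimal usage sketch (not from the original source): what transform()
# produces, illustrated with a plain gensim Word2Vec model in place of the
# extractor object. Assumes the gensim 3.x API used throughout this code
# (`size` keyword, vectors stored under model.wv).
from gensim.models import Word2Vec

sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
model = Word2Vec(sentences, size=10, min_count=1)

# For each "document" (a tokenized sentence), keep (token, vector) pairs
# for tokens that made it into the trained vocabulary.
features = [[(tok, model.wv[tok]) for tok in sent if tok in model.wv]
            for sent in sentences]
print(features[0][0][0], features[0][0][1].shape)  # e.g. 'the' (10,)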
    def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5, sample=1e-3, skipgram=False, min_n=3, max_n=6):
        """ Trains a Word2vec model on given documents.
            Each document should represent a sentence.
        Args:
            X: list(Document | AnnotatedDocument | list(str))
            y: optional labels
            size: Size of embeddings to be learnt (Default 100), i.e. word vector dimensionality
            min_count: Minimum word count. Ignore words with number of occurrences below this (Default 5).
            workers: Number of threads to run in parallel
            window: Context window size
            sample: Threshold for downsampling higher-frequency words (Default 0.001)
            skipgram: Use skip-gram if True and CBOW otherwise
            min_n: min length of char ngrams (Default 3)
            max_n: max length of char ngrams (Default 6)
        """
        log.info("Checking parameters...")
        self.config.set_parameters({
            "size": size,
            "min_count": min_count,
            "workers": workers,
            "window": window,
            "sample": sample,
            "min_n": min_n,
            "max_n": max_n
        })
        self.config.validate()
        # Get sentences as lists of tokens
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(document_to_tokens(doc))
            log_progress(log, idx, len(X))
        # Initialize and train the model (this will take some time)
        log.info("Training FastText on {} sentences...".format(len(X)))
        self.model = FastText(
            sentences,
            workers=self.config.get_parameter("workers"),
            size=self.config.get_parameter("size"),
            min_count=self.config.get_parameter("min_count"),
            window=self.config.get_parameter("window"),
            sample=self.config.get_parameter("sample"),
            sg=1 if skipgram else 0,
            min_n=self.config.get_parameter("min_n"),
            max_n=self.config.get_parameter("max_n"))

        # If you don't plan to train the model any further, calling
        # init_sims() will make the model much more memory-efficient.
        self.model.init_sims(replace=True)
        return self
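# A hedged sketch (assumption, not taken from the source repository): the direct
# gensim call that the FastText fit() above wraps, with the same default
# hyperparameters. Written against gensim 3.x, where the argument is `size`;
# gensim 4.x renames it to `vector_size` and drops init_sims().
from gensim.models import FastText

sentences = [["machine", "learning"], ["deep", "learning", "models"]]
ft = FastText(sentences, size=50, min_count=1, workers=1,
              window=5, sample=1e-3, sg=1, min_n=3, max_n=6)

# Because FastText composes word vectors from character n-grams (min_n..max_n),
# it can return an embedding even for a word that never appeared in training.
print(ft.wv["learnings"].shape)  # out-of-vocabulary lookup, e.g. (50,)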
    def fit(self,
            X,
            y=None,
            size=100,
            min_count=5,
            workers=1,
            window=5,
            sample=1e-3,
            skipgram=False):
        """ Trains a Word2vec model on given documents.
            Each document should represent a sentence.
        Args:
            X: list(Document | AnnotatedDocument | list(str))
            y: optional labels
            size: Word vector dimensionality (Default 100)
            min_count: Minimum word count; words occurring fewer times are ignored (Default 5)
            workers: Number of threads to run in parallel
            window: Context window size
            sample: Threshold for downsampling higher-frequency words (Default 0.001)
            skipgram: Use skip-gram if True and CBOW otherwise
        """
        log.info("Checking parameters...")
        self.config.set_parameters({
            "size": size,
            "min_count": min_count,
            "workers": workers,
            "window": window,
            "sample": sample
        })
        # Get sentences as lists of tokens
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(document_to_tokens(doc))
            log_progress(log, idx, len(X))
        # Initialize and train the model (this will take some time)
        log.info("Training Word2vec on {} sentences...".format(len(X)))
        self.model = Word2Vec(sentences,
                              workers=self.config.get_parameter("workers"),
                              size=self.config.get_parameter("size"),
                              min_count=self.config.get_parameter("min_count"),
                              window=self.config.get_parameter("window"),
                              sample=self.config.get_parameter("sample"),
                              sg=1 if skipgram else 0)

        # If you don't plan to train the model any further, calling
        # init_sims() will make the model much more memory-efficient.
        self.model.init_sims(replace=True)
        return self
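# A hedged sketch (assumption): the equivalent direct gensim 3.x call for the
# Word2Vec fit() above, including the skip-gram/CBOW switch and the
# init_sims(replace=True) step that trades further trainability for memory.
from gensim.models import Word2Vec

sentences = [["natural", "language", "processing"], ["language", "models"]]
w2v = Word2Vec(sentences, size=100, min_count=1, workers=1,
               window=5, sample=1e-3, sg=1)  # sg=1: skip-gram, sg=0: CBOW

# Only safe when no further training is planned: raw vectors are replaced
# with their L2-normalized versions.
w2v.init_sims(replace=True)
print(w2v.wv.most_similar("language", topn=1))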
    def fit(self,
            X,
            y=None,
            size=100,
            min_count=5,
            workers=1,
            window=5,
            sample=1e-3,
            skipgram=False,
            min_n=3,
            max_n=6):
        """ Trains word, character, and part-of-speech embeddings
            (see Char2VecFeatureExtractor for the description of arguments).
        """
        # Get sentences as lists of tokens
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(document_to_tokens(doc))
            log_progress(log, idx, len(X))
        self.word_vectorizer.fit(sentences,
                                 y,
                                 size=size,
                                 min_count=min_count,
                                 workers=workers,
                                 window=window,
                                 sample=sample,
                                 skipgram=skipgram)
        self.pos_vectorizer.fit(sentences,
                                y,
                                size=size,
                                min_count=min_count,
                                workers=workers,
                                window=window,
                                sample=sample,
                                skipgram=skipgram)
        self.char_vectorizer.fit(sentences,
                                 y,
                                 size=size,
                                 min_count=min_count,
                                 workers=workers,
                                 window=window,
                                 sample=sample,
                                 skipgram=skipgram,
                                 min_n=min_n,
                                 max_n=max_n)
        return self
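# A hedged sketch (assumption): the general pattern the combined fit() above
# follows -- train separate embedding models over the same tokenized sentences
# and concatenate their vectors per token at feature time. Here a word-level
# Word2Vec stands in for word_vectorizer and a character-aware FastText for
# char_vectorizer; a POS vectorizer would follow the same pattern over
# POS-tag sequences.
import numpy as np
from gensim.models import Word2Vec, FastText

sentences = [["protein", "binding", "site"], ["binding", "affinity"]]
word_model = Word2Vec(sentences, size=20, min_count=1)
char_model = FastText(sentences, size=10, min_count=1, min_n=3, max_n=6)

token = "binding"
combined = np.concatenate([word_model.wv[token], char_model.wv[token]])
print(combined.shape)  # (30,) -- word-level and character-level features stacked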