Example #1
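The method body below refers to PathLineSentences, Corpus, Glove, and time(), which would normally be imported at module level but are not shown in the snippet. A plausible set of imports, inferred from those calls (an assumption, since the surrounding module is not included), is:

# Assumed module-level imports for this method (not part of the original snippet)
from time import time
from gensim.models.word2vec import PathLineSentences  # reads a single file or a directory of files
from glove import Corpus, Glove  # from the glove_python package
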
  def train(self, epochs=30,no_threads=None):
    """
    Train the model with your own data.
    Supports a single corpus, multiple corpora, or a dataframe column.

    Parameters:
    -----------
    model_name (optional) : preferred model name (taken from the instance as self.model_name)
    epochs : int : total number of training epochs
    no_threads (optional) : int : number of threads for training; defaults to the number of CPU cores

    Example
    --------
    >>> from ekushey.feature_extraction import BN_GloVe

    #Training Against Sentences
    >>> glv = BN_GloVe(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> glv.train()

    #Training Against one Text Corpus
    >>> glv = BN_GloVe(corpus_file="path_to_corpus.txt")
    >>> glv.train()

    #Training Against Multiple Corpora
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt

    >>> glv = BN_GloVe(corpus_path="path/corpus")
    >>> glv.train(epochs=25)

    #Training Against a Dataframe Column

    >>> glv = BN_GloVe(df=news_data['text_content'])
    >>> glv.train(epochs=25)



    """
    if not self.sentences and not self.corpus_file and not self.corpus_path and self.df is None:
      raise ValueError('No data given: pass sentences, corpus_file, corpus_path, or df.')
    elif self.sentences:
      # Pre-tokenised sentences: a list of lists of tokens
      data = self.sentences
      print("Got sentences")
    elif self.corpus_file:
      print("Got corpus file")
      data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
      print("Got corpus directory")
      data = PathLineSentences(self.corpus_path)
    elif self.df is not None:
      print("Got dataframe column")
      # Join the rows, split back into lines, then tokenise each line on whitespace
      data = '\n'.join(self.df)
      data = data.split('\n')
      data = [sent.split() for sent in data]
    else:
      print("Unexpected error occurred: please check your data file again.")

    if no_threads is None:
      # Fall back to the core count stored on the instance
      no_threads = self.cpu_cores

    t = time()
    # Build the word co-occurrence matrix from the tokenised data
    corpus = Corpus()
    corpus.fit(data, window=self.window)
    print('Dict size: %s' % len(corpus.dictionary))
    # Fit the GloVe vectors on the co-occurrence matrix
    glove = Glove(no_components=self.size, learning_rate=self.n)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    # Attach the dictionary so vectors can be looked up by word
    glove.add_dictionary(corpus.dictionary)
    print("Saving model to current directory")
    glove.save(self.model_name)
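
Once train() has finished, the saved model can be reloaded and sanity-checked by querying nearest neighbours. This is a minimal sketch assuming the model was written by glove_python's Glove.save(), as in the code above; the file name 'glove_model' and the query word are illustrative placeholders.

# Minimal sketch: reload the saved GloVe model and query nearest neighbours
# (assumes glove_python; 'glove_model' is a placeholder file name)
from glove import Glove

glv_model = Glove.load('glove_model')
# most_similar() returns (word, similarity) pairs for the closest vectors
print(glv_model.most_similar('বাংলা', number=5))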
Example #2
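As in the previous example, the module-level imports are not part of the snippet. A plausible set, inferred from the calls below (an assumption, since the surrounding module is not included), is:

# Assumed module-level imports for this method (not part of the original snippet)
import multiprocessing
from time import time
from gensim.models import FastText
from gensim.models.word2vec import PathLineSentences  # reads a single file or a directory of files
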
  def train(self, epochs=30):
    """
    Train the model with your own data.
    Supports a single corpus, multiple corpora, or a dataframe column.

    Parameters:
    -----------
    model_name (optional) : preferred model name (taken from the instance as self.model_name)
    epochs : int : total number of training epochs

    Example
    --------
    >>> from ekushey.feature_extraction import BN_FastText

    #Training Against Sentences
    >>> ft = BN_FastText(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> ft.train()

    #Training Against one Text Corpus
    >>> ft = BN_FastText(corpus_file="path_to_corpus.txt")
    >>> ft.train()

    #Training Against Multiple Corpora
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt

    >>> ft = BN_FastText(corpus_path="path/corpus")
    >>> ft.train(epochs=25)

    #Training Against a Dataframe Column

    >>> ft = BN_FastText(df=news_data['text_content'])
    >>> ft.train(epochs=25)

"""
    if not self.sentences and not self.corpus_file and not self.corpus_path and self.df is None:
      raise ValueError('No data given: pass sentences, corpus_file, corpus_path, or df.')
    elif self.sentences:
      # Pre-tokenised sentences: a list of lists of tokens
      data = self.sentences
    elif self.corpus_file:
      data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
      data = PathLineSentences(self.corpus_path)
    elif self.df is not None:
      # Join the rows, split back into lines, then tokenise each line on whitespace
      data = '\n'.join(self.df)
      data = data.split('\n')
      data = [sent.split() for sent in data]
    else:
      print("Unexpected error occurred: please check your data file again.")

    # Note: cpu_cores is computed but unused; the worker count comes from self.workers
    cpu_cores = multiprocessing.cpu_count()
    # Initialise gensim's FastText with the hyper-parameters stored on the instance
    ft_model = FastText(
                        size=self.size,
                        alpha=self.alpha,
                        window=self.window,
                        min_count=self.min_count,
                        max_vocab_size=self.max_vocab_size,
                        sample=self.sample,
                        workers=self.workers,
                        min_alpha=self.min_alpha,
                        sg=self.sg,
                        negative=self.negative
                       )

 
    print("Working with "+str(self.workers)+" worker threads")
    ft_model.build_vocab(data,  progress_per=10000)
    print("Vocabulary build Successfully")
    t=time()
    ft_model.train(data, total_examples=ft_model.corpus_count, epochs=epochs, report_delay=1)
    print('Training took : {} mins'.format(round((time() - t) / 60, 2)))
    ft_model.save(self.model_name)
    print(ft_model)
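
As with the GloVe example, the saved FastText model can be reloaded and queried to confirm that training worked. This is a minimal sketch assuming gensim's FastText was used for training, as in the code above; the file name 'fasttext_model' and the query word are illustrative placeholders.

# Minimal sketch: reload the saved FastText model and query nearest neighbours
# (assumes gensim; 'fasttext_model' is a placeholder file name)
from gensim.models import FastText

ft_model = FastText.load('fasttext_model')
# wv.most_similar() returns (word, similarity) pairs for the closest vectors
print(ft_model.wv.most_similar('বাংলা', topn=5))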