def __init__(self, normalize_text=default_normalize_text, spellcor_count=0, spellcor_dist=2,
             n_words=10000000, min_df=0, max_df=1.0, raw_min_df=-1, procs=0,
             minibatch_size=20000, stemmer=None, pos_tagger=None, extractor=None,
             timeout=600, use_sc=False, method="multiprocessing", verbose=1):
    """Set up the batcher, empty vocabulary state and text-processing options.

    Args:
        normalize_text: Stored unchanged as ``self.normalize_text``.
        spellcor_count: Spelling-correction count setting. If either this or
            ``spellcor_dist`` equals 0, both are stored as zero.
        spellcor_dist: Spelling-correction distance setting (see above).
        n_words: Stored as ``self.n_words``.
        min_df: Stored as ``self.min_df``; also used for ``self.raw_min_df``
            when ``raw_min_df`` is -1.
        max_df: Stored as ``self.max_df``.
        raw_min_df: Stored as ``self.raw_min_df``; -1 means "use ``min_df``".
        procs: Forwarded to ``batcher.Batcher``; 0 is replaced by
            ``multiprocessing.cpu_count()``.
        minibatch_size: Forwarded to ``batcher.Batcher``.
        stemmer: Stored as ``self.stemmer``.
        pos_tagger: Stored as ``self.pos_tagger``.
        extractor: Passed to ``self.set_extractor`` as the final step.
        timeout: Forwarded to ``batcher.Batcher``.
        use_sc: Stored as ``self.use_sc`` and forwarded to ``batcher.Batcher``.
        method: Forwarded to ``batcher.Batcher``.
        verbose: Stored as ``self.verbose`` and forwarded to ``batcher.Batcher``.
    """
    # A request for 0 processes means "one per CPU reported by the OS".
    worker_count = multiprocessing.cpu_count() if procs == 0 else procs

    self.verbose = verbose
    self.use_sc = use_sc
    # Batcher takes its settings positionally, in this exact order.
    self.batcher = batcher.Batcher(worker_count, minibatch_size, timeout, use_sc, method, verbose)

    # Vocabulary state starts out empty and unfrozen.
    self.dictionary_freeze = False
    self.dictionary = {}
    self.dft = Counter()
    self.raw_dft = Counter()
    self.preserve_raw_dft = False

    self.normalize_text = normalize_text

    # The two spelling-correction settings are switched off together:
    # a zero in one of them zeroes the other.
    effective_count, effective_dist = spellcor_count, spellcor_dist
    if effective_count == 0:
        effective_dist = 0
    elif effective_dist == 0:
        effective_count = 0
    self.spellcor_count = effective_count
    self.spellcor_dist = effective_dist

    self.stemmer = stemmer
    # -1 is the sentinel for "same threshold as min_df".
    self.raw_min_df = min_df if raw_min_df == -1 else raw_min_df
    self.pos_tagger = pos_tagger

    self.doc_count = 0
    self.n_words = n_words
    self.min_df = min_df
    self.max_df = max_df

    # Runs last so every attribute above is already set when it is called.
    self.set_extractor(extractor)
def __init__(self, normalize_text=default_normalize_text, max_words=10000000, min_df=0,
             max_df=1.0, spellcor_count=0, spellcor_dist=2, raw_min_df=-1, stemmer=None,
             extractor=None, procs=0, minibatch_size=20000, timeout=600, spark_context=None,
             freeze=False, method="multiprocessing", verbose=1):
    """Build the batcher and the normalizer, dictionary and tokenizer stages.

    Args:
        normalize_text: Callable wrapped in ``apply.Apply`` and stored as
            ``self.normalize_text``; ``None`` stores ``None`` instead.
        max_words: Forwarded to ``dictionary.Dictionary``.
        min_df: Forwarded to ``dictionary.Dictionary``.
        max_df: Forwarded to ``dictionary.Dictionary``.
        spellcor_count: Forwarded to ``tokenizer.Tokenizer``; a value above 0
            causes a tokenizer to be created.
        spellcor_dist: Forwarded to ``tokenizer.Tokenizer``.
        raw_min_df: Forwarded to ``tokenizer.Tokenizer``.
        stemmer: Forwarded to ``tokenizer.Tokenizer``; any value other than
            ``None`` causes a tokenizer to be created.
        extractor: Passed to ``self.set_extractor``.
        procs: Forwarded to ``batcher.Batcher``; 0 is replaced by
            ``multiprocessing.cpu_count()``.
        minibatch_size: Forwarded to ``batcher.Batcher``.
        timeout: Forwarded to ``batcher.Batcher``.
        spark_context: Forwarded to ``batcher.Batcher``.
        freeze: Stored as ``self.freeze``.
        method: Forwarded to ``batcher.Batcher``.
        verbose: Stored as ``self.verbose`` and forwarded to the batcher,
            dictionary and tokenizer.
    """
    # Transformer modules are imported at call time, as in the original.
    # NOTE(review): presumably this avoids an import cycle -- confirm before
    # hoisting these to the top of the file.
    import wordbatch.transformers.apply as apply
    import wordbatch.transformers.dictionary as dictionary
    import wordbatch.transformers.tokenizer as tokenizer

    # A request for 0 processes means "one per CPU reported by the OS".
    if procs == 0:
        procs = multiprocessing.cpu_count()

    self.verbose = verbose
    self.batcher = batcher.Batcher(procs=procs, minibatch_size=minibatch_size, timeout=timeout,
                                   spark_context=spark_context, method=method, verbose=verbose)
    self.dft = Counter()

    if normalize_text is None:
        self.normalize_text = None
    else:
        self.normalize_text = apply.Apply(self.batcher, normalize_text)

    # The previous placeholder `self.dictionary = {}` was a dead store: it was
    # overwritten here before anything could read it.
    # NOTE(review): the Dictionary is always created with freeze=False, while
    # the `freeze` argument is only stored on self.freeze below -- confirm
    # that this is intended.
    self.dictionary = dictionary.Dictionary(self.batcher, min_df=min_df, max_df=max_df,
                                            max_words=max_words, freeze=False, verbose=verbose)

    # A tokenizer stage is only created when spelling correction or stemming
    # was requested. Identity comparison is the correct test against None.
    if spellcor_count > 0 or stemmer is not None:
        self.tokenizer = tokenizer.Tokenizer(self.batcher, spellcor_count, spellcor_dist,
                                             raw_min_df, stemmer, verbose=verbose)
    else:
        self.tokenizer = None

    self.set_extractor(extractor)
    self.freeze = freeze