Example #1
    # Assumes module-level imports in the defining file: multiprocessing,
    # collections.Counter, and wordbatch's batcher module
    def __init__(self, normalize_text=default_normalize_text, spellcor_count=0, spellcor_dist=2,
                 n_words=10000000, min_df=0, max_df=1.0, raw_min_df=-1, procs=0,
                 minibatch_size=20000, stemmer=None, pos_tagger=None, extractor=None,
                 timeout=600, use_sc=False, method="multiprocessing", verbose=1):
        # procs=0 means "use all available CPU cores"
        if procs == 0:  procs = multiprocessing.cpu_count()
        self.verbose = verbose
        self.use_sc = use_sc

        # The Batcher coordinates parallel minibatch processing of the corpus
        self.batcher = batcher.Batcher(procs, minibatch_size, timeout, use_sc, method, verbose)
        self.dictionary_freeze = False
        self.dictionary = {}
        # Document-frequency counters for normalized and raw tokens
        self.dft = Counter()
        self.raw_dft = Counter()
        self.preserve_raw_dft = False

        self.normalize_text = normalize_text
        # Spell correction needs both a count and a distance; zeroing either disables it
        if spellcor_count == 0:  spellcor_dist = 0
        elif spellcor_dist == 0:  spellcor_count = 0
        self.spellcor_count = spellcor_count
        self.spellcor_dist = spellcor_dist
        self.stemmer = stemmer
        # raw_min_df=-1 means "inherit min_df"
        if raw_min_df == -1:  self.raw_min_df = min_df
        else:  self.raw_min_df = raw_min_df
        self.pos_tagger = pos_tagger

        self.doc_count = 0
        self.n_words = n_words
        self.min_df = min_df
        self.max_df = max_df

        self.set_extractor(extractor)
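
A minimal usage sketch for this constructor. It assumes the snippet is the __init__ of wordbatch's WordBatch class (pre-1.4 API); the import paths, the WordBag extractor, the (class, params) tuple form of the extractor argument, and fit_transform are assumptions, not shown in the snippet itself.

import wordbatch
from wordbatch.extractors import WordBag

wb = wordbatch.WordBatch(
    n_words=100000,       # cap the dictionary at 100k words
    min_df=2,             # drop tokens seen in fewer than 2 documents
    max_df=0.5,           # drop tokens seen in more than half of the documents
    spellcor_count=5,     # spell-correct against words seen at least 5 times...
    spellcor_dist=2,      # ...within edit distance 2
    extractor=(WordBag, {"hash_ngrams": 2}),  # assumed extractor form
    procs=4,
    verbose=0,
)
features = wb.fit_transform(["a short document", "another short document"])  # assumes scikit-style fit/transform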
Example #2
    # Assumes module-level imports in the defining file: multiprocessing,
    # collections.Counter, and wordbatch's batcher module
    def __init__(self,
                 normalize_text=default_normalize_text,
                 max_words=10000000,
                 min_df=0,
                 max_df=1.0,
                 spellcor_count=0,
                 spellcor_dist=2,
                 raw_min_df=-1,
                 stemmer=None,
                 extractor=None,
                 procs=0,
                 minibatch_size=20000,
                 timeout=600,
                 spark_context=None,
                 freeze=False,
                 method="multiprocessing",
                 verbose=1):
        # procs=0 means "use all available CPU cores"
        if procs == 0: procs = multiprocessing.cpu_count()
        self.verbose = verbose

        self.batcher = batcher.Batcher(procs=procs,
                                       minibatch_size=minibatch_size,
                                       timeout=timeout,
                                       spark_context=spark_context,
                                       method=method,
                                       verbose=verbose)
        # Document-frequency counter; self.dictionary is assigned below,
        # so the original redundant dict initialization is dropped here
        self.dft = Counter()

        import wordbatch.transformers.apply as apply
        # Wrap the normalization function in an Apply transformer so it runs
        # in parallel through the shared batcher
        if normalize_text is None: self.normalize_text = None
        else: self.normalize_text = apply.Apply(self.batcher, normalize_text)

        import wordbatch.transformers.dictionary as dictionary
        # The Dictionary starts unfrozen; the top-level freeze flag is stored
        # separately at the end of __init__
        self.dictionary = dictionary.Dictionary(self.batcher,
                                                min_df=min_df,
                                                max_df=max_df,
                                                max_words=max_words,
                                                freeze=False,
                                                verbose=verbose)

        import wordbatch.transformers.tokenizer as tokenizer
        # A Tokenizer is only needed when spell correction or stemming is requested
        if spellcor_count > 0 or stemmer is not None:
            self.tokenizer = tokenizer.Tokenizer(self.batcher,
                                                 spellcor_count,
                                                 spellcor_dist,
                                                 raw_min_df,
                                                 stemmer,
                                                 verbose=verbose)
        else:
            self.tokenizer = None
        self.set_extractor(extractor)
        self.freeze = freeze
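
The second variant wires the same options into separate Apply, Dictionary, and Tokenizer transformers that share one Batcher. A usage sketch under the same caveats: the wordbatch.pipelines import path and the extractor-instance form are guesses for this newer API, and fit_transform is assumed.

from wordbatch.pipelines import WordBatch
from wordbatch.extractors import WordBag

wb = WordBatch(
    max_words=100000,     # renamed from n_words in the first variant
    min_df=2,
    max_df=0.5,
    stemmer=None,         # with spellcor_count=0, no Tokenizer is created
    extractor=WordBag(hash_ngrams=2),  # assumed extractor form for this API
    freeze=False,         # let the dictionary keep growing on later fits
)
features = wb.fit_transform(["a short document", "another short document"])  # assumes scikit-style fit/transform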