def __init__(self, source_dir, clean=False):
    """Wire up the processing pipeline for a dataset directory.

    Args:
        source_dir: Root directory containing the raw dataset files.
        clean: Forwarded to the extractor; presumably triggers a clean
            re-extraction — TODO confirm against extractor.Extractor.
    """
    self.dataset_directory = source_dir
    # Extraction reads straight from the dataset directory.
    self.data_extractor = extractor.Extractor(source_dir, clean)
    # Downstream stages: filtering -> transformation -> clustering.
    self.datafilter = data_filter.DataFilter()
    self.data_transformer = transformer.Transformer()
    self.processor = clustering.Cluster()
def test_rm_stopwords(self):
    """rm_stopwords should process text containing stop words and blank lines.

    The original test only printed the result, so it could never fail;
    a minimal assertion is added so the test actually verifies something.
    """
    data_filt = data_filter.DataFilter()
    cleaned_text = data_filt.rm_stopwords('\n\n\n\n\nwhat the is this non-sense')
    # NOTE(review): expected output is not pinned because the exact
    # stop-word list/tokenization is defined in DataFilter — at minimum
    # the call must produce a result.
    assert cleaned_text is not None
    print(cleaned_text)
def test_rm_blanklines(self):
    """rm_blanklines should process text containing embedded blank lines.

    The original test only printed the result, so it could never fail;
    a minimal assertion is added so the test actually verifies something.
    """
    data_filt = data_filter.DataFilter()
    cleaned_text = data_filt.rm_blanklines('what the \n\n\nis this non-sense')
    # NOTE(review): exact expected output depends on DataFilter's
    # blank-line handling — at minimum the call must produce a result.
    assert cleaned_text is not None
    print(cleaned_text)
def __init__(self):
    """Initialize clustering hyper-parameters and lazy model slots."""
    # Tunable hyper-parameters.
    # (Naming is inconsistent: max_d is lower-case while the others are
    # UPPER_CASE; kept as-is because callers reference these names.)
    self.NCLUSTERS = 50
    self.NITER = 5
    # Trial variable for number of cluster
    self.NTOPICS = 10
    self.max_d = 50
    # Models are built later; None until then.
    self.model = None
    self.svd = None
    # Tokenization is delegated to the shared DataFilter helper.
    self.tokenizer = data_filter.DataFilter()
def __init__(self, vocab_path=None):
    """Set up tokenization, vocabulary, and TF-IDF vectorization.

    Args:
        vocab_path: Path to the business-terms vocabulary file. Defaults
            to the historical location for backward compatibility.

    Fix: the vocabulary file path was a hard-coded, user-specific absolute
    Windows path, making the class unusable on any other machine. It is now
    an optional constructor argument (old default preserved).
    """
    if vocab_path is None:
        # TODO(review): move this default into configuration
        # (config_handler below) rather than baking in a user home path.
        vocab_path = ("C:\\Users\\ramji\\PycharmProjects\\mindshift-solutions"
                      "\\mindshift\\dataFiles\\business_terms.txt")
    self.tokenizer = data_filter.DataFilter()
    self.stemmer = SnowballStemmer('english')
    self.vocabulary_builder = vocab_builder.VocabBuilder(vocab_path)
    self.custom_vocabulary = self._build_custom_vocabulary()
    # Restrict TF-IDF features to the curated business vocabulary;
    # 1- to 3-grams, with document-frequency bounds to drop rare/common terms.
    self.vectorizer = TfidfVectorizer(stop_words='english',
                                      min_df=0.1,
                                      max_df=0.8,
                                      analyzer='word',
                                      ngram_range=(1, 3),
                                      vocabulary=self.custom_vocabulary)
    self.vector_features = []
    self.modeller = clustering.Cluster()
    self.lda_model = None  # built lazily elsewhere
    self.config_handler = configparser.ConfigParser()
def test_tokenzie_and_stem(self):
    """tokenize_and_stem should handle punctuation-laden input.

    NOTE(review): the method name has a typo ("tokenzie"); it is kept so
    any existing test selection by name still works — rename separately.
    The original test only printed the result, so it could never fail;
    a minimal assertion is added so the test actually verifies something.
    """
    data_filt = data_filter.DataFilter()
    cleaned_text = data_filt.tokenize_and_stem("Will this, be removed?.")
    # Exact tokens depend on DataFilter's stemmer configuration — at
    # minimum the call must produce a result.
    assert cleaned_text is not None
    print(cleaned_text)
def __init__(self, source_dir, clean=False):
    """Prepare the extractor over a dataset directory.

    Args:
        source_dir: Directory to read dataset files from.
        clean: Stored for later use — presumably whether to re-extract
            from scratch; TODO confirm where self.clean is consumed.
    """
    # Record configuration first, then build collaborators.
    self.dataset_dir = source_dir
    self.clean = clean
    self.file_handler = filehandler.FileHandler()
    self.data_filter = data_filter.DataFilter()