Example #1
0
 def __init__(self, source_dir, clean=False):
     """Wire up the processing pipeline for the given dataset directory.

     Args:
         source_dir: Directory containing the raw dataset.
         clean: Forwarded to the extractor — presumably whether to
             redo/clean previous extraction output; confirm with
             ``extractor.Extractor``.
     """
     self.dataset_directory = source_dir
     # Pipeline stages, in processing order.
     self.data_extractor = extractor.Extractor(self.dataset_directory, clean)
     self.datafilter = data_filter.DataFilter()
     self.data_transformer = transformer.Transformer()
     self.processor = clustering.Cluster()
 def test_rm_stopwords(self):
     """Smoke-test DataFilter.rm_stopwords on text with leading blank lines."""
     sample = '\n\n\n\n\nwhat the is this non-sense'
     filt = data_filter.DataFilter()
     print(filt.rm_stopwords(sample))
 def test_rm_blanklines(self):
     """Smoke-test DataFilter.rm_blanklines on text containing blank lines."""
     sample = 'what the \n\n\nis this non-sense'
     filt = data_filter.DataFilter()
     print(filt.rm_blanklines(sample))
Example #4
0
 def __init__(self):
     """Initialise clustering hyper-parameters and empty model slots."""
     # Hyper-parameters.
     self.NCLUSTERS = 50
     self.NITER = 5
     self.NTOPICS = 10
     self.max_d = 50  # trial value for the number-of-clusters cut-off
     # Fitted artefacts — populated later, after training.
     self.model = None
     self.svd = None
     # Tokenisation is delegated to the shared DataFilter helper.
     self.tokenizer = data_filter.DataFilter()
Example #5
0
 def __init__(self,
              vocab_file=("C:\\Users\\ramji\\PycharmProjects\\"
                          "mindshift-solutions\\mindshift\\dataFiles\\"
                          "business_terms.txt")):
     """Build the transformer: tokenizer, stemmer, vocabulary, vectorizer.

     Args:
         vocab_file: Path to the business-terms file used to build the
             custom vocabulary.  Defaults to the historical hard-coded
             location so existing callers are unaffected.
             NOTE(review): an absolute, per-user Windows path should not
             live in code — move this default into configuration
             (``self.config_handler`` already exists for that purpose).
     """
     self.tokenizer = data_filter.DataFilter()
     self.stemmer = SnowballStemmer('english')
     self.vocabulary_builder = vocab_builder.VocabBuilder(vocab_file)
     self.custom_vocabulary = self._build_custom_vocabulary()
     # TF-IDF over uni- to tri-grams, restricted to the custom vocabulary.
     self.vectorizer = TfidfVectorizer(stop_words='english',
                                       min_df=0.1,
                                       max_df=0.8,
                                       analyzer='word',
                                       ngram_range=(1, 3),
                                       vocabulary=self.custom_vocabulary)
     self.vector_features = []
     self.modeller = clustering.Cluster()
     self.lda_model = None  # fitted later
     self.config_handler = configparser.ConfigParser()
 def test_tokenzie_and_stem(self):
     """Smoke-test DataFilter.tokenize_and_stem on a punctuated sentence."""
     filt = data_filter.DataFilter()
     result = filt.tokenize_and_stem("Will this, be removed?.")
     print(result)
Example #7
0
 def __init__(self, source_dir, clean=False):
     """Record the dataset location and build the extraction helpers.

     Args:
         source_dir: Directory the extractor reads from.
         clean: Stored for later use — presumably controls whether
             previous output is cleaned; confirm against callers.
     """
     self.dataset_dir = source_dir
     self.clean = clean
     self.file_handler = filehandler.FileHandler()
     self.data_filter = data_filter.DataFilter()