def testNormalization(self):
    print("** Test normalization **")
    the_normalizer = Normalizer("datasets/test_normalization.csv")
    normalized = [[0, 0, 0], [1, 1, 1], [0.5, 0.1, 0.9]]
    self.assertTrue(the_normalizer.normalize() == normalized,
                    "Normalized data doesn't match")
def detectCircle(im):
    # Detect circles in the image.
    n = Normalizer(170)
    im = n.crop(im)
    new = imutils.resize(im, height=170)
    if new.shape[1] > 170:
        new = imutils.resize(im, width=170)
    circles = cv2.HoughCircles(new, cv2.HOUGH_GRADIENT, 1.5, minDist=170,
                               param2=30, minRadius=70, maxRadius=85)
    return circles is not None
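
# A minimal usage sketch for detectCircle(), assuming the project's Normalizer
# and imutils dependencies are importable. The image path is a hypothetical
# placeholder; the image is read as grayscale since cv2.HoughCircles expects a
# single-channel 8-bit input.
import cv2

gray = cv2.imread("samples/coin.png", 0)  # hypothetical path, grayscale read
if gray is not None:
    print("circle detected:", detectCircle(gray))
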
def testCSVIntegrity(self):
    print("** Test CSV Integrity **")
    the_normalizer = Normalizer("datasets/test_normalization.csv")
    data = the_normalizer.get_csv()
    origin_data = [['0', '3', '0'], ['1', '33', '100'], ['0.5', '6', '90']]
    length = 3
    self.assertTrue(data == origin_data, "Data and CSV file don't match")
    self.assertTrue(length == the_normalizer.getRowLength(),
                    "Line length doesn't match")
def __init__(self, k, n, columns, datafile):
    """Constructor for the KMeanClusterer class."""
    super(KMeanClusterer, self).__init__()
    # Number of clusters wanted
    self.k = k
    self.n = n
    self.is_over = False
    # Columns to work with
    self.columns = sorted(columns)
    # Get CSV data
    norm = Normalizer(datafile)
    self.data = norm.normalize()
    self.row_length = norm.getRowLength()
    self.clusters = []
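
# A minimal construction sketch for the class above. It mirrors how the test
# suite later in this file drives KMeanClusterer; performClustering() and the
# dataset path are taken from that test code, not shown here.
k = 3                # number of clusters wanted
n = 10               # number of iterations
cols = [3, 4, 5]     # columns to cluster on
kMeanClusterer = KMeanClusterer(k, n, cols, "datasets/spambase_2.data")
kMeanClusterer.performClustering()
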
def brain(command):
    response = ""
    # actions[:15] are verbs for search/find, actions[15:21] are verbs for open.
    actions = [
        "search", "find", "view", "reach", "detect", "get", "catch", "explore",
        "achieve", "obtain", "pass", "check", "reveal", "expose", "observe",
        "show", "see", "listen", "hear", "open", "watch", "arise", "awaken",
        "call", "consciousness", "get up", "stir", "wake", "wake up"
    ]
    tokens = Tokenizer().tokenize(command)

    # Call the weather service if the command contains "weather" and a city or
    # country name.
    citiesORcountries = weatherFunction(command)
    if 'weather' in command.split() and citiesORcountries:
        return ('the weather in ' + citiesORcountries[0] + ' is ' +
                WeatherC().weatherForecast(citiesORcountries[0]) + ' today')

    action = None
    fileName = None
    # -----------------------------------<<Variable>>--------------------------------------------
    tagSentence = Tagger().tag(tokens)
    for counter in range(len(tagSentence)):
        # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in actions:
        if tagSentence[counter][0] in actions:
            action = tagSentence[counter][0]
        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]

    normlizeAction = Normalizer().snowBallStemmer(action)
    if normlizeAction in actions:
        # returns the list of files sharing the same name
        filePath = FileSearch().search(fileName)
        if normlizeAction in actions[:15]:
            # search for a folder or file
            OpenMedia().openFile(filePath[0].split("//")[0])
            response = "i hope you're satisfied with our service"
            return response
        if normlizeAction in actions[15:21]:
            if (normlizeAction in ['listen', 'hear', 'watch']
                    and filePath[0].split('.')[1] not in ['mp3', 'mp4', 'mkv']):
                pass
            OpenMedia().openFile(filePath[0])
    return response
def loading_dataSet():
    file = open("res/dataset.txt", "r")
    data = file.read()
    file.close()
    docs = data.split("\n")
    types = []
    train = []
    for d in docs:
        d = d.split()
        if len(d) != 0:
            types.append(d[0])
    print('dataset Count = ' + str(len(types)))
    normalized_corpus = Normalizer.normalize_corpus(docs)
    normalized_corpus.remove('')
    counter = 0
    for x in normalized_corpus:
        train.append((x, types[counter]))
        counter = counter + 1
    return train
def classify_btn_clicked():
    def setClassification(type):
        if type == '1':
            classi_out.setPlainText('culture')
        elif type == '2':
            classi_out.setPlainText('sport')
        elif type == '3':
            classi_out.setPlainText('economy')
        elif type == '4':
            classi_out.setPlainText('international')
        elif type == '5':
            classi_out.setPlainText('local')
        elif type == '6':
            classi_out.setPlainText('religion')

    tester_doc = file_.toPlainText().strip()
    normalized_tester_doc = Normalizer.normalize_corpus([tester_doc])
    featuresets_test = [features(words) for words in normalized_tester_doc]
    predicted_label = classifier.classify_many(featuresets_test)
    setClassification(predicted_label[0])
def brn(self):
    tagSentence = Tagger().tag(self.tokens)
    for counter in range(len(tagSentence)):
        # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in self.actions:
        if tagSentence[counter][0] in self.actions:
            action = tagSentence[counter][0]
        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]

    normlizeAction = Normalizer().snowBallStemmer(action)
    if normlizeAction in self.actions:
        # returns the list of files sharing the same name
        filePath = FileSearch().search(fileName)
        if normlizeAction in self.actions[:15]:
            # search for a folder or file
            OpenMedia().openFile(filePath[0].split("//")[0])
        if normlizeAction in self.actions[15:21]:
            OpenMedia().openFile(filePath[0])
    else:
        pass
        # return "can you explain more"


# Brain("i wanna open workout").brn()
def getDatasetSize(self, datafile):
    norm = Normalizer()
    iris_data_matrix = norm.load_csv(datafile)
    return len(iris_data_matrix)
def __init__(self):
    # Text normalizer
    self.normalizer = Normalizer()
class FeatureExtractor:
    """
    Contains methods for extracting features from a normalized corpus.
    """

    def __init__(self):
        # Text normalizer
        self.normalizer = Normalizer()

    def bag_of_words(self, corpus, ngram_range=(1, 1), type_="binary"):
        """
        Generate bag of words for each document of a corpus.

        Args:
            corpus (list of str): List of documents
            ngram_range (tuple of int): Minimum and maximum size of ngrams in
                text, used only if type_ is *-ngram
            type_ (str): Type of bag of words:
                - binary
                - frequency
                - tfidf
                - binary-ngram
                - frequency-ngram
                - tfidf-ngram

        Returns:
            list of str/tuple of str:int pairs: Bag of words/ngrams
        """
        corpus = [
            self.normalizer.normalize_text(document) for document in corpus
        ]

        if type_ == "binary":
            bag_of_words = feature_extraction.bag_of_words_binary_corpus(corpus)
        elif type_ == "frequency":
            bag_of_words = feature_extraction.bag_of_words_frequencies_corpus(
                corpus)
        elif type_ == "tfidf":
            bag_of_words = feature_extraction.bag_of_words_tfidf_corpus(corpus)
        elif type_ == "binary-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_binary_corpus(
                corpus, ngram_range[0])
        elif type_ == "frequency-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_frequencies_range_corpus(
                corpus, ngram_range)
        elif type_ == "tfidf-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_tfidf_range_corpus(
                corpus, ngram_range)
        else:
            raise ValueError(
                "Wrong type_ input. Type help(bag_of_words) to see supported types."
            )

        return bag_of_words

    def feature_matrix(self, corpus, ngram_range=(1, 1), type_="binary"):
        """
        Generate feature matrix for each document of a corpus.

        Args:
            corpus (list of str): List of documents
            ngram_range (tuple of int): Minimum and maximum size of ngrams in
                text, used only if type_ is *-ngram
            type_ (str): Type of bag of words:
                - binary
                - frequency
                - tfidf
                - binary-ngram
                - frequency-ngram
                - tfidf-ngram

        Returns:
            dict, list of list: Vocabulary (word -> column id) and feature
            matrix with one row per document
        """
        bag_of_words = self.bag_of_words(corpus, ngram_range, type_)

        # Build the vocabulary: assign an increasing id to each new word/ngram.
        vocabulary = dict()
        id_ = 0
        for document in bag_of_words:
            for word in document:
                if word not in vocabulary:
                    vocabulary[word] = id_
                    id_ += 1
        sorted_vocabulary = sorted(vocabulary.items(), key=lambda x: x[1])

        feature_matrix = list()
        for document in bag_of_words:
            vector = list()
            for word in sorted_vocabulary:
                try:
                    vector.append(document[word[0]])
                except KeyError:
                    # If word is not present in bag of words, fill respective
                    # column with default value
                    if type_.startswith("binary"):
                        vector.append(False)
                    elif type_.startswith("frequency"):
                        vector.append(0)
                    elif type_.startswith("tfidf"):
                        vector.append(0.0)
            feature_matrix.append(vector)

        return vocabulary, feature_matrix

    def feature_matrix_sklearn(self, corpus, ngram_range=(1, 1), binary=False,
                               type_=0):
        """
        Generate feature matrix for each document of a corpus.

        Args:
            corpus (list of str): Raw documents to be transformed into matrix
                of bags of words
            ngram_range (tuple of int, int): Start and end range for ngrams
            binary (bool): True if only indicator of presence of word in
                document is needed, else False
            type_ (int): 0 - frequencies, 1 - tfidf

        Returns:
            list of dict of str/tuple of str:int pairs: Matrix of
            word/ngram:frequency or tfidf measure of a word in text
        """
        if type_ == 0:
            count_vectorizer, feature_matrix = scikit_bag_of_words_frequencies(
                corpus, ngram_range, binary)
        elif type_ == 1:
            count_vectorizer, feature_matrix = scikit_bag_of_words_tfidf(
                corpus, ngram_range)

        dense = feature_matrix.toarray()
        return count_vectorizer.vocabulary_, dense, dense.tolist()
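
# A minimal usage sketch for FeatureExtractor, assuming the project's
# normalization and feature_extraction modules are importable. The two-document
# corpus is an invented example, not data from the original project.
extractor = FeatureExtractor()
corpus = ["the cat sat on the mat", "the dog barked at the cat"]

bag = extractor.bag_of_words(corpus, type_="frequency")
vocabulary, matrix = extractor.feature_matrix(corpus, type_="binary")
print(vocabulary)
print(matrix)
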
    return json.loads(res)


if __name__ == "__main__":
    # datafile = "kddcup.data_10_percent.csv"
    # fields = [0, 4, 5, 22, 24, 25, 28, 31, 32, 35, 37, 38]
    # header = False
    # fieldClass = 41
    # k = 23
    # n = 20

    datafile = "kddcup.data_1000.csv"
    header = False
    fields = [0, 4, 5, 22, 24, 25, 28, 31, 32, 35, 37, 38]
    fieldClass = 41
    k = 17
    n = 20

    # datafile = "iris.csv"
    # fields = [0, 1, 2, 3]
    # fieldClass = 4
    # header = True
    # k = 3
    # n = 50

    norm = Normalizer(datafile, header)
    res = norm.run(fields, fieldClass)
    classes = norm.classes

    kMeanClusterer = KMeanClusterer(res, classes, k, n)
    print(json.dumps(kMeanClusterer.jsonify(), indent=2, separators=(',', ': ')))
encoding="utf-8", object_pairs_hook=collections.OrderedDict) fin.close() abbrevs = abbrev_json["abbreviation-entries"].keys() # word tokenizer token_json_filepath = os.path.join(lang_path, "token.json") wordtok = WordTokenizer(token_json_filepath, abbrev_json["abbreviation-entries"].keys()) # normalizer norm_json_filepath = os.path.join(lang_path, "norm.json") alphaexp_json_filepath = os.path.join(lang_path, "alphaexp.json") numexp_rule_filepath = os.path.join(lang_path, "numexp.rule") norm = Normalizer(norm_json_filepath, alphaexp_json_filepath, numexp_rule_filepath, abbrev_json["abbreviation-entries"]) # sentence tokenizer sentence_json_filepath = os.path.join(lang_path, "sentence.json") senttok = SentenceTokenizer(sentence_json_filepath, raw_text_filepath) # ======================== # run # ======================== utts = [] for sent in senttok.tokenize_iter(): tokens, classes, puncs = wordtok.tokenize(sent) words = [] for token, cls, punc in zip(tokens, classes, puncs):
from normalization import Normalizer
import nltk
from nltk import bigrams

# ================= Load the dataset and normalize it ==========================
# NOTE: this instance shadows the imported class name; later calls go through it.
Normalizer = Normalizer()


def loading_dataSet():
    file = open("res/dataset.txt", "r")
    data = file.read()
    file.close()
    docs = data.split("\n")
    types = []
    train = []
    for d in docs:
        d = d.split()
        if len(d) != 0:
            types.append(d[0])
    print('dataset Count = ' + str(len(types)))
    normalized_corpus = Normalizer.normalize_corpus(docs)
    normalized_corpus.remove('')
    counter = 0
    for x in normalized_corpus:
        train.append((x, types[counter]))
        counter = counter + 1
    return train


normalized_dataset = loading_dataSet()
# ===============================================================================
# ========================= Starting training dataset ==========================
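
# A minimal training sketch for the step announced by the banner above,
# assuming a simple bag-of-words `features` helper (the real project's feature
# extractor is not shown in this snippet) and nltk's NaiveBayesClassifier.
def features(text):
    # hypothetical helper: mark every token of the normalized text as present
    return {token: True for token in text.split()}


train_set = [(features(text), label) for (text, label) in normalized_dataset]
classifier = nltk.NaiveBayesClassifier.train(train_set)
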
class Test(unittest.TestCase):
    def setUp(self):
        self.datafile = "datasets/spambase_2.data"
        self.normalizer = Normalizer(self.datafile)

    def tearDown(self):
        pass

    def getDatasetSize(self, datafile):
        norm = Normalizer()
        iris_data_matrix = norm.load_csv(datafile)
        return len(iris_data_matrix)

    def testKMean(self):
        print("** test KMean **")
        # perform initialization
        k = 3
        n = 10
        cols = [3, 4, 5]
        kMeanClusterer = KMeanClusterer(k, n, cols, self.datafile)
        kMeanClusterer.performClustering()

        # total number of lines in the dataset
        dataLines = 0
        data_matrix = self.normalizer.get_csv()
        for row in data_matrix:
            if len(row) > 0:
                dataLines += 1

        # check that the number of observations from the dataset is kept
        totalObsNb = 0
        for clusterNb in range(kMeanClusterer.getClusterNumber()):
            cluster = kMeanClusterer.getCluster(clusterNb)
            totalObsNb += len(cluster.getObservations())
        self.assertTrue(
            dataLines == totalObsNb,
            "Number of entries in dataset: " + str(dataLines) +
            " is different from number of observations in clusters: " +
            str(totalObsNb))

        # check that all normalized entries in the dataset are kept
        index = 0
        for entry in self.normalizer.normalize():
            found = False
            for clusterNb in range(kMeanClusterer.getClusterNumber()):
                cluster = kMeanClusterer.getCluster(clusterNb)
                observations = cluster.getObservations()
                for obs in observations:
                    if obs == entry:
                        found = True
                        break
            self.assertTrue(
                found, "observation " + str(entry) + " not found at index " +
                str(index))
            index += 1

    def testKMeanUpdate(self):
        print("** test KMean update **")
        k = 3
        n = 10
        cols = [3, 4, 5]
        datafile = "datasets/spambase_2.data"
        kMeanClusterer = KMeanClusterer(k, n, cols, datafile)
        kMeanClusterer.assignement()
        kMeanClusterer.update()

        # check existence of centroids
        for i in range(kMeanClusterer.getClusterNumber()):
            current_cluster = kMeanClusterer.getCluster(i)
            self.assertTrue(
                len(current_cluster.getCentroid()) > 0,
                "void centroid for cluster " + str(i))

        # check validity of centroids
        for i in range(kMeanClusterer.getClusterNumber()):
            current_cluster = kMeanClusterer.getCluster(i)
            current_centroid = current_cluster.getCentroid()
            obs = current_cluster.getObservations()
            for j in range(len(current_centroid)):
                tmp = 0
                for m in range(len(obs)):
                    try:
                        tmp += float(obs[m][j])
                    except ValueError:
                        pass  # field is not numeric
                try:
                    # check that the centroid value is numeric
                    value = float(current_centroid[j])
                    self.assertTrue(
                        tmp / len(obs) == value,
                        "current centroid: " + str(value) +
                        "; actual centroid value: " + str(tmp / len(obs)))
                except ValueError:
                    pass  # field is not numeric

    def testCentroidsComparison(self):
        print("** Test centroids comparison **")
        k = KMeanClusterer(3, 10, [3, 4, 5], "datasets/spambase_2.data")
        centroid1 = tuple([1, 2, 3, 4, 5])
        centroid2 = tuple([1, 2, 3, 4, 5])
        centroid3 = tuple([5, 4, 3, 2, 6])
        centroidsEquals1 = [centroid1, centroid1]
        centroidsEquals2 = [centroid2, centroid2]
        centroidsDifferents1 = [centroid1, centroid1]
        centroidsDifferents2 = [centroid1, centroid3]
        self.assertTrue(
            k.compareCentroids(centroidsEquals1, centroidsEquals2) == False,
            "Centroids should be equal")
        self.assertTrue(
            k.compareCentroids(centroidsDifferents1, centroidsDifferents2),
            "Centroids should be different")

    def testCalculations(self):
        print("** Mean test **")
        arr = [10, 15, 20]
        moy = 15
        self.assertTrue(
            self.normalizer.moyenne(arr) == moy,
            "Mean calculation is incorrect")

    def testColumnExtraction(self):
        print("** Test column extraction **")
        multi = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]
        single = [1, 3, 5, 7, 9]
        self.assertTrue(
            self.normalizer.column(multi, 0) == single,
            "Extracted column doesn't match")

    def testNormalization(self):
        print("** Test normalization **")
        the_normalizer = Normalizer("datasets/test_normalization.csv")
        normalized = [[0, 0, 0], [1, 1, 1], [0.5, 0.1, 0.9]]
        self.assertTrue(the_normalizer.normalize() == normalized,
                        "Normalized data doesn't match")

    def testCSVIntegrity(self):
        print("** Test CSV Integrity **")
        the_normalizer = Normalizer("datasets/test_normalization.csv")
        data = the_normalizer.get_csv()
        origin_data = [['0', '3', '0'], ['1', '33', '100'], ['0.5', '6', '90']]
        length = 3
        self.assertTrue(data == origin_data,
                        "Data and CSV file don't match")
        self.assertTrue(length == the_normalizer.getRowLength(),
                        "Line length doesn't match")
def setUp(self):
    self.datafile = "datasets/spambase_2.data"
    self.normalizer = Normalizer(self.datafile)
        if i != c:
            cv2.drawContours(new, [cnts[i]], -1, color, thickness=cv2.FILLED)
    if all(all(p == 255 for p in line) for line in new):
        return None
    return new


# Parse arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--imgs_folder", required=True, help="Images folder")
args = vars(ap.parse_args())
imgs_folder = args['imgs_folder']

N = Normalizer(170)
for img in os.listdir(imgs_folder):
    image = cv2.imread("{}/{}".format(imgs_folder, img), 0)
    display("original", image)
    thresh = cv2.threshold(image, 60, 255, cv2.THRESH_BINARY)[1]
    _, cnts, h = cv2.findContours(thresh.copy(), cv2.RETR_TREE,
                                  cv2.CHAIN_APPROX_SIMPLE)
    # Hierarchy: for each contour -> [next, previous, child, parent]
    n = h[0][0][2]  # first child
    c = []  # c -> external contours [contour, area, id]