def create_sentence_list(doc_body, vector_dictionary):
    """Return a list of sentence entries paired with their vectors."""
    sentence_list = []
    sentences = split(doc_body)
    for sentence in sentences:
        list_entry = tokenize_and_vectorize(sentence, vector_dictionary)
        if list_entry is not DO_NOT_INCLUDE:
            sentence_list.append(list_entry)
    return sentence_list
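
# Hypothetical usage sketch (not part of the original module): `vector_dictionary`
# is assumed to map tokens to embedding vectors, and `split`,
# `tokenize_and_vectorize`, and DO_NOT_INCLUDE are the helpers used above.
# The values below are toy data for illustration only.
if __name__ == '__main__':
    toy_vectors = {"the": [0.1, 0.2], "cat": [0.3, 0.4], "sat": [0.5, 0.6]}
    sentence_vectors = create_sentence_list("The cat sat. The cat slept.", toy_vectors)
    print(len(sentence_vectors))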
def fit(self, features, data):
    """Fit data to the tree model using recursion."""
    # Find the split criterion that yields the greatest information gain
    gain, criterion = find_best_split(features, data)
    # Base case: no further information gain, so this data becomes a leaf node
    if gain == 0:
        return Leaf(data)
    # Split the data at the criterion
    left_data, right_data = split(data, criterion)
    # Recursively build the branches
    left_branch = self.fit(features, left_data)
    right_branch = self.fit(features, right_data)
    # Return a decision node holding the criterion and its two child branches
    fitted_node = DecisionNode(criterion, left_branch, right_branch)
    self.tree = fitted_node
    return fitted_node
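
# A minimal, hypothetical prediction sketch for the tree fitted above. It
# assumes DecisionNode exposes `criterion`, `left_branch` and `right_branch`
# attributes matching its constructor arguments, that the criterion object has
# a `match(row)` test, and that Leaf keeps its rows in `data`; none of these
# names are confirmed by the snippet, so treat this as an illustration only.
def predict_row(node, row):
    """Walk from the root to a Leaf and return the data stored there."""
    while isinstance(node, DecisionNode):
        node = node.left_branch if node.criterion.match(row) else node.right_branch
    return node.data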
def __build_tree(self, X, y, n_features, feature_indices, depth):
    node_data_set = np.column_stack((X, y))
    sample_size = len(y)
    # Stop when the node is too small or the maximum depth has been reached
    if len(y) <= self.min_samples_split or (self.max_depth is not None and depth == self.max_depth):
        estimated_value = np.mean(y)
        leaf = Leaf(estimated_value=estimated_value, sample_size=sample_size,
                    leaf_data_set=node_data_set)
        return leaf
    # Find the split feature and the best split point
    best_feature_index, threshold, min_mes = find_split(X, y, self.criterion, feature_indices)
    # Split into left and right subtrees
    X_true, y_true, X_false, y_false = split(X, y, best_feature_index, threshold)
    node = Node(feature_index=best_feature_index, threshold=threshold, min_mes=min_mes,
                sample_size=sample_size, node_data_set=node_data_set)
    # Randomly re-select features for the left subtree
    feature_indices = random.sample(range(n_features), int(self.max_features))
    # Recursively build the left subtree
    node.branch_true = self.__build_tree(X_true, y_true, n_features, feature_indices, depth + 1)
    # Randomly re-select features for the right subtree
    feature_indices = random.sample(range(n_features), int(self.max_features))
    # Recursively build the right subtree
    node.branch_false = self.__build_tree(X_false, y_false, n_features, feature_indices, depth + 1)
    return node
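
# Hypothetical sketch of predicting with one fitted regression tree. The
# attribute names (`feature_index`, `threshold`, `branch_true`, `branch_false`,
# `estimated_value`) mirror the keyword arguments used above; the direction of
# the threshold test (<= goes to branch_true) is an assumption.
def predict_one(node, x):
    """Follow the split tests down to a Leaf and return its mean estimate."""
    while not isinstance(node, Leaf):
        node = node.branch_true if x[node.feature_index] <= node.threshold else node.branch_false
    return node.estimated_value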
def crawl(self):
    while True:
        try:
            self.urls = []
            line = self.queue.get()
            self.url, depth = line
            depth = int(depth)
            # Skip URLs that are too deep, already seen, or outside the base URL
            if depth >= self.maxDepth or self.bloomset.get(self.url) or utilities.compare(self.baseURL, self.url):
                continue
            self.bloomset.add(self.url)
            arr = utilities.split(self.url)
            self.bloomset.multiAdd(arr)
            # Fetch and parse the page, collecting outgoing links into self.urls
            res = urlopen(self.url)
            body = res.read().decode("ISO-8859-1")
            self.feed(body)
            print self.url, len(self.urls)
            self.queue.put(self.urls, depth + 1)
            self.queue.putResult(self.url)
            # utilities.addtocsv(datetime.datetime.now() - SpiderWorker.date, Storage.queue, Storage.crawled)
        except Exception as e:
            print e, line
        finally:
            self.queue.task_done()
from preprocessing import load, add_title_bad
from utilities import split, tfidf_transform

if __name__ == '__main__':
    steam_reviews, nlp, docs = load(r"steam_reviews.csv")
    add_title_bad(docs, steam_reviews)

    # Split on user_suggestion
    X_train, X_test, y_train, y_test = split(docs, steam_reviews.user_suggestion)

    # TF-IDF and LDA
    X_train_tfidf, X_test_tfidf, tfidf = tfidf_transform(X_train, X_test)
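
    # Hedged continuation sketch (not in the original script): fit a simple
    # classifier on the TF-IDF features. LogisticRegression is an illustrative
    # choice; the script's own model, if any, is not shown above.
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_tfidf, y_train)
    print("test accuracy:", clf.score(X_test_tfidf, y_test))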
no_of_layers = int(sys.argv[4])
layer_perceptrons = []
learning_rate = 0.1
for i in range(0, no_of_layers):
    layer_perceptrons.append(int(sys.argv[5 + i]))

data = pd.read_csv(data_set_path)
column_list = data.columns.values
attr = column_list[:-1]
classname = column_list[-1]
no_of_attributes = len(attr)
no_of_bits = len(utilities.getBinaryArray(data[classname].max()))

array = data.as_matrix()
max_array = []
for k in range(0, len(array)):
    max_array.append(max(array[k]))

train_data, test_data = utilities.split(array, training_percent)

output_perceptrons = no_of_bits
layer_perceptrons.append(output_perceptrons)
layer_random_count = no_of_attributes

print('Getting random weights ...')
weights = utilities.getRandomWeights(layer_perceptrons, layer_random_count)
print('Populated random weights ...')

print('Building model ...')
for i in range(0, len(train_data)):
    expected_output = utilities.modifyBinaryArray(int(train_data[i][-1:]), no_of_bits)
    utilities.back_propogation(weights, train_data[i][:-1], layer_perceptrons, 0,
                               expected_output, learning_rate, iterations)
print('ANN created ...')
print('')
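
# Hedged illustration (not from the original script) of the fixed-width binary
# class encoding the code above appears to rely on: utilities.getBinaryArray /
# modifyBinaryArray are assumed to turn an integer class label into a list of
# bits, one per output perceptron. This stand-in only shows the idea.
def to_bits(label, width):
    """Toy stand-in, e.g. to_bits(5, 4) -> [0, 1, 0, 1]."""
    return [int(b) for b in format(label, '0{}b'.format(width))]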
def build_tree(self, X, y, feature_indices, fa_feature_index, select_feature_fa, father_node, depth):
    """Build the decision tree recursively.

    X : feature matrix
    y : labels
    feature_indices : randomly selected subset of features
    fa_feature_index : feature the parent node split on; -1 at the root
    depth : current depth of the tree
    select_feature_fa : split features chosen by this node's ancestors
    """
    select_feature_fa.append(fa_feature_index)
    n_features = X.shape[1]
    n_features_list = [i for i in range(n_features)]
    # Record the selected features and sample count
    self.select_feature.append(feature_indices)
    self.sample_num.append(len(y))
    node_data_set = np.column_stack((X, y))

    # Stopping condition (entropy criterion)
    if self.criterion == 'entropy':
        if depth == self.max_depth or len(y) < self.min_samples_split or entropy(y) == 0:
            return mode(y)[0][0]  # return the mode of y

    # Stopping condition (gini criterion)
    if self.criterion == 'gini':
        temp_gini = gini(y)
        self.gini_.append(temp_gini)
        sample_num = len(y)
        if depth == self.max_depth or sample_num < self.min_samples_split or temp_gini < self.min_impurity_split:
            # if depth == self.max_depth or temp_gini < self.min_impurity_split:
            # If every feature has already been used, pick one at random so the leaf still carries two features
            if set(n_features_list) == set(select_feature_fa):
                index = random.randrange(len(n_features_list))
                current_feature_index = n_features_list[index]
                current_max_value = np.max(X[:, current_feature_index])
                current_min_value = np.min(X[:, current_feature_index])
            else:
                to_be_select = list(set(n_features_list) - set(select_feature_fa))
                index = random.randrange(len(to_be_select))
                current_feature_index = to_be_select[index]
                current_max_value = np.max(X[:, current_feature_index])
                current_min_value = np.min(X[:, current_feature_index])
            leaf = Leaf(mode(y)[0][0], fa_feature_index,
                        np.max(X[:, fa_feature_index]), np.min(X[:, fa_feature_index]),
                        current_feature_index, current_max_value, current_min_value,
                        select_feature_fa, node_data_set, sample_num, prior_node=father_node)
            self.leaf_list.append(leaf)
            return leaf

    # feature_index: best split feature, threshold: best split value, gini_: Gini coefficient
    feature_index, threshold, max_value, min_value, gini_ = find_split(X, y, self.criterion, feature_indices)
    fa_max_value = np.max(X[:, fa_feature_index])  # max value of the parent's split feature at this node
    fa_min_value = np.min(X[:, fa_feature_index])  # min value of the parent's split feature at this node
    X_true, y_true, X_false, y_false = split(X, y, feature_index, threshold)  # split into left and right subtrees

    # One side of the split is empty
    if y_true.shape[0] == 0 or y_false.shape[0] == 0:
        if set(n_features_list) == set(select_feature_fa):
            index = random.randrange(len(n_features_list))
            current_feature_index = n_features_list[index]
            current_max_value = np.max(X[:, current_feature_index])
            current_min_value = np.min(X[:, current_feature_index])
        else:
            to_be_select = list(set(n_features_list) - set(select_feature_fa))
            index = random.randrange(len(to_be_select))
            current_feature_index = to_be_select[index]
            current_max_value = np.max(X[:, current_feature_index])
            current_min_value = np.min(X[:, current_feature_index])
        leaf = Leaf(mode(y)[0][0], fa_feature_index,
                    np.max(X[:, fa_feature_index]), np.min(X[:, fa_feature_index]),
                    current_feature_index, current_max_value, current_min_value,
                    select_feature_fa, node_data_set, prior_node=father_node, sample_num=0)
        self.leaf_list.append(leaf)
        return leaf

    node = Node(feature_index=feature_index, fa_feature_index=fa_feature_index,
                threshold=threshold, max_value=max_value, min_value=min_value,
                fa_max_value=fa_max_value, fa_min_value=fa_min_value,
                gini_coefficient=gini_, node_data_set=node_data_set)

    # Randomly select features
    n_features = X.shape[1]
    n_sub_features = int(self.max_features)
    # feature_indices = random.sample(range(n_features), n_sub_features)
    select_feature = list()
    select_feature += select_feature_fa  # record the features chosen along this path
    # Recursively build the left subtree
    node.branch_true = self.build_tree(X_true, y_true, feature_indices, feature_index,
                                       select_feature, node, depth + 1)
    # Randomly re-select features for the right subtree
    feature_indices = random.sample(range(n_features), n_sub_features)
    select_feature = list()
    select_feature += select_feature_fa  # record the features chosen along this path
    # Recursively build the right subtree
    node.branch_false = self.build_tree(X_false, y_false, feature_indices, feature_index,
                                        select_feature, node, depth + 1)
    node.prior_node = father_node  # link back to the parent node
    return node
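
# Hypothetical root call for build_tree, based only on the docstring above:
# fa_feature_index starts at -1, there is no father node yet, and
# feature_indices is a random subset of max_features columns. This fit()
# wrapper is assumed for illustration, not taken from the original class.
def fit(self, X, y):
    """Assumed entry point that grows the tree from the root."""
    n_features = X.shape[1]
    root_indices = random.sample(range(n_features), int(self.max_features))
    self.root = self.build_tree(X, y, root_indices, -1, [], None, 0)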
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import audioSegmentation as aS
import sh
import utilities

if __name__ == '__main__':
    # Input audio file
    audio = "/Users/nettech/Music/Logger/AM1053/20180302/08-00-00.m4a"
    keywords = [u"中科", u"虫草"]

    # Split into directory path, file name and extension
    path, name, suffix = utilities.split(audio)
    print "文件路径: " + path + "\n文件名: " + name + "\n后缀: " + suffix
    print('--------------------------------')

    # Convert m4a to mp3
    #mp3_audio = utilities.convert(audio)

    # Build the wav file name and path
    wav_audio = path + "/" + name + ".wav"
    print "wav文件路径预设为: " + wav_audio
    print('--------------------------------')

    # Batch-convert the mp3 files under this path to wav
    #audioAnalysis.dirMp3toWavWrapper(path, 16000, 1)

    # HMM segmentation, writing a .segment file
    segFileName = path + "/" + name + ".segment"
    print "seg文件路径预设为: " + segFileName
    print('--------------------------------')
    #[flagsInd, classesAll, acc, CM] = aS.hmmSegmentation(wav_audio, "data/hmmRadioSM", segFileName, True, '')

    # Use the segments to remove music longer than 100 seconds
    cmd = "mkdir " + path + "/" + name
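
    # Hedged sketch of one way the directory could be created; the original
    # script imports sh but the actual call is not shown, so sh.mkdir here is
    # an assumption about intent rather than the author's code.
    sh.mkdir("-p", path + "/" + name)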