Example #1
File: summariser.py Project: EoinM95/FYP
def create_sentence_list(doc_body, vector_dictionary):
    """Return sentence data structure containing their vectors"""
    sentence_list = []
    sentences = split(doc_body)
    for sentence in sentences:
        list_entry = tokenize_and_vectorize(sentence, vector_dictionary)
        if list_entry is not DO_NOT_INCLUDE:
            sentence_list.append(list_entry)
    return sentence_list
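A minimal call-site sketch (not from the original project): load_vectors and the file names are hypothetical stand-ins; only create_sentence_list and its two arguments come from the example above.

# Hypothetical call site; vector_dictionary is assumed to map tokens to word vectors.
vector_dictionary = load_vectors('vectors.txt')  # load_vectors is a hypothetical helper
with open('article.txt') as doc:
    sentence_vectors = create_sentence_list(doc.read(), vector_dictionary)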
Example #2
 def fit(self, features, data):
     """ fit data to tree model using recursion"""
     # get best criterion lead to greatest infomation gain"""
     gain, criterion = find_best_split(features, data)
     # Base case: no further information gain, so this data becomes a leaf node.
     if gain == 0:
         return Leaf(data)
     # Split the data at the chosen criterion.
     left_data, right_data = split(data, criterion)
     # Recursively build the branches
     left_branch = self.fit(features, left_data)
     right_branch = self.fit(features, right_data)
     # Return a decision node holding the criterion and its two child branches.
     fitted_node = DecisionNode(criterion, left_branch, right_branch)
     self.tree = fitted_node
     return fitted_node
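A hedged usage sketch, assuming a DecisionTree wrapper class exposing the fit() above and toy rows whose last column is the label; none of the names or data below come from the original project.

# Hypothetical usage: fit() recurses until find_best_split reports zero gain,
# then stores the finished root node on self.tree.
features = ['color', 'diameter']  # assumed feature names
rows = [['red', 1, 'grape'], ['green', 3, 'apple'], ['red', 3, 'apple']]
tree = DecisionTree()             # assumed class containing the fit() above
root = tree.fit(features, rows)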
Example #3
    def __build_tree(self, X, y, n_features, feature_indices, depth):

        node_data_set = np.column_stack((X, y))
        sample_size = len(y)

        if len(y) <= self.min_samples_split or (depth is not None
                                                and depth == self.max_depth):

            estimated_value = np.mean(y)  # a leaf predicts the mean target value

            leaf = Leaf(estimated_value=estimated_value,
                        sample_size=sample_size,
                        leaf_data_set=node_data_set)
            return leaf

        # Find the split feature and the best split point.
        best_feature_index, threshold, min_mes = find_split(
            X, y, self.criterion, feature_indices)

        X_true, y_true, X_false, y_false = split(X, y, best_feature_index,
                                                 threshold)  # split into left and right subtrees

        node = Node(feature_index=best_feature_index,
                    threshold=threshold,
                    min_mes=min_mes,
                    sample_size=sample_size,
                    node_data_set=node_data_set)

        # Randomly re-sample the feature subset.
        feature_indices = random.sample(range(n_features),
                                        int(self.max_features))
        # Recursively build the left subtree.
        node.branch_true = self.__build_tree(X_true, y_true, n_features,
                                             feature_indices, depth + 1)

        # Randomly re-sample the feature subset.
        feature_indices = random.sample(range(n_features),
                                        int(self.max_features))

        # Recursively build the right subtree.
        node.branch_false = self.__build_tree(X_false, y_false, n_features,
                                              feature_indices, depth + 1)

        return node
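A sketch of how the recursion might be started from a public entry point; the fit() method below is an assumption, with only __build_tree, max_features, and the depth convention taken from the example above.

    def fit(self, X, y):
        """Hypothetical entry point: draw the first feature subset and grow from depth 0."""
        n_features = X.shape[1]
        feature_indices = random.sample(range(n_features), int(self.max_features))
        self.root = self.__build_tree(X, y, n_features, feature_indices, 0)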
Example #4
	def crawl(self):
		while True:
			try:
				self.urls = []
				line = self.queue.get()
				self.url, depth = line
				depth = int(depth)
				# Skip URLs that are too deep, already seen, or outside the base domain.
				if depth >= self.maxDepth or self.bloomset.get(self.url) or utilities.compare(self.baseURL, self.url):
					continue
				self.bloomset.add(self.url)
				arr = utilities.split(self.url)
				self.bloomset.multiAdd(arr)
				res = urlopen(self.url)
				body = res.read().decode("ISO-8859-1")
				self.feed(body)
				print self.url, len(self.urls)
				# Re-queue the links found on this page at the next depth.
				self.queue.put(self.urls, depth + 1)
				self.queue.putResult(self.url)
			#	utilities.addtocsv(datetime.datetime.now() - SpiderWorker.date ,Storage.queue,Storage.crawled)
			except Exception as e:
				print e, line
			finally:
				self.queue.task_done()
Example #5
from preprocessing import load, add_title_bad
from utilities import split, tfidf_transform

if __name__ == '__main__':
    steam_reviews, nlp, docs = load(r"steam_reviews.csv")
    add_title_bad(docs, steam_reviews)

    # Split on user_suggestion
    X_train, X_test, y_train, y_test = split(docs,
                                             steam_reviews.user_suggestion)

    # TF-IDF and LDA
    X_train_tfidf, X_test_tfidf, tfidf = tfidf_transform(X_train, X_test)

Example #6
import sys

import pandas as pd

import utilities

# data_set_path, training_percent, and iterations are parsed from earlier
# sys.argv entries not shown in this excerpt.
no_of_layers = int(sys.argv[4])
layer_perceptrons = []
learning_rate = 0.1
for i in range(no_of_layers):
    layer_perceptrons.append(int(sys.argv[5 + i]))
data = pd.read_csv(data_set_path)
column_list = data.columns.values
attr = column_list[:-1]
classname = column_list[-1]
no_of_attributes = len(attr)
no_of_bits = len(utilities.getBinaryArray(data[classname].max()))
array = data.values  # DataFrame.as_matrix() was removed in pandas 1.0
max_array = []
for k in range(len(array)):
    max_array.append(max(array[k]))
train_data, test_data = utilities.split(array, training_percent)
output_perceptrons = no_of_bits
layer_perceptrons.append(output_perceptrons)
layer_random_count = no_of_attributes
print('Getting random weights ...')
weights = utilities.getRandomWeights(layer_perceptrons, layer_random_count)
print('Populated random weights ...')
print('Building model ...')
for i in range(len(train_data)):
    # Encode the integer class label as a binary target vector.
    expected_output = utilities.modifyBinaryArray(int(train_data[i][-1:]),
                                                  no_of_bits)
    utilities.back_propogation(weights, train_data[i][:-1], layer_perceptrons,
                               0, expected_output, learning_rate, iterations)
print('ANN created ...')
print('')
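A hedged sketch of what the two encoding helpers used above are assumed to do (their real implementations live in utilities and are not shown): getBinaryArray turns an integer into its list of bits, and modifyBinaryArray left-pads that list to a fixed width.

# Hypothetical equivalents of the utilities helpers used above.
def getBinaryArray(n):
    return [int(b) for b in bin(n)[2:]]           # e.g. 5 -> [1, 0, 1]

def modifyBinaryArray(n, no_of_bits):
    bits = getBinaryArray(n)
    return [0] * (no_of_bits - len(bits)) + bits  # e.g. (5, 4) -> [0, 1, 0, 1]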
Example #7
    def build_tree(self, X, y, feature_indices, fa_feature_index, select_feature_fa, father_node, depth):
        """
        Build the decision tree.
        X: feature matrix
        y: labels
        feature_indices: randomly selected feature subset
        fa_feature_index: the feature the parent node split on; -1 at the root
        select_feature_fa: record of the split features chosen by this node's ancestors
        father_node: the parent node object
        depth: depth of the tree
        """
        select_feature_fa.append(fa_feature_index)
        n_features = X.shape[1]
        n_features_list = [i for i in range(n_features)]
        # Record the selected features.
        self.select_feature.append(feature_indices)
        self.sample_num.append(len(y))

        node_data_set = np.column_stack((X, y))

        # Stopping condition for the entropy criterion.
        if self.criterion == 'entropy':
            if depth == self.max_depth or len(y) < self.min_samples_split or entropy(y) == 0:
                return mode(y)[0][0]  # return the mode of y

        # Stopping condition for the gini criterion.
        if self.criterion == 'gini':
            temp_gini = gini(y)
            self.gini_.append(temp_gini)
            sample_num = len(y)
            if depth == self.max_depth or sample_num < self.min_samples_split or temp_gini < self.min_impurity_split:
            # if depth == self.max_depth or temp_gini < self.min_impurity_split:

                # If every feature has already been selected, pick one at random
                # so the leaf still carries a pair of features.
                if set(n_features_list) == set(select_feature_fa):
                    index = random.randrange(len(n_features_list))
                    current_feature_index = n_features_list[index]
                    current_max_value = np.max(X[:, current_feature_index])
                    current_min_value = np.min(X[:, current_feature_index])

                else:
                    to_be_select = list(set(n_features_list) - set(select_feature_fa))
                    index = random.randrange(len(to_be_select))

                    current_feature_index = to_be_select[index]
                    current_max_value = np.max(X[:, current_feature_index])
                    current_min_value = np.min(X[:, current_feature_index])

                leaf = Leaf(mode(y)[0][0], fa_feature_index, np.max(X[:, fa_feature_index]),
                            np.min(X[:, fa_feature_index]), current_feature_index, current_max_value,
                            current_min_value, select_feature_fa, node_data_set, sample_num,
                            prior_node=father_node)
                self.leaf_list.append(leaf)
                return leaf

        # feature_index: best split feature; threshold: best split value; gini_: gini coefficient
        feature_index, threshold, max_value, min_value, gini_ = find_split(
            X, y, self.criterion, feature_indices)

        fa_max_value = np.max(X[:, fa_feature_index])  # max of the parent's split feature at this node
        fa_min_value = np.min(X[:, fa_feature_index])  # min of the parent's split feature at this node

        X_true, y_true, X_false, y_false = split(X, y, feature_index, threshold)  # split into left and right subtrees

        # One of the splits is empty: turn this node into a leaf.
        if y_true.shape[0] == 0 or y_false.shape[0] == 0:

            # If every feature has already been selected, pick one at random
            # so the leaf still carries a pair of features.
            if set(n_features_list) == set(select_feature_fa):
                index = random.randrange(len(n_features_list))
                current_feature_index = n_features_list[index]
                current_max_value = np.max(X[:, current_feature_index])
                current_min_value = np.min(X[:, current_feature_index])

            else:
                to_be_select = list(set(n_features_list) - set(select_feature_fa))
                index = random.randrange(len(to_be_select))

                current_feature_index = to_be_select[index]
                current_max_value = np.max(X[:, current_feature_index])
                current_min_value = np.min(X[:, current_feature_index])

            leaf = Leaf(mode(y)[0][0], fa_feature_index, np.max(X[:, fa_feature_index]), np.min(X[:, fa_feature_index]),
                        current_feature_index, current_max_value, current_min_value, select_feature_fa,
                        node_data_set, prior_node=father_node, sample_num=0)

            self.leaf_list.append(leaf)
            return leaf

        node = Node(feature_index=feature_index,
                    fa_feature_index=fa_feature_index,
                    threshold=threshold, max_value=max_value, min_value=min_value,
                    fa_max_value=fa_max_value, fa_min_value=fa_min_value,
                    gini_coefficient=gini_,
                    node_data_set=node_data_set)


        # Randomly re-sample the feature subset.
        n_features = X.shape[1]
        n_sub_features = int(self.max_features)
        feature_indices = random.sample(range(n_features), n_sub_features)
        select_feature = list()
        select_feature += select_feature_fa  # record the features chosen along this path
        # Recursively build the left subtree.
        node.branch_true = self.build_tree(X_true, y_true, feature_indices, feature_index,
                                           select_feature, node, depth + 1)

        # Randomly re-sample the feature subset.
        feature_indices = random.sample(range(n_features), n_sub_features)
        # Recursively build the right subtree.
        select_feature = list()
        select_feature += select_feature_fa  # record the features chosen along this path
        node.branch_false = self.build_tree(X_false, y_false, feature_indices, feature_index,
                                            select_feature, node, depth + 1)

        node.prior_node = father_node  # link back to the parent node

        return node
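A hedged sketch of the initial call, following the docstring's note that fa_feature_index is -1 at the root; the fit() wrapper, the empty ancestor list, and the None parent are assumptions, not part of the original project.

    def fit(self, X, y):
        """Hypothetical entry point for the recursive builder above."""
        n_features = X.shape[1]
        feature_indices = random.sample(range(n_features), int(self.max_features))
        # Root call: no parent split feature (-1), empty ancestor-feature
        # history, no father node, depth 0.
        self.root = self.build_tree(X, y, feature_indices, -1, [], None, 0)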
Example #8
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import audioSegmentation as aS
import sh
import utilities

if __name__ == '__main__':

    # Input audio file
    audio = "/Users/nettech/Music/Logger/AM1053/20180302/08-00-00.m4a"
    keywords = [u"中科", u"虫草"]

    # Split into file path, file name, and suffix.
    path, name, suffix = utilities.split(audio)
    print "File path: " + path + "\nFile name: " + name + "\nSuffix: " + suffix
    print('--------------------------------')
    # Convert m4a to mp3.
    #mp3_audio = utilities.convert(audio)
    # Build the wav file name and path.
    wav_audio = path + "/" + name + ".wav"
    print "wav file path preset to: " + wav_audio
    print('--------------------------------')
    # Batch-convert mp3 files under the path to wav.
    #audioAnalysis.dirMp3toWavWrapper(path, 16000, 1)
    # HMM segmentation, writing a .segment file.
    segFileName = path + "/" + name + ".segment"
    print "segment file path preset to: " + segFileName
    print('--------------------------------')
    #[flagsInd, classesAll, acc, CM] = aS.hmmSegmentation(wav_audio, "data/hmmRadioSM", segFileName, True, '')
    # Use the segment file to drop music sections longer than 100 seconds.
    cmd = "mkdir " + path + "/" + name