def plot_barchar(features_name):
    """Plot a grouped bar chart comparing NORMAL vs MALWARE means (with
    std-dev error bars) for the Alexa-rank features, and save it as PNG.

    Adapted from:
    https://matplotlib.org/2.0.2/examples/api/barchart_demo.html

    Relies on module-level globals: `df` (labelled DataFrame), `np`,
    `plt`, and `c` (config object, see savefig note below).

    Input: features_name -- column whose per-class frequencies are dumped
           to stdout for a quick sanity check.
    """
    # Quick frequency dump of the requested feature for both classes.
    # (The original had a stray `return` here that made everything below
    # unreachable — removed, since plotting is this function's purpose.)
    normal = df.loc[df['label'] == 'NORMAL'][features_name]
    malware = df.loc[df['label'] == 'MALWARE'][features_name]
    counts = Counter(normal)
    print(counts.most_common(15))

    features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k",
                      "in_alexa_top100k", "in_alexa_top1m", "not_in_alexa"]
    features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k",
                                 " top 1m", "not"]

    normal_means = []
    normal_std = []
    malware_means = []
    malware_std = []
    for feature in features_alexa:
        normal = df.loc[df['label'] == 'NORMAL'][feature]
        malware = df.loc[df['label'] == 'MALWARE'][feature]
        normal_means.append(np.mean(normal))
        normal_std.append(np.std(normal))
        malware_means.append(np.mean(malware))
        malware_std.append(np.std(malware))

    N = len(features_alexa)
    ind = np.arange(N)  # the x locations for the groups
    width = 0.35        # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std)
    rects2 = ax.bar(ind + width, malware_means, width, color='r',
                    yerr=malware_std)

    # Labels, title and axis ticks.  NOTE(review): title text is still the
    # matplotlib demo's wording ('Scores by group and gender') — kept
    # byte-identical; confirm whether it should be updated.
    ax.set_ylabel('Scores')
    ax.set_title('Scores by group and gender')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(features_names_simplified)
    ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet'))

    # NOTE(review): `c` is assumed to be a module-level config exposing
    # `graphs_folder` — the local Counter no longer shadows it (in the
    # original, `c` was the Counter and this line would have raised
    # AttributeError).  Confirm against the rest of the file.
    fig.savefig(c.graphs_folder + "features_alexa" + '.png')
def predictByUnique(self, text, threshold=float(1e40)):
    """Predict a coordinate for a chunk of text.

    Keeps only the "unique-ish" words — those whose frequency in the text
    lies strictly between a lower and an upper bound derived from the text
    length — and delegates prediction to ``self.predict``.

    Input: text
    Output: coordinate (lon, lat), as returned by ``self.predict``
    """
    lenText = len(text)
    lenWords = int(lenText / 8.3)  # assumes ~8.3 characters per word
    lowerPercent = 0.00008  # gives a frequency median of 3
    lowerBound = int(lenWords * lowerPercent)
    topBound = int(lenWords / 300.0)
    if topBound == 0:
        # Tiny input: disable the upper cut-off (replaces the original
        # 24-digit magic-integer sentinel).
        topBound = float("inf")
    print("low ", lowerBound, "top ", topBound)
    words = self.cleanData(text).split()
    c = Counter(words)
    wordsInSpan = [word for word, freq in c.most_common()
                   if lowerBound < freq < topBound]
    print("lenord i spann", len(wordsInSpan))
    text = " ".join(wordsInSpan)
    return self.predict(text, threshold=threshold)
def predictByGrammar(self, text, threshold=float(1e40), clipping=True):
    """Shorten a chunk of text to only the words the grammar captured.

    Implementation of the grammar approach: every regex in
    ``self.patterns`` is matched against the lower-cased text, match
    frequencies are counted, and only matches inside the frequency band
    are kept (the band collapses to "everything" when ``clipping`` is
    False).

    Input: text
    Output: shortened text containing only the words the grammar matched
    """
    lenText = len(text)
    lenWords = int(lenText / 8.3)  # assumes ~8.3 characters per word
    lowerPercent = 0.00008  # gives a frequency median of 3
    if clipping:
        lowerBound = int(lenWords * lowerPercent)
        topBound = int(lenWords / 300.0)
    else:
        lowerBound = 0
        topBound = float("inf")
    if topBound == 0:
        # Tiny input under clipping: disable the upper cut-off (replaces
        # the original 24-digit magic-integer sentinel).
        topBound = float("inf")
    print("low ", lowerBound, "top ", topBound)

    c = Counter()
    text = text.lower()
    for pattern in self.patterns:
        found = re.findall(pattern, text)
        if found:
            c.update(found)
    wordsInSpan = [word for word, freq in c.most_common()
                   if lowerBound < freq < topBound]
    print("lenord i spann", len(wordsInSpan))
    # Prediction is deliberately not performed here (the original had a
    # commented-out self.predict call); callers get the filtered text.
    return " ".join(wordsInSpan)
fetch_data(i) else: raw_data = pd.read_csv(result_save_file) word_result = word_pattern.sub("", ''.join(analysis_word(raw_data))) words = [ word for word in jb.cut(word_result, cut_all=False) if len(word) >= 3 ] exclude_words = [ '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象', '谈恋爱', '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好', '是因为', '不良嗜好', '男孩子', '为什么', '没关系', '不介意', '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼', '有意者', '晚一点', '哈哈哈', '以上学历', '是不是', '给我发', '不怎么', '第一次', '越来越', '遇一人', '择一人', '无数次', '符合条件', '什么样', '全世界', '比较简单', '浪费时间', '不知不觉', '有没有', '寻寻觅觅', '自我介绍', '请勿打扰', '差不多', '不在乎', '看起来', '一点点', '陪你到', '这么久', '看清楚', '身高体重', '比较慢', '比较忙', '多一点', '小女生', '土生土长', '发消息', '最合适' ] for i in range(0, len(words)): if words[i] in exclude_words: words[i] = None filter_list = list(filter(lambda t: t is not None, words)) data = r' '.join(filter_list) c = Counter(filter_list) word_name = [] # 词 word_count = [] # 词频 for word_freq in c.most_common(100): word, freq = word_freq word_name.append(word) word_count.append(freq) draw_word_wc(word_name, word_count)
fetch_data(i) else: raw_data = pd.read_csv(result_save_file) word_result = word_pattern.sub("", ''.join(analysis_word(raw_data))) words = [word for word in jb.cut(word_result, cut_all=False) if len(word) >= 3] exclude_words = [ '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象', '谈恋爱', '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好', '是因为', '不良嗜好', '男孩子', '为什么', '没关系', '不介意', '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼', '有意者', '晚一点', '哈哈哈', '以上学历', '是不是', '给我发', '不怎么', '第一次', '越来越', '遇一人', '择一人', '无数次', '符合条件', '什么样', '全世界', '比较简单', '浪费时间', '不知不觉', '有没有', '寻寻觅觅', '自我介绍', '请勿打扰', '差不多', '不在乎', '看起来', '一点点', '陪你到', '这么久', '看清楚', '身高体重', '比较慢', '比较忙', '多一点', '小女生', '土生土长', '发消息', '最合适' ] for i in range(0, len(words)): if words[i] in exclude_words: words[i] = None filter_list = list(filter(lambda t: t is not None, words)) data = r' '.join(filter_list) c = Counter(filter_list) word_name = [] # 词 word_count = [] # 词频 for word_freq in c.most_common(100): word, freq = word_freq word_name.append(word) word_count.append(freq) draw_word_wc(word_name, word_count)
max_font_size=400) wc = wc.generate(content) wc.to_file('result.png') if __name__ == '__main__': result = punctuation_pattern.sub("", get_article(myurl)) words = [ word for word in jieba.cut(result, cut_all=False) if len(word) >= 2 ] #data cleaning with unmeaning words exclude_words = [ "一年", "一位", "他们", "新华社", "前夕", "主席", "习近平", "中央", "广播电视", "总台", "长江", "不会", "正在", "这些", "新年贺词", "看到", "隆重庆祝", "多万", "发表" ] for num in range(len(words) - 1, -1, -1): if words[num] in exclude_words: print("移除非关键字" + words[num]) del words[num] c = Counter(words) for word_freq in c.most_common(50): word, freq = word_freq print(word, freq) data = r' '.join(words) generate_wc(data) image = Image.open("result.png") image.show()