def plot_barchar(features_name):
    """Plot a grouped bar chart comparing NORMAL vs MALWARE Alexa features.

    Based on https://matplotlib.org/2.0.2/examples/api/barchart_demo.html

    Parameters
    ----------
    features_name : str
        Column of the module-level ``df`` whose most common NORMAL values
        are printed as a quick sanity check before plotting.

    Side effects: prints a frequency summary and saves the figure to
    ``c.graphs_folder`` (``c`` is assumed to be a module-level config
    object -- TODO confirm against the rest of the file).
    """
    # Quick inspection of the requested feature for the NORMAL class.
    # Bind the Counter to a dedicated name so the module-level config
    # object ``c`` (used below for ``c.graphs_folder``) is not shadowed.
    normal = df.loc[df['label'] == 'NORMAL'][features_name]
    freq = Counter(normal)
    print(freq.most_common(15))

    features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k",
                      "in_alexa_top100k", "in_alexa_top1m", "not_in_alexa"]
    features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k",
                                 " top 1m", "not"]

    # Per-feature mean and standard deviation for each class.
    normal_means = []
    normal_std = []
    malware_means = []
    malware_std = []
    for feature in features_alexa:
        normal = df.loc[df['label'] == 'NORMAL'][feature]
        malware = df.loc[df['label'] == 'MALWARE'][feature]
        normal_means.append(np.mean(normal))
        normal_std.append(np.std(normal))
        malware_means.append(np.mean(malware))
        malware_std.append(np.std(malware))

    N = len(features_alexa)
    ind = np.arange(N)  # the x locations for the groups
    width = 0.35        # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std)
    rects2 = ax.bar(ind + width, malware_means, width, color='r', yerr=malware_std)

    # Labels, title and axis ticks.
    ax.set_ylabel('Scores')
    ax.set_title('Scores by group and gender')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(features_names_simplified)

    ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet'))
    fig.savefig(c.graphs_folder + "features_alexa" + '.png')
# ----- Example 2 -----
    def predictByUnique(self, text, threshold=float(1e40)):
        """
        Predict a coordinate for a body of text.

        Keeps only the moderately frequent unique words of the text and
        delegates the actual prediction to ``self.predict``.

        Input: text
        Output: coordinate (lon, lat)
        """
        lenText = len(text)
        # Rough word-count estimate: ~8.3 characters per word on average.
        lenWords = int(lenText / 8.3)
        lowerPercent = 0.00008  # empirically gives a frequency median of 3
        lowerBound = int(lenWords * lowerPercent)
        topBound = int(lenWords / 300.0)

        # A zero upper bound would filter out every word; effectively
        # disable the upper bound instead.
        if topBound == 0:
            topBound = 999999999999999999999999

        print("low ", lowerBound, "top ", topBound)

        words = self.cleanData(text).split()
        counts = Counter()
        counts.update(words)

        # Keep words whose frequency lies strictly between the bounds.
        wordsInSpan = [word for word, freq in counts.most_common()
                       if lowerBound < freq < topBound]
        print("lenord i spann", len(wordsInSpan))
        text = " ".join(wordsInSpan)

        return self.predict(text, threshold=threshold)
# ----- Example 3 -----
    def predictByGrammar(self, text, threshold=float(1e40), clipping=True):
        """
        Predict a coordinate for a body of text.

        Implementation of the grammar approach: counts matches of the
        grammar patterns in ``self.patterns`` and keeps only the
        moderately frequent ones.

        Input: text
        Output: shortened text containing only the words the grammar matched
        """
        lenText = len(text)
        # Rough word-count estimate: ~8.3 characters per word on average.
        lenWords = int(lenText / 8.3)
        lowerPercent = 0.00008  # empirically gives a frequency median of 3

        if clipping:
            lowerBound = int(lenWords * lowerPercent)
            topBound = int(lenWords / 300.0)
        else:
            # No clipping: accept every frequency.
            lowerBound = 0
            topBound = 999999999999999999999999

        # A zero upper bound would filter out every word; effectively
        # disable the upper bound instead.
        if topBound == 0:
            topBound = 999999999999999999999999

        print("low ", lowerBound, "top ", topBound)

        counts = Counter()
        text = text.lower()
        for pattern in self.patterns:
            found = re.findall(pattern, text)
            if found:
                counts.update(found)

        # Keep words whose frequency lies strictly between the bounds.
        wordsInSpan = [word for word, freq in counts.most_common()
                       if lowerBound < freq < topBound]
        print("lenord i spann", len(wordsInSpan))
        text = " ".join(wordsInSpan)

        # NOTE: delegating to self.predict is currently disabled; the
        # clipped text is returned instead.
        # return self.predict(text, threshold=threshold)
        return text
# ----- Example 4 -----
            fetch_data(i)
    # Cached results exist: read them from CSV instead of re-fetching.
    else:
        raw_data = pd.read_csv(result_save_file)
        # Strip unwanted characters and join the analysed text into one string.
        word_result = word_pattern.sub("", ''.join(analysis_word(raw_data)))
        # Segment with jieba (precise mode); keep only words of length >= 3.
        words = [
            word for word in jb.cut(word_result, cut_all=False)
            if len(word) >= 3
        ]
        # Corpus-specific stop phrases that should not appear in the
        # word cloud.
        exclude_words = [
            '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象', '谈恋爱',
            '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好', '是因为', '不良嗜好', '男孩子', '为什么',
            '没关系', '不介意', '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼', '有意者',
            '晚一点', '哈哈哈', '以上学历', '是不是', '给我发', '不怎么', '第一次', '越来越', '遇一人',
            '择一人', '无数次', '符合条件', '什么样', '全世界', '比较简单', '浪费时间', '不知不觉', '有没有',
            '寻寻觅觅', '自我介绍', '请勿打扰', '差不多', '不在乎', '看起来', '一点点', '陪你到', '这么久',
            '看清楚', '身高体重', '比较慢', '比较忙', '多一点', '小女生', '土生土长', '发消息', '最合适'
        ]
        # Mark excluded words with None, then filter them out.
        for i in range(0, len(words)):
            if words[i] in exclude_words:
                words[i] = None
        filter_list = list(filter(lambda t: t is not None, words))
        data = r' '.join(filter_list)
        c = Counter(filter_list)
        word_name = []  # word names
        word_count = []  # word frequencies
        # Take the 100 most frequent words for the word cloud.
        for word_freq in c.most_common(100):
            word, freq = word_freq
            word_name.append(word)
            word_count.append(freq)
        draw_word_wc(word_name, word_count)
            fetch_data(i)
    # Cached results exist: read them from CSV instead of re-fetching.
    else:
        raw_data = pd.read_csv(result_save_file)
        # Strip unwanted characters and join the analysed text into one string.
        word_result = word_pattern.sub("", ''.join(analysis_word(raw_data)))
        # Segment with jieba (precise mode); keep only words of length >= 3.
        words = [word for word in jb.cut(word_result, cut_all=False) if len(word) >= 3]
        # Corpus-specific stop phrases that should not appear in the
        # word cloud.
        exclude_words = [
            '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象',
            '谈恋爱', '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好',
            '是因为', '不良嗜好', '男孩子', '为什么', '没关系', '不介意',
            '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼',
            '有意者', '晚一点', '哈哈哈', '以上学历', '是不是', '给我发',
            '不怎么', '第一次', '越来越', '遇一人', '择一人', '无数次',
            '符合条件', '什么样', '全世界', '比较简单', '浪费时间', '不知不觉',
            '有没有', '寻寻觅觅', '自我介绍', '请勿打扰', '差不多', '不在乎', '看起来',
            '一点点', '陪你到', '这么久', '看清楚', '身高体重', '比较慢', '比较忙',
            '多一点', '小女生', '土生土长', '发消息', '最合适'
        ]
        # Mark excluded words with None, then filter them out.
        for i in range(0, len(words)):
            if words[i] in exclude_words:
                words[i] = None
        filter_list = list(filter(lambda t: t is not None, words))
        data = r' '.join(filter_list)
        c = Counter(filter_list)
        word_name = []  # word names
        word_count = []  # word frequencies
        # Take the 100 most frequent words for the word cloud.
        for word_freq in c.most_common(100):
            word, freq = word_freq
            word_name.append(word)
            word_count.append(freq)
        draw_word_wc(word_name, word_count)
# ----- Example 6 -----
                   max_font_size=400)
    # Build the cloud from the prepared text and write it to disk.
    wc = wc.generate(content)
    wc.to_file('result.png')


if __name__ == '__main__':
    # Fetch the article, strip punctuation, then segment it with jieba
    # (precise mode); keep only words of length >= 2.
    result = punctuation_pattern.sub("", get_article(myurl))
    words = [
        word for word in jieba.cut(result, cut_all=False) if len(word) >= 2
    ]
    # Data cleaning: corpus-specific words that carry no meaning here.
    # A set gives O(1) membership tests instead of the original list's O(k).
    exclude_words = {
        "一年", "一位", "他们", "新华社", "前夕", "主席", "习近平", "中央", "广播电视", "总台", "长江",
        "不会", "正在", "这些", "新年贺词", "看到", "隆重庆祝", "多万", "发表"
    }

    # Report removed words in reverse document order, matching the original
    # backwards `del` loop's output, then drop them in a single O(n) pass
    # (the old per-item `del` made removal quadratic).
    removed = [word for word in words if word in exclude_words]
    for word in reversed(removed):
        print("移除非关键字" + word)
    words = [word for word in words if word not in exclude_words]

    # Show the 50 most frequent words for inspection.
    c = Counter(words)
    for word, freq in c.most_common(50):
        print(word, freq)

    data = r' '.join(words)
    generate_wc(data)

    # Display the generated word-cloud image.
    image = Image.open("result.png")
    image.show()