from pyngram import calc_ngram

# Total token count of the Google Web 1T corpus; used to normalise the score.
GOOGLE_1T_TOKEN_COUNT = 1024908267229.0

def get_2_gram_result(input_domain):
    """Score a domain by the corpus frequency of its bi-grams.

    Each bi-gram of input_domain is looked up in the global table
    dic_2gram (bi-gram -> corpus count); the summed counts are averaged
    over the number of bi-gram occurrences and normalised by corpus size.
    """
    if input_domain == '':
        return 0

    bi_gram_list = calc_ngram(input_domain, 2)

    # Total number of bi-gram occurrences in the domain.
    count = 0
    for each in bi_gram_list:
        count += each[1]

    tmp_sum = 0.0
    for item in bi_gram_list:
        # item[0] is the bi-gram string; bi-grams absent from the table add 0.
        tmp_sum += float(dic_2gram.get(item[0], 0))

    if len(bi_gram_list) == 0:
        tmp_2gram_nor_score = 0
    else:
        tmp_2gram_nor_score = tmp_sum / count

    return tmp_2gram_nor_score / GOOGLE_1T_TOKEN_COUNT
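# A minimal usage sketch. dic_2gram is assumed to be a module-level table of
# bi-gram -> corpus frequency; the toy counts below are made up for
# illustration and are not real corpus figures.
dic_2gram = {'go': 5.0e9, 'oo': 2.5e9, 'og': 1.0e9, 'gl': 1.2e9, 'le': 4.0e9}

print get_2_gram_result('google')  # small but non-zero ratio for a natural-looking domain
print get_2_gram_result('qzxvwk')  # zero: none of its bi-grams are in the table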
Example #2
import woothee
import pyngram


def define_useragent(useragents, output='score'):
    """Define individual elements of the useragent.

    'useragents' should be a list of suspect useragent strings to evaluate.
    By default (output='score') a threat score is returned for each UA;
    set output='json' to receive a JSON blob of descriptive features instead.
    """
    response_dict = {'results': {}}
    allowlist = ('curl', 'mobileasset', 'microsoft ncsi')  # Always permit these agents
    blocklist = ('mozilla/4.0',)  # Always block these agents; the trailing comma makes this a tuple, not a string
    for agent in useragents:
        pua = woothee.parse(agent)
        open_count = len([x for x in list(agent) if x in ['(', '[']])
        close_count = len([x for x in list(agent) if x in [')', ']']])
        response_dict['results'].update({agent: {}})
        allow = block = False
        if agent.split(' ')[0].lower() in allowlist:
            allow = True
        elif agent.split(' ')[0].lower() in blocklist:
            block = True
        response_dict['results'][agent].update(pua)
        response_dict['results'][agent].update({'allowlisted': allow})
        response_dict['results'][agent].update({'blocklisted': block})
        response_dict['results'][agent].update(
            {'tokens': len(agent.split(' '))})
        response_dict['results'][agent].update(
            {'ngrams': [x for x in pyngram.calc_ngram(agent, 2) if x[1] > 1]})
        if open_count != close_count:  # unbalanced
            response_dict['results'][agent].update({'unbalanced': True})
        else:
            response_dict['results'][agent].update({'unbalanced': False})
        if ';' in agent and '; ' not in agent:  # Malformed, should be '; ' between settings
            response_dict['results'][agent].update(
                {'malformed_semicolon': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_semicolon': False})
        if '/' in agent and ' ' in agent and '(' not in agent:
            response_dict['results'][agent].update({'malformed_noparen': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_noparen': False})
        if '==' in agent or '<' in agent or '>' in agent or '`' in agent:  # SQLi/XSS Tactics
            response_dict['results'][agent].update(
                {'malformed_hacklang': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_hacklang': False})
        response_dict['results'][agent].update(
            {'length': len(agent)})  # Length is kinda interesting

    if output == 'json':
        return response_dict
    else:
        return _score(response_dict)
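# A minimal usage sketch; the sample agents below are illustrative. Using
# output='json' returns the raw feature dictionary, so the '_score' helper
# (assumed to be defined elsewhere in this module) is not needed.
suspects = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'mozilla/4.0 evil-bot',
]
features = define_useragent(suspects, output='json')
for ua, info in features['results'].items():
    print '%s -> blocklisted=%s, tokens=%s' % (ua, info['blocklisted'], info['tokens'])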
from pyngram import calc_ngram

def _pattern_bigram(pattern_str, bigram):
    """Return how many times 'bigram' occurs in 'pattern_str' (0 if absent)."""
    if len(pattern_str) == 1:
        return 0
    temp_list = calc_ngram(pattern_str, 2)
    temp_dic = {}
    for each in temp_list:
        temp_dic[each[0]] = each[1]
    # Check after the loop completes; returning from inside it would only
    # ever consult the first bi-gram produced by calc_ngram.
    if bigram in temp_dic:
        return temp_dic[bigram]
    return 0
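# Quick sanity check. Per pyngram, calc_ngram('gooogle', 2) reports ('oo', 2),
# so 'oo' occurs twice while 'xy' never appears.
print _pattern_bigram('gooogle', 'oo')   # -> 2
print _pattern_bigram('gooogle', 'xy')   # -> 0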
Example #6
    def _count_bigram(pattern_str, bigram):
        """Count how many times the given bigram appears in pattern_str."""
        if len(pattern_str) == 1:
            return 0
        temp_list = calc_ngram(pattern_str, 2)
        temp_dic = {}
        for each in temp_list:
            temp_dic[each[0]] = each[1]
        # Check after the loop completes; returning from inside it would only
        # ever consult the first bi-gram produced by calc_ngram.
        if bigram in temp_dic:
            return temp_dic[bigram]
        return 0
    def ave_trigram_nor_score(self, file):
        '''
        Average normalised tri-gram score over all words in the file.
        Each word's tri-grams are weighted by their in-word counts using
        self.trigram_nor_dic; tri-grams missing from the table count as 1.
        Returns a float.
        '''
        total = 0
        L = 0  # number of words processed

        exclude = set(string.punctuation)
        with open(file) as f:
            for line in f:
                # Remove all the punctuation.
                line = ''.join(ch for ch in line if ch not in exclude)
                # Tokenize into words.
                tokenized_lst = self.tokenize(line)
                for w in tokenized_lst:
                    # calc_ngram returns (ngram, count) pairs, e.g.
                    # calc_ngram('gooogle', 2) ->
                    # [('oo', 2), ('go', 1), ('gl', 1), ('le', 1), ('og', 1)]
                    ngram_lst = calc_ngram(w, 3)
                    local_total = 0
                    local_L = 0
                    if ngram_lst:
                        for each in ngram_lst:
                            if each[0] in self.trigram_nor_dic:
                                local_total += (self.trigram_nor_dic[each[0]] * each[1])
                            else:
                                # Unknown tri-gram: fall back to a count of 1.
                                local_total += (1 * each[1])
                            local_L += each[1]
                    else:
                        # Word too short to yield any tri-grams.
                        local_L = 1
                    total += (local_total / float(local_L))
                    L += 1
        if L == 0:
            L = 1  # avoid division by zero on an empty file
        return total / float(L)
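# A minimal sketch of the context the two methods above assume (all names
# illustrative): the host class must supply 'tokenize' and 'trigram_nor_dic',
# and would carry _count_bigram and ave_trigram_nor_score as defined above.
import string
from pyngram import calc_ngram

class TrigramScorer(object):
    def __init__(self, trigram_nor_dic):
        # trigram_nor_dic maps tri-grams to normalised corpus counts.
        self.trigram_nor_dic = trigram_nor_dic

    def tokenize(self, line):
        # Placeholder tokenizer: plain whitespace split.
        return line.split()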
Example #8
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 26 15:49:21 2015

@author: bryan_000
"""

from pyngram import calc_ngram
txt = 'aaaabbbcccdb'
results = calc_ngram(txt, 2)  # list of (bi-gram, count) pairs, highest count first

print results

import nltk

# nltk.bigrams enumerates the adjacent character pairs of a string; it does
# not aggregate counts the way calc_ngram does.
a = nltk.bigrams(txt)

print list(a)  # materialise first, since nltk.bigrams may return a generator
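# Expected contrast for txt = 'aaaabbbcccdb' (tie order may vary):
#   calc_ngram aggregates counts:
#     [('aa', 3), ('bb', 2), ('cc', 2), ('ab', 1), ('bc', 1), ('cd', 1), ('db', 1)]
#   nltk.bigrams yields every adjacent pair in order:
#     [('a', 'a'), ('a', 'a'), ('a', 'a'), ('a', 'b'), ('b', 'b'), ...]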