Пример #1
0
def train_dir(directory, cls, bool_model, bool_feat, ntf_model, ntf_feat):
    """Feed every file under ``directory`` to both models with label ``cls``.

    For each path yielded by ``NBmodel.get_files(directory)`` two examples
    are built -- one via ``NBmodel.munge_Boolean`` using ``bool_feat`` and
    one via ``NBmodel.munge_NTF`` using ``ntf_feat`` -- and then passed to
    ``bool_model.train`` and ``ntf_model.train`` respectively.

    Nothing is returned; the effect is whatever ``train`` does to the two
    model objects.
    """
    for path in NBmodel.get_files(directory):
        # Build both representations first, then train, in that order.
        boolean_example = NBmodel.munge_Boolean(path, bool_feat)
        ntf_example = NBmodel.munge_NTF(path, ntf_feat)

        bool_model.train(boolean_example, cls)
        ntf_model.train(ntf_example, cls)
Пример #2
0
def _get_features_list(spam_dir, ham_dir):
    """Collect per-class token statistics from the spam and ham directories.

    Every file returned by ``NBmodel.get_files`` for each directory is read
    line by line.  A line has its trailing newline / carriage return and
    surrounding spaces removed and is then split on single spaces.

    Args:
        spam_dir: Directory whose files are tallied under the "SPAM" label.
        ham_dir: Directory whose files are tallied under the "HAM" label.

    Returns:
        A ``defaultdict`` keyed by ``(cls, token)``.  Each value is a
        two-item list ``[count, weighted]`` where ``count`` is the number
        of occurrences of the token and ``weighted`` is the sum of
        ``1 / len(tokens_on_line)`` over those occurrences.
    """
    def process_email(freq, filename, cls):
        # ``with`` guarantees the handle is closed even if reading raises.
        with open(filename, "r") as email:
            for line in email:
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # split(" ") always yields at least one item, so len(tok) >= 1.
                # 1.0 forces true division on interpreters where ``/`` on
                # ints truncates.
                weight = 1.0 / len(tok)
                for t in tok:
                    entry = freq[(cls, t)]
                    entry[0] += 1
                    # Accumulate onto the weighted total (index 1); the
                    # previous code read index 0 here, which overwrote the
                    # running total with "count + weight" on every token.
                    entry[1] += weight

    freq = defaultdict(lambda: [0, 0])
    for email_file in NBmodel.get_files(spam_dir):
        process_email(freq, email_file, "SPAM")

    for email_file in NBmodel.get_files(ham_dir):
        process_email(freq, email_file, "HAM")

    return freq
Пример #3
0
def train_dir(directory, cls, model):
    """Train ``model`` on every file in ``directory`` using label ``cls``.

    Paths come from ``NBmodel.get_files(directory)``.  Each one is converted
    with ``model.munge`` and buffered; once the buffer holds more than 500
    examples they are passed to ``model.train`` one at a time and the buffer
    is reset.  Whatever is still buffered when the files run out is trained
    as a final, smaller batch.

    Args:
        directory: Location handed to ``NBmodel.get_files``.
        cls: Label passed through to ``model.train`` for every example.
        model: Object providing ``munge(path)`` and ``train(example, cls)``.
    """
    def flush(batch):
        for ex in batch:
            model.train(ex, cls)

    exs = []
    for f in NBmodel.get_files(directory):
        exs.append(model.munge(f))
        if len(exs) > 500:
            flush(exs)
            exs = []

    # Train the leftover examples.  Without this the last partial batch was
    # dropped, and a directory with 500 or fewer files trained nothing.
    flush(exs)
Пример #4
0
def train_dir(directory, cls, model):
    """Train ``model`` on all files found in ``directory`` under label ``cls``.

    Each path from ``NBmodel.get_files(directory)`` is turned into an example
    by ``model.munge`` and appended to a buffer.  When the buffer grows past
    500 examples, every buffered example is given to ``model.train`` and the
    buffer is emptied.  After the last file, any examples still in the
    buffer are trained as well.

    Args:
        directory: Location handed to ``NBmodel.get_files``.
        cls: Label passed through to ``model.train`` for every example.
        model: Object providing ``munge(path)`` and ``train(example, cls)``.
    """
    exs = []
    for f in NBmodel.get_files(directory):
        exs.append(model.munge(f))
        if len(exs) > 500:
            for ex in exs:
                model.train(ex, cls)
            exs = []

    # Final partial batch: previously these examples were silently discarded,
    # so directories with 500 or fewer files contributed no training at all.
    for ex in exs:
        model.train(ex, cls)
Пример #5
0
def _get_features_list(spam_dir, ham_dir):
    """Build a table of token statistics for the spam and ham corpora.

    Files are listed with ``NBmodel.get_files`` and read line by line.  Each
    line is stripped of its trailing newline / carriage return and of
    surrounding spaces, then split on single spaces into tokens.

    Args:
        spam_dir: Directory whose files are tallied under the 'SPAM' label.
        ham_dir: Directory whose files are tallied under the 'HAM' label.

    Returns:
        A ``defaultdict`` mapping ``(cls, token)`` to a two-item list
        ``[count, weighted]``: ``count`` is how many times the token was
        seen, ``weighted`` is the sum of ``1 / len(tokens_on_line)`` over
        those sightings.
    """
    def process_email(freq, filename, cls):
        # Context manager closes the file even when iteration fails.
        with open(filename, 'r') as email:
            for line in email:
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # len(tok) is never 0 because split(" ") returns at least
                # one (possibly empty) item; 1.0 avoids integer truncation.
                weight = 1.0 / len(tok)
                for t in tok:
                    entry = freq[(cls, t)]
                    entry[0] += 1
                    # Index 1 is the weighted total.  The earlier version
                    # added the weight to index 0's value instead, so the
                    # weighted total never actually accumulated.
                    entry[1] += weight

    freq = defaultdict(lambda: [0, 0])
    for email_file in NBmodel.get_files(spam_dir):
        process_email(freq, email_file, 'SPAM')

    for email_file in NBmodel.get_files(ham_dir):
        process_email(freq, email_file, 'HAM')

    return freq