Пример #1
0
def train_dir(directory, cls, bool_model, bool_feat, ntf_model, ntf_feat):
    """Train the Boolean and NTF models on every file of a directory.

    Each path yielded by ``NBmodel.get_files(directory)`` is turned into one
    example per model (``NBmodel.munge_Boolean`` with ``bool_feat`` and
    ``NBmodel.munge_NTF`` with ``ntf_feat``).  Each example is then handed to
    the matching model's ``train`` method together with ``cls``.

    Args:
        directory: Location passed to ``NBmodel.get_files``.
        cls: Class label forwarded unchanged to both ``train`` calls.
        bool_model: Model trained on the Boolean examples.
        bool_feat: Feature description forwarded to ``munge_Boolean``.
        ntf_model: Model trained on the NTF examples.
        ntf_feat: Feature description forwarded to ``munge_NTF``.
    """
    for path in NBmodel.get_files(directory):
        # Both examples are built before either model is trained, so an
        # error while munging leaves both models untouched for this file.
        examples = (
            NBmodel.munge_Boolean(path, bool_feat),
            NBmodel.munge_NTF(path, ntf_feat),
        )
        for model, example in zip((bool_model, ntf_model), examples):
            model.train(example, cls)
Пример #2
0
def _get_features_list(spam_dir, ham_dir):
    """Collect per-class token statistics from the spam and ham corpora.

    Every file yielded by ``NBmodel.get_files`` for ``spam_dir`` is processed
    under the class ``"SPAM"``, then every file for ``ham_dir`` under
    ``"HAM"``.

    Args:
        spam_dir: Location of the spam e-mails (passed to ``get_files``).
        ham_dir: Location of the ham e-mails (passed to ``get_files``).

    Returns:
        A ``defaultdict`` keyed by ``(cls, token)``.  Each value is a
        two-item list ``[count, ntf]``: ``count`` is the number of times the
        token occurred, and ``ntf`` is the sum over those occurrences of
        ``1 / (number of tokens on the same line)``.
    """
    def process_email(freq, filename, cls):
        """Fold the tokens of one e-mail file into ``freq`` under ``cls``."""
        # ``with`` closes the file even if reading raises part-way through.
        with open(filename, "r") as email:
            for line in email:
                # Plain split on single spaces: empty strings (blank lines,
                # repeated spaces) are counted as tokens too.
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # ``split(" ")`` always returns at least one item, so the
                # division is safe.  ``1.0`` forces true division on
                # Python 2, where ``1 / len(tok)`` would truncate to 0.
                weight = 1.0 / len(tok)
                for t in tok:
                    entry = freq[(cls, t)]
                    entry[0] += 1
                    # Each slot accumulates onto itself; the NTF slot must
                    # not be recomputed from the raw count.
                    entry[1] += weight

    freq = defaultdict(lambda: [0, 0])
    for email_file in NBmodel.get_files(spam_dir):
        process_email(freq, email_file, "SPAM")

    for email_file in NBmodel.get_files(ham_dir):
        process_email(freq, email_file, "HAM")

    return freq
Пример #3
0
def train_dir(directory, cls, model):
    """Train ``model`` on every file found in ``directory``.

    Files yielded by ``NBmodel.get_files(directory)`` are converted to
    examples with ``model.munge`` and collected in a batch.  Whenever the
    batch grows past 500 examples it is fed to ``model.train`` and emptied.
    Whatever is left once the directory is exhausted is trained as a final,
    smaller batch, so no example is dropped.

    Args:
        directory: Location passed to ``NBmodel.get_files``.
        cls: Class label forwarded unchanged to ``model.train``.
        model: Object providing ``munge(path)`` and ``train(example, cls)``.
    """
    batch = []
    for f in NBmodel.get_files(directory):
        batch.append(model.munge(f))
        if len(batch) > 500:
            for ex in batch:
                model.train(ex, cls)
            batch = []

    # Flush the tail: without this, directories with 500 or fewer files
    # (and the remainder of larger ones) would never reach the model.
    for ex in batch:
        model.train(ex, cls)
Пример #4
0
def train_dir(directory, cls, model):
    """Feed every file of ``directory`` to ``model`` as a training example.

    Each path from ``NBmodel.get_files(directory)`` is munged by the model
    and buffered.  The buffer is trained and cleared each time it exceeds
    500 examples, and once more after the last file so that the remaining
    examples are trained as well.

    Args:
        directory: Location passed to ``NBmodel.get_files``.
        cls: Class label forwarded unchanged to ``model.train``.
        model: Object providing ``munge(path)`` and ``train(example, cls)``.
    """
    pending = []
    for f in NBmodel.get_files(directory):
        pending.append(model.munge(f))
        if len(pending) > 500:
            for ex in pending:
                model.train(ex, cls)
            pending = []

    # Train the leftover examples; the in-loop flush only fires once the
    # buffer exceeds 500, so the final partial buffer would be lost.
    for ex in pending:
        model.train(ex, cls)
Пример #5
0
def _get_features_list(spam_dir, ham_dir):
    """Build token statistics for the spam and ham training directories.

    Files returned by ``NBmodel.get_files(spam_dir)`` are tallied under the
    class ``'SPAM'`` and those of ``ham_dir`` under ``'HAM'``.

    Args:
        spam_dir: Location of the spam e-mails (passed to ``get_files``).
        ham_dir: Location of the ham e-mails (passed to ``get_files``).

    Returns:
        A ``defaultdict`` mapping ``(cls, token)`` to ``[count, ntf]``.
        ``count`` is the number of occurrences of the token; ``ntf`` adds
        ``1 / (tokens on the line)`` for each of those occurrences.
    """
    def process_email(freq, filename, cls):
        """Add the tokens of the file ``filename`` to ``freq`` for ``cls``."""
        # The context manager closes the file even when reading fails.
        with open(filename, 'r') as email:
            for line in email:
                # Splitting on a single space keeps empty strings (from
                # blank lines or repeated spaces) as tokens.
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # ``tok`` is never empty, so this cannot divide by zero.
                # The float literal avoids Python 2 integer division,
                # which would turn every multi-token line into 0.
                share = 1.0 / len(tok)
                for t in tok:
                    stats = freq[(cls, t)]
                    stats[0] += 1
                    # Accumulate on the NTF slot itself, not on the count.
                    stats[1] += share

    freq = defaultdict(lambda: [0, 0])
    for email_file in NBmodel.get_files(spam_dir):
        process_email(freq, email_file, 'SPAM')

    for email_file in NBmodel.get_files(ham_dir):
        process_email(freq, email_file, 'HAM')

    return freq
Пример #6
0
if __name__ == "__main__":
    # Script entry point: build the feature list from the two corpora, then
    # train a Boolean and an NTF Naive Bayes model on the spam and ham
    # directories, printing the elapsed time of each phase.
    #
    # ``print(...)`` with a single parenthesised argument behaves the same
    # as the print statement on Python 2 and also parses on Python 3.
    if len(sys.argv) < NUM_ARGS + 1:
        print("Usage: train.py spamdir hamdir bool_features ntf_features")
        sys.exit(1)
    spamdir = sys.argv[1]
    hamdir = sys.argv[2]
    bool_features_file = sys.argv[3]
    # NOTE(review): read from the command line but not used below -- both
    # models are currently built from the Boolean feature file.
    ntf_features_file = sys.argv[4]

    bool_features = _write_bool_features(bool_features_file, spamdir, hamdir)
    # The NTF model deliberately shares the Boolean feature list here.
    ntf_features = bool_features

    # Alternative kept for reference: load previously pickled feature lists
    # instead of rebuilding them from the corpora.
    #bool_features = pickle.load(open(bool_features_file, 'rb'))
    #ntf_features = pickle.load(open(ntf_features_file, 'rb'))

    bool_model = NBmodel.NB_Boolean(bool_features_file)
    ntf_model = NBmodel.NB_NTF(bool_features_file)

    t = time.time()

    print("+ Begin SPAM")
    train_dir(spamdir, NBmodel.SPAM, bool_model, bool_features, ntf_model,
              ntf_features)
    print("++ End SPAM %f" % (time.time() - t))

    # Restart the clock so the HAM figure covers only the HAM phase.
    t = time.time()
    print("+ Begin HAM")
    train_dir(hamdir, NBmodel.HAM, bool_model, bool_features, ntf_model,
              ntf_features)
    print("+ End HAM %f" % (time.time() - t))
Пример #7
0
def _write_bool_features(features_file, spam_dir, ham_dir):
    """Call ``_write_features`` with selector ``0`` and return its result.

    NOTE(review): the selector presumably chooses the raw-count statistic
    as the basis for the Boolean features -- confirm against
    ``_write_features``.

    Args:
        features_file: Forwarded unchanged to ``_write_features``.
        spam_dir: Forwarded unchanged to ``_write_features``.
        ham_dir: Forwarded unchanged to ``_write_features``.
    """
    selector = 0
    return _write_features(features_file, spam_dir, ham_dir, selector)

def _write_ntf_features(features_file, spam_dir, ham_dir):
    """Call ``_write_features`` with selector ``1`` and return its result.

    NOTE(review): the selector presumably chooses the NTF statistic as the
    basis for the features -- confirm against ``_write_features``.

    Args:
        features_file: Forwarded unchanged to ``_write_features``.
        spam_dir: Forwarded unchanged to ``_write_features``.
        ham_dir: Forwarded unchanged to ``_write_features``.
    """
    selector = 1
    return _write_features(features_file, spam_dir, ham_dir, selector)

if __name__ == "__main__":
    # Script entry point: train a gamma Naive Bayes model on the spam and
    # ham directories, print the elapsed time of each phase, then dump the
    # trained model to GAMMA_MODEL_FILE.
    #
    # ``print(...)`` with a single parenthesised argument behaves the same
    # as the print statement on Python 2 and also parses on Python 3.
    if len(sys.argv) < NUM_ARGS + 1:
        print("Usage: train.py spamdir hamdir features")
        sys.exit(1)
    spamdir = sys.argv[1]
    hamdir = sys.argv[2]
    features_file = sys.argv[3]

    gamma_model = NBmodel.NB_Gamma(features_file)

    t = time.time()

    print("+ Begin SPAM")
    train_dir(spamdir, NBmodel.SPAM, gamma_model)
    print("++ End SPAM %f" % (time.time() - t))

    # Restart the clock so the HAM figure covers only the HAM phase.
    t = time.time()
    print("+ Begin HAM")
    train_dir(hamdir, NBmodel.HAM, gamma_model)
    print("+ End HAM %f" % (time.time() - t))

    print("+ Dumping objects")
    dump_obj(GAMMA_MODEL_FILE, gamma_model)