n1 = '1000'
n2 = '100000'
strong_ext = '_chi2_strong_' + vectorizer
weak_ext = '_chi2_weak_' + vectorizer

# `vectorizer`, `stem`, and `v2` are set by enclosing loops not shown in this excerpt.
train_features, train_labels = load_data.load_feature_data(0, test_train='train')
len_train = len(train_labels)
test_features, test_labels = load_data.load_feature_data(0, test_train='test')
all_labels = np.append(train_labels, test_labels)

## Do better normalization on customer features
all_features = np.vstack((train_features, test_features))
float_feats = [[float(i) for i in row] for row in all_features]  # turn values into floats
new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
train_features = new_feats[:len_train]
test_features = new_feats[len_train:]
### Done normalizing

# The second positional argument to the loaders is the file extension; the
# vectorizer itself is always passed via the `vectorizer` keyword.
train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=v2)
train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=v2)
train_email_features_n1, test_email_features_n1 = load_data.load_email_data(0, n1 + v2, stemmer=stem, vectorizer=v2)
train_subject_features_n1, test_subject_features_n1 = load_data.load_subject_data(0, n1 + v2, stemmer=stem, vectorizer=v2)
train_email_features_n2, test_email_features_n2 = load_data.load_email_data(0, n2 + v2, stemmer=stem, vectorizer=v2)
train_subject_features_n2, test_subject_features_n2 = load_data.load_subject_data(0, n2 + v2, stemmer=stem, vectorizer=v2)
train_email_strong_features, test_email_strong_features = load_data.load_email_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
train_subject_strong_features, test_subject_strong_features = load_data.load_subject_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
train_email_weak_features, test_email_weak_features = load_data.load_email_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
train_subject_weak_features, test_subject_weak_features = load_data.load_subject_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
logging.info("All Data Loaded")

for t in ['email']:
    for s in ['n2']:
        if t == 'email':
            if s == 'strong':
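
# Hedged sketch (an assumption, not this repo's code): one plausible way the
# '_chi2_strong_' / '_chi2_weak_' feature files loaded above could have been
# produced, using scikit-learn's SelectKBest with the chi2 score. The helper
# name and the k values are illustrative; note that chi2 requires non-negative
# feature values (fine for tf-idf counts, and a HashingVectorizer would need
# its non-negative option enabled).
from sklearn.feature_selection import SelectKBest, chi2

def select_chi2_features(train_X, train_y, test_X, k):
    # Keep the k features that score highest on a chi-squared test against
    # the class labels, and apply the same selection to the test set.
    selector = SelectKBest(chi2, k=k)
    train_sel = selector.fit_transform(train_X, train_y)
    test_sel = selector.transform(test_X)
    return train_sel, test_sel

# e.g. a 'strong' set keeps few, highly discriminative features while a
# 'weak' set keeps many more:
# strong_train, strong_test = select_chi2_features(train_email_features, train_labels, test_email_features, k=1000)
# weak_train, weak_test = select_chi2_features(train_email_features, train_labels, test_email_features, k=100000)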
vectorizers = ['TfidfVectorizer', 'HashingVectorizer']
stemmers = ['RegexpStemmer', 'LancasterStemmer', 'PorterStemmer']
for vectorizer in vectorizers:
    for stem in stemmers:
        train_features, train_labels = load_data.load_feature_data(0, test_train='train')
        len_train = len(train_labels)
        test_features, test_labels = load_data.load_feature_data(0, test_train='test')
        all_labels = np.append(train_labels, test_labels)

        ## Do better normalization on customer features
        all_features = np.vstack((train_features, test_features))
        float_feats = [[float(i) for i in row] for row in all_features]  # turn values into floats
        new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
        train_features = new_feats[:len_train]
        test_features = new_feats[len_train:]
        ### Done normalizing

        train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)
        train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)

        banner = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        for _ in range(3):
            logging.info(banner)
        logging.info("VECTORIZER = %s" % vectorizer)
        logging.info("STEMMER = %s" % stem)
        for _ in range(3):
            logging.info(banner)
        logging.info("All Data Loaded")

        for t in ['both', 'all', 'subject', 'email', 'normal']:
            if t == 'email':
                trainer = train_email_features
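
# Hedged sketch (an assumption about load_data's internals, not taken from
# it): how one stemmer/vectorizer combination from the grid above can be
# wired together, by stemming each token inside the vectorizer's analyzer.
# PorterStemmer and TfidfVectorizer are the real NLTK/scikit-learn classes;
# the raw-text variable names in the usage comments are hypothetical.
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

_stemmer = PorterStemmer()
_base_analyzer = TfidfVectorizer().build_analyzer()

def stemming_analyzer(doc):
    # Tokenize with the default analyzer, then stem each token.
    return [_stemmer.stem(token) for token in _base_analyzer(doc)]

tfidf = TfidfVectorizer(analyzer=stemming_analyzer)
# train_X = tfidf.fit_transform(train_email_texts)  # hypothetical raw texts
# test_X = tfidf.transform(test_email_texts)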
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
classifier.show_most_informative_features()


def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=400):
    # Return a feature dict containing every word plus the n best bigrams,
    # ranked by the given association measure (chi-squared by default).
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((ngram, True) for ngram in itertools.chain(words, bigrams))


if __name__ == "__main__":
    # Load raw (unvectorized) subject and email texts.
    train_subject, test_subject = load_data.load_subject_data(0, extension='unVectorized', stemmer='PorterStemmer', vectorizer='unVectorized')
    train_email, test_email = load_data.load_email_data(0, extension='unVectorized', stemmer='PorterStemmer', vectorizer='unVectorized')
    train_features, train_labels = load_data.load_feature_data(0, test_train='train')
    test_features, test_labels = load_data.load_feature_data(0, test_train='test')

    train_email_features = [email.split(' ') for email in train_email]
    test_email_features = [email.split(' ') for email in test_email]
    train_subject_features = [subject.split(' ') for subject in train_subject]
    test_subject_features = [subject.split(' ') for subject in test_subject]

    for email_group in ['email', 'subject']:
        for wordType in ['single', 'bigram', 'trigram']:
            banner = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            for _ in range(4):
                logging.info(banner)
            logging.info("Getting Word Stuff for %s and %s" % (email_group, wordType))
            if email_group == 'email':
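
# Hedged sketch: a trigram analogue of best_bigram_word_feats above, for the
# 'trigram' branch of the wordType loop (only the bigram helper appears in
# this excerpt). TrigramCollocationFinder and TrigramAssocMeasures are NLTK's
# trigram counterparts of the bigram classes already imported.
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=400):
    # Same recipe as the bigram version: every word plus the n best trigrams.
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    return dict((ngram, True) for ngram in itertools.chain(words, trigrams))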
# trainer = train_features
# test = test_features
# split_train, split_labels = segment_for_even_distribution(trainer, train_labels)
# logging.info("============================================================")
# logging.info("Grid Search SVM on Customer Info")
# svmTrainandPrintWithGridSearch(split_train, split_labels, test_email_features, test_labels)

# Split the data up into better parts:
# for stem in ['PorterStemmer', 'RegexpStemmer', 'LancasterStemmer']:
#     for vectorizer in ['TfidfVectorizer', 'HashingVectorizer']:
for n in [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
    stem = "RegexpStemmer"
    vectorizer = "HashingVectorizer"
    logging.info("Loading Data for %s, %s, %i:" % (stem, vectorizer, n))
    train_email_features, test_email_features = load_data.load_email_data(
        0, str(n) + vectorizer, stemmer=stem, vectorizer=vectorizer, num_features=n
    )
    train_subject_features, test_subject_features = load_data.load_subject_data(
        0, str(n) + vectorizer, stemmer=stem, vectorizer=vectorizer, num_features=n
    )
    for textType in ["both"]:
        logging.info("Building Model for %s" % textType)
        if textType == "email":
            train, test = train_email_features, test_email_features
        elif textType == "subject":
            train, test = train_subject_features, test_subject_features
        elif textType == "both":
            # Stack subject and email features column-wise, then convert to
            # CSR so the matrices support efficient row slicing during training.
            train = scipy.sparse.hstack([train_subject_features, train_email_features])
            test = scipy.sparse.hstack([test_subject_features, test_email_features])
            train = scipy.sparse.csr_matrix(train)
            test = scipy.sparse.csr_matrix(test)
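
        # Hedged sketch of the fitting step that would naturally follow the
        # stacked features above: a grid-searched linear SVM. The parameter
        # grid, the cv value, and the use of LinearSVC/GridSearchCV are
        # illustrative assumptions; the repo's own routine
        # (svmTrainandPrintWithGridSearch, commented out earlier) may differ.
        # Imports would normally sit at the top of the file; GridSearchCV
        # lives in sklearn.model_selection on current scikit-learn
        # (sklearn.grid_search on versions contemporary with this code).
        from sklearn.svm import LinearSVC
        from sklearn.model_selection import GridSearchCV

        param_grid = {'C': [0.01, 0.1, 1.0, 10.0]}
        grid = GridSearchCV(LinearSVC(), param_grid, cv=3)
        grid.fit(train, train_labels)
        logging.info("Best C = %s, test accuracy = %.4f"
                     % (grid.best_params_['C'], grid.score(test, test_labels)))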