def _accuracy(predictionAndLabelRDD, isCorrect):
    """Return the share of records in an RDD that satisfy ``isCorrect``.

    Args:
        predictionAndLabelRDD: RDD whose records pair a prediction with a label.
        isCorrect: one-argument predicate applied to every record.

    Returns:
        float: matching records divided by all records, or NaN when the RDD is
        empty (an empty test split would otherwise raise ZeroDivisionError).
    """
    total = predictionAndLabelRDD.count()
    if total == 0:
        return float('nan')
    return float(predictionAndLabelRDD.filter(isCorrect).count()) / float(total)


def main(sc):
    """Train the category/subcategory classifiers and print their accuracy.

    The product texts are tokenised, stemmed and reduced to nouns, weighted
    with TF-IDF, and split 80/20 into training and test sets. Two Naive Bayes
    models (category, subcategory) and one Decision Tree model (category) are
    trained through ``Classifier.trainModel`` and evaluated on the test split.

    Product records are indexed as 4-tuples: field 1 is the text, field 2 the
    category and field 3 the subcategory (field 0 is carried through
    untouched -- presumably an identifier; confirm against
    ``findProductsByCategory``).

    Args:
        sc: Spark context used to create the RDDs, the broadcast variable and
            the ``Classifier`` instances.
    """
    start = timer()
    # "Musical Instruments" is currently left out of the training categories.
    categs = ["Computers & Tablets", "Video Games", "TV & Home Theater"]

    # frozenset: constant-time stop-word lookups inside the Spark closures.
    stpwrds = frozenset(stopwords.words('english'))

    # Translation table that deletes every code point whose Unicode category
    # starts with S, P or N (one category lookup per code point).
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('S', 'P', 'N')))

    def stem_without_stopwords(words):
        # One stemmer per record instead of one per token.
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in words if w not in stpwrds]

    def nouns_only(words):
        # Keep only the tokens tagged NN or NNP by pos_tag.
        return [word for word, tag in pos_tag(words) if tag in ('NN', 'NNP')]

    productRDD = sc.parallelize(findProductsByCategory(categs))
    corpusRDD = (productRDD
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 .map(lambda s: (s[0], stem_without_stopwords(s[1]), s[2], s[3]))
                 .map(lambda s: (s[0], nouns_only(s[1]), s[2], s[3]))
                 .cache())

    idfsRDD = idfs(corpusRDD)
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3]))

    # Label spaces and vocabulary, collected to the driver. A numeric model
    # label is used as an index into ``category`` / ``categoryAndSubcategory``.
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)

    # --- Naive Bayes, category level ---------------------------------------
    classifier = Classifier(sc, 'NaiveBayes')
    trainingVectSpaceCategoryRDD, testVectSpaceCategoryRDD = (
        classifier.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    # The second argument is a model path (presumably where the trained model
    # is stored -- confirm in Classifier.trainModel).
    modelNaiveBayesCategory = classifier.trainModel(
        trainingVectSpaceCategoryRDD, '/dados/models/naivebayes/category_new')
    predictionAndLabelCategoryRDD = testVectSpaceCategoryRDD.map(
        lambda p: (category[int(modelNaiveBayesCategory.predict(p.features))],
                   category[int(p.label)]))
    # Both members are category names (strings): compare them whole. Indexing
    # with [0] here would only compare their first characters.
    accuracyCategory = _accuracy(predictionAndLabelCategoryRDD,
                                 lambda pair: pair[0] == pair[1])
    print('the accuracy of the Category Naive Bayes model is %f' % accuracyCategory)

    # --- Naive Bayes, subcategory level (trained this second way just as a test)
    trainingVectSpaceSubcategory, testVectSpaceSubcategory = (
        classifier.createVectSpaceSubcategory(tfidfRDD, categoryAndSubcategory, tokens)
        .randomSplit([8, 2], seed=0))
    modelNaiveBayesSubcategory = classifier.trainModel(
        trainingVectSpaceSubcategory, '/dados/models/naivebayes/subcategory_new')
    predictionAndLabelSubcategory = testVectSpaceSubcategory.map(
        lambda p: (categoryAndSubcategory[int(modelNaiveBayesSubcategory.predict(p.features))],
                   categoryAndSubcategory[int(p.label)]))
    # Members are (category, subcategory) tuples and only the category part is
    # compared, so a wrong subcategory inside the right category counts as a hit.
    # NOTE(review): confirm this is the intended metric for the subcategory model.
    accuracySubcategory = _accuracy(predictionAndLabelSubcategory,
                                    lambda pair: pair[0][0] == pair[1][0])
    print('the accuracy of the Subcategory Naive Bayes model is %f' % accuracySubcategory)

    # --- Decision Tree, category level --------------------------------------
    classifierDT = Classifier(sc, 'DecisionTree')
    trainingVectSpaceCategory, testVectSpaceCategory = (
        classifierDT.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    modelDecisionTreeCategory = classifierDT.trainModel(
        trainingVectSpaceCategory, '/dados/models/dt/category_new')
    # The tree predicts on an RDD of feature vectors; labels are zipped back on.
    predictions = modelDecisionTreeCategory.predict(
        testVectSpaceCategory.map(lambda x: x.features))
    predictionAndLabelCategory = testVectSpaceCategory.map(lambda lp: lp.label).zip(predictions)
    accuracyDecisionTree = _accuracy(predictionAndLabelCategory,
                                     lambda pair: pair[0] == pair[1])
    print('the accuracy of the Decision Tree model is %f' % accuracyDecisionTree)

    elap = timer() - start
    print('it tooks %d seconds' % elap)
def main(sc):
    """Suggest products for a fixed batch of sample posts.

    Products and posts go through the same tokenise / stem / vocabulary-filter
    pipeline and are weighted with TF-IDF. Each post is classified with the
    stored subcategory Naive Bayes model; for every predicted label the posts
    are compared (``cosineSimilarity``) with the products of the corresponding
    category, and pairs scoring above ``threshold`` are handed to
    ``insertSuggestions``.

    Records are indexed as 4-tuples: field 0 is the key used for joins,
    field 1 the text, field 2 the category (``u'Post'`` for posts) and
    field 3 is carried through (for posts: e.g. ``u'Twitter'``).

    Args:
        sc: Spark context used to create the RDDs, the broadcast variables and
            the ``Classifier``.
    """
    # The elapsed-time report at the end needs its own starting point.
    start = timer()
    iduser = 1
    posts = [
        (u'post1', u'I love computers! i would like to buy an asus notebook.', u'Post', u'Twitter'),
        (u'post2', u'My tablet is not working anymore, i need to buy a new one', u'Post', u'Facebook'),
        (u'post3', u'I love to watch TV on saturday nights! ', u'Post', u'Twitter'),
        (u'post4', u'i love to watch netflix on my smart tv', u'Post', u'Twitter'),
        (u'post5', u'The #Kindle2 seems the best eReader, but will it work in the UK and where can I get one?', u'Post', u'Facebook'),
        (u'post6', u'I still love my Kindle2 but reading The New York Times on it does not feel natural. I miss the Bloomingdale ads.', u'Post', u'Facebook'),
    ]
    postsRDD = sc.parallelize(posts)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    # Membership is tested per token inside Spark closures, so use hashed
    # lookups. ``tokens`` itself is still passed on unchanged below.
    vocabulary = frozenset(tokens)
    stpwrds = frozenset(stopwords.words('english'))

    # Delete symbols, punctuation and digits (Unicode categories S*, P*, N*).
    # The training script in this file strips the same three classes; symbols
    # are included here as well so the text is normalised the same way.
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('S', 'P', 'N')))

    def stem_without_stopwords(words):
        # One stemmer per record instead of one per token.
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in words if w not in stpwrds]

    productRDD = sc.parallelize(findProductsByCategory(category))
    productAndPostRDD = productRDD.union(postsRDD)
    corpusRDD = (productAndPostRDD
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 .map(lambda s: (s[0], stem_without_stopwords(s[1]), s[2], s[3]))
                 .map(lambda s: (s[0], [x for x in s[1] if x in vocabulary], s[2], s[3]))
                 # Products need at least 20 known tokens; posts are always kept.
                 .filter(lambda x: len(x[1]) >= 20 or x[2] == u'Post')
                 .cache())

    idfsRDD = idfs(corpusRDD)
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    # tfidfRDD and tfidfPostsRDD feed several jobs, hence the caching.
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3])).cache()
    tfidfPostsRDD = tfidfRDD.filter(lambda x: x[2] == 'Post').cache()
    tfidfPostsBroadcast = sc.broadcast(
        tfidfPostsRDD.map(lambda x: (x[0], x[1])).collectAsMap())
    corpusPostsNormsBroadcast = sc.broadcast(
        tfidfPostsRDD.map(lambda x: (x[0], norm(x[1]))).collectAsMap())

    classifier = Classifier(sc, 'NaiveBayes')
    modelNaiveBayesSubcategory = classifier.getModel('/dados/models/naivebayes/subcategory_new')
    postsSpaceVectorRDD = classifier.createVectSpacePost(tfidfPostsRDD, tokens)
    # [(predicted label, [post keys])]: posts grouped by the label they received.
    predictions = (postsSpaceVectorRDD
                   .map(lambda p: (modelNaiveBayesSubcategory.predict(p[1]), p[0]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())

    # join() works on (key, value) pairs, so project the 4-tuples explicitly
    # instead of handing them to join() as they are.
    postTextRDD = postsRDD.map(lambda x: (x[0], x[1]))
    postFourthFieldRDD = postsRDD.map(lambda x: (x[0], x[3]))

    for prediction in predictions:
        # The label indexes (category, subcategory); only the category is used.
        category_to_use = categoryAndSubcategory[int(prediction[0])][0]
        post_ids = frozenset(prediction[1])

        # Reused by three jobs below, so it is the one per-iteration RDD cached.
        tfidfProductsCategoryRDD = tfidfRDD.filter(lambda x: x[2] == category_to_use).cache()
        tfidfProductsCategoryBroadcast = sc.broadcast(
            tfidfProductsCategoryRDD.map(lambda x: (x[0], x[1])).collectAsMap())
        corpusProductsNormsBroadcast = sc.broadcast(
            tfidfProductsCategoryRDD.map(lambda x: (x[0], norm(x[1]))).collectAsMap())

        # Inverted pairs (token, record key) for the products of this category
        # and for the posts that received this prediction.
        corpusInvPairsProductsRDD = tfidfProductsCategoryRDD.flatMap(
            lambda r: [(x, r[0]) for x in r[1]])
        corpusInvPairsPostsRDD = (tfidfPostsRDD
                                  .flatMap(lambda r: [(x, r[0]) for x in r[1]])
                                  .filter(lambda x: x[1] in post_ids))

        # ((product key, post key), iterable of the tokens they share)
        commonTokens = (corpusInvPairsProductsRDD.join(corpusInvPairsPostsRDD)
                        .map(lambda x: (x[1], x[0]))
                        .groupByKey())

        similaritiesRDD = commonTokens.map(
            lambda x: cosineSimilarity(x,
                                       tfidfProductsCategoryBroadcast.value,
                                       tfidfPostsBroadcast.value,
                                       corpusProductsNormsBroadcast.value,
                                       corpusPostsNormsBroadcast.value))

        # ``threshold`` is not assigned in this function; it has to be
        # available at module level.
        suggestions = (similaritiesRDD
                       .map(lambda x: (x[0][1], (x[0][0], x[1])))
                       .filter(lambda x: x[1][1] > threshold)
                       .groupByKey()
                       .mapValues(list)
                       .join(postTextRDD)
                       .join(postFourthFieldRDD)
                       .collect())
        if suggestions:
            insertSuggestions(suggestions, iduser, productRDD)

        # Release what was pinned for this prediction before the next one.
        tfidfProductsCategoryRDD.unpersist()
        tfidfProductsCategoryBroadcast.unpersist()
        corpusProductsNormsBroadcast.unpersist()

    elap = timer() - start
    print('it tooks %d seconds' % elap)
import mlflow
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier

from classes.Vectorizer import Vectorizer
from classes.Classifier import Classifier
from classes.Resample import Resample
from classes.ModelBuilder import ModelBuilder
from classes.Constans import *

# In[ ]:

# Project helper objects shared by the following cells.
classifier = Classifier()
vectorizer = Vectorizer()
resample = Resample()
builder = ModelBuilder()
classifier_list = []

# ### Get Info from CSV

# In[ ]:

# Training set: '|'-separated file, rows shuffled right after loading.
df_train = pd.read_csv('data/train_preprocessed.csv', sep='|')
df_train = shuffle(df_train)

# Test set: only the 'id' and 'Pregunta' columns are read, then shuffled.
df_test = pd.read_csv('data/test_santander.csv', usecols=['id', 'Pregunta'])
df_test = shuffle(df_test)

# Number of rows per 'Intencion_cat_label' value.
print(df_train['Intencion_cat_label'].value_counts())

# Resample the training set: one class has a single sample, and a stratified
# split needs at least two samples per class.
df_train = resample.apply_resample(df_train, 'Pregunta', 5, 100)
# In[ ]: import sys import mlflow import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from sklearn.utils import shuffle from classes.Vectorizer import Vectorizer from classes.Classifier import Classifier from classes.Resample import Resample from classes.ModelBuilder import ModelBuilder from classes.Constans import * # In[ ]: classifier = Classifier() vectorizer = Vectorizer() resample = Resample() builder = ModelBuilder() #--------------------------------------------------# ### CLASSIFIERS ### #--------------------------------------------------# classifier_list = classifier.get_classifier_list() def build_model(X, y, model, df_test): model_name = model.__class__.__name__ X_train, X_test, y_train, y_test = builder.get_train_test_split(X, y) RESAMPLE_FILE = 'data/apply_resample_after_{}.png'.format(model_name) if APPLY_RESAMPLE == True: