leafWords = ['s', 'es', 'ed', 'er', 'ly', 'ing']  # Suffixes used by the stemmed word features
cw = features.readCommonWords('common_words.txt')  # Common-word list used for negation handling

# Import Data
reviews = Data(inputFile, numLines=50000, testLines=5000)
reviews.getInfo()
reviews.shuffle()

# Different feature extractors
f1 = features.posNegClusterFeatures("embeddings.p", "dictionary.p",
                                    'NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', 200)
f2 = features.wordFeatures
f3 = features.positiveNegativeCountsWithClause('NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt')
f4 = features.emotionCounts('NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt')
f5 = features.stemmedWordFeatures(leafWords)
f6 = features.clusterFeatures("embeddings.p", "dictionary.p", 200)
f7 = features.wordFeaturesWithNegation(cw, leafWords)

# SVM
SVMModel = SVM(reviews, [f1, f5])
SVMModel.getInfo()

# Naive Bayes
naiveBayesModel = NaiveBayes(reviews, [f3, f4, f5])
naiveBayesModel.getInfo()

# Linear Regression
linearModel = LinearRegression(reviews, [f7, f3, f1])
linearModel.getInfo()
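# For reference, a minimal sketch of the interface the extractors above are assumed
# to share: each f is a callable (either the function itself, e.g. f2, or one
# returned by a factory call, e.g. f5) that maps raw review text to a dict of
# feature counts. This example extractor is hypothetical and only illustrates the
# interface; the real implementations live in the features module and may differ.
def exampleWordFeatures(text):
    # Bag-of-words counts: token -> number of occurrences
    counts = {}
    for token in text.lower().split():
        counts[token] = counts.get(token, 0) + 1
    return counts

# It could then be mixed with the existing extractors, e.g.:
#   SVM(reviews, [f1, exampleWordFeatures])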
# Analyzing command line arguments
if len(sys.argv) < 2:
    print 'Usage:'
    print '  python %s <JSON file>' % sys.argv[0]
    exit()
inputFile = sys.argv[1]

# Import Data
reviews = Data(inputFile, numLines=10000, testLines=1000)
reviews.getInfo()
reviews.shuffle()

#lexicon = features.readFullLexicon('NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt')
featureExtractor = features.clusterFeatures("embeddings.p", "dictionary.p", 200)

# Create sparse numpy arrays
reviews.convertDataToArray(featureExtractor)
reviews.convertLabelsToOneHot()  # Need this for tensorflow

#print reviews.trainArray.todense()
#print reviews.trainLabelOneHot.transpose().todense()

x = tf.placeholder("float", [None, reviews.numFeatures])

# We have 5 star classes and reviews.numFeatures input features
W = tf.Variable(tf.zeros([reviews.numFeatures, 5]))
b = tf.Variable(tf.zeros([5]))

# Our prediction
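# The section stops at the prediction comment above; the lines below are an assumed
# continuation sketching the standard TF1-style softmax-regression setup (softmax
# prediction, cross-entropy loss, gradient-descent training). The original script
# may define these steps differently.
y = tf.nn.softmax(tf.matmul(x, W) + b)

# One-hot gold labels for the 5 star classes
y_ = tf.placeholder("float", [None, 5])

# Cross-entropy loss and a plain gradient-descent training step
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    # Training would then feed dense batches of reviews.trainArray and
    # reviews.trainLabelOneHot into x and y_ via
    #   sess.run(train_step, feed_dict={x: ..., y_: ...})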