Example #1
def SentimentAnalyzer():
    text = "The movie was amazing"
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
                     'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                     'Negative') for f in fileids_neg]

    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    classifier = NaiveBayesClassifier.train(features_train)

    probabilities = classifier.prob_classify(extract_features(text.split()))
    predicted_sentiment = probabilities.max()

    response = {
        "accuracy": nltk_accuracy(classifier, features_test),
        "predicted_sentiment": predicted_sentiment,
        "probability": round(probabilities.prob(predicted_sentiment), 2)
    }
    print(response)
    return response
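Every example on this page calls an `extract_features` helper that is never shown alongside it; Example #8 further down defines it as a simple bag-of-words dictionary. A minimal sketch of that helper and of the NLTK imports these snippets appear to assume (including the corpus download) would be:

import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

nltk.download('movie_reviews')  # corpus used as training data


def extract_features(words):
    # bag-of-words features, as in Example #8 below
    return dict([(word, True) for word in words])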
Example #2
def SentimentAnalyzer(text):
    # load movie reviews from sample data
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    features_pos = [(extract_features(movie_reviews.words(fileids=[f])),'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') for f in fileids_neg]

    threshold = 0.8
    num_pos = int(threshold*len(features_pos))
    num_neg = int(threshold*len(features_neg))

    # creating training and testing data
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    #print('\nNumber of training datapoints:', len(features_train))
    #print('Number of test datapoints:', len(features_test))

    # training a naive bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy:',nltk_accuracy(classifier, features_test))

    probabilities = classifier.prob_classify(extract_features(text.split()))
    # Pick the maximum value
    predicted_sentiment = probabilities.max()
    print("Predicted sentiment:", predicted_sentiment)
    print("Probability:",round(probabilities.prob(predicted_sentiment), 2))

    return predicted_sentiment, probabilities.prob(predicted_sentiment)
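Assuming the imports and the `extract_features` helper sketched under Example #1, the function above could be exercised like this (a usage sketch, not part of the original source):

sentiment, probability = SentimentAnalyzer("The movie was amazing")
print(sentiment, round(probability, 2))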
Example #3
def SentimentAnalyzer(text):
    # load movie reviews from sample data
    # fileids_pos = movie_reviews.fileids('pos')
    # fileids_neg = movie_reviews.fileids('neg')

    # features_pos = [(extract_features(movie_reviews.words(fileids=[f])),'Positive') for f in fileids_pos]
    # features_neg = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') for f in fileids_neg]

    threshold = 0.8
    # num_pos = int(threshold*len(features_pos))
    # num_neg = int(threshold*len(features_neg))

    # creating training and testing data
    # features_train = features_pos[:num_pos] + features_neg[:num_neg]
    # features_test = features_pos[num_pos:] + features_neg[num_neg:]
    # build (features, label) pairs from the dataframe rows
    # (assumes frame has exactly the two columns body_text and label, in that order)
    features = [(extract_features(body_text.split()), label)
                for index, (body_text, label) in frame.iterrows()]

    features_train = features[:2000]
    features_test = features[2000:]

    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # training a naive bayes classifier 
    print(type(features_train))
    print(type(features_train[0]))
    print(type(features_train[0][0]))
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy:',nltk_accuracy(classifier, features_test))

    probabilities = classifier.prob_classify(extract_features(text.split()))
    # Pick the maximum value
    predicted_sentiment = probabilities.max()
    print("Predicted sentiment:", predicted_sentiment)
    print("Probability:",round(probabilities.prob(predicted_sentiment), 2))

    return predicted_sentiment 
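This variant trains on a pandas DataFrame named `frame` rather than on the NLTK corpus. The tuple unpacking in the list comprehension only works if the frame has exactly the two columns `body_text` and `label`, in that order; a hypothetical frame satisfying that assumption:

import pandas as pd

# hypothetical data; the real frame is loaded elsewhere
frame = pd.DataFrame({
    'body_text': ['The movie was amazing', 'The plot was dull and the acting weak'],
    'label': ['Positive', 'Negative'],
})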
Example #4
features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
                 'Positive') for f in fileids_pos]
features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                 'Negative') for f in fileids_neg]

threshold = 0.8
num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))

features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

print('\nNumber of training datapoints:', len(features_train))
print('Number of test datapoints:', len(features_test))

classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier:', nltk_accuracy(classifier,
                                                     features_test))

N = 15
print('\nTop ' + str(N) + ' most informative words:')
for i, item in enumerate(classifier.most_informative_features()):
    print(str(i + 1) + '. ' + item[0])
    if i == N - 1:
        break

input_reviews = [
    "Everything about this movie is outstanding -- the performances, the way the true events are handled, the cinematography. In this day of digital news, this movie makes us stand back and realize what we may lose in the way of investigative journalism as we slowly kill off print media. The focus remains the child abuse scandal in the archdiocese in Boston. That reflects the conflict the characters face and deal with when events make them rethink the focus of their article. The movie is riveting, though we know the outcome."
]

print("\nMovie review predictions:")
for review in input_reviews:
    print("\nReview:", review)
Example #5
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    # Print the number of datapoints used
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:',
          nltk_accuracy(classifier, features_test))

    N = 20
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i + 1) + '. ' + item[0])
        if i == N - 1:
            break

    # Test input movie reviews
    input_reviews = [
        'Im not sure theres a single unsuccessful moment in this entire film. This was the movie that reminded me how much I can still love a movie. ',
        'However, although entertaining in parts, there is very little connective tissue between the two main running storylines, creating a disappointing disconnect which prevents the movie from truly coming together in the end.',
        'While this has interesting moments, Foster seems unable to follow the story into as deep or dark a place as it should go and the ambiguity in the storytelling is unwarranted and frustrating to witness.',
        'There is an appreciated sense of unconventionally to the film. However, the story quickly takes an overemotional and theatrical turn which diminish the many topics the story could have explored. ',
        "A sensational Korean trial makes for a fairly riveting cinematic ride, with its very own touches of that infamous gangnam style.",
Example #6
    def build_naive_bayes_model(self):
        print('Processing Naive Bayes classification: \n')
        for r in reviews:

            # tokenize review text
            tokens = word_tokenize(self.review_text(r))

            # lower case tokens
            tokens = [w.lower() for w in tokens]

            # remove punctuation
            stripped = [w.translate(self.punc_table) for w in tokens]

            # filter out non-alphabetic words
            words = [word for word in stripped if word.isalpha()]

            # filter stop words
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if not w in stop_words]
            # words = [w for w in words if not w in self.sentimentAnalyzerLexicons]

            # accumulate words across all reviews
            for w in words:
                self.all_words.append(w)

            # Frequency distribution
            fdist = FreqDist(self.all_words)
            word_features = list(fdist.keys())[:3000]

            # set the text to the processed result for model training later
            r['text'] = ' '.join(words)

            # label reviews with 4, 5 ratings as pos and the rest as neg.
            if (r['score'] == 4 or r['score'] == 5):
                self.documents.append((r, "pos"))
            else:
                self.documents.append((r, "neg"))

        def find_features(text):
            words = word_tokenize(text)
            features = {}
            for w in word_features:
                features[w] = (w in words)

            return features

        featuresets = [(find_features(rvw['text']), sentiment) for (rvw, sentiment) in self.documents]
        random.shuffle(featuresets)

        threshold = 0.8
        training_set = featuresets[:int(threshold * len(featuresets))]
        testing_set = featuresets[int(threshold * len(featuresets)):]

        # Prep done. build the model and validate.
        classifier = NaiveBayesClassifier.train(training_set)

        print("Naive Bayes classifier accuracy percent:", (nltk_accuracy(classifier, testing_set)) * 100)
        print("\n")

        classifier.show_most_informative_features(15)

        # build the list of sentiment for each review from the review
        featuresets_to_classify = [find_features(rvw['text']) for (rvw, sentiment) in self.documents]
        labels = classifier.classify_many(featuresets_to_classify)
        self.nb_df = pd.DataFrame(labels, columns=['nb_label'])
        self.nb_df['review_id'] = [rvw['id'] for rvw, sentiment in self.documents]
        self.nb_df['appid'] = [rvw['appid'] for rvw, sentiment in self.documents]
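Example #6 is a method cut out of a larger class: it relies on `self.punc_table`, `self.all_words`, `self.documents` and `self.review_text`, on a `reviews` iterable, and on several NLTK imports, none of which are shown. A minimal sketch of that surrounding context, assuming each review is a dict with `id`, `appid`, `score` and `text` keys (class and variable names here are hypothetical):

import random
import string
import pandas as pd
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# hypothetical sample data; the real reviews are loaded elsewhere
reviews = [{'id': 1, 'appid': 10, 'score': 5, 'text': 'Great game, loved it'}]

class ReviewSentiment:  # hypothetical class name
    def __init__(self):
        self.punc_table = str.maketrans('', '', string.punctuation)  # strips punctuation
        self.all_words = []    # accumulated vocabulary
        self.documents = []    # (review, sentiment) pairs

    def review_text(self, r):
        return r['text']

Example #7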
def Analtsis_Movie(Movie_Reple):
    positive_count = 0
    negative_count = 0
    nomal_count = 0
    features_train = read_pickle('features_train.txt')
    features_test = read_pickle('features_test.txt')
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy:', nltk_accuracy(classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i + 1) + '. ' + item[0])
        if i == N - 1:
            break

    input_reviews = read_review(Movie_Reple)

    print("\n영화 리뷰 예측:")

    review_list = []

    for review in input_reviews:

        probabilities = classifier.prob_classify(
            extract_features(pos_tagger.nouns(review)))
        # Pick the maximum value
        predicted_sentiment = probabilities.max()

        print("\n리뷰 :", review)

        print("예측된 감정:", predicted_sentiment)
        print("정확도 :", round(probabilities.prob(predicted_sentiment), 2))
        if predicted_sentiment == 'Positive':
            positive_count += 1
        elif predicted_sentiment == "Negative":
            negative_count += 1
        elif predicted_sentiment == 'Nomal':
            nomal_count += 1

    print('Positive ratio: ',
          positive_count / (positive_count + negative_count + nomal_count))
    print('Negative ratio: ',
          negative_count / (positive_count + negative_count + nomal_count))
    print('Neutral ratio: ',
          nomal_count / (positive_count + negative_count + nomal_count))
    print('Total count: ', positive_count + negative_count + nomal_count)

    num = [positive_count, nomal_count, negative_count]
    vec = ['positive', 'neutral', 'negative']

    plt.pie(num,
            labels=vec,
            colors=['steelblue', 'lightskyblue', 'salmon'],
            startangle=90,
            shadow=True,
            autopct='%1.1f%%')
    plt.show()
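Example #7 loads ready-made feature sets from pickle files, tags Korean review text with a `pos_tagger` (presumably a KoNLPy tagger) and plots the sentiment split with matplotlib. The `read_pickle` and `read_review` helpers and the tagger are not shown; a guess at minimal stand-ins, labelled as assumptions:

import pickle
import matplotlib.pyplot as plt
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
from konlpy.tag import Okt

pos_tagger = Okt()  # assumption: any KoNLPy tagger exposing .nouns()

def read_pickle(path):
    # assumption: the feature sets were pickled to disk beforehand
    with open(path, 'rb') as f:
        return pickle.load(f)

def read_review(movie_reple):
    # assumption: one review per non-empty line of the input text
    return [line.strip() for line in movie_reple.splitlines() if line.strip()]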
Example #8
def analyzeFile(input_file):
    #Initialisation
    snowball = SnowballStemmer('english')
    healthplan = {"raw_text": "", "number_pages": 0}
    keywords=["autism", "prosthetics", "disability", "disabled",  "learning disability",\
    "prostheses", "physiotherapy", "deaf", "blind", "chronic conditions","hearing loss", "physiotherapist",\
    "mental health", "autistic spectrum disorder", "autistic", "aspergers", "ADHD", "attention defecit disorder", \
    "speech therapy", "dyslexia", "dyspraxia","learning disorders", "speech delay", "genetic screening", \
    "cystic fibrosis", "visual impairment", "blindness", "deaf-blindness", "ABI", "acquired brain injury", "prosthesis"]

    keywords_lem = list(map(lambda x: snowball.stem(x), keywords))

    #Import pdf
    pdfFileObj = open(input_file, 'rb')  #'rb' for read binary mode
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print("Number of pages in document:", pdfReader.numPages)
    healthplan["number_pages"] = pdfReader.numPages

    for i in range(healthplan["number_pages"]):
        healthplan["raw_text"] += pdfReader.getPage(i).extractText().replace(
            "\n", "")
        print("Processed ", round(i / healthplan["number_pages"] * 100, 2),
              "% of documents")

    # Tokenisation and stemming of the document
    # strip() only removes leading/trailing characters from a string, not good for a whole document
    # translate() is quicker than replace(), which is quicker than list concatenation ignoring bad chars
    print(healthplan["raw_text"][:500])
    table = str.maketrans(dict.fromkeys("(){}<>,'\t"))
    healthplan["raw_text"] = healthplan["raw_text"].translate(table)
    print(healthplan["raw_text"][:500])
    healthplan["words"] = word_tokenize(healthplan["raw_text"])
    print(healthplan["words"][:50])
    healthplan["sentences"] = sent_tokenize(healthplan["raw_text"])
    print(healthplan["sentences"][:10])

    # Perform stemming
    healthplan["words"] = list(
        map(lambda x: snowball.stem(x), healthplan["words"]))

    #Could optionally do text chunking, so not limited by sentences
    #I'm going to combine n sentences to form sentence chunks
    healthplan["chunks"] = []
    chunksize = 1
    print("Number Document sentences: ", len(healthplan["sentences"]))
    for i in range(0, len(healthplan["sentences"]), chunksize):
        healthplan["chunks"].append(' '.join(
            healthplan["sentences"][i:i + chunksize]))
    print("Number Document chunks: ", len(healthplan["chunks"]))
    #Sentence category prediction

    category_map = {'sci.med': 'Medicine'}

    categories = [
        'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
        'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.space',
        'soc.religion.christian', 'talk.politics.guns',
        'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
    ]

    for c in categories:
        category_map[c] = "Other"

    # Get the training dataset
    training_data = fetch_20newsgroups(subset='train',
                                       categories=category_map.keys(),
                                       shuffle=True,
                                       random_state=5)

    #print(training_data.data[0])

    #extract term counts
    count_vectorizer = CountVectorizer()
    train_tc = count_vectorizer.fit_transform(training_data.data)

    #train term freq inversion
    tfidf = TfidfTransformer()
    train_tfidf = tfidf.fit_transform(train_tc)

    #Train a Multinomial Naive Bayes classifier
    classifier = MultinomialNB().fit(train_tfidf, training_data.target)

    # Transform input data using count vectorizer
    input_tc = count_vectorizer.transform(healthplan["chunks"])
    # Transform vectorized data using tfidf transformer
    input_tfidf = tfidf.transform(input_tc)

    # Predict the output categories
    predictions = classifier.predict(input_tfidf)

    no_outputs = 10
    # Print the outputs
    for sent, category in zip(healthplan["chunks"][:no_outputs],
                              predictions[:no_outputs]):
        print('\nInput:', sent, '\nPredicted category:',
              category_map[training_data.target_names[category]])

    # only keep the medicine-relevant predictions
    num_medicine = sum(1 for p in predictions
                       if category_map[training_data.target_names[p]] == 'Medicine')
    print("Percentage of document medicine related: ",
          round(num_medicine / len(predictions) * 100, 2), "% : ", num_medicine)

    #healthplan["chunks"]=healthplan["chunks"][predictions=="Medicine"]

    #leminize words in chunks, compare to leminized keywords
    #print chunks where relevant. If last/first sentence, group with previous?
    healthplan["chunks_lem"] = [[snowball.stem(s) for s in word_tokenize(c)]
                                for c in healthplan["chunks"]]

    print("Disability key words: ", keywords_lem)

    healthplan["disabled_chunks_lem"] = [
        (i, list(set(c).intersection(set(keywords_lem))))
        if set(c).intersection(set(keywords_lem)) else (i, None)
        for i, c in enumerate(healthplan["chunks_lem"])
    ]
    num_disability = sum(1 for _, kw in healthplan["disabled_chunks_lem"]
                         if kw is not None)
    print("Number of disability sentences: ", num_disability)

    print("Percentage of document disability related: ",
          round(num_disability / len(healthplan["disabled_chunks_lem"]) * 100, 2),
          "% : ", num_disability)

    healthplan["disabled_chunks"] = []
    for i, c in enumerate(healthplan["chunks"]):
        if (healthplan["disabled_chunks_lem"][i][1] != None):
            print("Sentence: ", c)
            healthplan["disabled_chunks"].append([int(i), c])
            print("Key words: ", healthplan["disabled_chunks_lem"][i])

    print("\n", "#" * 10, "\n")
    #Returned sentences that relate to disability topics
    for c in healthplan["disabled_chunks"]:
        print(c)
    #print(np.array(healthplan["disabled_chunks"])[:,1])
    '''
    for i,c in enumerate(healthplan["chunks"]):
        if "prosthesis" in c:
            print("Index ",i,": ", healthplan["chunks_lem"][i], healthplan["disabled_chunks_lem"][i])
            print("Original sentence: ", c)
            print("previous sebtebces", healthplan["chunks"][i-5:i])
    print("Percentage of medical sentences disability related: ", round((len(healthplan["disabled_chunks_lem"])-healthplan["disabled_chunks_lem"].count(None))/len(healthplan["chunks_lem"]),2), "% : ", (len(healthplan["disabled_chunks_lem"])-healthplan["disabled_chunks_lem"].count(None)))

    print("\n", "#"*20, "\n")
    for i,c in enumerate(healthplan["sentences"]):
        if "prosthesis" in c:
            print("Index ",i,": ", healthplan["chunks_lem"][i], healthplan["disabled_chunks_lem"][i])
            print("Original sentence: ", c)
            print("previous sebtebces", healthplan["sentences"][i-5:i])
            '''
    print("\n", "#" * 20, "\n")
    print(sentences)
    sentences = [(s[0], [snowball.stem(w) for w in word_tokenize(s[1])])
                 for s in sentences]
    print("\n", "#" * 20, "\n")

    #Convert sentence features using bag of words model
    def extract_features(words):
        return dict([(word, True) for word in words])

    #split sentences into pos, neg, neut training/testing
    trainsplit = 0.8
    features_pos, features_neg, features_neut = [], [], []
    for s in sentences:
        if (s[0] == 0):
            print("I am a positive sentence:", s[1])
            features_neut.append((extract_features(s[1]), "neutral"))
        elif (s[0] == 1):
            features_pos.append((extract_features(s[1]), "positive"))
        else:
            features_neg.append((extract_features(s[1]), "negative"))
    num_neut, num_pos, num_neg = list(
        map(int,
            np.array([len(features_neut), len(features_pos), len(features_neg)])
            * trainsplit))

    features_test = (features_neut[num_neut:] + features_pos[num_pos:] +
                     features_neg[num_neg:])
    features_train = (features_neut[:num_neut] + features_pos[:num_pos] +
                      features_neg[:num_neg])

    #sentence sentiment analysis

    # Train a Naive Bayes classifier

    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:',
          nltk_accuracy(classifier, features_test))
    '''
    #predictions=classifier.predict(features_test)
    #Create Confusion Matrix
    confusion_mat=confusion_matrix(np.array(features_test)[:,1], predictions)

    #Visualize Confusion Matrix
    plt.imshow(confusion_mat, interpolation="nearest", cmap=plt.cm.spring)#, cmap=plt.cm.gray
    plt.title('Confusion Matrix')
    plt.colorbar()
    ticks=np.arange(5)
    plt.xticks(ticks,ticks)
    plt.yticks(ticks,ticks)
    plt.ylabel("True Labels")
    plt.xlabel("Predicted Labels")
    plt.show()

    #Classification Report
    targets=['Class-0', 'Class-1', 'Class-2', 'Class-3', 'Class-4']
    print('/n', classification_report(np.array(features_test)[:,1], predictions, target_names=targets))
    '''

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(item)
        print(str(i + 1) + '. ' + item[0])
        if i == N - 1:
            break

    print("Indexes of disability related chunks in sentences:",
          np.array(healthplan["disabled_chunks"])[:, 0])
    print(list(map(int, np.array(healthplan["disabled_chunks"])[:, 0])))
    #Go through all disability related chunks
    for i in np.array(healthplan["disabled_chunks"])[:, 0]:
        # Compute the probabilities
        i = int(i)
        dis_sentence = healthplan["chunks_lem"][i]
        print("Sentence trying to sentiment classify: ",
              healthplan["chunks"][i])
        probabilities = classifier.prob_classify(
            extract_features(dis_sentence))

        # Pick the maximum value
        predicted_sentiment = probabilities.max()
        # Print outputs
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability:", round(probabilities.prob(predicted_sentiment),
                                    2))

    sys.stdout.flush()
    # Define the train and test split (80% and 20%)
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    # Print the number of datapoints used
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:', nltk_accuracy(
            classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i+1) + '. ' + item[0])
        if i == N - 1:
            break

    # Test input movie reviews
    input_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie. I will not recommend it to anyone.'
    ]
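analyzeFile above mixes PDF extraction, 20-newsgroups topic classification with scikit-learn and NLTK sentiment analysis, and the listing is truncated before the final review loop. A sketch of the imports it appears to rely on and a hypothetical call; note that PdfFileReader, getPage and extractText belong to the pre-3.0 PyPDF2 API:

import sys
import numpy as np
import PyPDF2
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

analyzeFile('healthplan.pdf')  # hypothetical input path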
Example #10
    MNB_classifier = SklearnClassifier(MultinomialNB(alpha=1)).train(features_train)
    BNB_classifier = SklearnClassifier(BernoulliNB(alpha=1,binarize=0)).train(features_train)
    LGR_classifier = SklearnClassifier(LogisticRegression()).train(features_train)
    SDGC_classifier = SklearnClassifier(SGDClassifier(max_iter=1000,tol=1e-3)).train(features_train)
    SVC_classifier = SklearnClassifier(SVC()).train(features_train)
    LSVC_classifier = SklearnClassifier(LinearSVC()).train(features_train)
    NuSVC_classifier = SklearnClassifier(NuSVC()).train(features_train) #nu <= 0 or nu > 1

    # N = 15
    # print('\nTop ' + str(N) + ' most informative words:')
    # for i, item in enumerate(MNB_classifier.most_informative_features()):
    #     print(str(i+1) + '. ' + item[0])
    #     if i == N - 1:
    #         break

    print('ONB_classifier accuracy: ',nltk_accuracy(ONB_classifier,features_test))
    print('MNB_classifier accuracy: ',nltk_accuracy(MNB_classifier,features_test))
    print('BNB_classifier accuracy: ',nltk_accuracy(BNB_classifier,features_test))
    print('LGR_classifier accuracy: ',nltk_accuracy(LGR_classifier,features_test))
    print('SDGC_classifier accuracy: ',nltk_accuracy(SDGC_classifier,features_test))
    print('SVC_classifier accuracy: ',nltk_accuracy(SVC_classifier,features_test))
    print('LSVC_classifier accuracy: ',nltk_accuracy(LSVC_classifier,features_test))
    print('NuSVC_classifier accuracy: ',nltk_accuracy(NuSVC_classifier,features_test))
    
    # Test input movie reviews
    with open('text.txt','r',encoding='utf-8') as f1:
        input_reviews = sent_tokenize(f1.read())

    f = open('result.txt','w',encoding='utf-8')
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                     'Negative') for f in fileids_neg]

    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # creating training and testing data
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # training a naive bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy:', nltk_accuracy(classifier, features_test))

    # testing
    input_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie. I will not recommend it to anyone.'
    ]

    print('Movie review prediction:')
    for review in input_reviews:
        print('Review:', review)
        # computing the probabilities
        probabilities = classifier.prob_classify(
            extract_features(review.split()))
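The listing ends here, part-way through the review loop. Examples #2 and #7 above finish this step the same way, so the loop presumably continues along these lines (a sketch, not the missing original code):

        predicted_sentiment = probabilities.max()
        print('Predicted sentiment:', predicted_sentiment)
        print('Probability:', round(probabilities.prob(predicted_sentiment), 2))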