Example #1
      def test_calibration(self):
         '''tests whether all features like default adjectives 
            are generated when class is instantiated'''

         train = []
 
         bound = int(len(movie_reviews.sents(categories="pos"))*0.8)
         for i,sent in enumerate(movie_reviews.sents(categories="pos")[:bound]):
            train.append(("positive"," ".join(sent)))


         bound = int(len(movie_reviews.sents(categories="neg"))*0.8)
         for sent in movie_reviews.sents(categories="neg")[:bound]:
            train.append(("negative", " ".join(sent)))
            

         random.shuffle(train)

         x = opinionminer.OpinionMiner()

         x.trainClassifier(train)


         print 'Tags are generated: ',len(x.selective_pos.conditions())>0
         print 'Positive Adverbs exist:', (len(x.positive_adverbs) > 2)
         print 'Positive Adjectives exist:', (len(x.positive_adjectives)>2)
Example #2
   def _setDefaultPositiveNegativeWords(self):

      buff1 = self._loadData('positive_adjectives.bin')
      buff2 = self._loadData('positive_adverbs.bin')
      buff3 = self._loadData('negative_adverbs.bin')
      buff4 = self._loadData('negative_adjectives.bin')

      if buff1 and buff2 and buff3 and buff4:
         self.positive_adjectives = buff1
         self.positive_adverbs = buff2
         self.negative_adverbs = buff3
         self.negative_adjectives = buff4
         return

      #First compile list of positive adjectives & adverbs
      #by initially tagging all positive sentences with POS tagger
      tagger = speechtagger.SpeechTagger()
      processed_sents = []
      self.positive_adjectives = set()
      self.positive_adverbs = set()
      self.negative_adverbs = set()
      self.negative_adjectives = set()

      train_bound_pos = int(len(movie_reviews.sents(categories="pos"))*0.8)
      train_bound_neg = int(len(movie_reviews.sents(categories="neg"))*0.8)

      #***************positive******************#
      for sentence in movie_reviews.sents(categories="pos")[:train_bound_pos]:
         concat_sent = (" ".join(sentence)).lower()
         processed_sents.append(concat_sent)

      tagged_sents = tagger.tag(processed_sents) #TODO: Save to file

      for sentence in tagged_sents:
         for (word, tag) in sentence:
            if tag == 'ADJ' or word in self.selective_pos['ADJ']:
               self.positive_adjectives.add(word)
            elif tag == 'ADV' or word in self.selective_pos['ADV']:
               self.positive_adverbs.add(word)
         
      #**************negative*****************#
      processed_sents = []
      for sentence in movie_reviews.sents(categories="neg")[:train_bound_neg]:
         concat_sent = (" ".join(sentence)).lower()
         processed_sents.append(concat_sent)

      tagged_sents = tagger.tag(processed_sents) #TODO: Save to file

      for sentence in tagged_sents:
         for (word, tag) in sentence:
            if tag == 'ADJ' or word in self.selective_pos['ADJ']:
               self.negative_adjectives.add(word)
            elif tag == 'ADV' or word in self.selective_pos['ADV']:
               self.negative_adverbs.add(word)

      self._saveData('positive_adjectives.bin',self.positive_adjectives)
      self._saveData('positive_adverbs.bin', self.positive_adverbs)
      self._saveData('negative_adjectives.bin', self.negative_adjectives)
      self._saveData('negative_adverbs.bin', self.negative_adverbs)
Example #3
def Default_Dataset():
    documents = movies.fileids()
    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            default_sentence_set.append((doc_label, sent))
Example #4
def New_Dataset():
    documents = movies.fileids()
    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            new_sentence_set.append((doc_label, sent))
Example #5
    def opinion_features(fileid):
        """ starter feature engineering for movie reviews... """
        # many features are counts!
        #global rn
        #print("rev#",rn)
        
        #rn += 1
        positive_count=0
        negative_count=0

        for word in movie_reviews.words(fileid):
            if word in pos_set: positive_count += 1
            if word in neg_set: negative_count += 1

        for s in movie_reviews.sents(fileid):
            # s == list of words in one sentence
            sentence = " ".join(s)
            tb = textblob.TextBlob(sentence)
            polarity = tb.polarity
            subjectivity = tb.subjectivity

        # here is the dictionary of features...
        features = {'positive': positive_count,
                    'negative': negative_count,
                    'polarity': polarity,
                    'subjectivity': subjectivity}

        return features
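# A sketch (not from the original source) of how the assumed globals pos_set and
# neg_set used by opinion_features() above could be built, here from NLTK's
# opinion_lexicon (requires nltk.download('opinion_lexicon')); textblob is the
# module the function relies on for its polarity/subjectivity features.
from nltk.corpus import movie_reviews, opinion_lexicon
import textblob

pos_set = set(opinion_lexicon.positive())   # Bing Liu's positive opinion words
neg_set = set(opinion_lexicon.negative())   # Bing Liu's negative opinion words

# Example call, assuming opinion_features is accessible at this scope:
# print(opinion_features(movie_reviews.fileids('pos')[0]))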
def get_word2vec(
        train_fn="data/rap/input.txt",
        saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
            clean = TextLoader.clean_str(data)
            lines = [line.split(" ") for line in clean.split('\n')]
            full_data = brown.sents() + movie_reviews.sents() + treebank.sents(
            ) + lines
            print "training word2vec model"
            model = Word2Vec(workers=8)
            model.build_vocab(full_data)
            for i in xrange(0, 5):
                print "epoch " + str(i + 1)
                # full_data = shuffle(full_data)
                pb = ProgressBar(maxval=len(full_data))
                chunk_size = len(full_data) / 100
                j = 0
                pb.start()
                while j + chunk_size < len(full_data):
                    model.train(full_data[j:j + chunk_size])
                    j += chunk_size
                    pb.update(j)

            print "done training"
            model.save(saved_model_fn)
            return model
Example #7
def evaluate_features(feature_extractor, N, only_acc=False):
    from nltk.corpus import movie_reviews
    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.metrics import precision, recall, f_measure
    from sys import stdout
    
    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')
    negfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])),
                 'neg') for f in negative]

    posfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])),
                 'pos') for f in positive]
    negtrain, negtest = stratifiedSamples(negfeats, N)
    postrain, postest = stratifiedSamples(posfeats, N)

    trainfeats = negtrain + postrain
    testfeats = negtest + postest
    classifier = naive.train(trainfeats)
    if only_acc: return accuracy(classifier, testfeats)
    print 'accuracy: {}'.format(accuracy(classifier, testfeats))

    # Precision, Recall, F-measure
    from collections import defaultdict
    refsets = defaultdict(set)
    testsets = defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
        
    print 'pos precision:', precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', recall(refsets['pos'], testsets['pos'])
    print 'pos F-measure:', f_measure(refsets['pos'], testsets['pos'])
    print 'neg precision:', precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', recall(refsets['neg'], testsets['neg'])
    print 'neg F-measure:', f_measure(refsets['neg'], testsets['neg'])
    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
def Initialize_SentimentAnalyzer():
    pos_docs = movies.fileids('pos')
    neg_docs = movies.fileids('neg')
    classifier_training = []

    for doc in pos_docs:
        sents = movies.sents(doc)
        for sent in sents:
            tagged = t2.tag(sent)
            words = [w for w, k in tagged]
            tags = [k for w, k in tagged]
            feature = {}
            for i in range(len(words) - 1):
                feature[words[i] + ' ' +
                        words[i + 1]] = tags[i] + ' ' + tags[i + 1]

            temp = (feature, 'pos')
            classifier_training.append(temp)

    for doc in neg_docs:
        sents = movies.sents(doc)
        for sent in sents:
            tagged = t2.tag(sent)
            words = [w for w, k in tagged]
            tags = [k for w, k in tagged]
            feature = {}
            for i in range(len(words) - 1):
                feature[words[i] + ' ' +
                        words[i + 1]] = tags[i] + ' ' + tags[i + 1]

            temp = (feature, 'neg')
            classifier_training.append(temp)

    random.shuffle(classifier_training)
    train_set = classifier_training
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    return classifier
Example #10
    def _init_train(self):
        lemmas = [
            tup[0].split() for tup in self.db.loadProcessed("lemmatized")
        ]

        model = FastText(min_count=5)
        model.build_vocab(brown.sents())
        model.train(
            brown.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(treebank.sents(), update=True)
        model.train(
            treebank.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(movie_reviews.sents(), update=True)
        model.train(
            movie_reviews.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(lemmas, update=True)
        model.train(
            lemmas,
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        return model
Example #11
def make_mech_turk_entry():
    fileid = movie_reviews.fileids(categories=category)[fileid_num]
    turk_entry_filename = "entry-%s" % fileid.replace("/", "-")

    with open(turk_entry_filename, "wb") as f:
        f.write(preamble)
        f.write("\n\n\n<h1>Review</h1>\n\n<p>")
        f.write(movie_reviews.raw(fileid).replace("\n", "<br/>\n"))
        f.write("</p>\n")

        f.write("\n\n\n<h1>Select Summary Sentence</h1>\n\n<p>")
        for i, sent in enumerate(movie_reviews.sents(fileid)):
            opentag = '<input type="radio" name="%s" value="sent%i">' % (fileid, i)
            taginner = string.join(sent, " ")
            closetag = "</input><br/>\n"
            f.write(opentag + taginner + closetag)
        f.write("</p>\n")
def Initialize_SentimentAnalyzer():
    documents = movies.fileids()
    classifier_training = []

    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            tagged = t2.tag(sent)
            pairs = [(w, k) for w, k in tagged]
            feature = {}
            for i in range(len(pairs) - 1):
                feature[pairs[i][0] + ' ' +
                        pairs[i + 1][0]] = pairs[i][1] + ' ' + pairs[i + 1][1]

            temp = (feature, doc_label)
            classifier_training.append(temp)

    random.shuffle(classifier_training)
    train_set = classifier_training
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    return classifier
Example #13
import pandas as pd
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer

#print(movie_reviews.fileids())


def func(list0):
    sents = []
    for blist in list0:
        fsent = ""
        for slist in blist:
            fsent += slist + " "
        sents.append(fsent)
    return sents


sents0 = movie_reviews.sents("neg/cv000_29416.txt")
sents1 = movie_reviews.sents("pos/cv041_21113.txt")

texts0 = func(sents0)
texts1 = func(sents1)

texts2 = texts0 + texts1

vec = CountVectorizer()
vec.fit(texts2)

print([w for w in sorted(vec.vocabulary_.keys())])

print(
    pd.DataFrame(vec.transform(texts2).toarray(),
                 columns=sorted(vec.vocabulary_.keys())))
def trainAllClassifiers():
    #Get all positive and negative sentences.
    #Note: The "encode/decode" statement is used to convert the unicode representation of the text to an
    #ASCII representation. The "apply_features()" method throws an error if this isn't done, most
    #likely because Python 3 strings are unicode while Python 2 strings are byte strings.
    print("Splitting positive and negative documents...")
    positive_docs = [
        ([string.encode('ascii', 'ignore').decode('ascii')
          for string in sent], 'pos')
        for sent in movie_reviews.sents(categories='pos')
    ]
    negative_docs = [
        ([string.encode('ascii', 'ignore').decode('ascii')
          for string in sent], 'neg')
        for sent in movie_reviews.sents(categories='neg')
    ]
    #obj_docs = [(sent.encode('ascii', 'ignore').decode('ascii'), 'obj') for sent in subjectivity.sents(categories='obj')]

    #Randomly split data sets into train and test sets.
    train_pos, test_pos = train_test_split(positive_docs,
                                           test_size=1000,
                                           train_size=4000)
    train_neg, test_neg = train_test_split(negative_docs,
                                           test_size=1000,
                                           train_size=4000)

    #Aggregate train and test data sets.
    train = train_pos + train_neg
    test = test_pos + test_neg

    #Create a sentiment analyzer to analyze the text documents. This analyzer
    #provides an abstraction for managing a classifier, and feature extractor.
    #It also provides convenient metrics on classifier performance.
    sentim_analyzer = SentimentAnalyzer()
    #Mark negations in the tokenized training text, and count all negative words.
    #all_words() returns all tokens from the document, which is used to create a set
    #of features with a feature extractor.
    print("Creating feature set...")
    all_words_with_neg_tags = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in train])
    #Create the unigram features, keeping only features that occur at least min_freq times.
    unigram_features = sentim_analyzer.unigram_word_feats(
        all_words_with_neg_tags, min_freq=2)

    #Save the unigram feature list to a file so it can be used later.
    #These features need to be applied to the email set.
    f = open("./bow_features.pkl", "w")
    pickle.dump(unigram_features, f)
    f.close()

    #Create a feature extractor based on the unigram word features created.
    #The unigram feature extractor is found in the sentiment utils package.
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_features)
    #Create feature-value representations of the data.
    train_set = sentim_analyzer.apply_features(train)
    test_set = sentim_analyzer.apply_features(test)

    #Collect some memory.
    positive_docs = None
    negative_docs = None
    gc.collect()

    #Note, training may take a long time.
    #Create a trainer and train the sentiment analyzer on the training set.
    print("Beginning the classifier training...")

    #SVM
    startTime = time.time()
    print("Linear Support Vector Machine.")
    clf = SklearnClassifier(LinearSVC())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "lsvm")
    saveMetricsToFile("lsvm", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Naive Bayes
    startTime = time.time()
    print("Naive Bayes.")
    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "nb")
    saveMetricsToFile("nb", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Stochastic Gradient Descent. (Takes the least amount of time to train.)
    startTime = time.time()
    print("Stochastic Gradient Descent.")
    clf = SklearnClassifier(SGDClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "sgd")
    saveMetricsToFile("sgd", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #SVM
    startTime = time.time()
    print("RBF Support Vector Machine.")
    clf = SklearnClassifier(svm.SVC(kernel='rbf'))
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "svm")
    saveMetricsToFile("svm", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Multinomial Naive Bayes.
    startTime = time.time()
    print("Multinomial Naive Bayes.")
    clf = SklearnClassifier(MultinomialNB())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "mnb")
    saveMetricsToFile("mnb", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Logistic Regression.
    startTime = time.time()
    print("Logistic Regression.")
    clf = SklearnClassifier(LogisticRegression())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "lr")
    saveMetricsToFile("lr", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Decision tree
    startTime = time.time()
    print("Decision Tree.")
    clf = SklearnClassifier(DecisionTreeClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "dt")
    saveMetricsToFile("dt", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Random Forest.
    startTime = time.time()
    print("Random Forest.")
    clf = SklearnClassifier(RandomForestClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "rf")
    saveMetricsToFile("rf", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Adaboost
    startTime = time.time()
    print("Ada Boost")
    clf = SklearnClassifier(AdaBoostClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "ab")
    saveMetricsToFile("ab", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."
def classifyOn1000Examples(binary=False):
    print("Splitting positive and negative documents...")
    positive_docs = [
        ([string.encode('ascii', 'ignore').decode('ascii')
          for string in sent], 'pos')
        for sent in movie_reviews.sents(categories='pos')
    ]
    negative_docs = [
        ([string.encode('ascii', 'ignore').decode('ascii')
          for string in sent], 'neg')
        for sent in movie_reviews.sents(categories='neg')
    ]
    #Randomly split data sets into train and test sets.
    train_pos, test_pos = train_test_split(positive_docs,
                                           test_size=500,
                                           train_size=4000)
    train_neg, test_neg = train_test_split(negative_docs,
                                           test_size=500,
                                           train_size=4000)

    #Aggregate train and test data sets.
    test = test_pos + test_neg

    #Create a sentiment analyzer to analyze the text documents. This analyzer
    #provides an abstraction for managing a classifier, and feature extractor.
    #It also provides convenient metrics on classifier performance.
    sentim_analyzer = SentimentAnalyzer()
    #Mark negations in the tokenized training text, and count all negative words.
    #all_words() returns all tokens from the document, which is used to create a set
    #of features with a feature extractor.
    print("Creating feature set...")
    f = open("./bow_features.pkl", "r")
    unigram_features = pickle.load(f)
    f.close()

    #Create a feature extractor based on the unigram word features created.
    #The unigram feature extractor is found in the sentiment utils package.
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_features)
    #Create feature-value representations of the data.
    test_set = sentim_analyzer.apply_features(test)

    #Make a dict to hold predicted labels.
    testDict = {"test_labels": []}
    for sent in test_set:
        if binary == True:
            if sent[1] == "pos":
                testDict["test_labels"].append(1)
            else:
                testDict["test_labels"].append(-1)
        else:
            testDict["test_labels"].append(sent[1])

    print("Beginning classification...")
    classifierResultsDict = {key: [] for key in classifierNamesList}
    for classifierKey in classifierNamesList:
        print("Starting classifier: " + classifierKey)
        classifier = loadModel(classifierKey)
        for sent in test_set:
            label = classifier.classify(sent[0])
            if binary == True:
                if label == "pos":
                    classifierResultsDict[classifierKey].append(1)
                else:
                    classifierResultsDict[classifierKey].append(-1)
            else:
                classifierResultsDict[classifierKey].append(label)

    return pd.DataFrame(classifierResultsDict), pd.DataFrame(testDict)
Example #16
from sklearn.model_selection import train_test_split

# For the classifier, to tell between pos and neg since the NN uses numbers.
posNegDict = {'pos': 0, 'neg': 1}
numToCatDict = {0: 'pos', 1: 'neg'}

# Collapse and average the word2vec vectors, removing words that aren't in the vocabulary.
def averageVectors(vec, words):
    words = [wd for wd in words if wd in vec.wv.index_to_key]
    if len(words) != 0:
        return np.average(vec.wv[words], axis=0)
    else:
        return None

# Gather the documents with their classifications in numeric form.
document = [(Word2Vec(movie_reviews.sents(file), min_count=1), movie_reviews.words(file), posNegDict[category]) for file in movie_reviews.fileids() for category in movie_reviews.categories(file)]
# Randomize the document order so that the data isn't biased.
shuffle(document)

# Gather user input
userInput = []
userRaw = []
i = 0
for line in open("classifyUserInput.txt"):
    userRaw.append(word_tokenize(line))
    # Word2Vec expects a list of tokenized sentences, so wrap the single sentence in a list.
    userInput.append(averageVectors(Word2Vec([word_tokenize(line)], min_count=1), userRaw[i]))
    i = i + 1

# Separate the vectors and the classification
x = np.array([averageVectors(x[0], x[1]) for x in document])
y = np.array([y[2] for y in document])
Example #17
print("condll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Guttenberg to sents")
treebank_corb_sents = treebank.sents()
print("Freebank to sents")
reuters_corp_sents = reuters.sents()
print("Reuters to sents")
webtext_corp_sents = webtext.sents()
print("Webtext to sents")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

print("Cleaning data ...")
Example #18
import pandas
import pandasql
from nltk import sent_tokenize  
from nltk import word_tokenize 
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


movWords=[]
movTarget=[]
for sentence in movie_reviews.sents():
    for word in sentence:
        types=[b.lexname() for b in wn.synsets(word)]
        if 'adj.all' in types and word!='i' and 'noun.quantity' not in types and all(['verb' not in type1 for type1 in types]) and 'noun.food' not in types:
            movWords.append(word)
            movTarget.append('mov')
#load the movie reviews data, excluding verbs and foods

filename="filenames.csv"#the recipe id stored in this file
recipeIds=pandas.read_csv(filename,encoding='ISO-8859-1')['filenames'].values#load the file
dictInfo={}#create a dic for the word info
for recipeId in recipeIds:#go through each recipe
    print recipeId
    filename="reviews/reviews"+str(recipeId)+".csv"#this is our keywords
    reviews=pandas.read_csv(filename,encoding='ISO-8859-1')#load the file
    #get the reviews
    recRev = sent_tokenize(' '.join([review.lower().replace('c.','cup').replace('tspn.','teaspoon').replace(
__author__ = 'a_medelyan'

# Goal: Get movie reviews and read them
# See: http://www.nltk.org/book/ch02.html

from nltk.corpus import movie_reviews

# How many documents in this corpus?
print len(movie_reviews.fileids())

# What are the categories?
print movie_reviews.categories()

# What are some files names?
print movie_reviews.fileids('neg')[:10]
print movie_reviews.fileids('pos')[:10]

# Print the words in a sample text
print movie_reviews.words('pos/cv000_29590.txt')

# Print the original text
print movie_reviews.raw('pos/cv000_29590.txt')

# Print the sentences of the text
print movie_reviews.sents('pos/cv000_29590.txt')

# Spare time? Calculate the average number of words and sentences in positive and negative reviews
# Do people use a lot more words when giving positive vs. negative reviews?
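# One possible answer to the exercise above (a sketch, not part of the original
# tutorial): average number of words and sentences per review in each category.
for category in movie_reviews.categories():
    fileids = movie_reviews.fileids(category)
    avg_words = sum(len(movie_reviews.words(f)) for f in fileids) / float(len(fileids))
    avg_sents = sum(len(movie_reviews.sents(f)) for f in fileids) / float(len(fileids))
    print("%s: %.1f words, %.1f sentences per review" % (category, avg_words, avg_sents))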
Example #20
def find_summary_sentence(parser, fileid=None, localfile=None):
    """Finds the summary sentence for a body of text, specified by fileid or by
    localfile. fileid is accessed by NLTK.corpus.movie_reviews; localfile is 
    a path to a non-NLTK text file"""

    #load feature/opinion keywords and their respective ranks
    opinion_ranks = load_opinion_keywords()
    feature_ranks = load_feature_keywords()
    proper_noun_rank = 2
    
    #convert feature/opinions words to set, for quickly checking membership
    feature_words = set(feature_ranks.keys())
    opinion_words = set(opinion_ranks.keys())

    #load movie review as a list of sentence. (each sent is a list of words)
    if fileid and (not localfile):
        source = movie_reviews.sents(fileid)
    elif (not fileid) and localfile:
        source = open_file_as_sentences(localfile, feature_words, opinion_words)
    else:
        print "Please enter an nltk fileid, or the name of a local textfile"
        return

    #filter review for sentences containing a feature and an opinion
    summary_sents = [[word.rstrip(string.punctuation) for word in sent 
                        if word.rstrip(string.punctuation) != ''] 
                        for sent in source
                        if (set(sent) & opinion_words != set()) and 
                        ((set(sent) & feature_words != set()) or 
                            len(find_proper_nouns(sent)) > 0)]

    summary_sents_with_feature_opinion_dist = []
    for sent in summary_sents:
        try:
            feature, feature_rank = None, 10000
            opinion, opinion_rank = None, 10000
            sent_str = string.join(sent, ' ')
            proper_nouns = set(find_proper_nouns(sent)) #unique to each sentence

            #find the best opinion/feature in the sentence
            for word in sent:
                if (word in opinion_words) and opinion_ranks[word] < opinion_rank:
                    opinion = word
                    opinion_rank = opinion_ranks[word]
                elif (word in feature_words) and feature_ranks[word] < feature_rank:
                    feature = word
                    feature_rank = feature_ranks[word]
                elif (word in proper_nouns) and proper_noun_rank < feature_rank :
                    feature = word
                    feature_rank = proper_noun_rank

            #keep track of distance btwn feature/opinion for each sentence
            if feature and opinion:
                distance = dist_btwn_feature_and_opinion(feature, opinion, sent_str, parser)
                summary_sents_with_feature_opinion_dist.append((distance, sent_str))
        except JavaException:
            # print "Failure: sentence is too long (len = %i)" % len(sent)
            pass
        except AssertionError:
            # print "Failure: could not find root"
            pass

    #best summary sentences is the one with closest feature/opinion
    summary_sents_with_feature_opinion_dist.sort()
    if len(summary_sents_with_feature_opinion_dist) > 0:
        return summary_sents_with_feature_opinion_dist[0][1]
    else:
        return None
Example #21
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

    
def singles(words):
        if len(words) < 1:
            return
        for w in words:
            if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
                yield w

def doubles(sentences):
    for s in sentences:
        s = s.split(' ')
Example #22
# Module 5: Advanced Topics
# Gensim

from gensim.models import Word2Vec
from nltk.corpus import movie_reviews

embedding = Word2Vec(movie_reviews.sents(), min_count=1, size=10)

print(embedding.most_similar('man', topn=5))
print(embedding.most_similar('woman', topn=5))

# embedding.save('movie_model')
# embedding2 = Word2Vec.load('movie_model')
# print(embedding2.most_similar('man', topn=5))
Example #23
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]
#test = lemma().lemmatize([i for i,j in documents])

random.shuffle(documents)

# DEFINE WORDS AS KEYS AND OCCURRENCES AS VALUES
#word_features = FreqDist(chain(*[i for i,j in documents])) #from itertools import chain
word_features = FreqDist([x for y, z in documents for x in y])
word_features = list(word_features.keys())  #[:1000]

# TERM-DOC MATRIX, SAMPLING TRAIN AND TEST SETS AT 80-20
numtrain = int(len(documents) * 80 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

# RUN CLASSIFIER AND RETURN PERFORMANCE MEASURES
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set) * 100)
classifier.show_most_informative_features(5)
""" MODELLING NLTK MOVIE REVIEWS - NB, WORD2VEC """
w2v = Word2Vec(mr.sents())
w2v.most_similar("damon", topn=5)
posts = nltk.corpus.nps_chat.xml_posts();
featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts];
t0= nltk.DefaultTagger('NN');
t1= nltk.UnigramTagger(featuresets, backoff=t0);
t2= nltk.BigramTagger(featuresets, backoff= t1);

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

pos_docs = movies.fileids('pos');
neg_docs = movies.fileids('neg');
classifier_training=[];

for doc in pos_docs:
    sents = movies.sents(doc);
    for sent in sents:
        tagged = t2.tag(sent);
        words = [w for w,k in tagged];        
        tags = [k for w,k in tagged];
        feature={};
        for i in range(len(words)-1):
            feature[words[i]+ ' ' + words[i+1]] = tags[i]+ ' ' + tags[i+1];
        
        temp = (feature, 'pos');
        classifier_training.append(temp);

##print('pos data acquired !');

for doc in neg_docs:
    sents = movies.sents(doc);
Example #25
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 28 12:08:25 2018

@author: ritesh
"""

#importing libraries
import os
import gensim
from nltk.corpus import movie_reviews

#getting data
sentences = movie_reviews.sents()

#training word2vec model
model = gensim.models.Word2Vec(sentences, min_count=1)
#model.save('Mreview_model')

#load and test

#model = gensim.models.Word2Vec.load('Mreview_model')
#words most similar to mother
print("Most Similar:", model.most_similar('mother'))

#find the odd one out
print(model.doesnt_match("breakfast cereal dinner lunch".split()))
print(model.doesnt_match("cat dog table".split()))

#vector representation of word human
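# A sketch completing the comment above: with the older gensim API used in this
# snippet the raw vector can be read by indexing the model directly (newer
# versions expose it as model.wv['human']).
print(model['human'])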
Example #26
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding brown sentence structures ({0})...".format(len(brown.sents()))
for sentence in brown.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding movie_review sentence structures ({0})...".format(len(movie_reviews.sents()))
for sentence in movie_reviews.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),

print "Processed {0} sentences".format(processed_count)
print "Current Structure total: {0}".format(len(sentences))

print "Saving structures to text file"
f = open('./sentence_structures.txt', 'w')
Example #27
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

from nltk.corpus import movie_reviews as movies
pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')
classifier_training = []

for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' +
                    words[i + 1]] = tags[i] + ' ' + tags[i + 1]

        temp = (feature, 'pos')
        classifier_training.append(temp)

print('pos data acquired !')

for doc in neg_docs:
Example #28
"""Following the tutorial on http://www.nltk.org/howto/sentiment.html"""

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

n_instances = 1000
pos_sent = [(sent, 'pos')
            for sent in movie_reviews.sents(categories='pos')][:n_instances]
neg_pos = [(sent, 'neg')
           for sent in movie_reviews.sents(categories='neg')][:n_instances]

print(pos_sent[:30])
print(neg_pos[:30])
# Split positive and negative instances to keep a balanced distribution in both train and test sets

train_subj_docs = pos_sent[:80]
test_subj_docs = pos_sent[80:1000]

train_obj_docs = neg_pos[:80]
test_obj_docs = neg_pos[80:1000]

testing_docs = test_subj_docs + test_obj_docs
training_docs = train_subj_docs + train_obj_docs

# Handles negation
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(
    [mark_negation(doc) for doc in training_docs])
Example #29
from nltk.corpus import sentiwordnet as wdn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

pp = pprint.PrettyPrinter(indent=4)

neg, pos = movie_reviews.categories()

new_phrases = []
for ids in movie_reviews.fileids(neg):
    for phrase in movie_reviews.sents(ids)[1:]:
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'neg',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
                'neg_score': 0.0,
                'over_score': 0.0
            })
for ids in movie_reviews.fileids(pos):
    for phrase in movie_reviews.sents(ids):
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'pos',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
Example #30
def preprocess():
    train_data, train_label = [], []
    model = Word2Vec(movie_reviews.sents())

    path = os.getcwd() + "/aclImdb/train/pos/"
    for filename in os.listdir(path):
        pos_file = open("aclImdb/train/pos/" + filename, 'r')
        for line in pos_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            train_data.append(review)
            train_label.append((0, 1))
        pos_file.close()

    path = os.getcwd() + "/aclImdb/train/neg/"
    for filename in os.listdir(path):
        neg_file = open("aclImdb/train/neg/" + filename, 'r')
        for line in neg_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            train_data.append(review)
            train_label.append((1, 0))
        neg_file.close()

    print len(train_data), len(train_label)
    x_train = np.array([np.array(xi) for xi in train_data])
    y_train = np.array(train_label)
    print x_train.shape, x_train[0].shape, x_train[1].shape
    print y_train.shape, y_train[0].shape, y_train[1].shape
    np.save('train_data.npy', x_train)
    np.save('train_label.npy', y_train)

    test_data, test_label = [], []

    path = os.getcwd() + "/aclImdb/test/pos/"
    for filename in os.listdir(path):
        pos_file = open("aclImdb/test/pos/" + filename, 'r')
        for line in pos_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            test_data.append(review)
            test_label.append((0, 1))
        pos_file.close()

    path = os.getcwd() + "/aclImdb/test/neg/"
    for filename in os.listdir(path):
        neg_file = open("aclImdb/test/neg/" + filename, 'r')
        for line in neg_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            test_data.append(review)
            test_label.append((1, 0))
        neg_file.close()

    print 'finish loading data'
    print len(test_data), len(test_label)
    x_test = np.array([np.array(xi) for xi in test_data])
    y_test = np.array(test_label)
    np.save('test_data.npy', x_test)
    np.save('test_label.npy', y_test)
    return [
        np.asarray(x_train),
        np.asarray(y_train),
        np.asarray(x_test),
        np.asarray(y_test)
    ]
Example #31
def find_summary_sentence(parser, fileid=None, localfile=None):
    """Finds the summary sentence for a body of text, specified by fileid or by
    localfile. fileid is accessed by NLTK.corpus.movie_reviews; localfile is 
    a path to a non-NLTK text file"""

    #load feature/opinion keywords and their respective ranks
    opinion_ranks = load_opinion_keywords()
    feature_ranks = load_feature_keywords()
    proper_noun_rank = 2

    #convert feature/opinions words to set, for quickly checking membership
    feature_words = set(feature_ranks.keys())
    opinion_words = set(opinion_ranks.keys())

    #load movie review as a list of sentence. (each sent is a list of words)
    if fileid and (not localfile):
        source = movie_reviews.sents(fileid)
    elif (not fileid) and localfile:
        source = open_file_as_sentences(localfile, feature_words,
                                        opinion_words)
    else:
        print("Please enter an nltk fileid, or the name of a local textfile")
        return

    #filter review for sentences containing a feature and an opinion
    summary_sents = [[
        word.rstrip(string.punctuation) for word in sent
        if word.rstrip(string.punctuation) != ''
    ] for sent in source if (set(sent) & opinion_words != set()) and (
        (set(sent)
         & feature_words != set()) or len(find_proper_nouns(sent)) > 0)]

    summary_sents_with_feature_opinion_dist = []
    for sent in summary_sents:
        try:
            feature, feature_rank = None, 10000
            opinion, opinion_rank = None, 10000
            sent_str = ' '.join(sent)
            proper_nouns = set(
                find_proper_nouns(sent))  #unique to each sentence

            #find the best opinion/feature in the sentence
            for word in sent:
                if (word in opinion_words
                    ) and opinion_ranks[word] < opinion_rank:
                    opinion = word
                    opinion_rank = opinion_ranks[word]
                elif (word
                      in feature_words) and feature_ranks[word] < feature_rank:
                    feature = word
                    feature_rank = feature_ranks[word]
                elif (word
                      in proper_nouns) and proper_noun_rank < feature_rank:
                    feature = word
                    feature_rank = proper_noun_rank

            #keep track of distance btwn feature/opinion for each sentence
            if feature and opinion:
                distance = dist_btwn_feature_and_opinion(
                    feature, opinion, sent_str, parser)
                summary_sents_with_feature_opinion_dist.append(
                    (distance, sent_str))
        except JavaException:
            # print "Failure: sentence is too long (len = %i)" % len(sent)
            pass
        except AssertionError:
            # print "Failure: could not find root"
            pass

    #best summary sentences is the one with closest feature/opinion
    summary_sents_with_feature_opinion_dist.sort()
    if len(summary_sents_with_feature_opinion_dist) > 0:
        return summary_sents_with_feature_opinion_dist[0][1]
    else:
        return None
Example #32
 def word2vec_processing(self, corpora='treebank'):
     print("Start word2vec training...")
     self.t = Word2Vec(movie_reviews.sents())
     print("Word2vec training is finished")
Example #33
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews
import os
data_folder = r"""C:\Users\K1774755\King's College London\Cognitive Impairment in Schizophrenia - Documents\Courses\CUSMUMH\week 7 - NLP_courses_and_tutorials with nltk & spacy"""
pycharm_folder = r'C:\Users\K1774755\PycharmProjects\toy-models\NLP'

# Let's generate word vectors over the Brown corpus text.
# We will have 20 dimensions, using a window of five for the context words in the skip-grams
# (e.g. c1, c2, w, c3, c4).
# This might be a little slow (maybe 1-2 minutes).

# for the Brown corpus
b = Word2Vec(brown.sents(), size=400, window=10, min_count=5)
# for the movie review corpus
mr = Word2Vec(movie_reviews.sents(), size=20, window=5, min_count=3)

#Now we have the vectors, we can see how good they are by measuring which words are similar to each other.
b.wv.most_similar('company', topn=5)
mr.wv.most_similar('love', topn=5)
#Try altering the window and the dimension size, to see if you get better results.

#We can also do some arithmetic with the words. Let's try that classical result, king - man + woman.

b.wv.most_similar(positive=['biggest', 'small'], negative=['big'], topn=5)
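# The classical query mentioned above might look like this (a sketch; it only
# works if 'king', 'man' and 'woman' each occur at least min_count times in Brown):
b.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)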

#We can then load these in using Gensim; they might take a minute to load.
from gensim.models.keyedvectors import KeyedVectors
glove = KeyedVectors.load_word2vec_format(os.path.join(pycharm_folder,'glove.twitter.27B.25d.txt.bz2'), binary=False)
print("Done loading")

#Can you find any cool word combinations? What differences are there in the datasets?
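# One combination to try with the GloVe vectors loaded above (a sketch; assumes
# these lowercase tokens are present in the Twitter GloVe vocabulary):
print(glove.most_similar(positive=['paris', 'germany'], negative=['france'], topn=5))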
# -*- coding: utf-8 -*-
#!/usr/bin/env python 
from gensim.models import Word2Vec 
from nltk.corpus import brown, movie_reviews, treebank 

if __name__ == '__main__':
	brown_sentences = Word2Vec(brown.sents())
	movie_sentences = Word2Vec(movie_reviews.sents()) 
	treebank_sentences = Word2Vec(treebank.sents()) 

	print brown_sentences.most_similar('money', topn=5) 
	print movie_sentences.most_similar('money', topn=5) 
	print treebank_sentences.most_similar('money', topn=5) 

Example #35
    "나의": "너의",
    "당신은": "나는",
    "너의": "나의"}

def hugot_bot():
    print("안녕 이름이 뭐니?")
    chat = Chat(pairs, reflections)
    chat.converse()
hugot_bot()




import nltk
from nltk.corpus import movie_reviews
movie_reviews.sents()
sentences = [list(s) for s in movie_reviews.sents()]
sentences[0]
sentences[1]


# Frequency / proximity: extract nearby words using cosine similarity between vectors.
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences)
model.init_sims(replace=True)
model.similarity('actor', 'actress')
model.similarity('he', 'she')
model.similarity('actor', 'she')
model.similarity('actress', 'she')
model.most_similar('accident', topn=10) #the default is 10
Example #36
    return correct_texts


if __name__ == '__main__':
    with open("nytimes_news_articles.txt", "r") as f:
        corpus = f.read().splitlines()
        nytimes_corpus = []
        for line in corpus:
            for sent in sent_tokenize(line):
                if len(sent) != 0:
                    if word_tokenize(sent)[0] != "URL":
                        nytimes_corpus.append(word_tokenize(sent.lower()))

    corpus = nytimes_corpus
    for sent in movie_reviews.sents():
        corpus.append(sent)

    #cfd, cpd = bigram_language_model(movie_reviews.sents(), 3)
    print(len(corpus))
    trigram_model = ngram_language_model(corpus, 3)

    vocabulary_size = input('Please enter the vocabulary size.')

    vocabulary_size = int(vocabulary_size)
    if vocabulary_size > 9000:
        print('There are no words to learn.')
        pass

    else:
        thousands = vocabulary_size // 1000 + 1
Example #37
import sys, os, nltk, random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.sents(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
all_bigrams = nltk.FreqDist((w1.lower(),w2.lower()) for sent in \
       movie_reviews.sents() for (w1, w2) in \
       nltk.bigrams(sent))
word_features = all_words.keys()[:2000]
bigram_features = all_bigrams.keys()[:2000]


def document_features(document):
    document_words = set(word for sent in document for word in sent)
    document_bigrams = set(bg for sent in document \
                           for bg in nltk.bigrams(sent))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    for bigram in bigram_features:
        features['contains bigram(%s)' % str(bigram)] = \
                 (bigram in document_bigrams)
    return features


'''
Sentence classifier.  Goal is to break text up into sentences.  Use
punctuation marks to do so...
'''
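# A minimal sketch (an illustration only, not the original module's code) of the
# punctuation-based sentence splitter described in the docstring above:
import re

def naive_sent_split(text):
    # Split after ., ! or ? followed by whitespace; the punctuation is kept.
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

# naive_sent_split("Great film. Would watch again!")
# -> ['Great film.', 'Would watch again!']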
Example #38
def get_index_sequences(Ids, voc):
    sentences = []
    for fid in Ids:
        Sentences = movie_reviews.sents(fileids=fid)
        sentences.extend([np.array([voc.get(w.lower(), 0) for w in s]) for s in Sentences])
    return sentences
import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

b = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
t = Word2Vec(treebank.sents())

print(b.most_similar('money', topn=5))

print('aew')
# Module 3: Corpus
# Movie Review Corpus
# Author: Dr. Alfred

from nltk.corpus import movie_reviews

# print(movie_reviews.fileids())
# print(movie_reviews.categories())

fileid = 'pos/cv971_10874.txt'

text = movie_reviews.raw(fileid)
print(text)

print(" Num of chars :", len(movie_reviews.raw(fileid)))
print(" Num of words :", len(movie_reviews.words(fileid)))
print(" Num of sentences :", len(movie_reviews.sents(fileid)))

print(" Categories:", movie_reviews.categories(fileid))
Example #42

from gensim.models import Word2Vec
from nltk.corpus import movie_reviews

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')


def word2vec(document, size=100):
    """
    :param document: a list of tokenized sentences,
                     e.g. [['first', 'sentence'], ['second', 'sentence']]
    :param size: dimensionality of the word vectors (gensim < 4.0 keyword)
    :return: a trained Word2Vec model
    """
    model = Word2Vec(sentences=document, min_count=1, size=size)
    return model


full_corpus = []

for i in negids:
    full_corpus.extend(movie_reviews.sents(i))

for i in posids:
    full_corpus.extend((movie_reviews.sents(i)))

print len(full_corpus)

print full_corpus[0]

print full_corpus[1]

print full_corpus[0][0]

model = word2vec(full_corpus,size=50)
print model['bad']
print model['good']