Example #1
def load_sentences_brown(nb_sentences=None):
    """
    :param nb_sentences: Use if all brown sentences are too many
    :return: index2word (list of string)
    """
    from nltk.corpus import brown
    import gensim

    print 'building vocab ...'

    if nb_sentences is None:
        sents = brown.sents()
    else:
        sents = brown.sents()[:nb_sentences]

    # I use gensim model only for building vocab
    model = gensim.models.Word2Vec()
    model.build_vocab(sents)
    vocab = model.vocab

    # ids: list of (list of word-id)
    ids = [[vocab[w].index for w in sent
            if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32]
           for sent in sents]

    return ids, model.index2word
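A minimal usage sketch of the function above (not part of the original example; it assumes gensim and the NLTK Brown corpus are installed and that the gensim version in use still exposes model.vocab as the code does):

ids, index2word = load_sentences_brown(nb_sentences=1000)
print(len(ids), 'sentences converted to word ids')
print(index2word[:10])  # first few vocabulary entries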
Example #2
    def clean():

        '''
        1. Removes any individual special character.
        2. Lowers all the words.
        :return: list of clean sentences
        '''

        sents = list(brown.sents())
        sents_copy = list(brown.sents())
        n = len(sents)
        print 'Removing special chars...'
        for i in range(0, n):
            for word in sents[i]:
                if not bool(re.search('[A-Za-z0-9]', word)):
                    sents_copy[i].remove(word)
        print 'Removed special chars.'
        sents = None

        print 'Lowercasing all the words...'
        for i in range(0, n):
            m = len(sents_copy[i])
            for j in range(0, m):
                sents_copy[i][j] = sents_copy[i][j].lower()
        print 'Lowered all the words.'
        return sents_copy
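A compact variant of the same cleaning, given only as an illustrative sketch (it assumes re and brown are imported as in the example, and builds new lists instead of removing words in place):

clean_sents = [
    [w.lower() for w in sent if re.search('[A-Za-z0-9]', w)]
    for sent in brown.sents()
]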
Example #3
File: toturial.py Project: Paul-Lin/misc
def print_brown():
    import nltk
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news','reviews'])
    news_text=brown.words(categories='news')
    fdist=nltk.FreqDist([w.lower() for w in news_text])
    modals=['can','could','may','might','must','will']
    for m in modals:
        print m+':',fdist[m]
Example #4
 def read_datas(self):
     brown_tagged_sentence  = brown.tagged_sents()
     brown_sent = brown.sents()
     size = int(len(brown_tagged_sentence) * 0.9)
     train_set =  brown_tagged_sentence[:size]
     test_set = brown_tagged_sentence[size:]
     return (train_set,test_set)
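A hedged sketch of what such a train/test split is commonly fed into; the UnigramTagger call is an assumption for illustration and not part of the original snippet:

import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents()
size = int(len(tagged) * 0.9)
train_set, test_set = tagged[:size], tagged[size:]
tagger = nltk.UnigramTagger(train_set)   # train on the first 90%
print(tagger.evaluate(test_set))         # score on the held-out 10%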
Example #5
File: main.py Project: lberezy/LangComp
def build_index(out_filename, in_filename = None):
    '''Builds data files for word lookup. Can take an optional input file
    to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''

    sents_data = []
    try:
        in_file = open(in_filename)
        sents_data += sent_tokenize(in_file.read())
        in_file.close()
    except:
        print("Warning: Failed to load external file for building.")

    sents_data += brown.sents() + treebank.sents()

    # get sentences, chop off their ambiguous heads, and look at their words!
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words to list of words
    mywords = [word for sent in mysents for word in sent]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up most frequent form of lowercase word by doing cfd['word'].max()
    # but need to check for existence of word in cfd first

    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()
Example #6
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print pos_features(brown.sents()[0], 8)

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print 'DecisionTree %f' % classify.accuracy(classifier, test_set)
Example #7
	def find_ngrams(self, n):
		""" Input: the 'n' of 'n-grams'

			Find all the n-grams in the brown corpus. Store in frequency dictionary.
			Optionally it can be decided to use more corpora in order to have more data.

			Note: these are of course n-grams based on going through the sentence from left to right
			If we want to give the correction back based on the dependency tree, we need to
			parse the brown corpus (or any other data set) with the dependency parser, so that
			we can use this data. 			

		"""
		
		total_ngram_count = 0
		ngram_freq_dict = {}

		sents = brown.sents()
		for sent in sents:
			sent = ['-START-']*(n-1)+sent
			ngrams_brown = ngrams(sent, n)
			
			for i in ngrams_brown:
				total_ngram_count += 1
				old = ngram_freq_dict.get(i,0)
				old += 1
				ngram_freq_dict[i] = old
				#print i,old

		return ngram_freq_dict, total_ngram_count
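Roughly the same counting can be done with nltk.FreqDist; the sketch below is illustrative only and not part of the original example (the slice keeps the run short):

from nltk import FreqDist, ngrams
from nltk.corpus import brown

n = 2
freq = FreqDist(
    gram
    for sent in brown.sents()[:1000]                           # small slice for speed
    for gram in ngrams(['-START-'] * (n - 1) + list(sent), n)
)
print(freq.most_common(5))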
Example #8
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                sent = [simplify_tag(t) for t in sent]
            norm = [nltk.tuple2str(t) for t in sent]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), 
                        file=log)
Example #9
def load_movie_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
Example #10
def lookupTagger():

    brown_tagged_sents = brown.tagged_sents(categories='news')
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)

    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)

    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
            backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()   
Example #11
def data_api(spilt_rate):
    raw_sent = brown.sents()
    partial_data = raw_sent[:int(0.1*len(raw_sent))]

    data_x, data_y = prepare_0(partial_data, word2intdict)

    print 'len data_x', len(data_x), len(data_y)

    train_inds = npr.choice(range(len(data_x)), size = int((1 - spilt_rate) * len(data_x)), replace = False)
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    print 'len train_inds', len(train_inds), len(data_x)
    for i in range(len(data_x)):
        if i in train_inds:
        	#print 'trn', i
            X_train.append(data_x[i])
            Y_train.append(data_y[i])
        else :
        	#print 'tst', i
            X_test.append(data_x[i])
            Y_test.append(data_y[i])
    print 'len X_train', len(X_train), len(X_test)
    return (X_train, Y_train), (X_test, Y_test)
Example #12
def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print f
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print 'start calculating tfidf'

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    pickle.dump(vectorizer, open('idf_vectorizer', 'w'))
    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'w'))
Example #13
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged words
    """
    brown_tagged_sents = brown.tagged_sents(categories = 'news', tagset='universal')
    brown_sents = brown.sents(categories = 'news')

    # open the review of a company, and print error message if company review doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/'+ company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')

    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
Example #14
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet= []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)
        
    
    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()      
Example #15
File: u3.py Project: atokop/compling
def brown_tagged_sents():
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    return (train_sents, brown_tagged_sents[size:])
Example #16
def get_valid_brown_corpus():
    global DIR
    DIR = BROWN_DIR
    genre = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
    sentences = brown.sents(categories=genre)
    sents = remove_bad_sents(sentences)
    sents = [[w.lower() for w in s] for s in sents]
    return sents
Example #17
def ic(w) :
	total = 0
	for sentence in b.sents():
		for word in sentence:
			total = total + 1
			brown_freqs[word.lower()] +=1
	
	print w.lower() ,":",brown_freqs[w.lower()], 1.0 - (math.log(brown_freqs[w.lower()]) / math.log(total+1))
Example #18
File: nGram.py Project: cglennk/nGrams
def uG():
    global uniCounter   #counts repeats of uniGrams
    global uniGram      #list of uniGrams
    global uniGrams     #counts uniGrams
    uniCounter = {}
    uniGram = []
    uniGrams = 0

        
    news = brown.sents(categories='editorial')
        
    for x in range (1, MAX, 1):
        
        sent = news[x]
        sent.append('</s>')    #ending sentences with '</s>'
        sent.insert(0, '<s>')  #beginning sentences with '<s>'
        
        for x in range (0,sent.count('.')+1,1):
            try:
                sent.remove('.')   #removing .'s
            except:
                pass
        for x in range (0,sent.count(',')+1,1):
            try:
                sent.remove(',')   #removing ,'s
            except:
                pass
        for x in range (0,sent.count("'")+1,1):
            try:
                sent.remove("'")   #removing ''s
            except:
                pass
        for x in range (0,sent.count('"')+1,1):
            try:
                sent.remove('"')   #removing ''s
            except:
                pass
        x = 0
        for word in sent:
            word = word.lower()  #making all letters lowercase
            sent[x] = word       #so differences dont occur when
            x = x+1              #they shouldn't

        value = '1'
        for x in range (0,len(sent),1):
            try:
                word = sent[x]
                if(word not in uniGram):
                    uniGram.append(word)
                    uniGrams = uniGrams + 1
                if (word in uniCounter):
                    value = uniCounter[word]
                    value = value + 1
                    uniCounter[word] = value
                else:
                    uniCounter[word] = 1
            except:
                    pass
Example #19
def Automated_Readability_Index(section):
	sents = len(brown.sents(categories = section))
	words = len(brown.words(categories = section))
	text = " ".join(brown.words(categories = section))
	letters = len(text)
	uw = letters / float(words) 
	us = words / float(sents) 
	ari = (4.71 * uw) + (0.5 * us) - 21.43
	return ari
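The constants above are the standard Automated Readability Index formula, ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43; note that joining the words with spaces counts the separators as characters too. An illustrative call (any valid Brown category name works):

print(Automated_Readability_Index('news'))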
Example #20
 def learn(self, listofsentences=[], n=2000):
     self.learned = defaultdict(mydict)
     if listofsentences == []:
         listofsentences = brown.sents()
     for i, sent in enumerate(listofsentences):
         if i >= n:  # Limit to the first nth sentences of the corpus
             break
         for word in sent:
             self.learned[self.specialhash(word)][word.lower()] += 1
Example #21
def collect_data_from_ptb_brow_duc2004():

    start_collect = time.time()
    samples = []
    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)

    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # except file .DStore in my macbook
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # except file .DStore in my macbook
                continue
            file_path = folder_path + "/" + cluster_name +"/"+ file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect))
    sys.stdout.flush()
    return samples
Example #22
def brown_corpus_word_frequency(targetWord):
    words = FreqDist()

    for sentence in brown.sents():
        for word in sentence:
            words.inc(word.lower())

    print words[targetWord]
    print words.freq(targetWord)
Example #23
    def __init__(self, file_name):
        self.embsize = int(file_name.split('_')[-1])
        self.model = None
        if os.path.isfile(file_name):
            self.model = word2vec.Word2Vec.load(file_name)

        if self.model is None:
            model = word2vec.Word2Vec(brown.sents(), size=self.embsize, window=5, min_count=5, workers=4)
            model.save(file_name)
Example #24
def main():
	corpussub = brown.sents()[:3000]
	#runCalc(corpussub)
	evalf = Evaluate()
	sents = map(lambda s: ' '.join(s), corpussub)
	finsents = reduce(lambda a,b: a + ' '+ b, sents)
	evalf.initBleu(sents)
	#ngramEval(finsents,evalf,10)
	#humanEval(finsents, evalf, "rawcorpus/humansentence.txt")
	evalAllHmm(finsents, evalf,"rawcorpus/andersen.txt",5)
Example #25
def exercise_brown():
    # Print the categories in the Brown corpus
    print brown.categories()
    # Print the words of the texts in the 'news' category
    print brown.words(categories="news")
    # Print the text 'cg22'
    print brown.words(fileids=["cg22"])
    # Print the sentences
    print brown.sents(categories=["news", "reviews"])

    """Compare the usage of modal verbs across different genres"""
    # Get the text
    news_text = brown.words(categories="news")
    # Build a frequency distribution over the words
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    # Define the list of modal verbs
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print m + ":", fdist[m]
Example #26
def calc_readability(corpus):
    texts = []
    results = []
    for fileid in corpus.fileids():
        sentlist = brown.sents(fileids=[fileid])
        text = ' '.join([ ' '.join(ss) for ss in sentlist ])
        texts.append(text)
    for text in texts:
        results.append(simple.get_text_stats(text)['read'])
    return results
Example #27
def lookupTagger(i):
	brown_tagged_sents = bn.tagged_sents(categories='news')
	brown_sents = bn.sents(categories='news')
	fd = nltk.FreqDist(bn.words(categories = 'news'))
	cfd = nltk.ConditionalFreqDist(bn.tagged_words(categories = 'news'))
	most_freq_words = fd.keys()[:i]
	likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
	baseline_tagger = nltk.UnigramTagger(model=likely_tags)
	evalResult = baseline_tagger.evaluate(brown_tagged_sents)
	print "Evaluation of lookupTagger for the size %d is: %f" %(i, evalResult)
Example #28
def preprocess(wikipedia_text):
	brown_text = brown.sents()
	brown_tagged = brown.tagged_sents()
	unigram_tagger = nltk.UnigramTagger(brown_tagged)
	bigram_tagger = nltk.BigramTagger(brown_tagged,backoff=unigram_tagger)
	paragraph_text = wikipedia_text.split('\n')
	paragraph_tagged = [tp for p in paragraph_text for tp in bigram_tagger.tag(nltk.word_tokenize(p.translate(None,'.')))]
	#print paragraph_tagged

	return paragraph_tagged
Example #29
 def __init__(self):
     """Initialize your data structures in the constructor."""
     self.bigramCounts = collections.defaultdict(lambda : 0)
     self.unigramCounts = collections.defaultdict(lambda : 1)
     self.continuationCounts = collections.defaultdict(lambda: 0)
     self.followingCounts = collections.defaultdict(lambda: 0)
     self.total = 1
     print "Training Language Model..."
     self.train(brown.sents())
     print "--Training Complete--"
Example #30
File: chapter5.py Project: hbdhj/python
def  nGramTagging():
    print "=============== Unigram Tagging  ==============="
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    print unigram_tagger.tag(brown_sents[2007])
    print unigram_tagger.evaluate(brown_tagged_sents)

    print "=============== Separating the Training and Testing Data  ==============="
    size = int(len(brown_tagged_sents) * 0.9)
    print size
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    unigram_tagger = nltk.UnigramTagger(train_sents)
    print unigram_tagger.evaluate(test_sents)

    print "=============== General N-Gram Tagging  ==============="
    bigram_tagger = nltk.BigramTagger(train_sents)
    print bigram_tagger.tag(brown_sents[2007])
    unseen_sent = brown_sents[4203]
    print bigram_tagger.tag(unseen_sent)
    print bigram_tagger.evaluate(test_sents)


    print "=============== Combining Taggers ==============="
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print t2.evaluate(test_sents)


    print "=============== Tagging Across Sentence Boundaries ==============="
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print t2.evaluate(test_sents)
Example #31
    entropy = -1 * mean
    perplexity = pow(2.0, entropy)
    return perplexity


def avg_sent_perplexity(corpus, lm):
    perplexities = []
    for sent in corpus:
        ngrams = [ngram for ngram in sent]
        perplexities.append(lm.perplexity(ngrams))
    return sum(perplexities) / len(perplexities)


if __name__ == '__main__':
    args = parse_args()

    lm = Laplace(args.n)  # smoothing

    if args.train is not None:
        train_corpus = load_corpus(args.train)
    else:
        train_corpus = brown.sents()
    train, vocab = padded_everygram_pipeline(args.n, train_corpus)
    lm.fit(train, vocab)

    for test_file in args.corpora:
        test_corpus = load_corpus(test_file)
        test, vocab = padded_everygram_pipeline(args.n, test_corpus)
        perplexity = avg_sent_perplexity(test, lm)
        print('{}: {}'.format(test_file, perplexity))
Example #32
from nltk.corpus import brown
import nltk
print(brown.categories())
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=' ')

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
print()
print(cfd.tabulate(conditions=genres, samples=modals))
Example #33
from nltk.corpus import brown
import statistics, viterbi_algo

corpus_tagged_sentences = brown.tagged_sents(categories='news')
corpus_sentences = brown.sents(categories='news')

training_size = round(len(corpus_sentences) * 0.9)
training_set = corpus_tagged_sentences[:training_size]
test_set = corpus_tagged_sentences[training_size:][:100]
untagged_test_set = corpus_sentences[training_size:][:100]

corpus_size = len(brown.words(categories='news'))

# Constants
START_1 = "START_1"
START_2 = "START_2"
STOP = "STOP"
TIMES = "times"
TAG_TO_UNKNOWN_WORD = "NN"
COMMON_TAGS = 20


# --------------------- Helper Function ---------------------
def get_common_tags(corpus):
    """
    :param corpus: a corpus
    :return: set with the common tags of the corpus
    """
    # Counts number of occurrences of each tag
    tags = {}
    for sen in corpus:
Example #34
print(classifier.pseudocode(depth=4))

## Exploiting Context
from nltk.corpus import brown

def pos_features(sentence,i):
    features={'suffix(1)':sentence[i][-1:],
              'suffix(2)':sentence[i][-2:],
              'suffix(3)':sentence[i][-3:]}
    if i ==0:
        features['prev-word']='<START>'
    else:
        features['prev-word']=sentence[i-1]
    return features

brown.sents()[0]
pos_features(brown.sents()[0],8)

tagged_sents=brown.tagged_sents(categories='news')
featuresets=[]

for tagged_sent in tagged_sents:
    untagged_sent=nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent,i),tag))

size=int(len(featuresets)*0.1)
train_set,test_set=featuresets[size:],featuresets[:size]
classifier=nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier,test_set) # 0.789
    #print("Labels : ", Y)
    count = Counter(Y)
    print(count)

    list_of_sents = []
    list_of_sents_raw = pickle.load(open('listOfSentences.pkl', 'rb'))

    for line in list_of_sents_raw:
        list_of_sents.append(line.split(" "))

    #building word2vec model on brown corpus
    size = 20
    window = 3
    print("Word2Vec parameters - Vector size : ", size, "Window size : ",
          window)
    sentences_brown = brown.sents()
    w2v_model_brown = Word2Vec(sentences_brown,
                               size=size,
                               window=window,
                               min_count=1)
    w2v_model_wv = w2v_model_brown.wv
    del w2v_model_brown
    """
   #load the labeled raw data 
   data = pickle.load(open("actualData.pkl","rb"))

   #create pickle file of features and get the labels for the videos
   Y = np.array(load_data(data))
   #print("Labels : ", Y)
   count = Counter(Y)
   print(count)
Example #36
def exercise1():
    train_data = brown.tagged_sents(categories='news')
    test_data = brown.tagged_sents(categories='lore')
    unigram_tagger_model = nltk.UnigramTagger(train_data)
    print("Evaluate on all of the sentences from the Brown corpus with the category lore : ",unigram_tagger_model.evaluate(test_data))
    print("Evaluate on all of the sentences from the Brown corpus with the category news : ", unigram_tagger_model.evaluate(train_data))
    print("Output of tagger on the 200th sentence of the lore category of the Brown Corpus : ", unigram_tagger_model.tag(brown.sents(categories='lore')[199]))
Example #37
np.random.seed(1000)


def scatter_documents(X):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    ax.scatter(X[:, 0], X[:, 1])
    ax.set_xlabel('t0')
    ax.set_ylabel('t1')
    ax.grid()
    plt.show()


if __name__ == '__main__':
    # Compose a corpus
    sentences = brown.sents(categories=['news', 'fiction'])
    corpus = []

    for s in sentences:
        corpus.append(' '.join(s))

    # Vectorize the corpus
    vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True)
    Xc = vectorizer.fit_transform(corpus).todense()

    # Perform SVD
    U, s, V = svd(Xc, full_matrices=False)

    # Extract a sub-space with rank=2
    rank = 2
Example #38
with open('results.txt', 'a') as resfile:
    resfile.write(
        'pearson correlation in dataset [%s] for Datamuse methods is %f\n' %
        ('STS-131', corr))

# part 6
from gensim.models import Word2Vec
import nltk
nltk.download('brown')

from nltk.corpus import brown

with open('datasets/stss-131.csv', newline='') as csvfile:
    contents = list(csv.reader(csvfile, delimiter=';'))

model_word2vec = Word2Vec(brown.sents(), min_count=8)
sim_cal = np.array(
    sentence_similarity_dataset_model(contents,
                                      model_word2vec.wv)).reshape(-1, )

with open('sentence_similarity.txt', 'a') as simfile:
    simfile.write('Using Word2Vec embedding\n')
    simfile.write('s1; s2; human_sim; method_sim\n\n')
    for i, pair in enumerate(contents):
        simfile.write('%s;%s;%s;%f\n' %
                      (pair[0], pair[1], pair[2], sim_cal[i] * 4))
    simfile.write('\n\n')

sim_ref = np.array(contents)[:, 2].astype(float) / 4.0
corr = pearson_correlation(sim_cal, sim_ref)
Example #39
# -*- coding:utf-8 -*-
"""
    2019/4/2 15:46 by young
"""

import nltk
from nltk.corpus import brown

print(brown.categories())

print('共有{}个句子'.format(len(brown.sents())))
print('共有{}个单词'.format(len(brown.words())))
Example #40
    """
    text = [None, None, None]
    sentence_finished = False

    # generate random  sentences
    while not sentence_finished:
        r = random.random()
        accumulator = .0

        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]
            if accumulator >= r:
                text.append(word)
                break

        if text[-2:] == [None, None]:
            sentence_finished = True

    return ' '.join([t for t in text if t])


if __name__ == '__main__':
    print('Modelling the corpus')
    model = model_trigram(brown.sents())

    print('Assign probabilities.')
    model = model_proabilities(model)

    print('Generating sentences from the model')
    print(generate_sentence(model))
Example #41
def main():
    taggedsents = []
    for f in inputstring:
        s = brown.sents(f)[:]
        for i in s:
            i = nltk.pos_tag(i)
            i.insert(0, ("<s>", "<s>"))
            i.append(("</s>", "</s>"))
            taggedsents.append(i)

    tagbigrams = createbigrams(taggedsents)

    taggedwords = []
    uniquewords = []
    words = []
    uniquetags = []
    tags = []
    for sent in taggedsents:
        for i in sent:
            taggedwords.append(i)
            words.append(i[0])
            tags.append(i[1])
            if i[0] not in uniquewords:
                uniquewords.append(i[0])
            if i[1] not in uniquetags:
                uniquetags.append(i[1])
    words = [i for i in words if i not in ["<s>", "</s>"]]
    uniquewords = [i for i in uniquewords if i not in ["<s>", "</s>"]]

    epmatrix = [[0] * len(uniquewords) for i in range(len(uniquetags))]
    tpmatrix = [[0] * len(uniquetags) for i in range(len(uniquetags))]

    #hmm traning
    for wordi in range(len(uniquewords)):
        for tagi in range(len(uniquetags)):
            epmatrix[tagi][wordi] = taggedwords.count(
                (uniquewords[wordi], uniquetags[tagi])) / tags.count(
                    uniquetags[tagi])

    for tagi in range(len(uniquetags)):
        for t in range(len(uniquetags)):
            tpmatrix[tagi][t] = tagbigrams.count(
                (uniquetags[tagi], uniquetags[t])) / tags.count(
                    uniquetags[tagi])

    #hmm testing
    s = brown.sents(outputstring)[:]
    defaulttaggedsents = []
    for i in s:
        i = nltk.pos_tag(i)
        defaulttaggedsents.append(i)

    hmmtaggedsents = []
    for i in s:
        i = hmm_pos_tag(i, epmatrix, tpmatrix, uniquetags, uniquewords, tags)
        hmmtaggedsents.append(i)

    #testing
    correct = 0
    wrong = 0
    for i in range(len(defaulttaggedsents)):
        for j in range(len(defaulttaggedsents[i])):
            if (defaulttaggedsents[i][j][1] == hmmtaggedsents[i][j][1]):
                correct += 1
            else:
                wrong += 1

    print("Correct tags: " + str(correct))
    print("Wrong tags: " + str(wrong))
    print("Accuracy of hmm pos tagger: " + str(correct / (correct + wrong)))
Example #42
    '10-24-40s_706posts.xml', '10-26-teens_706posts.xml',
    '11-06-adults_706posts.xml', '11-08-20s_705posts.xml',
    '11-08-40s_706posts.xml', '11-08-adults_705posts.xml',
    '11-08-teens_706posts.xml', '11-09-20s_706posts.xml',
    '11-09-40s_706posts.xml', '11-09-adults_706posts.xml',
    '11-09-teens_706posts.xml'
])

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurrence
i = 0
for sentence in brown.sents(categories=[
        'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
        'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion',
        'reviews', 'romance', 'science_fiction'
]):
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
for sentence in posts:
    for w1, w2, w3 in trigrams(sentence.text, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
# default test case
sentence = "find the nearest medical shop to center of arizona"
Example #43
File: ch2.py Project: juri-220/Python-NLP
#webtext in nltk.corpus
import nltk
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:2])

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

#brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='editorial')
brown.words(fileids=['cp12'])
brown.sents(categories=['news','editorial'])

edi_text = brown.words(categories='fiction')
fdist=nltk.FreqDist([w.lower() for w in edi_text])
modals=['what','who','where','when','why']
for m in modals:
    print (m + ':', fdist[m])

#Reuters corpus
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
reuters.words('training/9947')[:14]
reuters.words(categories=['sorghum','rye'])

#inaugural address
Example #44
def get_word2vec():
    model = gensim.models.Word2Vec(brown.sents())
    return model
Example #45
def ARI(cat):
    words = brown.words(categories=cat)
    sents = brown.sents(categories=cat)
    mw = sum(len(w) for w in words) / len(words)
    ms = sum(len(s) for s in sents) / len(sents)
    return 4.71 * mw + 0.5 * ms - 21.43
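An illustrative comparison of a few standard Brown categories using the function above (not part of the original example):

for cat in ['news', 'romance', 'learned']:
    print(cat, round(ARI(cat), 2))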
Example #46
import nltk

# Import the Brown corpus
from nltk.corpus import brown

# Show the categories available in the Brown corpus
print("Total categories in the Brown corpus: {}".format(len(
    brown.categories())))
print("- {}".format("\n- ".join(brown.categories())))

# Tokenized sentences
brown.sents(categories="mystery")

# POS tagged sentences
print("\nPOS Tagged sentences: \n{}".format(
    brown.tagged_sents(categories="learned")))

# Get the nouns from the tagged words. Nouns are tagged as NN or NP.
# Note the use of a generator within the any function, thus the values
# evaluated by any are generated as it iterates over the words and short-circuits
# as it sees the first True value. (Some words have several tags. e.g. NN-HL!)
tagged_words = brown.tagged_words(categories="science_fiction")
nouns = [(word, tag) for word, tag in tagged_words
         if any(noun_tag in tag for noun_tag in ['NN', 'NP'])]

print("\nNouns: {}\n- {}".format(
    len(nouns), "\n- ".join(
        (wt_pair[0] + ": " + wt_pair[1]) for wt_pair in nouns[0:20])))

# Build frequency distribution for nouns. (Note that using a generator instead of a comprehension
# should have a positive effect in performance/memory footprint)
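A sketch of the frequency distribution hinted at in the last comment; this is illustrative and not the original continuation of the snippet:

nouns_freq = nltk.FreqDist(word for word, tag in nouns)
print(nouns_freq.most_common(10))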
Example #47
import nltk
from nltk.corpus import brown
from nltk import word_tokenize
import pylab
# setting up data
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# The default tagger

tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
# most frequent
print nltk.FreqDist(tags).max()

# tagger that tags everything as NN (Noun)
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
print default_tagger.tag(tokens)

# this will perform poorly on a corpus
print default_tagger.evaluate(brown_tagged_sents)

# The regular expression Tagger

# these are processed in order the first one to match applies
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
Example #48
filebase = "/home/fnielsen/"


def word_feats(words):
    return dict([(word, True) for word in words])


def sents2words(sents):
    return [
        set(map(lambda w: w.lower(), pattern_word.findall(j))) for j in sents
    ]


pattern_word = re.compile('[^\W\d_]+', re.UNICODE)

news_sents = map(lambda words: " ".join(words), brown.sents(categories='news'))

categories = [
    'reviews', 'religion', 'hobbies', 'lore', 'belles_lettres', 'government',
    'learned', 'fiction', 'mystery', 'science_fiction', 'adventure', 'romance',
    'humor'
]

other_feats = []
others_feats = []
other_sents = []
for category in categories:
    sents = map(lambda words: " ".join(words),
                brown.sents(categories=category))
    other_sents.append(sents)
    words = sents2words(sents)
Example #49
File: 5.4.py Project: XU-ZHOU/PythonNLP
#-*-coding:utf-8 -*-

#自动标注
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

#默认标注器
import nltk
from nltk.corpus import brown
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())

raw = 'I do not like gree eggs and ham,I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.tag(tokens))
print(default_tagger.evaluate(brown_tagged_sents))

#正则表达式标注器
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown_sents[3]))

#查询标注器
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = [w for (w, _) in fd.most_common(100)]  # fd.keys() is not sliceable in Python 3
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
Example #50
# File for getting the Brown corpus into just stings of text, one sentence
# per line, via NLTK
# Brown corpus = 1161192 words in 57340 sentences
# Outputs separate files with ~5000 sentences each

# Tested w/ Python 3.7 and NLTK 3.7.3

from nltk.corpus import brown
import os
import time

filebase = '/Users/garrettsmith/Google Drive/UniPotsdam/Research/Features/GenEmbeddings/BrownCorpus/'

start_time = time.time()
fileno = 0
for i, sent in enumerate(brown.sents()):
    if (i > 0) and (i % 5000 == 0):
        fileno += 1
        print('{} sentences processed\r'.format(i), end='')
    file = filebase + 'brown' + str(fileno) + '.txt'
    # Open the output file in append mode if it already exists
    if os.path.exists(file):
        mode = 'a'
    else:
        mode = 'w'
    with open(file, mode) as f:
        sent = ' '.join(sent) + '\n'
        f.write(sent)


print('Elapsed time: {} seconds'.format(time.time() - start_time))
Example #51
 print classifier.pseudocode(depth=4)
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print "E X A M P L E 5: Exploiting Context"
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print "Exploiting Context. " \
       " contextual features often provide powerful clues about " \
       "the correct tag—for example, when tagging the word fly, " \
       "knowing that the previous word is a will allow us to " \
       "determine that it is functioning as a noun, not a verb." \
       "In order to accommodate features that depend on a word’s " \
       "context, we must revise the pattern that we used to define " \
       "our feature extractor. Instead of just passing in the word " \
       "to be tagged, we will pass in a complete (untagged) sentence, " \
       "along with the index of the target word."
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print pos_features2(brown.sents()[0], 6)
 print pos_features2(brown.sents()[0], 7)
 print pos_features2(brown.sents()[0], 8)
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print "Now that we’ve defined our feature extractor, we can use it " \
       "to generate a features set from UNTAGGED sentences in corpus " \
       "Data structure of brown.tagged_sents():" \
       "[ [(sentence1-token1,POS-tag), (sentence1-tokens2,POS-tag),...]" \
       "  [(sentence2-token1,POS-tag), (sentence2-tokens2,POS-tag),...]" \
       "  ... ]"
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 tagged_sents = brown.tagged_sents(categories='news')[0:5]
 print "length of tagged sentence set: ", len(tagged_sents)
 print tagged_sents[:3]
 featuresets = []
 for tagged_sent in tagged_sents:
Example #52
            x=(words_te[i],words_te[i+1],words_te[i+2])
            if(Interpolated_Kneser_Ney_dict.get(x,"empty")=="empty"):
                if((x[0],x[1]) not in bgcounter):
                    Interpolated_Kneser_Ney_dict[x]=findPKn_bigram((x[1],x[2]))

                else:
                    Interpolated_Kneser_Ney_dict[x]=findPKn_trigram(x,discount_final)

            perp=perp*((1/Interpolated_Kneser_Ney_dict[x])**(1/N))
    return perp


# In[30]:

text_gutenberg=list(gutenberg.sents())
text_brown=list(brown.sents())
text_gutenberg_size=len(text_gutenberg)
text_brown_size=len(text_brown)
for i in range(text_gutenberg_size):
    text_gutenberg[i].insert(0,"<s>")
    text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>')
    text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>')
for i in range(text_brown_size):
    text_brown[i].insert(0,"<s>")
    text_brown[i].insert(len(text_brown[i]),'<e>')
    text_brown[i].insert(len(text_brown[i]),'<e>')
text_gutenberg_tr,text_gutenberg_te=train_test_split(text_gutenberg,test_size=.20,random_state=4)
text_brown_tr,text_brown_te=train_test_split(text_brown,test_size=.20,random_state=4)

Example #53
plt.close()
tkinter.Tk().withdraw()
in_path = filedialog.askopenfilename()

tkinter.Tk().withdraw()
out_path = filedialog.asksaveasfilename()

try:

    bigram_model = json.load(open("Spelling_Correction/bigrams.txt"))

except IOError:

    bigram_model = {}

    for sentence in brown.sents():

        for w1, w2 in bigrams(sentence):

            if w1 in bigram_model:

                if w2 in bigram_model[w1].keys():

                    bigram_model[w1][w2] = bigram_model[w1][w2] + 1

                else:

                    bigram_model[w1][w2] = 1

            else:
Example #54
    s = ''
    for i in range(len(text)):
        s += text[i] + ' '

    if (len(text) != 10):
        return False, s
    else:
        return True, s


if __name__ == '__main__':
    time.clock()

    print()

    brown_corpus = list(brown.sents(brown.fileids()))
    for i in range(len(brown_corpus)):
        brown_corpus[i] = list(map(lambda x: x.lower(), brown_corpus[i]))
    gutenberg_corpus = list(gutenberg.sents(gutenberg.fileids()))
    for i in range(len(gutenberg_corpus)):
        gutenberg_corpus[i] = list(
            map(lambda x: x.lower(), gutenberg_corpus[i]))
    combined_corpus = brown_corpus + gutenberg_corpus

    unigram_list, bigram_list = training(combined_corpus)
    i = 0
    while (i < 1):
        bool, s = generate_trigram_token(bigram_list)
        if (bool):
            i += 1
            print(s)
Example #55
import gensim
import logging
from nltk.corpus import brown

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = brown.sents()
model = gensim.models.Word2Vec(sentences, min_count=1)
model.save('brown_model')
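Loading the saved model back and querying it, as an illustrative follow-up (the query word is arbitrary; in gensim 4+ lookups go through model.wv):

model = gensim.models.Word2Vec.load('brown_model')
print(model.wv.most_similar('government', topn=5))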
Example #56
                         sg=0)
print svk.syn0[0]
dsk = Doc2VecKeras(dm_concat=1)
dsk.train_with_word2vec_instance(test_docs, svk, learn_words=True, iter=3)
print dsk.syn0[0]

print(dk0.docvecs.most_similar(0))
print(dk.docvecs.most_similar(0))
print(dsk.docvecs.most_similar(0))
print(dklw.docvecs.most_similar(0))

#sys.exit()

from nltk.corpus import brown

brown_sents_sub = list(brown.sents()[:100])
brown_docs_sub = LabeledListSentence(brown_sents_sub)
brown_scorewordsents = list(
    ScoredListSentence(brown_sents_sub, dummy_score_vec_fn))

vck_br = Word2VecKeras(brown_sents_sub, null_word=1, iter=3, sg=0)
vkk_br = Word2VecKeras(brown_sents_sub, null_word=1, iter=3, sg=1)

dg_br = gensim.models.doc2vec.Doc2Vec(brown_docs_sub)
dk0_br = Doc2VecKeras(brown_docs_sub, iter=3)

svk_br = ScoreWord2VecKeras(brown_scorewordsents, null_word=1, iter=3, sg=0)

dk_br = Doc2VecKeras(dm_concat=1)
dk_br.train_with_word2vec_instance(brown_docs_sub,
                                   vck_br,
Example #57
File: baseline.py Project: xzwj/sos
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        #stemmer = SnowballStemmer('english')
        #stemmed_words = [stemmer.stem(word) for word in text]
        stemmed_words = [
            nltk.PorterStemmer().stem_word(word.lower()) for word in text
        ]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)


#word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
br = Word2Vec(brown.sents())


def get_similar(word):
    if word in br:
        lis = br.most_similar(word, topn=3)
        ret = []
        for one in lis:
            ret.append(one[0])
        return ret
    else:
        return [word]


logger.info('Read data...')
train = pd.read_csv('../../data/tmp/train_sample.csv')
Example #58
    unigramFreq = do_train_uni(train)
    # use the maximum likelihood estimate MLEProbDist to create
    # a probability distribution from the observed frequencies

    unigram = MLEProbDist(unigramFreq)
    bigram = ConditionalProbDist(bigramFreq, MLEProbDist)
    bigram_add_one = ConditionalProbDist(bigramFreq,
                                         LaplaceProbDist,
                                         bins=bigramFreq.__len__())

    if method == 'no_smoothing':
        print "%s:%s:%s" % (method, 'train', compute_perplexity(bigram, train))
        print "%s:%s:%s" % (method, 'test', compute_perplexity(bigram, test))
    elif method == 'interpolation':
        sents = []
        for l in brown.sents(categories=trainsection):
            sents = sents + l
        V = len(sents)
        print "%s:%s:%s" % (method, 'train',
                            compute_perplexity_interp(unigram, bigram, train,
                                                      lambda_vector, V))
        print "%s:%s:%s" % (method, 'test',
                            compute_perplexity_interp(unigram, bigram, test,
                                                      lambda_vector, V))
    elif method == 'add_one':
        print "%s:%s:%s" % (method, 'train',
                            compute_perplexity(bigram_add_one, train))
        print "%s:%s:%s" % (method, 'test',
                            compute_perplexity(bigram_add_one, test))
    elif method == 'interpolation_add_one':
        print "%s:%s:%s" % (method, 'train', compute_perplexity(bigram, train))
Example #59
# Data Science project
# Master 2 AI "Machine Learning for Data Science"

# Import all methods of the DistributionalSemantics class and the NltkBigrams class
from distributional_semantics import DistributionalSemantics as ds
from nltk_bigrams import NltkBigrams as nb
import gensim
from nltk.corpus import brown

# Test of the methods
# Build the bigram corpus
corpus_bigrams = nb()

# Train the gensim models
print('Building the word meaning vectors')
unigram_model = gensim.models.Word2Vec(brown.sents(),min_count = 1,size = 100)
print('Building the bigram meaning vectors')
bigram_model = gensim.models.Word2Vec(corpus_bigrams.bigram_sents,min_count = 1,size = 100)

# Compute the composition matrix for the ADJNOUN relation
W = ds.composition_w(brown.tagged_sents(categories = 'science_fiction', tagset = 'universal'), corpus_bigrams.tagged_sents_bigram, unigram_model, bigram_model, "ADJNOUN")

# Compute the decomposition matrix for the ADJNOUN phrase label
W2 = ds.decomposition_w(brown.tagged_sents(categories = 'science_fiction', tagset = 'universal'), corpus_bigrams.tagged_sents_bigram,bigram_model,unigram_model,"ADJNOUN")

# Compute the decomposition matrix for the ADJNOUN phrase label from the composition computed above
W3 = ds.decomposition_from_composition_w(brown.tagged_sents(categories = 'science_fiction', tagset = 'universal'),unigram_model,W,"ADJNOUN")

# Compose the meaning vectors of "new" and "ones"
P = ds.compose(unigram_model["new"],unigram_model["ones"],W)
Example #60
def get_sentences():
  # returns the 57340 sentences of the Brown corpus
  # each sentence is represented as a list of individual string tokens
  return brown.sents()
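An illustrative call of the helper above (the sentence count comes from the comment in the function):

sentences = get_sentences()
print(len(sentences))          # 57340
print(' '.join(sentences[0]))  # first Brown sentence as plain text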