Example #1
File: Word2Vec.py Project: isha-goel/MCA
def collect_data(vocabulary_size=10000):
    # combine the ABC "rural" and "science" corpora into a single token list
    v1 = abc.raw("rural.txt").split()
    v2 = abc.raw("science.txt").split()
    vocabulary = v1 + v2
    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    return data, count, dictionary, reverse_dictionary
Example #2
def ari(fileid):
    """Accept text as list of words"""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
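The same formula, ARI = 4.71 * (average characters per word) + 0.5 * (average words per sentence) - 21.43, recurs throughout the examples below. A minimal usage sketch for the function above, assuming nltk and the abc corpus have already been downloaded:

from nltk.corpus import abc

for fileid in abc.fileids():  # 'rural.txt' and 'science.txt'
    print(fileid, ari(fileid))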
Example #3
def Automated_Readability_Index40(section):
    sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenize.tokenize(text))
    words = len(abc.words(section))
    text = " ".join(abc.words(section))
    letters = len(text)
    uw = letters / float(words)
    us = words / float(sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
Example #4
def Automated_Readability_Index40(section):
	sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
	text = abc.raw(section)
	sents = len(sent_tokenize.tokenize(text))
	words = len(abc.words(section))
	text = " ".join(abc.words(section))
	letters = len(text)
	uw = letters / float(words) 
	us = words / float(sents) 
	ari = (4.71 * uw) + (0.5 * us) - 21.43
	return ari
Example #5
def calcARI(file):
    sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    # average words per sentence (len(sentence) alone would count characters)
    for sentence in sents:
        avg_words += len(sentence.split())
    avg_words = avg_words / len(sents)
    # average letters per word
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
Example #6
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))

    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #7
def practice():
    stemmed_tokens = []
    train_tokens = word_tokenize(abc.raw("rural.txt").lower())
    trigrams = list(ngrams(train_tokens, 3))  # n=3 yields trigrams, not bigrams
    POS_tag = nltk.pos_tag(train_tokens)
    print(POS_tag)
    #custom_tokenizer = PunktSentenceTokenizer(train_tokens)
    #word_token = custom_tokenizer.tokenize(sample_tokens)
    ps = PorterStemmer()
    for token in train_tokens:
        stemmed_value = ps.stem(token)
        stemmed_tokens.append(stemmed_value)

    frequencies = Counter(stemmed_tokens)
    stop_words = stopwords.words('english')  # NLTK language names are lowercase

    for word, count in frequencies.most_common(50):
        if word not in stop_words and len(word) > 2:
            #continue

            print(word, count)
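The commented-out tokenizer lines above pass a token list where PunktSentenceTokenizer expects raw training text. A minimal sketch of the usual pattern (an illustration, not the project's code):

custom_tokenizer = PunktSentenceTokenizer(abc.raw("rural.txt"))
sentences = custom_tokenizer.tokenize(abc.raw("science.txt"))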
Example #8
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.corpus import abc

def ari(raw):

    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]

    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof',
                                     'etc'])
    # separate into sentences using a PunktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)

    chars = 0

    for word in words:
        chars += len(word)
    
    return (4.71 * (chars / len(words)) + 0.5 * (len(words) / len(sentences))
            - 21.43)

for fileid in abc.fileids():
    print('%*s %9f' % (max(len(f) for f in abc.fileids()), fileid,
                       ari(abc.raw(fileids=fileid))))
Example #9
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import abc, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import wordnet
from collections import Counter


cor = abc.raw("rural.txt").lower()
cor_abc = abc.raw("rural.txt").lower()
cor_word_tokens = word_tokenize(cor)
#print(cor_word_tokens)
cor_sent_tokens = sent_tokenize(cor)
#print(cor_sent_tokens)

#stop words
stp = stopwords.words("english")
#print(stp)
filtered_sentence = [i for i in cor_word_tokens if i not in stp and len(i)>2]

# for i in cor_word_tokens:
#      if i not in stp:
#          filtered_sentence.append(i)
#print(filtered_sentence)

#stemming
def filteredstem(tokens):
    # print the Porter stem of each token
    ps = PorterStemmer()
    for w in tokens:
        print(ps.stem(w))
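A minimal call using the filtered tokens built above (a usage sketch, not part of the original script):

filteredstem(filtered_sentence)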
Example #10
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
Example #11
File: q1.py Project: vishaal27/MCA-W2020
def get_corpus():
    science = abc.raw('science.txt')
    rural = abc.raw('rural.txt')
    concat = science + '\n' + rural
    return concat
Example #12
import nltk
nltk.download('abc')
nltk.download('punkt')
"""#### Skip gram model is used for making word embeddings."""

from nltk.corpus import abc
from nltk.tokenize import RegexpTokenizer
import torch
from tqdm import tqdm
'''
  The size of the corpus is : 663964
  The Vocabulary size is : 11557
'''

cut_indx = 70000
corp = abc.raw()
wds1 = corp.split()[:cut_indx]
print(len(wds1))
t = 1e-5  # subsampling threshold
# word-frequency counts over the truncated corpus
d = dict()
for i in wds1:
    d[i] = 0
for i in wds1:
    d[i] += 1

wds = list()
for j in wds1:
    if (d[j] >= 5):
        wds.append(j)
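The threshold t defined above is not used in this excerpt; a minimal sketch of the word2vec-style subsampling step it presumably prepares for (an assumption, not the project's code):

import random

n_tokens = len(wds1)
# keep word w with probability sqrt(t / f(w)), the usual word2vec subsampling rule
subsampled = [w for w in wds if random.random() < (t / (d[w] / n_tokens)) ** 0.5]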
Example #13
from nltk.corpus import genesis, gutenberg, abc, subjectivity

bible = genesis.raw('english-kjv.txt')
blake = gutenberg.raw('blake-poems.txt')
bryant = gutenberg.raw('bryant-stories.txt')
burgess = gutenberg.raw('burgess-busterbrown.txt')
carroll = gutenberg.raw('carroll-alice.txt')
ch_ball = gutenberg.raw('chesterton-ball.txt')
ch_brown = gutenberg.raw('chesterton-brown.txt')
ch_thurs = gutenberg.raw('chesterton-thursday.txt')
edge = gutenberg.raw('edgeworth-parents.txt')
mel = gutenberg.raw('melville-moby_dick.txt')
mil = gutenberg.raw('milton-paradise.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
whit = gutenberg.raw('whitman-leaves.txt')
rural = abc.raw('rural.txt')
science = abc.raw('science.txt')
plots = subjectivity.raw('plot.tok.gt9.5000')
quotes = subjectivity.raw('quote.tok.gt9.5000')
austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs
total = austen + shakespeare + facts + opinions + gute + chester + b

spaces = {}
wordlist = []

with open('words.json', 'r') as f:
Example #14
from collections import Counter
import random, math
import itertools
import re
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from nltk.corpus import abc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# creating corpus
corpus = []
for text_id in abc.fileids():
    text = abc.raw(text_id)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub('[^a-zA-Z1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    corpus.append([w for w in text.split() if w != ''])

n_docs = len(corpus)

# subsample frequent words
filtered_corpus = []
word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
total_words = np.sum(list(word_counts.values()))
freq = {word: word_counts[word] / float(total_words) for word in word_counts}
threshold = 1e-5
for doc in corpus:
Example #15
# reference: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from nltk.corpus import abc
def tsne_plot(label, embedding):
    print('Plotting...')
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embedding[:, 0]
    y = embedding[:, 1]
    plt.scatter(x, y, c=colors, alpha=0.2, label=label)
    plt.legend(loc=4)  # legend must follow the labelled scatter call
    plt.savefig(label + '.png')


#   plt.show()

t = 1e-5
x1 = abc.raw()
x1 = re.findall(r"[\w']+", x1)
vocab_to_int = dict()
int_to_vocab = dict()

x2 = set(x1)
x2 = list(x2)
for i in range(len(x2)):
    vocab_to_int[x2[i]] = i
    int_to_vocab[i] = x2[i]

# vocab_to_int, int_to_vocab = utils.create_lookup_tables(x1)
int_words = [vocab_to_int[word] for word in x1]

y = dict()
Example #16
from nltk.corpus import abc
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import pickle as pkl
from sklearn.manifold import TSNE
# %matplotlib inline
import matplotlib.pyplot as plt

torch.manual_seed(1)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
text = abc.raw().lower().split()
text2 = []
for i in text:
    word = ''
    for j in i:
        if j not in string.punctuation:
            word += j
    if word != '':
        text2.append(word)
# text = [''.join(c for c in s if c not in string.punctuation) for s in text]
# text = [s for s in text if s]

text = text2

vocab = set(text)
vocab_size = len(vocab)
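With CONTEXT_SIZE = 2 and the cleaned token list, a CBOW-style model usually builds (context, target) pairs next; a minimal sketch under that assumption (not the project's actual code):

# 2 context words on each side of every target word
data = []
for i in range(CONTEXT_SIZE, len(text) - CONTEXT_SIZE):
    context = text[i - CONTEXT_SIZE:i] + text[i + 1:i + CONTEXT_SIZE + 1]
    data.append((context, text[i]))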
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 21:17:25 2018

@author: vpapg
"""

# Obtain raw texts from two or more genres and compute their respective reading
# difficulty scores as in the earlier exercise on reading difficulty. E.g. compare
# ABC Rural News and ABC Science News (nltk.corpus.abc). Use Punkt to perform
# sentence segmentation.

from nltk.corpus import abc
from nltk import word_tokenize, sent_tokenize

abc_rural = abc.raw("rural.txt")
abc_science = abc.raw("science.txt")


def ARI(raw):
    words = word_tokenize(raw)
    sents = sent_tokenize(
        raw)  # I used different method for sentence segmentation
    mw = sum(len(w) for w in words) / len(words)  # average characters per word
    ms = len(words) / len(sents)                  # average words per sentence
    return 4.71 * mw + 0.5 * ms - 21.43


print(ARI(abc_rural))
print(ARI(abc_science))