Example #1
File: Word2Vec.py Project: isha-goel/MCA
def collect_data(vocabulary_size=10000):
    # combine the ABC "rural" and "science" corpora into a single token list
    v1 = abc.raw("rural.txt").split()
    v2 = abc.raw("science.txt").split()
    vocabulary = v1 + v2
    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    return data, count, dictionary, reverse_dictionary
Example #2
def ari(fileid):
    """Accept text as list of words"""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
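The same formula, ARI = 4.71 * (average characters per word) + 0.5 * (average words per sentence) - 21.43, recurs throughout the examples below. A minimal usage sketch for the function above, assuming nltk and the abc corpus have already been downloaded:

from nltk.corpus import abc

for fileid in abc.fileids():  # 'rural.txt' and 'science.txt'
    print(fileid, ari(fileid))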
Example #3
def Automated_Readability_Index40(section):
    sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenize.tokenize(text))
    words = len(abc.words(section))
    text = " ".join(abc.words(section))
    letters = len(text)
    uw = letters / float(words)
    us = words / float(sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
Example #4
def Automated_Readability_Index40(section):
	sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
	text = abc.raw(section)
	sents = len(sent_tokenize.tokenize(text))
	words = len(abc.words(section))
	text = " ".join(abc.words(section))
	letters = len(text)
	uw = letters / float(words) 
	us = words / float(sents) 
	ari = (4.71 * uw) + (0.5 * us) - 21.43
	return ari
Example #5
def calcARI(file):
    sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    # average words per sentence (len(sentence) alone would count characters)
    for sentence in sents:
        avg_words += len(sentence.split())
    avg_words = avg_words / len(sents)
    # average letters per word
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
Example #6
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))

    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #7
def practice():
    stemmed_tokens = []
    train_tokens = word_tokenize(abc.raw("rural.txt").lower())
    trigrams = list(ngrams(train_tokens, 3))  # n=3 yields trigrams, not bigrams
    POS_tag = nltk.pos_tag(train_tokens)
    print(POS_tag)
    #custom_tokenizer = PunktSentenceTokenizer(train_tokens)
    #word_token = custom_tokenizer.tokenize(sample_tokens)
    ps = PorterStemmer()
    for token in train_tokens:
        stemmed_value = ps.stem(token)
        stemmed_tokens.append(stemmed_value)

    frequencies = Counter(stemmed_tokens)
    stop_words = stopwords.words('english')  # NLTK language names are lowercase

    for word, count in frequencies.most_common(50):
        if word not in stop_words and len(word) > 2:
            #continue

            print(word, count)
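The commented-out tokenizer lines above pass a token list where PunktSentenceTokenizer expects raw training text. A minimal sketch of the usual pattern (an illustration, not the project's code):

custom_tokenizer = PunktSentenceTokenizer(abc.raw("rural.txt"))
sentences = custom_tokenizer.tokenize(abc.raw("science.txt"))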
Example #8
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.corpus import abc

def ari(raw):

    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]

    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof',
                                     'etc'])
    # separate into sentences using a PunktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)

    chars = 0

    for word in words:
        chars += len(word)
    
    return (4.71 * (chars / len(words)) + 0.5 * (len(words) / len(sentences))
            - 21.43)

for fileid in abc.fileids():
    print('%*s %9f' % (max(len(f) for f in abc.fileids()), fileid,
                       ari(abc.raw(fileids=fileid))))
Example #9
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import abc, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import wordnet
from collections import Counter


cor = abc.raw("rural.txt").lower()
cor_abc = abc.raw("rural.txt").lower()
cor_word_tokens = word_tokenize(cor)
#print(cor_word_tokens)
cor_sent_tokens = sent_tokenize(cor)
#print(cor_sent_tokens)

#stop words
stp = stopwords.words("english")
#print(stp)
filtered_sentence = [i for i in cor_word_tokens if i not in stp and len(i)>2]

# for i in cor_word_tokens:
#      if i not in stp:
#          filtered_sentence.append(i)
#print(filtered_sentence)

#stemming
def filteredstem(tokens):
    # print the Porter stem of each token
    ps = PorterStemmer()
    for w in tokens:
        print(ps.stem(w))
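A minimal call using the filtered tokens built above (a usage sketch, not part of the original script):

filteredstem(filtered_sentence)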
Example #10
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
Example #11
File: q1.py Project: vishaal27/MCA-W2020
def get_corpus():
    science = abc.raw('science.txt')
    rural = abc.raw('rural.txt')
    concat = science + '\n' + rural
    return concat
Example #12
import nltk
nltk.download('abc')
nltk.download('punkt')
"""#### Skip gram model is used for making word embeddings."""

from nltk.corpus import abc
from nltk.tokenize import RegexpTokenizer
import torch
from tqdm import tqdm
'''
  The size of the corpus is : 663964
  The Vocabulary size is : 11557
'''

cut_indx = 70000
corp = abc.raw()
wds1 = corp.split()[:cut_indx]
print(len(wds1))
t = 1e-5  # subsampling threshold
# word-frequency counts over the truncated corpus
d = dict()
for i in wds1:
    d[i] = 0
for i in wds1:
    d[i] += 1

wds = list()
for j in wds1:
    if (d[j] >= 5):
        wds.append(j)
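The threshold t defined above is not used in this excerpt; a minimal sketch of the word2vec-style subsampling step it presumably prepares for (an assumption, not the project's code):

import random

n_tokens = len(wds1)
# keep word w with probability sqrt(t / f(w)), the usual word2vec subsampling rule
subsampled = [w for w in wds if random.random() < (t / (d[w] / n_tokens)) ** 0.5]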
Example #13
from nltk.corpus import genesis, gutenberg, abc, subjectivity

bible = genesis.raw('english-kjv.txt')
blake = gutenberg.raw('blake-poems.txt')
bryant = gutenberg.raw('bryant-stories.txt')
burgess = gutenberg.raw('burgess-busterbrown.txt')
carroll = gutenberg.raw('carroll-alice.txt')
ch_ball = gutenberg.raw('chesterton-ball.txt')
ch_brown = gutenberg.raw('chesterton-brown.txt')
ch_thurs = gutenberg.raw('chesterton-thursday.txt')
edge = gutenberg.raw('edgeworth-parents.txt')
mel = gutenberg.raw('melville-moby_dick.txt')
mil = gutenberg.raw('milton-paradise.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
whit = gutenberg.raw('whitman-leaves.txt')
rural = abc.raw('rural.txt')
science = abc.raw('science.txt')
plots = subjectivity.raw('plot.tok.gt9.5000')
quotes = subjectivity.raw('quote.tok.gt9.5000')
austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs
total = austen + shakespeare + facts + opinions + gute + chester + b

spaces = {}
wordlist = []

with open('words.json', 'r') as f:
Example #14
from collections import Counter
import random, math
import itertools
import re
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from nltk.corpus import abc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# creating corpus
corpus = []
for text_id in abc.fileids():
    text = abc.raw(text_id)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub('[^a-zA-Z1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    corpus.append([w for w in text.split() if w != ''])

n_docs = len(corpus)

# subsample frequent words
filtered_corpus = []
word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
total_words = np.sum(list(word_counts.values()))
freq = {word: word_counts[word] / float(total_words) for word in word_counts}
threshold = 1e-5
for doc in corpus:
Example #15
# reference: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from nltk.corpus import abc
def tsne_plot(label, embedding):
    print('Plotting...')
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embedding[:, 0]
    y = embedding[:, 1]
    plt.scatter(x, y, c=colors, alpha=0.2, label=label)
    plt.legend(loc=4)  # legend must follow the labelled scatter call
    plt.savefig(label + '.png')


#   plt.show()

t = 1e-5
x1 = abc.raw()
x1 = re.findall(r"[\w']+", x1)
vocab_to_int = dict()
int_to_vocab = dict()

x2 = set(x1)
x2 = list(x2)
for i in range(len(x2)):
    vocab_to_int[x2[i]] = i
    int_to_vocab[i] = x2[i]

# vocab_to_int, int_to_vocab = utils.create_lookup_tables(x1)
int_words = [vocab_to_int[word] for word in x1]

y = dict()
Example #16
from nltk.corpus import abc
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import pickle as pkl
from sklearn.manifold import TSNE
# %matplotlib inline
import matplotlib.pyplot as plt

torch.manual_seed(1)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
text = abc.raw().lower().split()
text2 = []
for i in text:
    word = ''
    for j in i:
        if j not in string.punctuation:
            word += j
    if word != '':
        text2.append(word)
# text = [''.join(c for c in s if c not in string.punctuation) for s in text]
# text = [s for s in text if s]

text = text2

vocab = set(text)
vocab_size = len(vocab)
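With CONTEXT_SIZE = 2 and the cleaned token list, a CBOW-style model usually builds (context, target) pairs next; a minimal sketch under that assumption (not the project's actual code):

# 2 context words on each side of every target word
data = []
for i in range(CONTEXT_SIZE, len(text) - CONTEXT_SIZE):
    context = text[i - CONTEXT_SIZE:i] + text[i + 1:i + CONTEXT_SIZE + 1]
    data.append((context, text[i]))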
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 21:17:25 2018

@author: vpapg
"""

# Obtain raw texts from two or more genres and compute their respective reading
# difficulty scores as in the earlier exercise on reading difficulty. E.g. compare
# ABC Rural News and ABC Science News (nltk.corpus.abc). Use Punkt to perform
# sentence segmentation.

from nltk.corpus import abc
from nltk import word_tokenize, sent_tokenize

abc_rural = abc.raw("rural.txt")
abc_science = abc.raw("science.txt")


def ARI(raw):
    words = word_tokenize(raw)
    sents = sent_tokenize(
        raw)  # I used different method for sentence segmentation
    mw = sum(len(w) for w in words) / len(words)  # average characters per word
    ms = len(words) / len(sents)                  # average words per sentence
    return 4.71 * mw + 0.5 * ms - 21.43


print(ARI(abc_rural))
print(ARI(abc_science))