-
Notifications
You must be signed in to change notification settings - Fork 0
/
textstats.py
executable file
·67 lines (54 loc) · 1.81 KB
/
textstats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/python
import argparse
import random
import re
import string
import unicodedata

import nltk
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from textstat.textstat import textstat

import load as Loader
# Command-line interface: a single positional argument naming the directory
# that holds the scraped data.
parser = argparse.ArgumentParser(
    description='Analyze scraped data.',
)
parser.add_argument(
    'directory',
    metavar='dir',
    help='directory to process',
)
args = parser.parse_args()

# Load the data from the requested directory; the literal 10 is handed to the
# loader as-is (its meaning is defined by load.load_data_partial).
data = Loader.load_data_partial(args.directory, 10)

# NLTK resource download, left disabled:
#nltk.download()
def fic2text(ident):
    """Return the tag names and ASCII-only text for one fic.

    Looks up the 'fic' field in ``data['fics']`` and the 'tags' field in
    ``data['base']`` for ``ident`` via ``Loader.get_field``, builds a single
    text string from the fic segments, flattens the tag names into one list,
    and prints the Flesch reading-ease and Flesch-Kincaid grade of the text.

    Args:
        ident: Identifier passed to ``Loader.get_field`` to select the fic.

    Returns:
        A ``(tags, rtext)`` tuple: ``tags`` is a list of the ``"name"`` value
        of every tag entry, in iteration order; ``rtext`` is the concatenated
        text, each segment followed by one space.
    """
    textsegs = Loader.get_field(data['fics'], ident, 'fic')
    rtags = Loader.get_field(data['base'], ident, 'tags')

    # Per segment: turn non-breaking spaces into plain spaces, then drop any
    # remaining non-ASCII characters. Every segment (including the last) is
    # followed by a single space.
    # NOTE(review): no punctuation padding or whitespace collapsing is applied
    # here; earlier regex passes computed such copies but their results were
    # never used. Confirm whether that normalisation is actually wanted.
    rtext = "".join(
        line.replace(u'\xa0', ' ').encode('ascii', 'ignore').decode('ascii') + " "
        for line in textsegs
    )

    # rtags is iterated as a mapping of group -> list of tag entries; only the
    # "name" of each entry is kept.
    tags = [el["name"] for genre in rtags for el in rtags[genre]]

    reading_ease = textstat.flesch_reading_ease(rtext)
    reading_level = textstat.flesch_kincaid_grade(rtext)
    print(ident, reading_ease, reading_level)
    return tags, rtext
print("==== Loaded. Getting Data.... =====")
ids = Loader.get_primaries(data['fics'])

# Cleaned text and tag-name list for every fic, kept in the same order as ids
# so that row k of the feature matrix matches row k of the label matrix.
vdata = []
vtags = []
for i in ids:
    tags, arr = fic2text(i)
    vdata.append(arr)
    vtags.append(tags)

print("==== TFIDF Vectorize.... =====")
tf_vectorize = TfidfVectorizer(use_idf=True)
tfidf = tf_vectorize.fit_transform(vdata)
idf = tf_vectorize.idf_
# these are the words that we use for classification
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(tf_vectorize, 'get_feature_names_out'):
    feature_names = tf_vectorize.get_feature_names_out()
else:
    feature_names = tf_vectorize.get_feature_names()
vects = dict(zip(feature_names, idf))

print("==== Predict Topics.... =====")
# One indicator column per distinct tag name, one row per fic.
ctags = MultiLabelBinarizer().fit_transform(vtags)
# The target is a multilabel indicator matrix, which MultinomialNB cannot fit
# directly, so one naive Bayes classifier is trained per tag.
clf = OneVsRestClassifier(MultinomialNB()).fit(tfidf, ctags)