# Script: load a sample of scraped fics for genre analysis.
#
# Usage: pass the directory holding the scraped data as the single
# positional argument.

import argparse
import random
import re
import string

import unidecode

import load as Loader

## determine which genres are most common, least common

# Passed as the second argument to Loader.load_data_partial below.
NUM_FICS = 50

# Command line: one required positional argument naming the data directory.
parser = argparse.ArgumentParser(
    description="Analyze scraped data.",
)
parser.add_argument(
    "directory",
    metavar="dir",
    help="directory to process",
)
args = parser.parse_args()

# Load the data set from the requested directory.
data = Loader.load_data_partial(
    args.directory,
    NUM_FICS,
)

## determine which genres are correlated

## determine the most popular genres
# NOTE(review): argparse is used below but is not imported in this import
# group; it relies on the `import argparse` earlier in the file.
import re
import load as Loader
import unicodedata
import nltk
from textstat.textstat import textstat
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import *

# Command line: one required positional argument naming the data directory.
parser = argparse.ArgumentParser(description='Analyze scraped data.')
parser.add_argument('directory', metavar='dir', help='directory to process')
args = parser.parse_args()

# Load data from the given directory; 10 is passed as the second argument
# of Loader.load_data_partial (presumably the number of fics to load --
# TODO confirm against the loader).
data = Loader.load_data_partial(args.directory,10)

#nltk.download()


# Gathers the 'fic' text segments and the 'tags' field stored for `ident`
# (via Loader.get_field on the module-level `data`) and concatenates the
# segments into a single ASCII-only string.
def fic2text(ident):
    # Text segments of this fic, looked up in data['fics'].
    textsegs = Loader.get_field(data['fics'],ident,'fic')
    # Tags stored for this fic, looked up in data['base'].
    rtags = Loader.get_field(data['base'],ident,'tags')
    rtext = ""
    for line in textsegs:
        # Replace non-breaking spaces (U+00A0) with ordinary spaces.
        line = line.replace(u'\xa0',' ')
        # NOTE(review): both substitutions read from `line`, so the second
        # assignment overwrites the first, and `s` is not used by the
        # remaining statements of this loop -- confirm whether `line` was
        # meant to be updated instead.
        # NOTE(review): '\s{2,}' is not a raw string; newer Python versions
        # warn about the invalid escape sequence.
        s = re.sub('([.,!?()])', r' \1 ', line)
        s = re.sub('\s{2,}', ' ', line)
        # Drop every character that cannot be represented in ASCII.
        line = line.encode('ascii', 'ignore').decode('ascii')
        # Append the cleaned segment followed by a single space separator.
        rtext += line+" "
    tags = []