Exemplo n.º 1
0
import string
import argparse
import random
import unidecode
import re

import load as Loader

## determine which genres are most common, least common
# Second argument handed to Loader.load_data_partial below; presumably caps
# how many fics are loaded -- confirm against the loader.
NUM_FICS = 50

# CLI: a single required positional argument, shown as "dir" in usage text.
parser = argparse.ArgumentParser(description="Analyze scraped data.")
parser.add_argument(
    "directory",
    metavar="dir",
    help="directory to process",
)

# Parse sys.argv first, then load from the directory the user supplied.
args = parser.parse_args()
data = Loader.load_data_partial(args.directory, NUM_FICS)


## determine which genres are correlated


## determine the most popular genres
Exemplo n.º 2
0
import re
import load as Loader 
import unicodedata
import nltk
from textstat.textstat import textstat

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import * 


# This script's own import list does not bring in argparse even though the
# parser below needs it; import it here so the script does not depend on some
# other code having imported it first.
import argparse

# CLI: a single required positional argument, shown as "dir" in usage text.
parser = argparse.ArgumentParser(description='Analyze scraped data.')
parser.add_argument('directory', metavar='dir',
                    help='directory to process')

# Parse sys.argv, then load from the supplied directory.  The literal 10 is
# the second argument to load_data_partial; presumably a cap on the number of
# fics loaded -- confirm against the loader.
args = parser.parse_args()
data = Loader.load_data_partial(args.directory, 10)

#nltk.download()
def fic2text(ident):
   textsegs = Loader.get_field(data['fics'],ident,'fic') 
   rtags = Loader.get_field(data['base'],ident,'tags')
   rtext = ""

   for line in textsegs:
      line = line.replace(u'\xa0',' ')
      s = re.sub('([.,!?()])', r' \1 ', line)
      s = re.sub('\s{2,}', ' ', line)
      line = line.encode('ascii', 'ignore').decode('ascii')
      rtext += line+" "

   tags = []