def setUp(self):
     """Build the object stored on ``self.mapper`` from four colour labels.

     The label list is passed to ``mapper`` unchanged; it contains one entry
     with a trailing newline and one entry with mixed upper/lower case.
     """
     colour_labels = ['red\n', 'YelLOw', 'green', 'blue']
     self.mapper = mapper(colour_labels)
 def setUp(self):
     """Create ``self.mapper`` from a fixed list of four colour labels.

     One label carries a trailing newline and one uses mixed case; the list
     is handed to ``mapper`` exactly as written.
     """
     colour_labels = [
         'red\n',
         'YelLOw',
         'green',
         'blue',
     ]
     self.mapper = mapper(colour_labels)
# Script fragment: prepares CSV writers, a subject mapping and per-field word
# counters from the training sample in s['train'].
# NOTE: uses the print *statement*, so this code targets Python 2.

# CSV writers for the train/test target files. The underlying file objects
# are not bound to a name, so they cannot be closed explicitly from here.
ytrain = csv.writer(open('data/ytrain.csv','w')) 
ytest = csv.writer(open('data/ytest.csv','w')) 

# 1% and 99% of the number of training documents (round() returns a float on
# Python 2). Both values are forwarded to words.counter() below.
# NOTE(review): presumably document-frequency cut-offs -- confirm in words.counter.
mindocs = round(0.01*len(s['train']))
maxdocs = round(0.99*len(s['train']))

# Create the subject mapping.
print datetime.now(), 'creating subject mapping'
# Collect the 'subject2_hierarchy' value of every training document.
subjects = [f['subject2_hierarchy'] for f in s['train']]
# Keep only the part before the first '/' of each subject entry, per document.
subjects = [[sub.split('/')[0] for sub in f] for f in subjects]
# Reduce each document to a single label: the first entry in sorted order.
# Raises IndexError for a document whose subject list is empty.
subjects = [ sorted(sub)[0] for sub in subjects]

# Build the mapper from the per-document labels.
# NOTE(review): subjectFile looks like a path the mapper reads or writes --
# confirm in words.mapper.
mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# Set up the word counters: one words.counter per name in text_fields, keyed
# by that field name.
wordcounters = {} 
for textfield in text_fields:
    print datetime.now(), 'creating dictionary for %s' % (textfield)

    # Feed the counter this field's value from every training document; the
    # dictionary file name embeds the field name.
    wordcounters[textfield] = words.counter(
        [f[textfield] for f in s['train']],
        mindocs=mindocs, maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# Process the sample and write vectors.
print datetime.now(), 'converting texts to vectors and storing to csv'
for doc in s['train']:
    # Same label derivation as used for `subjects` above: split each subject
    # entry on '/', keep the first component, take the first in sorted order.
    subject = sorted([sub.split('/')[0] for sub in doc['subject2_hierarchy']])[0]
예제 #4
0
# Script fragment: prepares CSV writers, a subject mapping and per-field word
# counters from the training sample in s['train'].
# NOTE: uses the print *statement*, so this code targets Python 2.

# CSV writers for the train/test target files. The underlying file objects
# are not bound to a name, so they cannot be closed explicitly from here.
ytrain = csv.writer(open('data/ytrain.csv', 'w'))
ytest = csv.writer(open('data/ytest.csv', 'w'))

# 1% and 99% of the number of training documents (round() returns a float on
# Python 2). Both values are forwarded to words.counter() below.
# NOTE(review): presumably document-frequency cut-offs -- confirm in words.counter.
mindocs = round(0.01 * len(s['train']))
maxdocs = round(0.99 * len(s['train']))

# Create the subject mapping.
print datetime.now(), 'creating subject mapping'
# Collect the 'subject2_hierarchy' value of every training document.
subjects = [f['subject2_hierarchy'] for f in s['train']]
# Keep only the part before the first '/' of each subject entry, per document.
subjects = [[sub.split('/')[0] for sub in f] for f in subjects]
# Reduce each document to a single label: the first entry in sorted order.
# Raises IndexError for a document whose subject list is empty.
subjects = [sorted(sub)[0] for sub in subjects]

# Build the mapper from the per-document labels.
# NOTE(review): subjectFile looks like a path the mapper reads or writes --
# confirm in words.mapper.
mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# Set up the word counters: one words.counter per name in text_fields, keyed
# by that field name.
wordcounters = {}
for textfield in text_fields:
    print datetime.now(), 'creating dictionary for %s' % (textfield)

    # Feed the counter this field's value from every training document; the
    # dictionary file name embeds the field name.
    wordcounters[textfield] = words.counter(
        [f[textfield] for f in s['train']],
        mindocs=mindocs,
        maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# Process the sample and write vectors.
print datetime.now(), 'converting texts to vectors and storing to csv'
for doc in s['train']: