def setUp(self):
    """Build two sample texts with hand-computed stemmed-word counts and IDF values.

    Fixture for testing a word ``counter``: ``self.words`` is the expected
    (sorted, stemmed) dictionary, ``self.counts`` the per-text term counts,
    and ``self.idf_values`` the expected inverse document frequencies.
    """
    # NOTE(review): original source had this method collapsed onto one line,
    # where the inline '#' comments cut off the rest of the statement and
    # left self.counts with unclosed brackets. Reconstructed here.
    self.texts = [
        u'I visited S. F and visited the Conservatory of Flowers and saw the older "Wicked" plants!',
        u'There needs to be a third option because getting older or dying aren\'t working for me.',
    ]
    self.words = [
        u'conservatori', u'die', u'flower', u'get', u'need', u'older',
        u'option', u'plant', u'saw', u'third', u'visit', u'wick', u'work',
    ]
    self.counts = [
        [1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 0],  # counts from the first text
        [0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1],  # counts from the second text
    ]
    # IDF = ln(2/1) ~= 0.6931 for words appearing in one of the two docs;
    # ln(2/2) = 0 for a word present in both.
    self.idf_values = [0.6931471805599453 for _ in self.words]
    self.idf_values[5] = 0.0  # only 'older' is in both texts
    # `counter` is presumably the word-counter class under test — defined
    # elsewhere in this test module; verify against the imports.
    self.counter = counter(self.texts)
def setUp(self):
    """Prepare fixture data: two texts plus their expected stems, counts and IDFs.

    The counter under test should derive ``self.words`` as its dictionary,
    ``self.counts`` as the per-document term counts, and ``self.idf_values``
    as the IDF weights over the two-document corpus.
    """
    # NOTE(review): original source was flattened to a single line; the inline
    # '#counts from ...' comments commented out the tail of the statement and
    # broke the bracket nesting. Reconstructed as multi-line code.
    self.texts = [
        u'I visited S. F and visited the Conservatory of Flowers and saw the older "Wicked" plants!',
        u'There needs to be a third option because getting older or dying aren\'t working for me.',
    ]
    self.words = [
        u'conservatori', u'die', u'flower', u'get', u'need', u'older',
        u'option', u'plant', u'saw', u'third', u'visit', u'wick', u'work',
    ]
    self.counts = [
        [1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 0],  # counts from the first text
        [0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1],  # counts from the second text
    ]
    # ln(2) for a term in exactly one of the two docs; 0 for a term in both.
    self.idf_values = [0.6931471805599453] * len(self.words)
    self.idf_values[5] = 0.0  # only 'older' is in both texts
    # `counter` comes from elsewhere in this module — presumably the class
    # under test; confirm against the module's imports.
    self.counter = counter(self.texts)
# NOTE(review): original source had this whole chunk flattened onto one line
# beginning with '#', so every statement was dead (commented out).
# Reconstructed as the evidently intended multi-line script.

# get the subjects
subjects = [f['subject2_hierarchy'] for f in s['train']]
# take the top-level element of each subject for each doc
subjects = [[sub.split('/')[0] for sub in f] for f in subjects]
# sort and take the first one (deterministic choice per document)
subjects = [sorted(sub)[0] for sub in subjects]
mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# setup word counters, one dictionary per text field
wordcounters = {}
for textfield in text_fields:
    # print(...) with a single pre-formatted string behaves identically under
    # Python 2 (statement) and Python 3 (function) — same output as the
    # original comma-separated py2 print.
    print('%s creating dictionary for %s' % (datetime.now(), textfield))
    wordcounters[textfield] = words.counter(
        [f[textfield] for f in s['train']],
        mindocs=mindocs, maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# process the sample and write vectors
print('%s converting texts to vectors and storing to csv' % (datetime.now(),))
for doc in s['train']:
    subject = sorted([sub.split('/')[0] for sub in doc['subject2_hierarchy']])[0]
    x = []
    for textfield in text_fields:
        # concatenate the tf-idf vectors of all text fields into one row
        x += wordcounters[textfield].tfidf_vector(doc[textfield])
    train.writerow(x)
    ytrain.writerow(mapper.vector(subject))
for doc in s['test']:
    subject = sorted([sub.split('/')[0] for sub in doc['subject2_hierarchy']])[0]
    x = []
    # NOTE(review): source chunk is truncated here; the test-loop body
    # presumably continues like the train loop (fill x, write rows) — confirm
    # against the rest of the file.
# NOTE(review): original source had this chunk collapsed onto one line, so
# everything after the first inline '#' comment was dead code. Reconstructed
# as the evidently intended multi-line script.

subjects = [f['subject2_hierarchy'] for f in s['train']]
# take the top-level element of each subject for each doc
subjects = [[sub.split('/')[0] for sub in f] for f in subjects]
# sort and take the first one (deterministic choice per document)
subjects = [sorted(sub)[0] for sub in subjects]
mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# setup word counters, one dictionary per text field
wordcounters = {}
for textfield in text_fields:
    # single pre-formatted string so print works the same under py2 and py3,
    # with output identical to the original comma-separated py2 print.
    print('%s creating dictionary for %s' % (datetime.now(), textfield))
    wordcounters[textfield] = words.counter(
        [f[textfield] for f in s['train']],
        mindocs=mindocs, maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# process the sample and write vectors
print('%s converting texts to vectors and storing to csv' % (datetime.now(),))
for doc in s['train']:
    subject = sorted([sub.split('/')[0] for sub in doc['subject2_hierarchy']])[0]
    x = []
    for textfield in text_fields:
        # concatenate per-field tf-idf vectors into a single feature row
        x += wordcounters[textfield].tfidf_vector(doc[textfield])
    train.writerow(x)
    ytrain.writerow(mapper.vector(subject))
for doc in s['test']:
    # NOTE(review): chunk is truncated at this loop header in the source
    # view; `pass` keeps the reconstruction syntactically valid. The real
    # body presumably mirrors the train loop — confirm against the file.
    pass