def test_stop_words_full(self):
     """Run ``stop_words`` on the full 2011-1-19 sample and save the output.

     Reads ``2011-1-19raw.txt`` from ``base_resources``, decodes it as
     UTF-8, passes it to ``stop_words`` and writes the joined result,
     UTF-8 encoded, to ``2011-1-19stopped`` under ``target_out``.

     No assertions are made; the test passes as long as nothing raises.
     """
     source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
     with open(source_path, 'r') as raw_file:
         decoded = raw_file.read().decode('utf-8')
         filtered = stop_words(decoded)
         target_path = '{}{}'.format(target_out, '2011-1-19stopped')
         with open(target_path, 'w') as out_file:
             # Encode explicitly before writing to the output file.
             joined = ''.join(filtered)
             out_file.write(joined.encode('utf8'))
 def test_stop_words(self):
     """Check ``stop_words`` against the ``lebowskiIpsum`` fixture.

     The result must split into more than 50 whitespace-separated tokens
     and must not be equal to the decoded input text.
     """
     fixture_path = '{}{}'.format(base_resources, 'lebowskiIpsum')
     with open(fixture_path, 'r') as fixture:
         original = fixture.read().decode('utf-8')
         filtered = stop_words(original)
         token_total = len(filtered.split())
         self.assertGreater(token_total, 50)
         self.assertNotEqual(original, filtered)
    def test_freq_dist(self):
        """Check ``freq_dist_count`` against the ``lebowskiIpsum`` fixture.

        The fixture text is decoded as UTF-8, passed through ``stop_words``
        and split on whitespace; the value returned by ``freq_dist_count``
        for those tokens must have a length greater than 50.
        """
        fixture_path = '{}{}'.format(base_resources, 'lebowskiIpsum')
        with open(fixture_path, 'r') as fixture:
            original = fixture.read().decode('utf-8')
            tokens = stop_words(original).split()
            counts = freq_dist_count(tokens)
            self.assertGreater(len(counts), 50)
    def test_freq_dist_dict_full(self):
        """Check the u'year' entry of ``freq_dist_dict`` on the full sample.

        Two passes are made over ``2011-1-19raw.txt``:

        1. With the raw decoded text, the value for u'year' must be
           strictly between 8 and 12.
        2. After ``remove_punctuation``, the value for u'year' must be
           greater than 16.

        The second distribution is pretty-printed to
        ``2011-1-19freq_dist_dict`` under ``target_out``.
        """
        source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
        with open(source_path, 'r') as raw_file:
            raw_text = raw_file.read().decode('utf-8')

            # Pass 1: punctuation left in place.
            raw_tokens = stop_words(raw_text).split()
            raw_counts = freq_dist_dict(raw_tokens)
            self.assertGreater(raw_counts[u'year'], 8)
            self.assertLess(raw_counts[u'year'], 12)

            # Pass 2: punctuation removed before stop-word handling.
            clean_text = remove_punctuation(raw_text)
            clean_tokens = stop_words(clean_text).split()
            clean_counts = freq_dist_dict(clean_tokens)
            self.assertGreater(clean_counts[u'year'], 16)

            target_path = '{}{}'.format(target_out, '2011-1-19freq_dist_dict')
            with open(target_path, 'w') as out_file:
                out_file.write(pformat(clean_counts))
 def test_freq_dist_full(self):
     """Run ``freq_dist_count`` on the full 2011-1-19 sample and save it.

     Reads ``2011-1-19raw.txt`` from ``base_resources``, decodes it as
     UTF-8 and verifies the decoded value is a ``unicode`` object.  The
     text is then passed through ``stop_words``, split on whitespace and
     handed to ``freq_dist_count``; the pretty-printed result is written
     to ``2011-1-19freq_dist_count`` under ``target_out``.
     """
     source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
     with open(source_path, 'r') as f:
         text = f.read().decode('utf-8')
         # Use the TestCase assertion instead of a bare ``assert``: bare
         # asserts are removed under ``python -O`` and give no diagnostic
         # message on failure.
         self.assertIsInstance(text, unicode)
         stopped = stop_words(text)
         freq_dist = freq_dist_count(stopped.split())
         target_path = '{}{}'.format(target_out, '2011-1-19freq_dist_count')
         with open(target_path, 'w') as out_file:
             out_file.write(pformat(freq_dist))
 def test_token_index_full(self):
     """Run ``token_index`` on the full 2011-1-19 sample and save it.

     This method was previously also named ``test_freq_dist_dict_full``.
     Two methods with the same name in one class body cannot coexist (the
     later definition replaces the earlier one), so this one is named
     after what it actually exercises: ``token_index``.

     Reads ``2011-1-19raw.txt`` from ``base_resources``, decodes it as
     UTF-8, applies ``remove_punctuation`` and ``stop_words``, and passes
     the result to ``token_index``.  The pretty-printed index is written
     to ``2011-1-19token_index`` under ``target_out``.  No assertions are
     made; the test passes as long as nothing raises.
     """
     source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
     with open(source_path, 'r') as f:
         text = f.read().decode('utf-8')
         text = remove_punctuation(text)
         stopped = stop_words(text)
         ti = token_index(stopped)
         target_path = '{}{}'.format(target_out, '2011-1-19token_index')
         with open(target_path, 'w') as out_file:
             out_file.write(pformat(ti))