예제 #1
0
    def test_read_grouped_real_file(self):
        """Check how many groups come back from the real Twitter file."""
        # Second argument 0: the file is expected to yield 174171 entries.
        ungrouped = read_grouped_file(TWITTER_FILE, 0, False)
        self.assertEqual(174171, len(ungrouped))

        # Two hours hold 24 five-minute groups. The very last record
        # falls after the 2h mark and opens one more group, hence 25.
        five_minute_groups = read_grouped_file(TWITTER_FILE, 300, False)
        self.assertEqual(25, len(five_minute_groups))
예제 #2
0
def main(twitter_fpath, out_folder):
    """Write every group read from ``twitter_fpath`` to its own file.

    The groups are obtained from ``read_grouped_file(twitter_fpath)``.
    The group at enumeration index ``gid`` (starting at 0) is written to
    ``<out_folder>/<gid>.parsed``, one document per line, with the items
    of each document joined by a single space.

    Args:
        twitter_fpath: Path handed unchanged to ``read_grouped_file``.
        out_folder: Directory receiving the ``.parsed`` files. It is not
            created here, so it must already exist.

    Raises:
        IOError / OSError: If an output file cannot be opened or written.
    """
    data = read_grouped_file(twitter_fpath)

    for gid, group in enumerate(data):
        out_fpath = os.path.join(out_folder, '%d.parsed' % gid)
        # Pin the encoding: without it io.open falls back to the locale's
        # preferred encoding, so non-ASCII text could raise
        # UnicodeEncodeError or produce platform-dependent output.
        with io.open(out_fpath, 'w', encoding='utf-8') as outf:
            for document in group:
                # NOTE(review): assumes each document is an iterable of
                # unicode strings -- confirm against read_grouped_file.
                outf.write(u'%s\n' % u' '.join(document))