def test_read_grouped_real_file(self):
    data = read_grouped_file(TWITTER_FILE, 0, False)
    self.assertEqual(174171, len(data))

    # There are 24 groups of 5 minutes in two hours, but since the
    # very last one falls after the 2h mark it opens a new group,
    # so we get 25.
    data = read_grouped_file(TWITTER_FILE, 300, False)
    self.assertEqual(25, len(data))
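# A minimal, hypothetical sketch of the time-window grouping the test above
# relies on. This is NOT the project's read_grouped_file, just an
# illustration under assumed semantics: timestamps are bucketed into fixed
# `window`-second bins counted from the first timestamp, so data spanning a
# bit more than two hours with window=300 falls into 24 full five-minute
# bins plus one extra bin for the trailing tweets, i.e. 25 groups.
def group_by_window(timestamps, window):
    """Group sorted timestamps (in seconds) into consecutive window-second bins."""
    bins = {}
    t0 = timestamps[0]
    for ts in timestamps:
        bins.setdefault((ts - t0) // window, []).append(ts)
    return [bins[k] for k in sorted(bins)]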
import io
import os


def main(twitter_fpath, out_folder):
    # Write each group of parsed tweets to its own file in out_folder,
    # one whitespace-joined document per line.
    data = read_grouped_file(twitter_fpath)
    for gid, group in enumerate(data):
        with io.open(os.path.join(out_folder, '%d.parsed' % gid), 'w') as outf:
            for document in group:
                outf.write(u'%s\n' % u' '.join(document))
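# Hypothetical command-line entry point, not part of the original snippet:
# a minimal sketch assuming main() is meant to be invoked with the path to
# the Twitter dump and an output folder (argument names taken from the
# function signature above).
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        sys.exit('usage: %s <twitter_fpath> <out_folder>' % sys.argv[0])
    main(sys.argv[1], sys.argv[2])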