import itertools
import json
import operator
import os
import re
import sys

# Accepts exactly ``RC_20`` + two digits + ``-`` + two digits (ten characters
# in total), e.g. ``RC_2015-01``.
_COMMENT_FILE_RE = re.compile(r'RC_20\d\d-\d\d\Z')

# Sort/group key for ``(subreddit, line)`` pairs.
_BY_SUBREDDIT = operator.itemgetter(0)


def _progress(message):
    """Write *message* to stdout as-is and flush so it shows up immediately."""
    sys.stdout.write(message)
    sys.stdout.flush()


def get_comment_files_in_folder(folder):
    """Return the paths of all comment dump files found under *folder*.

    The directory tree is walked recursively with ``os.walk``.  A file is
    kept when its whole name is ``RC_20`` followed by two digits, a dash and
    two more digits.

    Args:
        folder: directory to search.

    Returns:
        A list of paths (``dirpath`` joined with the file name), in the
        order ``os.walk`` yields them.
    """
    files = []
    for dirpath, _dirnames, filenames in os.walk(folder):
        files.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if _COMMENT_FILE_RE.match(name)
        )
    return files


def save_lines(lines, chunk=None):
    """Append every line to a file named after the line's subreddit.

    Output files are opened by bare subreddit name, i.e. relative to the
    current working directory, and always in append mode.

    Args:
        lines: iterable of JSON-encoded strings; each must decode to an
            object with a ``'subreddit'`` key.  Lines are written back
            verbatim, without any separator added.
        chunk: optional chunk number, used only in the progress messages.
            When ``None`` only the final ``DONE`` is printed.

    Raises:
        ValueError: if a line is not valid JSON.
        KeyError: if a decoded line has no ``'subreddit'`` key.
    """
    if chunk is not None:
        _progress('SORTING CHUNK %d ... ' % chunk)
    # Decode each line once and carry the key alongside it, instead of
    # re-parsing the JSON for the sort and again for the grouping.
    keyed = [(json.loads(line)['subreddit'], line) for line in lines]
    # list.sort is stable, so lines of one subreddit keep their input order.
    keyed.sort(key=_BY_SUBREDDIT)
    if chunk is not None:
        _progress('DONE\nSAVING CHUNK %d ... ' % chunk)
    for sub, rows in itertools.groupby(keyed, key=_BY_SUBREDDIT):
        with open(sub, 'a') as sub_file:
            sub_file.writelines(line for _sub, line in rows)
    _progress('DONE\n')


def main():
    """Split every comment dump under ``../reddit_data_comments/`` by subreddit."""
    # Imported here rather than at module level so the helpers above can be
    # imported without the project-local ``utils`` module being on the path.
    from utils import get_chunks_of_file

    for month_fname in get_comment_files_in_folder('../reddit_data_comments/'):
        with open(month_fname, 'r') as month_file:
            _progress('BEGINNING FILE: %s\n' % month_fname)
            # Start at 1 so the first chunk is reported as chunk 1.
            chunks = get_chunks_of_file(month_file, True)
            for chunk, lines in enumerate(chunks, 1):
                save_lines(lines, chunk)


if __name__ == '__main__':
    main()
import itertools
import json
import operator
import sys

# Bucket used for records that carry no 'subreddit' field.
DEFAULT_SUBREDDIT = 'NO_SUBREDDIT'

# Sort/group key for ``(subreddit, line)`` pairs.
_BY_SUBREDDIT = operator.itemgetter(0)


def _progress(message):
    """Write *message* to stdout as-is and flush so it shows up immediately."""
    sys.stdout.write(message)
    sys.stdout.flush()


def sort_by_subreddit(lines):
    """Return ``(subreddit, line)`` pairs for *lines*, sorted by subreddit.

    Each line is JSON-decoded exactly once.  Records without a
    ``'subreddit'`` key are filed under ``DEFAULT_SUBREDDIT``.  The sort is
    stable, so lines of the same subreddit keep their input order.

    Args:
        lines: iterable of JSON-encoded strings, each decoding to an object.

    Returns:
        A new list of ``(subreddit, line)`` tuples.

    Raises:
        ValueError: if a line is not valid JSON.
    """
    keyed = [
        (json.loads(line).get('subreddit', DEFAULT_SUBREDDIT), line)
        for line in lines
    ]
    keyed.sort(key=_BY_SUBREDDIT)
    return keyed


def save_by_subreddit(keyed):
    """Append each line to the file named after its subreddit.

    Args:
        keyed: ``(subreddit, line)`` pairs already sorted by subreddit, as
            returned by ``sort_by_subreddit``.  Files are opened by bare
            subreddit name (relative to the current working directory) in
            append mode; lines are written verbatim.
    """
    for sub, rows in itertools.groupby(keyed, key=_BY_SUBREDDIT):
        with open(sub, 'a') as subfile:
            subfile.writelines(line for _sub, line in rows)


def main():
    """Split ``../RS_full_corpus`` into one file per subreddit, chunk by chunk."""
    # Imported here rather than at module level so the helpers above can be
    # imported without the project-local ``utils`` module being on the path.
    from utils import get_chunks_of_file

    with open('../RS_full_corpus', 'r') as corpus:
        chunk = 1
        # The READING message is emitted before the generator is advanced,
        # so reading time is attributed to the right chunk number.
        _progress('READING CHUNK %d ... ' % chunk)
        for lines in get_chunks_of_file(corpus):
            _progress('DONE\nSORTING CHUNK %d ... ' % chunk)
            keyed = sort_by_subreddit(lines)
            _progress('DONE\nSAVING CHUNK %d ... ' % chunk)
            save_by_subreddit(keyed)
            chunk += 1
            _progress('DONE\nREADING CHUNK %d ... ' % chunk)
    _progress('END\n')


if __name__ == '__main__':
    main()
self.df = pd.DataFrame(index=filenames, columns=["done"]) self.df.done = False if os.path.exists(self.fname): saved_df = pd.read_csv(self.fname) saved_df.columns = ["fnames", "done"] self.df.ix[saved_df[saved_df.done == True].fnames, "done"] = True def completed(self, filename): try: self.df.ix[filename, "done"] = True self.df.to_csv(self.fname) finally: self.df.ix[filename, "done"] = True self.df.to_csv(self.fname) if __name__ == "__main__": sub_filenames = sorted(get_sub_files("../sub_files")) df = StatusDF(sub_filenames) for sub_filename in sub_filenames: if df.df.ix[sub_filename, "done"]: print "%s ALREADY COMPLETED" % sub_filename else: print "%s" % sub_filename sub_name = sub_filename.split("/")[-1] remake_folder(sub_name) with open(sub_filename, "r") as sub_file: for i, lines in enumerate(get_chunks_of_file(sub_file, True)): save_lines(lines, sub_name + "/", i + 1) df.completed(sub_filename)