def run(self):
    """Convert every matching zip archive in the source directory to a TSV file.

    Archives are found by globbing ``self.args.src`` with
    ``ALEXA_ZIP_FILE_FORMAT`` and are handled in sorted path order. For each
    archive the timestamp is parsed from its basename using
    ``ALEXA_TS_FORMAT``, re-serialized, and used to name the output file
    ``top_1m_<tstamp>`` inside ``self.args.dst``.

    An archive is skipped when its output file already exists or when the
    archive is zero bytes long. Otherwise one line per ``(site, rank)`` pair
    yielded by ``iter_alexa_zip_file`` is written as
    ``site<TAB>tstamp<TAB>rank``.

    The output is first written to a hidden temporary file in the
    destination directory and renamed into place only once it is complete.
    Writing directly to the final name would leave a truncated file behind
    if the conversion failed midway, and that file would then be treated as
    "already processed" and skipped on every later run.

    Progress is reported through ``self.out``. Any exception raised while
    reading an archive or writing its output propagates to the caller after
    the partial temporary file has been removed.
    """
    pattern = os.path.join(self.args.src, ALEXA_ZIP_FILE_FORMAT)
    for zip_f in sorted(glob.glob(pattern)):
        basename = os.path.basename(zip_f)
        date = midas.parse_tstamp(basename, ALEXA_TS_FORMAT)
        tstamp = midas.serialize_tstamp(date)
        dst_fname = 'top_1m_{0}'.format(tstamp)
        dst_f = os.path.join(self.args.dst, dst_fname)

        if os.path.isfile(dst_f) or os.stat(zip_f).st_size == 0:
            self.out('Skipping {0}'.format(basename))
            continue

        # Same directory as the final file so the rename never crosses a
        # filesystem boundary; the leading dot keeps an orphaned temp file
        # from being mistaken for a finished ``top_1m_*`` output.
        tmp_f = os.path.join(self.args.dst, '.{0}.tmp'.format(dst_fname))
        completed = False
        try:
            with open(tmp_f, 'w') as fp:
                for site, rank in iter_alexa_zip_file(zip_f):
                    fp.write('{0}\t{1}\t{2}\n'.format(site, tstamp, rank))
            completed = True
        finally:
            # On any failure (including KeyboardInterrupt) discard the
            # partial output so the archive is retried on the next run.
            if not completed and os.path.exists(tmp_f):
                os.remove(tmp_f)

        os.rename(tmp_f, dst_f)
        self.out('Processed {0}'.format(basename))
def ids_to_samples(self):
    """Load sample rows and index them by site id.

    The sample location is ``self.args.samples`` when that value is truthy,
    otherwise ``self.config['samples']``. The location may be a single file,
    or a directory, in which case every regular file directly inside it is
    read (sub-directories are not descended into).

    Each file is read through ``csv_file_reader`` with a tab delimiter and
    is expected to yield ``(site, tstamp, code)`` rows. The timestamp is
    passed through ``parse_tstamp`` and the site is translated to an id via
    ``self.sites_to_ids``; a failed lookup there propagates to the caller.

    Returns:
        dict: ``site_id -> (site, parsed_tstamp, code)``. When several rows
        map to the same id, the row read last replaces the earlier ones.
    """
    source = self.args.samples or self.config['samples']

    if os.path.isfile(source):
        paths = [source]
    else:
        candidates = (os.path.join(source, entry)
                      for entry in os.listdir(source))
        paths = [path for path in candidates if os.path.isfile(path)]

    by_id = {}
    for path in paths:
        for site, raw_tstamp, code in csv_file_reader(path, delimiter='\t'):
            # Parse first, then look up the id, as the rows are consumed.
            parsed = parse_tstamp(raw_tstamp)
            by_id[self.sites_to_ids[site]] = (site, parsed, code)
    return by_id