def collect_segments(sampling):
    logger.info('{} - assembling data for sampling'.format(sampling))
    sampling_dir = join(ARTIFACTS_DIR, 'samplings', '{:0>3}'.format(sampling))
    makedirs(sampling_dir, exist_ok=True)
    for day, segments in sampling_segments(sampling):
        for segment_idx, segment in enumerate(segments):
            enhanced = enhance_segment(day, segment, sampling)
            if enhanced is None or len(enhanced) == 0:
                logger.error(json.dumps(segment, indent=2))
                continue
            fw = FileWriter(SampledNACSRow, enhanced)
            fname = join(
                sampling_dir,
                '{}.no{:0>2} - {:.0f} seconds.asc'.format(
                    day, segment_idx,
                    round((enhanced[-1][0] - enhanced[0][0]) / 1000)))
            segment['filename'] = basename(fname)
            logger.info('\t{} - segment file'.format(basename(fname)))
            with open(fname, 'w') as datafile:
                fw.reflect(datafile)
        with open(join(sampling_dir, '000.{}_list.json'.format(day)),
                  'w') as segments_list:
            json.dump(segments, segments_list)

def make_deltas(key, dirname, RowParser):
    logger.info('{}. Reading datafiles'.format(key))
    all_datafiles = [
        fname.strip() for fname in open(
            join(ARTIFACTS_DIR, '{}.good.txt'.format(key)), 'r').readlines()
    ]
    days = {fname[:7] for fname in all_datafiles}
    for day in days:
        datafiles = sorted([
            join(dirname, fname) for fname in all_datafiles
            if fname.startswith(day)
        ])
        # element-wise rounding via the ndarray method; the built-in round()
        # does not accept arrays
        ut = (concatenate([
            local_preload(fname, FileParser, RowParser, fname).get(
                'ut_of_day', transposed=True)[0] for fname in datafiles
        ], axis=0) / 1000.).round()
        o_dens = concatenate([
            local_preload(fname, FileParser, RowParser, fname).get(
                'o_dens', transposed=True)[0] for fname in datafiles
        ], axis=0)
        deltas = (concatenate([ut, array([0])]) -
                  concatenate([array([0]), ut]))[1:]
        yield day, deltas, ut, o_dens

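# Illustrative sketch (not part of the pipeline): the padded-concatenate
# expression in make_deltas() is equivalent to numpy.diff for all but the last
# element, which comes out as -ut[-1] and never matches a valid sampling.
# _demo_deltas is a hypothetical helper added only to document that behaviour.
def _demo_deltas():
    from numpy import allclose, array, concatenate, diff
    ut = array([10., 11., 13., 16.])
    deltas = (concatenate([ut, array([0])]) - concatenate([array([0]), ut]))[1:]
    assert allclose(deltas[:-1], diff(ut))  # [1., 2., 3.]
    assert deltas[-1] == -ut[-1]            # trailing padding artifact
    return deltas
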
def chunkup(data):
    chunks = []
    # Let's ignore fluctuations of sampling within 1 s
    for datafile in data:
        ut = datafile.get('ut_of_day', transposed=True)[0]
        # We're sure there are no files with fewer than 2 datapoints
        ongoing_sampling = round(ut[1] - ut[0])
        starts_at = 0
        for idx in range(1, len(ut)):
            sampling = round(ut[idx] - ut[idx - 1])
            if sampling == ongoing_sampling:
                continue
            chunks.append((ut[starts_at], ut[idx - 1], idx - starts_at,
                           ongoing_sampling))
            starts_at = idx - 1
            ongoing_sampling = sampling
        chunks.append(
            (ut[starts_at], ut[-1], len(ut) - starts_at, ongoing_sampling))
    for n, (start, end, length, sampling) in enumerate(chunks):
        logger.info('{}.\t{}\t- {}\t{} / {}'.format(n, start, end, sampling,
                                                    length))
    return chunks

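# Illustrative sketch (hypothetical stub, not pipeline code): chunkup() returns
# (start_ut, end_ut, n_points, sampling) tuples, one per run of constant sampling.
def _demo_chunkup():
    class _StubDatafile:
        # mimics FileParser's get(..., transposed=True) just enough for chunkup()
        def get(self, name, transposed=False):
            return [[0., 1., 2., 3., 5., 7., 9.]]

    return chunkup([_StubDatafile()])
    # -> [(0.0, 3.0, 4, 1), (3.0, 9.0, 4, 2)]
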
def chunkup_samplings(key, dirname, RowParser):
    logger.info('{} at {}'.format(key, dirname))
    for sampling in [1]:  # range(1, 199):
        logger.info('{} -- sampling'.format(sampling))
        # by_ut, by_length = artifacts(key, sampling)
        # if not exists(by_ut) or not exists(by_length):
        sample(key, dirname, RowParser, sampling)

def main():
    removed = 0
    for fname in listdir(CACHE_DIR):
        if fnmatch(fname, '*.pydata'):
            logger.debug('Removing {}'.format(fname))
            remove(join(CACHE_DIR, fname))
            removed += 1
    logger.info('Removed {} files'.format(removed))

def segments_list(sampling):
    logger.info('{} - listing sampling'.format(sampling))
    sampling_dir = join(ARTIFACTS_DIR, 'samplings', '{:0>3}'.format(sampling))
    return sorted([
        join(sampling_dir, fname) for fname in listdir(sampling_dir)
        if fnmatch(fname, '19822*.asc') or fnmatch(fname, '19823*.asc')
    ])

def find_duplicates(filenames):
    hashes = {}
    for filename in filenames:
        fhash = hashlib.sha256(
            dataof(filename).encode('utf-8').strip()).hexdigest()
        if fhash not in hashes:
            hashes[fhash] = []
        hashes[fhash].append(filename)
    for fhash, fnames in hashes.items():
        if len(fnames) < 2:
            continue
        logger.info('\n\t{}'.format(fhash))
        for fname in fnames:
            logger.info('\t\t{}'.format(basename(fname)))
    return {ec[0] for ec in hashes.values() if len(ec) > 1}

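# Usage sketch (hypothetical file names; dataof() is assumed to return a file's
# contents as a str, as it is used above). One representative per group of files
# with identical contents is returned, and filtration() drops exactly those names.
def _demo_find_duplicates():
    dupes = find_duplicates(['1982039A.asc', '1982039B.asc', '1982040A.asc'])
    # if the first two files have identical contents: dupes == {'1982039A.asc'}
    return dupes
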
def draw_tracks(destination_dir=None):
    nacs_ignores = [
        join(DE2SOURCE_NACS_DIR, fname.strip()) for fname in open(
            join(ARTIFACTS_DIR, 'nacs.ignore.txt'), 'r').readlines()
    ]
    nacs_goodfiles = [
        fname for fname in list_datafiles(DE2SOURCE_NACS_DIR)
        if fname not in nacs_ignores
    ]
    wats_ignores = [
        join(DE2SOURCE_WATS_DIR, fname.strip()) for fname in open(
            join(ARTIFACTS_DIR, 'wats.ignore.txt'), 'r').readlines()
    ]
    wats_goodfiles = [
        fname for fname in list_datafiles(DE2SOURCE_WATS_DIR)
        if fname not in wats_ignores
    ]
    files_by_days = {}
    for filename in nacs_goodfiles:
        year_day = basename(filename)[:7]
        if year_day not in files_by_days:
            files_by_days[year_day] = {'nacs': [], 'wats': []}
        files_by_days[year_day]['nacs'].append(filename)
    for filename in wats_goodfiles:
        year_day = basename(filename)[:7]
        if year_day not in files_by_days:
            files_by_days[year_day] = {'nacs': [], 'wats': []}
        files_by_days[year_day]['wats'].append(filename)
    for yearday in sorted(files_by_days.keys()):
        logger.info('{}: Year/Day'.format(yearday))
        # count NACS + WATS files for the day (the per-day dict itself always
        # has exactly two keys)
        logger.info('\t{}: Number of files'.format(
            len(files_by_days[yearday]['nacs']) +
            len(files_by_days[yearday]['wats'])))
        nacs_chunks = sum([
            chunkup(SourceNACSRow, filename)
            for filename in files_by_days[yearday]['nacs']
        ], [])
        wats_chunks = sum([
            chunkup(SourceWATSRow, filename)
            for filename in files_by_days[yearday]['wats']
        ], [])
        logger.info('\t{}: Total NACS chunks'.format(len(nacs_chunks)))
        logger.info('\t{}: Total WATS chunks'.format(len(wats_chunks)))
        year_value = yearday[:4]
        day_value = yearday[4:]
        draw_chunks(year_value, day_value, nacs_chunks, wats_chunks,
                    destination_dir)

def chunkup(RowParser, filename):
    data = FileParser(RowParser, filename)
    ut = data.get('ut', transposed=True)[0]
    lat = data.get('lat', transposed=True)[0]
    lon = data.get('lon', transposed=True)[0]
    chunks = list()
    sidx = 0
    # Let's set it for now: a 500 km gap is long enough to treat as a
    # different track
    threshold = 500 / 8.9
    for idx in range(1, len(ut)):
        if ut[idx] - ut[idx - 1] > threshold:
            chunks.append(
                [list(lat[sidx:idx]), list(lon[sidx:idx]), list(ut[sidx:idx])])
            sidx = idx
    chunks.append([list(lat[sidx:]), list(lon[sidx:]), list(ut[sidx:])])
    logger.info('\t\t{} :points // {}: chunks at {}'.format(
        len(ut), len(chunks), basename(filename)))
    return chunks

def draw_segment(sampling, segment_file):
    logger.info('\t{}: processing'.format(basename(segment_file)))
    segment_data = local_preload(segment_file, FileParser, SampledNACSRow,
                                 segment_file)
    ut = segment_data.get('ut', transposed=True)[0]
    lat = segment_data.get('lat', transposed=True)[0]
    lon = segment_data.get('lon', transposed=True)[0]
    o_dens = omit_zeros(segment_data.get('o_dens', transposed=True)[0])
    hours = ut_to_hours(ut)
    day = basename(segment_file)[:7]
    param_name = 'O density'
    fig_avg, fig_wave = analyze_param(sampling, day, hours, lat, lon, o_dens,
                                      param_name)
    # stripping the last three characters of '.asc' keeps the dot, so artifacts
    # are named '<segment>.o density.trend.png' and '<segment>.o density.wave.png'
    fig_avg_artifact_fname = segment_file[:-3] + param_name.lower() + '.trend.png'
    logger.debug('\t\t{}: artifact'.format(basename(fig_avg_artifact_fname)))
    fig_avg.savefig(fig_avg_artifact_fname,
                    dpi=300,
                    papertype='a0',
                    orientation='landscape')
    plt.close(fig_avg)
    fig_wave_artifact_fname = segment_file[:-3] + param_name.lower() + '.wave.png'
    logger.debug('\t\t{}: artifact'.format(basename(fig_wave_artifact_fname)))
    fig_wave.savefig(fig_wave_artifact_fname,
                     dpi=300,
                     papertype='a0',
                     orientation='landscape')
    plt.close(fig_wave)

def filtration(key, basedir, RowParser):
    badfiles = read_badfileslist(
        basedir, join(ARTIFACTS_DIR, '{}.notmonotone.txt'.format(key)))
    datafiles = goodfiles(basedir, badfiles)
    logger.info('key: {}'.format(key))
    logger.info('\t{}: total number of good datafiles'.format(len(datafiles)))
    duplicates = find_duplicates(datafiles)
    filtered_datafiles = list(set(datafiles).difference(duplicates))
    logger.info('\t{}: total number of exclusive datafiles'.format(
        len(filtered_datafiles)))
    total_intersections_list = []
    iteration_number = 0
    while True:
        intersections = find_intersections(filtered_datafiles, RowParser)
        logger.info('\t{} iteration. Intersection search'.format(iteration_number))
        iteration_number += 1
        logger.info('\t\t{} files are intersecting'.format(len(intersections)))
        if len(intersections) == 0:
            break
        total_intersections_list += list(intersections)
        filtered_datafiles = list(
            set(filtered_datafiles).difference(intersections))
    logger.info('{} files left after filtering'.format(len(filtered_datafiles)))
    total_datapoints = sum([
        len(
            local_preload(filename, FileParser, RowParser, filename).get(
                'ut', transposed=True)[0]) for filename in filtered_datafiles
    ])
    logger.info('{} datapoints left'.format(total_datapoints))
    logger.info('\nDuplicated files:')
    for fname in duplicates:
        logger.info('\t{}'.format(basename(fname)))
    with open(join(ARTIFACTS_DIR, '{}.duplicates.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join(
            [basename(filename) for filename in sorted(duplicates)]))
    logger.info('\nIntersected files:')
    for fname in total_intersections_list:
        logger.info('\t{}'.format(basename(fname)))
    # `intersections` is always empty once the loop exits, so persist the
    # accumulated list instead
    with open(join(ARTIFACTS_DIR, '{}.intersections.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join([
            basename(filename)
            for filename in sorted(total_intersections_list)
        ]))
    with open(join(ARTIFACTS_DIR, '{}.ignore.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join([
            basename(filename) for filename in sorted(
                list(badfiles) + total_intersections_list + list(duplicates))
        ]))
    with open(join(ARTIFACTS_DIR, '{}.good.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join(
            [basename(filename) for filename in sorted(datafiles)]))

def sample(key, dirname, RowParser, sampling):
    for day, deltas, ut, o_dens in make_deltas(key, dirname, RowParser):
        # 1. Split on chunks with gaps no longer than sampling;
        # 2. Iterate over datashifts 0 <= j < sampling;
        # 3. Look for sampling matches;
        # 4. Exclude multiples of matched samplings;
        # 5. Store (t_start, t_end, sampling) taking into account sampling shift j;
        logger.info('[{}]\t {}: Total length of ut'.format(day, len(ut)))
        min_sequence_duration = 250 * sqrt(sampling)
        working_samplings = []
        logger.info('[{}]\t {}: sampling to check'.format(day, sampling))
        starts_at = 0
        continuous = True
        for idx in range(len(deltas)):
            continuous = (continuous and o_dens[idx] is not None
                          and not isnan(o_dens[idx]) and o_dens[idx] != 0)
            if not continuous:
                starts_at = idx + 1
                continuous = True
                continue
            if deltas[idx] - sampling >= 0.5:
                year = date.fromtimestamp(ut[starts_at]).strftime('%Y / %j')
                if deltas[idx] > 500.:
                    logger.info(
                        chalk.red('{:0>4d}\t..\t\t\t{:.2f}'.format(
                            sampling, deltas[idx]),
                                  bold=True,
                                  underline=True))
                segment_length = ut[idx] - ut[starts_at]
                if segment_length > min_sequence_duration:
                    logger.info(
                        chalk.green(
                            '{:0>4d}\t++[{}]\t{}\t\t{:.2f} > {}'.format(
                                sampling, len(working_samplings), year,
                                segment_length, min_sequence_duration),
                            bold=True,
                            underline=True))
                    shift = 0
                    sub_working = []
                    while sum(deltas[starts_at:starts_at + shift]
                              ) < segment_length - min_sequence_duration:
                        points = verify_sampling(
                            deltas[starts_at + shift:idx], sampling)
                        if points is not None and points > 0:
                            sub_working.append({
                                'indexes': (starts_at + shift, idx),
                                'length': idx - starts_at - shift + 1,
                                'points': points,
                                'segment': (ut[starts_at + shift], ut[idx]),
                                'duration': ut[idx] - ut[starts_at + shift],
                                'resolution': float(sampling) / points,
                                'day': day,
                            })
                            if (ut[idx] - ut[starts_at + shift]) > 0.9 * segment_length:
                                break
                        shift += 1
                    if len(sub_working) > 0:
                        working_samplings.append(
                            sorted(sub_working, key=lambda x: x['duration'])[0])
                starts_at = idx + 1
        by_ut, by_length = artifacts(day, key, sampling)
        with open(by_ut, 'w') as artifact:
            json.dump(working_samplings, artifact)
        with open(by_length, 'w') as artifact:
            json.dump(
                sorted(working_samplings,
                       key=lambda x: (-x['duration'], x['segment'][0])),
                artifact)

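# Usage sketch (hedged): the by_length artifact written by sample() holds the
# day's matched segments sorted by descending duration, so the first entry is
# the longest candidate. The day/key values below are illustrative only;
# artifacts(day, key, sampling) is assumed to return the two paths as used above.
def _demo_longest_segment():
    _, by_length = artifacts('1982039', 'nacs', 1)
    with open(by_length) as artifact:
        longest = json.load(artifact)[0]
    # longest['segment'] -> (ut_start, ut_end); longest['duration'] -> seconds
    return longest
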
def data_report(key, RowParser, dirname):
    datafiles = list_datafiles(dirname)
    doppelganger_class = set()
    # usually doppelgangers appear at the end of the file, but better check it
    dc_eof = 0
    dc_neof = 0
    midnightcut_class = set()
    jumps_per_file = {}
    total_datapoints = 0
    badfiles_datapoints = 0
    good_datapoints_in_badfiles = 0
    total_files = len(list_datafiles(dirname))
    for n, file_name in enumerate(datafiles):
        breaking_idx = -1
        file_key = basename(file_name)
        logger.debug('{}. {}'.format(n, file_key))
        filedata = local_preload(file_name, FileParser, RowParser, file_name)
        uts = filedata.get('ut', transposed=True)[0]
        total_datapoints += len(filedata.data)
        for idx in range(1, len(uts)):
            if uts[idx] == uts[idx - 1]:
                doppelganger_class.add(file_name)
                if idx + 1 == len(uts):
                    dc_eof += 1
                else:
                    dc_neof += 1
                if breaking_idx == -1:
                    # We care about the very first data-compromising datapoint
                    # and count both doppelgangers as bad datapoints
                    breaking_idx = idx - 2
            if uts[idx] < uts[idx - 1]:
                midnightcut_class.add(file_name)
                if file_key not in jumps_per_file:
                    jumps_per_file[file_key] = list()
                jumps_per_file[file_key].append((uts[idx - 1], uts[idx]))
                logger.debug('\t[{}/{}] {} > {}'.format(
                    idx, len(uts), uts[idx - 1], uts[idx]))
                if breaking_idx == -1:
                    # We care about the very first data-compromising datapoint
                    breaking_idx = idx - 1
        if file_name in doppelganger_class or file_name in midnightcut_class:
            badfiles_datapoints += len(filedata.data)
            good_datapoints_in_badfiles += breaking_idx + 1  # + 0th index
    jumps_histogram = {
        k: len(list(filter(lambda x: len(x) == k, jumps_per_file.values())))
        for k in set([len(x) for x in jumps_per_file.values()])
    }
    all_badfiles = list(midnightcut_class) + list(doppelganger_class)
    logger.info('key: {}'.format(key))
    logger.info('\tTotals:')
    logger.info('\t{}: total data points'.format(total_datapoints))
    logger.info(
        '\t\t{}: total data points in bad files'.format(badfiles_datapoints))
    logger.info('\t\t{:2.4}%: % of all datapoints in bad files'.format(
        100. * badfiles_datapoints / total_datapoints))
    logger.info('\t\t{}: total good datapoints in BAD files'.format(
        good_datapoints_in_badfiles))
    logger.info('\t\t{}: total good datapoints in ALL files'.format(
        total_datapoints - badfiles_datapoints + good_datapoints_in_badfiles))
    logger.info(
        '\t\t{:2.4}%: ratio of good datapoints to all datapoints'.format(
            100. - 100 * (badfiles_datapoints - good_datapoints_in_badfiles) /
            total_datapoints))
    logger.info('\t{}: total data files'.format(total_files))
    logger.info('\t{}: total bad files'.format(len(all_badfiles)))
    logger.info('\t\t{}: midnight cut'.format(len(midnightcut_class)))
    for jumps, files in jumps_histogram.items():
        logger.info('\t\t\t{} jumps in {} files'.format(jumps, files))
    logger.info('\t\t{}: doppelgangers'.format(len(doppelganger_class)))
    logger.info('\t\t\t{}: of them at the end of file'.format(dc_eof))
    logger.info('\t\t\t{}: of them NOT at the end of file'.format(dc_neof))
    logger.info('\t{:2.4}%: rate of losses from removing doppelgangers'.format(
        100 * (dc_eof + dc_neof) / total_datapoints))
    logger.debug('Bad files:')
    for badfile_name in sorted(all_badfiles):
        logger.debug('\t\t{}'.format(basename(badfile_name)))
    with open(join(ARTIFACTS_DIR, '{}.notmonotone.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join(
            [basename(filename) for filename in all_badfiles]))

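# Equivalent formulation (sketch): the jumps_histogram comprehension above maps
# "number of jumps in a file" -> "number of files with that many jumps", which
# is exactly what collections.Counter produces.
def _jumps_histogram_via_counter(jumps_per_file):
    from collections import Counter
    return dict(Counter(len(jumps) for jumps in jumps_per_file.values()))
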