def collect_segments(sampling):
    logger.info('{} - assembling data for sampling'.format(sampling))
    sampling_dir = join(ARTIFACTS_DIR, 'samplings', '{:0>3}'.format(sampling))
    makedirs(sampling_dir, exist_ok=True)

    for day, segments in sampling_segments(sampling):
        for segment_idx, segment in enumerate(segments):
            enhanced = enhance_segment(day, segment, sampling)
            if enhanced is None or len(enhanced) == 0:
                logger.error(json.dumps(segment, indent=2))
                continue

            fw = FileWriter(SampledNACSRow, enhanced)
            fname = join(sampling_dir, '{}.no{:0>2} - {:.0f} seconds.asc'.format(
                day,
                segment_idx,
                round((enhanced[-1][0] - enhanced[0][0]) / 1000)
            ))
            segment['filename'] = basename(fname)

            logger.info('\t{} - segment file'.format(basename(fname)))
            with open(fname, 'w') as datafile:
                fw.reflect(datafile)

        with open(join(sampling_dir, '000.{}_list.json'.format(day)), 'w') as segments_list:
            json.dump(segments, segments_list)
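
The duration embedded in the segment filename comes from the first column of the enhanced rows, which collect_segments treats as a timestamp in milliseconds. A minimal sketch of that formatting with made-up rows (the values and the '/tmp' directory are purely illustrative):

from os.path import join

# Hypothetical rows; row[0] is assumed to be a timestamp in milliseconds,
# which is why the duration below is divided by 1000.
enhanced = [[10000, 1.0], [18000, 1.1], [26000, 1.2]]
fname = join('/tmp', '{}.no{:0>2} - {:.0f} seconds.asc'.format(
    '1982200', 0, round((enhanced[-1][0] - enhanced[0][0]) / 1000)))
# -> '/tmp/1982200.no00 - 16 seconds.asc'
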
def make_deltas(key, dirname, RowParser):
    logger.info('{}. Reading datafiles'.format(key))
    all_datafiles = [
        fname.strip() for fname in open(
            join(ARTIFACTS_DIR, '{}.good.txt'.format(key)), 'r').readlines()
    ]

    days = {fname[:7] for fname in all_datafiles}

    for day in days:
        datafiles = sorted([
            join(dirname, fname) for fname in all_datafiles
            if fname.startswith(day)
        ])
        ut = round(concatenate([
            local_preload(fname, FileParser, RowParser, fname).get(
                'ut_of_day', transposed=True)[0]
            for fname in datafiles
        ], axis=0) / 1000.)

        o_dens = concatenate([
            local_preload(fname, FileParser, RowParser, fname).get(
                'o_dens', transposed=True)[0]
            for fname in datafiles
        ], axis=0)

        deltas = (concatenate([ut, array([0])]) -
                  concatenate([array([0]), ut]))[1:]
        yield day, deltas, ut, o_dens
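
The deltas expression pads ut on both ends before subtracting: the first len(ut) - 1 entries are the ordinary successive differences, and the trailing entry is -ut[-1]. A small self-contained check of that equivalence, using only numpy:

import numpy as np

ut = np.array([0., 8., 16., 32.])
deltas = (np.concatenate([ut, np.array([0.])]) -
          np.concatenate([np.array([0.]), ut]))[1:]
# First len(ut) - 1 entries match np.diff(ut); the trailing entry is -ut[-1].
assert np.allclose(deltas[:-1], np.diff(ut))
assert deltas[-1] == -ut[-1]
print(deltas)  # [  8.   8.  16. -32.]
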
def chunkup(data):
    chunks = []
    # Let's ignore fluctuations of sampling within 1 s
    for datafile in data:
        ut = datafile.get('ut_of_day', transposed=True)[0]
        # We're sure there are no files with fewer than 2 datapoints
        ongoing_sampling = round(ut[1] - ut[0])
        starts_at = 0
        for idx in range(1, len(ut)):
            sampling = round(ut[idx] - ut[idx - 1])
            if sampling == ongoing_sampling:
                continue

            chunks.append((ut[starts_at], ut[idx - 1], idx - starts_at,
                           ongoing_sampling))
            starts_at = idx - 1
            ongoing_sampling = sampling

        chunks.append(
            (ut[starts_at], ut[-1], len(ut) - starts_at, ongoing_sampling))
    for n, (start, end, length, sampling) in enumerate(chunks):
        logger.info('{}.\t{}\t- {}\t{} / {}'.format(n, start, end, sampling,
                                                    length))

    return chunks
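
One way to see what chunkup reports is to feed it a tiny stand-in for the parsed datafiles. The class below is hypothetical; it only mimics the .get('ut_of_day', transposed=True) interface the function expects, and it assumes chunkup and its module-level logger are in scope:

class _FakeDatafile:
    """Hypothetical stand-in exposing the .get() interface chunkup expects."""
    def __init__(self, ut):
        self._ut = ut

    def get(self, key, transposed=False):
        assert key == 'ut_of_day' and transposed
        return [self._ut]

# 8-second cadence for four points, then 16-second cadence.
ut = [0, 8, 16, 24, 40, 56, 72]
chunks = chunkup([_FakeDatafile(ut)])
# Expected: [(0, 24, 4, 8), (24, 72, 4, 16)]
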
def chunkup_samplings(key, dirname, RowParser):
    logger.info('{} at {}'.format(key, dirname))
    for sampling in [1]:  # range(1, 199):
        logger.info('{} -- sampling'.format(sampling))
        # by_ut, by_length = artifacts(key, sampling)
        # if not exists(by_ut) or not exists(by_length):
        sample(key, dirname, RowParser, sampling)
def main():
    removed = 0
    for fname in listdir(CACHE_DIR):
        if fnmatch(fname, '*.pydata'):
            logger.debug('Removing {}'.format(fname))
            remove(join(CACHE_DIR, fname))
            removed += 1
    logger.info('Removed {} files'.format(removed))
def segments_list(sampling):
    logger.info('{} - listing sampling'.format(sampling))
    sampling_dir = join(ARTIFACTS_DIR, 'samplings', '{:0>3}'.format(sampling))
    return sorted([
        join(sampling_dir, fname)
        for fname in listdir(sampling_dir)
        if fnmatch(fname, '19822*.asc') or fnmatch(fname, '19823*.asc')
    ])
def find_duplicates(filenames):
    hashes = {}
    for filename in filenames:
        fhash = hashlib.sha256(dataof(filename).encode('utf-8').strip()).hexdigest()
        if fhash not in hashes:
            hashes[fhash] = []
        hashes[fhash].append(filename)

    for fhash, fnames in hashes.items():
        if len(fnames) < 2:
            continue
        logger.info('\n\t{}'.format(fhash))
        for fname in fnames:
            logger.info('\t\t{}'.format(basename(fname)))

    return {ec[0] for ec in hashes.values() if len(ec) > 1}
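
find_duplicates hashes file contents through a dataof() helper defined elsewhere in the project and returns one representative filename per duplicate class. A hedged sketch of how it could be exercised, with an assumed dataof that simply reads the file as text (the real helper may differ):

import tempfile
from os.path import join

def dataof(filename):
    """Assumed helper: return the raw text content of a datafile."""
    with open(filename, 'r') as datafile:
        return datafile.read()

tmpdir = tempfile.mkdtemp()
for name, text in (('a.asc', 'same'), ('b.asc', 'same'), ('c.asc', 'other')):
    with open(join(tmpdir, name), 'w') as out:
        out.write(text)

duplicates = find_duplicates([join(tmpdir, name) for name in ('a.asc', 'b.asc', 'c.asc')])
# One representative per duplicate class, e.g. {'<tmpdir>/a.asc'}
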
def draw_tracks(destination_dir=None):
    nacs_ignores = [
        join(DE2SOURCE_NACS_DIR, fname.strip()) for fname in open(
            join(ARTIFACTS_DIR, 'nacs.ignore.txt'), 'r').readlines()
    ]
    nacs_goodfiles = [
        fname for fname in list_datafiles(DE2SOURCE_NACS_DIR)
        if fname not in nacs_ignores
    ]

    wats_ignores = [
        join(DE2SOURCE_WATS_DIR, fname.strip()) for fname in open(
            join(ARTIFACTS_DIR, 'wats.ignore.txt'), 'r').readlines()
    ]
    wats_goodfiles = [
        fname for fname in list_datafiles(DE2SOURCE_WATS_DIR)
        if fname not in wats_ignores
    ]

    files_by_days = {}
    for filename in nacs_goodfiles:
        year_day = basename(filename)[:7]
        if year_day not in files_by_days:
            files_by_days[year_day] = {
                'nacs': [],
                'wats': [],
            }
        files_by_days[year_day]['nacs'].append(filename)

    for filename in wats_goodfiles:
        year_day = basename(filename)[:7]
        if year_day not in files_by_days:
            files_by_days[year_day] = {
                'nacs': [],
                'wats': [],
            }
        files_by_days[year_day]['wats'].append(filename)

    for yearday in sorted(files_by_days.keys()):
        logger.info('{}: Year/Day'.format(yearday))
        logger.info('\t{}: Number of files'.format(
            len(files_by_days[yearday]['nacs']) +
            len(files_by_days[yearday]['wats'])))
        nacs_chunks = sum([
            chunkup(SourceNACSRow, filename)
            for filename in files_by_days[yearday]['nacs']
        ], [])
        wats_chunks = sum([
            chunkup(SourceWATSRow, filename)
            for filename in files_by_days[yearday]['wats']
        ], [])
        logger.info('\t{}: Total NACS chunks'.format(len(nacs_chunks)))
        logger.info('\t{}: Total WATS chunks'.format(len(wats_chunks)))
        year_value = yearday[:4]
        day_value = yearday[4:]
        draw_chunks(year_value, day_value, nacs_chunks, wats_chunks,
                    destination_dir)
def chunkup(RowParser, filename):
    data = FileParser(RowParser, filename)
    ut = data.get('ut', transposed=True)[0]
    lat = data.get('lat', transposed=True)[0]
    lon = data.get('lon', transposed=True)[0]

    chunks = list()
    sidx = 0
    threshold = 500 / 8.9  # Let's set it for now: a 500 km gap is long enough to treat as a different track
    for idx in range(1, len(ut)):
        if ut[idx] - ut[idx - 1] > threshold:
            chunks.append(
                [list(lat[sidx:idx]),
                 list(lon[sidx:idx]),
                 list(ut[sidx:idx])])
            sidx = idx

    chunks.append([list(lat[sidx:]), list(lon[sidx:]), list(ut[sidx:])])
    logger.info('\t\t{}: points // {}: chunks at {}'.format(
        len(ut), len(chunks), basename(filename)))
    return chunks
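
The threshold above, 500 / 8.9, is the only thing deciding where a track is split: any ut gap larger than roughly 56 units starts a new chunk. A minimal illustration of just that gap rule, without FileParser:

# Gap rule only: ut gaps larger than 500 / 8.9 ~= 56.2 start a new chunk.
ut = [0, 8, 16, 100, 108, 116]
threshold = 500 / 8.9
breaks = [idx for idx in range(1, len(ut)) if ut[idx] - ut[idx - 1] > threshold]
# breaks == [3]: points 0-2 form one track, points 3-5 another.
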
def draw_segment(sampling, segment_file):
    logger.info('\t{}: processing'.format(basename(segment_file)))
    segment_data = local_preload(segment_file, FileParser, SampledNACSRow, segment_file)
    ut = segment_data.get('ut', transposed=True)[0]
    lat = segment_data.get('lat', transposed=True)[0]
    lon = segment_data.get('lon', transposed=True)[0]
    o_dens = omit_zeros(segment_data.get('o_dens', transposed=True)[0])

    hours = ut_to_hours(ut)
    day = basename(segment_file)[:7]

    param_name = 'O density'

    fig_avg, fig_wave = analyze_param(sampling, day, hours, lat, lon, o_dens, param_name)

    fig_avg_artifact_fname = segment_file[:-3] + param_name.lower() + '.trend.png'
    logger.debug('\t\t{}: artifact'.format(basename(fig_avg_artifact_fname)))
    fig_avg.savefig(fig_avg_artifact_fname, dpi=300, papertype='a0', orientation='landscape')
    plt.close(fig_avg)

    fig_wave_artifact_fname = segment_file[:-3] + param_name.lower() + '.wave.png'
    logger.debug('\t\t{}: artifact'.format(basename(fig_wave_artifact_fname)))
    fig_wave.savefig(fig_wave_artifact_fname, dpi=300, papertype='a0', orientation='landscape')
    plt.close(fig_wave)
def filtration(key, basedir, RowParser):
    badfiles = read_badfileslist(
        basedir,
        join(ARTIFACTS_DIR, '{}.notmonotone.txt'.format(key))
    )
    datafiles = goodfiles(basedir, badfiles)
    logger.info('key: {}'.format(key))
    logger.info('\t{}: total number of good datafiles'.format(len(datafiles)))
    duplicates = find_duplicates(datafiles)
    filtered_datafiles = list(set(datafiles).difference(duplicates))
    logger.info('\t{}: total number of exclusive datafiles'.format(len(filtered_datafiles)))
    total_intersections_list = []
    iteration_number = 0
    while True:
        intersections = find_intersections(filtered_datafiles, RowParser)
        logger.info('\t{} iteration. Intersection search'.format(iteration_number))
        iteration_number += 1
        logger.info('\t\t{} files are intersecting'.format(len(intersections)))
        if len(intersections) == 0:
            break
        total_intersections_list += list(intersections)
        filtered_datafiles = list(set(filtered_datafiles).difference(intersections))

    logger.info('{} files left after filtering'.format(len(filtered_datafiles)))
    total_datapoints = sum([
        len(local_preload(filename, FileParser, RowParser, filename).get('ut', transposed=True)[0])
        for filename in filtered_datafiles
    ])
    logger.info('{} datapoints left'.format(total_datapoints))
    logger.info('\nDuplicated files:')
    for fname in duplicates:
        logger.info('\t{}'.format(basename(fname)))

    with open(join(ARTIFACTS_DIR, '{}.duplicates.txt'.format(key)), 'w') as datafile:
        datafile.write('\n'.join([basename(filename) for filename in sorted(duplicates)]))

    logger.info('\nIntersected files:')
    for fname in total_intersections_list:
        logger.info('\t{}'.format(basename(fname)))

    # The while loop exits with an empty `intersections`, so write the accumulated list.
    with open(join(ARTIFACTS_DIR, '{}.intersections.txt'.format(key)), 'w') as datafile:
        datafile.write('\n'.join([basename(filename) for filename in sorted(total_intersections_list)]))

    with open(join(ARTIFACTS_DIR, '{}.ignore.txt'.format(key)), 'w') as datafile:
        datafile.write('\n'.join([basename(filename) for filename in sorted(list(badfiles) + total_intersections_list + list(duplicates))]))

    with open(join(ARTIFACTS_DIR, '{}.good.txt'.format(key)), 'w') as datafile:
        datafile.write('\n'.join([basename(filename) for filename in sorted(datafiles)]))
def sample(key, dirname, RowParser, sampling):
    for day, deltas, ut, o_dens in make_deltas(key, dirname, RowParser):
        # 1. Split on chunks with gaps no longer than sampling;
        # 2. Iterate over datashifts 0 <= j < sampling;
        # 3. Look for sampling matches;
        # 4. Exclude multiples of matched samplings;
        # 5. Store (t_start, t_end, sampling), taking into account the sampling shift j.

        logger.info('[{}]\t {}: Total length of ut'.format(day, len(ut)))
        min_sequence_duration = 250 * sqrt(sampling)
        working_samplings = []
        logger.info('[{}]\t {}: sampling to check'.format(day, sampling))
        starts_at = 0
        continuous = True
        for idx in range(len(deltas)):
            continuous = continuous and o_dens[idx] is not None and not isnan(
                o_dens[idx]) and o_dens[idx] != 0

            if not continuous:
                starts_at = idx + 1
                continuous = True
                continue

            if deltas[idx] - sampling >= 0.5:
                year = date.fromtimestamp(ut[starts_at]).strftime('%Y / %j')
                if deltas[idx] > 500.:
                    logger.info(
                        chalk.red('{:0>4d}\t..\t\t\t{:.2f}'.format(
                            sampling, deltas[idx]),
                                  bold=True,
                                  underline=True))
                segment_length = ut[idx] - ut[starts_at]

                if segment_length > min_sequence_duration:
                    logger.info(
                        chalk.green(
                            '{:0>4d}\t++[{}]\t{}\t\t{:.2f} > {}'.format(
                                sampling, len(working_samplings), year,
                                segment_length, min_sequence_duration),
                            bold=True,
                            underline=True))
                    shift = 0
                    sub_working = []
                    while sum(deltas[starts_at:starts_at + shift]
                              ) < segment_length - min_sequence_duration:
                        points = verify_sampling(deltas[starts_at + shift:idx],
                                                 sampling)
                        if points is not None and points > 0:
                            sub_working.append({
                                'indexes': (starts_at + shift, idx),
                                'length': idx - starts_at - shift + 1,
                                'points': points,
                                'segment': (ut[starts_at + shift], ut[idx]),
                                'duration': ut[idx] - ut[starts_at + shift],
                                'resolution': float(sampling) / points,
                                'day': day,
                            })
                            if ut[idx] - ut[starts_at + shift] > 0.9 * segment_length:
                                break
                        shift += 1

                    if len(sub_working) > 0:
                        working_samplings.append(
                            sorted(sub_working,
                                   key=lambda x: x['duration'])[0])
                starts_at = idx + 1

        by_ut, by_length = artifacts(day, key, sampling)

        with open(by_ut, 'w') as artifact:
            json.dump(working_samplings, artifact)

        with open(by_length, 'w') as artifact:
            json.dump(
                sorted(working_samplings,
                       key=lambda x: (-x['duration'], x['segment'][0])),
                artifact)
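
Two thresholds drive the selection in sample: a gap counts as a break once it exceeds the target sampling by 0.5 or more, and a candidate segment is kept only when it lasts longer than 250 * sqrt(sampling); among the per-shift candidates in sub_working, the one with the smallest duration wins. A quick numeric check of those pieces in plain Python:

from math import sqrt

# Minimum duration a segment must exceed, for a few sampling values.
for sampling in (1, 4, 16):
    print(sampling, 250 * sqrt(sampling))  # 250.0, 500.0, 1000.0

# Among the shift candidates, sorted(...)[0] keeps the shortest duration.
sub_working = [{'duration': 900.0}, {'duration': 700.0}]
assert sorted(sub_working, key=lambda x: x['duration'])[0]['duration'] == 700.0
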
def data_report(key, RowParser, dirname):
    datafiles = list_datafiles(dirname)
    doppelganger_class = set()
    dc_eof = 0  # usually doppelgangers appear at the end of the file, but better to check
    dc_neof = 0
    midnightcut_class = set()
    jumps_per_file = {}
    total_datapoints = 0
    badfiles_datapoints = 0
    good_datapoints_in_badfiles = 0
    total_files = len(list_datafiles(dirname))
    for n, file_name in enumerate(datafiles):
        breaking_idx = -1
        file_key = basename(file_name)
        logger.debug('{}. {}'.format(n, file_key))
        filedata = local_preload(file_name, FileParser, RowParser, file_name)
        uts = filedata.get('ut', transposed=True)[0]
        total_datapoints += len(filedata.data)
        for idx in range(1, len(uts)):
            if uts[idx] == uts[idx - 1]:
                doppelganger_class.add(file_name)
                if idx + 1 == len(uts):
                    dc_eof += 1
                else:
                    dc_neof += 1
                if breaking_idx == -1:  # We care about the very first compromising datapoint
                    breaking_idx = idx - 2  # Because we count both doppelgangers as bad datapoints

            if uts[idx] < uts[idx - 1]:
                midnightcut_class.add(file_name)

                if file_key not in jumps_per_file:
                    jumps_per_file[file_key] = list()
                jumps_per_file[file_key].append((uts[idx - 1], uts[idx]))
                logger.debug('\t[{}/{}] {} > {}'.format(
                    idx, len(uts), uts[idx - 1], uts[idx]))

                if breaking_idx == -1:  # We care about the very first compromising datapoint
                    breaking_idx = idx - 1

        if file_name in doppelganger_class or file_name in midnightcut_class:
            badfiles_datapoints += len(filedata.data)
            good_datapoints_in_badfiles += breaking_idx + 1  # + 0th index

    jumps_histogram = {
        k: len(list(filter(lambda x: len(x) == k, jumps_per_file.values())))
        for k in set([len(x) for x in jumps_per_file.values()])
    }
    all_badfiles = list(midnightcut_class | doppelganger_class)
    logger.info('key: {}'.format(key))
    logger.info('\tTotals:')
    logger.info('\t{}: total data points'.format(total_datapoints))
    logger.info(
        '\t\t{}: total data points in bad files'.format(badfiles_datapoints))
    logger.info('\t\t{:2.4}%: % of all datapoints in bad files'.format(
        100. * badfiles_datapoints / total_datapoints))
    logger.info('\t\t{}: total good datapoints in BAD files'.format(
        good_datapoints_in_badfiles))
    logger.info('\t\t{}: total good datapoints in ALL files'.format(
        total_datapoints - badfiles_datapoints + good_datapoints_in_badfiles))
    logger.info(
        '\t\t{:2.4}%: ratio of good datapoints to all datapoints'.format(
            100. - 100 * (badfiles_datapoints - good_datapoints_in_badfiles) /
            total_datapoints))
    logger.info('\t{}: total data files'.format(total_files))
    logger.info('\t{}: total bad files'.format(len(all_badfiles)))
    logger.info('\t\t{}: midnight cut'.format(len(midnightcut_class)))
    for jumps, files in jumps_histogram.items():
        logger.info('\t\t\t{} jumps in {} files'.format(jumps, files))
    logger.info('\t\t{}: doppelgangers'.format(len(doppelganger_class)))
    logger.info('\t\t\t{}: of them in the end of file'.format(dc_eof))
    logger.info('\t\t\t{}: of them NOT in the end of file'.format(dc_neof))
    logger.info('\t{:2.4}%: rate of losses from removing doppelgangers'.format(
        100 * (dc_eof + dc_neof) / total_datapoints))
    logger.debug('Bad files:')
    for badfile_name in sorted(all_badfiles):
        logger.debug('\t\t{}'.format(basename(badfile_name)))

    with open(join(ARTIFACTS_DIR, '{}.notmonotone.txt'.format(key)),
              'w') as datafile:
        datafile.write('\n'.join(
            [basename(filename) for filename in all_badfiles]))
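
The jumps_histogram comprehension maps "number of midnight jumps in a file" to "how many files have exactly that many jumps". A tiny standalone check of that mapping, with made-up jump lists:

jumps_per_file = {
    'a.asc': [(86390., 5.)],
    'b.asc': [(86390., 5.)],
    'c.asc': [(86390., 5.), (43200., 10.)],
}
jumps_histogram = {
    k: len(list(filter(lambda x: len(x) == k, jumps_per_file.values())))
    for k in set(len(x) for x in jumps_per_file.values())
}
assert jumps_histogram == {1: 2, 2: 1}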