    day=(date-year*10000-month*100)

    hour=time//100
    min=(time-hour*100)
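    # build timedelta offsets of each timestamp from 00:00 on 1 Jan of first_year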

    doy=[ datetime.datetime(int(year[i]),int(month[i]),int(day[i]),\
                                int(hour[i]),int(min[i]),0)- \
              datetime.datetime(first_year,1,1,0,0,0) \
              for i in range(len(year))]

    processed_dates=[doy[i].days+doy[i].seconds/(24.*60.*60.) for i in range(len(doy))]
    return processed_dates

#read obs files
all_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/*'%(species))
all_files = modules.natsorted(all_files)

#get all refs
ref_list = []
valid_refs=[]

for i in range(len(all_files)):
    f = all_files[i]
    f = f.replace("/work/home/db876/observations/surface/%s/EMEP/"%(species), "")
    f = f[:7]
    ref_list.append(f)

refs = set(ref_list)
refs = sorted([i for i in refs])
refs = np.array(refs)
print 'all refs len = ', len(refs)
elif species == "NO":
    data_resolution = 1.1
    mol_mass = 30.01

elif species == "NO2":
    data_resolution = 1.1
    mol_mass = 46.0055

elif species == "CO":
    data_resolution = 100.1
    mol_mass = 28.01

# read obs files
hourly_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.hr*.dat" % (species))
hourly_files = modules.natsorted(hourly_files)
daily_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.da.*" % (species))
daily_files = modules.natsorted(daily_files)
monthly_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.mo.*" % (species))
monthly_files = modules.natsorted(monthly_files)

# get all refs
ref_list_hourly = []
ref_list_daily = []
ref_list_monthly = []

valid_refs = []

for i in range(len(hourly_files)):
    f = hourly_files[i]
    f = f.replace("/work/home/db876/observations/surface/%s/GAW/" % (species), "")
    def site_iter_process(valid_refs, c):
        # for r in range(len(valid_refs)):
        ref = valid_refs[c]
        print ref

        # get site instrument for species
        met_i = met_refs.index(ref)
        print len(met_refs)
        print len(met_species)
        site_species = list(met_species[met_i])
        site_instruments = list(met_instruments[met_i])
        print site_species
        print site_instruments
        mm = site_instruments[site_species.index(species)]

        site_resolutions = []
        data_valid = True

        s_files = insensitive_glob("/work/home/db876/observations/surface/%s/EANET/*%s.csv" % (species, ref))
        site_files = []
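        # EANET filenames embed a two-digit year; keep only files matching the requested years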
        for y in year_array:
            for f in s_files:
                if str(y)[-2:] in f:
                    site_files.append(f)

        site_files = modules.natsorted(site_files)

        if site_files == []:
            print "No files for ref.\n"

        years = []
        months = []
        days = []
        hours = []

        vals = []

        last_year_index = len(site_files)
        for y in year_array:
            print "Processing Year %s" % y
            got_year = False
            for file in site_files:
                last_file_split = file.split("/")[-1]
                if str(y)[2:] in last_file_split:
                    got_year = True
                    break
            if got_year == False:
                # fill in data for missing year
                timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(y, 1, 1)
                ndays_missing = timedelta_diff.days
                print "ndays missing = ", ndays_missing

                vals = np.append(vals, [-99999] * (ndays_missing * 24))

                continue

            print file

            valid = True
            with open(file, "rb") as f:
                reader = csv.reader(f, delimiter=",")
                counter = 0

                # get resolution
                for row in reader:
                    if counter == 0:
                        all_units = row

                    elif counter == 1:
                        file_res = "H"

                        try:
                            hour_index = row.index("Hour")
                        except:
                            file_res = "D"
                        try:
                            day_index = row.index("Day")
                        except:
                            file_res = "M"
                        month_index = row.index("Month")
                        year_index = row.index("Year")

                        try:
                            spec_index = row.index(species.upper())
                            units = all_units[spec_index]
                        except:
                            valid = False
                            break

                        # make sure each year units are ppb
                        if units != "ppb":
                            print "Units not ppb!"
                            1 + "a"

                    if counter == 2:
                        if file_res == "H":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = row[day_index]
                            hh = row[hour_index]
                        elif file_res == "D":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = row[day_index]
                            hh = 1
                        elif file_res == "M":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = 1
                            hh = 1

                        start_datetime = datetime.datetime(int(yyyy), int(mm), int(dd), int(hh))

                    if counter == 3:
                        if file_res == "H":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = row[day_index]
                            hh = row[hour_index]
                        elif file_res == "D":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = row[day_index]
                            hh = 1
                        elif file_res == "M":
                            yyyy = row[year_index]
                            mm = row[month_index]
                            dd = 1
                            hh = 1

                        present_datetime = datetime.datetime(int(yyyy), int(mm), int(dd), int(hh))

                        time_delt = present_datetime - start_datetime
                        hour_delt = datetime.timedelta(hours=1)
                        day_delt = datetime.timedelta(hours=24)
                        week_delt = datetime.timedelta(hours=24 * 7)
                        month_delt = datetime.timedelta(hours=24 * 28)
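                        # infer the file's time resolution from the gap between the first two records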

                        print time_delt

                        if time_delt < day_delt:
                            print "Hourly Data"
                            file_res = "H"
                            site_resolutions.append(file_res)

                        elif (time_delt > hour_delt) & (time_delt < week_delt):
                            print "Daily Data"
                            file_res = "D"
                            site_resolutions.append(file_res)

                        elif time_delt > week_delt:
                            print "Monthly Data"
                            file_res = "M"
                            site_resolutions.append(file_res)

                        # break
                        # limit files by timeres return if not suitable for output res
                        if output_res == "H":
                            if (file_res == "D") or (file_res == "M"):
                                print "File resolution has to be Minimum Hourly. Skipping"
                                data_valid = False
                                return c, vals, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999
                        elif output_res == "D":
                            if file_res == "M":
                                print "File resolution has to be Minimum Daily. Skipping"
                                data_valid = False
                                return c, vals, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999

                    counter += 1

            # READ IN DATA
            if valid == True:
                with open(file, "rb") as f:
                    reader = csv.reader(f, delimiter=",")
                    counter = 0
                    for row in reader:

                        if counter >= 2:
                            yyyy = row[year_index]
                            mm = row[month_index]

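                            # hourly values are stored directly; daily/monthly values are replicated across each hour they cover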
                            if file_res == "H":
                                try:
                                    vals = np.append(vals, np.float64(row[spec_index]))
                                except:
                                    vals = np.append(vals, -99999)

                            elif file_res == "D":
                                try:
                                    vals = np.append(vals, [np.float64(row[spec_index])] * 24)
                                except:
                                    vals = np.append(vals, [-99999] * 24)

                            elif file_res == "M":
                                month_days = monthrange(int(yyyy), int(mm))[1]
                                try:
                                    vals = np.append(vals, [np.float64(row[spec_index])] * (month_days * 24))
                                except:
                                    vals = np.append(vals, [-99999] * (month_days * 24))

                        counter += 1
            else:
                print "Species is not in file header. Skipping Year"
                timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(y, 1, 1)
                ndays_missing = timedelta_diff.days
                print "ndays missing = ", ndays_missing
                vals = np.append(vals, [-99999] * (ndays_missing * 24))

        valid_refs_rev.append(ref)

        i_ref = met_refs.index(ref)
        tz = np.float64(met_tz[i_ref])
        lat = np.float64(met_lats[i_ref])
        lon = np.float64(met_lons[i_ref])
        alt = np.float64(met_alts[i_ref])
        raw_class_name = met_class[i_ref]
        anthrome_class_name = class_name[i_ref]

        # check tz is whole number else skip site
        if (tz % 1) != 0:
            print "Timezone is not even. Skipping"
            data_valid = False

        tz = int(tz)
        # correct time to UTC
        if tz < 0:
            # get rid of values at start and append -99999's at end
            cut = vals[:tz]
            for num in range(np.abs(tz)):
                cut = np.insert(cut, 0, -99999)
            vals = cut
        elif tz > 0:
            # put -99999's at start and get rid of values at end
            cut = vals[tz:]
            for num in range(tz):
                cut = np.append(cut, -99999)
            vals = cut

        # do data quality checks
        full_data, data_valid, data_complete = modules.quality_check_periodic(
            vals, data_valid, data_resolution, np.float64(alt), grid_dates, start_year, end_year
        )

        # if all files for the site share one resolution continue, otherwise skip the site
        all_same = all(x == site_resolutions[0] for x in site_resolutions)
        if not all_same:
            print "Not all files for site have same resolution. Skipping."
            data_valid = False
            return c, full_data, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999

        # set sampling as average
        st = "average"

        return (
            c,
            full_data,
            data_valid,
            lat,
            lon,
            alt,
            raw_class_name,
            anthrome_class_name,
            mm,
            st,
            file_res,
            data_complete,
        )
if species == 'O3':
    data_resolution = 1.1
    param_code = 44201

elif (species == 'NO'):
    data_resolution = 1.1
    param_code = 42601

elif (species == 'NO2'):
    data_resolution = 1.1
    param_code = 42602
 
elif species == 'ISOP':
    data_resolution = 0.6
    param_code =  43243 


files = glob.glob('/work/home/db876/observations/surface/%s/AQS/*'%(species))
files=modules.natsorted(files)

year_array = np.arange(start_year,end_year+1)

valid_files = []
for i in year_array: 
    for f in files:
        if str(i) in f:
            valid_files.append(f)
            
print valid_files

#create ref lists
all_site_refs = []
test_site_refs = []
uniq_refs = []
# Example 5
    start_year = start_years[y]
    end_year = end_years[y]

    print start_year, end_year

    year_array = range(start_year, end_year + 1)

    if (species == "O3") or (species == "NO") or (species == "NO2"):
        data_resolution = 1.1

    n_years = (end_year - start_year) + 1

    # read obs files
    all_files = glob.glob("/work/home/db876/observations/surface/%s/EANET/AT*" % (species))
    all_files = modules.natsorted(all_files)

    ref_list = []
    valid_refs = []
    valid_refs_rev = []

    for i in range(len(all_files)):
        f = all_files[i].replace(".csv", "")
        f = f.replace("/work/home/db876/observations/surface/%s/EANET/" % (species), "")
        f = f[4:]
        f = f.lower()
        ref_list.append(f)

    refs = set(ref_list)
    refs = [i for i in refs]
    refs = sorted(refs)
# Example 7
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    #for ref_i in range(len(valid_refs)):
    data_valid = True

    site_ref = valid_refs[c]
    print 'Current Ref is = ', site_ref, c

    s_files = glob.glob(
        '/work/home/db876/observations/surface/%s/CAPMON/ozon_smpls_%s*' %
        (species, site_ref))
    site_files = []
    for y in year_array:
        for f in s_files:
            if str(y) in f:
                site_files.append(f)

    site_files = modules.natsorted(site_files)

    yymmdd = []
    hhmm = []
    vals = []

    for file_i in range(len(site_files)):

        count = 0
        meta_start = -99999
        start_read_1 = False
        start_read_2 = False

        with open(site_files[file_i], 'rb') as f:
            reader = csv.reader(f, delimiter=',')
            print site_files[file_i]
            for row in reader:
                #print count
                #break out of loop at bottom of file
                if (start_read_2 == True) & (row[0] == '*TABLE ENDS'):
                    break

                #get metadata
                try:
                    if (row[0] == '*TABLE NAME') & (row[1] == 'Site information'):
                        meta_start = count + 2
                except:
                    pass
                if count == meta_start:
                    siteid_i = row.index('Site ID: standard')
                    sitename_i = row.index('Description')
                    lat_i = row.index('Latitude: decimal degrees')
                    lon_i = row.index('Longitude: decimal degrees')
                    try:
                        alt_i = row.index(
                            'Ground elevation: above mean sea level')
                    except:
                        alt_i = row.index('Ground altitude')
                    class_i = row.index('Site land use')

                if count == (meta_start + 6):
                    latitude = row[lat_i]
                    longitude = row[lon_i]
                    altitude = row[alt_i]
                    raw_class_name = row[class_i]
                    site_name = row[sitename_i]

                #get data
                if start_read_2 == True:
                    #read dates, times, and vals
                    date = row[8]
                    time = row[9]
                    yymmdd.append(date[:4] + date[5:7] + date[8:])
                    hhmm.append(time[:2] + time[3:])
                    quality_code = row[13]
                    #if flag not equal to V0 then make -99999
                    if quality_code == 'V0':
                        vals = np.append(vals, np.float64(row[12]))
                    else:
                        vals = np.append(vals, -99999)

                try:
                    if (row[0] == '*TABLE NAME') & (row[1] == 'OZONE_HOURLY'):
                        start_read_1 = True
                except:
                    pass

                if (start_read_1 == True) & (row[0] == '*TABLE COLUMN UNITS'):
                    unit = row[12]

                if (start_read_1 == True) & (row[0] == '*TABLE BEGINS'):
                    start_read_2 = True
                count += 1

    #add to n_obs_all
    n_all += len(vals)
    n_after_nometa += len(vals)

    #convert data less < 0 to -99999
    test_inv = vals < 0
    vals[test_inv] = -99999

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #put vals into full grid
    date_con = np.array(yymmdd).astype(int)
    time_con = np.array(hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    vals = vals[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(vals)
    mm_big = ['ultraviolet photometry'] * len(vals)

    #get obs valid
    test = vals != -99999
    n_obs_valid = len(vals[test])
    n_after_flagsandlod += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
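    # both time axes are rounded to 5 decimals so searchsorted is not thrown off by float noise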
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    vals = np.array(vals)
    full_data_after_flagsandlod[raw_indices] = vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, vals, mm_big, st_big, na = modules.remove_duplicate_points(
        site_ref, converted_time, vals, mm_big, st_big, 'blank', output_res)
    test = vals >= 0
    n_obs_valid = int(len(vals[test]))
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = vals

    #get metadata
    try:
        lat = np.float32(latitude)
    except:
        lat = 'na'
    try:
        lon = np.float32(longitude)
    except:
        lon = 'na'
    try:
        alt = np.float32(altitude)
    except:
        alt = 'na'
    unit = str(unit)
    raw_class_name = str(raw_class_name)
    site_name = str(site_name)
    country = 'Canada'
    contact = 'Dave MacTavish, 4905 Dufferin St., Toronto ON, CANADA, M3H 5T4, [email protected]'

    #set data tz - all CAPMON times are UTC
    data_tz = 0
    all_tz = [data_tz]

    key_meta = [lat, lon, alt]

    #set site file resolution
    file_res = 'H'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, 0, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(1)

    #set processed unit
    p_unit = 'ppbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
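        # utcoffset() seconds wrap within one day, so negative offsets come back as (24 - |offset|) hours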
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
# Example 8
#setup netcdf output
root_grp = Dataset(
    '%s_RADIOSONDES_SHADOZ_%s_%s.nc' % (species, start_year, end_year), 'w')
root_grp.description = 'SHADOZ Radiosondes of %s at sites in ppb - Program written by Dene Bowdalo' % (
    species)

site_count = 0

#---------------------------------------------------------------------------------
#process SHADOZ data

print '\nProcessing SHADOZ data\n'

files = glob.glob('/work/home/db876/observations/ozonesonde/SHADOZ/*')
files = modules.natsorted(files)

site_names = []
#separate out sites into each location
for i in files:
    i = i.replace('/work/home/db876/observations/ozonesonde/SHADOZ/', '')
    split = i.split('_')
    site_names.append(split[0])
site_names = np.sort(list(set(site_names)))

for site_name in site_names:
    print '\n'
    print site_name

    data_valid = True
# Example 9
exit_resolution_lats = []
exit_resolution_lons = []
exit_resolution_pg = []
exit_badmeasurementmethod_refs = []
exit_badmeasurementmethod_lats = []
exit_badmeasurementmethod_lons = []
exit_badmeasurementmethod_pg = []

unknown_mm = []
unknown_mm_refs = []
unknown_local_tz = []

#read obs files
all_files = glob.glob('/work/home/db876/observations/surface/%s/CAPMON/*' %
                      (species))
all_files = modules.natsorted(all_files)

#get all refs
ref_list = []
valid_refs = []

for i in range(len(all_files)):
    f = all_files[i]
    f = f.replace(
        "/work/home/db876/observations/surface/%s/CAPMON/" % (species), "")
    f = f[11:14]
    ref_list.append(f)

refs = set(ref_list)
refs = [i for i in refs]
refs = sorted(refs)
# Example 10
def site_iter_process(valid_refs,c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

#read files site at a time
#for ref_i in range(len(valid_refs)):
    site_ref = valid_refs[c]

    all_latitudes = []
    all_longitudes = []
    all_altitudes = []
    all_unit = []
    all_site_name = []
    all_country = []
    all_contact = []
    mm_big = []
    meta_valid_list = []

    data_valid = True

    print 'Current Ref is = ', site_ref,c
    #find if sites have full valid range from start year and finishing in end year
    s_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/%s*'%(fname_species,site_ref))
    year_files = [file.replace("/work/home/db876/observations/surface/%s/EMEP/"%(fname_species), "") for file in s_files]
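    # characters 8:12 of each EMEP filename hold the data year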
    cut_year_files = [file[8:12] for file in year_files]
    site_files = []
    for y in year_array:
        for i in range(len(s_files)):
            if str(y) in cut_year_files[i]:
                site_files.append(s_files[i])
                  
    site_files = modules.natsorted(site_files)
    
    #test for duplicate file years, if duplicates break processing
    file_years = []
    for file in site_files:
        last_file_split = file.split('/')[-1]
        file_years=np.append(file_years,last_file_split[8:12])
    for y in year_array:
        test = file_years == str(y)
        if len(file_years[test]) > 1:
            print 'Site has duplicate files for %s. Breaking processing'%(y)
            raise ValueError('Site has duplicate files for %s'%(y))

    if site_files == []:
        print 'No valid files for site\n'
        return
    
    #remove daily/monthly files if necessary
    if output_res == 'H':
        del_i = []
        for i in range(len(site_files)):
            if '.1d.' in site_files[i]:
                del_i.append(i)
            elif '.1mo.' in site_files[i]:
                del_i.append(i)
        site_files=np.delete(site_files,del_i)
    elif output_res == 'HD':
        del_i = []
        for i in range(len(site_files)):
            if '.1mo.' in site_files[i]:
                del_i.append(i)
        site_files=np.delete(site_files,del_i)
    
    for y in year_array:
        bad_meta = False
        got_year = False
        for file in site_files:
            last_file_split = file.split('/')[-1]
            if str(y) in last_file_split[8:12]:
                got_year = True
                break
        if got_year == False:
            #fill in data for missing year
            timedelta_diff = datetime.date(y+1, 1, 1) - datetime.date(y, 1, 1)
            ndays_missing = timedelta_diff.days       
            continue
    
        count = 0
        with open(file, 'rb') as f:
            reader = csv.reader(f,delimiter=' ')
            print file
            for row in reader:
                try:
                    row = filter(lambda a: a != '', row)
                except:
                    pass
                try:
                    row = filter(lambda a: a != ',', row)
                except:
                    pass
                                
                #get start date of file
                if row[0] == 'Startdate:':
                    data = row[1]
                    s_yyyy = data[:4]
                    s_mm = data[4:6]
                    s_dd = data[6:8]
                    s_hh = data[8:10]
                    s_min = data[10:12]
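                    # the time columns below are fractional days since 1 Jan of the start year, so only the year is used here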
                    start_datetime = datetime.datetime(int(s_yyyy),1,1,0,0)
                
                #get unit
                if row[0] == 'Unit:':
                    try:
                        if len(row) == 3:
                            unit_part1 = row[1]
                            unit_part2 = row[2]
                            unit = unit_part1+'_'+unit_part2
                        
                        elif len(row) == 2:
                            unit = row[1] 
                        all_unit.append(unit)
                    except:
                        bad_meta = True
        
                #get resolution
                if row[0] == 'Resolution':
                    if row[1] == 'code:':
                        file_res = row[2]
                        print 'Resolution = %s'%file_res
                
                #get latitude
                if row[0] == 'Station':
                    if row[1] == 'latitude:':
                        latitude = row[2]
                        all_latitudes.append(latitude)
            
                #get longitude
                if row[0] == 'Station':
                    if row[1] == 'longitude:':
                        longitude = row[2]
                        all_longitudes.append(longitude)
                    
                #get altitude
                if row[0] == 'Station':
                    if row[1] == 'altitude:':
                        altitude = row[2][:-1]
                        all_altitudes.append(altitude)
                        
                #get site name
                if row[0] == 'Station':
                    if row[1] == 'name:':
                        site_name = row[2]
                        all_site_name.append(site_name)
            
                #get period
                if row[0] == 'Period':
                    period_code = row[2]
                
                #get stats method
                if row[0] == 'Statistics:':
                    try:
                        st = row[1] + row[2]
                    except:
                        st = ''
                    if st != 'arithmeticmean':
                        print 'Not Arithmetic Mean!'
                        print row[1]
                        raise ValueError('Statistics must be arithmetic mean')
            
                #get instrument method and name
                if row[0] == 'Instrument':
                    if row[1] == 'type:':
                        mm_list = row[2:]
                        if len(mm_list) > 1:
                            site_mm = ''
                            for x in range(len(mm_list)):
                                site_mm = site_mm+mm_list[x]+' '
                            site_mm = site_mm.strip()
                        else:
                            site_mm = mm_list[0]
                
                    if row[1] == 'name:':
                        mn_list = row[2:]
                        if len(mn_list) > 1:
                            site_mn = ''
                            for x in range(len(mn_list)):
                                site_mn = site_mn+mn_list[x]+' '
                            site_mn = site_mn.strip()
                        else:
                            site_mn = mn_list[0]
                
                #get method ref
                if row[0] == 'Method':
                    if row[1] == 'ref:':
                        try:
                            mf_list = row[2:]
                            if len(mf_list) > 1:
                                site_mf = ''
                                for x in range(len(mf_list)):
                                    site_mf = site_mf+mf_list[x]+' '
                                site_mf = site_mf.strip()
                            else:
                                site_mf = mf_list[0]
                        except:
                            site_mf = ''
                
                    #put together intrument type+instrument_name+method_ref
                    mm = site_mm+site_mn+site_mf
                
                #get contact
                if row[0] == 'Originator:':
                    try:
                        contact_list = row[1:]
                        if len(contact_list) > 1:
                            site_contact = ''
                            for x in range(len(contact_list)):
                                site_contact = site_contact+contact_list[x]+' '
                            site_contact = site_contact.strip()
                        else:
                            site_contact = contact_list[0]
                    except:
                        site_contact = ''
                    all_contact.append(site_contact)
                
                #get country
                site_country = EMEP_COUNTRIES(file.split('/')[-1][:2])
                all_country.append(site_country)
                
                if row[0] == 'starttime':
                    skip_n = count+1
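                    # the data table begins after this header row; skip_n feeds np.loadtxt below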
                    if species == 'ISOP':
                        spec_ind = row.index('C5H8')
                        try:
                            flag_ind = row.index('flag_C5H8')
                        except:
                            flag_ind = row.index('flag')
                    else:
                        spec_ind = row.index(species)
                        try:
                            flag_ind = row.index('flag_'+species)
                        except:
                            flag_ind = row.index('flag')
                    
                count+=1
            
        read = np.loadtxt(file,dtype="f8,f8,f8,f8",skiprows=skip_n,usecols=(0,1,spec_ind,flag_ind),unpack=True)
        read = np.array(read)
        times_since_start = read[0,:]
        endtimes_since_start = read[1,:]
        conc = read[2,:]
        conc = np.array(conc).astype('float64')
        flags = read[3,:]

        dates = []
        times = []
        enddates = []
        endtimes = []
        times_since_start = np.float64(times_since_start)   
        endtimes_since_start = np.float64(endtimes_since_start)  
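        # convert fractional days since 1 Jan into calendar start/end datetimes for each sample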
        for x in range(len(times_since_start)):
            days_since_start = math.trunc(times_since_start[x])
            enddays_since_start = math.trunc(endtimes_since_start[x])
            remainder = times_since_start[x] - days_since_start
            remainder_end = endtimes_since_start[x] - enddays_since_start
            unrounded_hour = remainder*24
            unrounded_hour_end = remainder_end*24
            hour = np.round(unrounded_hour)
            hour_end = np.round(unrounded_hour_end)
            time_delta = datetime.timedelta(days = days_since_start,hours = hour)
            time_delta_end = datetime.timedelta(days = enddays_since_start,hours = hour_end)
            calc_datetime = start_datetime + time_delta
            calc_datetime_end = start_datetime + time_delta_end
            calc_yyyymmdd = calc_datetime.strftime("%Y%m%d") 
            calc_hhmm = calc_datetime.strftime("%H%M")  
            end_calc_yyyymmdd = calc_datetime_end.strftime("%Y%m%d") 
            end_calc_hhmm = calc_datetime_end.strftime("%H%M")
            dates.append(calc_yyyymmdd)
            times.append(calc_hhmm)
            enddates.append(end_calc_yyyymmdd)
            endtimes.append(end_calc_hhmm)
            
        conc = np.float64(conc)
        flags = np.float64(flags)
        
        #add to n_obs_all
        n_all += len(conc)
        
        #IF bad_meta == True then set all file vals as nans
        if bad_meta == True:
            conc[:] = np.NaN
        meta_valid_list.append(bad_meta)
        
        #DO INLINE INVALID AND FLAG CONVERT to NaN
        test = conc < 0
        conc[test] = np.NaN
        
        test = flags != 0
        conc[test] = np.NaN
            
        #convert units (values < 0 have already been set to NaN above)
        try:
            if (unit.lower() != 'ppb') & (unit.lower() != 'ppbv'):
                if unit == 'ug/m3':
                    #conversion factor from ug/m3 assuming 293 K and 1013 hPa (EU legislation conditions)
                    #ppb = ug/m3 * R * T(K) / (MW * P(kPa)), with P(kPa) = 1013/10
                    conv_fact = 8.3144/mol_mass*(293.)/(1013./10.)
                    conc = conv_fact*conc
                elif unit == 'ug_N/m3':
                    conv_fact = 8.3144/14.00674*(293.)/(1013./10.)
                    conc = conv_fact*conc
                elif (unit == 'ppm') or (unit == 'ppmv'):
                    conc = conc*1e3
                    #print 'Converting Units from ppmv to ppbv'
                elif (unit == 'ppt') or (unit == 'pptv'):
                    conc = conc/1e3
                    #print 'Converting Units from pptv to ppbv'
                else:
                    print 'Unknown Unit'
                    raise ValueError('Unknown unit: %s'%(unit))
        except:
            pass
        
        #remove 9.999 from ISOP dataset
        if species == 'ISOP':
            test = conc == 9.999
            conc[test] = np.NaN
        
        #if file resolution is daily or monthly then replicate times after point, to fill hourly data array.
        count=0
        if file_res == '1h':
            n_dups = np.zeros(len(conc))
        elif file_res == '1d':
            n_dups = []
            #replicate each daily value across the hours it covers; flask measurements are left in as the first hour's value
            file_hours = len(dates)
            for i in range(file_hours):
                current_year = int(dates[count][:4])
                current_month = int(dates[count][4:6])
                current_day = int(dates[count][6:])
                current_hh = int(times[count][:2])
                current_mm = int(times[count][2:])
        
                next_year = int(enddates[i][:4])
                next_month = int(enddates[i][4:6])
                next_day = int(enddates[i][6:])
                next_hh = int(endtimes[i][:2])
                next_mm =  int(endtimes[i][2:])
                
                s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm)
                e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm)
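                # hourly stamps strictly between sample start and end; [1:-1] drops the endpoints already accounted for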
                day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1]
                day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1]

                dates = np.insert(dates,count+1,day_dates)
                times = np.insert(times,count+1,day_hours)
                conc = np.insert(conc,count+1,[conc[count]]*len(day_dates))

                #append to n duplicated array
                n_dups=np.append(n_dups,0)
                n_dups=np.append(n_dups,[1]*len(day_dates))

                count +=(len(day_dates)+1)
        
        elif file_res == '1mo':
            n_dups = []
            #replicate each monthly value across the hours it covers; flask measurements are left in as the first hour's value
            file_hours = len(dates)
            for i in range(file_hours):
                current_year = int(dates[count][:4])
                current_month = int(dates[count][4:6])
                current_day = int(dates[count][6:])
                current_hh = int(times[count][:2])
                current_mm = int(times[count][2:])
    
                next_year = int(enddates[i][:4])
                next_month = int(enddates[i][4:6])
                next_day = int(enddates[i][6:])
                next_hh = int(endtimes[i][:2])
                next_mm =  int(endtimes[i][2:])
    
                s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm)
                e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm)
        
                day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1]
                day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1]
                dates = np.insert(dates,count+1,day_dates)
                times = np.insert(times,count+1,day_hours)
                conc = np.insert(conc,count+1,[conc[count]]*len(day_dates))
                
                #append to n duplicated array
                n_dups=np.append(n_dups,0)
                n_dups=np.append(n_dups,[1]*len(day_dates))
                
                count += (len(day_dates)+1)
        
        data = [dates,times,conc,n_dups]
        
        #put measurement methods into a big list the length of the time series
        mm_big=np.append(mm_big,[mm]*len(dates))
      
        try:
            big_list = np.hstack((big_list,data))
        except:
            big_list = np.array(data)
                
    if (y == year_array[-1]):    

        #get dates and times
        date_con = big_list[0,:]
        time_con = big_list[1,:]
          
        #get vals
        vals = np.array(big_list[2,:]).astype('float64')
        
        #get n dup array
        n_dup_array = np.array(big_list[3,:]).astype(float).astype(int)

        #if all files have missing key meta then exit
        if all(meta_valid_list):
            inv_nometa += 1
            print 'Site Invalid. No Metadata for ref'
            if no2_type == 'MOLYBDENUM':
                n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod = 0,0,0,0,0,0,0,0,0,0,0,0,0
            exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
            n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
            unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
            meta = ['na','na','na','na','na','na','na','na','na','na','na','na']
            exit_r = 'nometa'
            return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)
        valid_hours_dup = np.sum(n_dup_array)
        n_after_nometa += (len(vals)-valid_hours_dup)

        #delete big list
        del big_list

        date_con = np.array(date_con).astype(int)
        time_con = np.array(time_con).astype(int)
        
        #remove data < 1970 and >= 2015
        test_inds = (date_con >= 19700101) & (date_con < 20150101)
        date_con = date_con[test_inds]
        time_con = time_con[test_inds]
        vals = vals[test_inds]
        mm_big = mm_big[test_inds]
        n_dup_array = n_dup_array[test_inds]
        
        #set st_big as 'continuous'
        st_big = ['continuous']*len(vals)
        
        #convert all Nans back to -99999
        test = np.isnan(vals)
        vals[test] = -99999
        
        #get obs valid
        test = vals >= 0
        valid_hours_dup = np.sum(n_dup_array[test])
        n_obs_valid = int(len(vals[test]) - valid_hours_dup)
        n_after_flagsandlod += n_obs_valid
        
        #create max possible species grid, measurement method and sampling type grids
        full_data = np.empty(n_hours)
        full_data_after_flagsandlod = np.empty(n_hours)
        big_n_dup_array = np.zeros(n_hours)
        full_data[:] = -99999
        full_data_after_flagsandlod[:] = -99999
        
        #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
        converted_time = modules.date_process(date_con,time_con,start_year)
        converted_time = np.round(converted_time,decimals=5)
        syn_grid_time = np.arange(0,n_days,1./24)
        syn_grid_time = np.round(syn_grid_time,decimals=5)
        raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
        vals = np.array(vals)
        full_data_after_flagsandlod[raw_indices] = vals
        raw_st = np.copy(st_big)
        raw_mm = np.copy(mm_big)
        
        # test and remove duplicate and overlap points
        converted_time,vals,mm_big,st_big,n_dup_array = modules.remove_duplicate_points(site_ref,converted_time,vals,mm_big,st_big,n_dup_array,output_res)
        test = vals >= 0
        valid_hours_dup = np.sum(n_dup_array[test])
        n_obs_valid = int(len(vals[test]) - valid_hours_dup)
        n_after_duplicate += n_obs_valid
        
        #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
        indices = np.searchsorted(syn_grid_time, converted_time, side='left')
        full_data[indices] = vals 
        big_n_dup_array[indices] = n_dup_array
    
        #get mode of metadata
        try:
            lat = np.float32(stats.mode(all_latitudes)[0][0]) 
        except:
            lat = 'na'
        try:
            lon = np.float32(stats.mode(all_longitudes)[0][0])  
        except:
            lon = 'na'
        try:
            alt = np.float32(stats.mode(all_altitudes)[0][0]) 
        except:
            alt = 'na'
        unit = stats.mode(all_unit)[0][0]
        #remove empty strings from extra meta before mode test
        try:
            site_name = stats.mode(filter(None, all_site_name))[0][0]
        except:
            site_name = 'na'
        try:
            country = stats.mode(filter(None, all_country))[0][0]
        except:
            country = 'na'
        try:
            contact = stats.mode(filter(None, all_contact))[0][0] 
        except:
            contact = 'na'
    
        #set data tz - all EMEP times are UTC
        data_tz = 0
        all_tz = [data_tz]
    
        key_meta = [lat,lon,alt]
        
        #convert file res to standard format
        if file_res == '1h':
            file_res = 'H'
        elif file_res == '1d':
            file_res = 'D'
        elif file_res == '1mo':
            file_res = 'M'
    
        #get sampling/instrument grids
        raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,unknown_mm_list,unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(site_ref,process_group,species,raw_st,raw_mm,full_data_after_flagsandlod,full_data,raw_indices,unknown_mm_list,unknown_mm_refs_list,no2_type)

        #do quality checks                                                                                                                                                                                                                                                                                                     
        data_valid,full_data,valid_hours_dup,p_st_grid,p_mm_grid,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod,exit_r = modules.primary_quality_control(site_ref,species,file_res,no2_type,grid_dates,full_data,big_n_dup_array,valid_hours_dup,raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,data_resolution,n_obs_valid,key_meta,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod)
        if data_valid == False:
            exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
            n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
            unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
            meta = [lat,lon,alt,'na','na','na','na','na','na','na','na','na']
            return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)

        #set metadata not available as na
        raw_class_name = 'na'
    
        #set processed unit
        p_unit = 'ppbv'
    
        #get local timezone
        try:
            local_tz_name = tz_root.tzNameAt(lat,lon,forceTZ=True)
            pytz_obj = pytz.timezone(local_tz_name)
            datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000,1,1))
            if datetime_offset < datetime.timedelta(0):
                local_tz = -(24-int(datetime_offset.seconds/60/60))
            else:
                local_tz = int(datetime_offset.seconds/60/60)
        except:
            local_tz = 'na'
            print 'TIMEZONE NOT KNOWN, SITE IS %s'%(site_ref)
            unknown_local_tz_list.append(site_ref)

        #pack meta
        meta = [lat,lon,alt,raw_class_name,file_res,unit,p_unit,data_tz,local_tz,site_name,country,contact]
    
        #if blank strings in meta then convert to 'na'
        for i in range(len(meta)):
            try:
                if meta[i].strip() == '':
                    meta[i] = 'na'
            except:
                pass
    
        print set(raw_st_grid)
        print set(raw_mm_grid)
        print set(p_st_grid)
        print set(p_mm_grid)
        print meta
    
        exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
        n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
        print 'exit counts = ', exit_c_list
        print 'n obs counts = ', n_c_list

        unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]

        return c,full_data,p_st_grid,p_mm_grid,data_valid,meta,exit_c_list,n_c_list,unknown_list,'na',big_n_dup_array
def site_iter_process(valid_refs,c):
#for ref_i in range(len(valid_refs)):
    data_valid = True

    site_ref = valid_refs[c]
    print 'Current Ref is = ', site_ref

    s_files = glob.glob('/work/home/db876/observations/surface/%s/CAPMON/ozon_smpls_%s*'%(species,site_ref))
    site_files = []
    for y in year_array:
        for f in s_files:
            if str(y) in f:
                site_files.append(f)
                           

    site_files = modules.natsorted(site_files)

    yymmdd = []
    hhmm = []
    vals = []

    #create max possible o3 grid
    full_data = np.empty(n_hours)
    full_data[:] = -99999

    for file_i in range(len(site_files)):

        count = 0
        meta_start = -99999
        start_read_1 = False
        start_read_2 = False

        with open(site_files[file_i], 'rb') as f:
            reader = csv.reader(f,delimiter=',')
            print site_files[file_i]
            for row in reader:
                #print count
                #break out of loop at bottom of file
                if (start_read_2 == True) & (row[0] == '*TABLE ENDS'):
                    break

                #get metadata
                try:
                    if (row[0] == '*TABLE NAME') & (row[1] == 'Site information'):
                        meta_start = count+2
                except:
                    pass
                if count == meta_start:
                    lat_i = row.index('Latitude: decimal degrees')
                    lon_i = row.index('Longitude: decimal degrees')
                    try:
                        alt_i = row.index('Ground elevation: above mean sea level')
                    except:
                        alt_i = row.index('Ground altitude')
                    class_i = row.index('Site land use')
                
                if count == (meta_start+6):
                    latitude = row[lat_i]
                    longitude = row[lon_i]
                    altitude = row[alt_i]
                    raw_class_name = row[class_i]
                      
                #get data
                if start_read_2 == True:
                    #read dates, times, and vals
                    date = row[8]
                    time = row[9]
                    yymmdd.append(date[:4]+date[5:7] + date[8:])
                    hhmm.append(time[:2]+time[3:])
                    quality_code = row[13]
                    if quality_code == 'V0':
                        vals = np.append(vals,np.float64(row[12]))
                    else:
                        vals = np.append(vals,-99999)
                    
                try:
                    if (row[0] == '*TABLE NAME') & (row[1] == 'OZONE_HOURLY'):
                        start_read_1 = True
                except:
                    pass
                   
                if (start_read_1 == True) & (row[0] == '*TABLE COLUMN UNITS'):
                    unit = row[12]
                
                if (start_read_1 == True) & (row[0] == '*TABLE BEGINS'):
                    start_read_2 = True
                count+=1

    #convert all invalids to -99999
    test_inv = vals < 0
    vals[test_inv] = -99999

    #put o3 vals into full grid
    date_con = np.array(yymmdd).astype(int)
    time_con = np.array(hhmm).astype(int)
    
    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con,time_con,start_year)
    converted_time = np.round(converted_time,decimals=5)
    syn_grid_time = np.arange(0,n_days,1./24)
    syn_grid_time = np.round(syn_grid_time,decimals=5)
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    vals = np.array(vals)
    #make sure no data is past end year
    index_test = indices < len(full_data)
    indices = indices[index_test]
    vals = vals[index_test]
    full_data[indices] = vals
    
    
    #get metadata
    lat = np.float64(latitude)
    lon = np.float64(longitude)
    alt = np.float64(altitude)
        
    #do data quality checks
    full_data,data_valid = modules.quality_check_nr(full_data,data_valid,data_resolution,np.float64(altitude),grid_dates,start_year,end_year)
    
    #set measurement method
    mm = 'ultraviolet photometry'
    
    #set site file resolution
    file_res = 'H'
    
    #set sampling as average
    st = 'average'
    
    anthrome_class_name = 'na'
    
    return c,full_data,data_valid,lat,lon,alt,raw_class_name,anthrome_class_name,mm,st,file_res
def site_iter_process(valid_refs,c):
    #read files a site at a time
    #for ref_i in range(len(valid_refs)):
    site_ref = valid_refs[c]

    all_latitudes = []
    all_longitudes = []
    all_altitudes = []
    all_mm = []

    print 'Current Ref is = ', site_ref
    #find if sites have full valid range from start year and finishing in end year
    s_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/%s*'%(species,site_ref))
    year_files = [file.replace("/work/home/db876/observations/surface/%s/EMEP/"%(species), "") for file in s_files]
    cut_year_files = [file[8:12] for file in year_files]
    site_files = []
    for y in year_array:
        for i in range(len(s_files)):
            if str(y) in cut_year_files[i]:
                site_files.append(s_files[i])
                  
    site_files = modules.natsorted(site_files)
    year_files = modules.natsorted(year_files)
  
    file_startdate = []
    file_height = []
    instr_names = []
    file_lasttime = []
    
    data_valid = True

    yyyymmdd = []
    hhmm = []
    vals = []
    flags = []

    #create max possible o3 grid
    full_data = np.empty(n_hours)
    full_data[:] = -99999

    if site_files == []:
        print 'No valid files for site\n'
        data_valid = False
        return c,full_data,data_valid,-999,-999,-999,'na','na','na','na','na'
    
    for y in year_array:
    
        print 'Processing Year %s'%y 
        got_year = False
        for file in site_files:
            last_file_split = file.split('/')[-1]
            if str(y) in last_file_split[8:12]:
                got_year = True
                break
        if got_year == False:
            #fill in data for missing year
            timedelta_diff = datetime.date(y+1, 1, 1) - datetime.date(y, 1, 1)
            ndays_missing = timedelta_diff.days
            print 'ndays missing = ', ndays_missing        
            continue
    
        if data_valid == True:
            data_start = 9999999
            count = 0
            start_read = False
            with open(file, 'rb') as f:
                read_count = 0
                reader = csv.reader(f,delimiter=' ')
                print file
                for row in reader:
                    #strip empty strings and stray commas left by the space delimiter
                    row = [a for a in row if a not in ('', ',')]
                                    
                    #get start date of file
                    if row[0] == 'Startdate:':
                        data = row[1]
                        s_yyyy = data[:4]
                        s_mm = data[4:6]
                        s_dd = data[6:8]
                        s_hh = data[8:10]
                        s_min = data[10:12]

                        start_datetime = datetime.datetime(int(s_yyyy),1,1,0,0)
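                        #NOTE (assumption): only the year of the file Startdate
                        #is used; time offsets in the data rows are taken as
                        #fractional days counted from 00:00 on 1 January.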
                    
                    #get unit
                    if row[0] == 'Unit:':
                        try:
                            unit_part1 = row[1]
                            unit_part2 = row[2]
                            unit = unit_part1+'_'+unit_part2
                        except:
                            unit = row[1]   
            
                    #get resolution
                    if row[0] == 'Resolution':
                        if row[1] == 'code:':
                            file_res = row[2]
                            print 'Resolution = %s'%file_res
                            if (output_res == 'H'):
                                if (file_res == '1d') or (file_res == '1mo'):
                                    print 'File resolution must be at least hourly. Skipping.'
                                    data_valid = False
                                    return c,full_data,data_valid,-999,-999,-999,'na','na','na','na','na'
                            elif (output_res == 'D'):
                                if (file_res == '1mo'):
                                    print 'File resolution must be at least daily. Skipping.'
                                    data_valid = False
                                    return c,full_data,data_valid,-999,-999,-999,'na','na','na','na','na'
                    #get latitude
                    if row[0] == 'Station':
                        if row[1] == 'latitude:':
                            latitude = row[2]
                            all_latitudes.append(latitude)
                
                    #get longitude
                    if row[0] == 'Station':
                        if row[1] == 'longitude:':
                            longitude = row[2]
                            all_longitudes.append(longitude)
                        
                    #get altitude
                    if row[0] == 'Station':
                        if row[1] == 'altitude:':
                            altitude = row[2][:-1]
                            all_altitudes.append(altitude)
                
                    #get period
                    if row[0] == 'Period':
                        period_code = row[2]
                    
                    #get stats method; only arithmetic-mean data is supported
                    if row[0] == 'Statistics:':
                        try:
                            st = row[1] + row[2]
                        except IndexError:
                            st = ''
                        if st != 'arithmeticmean':
                            print 'Not Arithmetic Mean!'
                            print row[1:]
                            raise ValueError('Statistics must be arithmetic mean')
                
                    #get instrument method
                    if row[0] == 'Instrument':
                        if row[1] == 'type:':
                            site_mm = ' '.join(row[2:])
                            all_mm.append(site_mm)
                    
                    #get data
                    if start_read == True:
                        #calc dates, times, and take o3 vals

                        time_since_start = np.float64(row[0])
                        days_since_start = math.trunc(time_since_start)
                        remainder = time_since_start - days_since_start
                        unrounded_hour = remainder*24
                        hour = np.round(unrounded_hour)
                        time_delta = datetime.timedelta(days = days_since_start,hours = hour)
                        calc_datetime = start_datetime + time_delta
                        calc_yyyymmdd = calc_datetime.strftime("%Y%m%d") 
                        calc_hhmm = calc_datetime.strftime("%H%M")        
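                        #e.g. a row time of 31.5 days since a 1970 start
                        #decodes as calc_yyyymmdd='19700201', calc_hhmm='1200'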
                            
                        line_val = np.float64(row[2])
                    
                        #convert units line by line (only for values >= 0)
                        if line_val >= 0:
                            if (unit.lower() != 'ppb') & (unit.lower() != 'ppbv'):
                                if unit == 'ug/m3':
                                    #calculate conversion factor from ug/m3 assuming 20 degC and 1 atm - default for O3 instruments
                                    #conv_fact = R/MW * T(K) / P(kPa), with P(kPa) = P(hPa)/10
                                    conv_fact = 8.3144/mol_mass*(273.15+20)/(1013.25/10)
                                    line_val = conv_fact*line_val
                                    #print 'Converting Units from ug/m3 20degC to ppbv'
                                elif unit == 'ug_N/m3':
                                    conv_fact = 8.3144/mol_mass*(273.15+20)/(1013.25/10)
                                    line_val = conv_fact*line_val
                                    #print 'Converting Units from ug/Nm3 20degC to ppbv' 
                                elif (unit == 'ppm') or (unit == 'ppmv'):
                                    line_val = line_val*1e3
                                    #print 'Converting Units from ppmv to ppbv'
                                elif (unit == 'ppt') or (unit == 'pptv'):
                                    line_val = line_val/1e3
                                    #print 'Converting Units from pptv to ppbv'
                                else:
                                    print 'Unknown Unit: %s'%unit
                                    data_valid = False
                                    raise ValueError('Unknown unit')
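                        #worked example (O3, mol_mass = 48.0):
                        #  conv_fact = 8.3144/48.0*293.15/101.325 ~= 0.5011
                        #so 100 ug/m3 of O3 at 20 degC and 1 atm ~= 50.1 ppbv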
                       
                        if file_res == '1h':
                            yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd)
                            hhmm=np.append(hhmm,calc_hhmm)
                            vals = np.append(vals,line_val)
                            flags = np.append(flags,np.float64(row[3]))
                    
                        elif file_res == '1d':
                            yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd)
                            hhmm=np.append(hhmm,'0000')
                            vals = np.append(vals,line_val)
                            flags = np.append(flags,np.float64(row[3]))
                        
                            for j in range(1,24):
                                time_delta = datetime.timedelta(days = days_since_start,hours = j)
                                calc_datetime = start_datetime + time_delta 
                                vals = np.append(vals,vals[-1])
                                flags = np.append(flags,flags[-1])
                                yyyymmdd = np.append(yyyymmdd,calc_datetime.strftime("%Y%m%d"))
                                hhmm = np.append(hhmm,calc_datetime.strftime("%H%M"))
                        
                        elif file_res == '1mo':
                            yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd)
                            hhmm=np.append(hhmm,'0000')
                            vals = np.append(vals,line_val)
                            flags = np.append(flags,np.float64(row[3]))
                        
                            month_days = monthrange(int(yyyymmdd[-1][:4]), int(yyyymmdd[-1][4:6]))[1]
                            for j in range(1,24*month_days):
                                time_delta = datetime.timedelta(days = days_since_start,hours = j)
                                calc_datetime = start_datetime + time_delta
                                vals = np.append(vals,vals[-1])
                                flags = np.append(flags,flags[-1])
                                yyyymmdd = np.append(yyyymmdd,calc_datetime.strftime("%Y%m%d"))
                                hhmm = np.append(hhmm,calc_datetime.strftime("%H%M"))
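                    #NOTE: daily and monthly means are forward-filled onto the
                    #hourly grid above, so one reported value occupies 24 (or
                    #~720) identical hourly slots.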
        
                    if row[0] == 'starttime':
                        start_read = True
                
                    count+=1
                
    if (y == year_array[-1]):    
            
        #convert all invalids by flags to -99999
        test_inv = flags != 0
        if len(test_inv) != 0:
            vals[test_inv] = -99999
        
        #any values less than zero are -99999
        test_inv = vals < 0
        if len(test_inv) != 0:
            vals[test_inv] = -99999
        
        #do additional invalid test, as flags not always correct
        #test_inv_2 = vals > 300
        #vals[test_inv_2] = -99999

        #put o3 vals into full grid
        date_con = np.array(yyyymmdd).astype(int)
        time_con = np.array(hhmm).astype(int)
        
        #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
        converted_time = date_process(date_con,time_con,start_year)
        converted_time = np.round(converted_time,decimals=5)
        syn_grid_time = np.arange(0,n_days,1./24)
        syn_grid_time = np.round(syn_grid_time,decimals=5)
        indices = np.searchsorted(syn_grid_time, converted_time, side='left')
        vals = np.array(vals)
        #make sure no data is past end year
        index_test = indices < len(full_data)
        indices = indices[index_test]
        vals = vals[index_test]
        full_data[indices] = vals
    
    #get mode of metadata
    lat = np.float64(stats.mode(all_latitudes)[0][0]) 
    lon = np.float64(stats.mode(all_longitudes)[0][0])  
    alt = np.float64(stats.mode(all_altitudes)[0][0]) 
    mm = stats.mode(all_mm)[0][0]
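    #scipy.stats.mode returns a (mode_array, count_array) pair, so [0][0]
    #unpacks the single most common value, e.g.
    #  >>> stats.mode(np.array([10., 10., 30.]))[0][0]
    #  10.0
    #taking the mode guards against one-off inconsistencies in a site's
    #metadata across years.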
    
    #check site is not urban using anthrome map from 2000
    anthfile = '/work/home/db876/plotting_tools/core_tools/anthro2_a2000.nc'
    anthload = Dataset(anthfile)
    class_valid,anthrome_class_name = modules.anthrome_classify(anthload,[lat],[lon])
    if class_valid == 'invalid':
        data_valid = False
        print 'Site Invalid, site classed as urban by anthrome map.'
    
    #get measurement method
    if (mm == 'uv_abs') or (mm == 'chemiluminesc') or (mm == 'uv_fluoresc'):
        if species == 'O3':
            mm = 'ultraviolet photometry'
        if (species == 'NO') or (species == 'NO2') or (species == 'CO'):
            mm = 'chemiluminescence'
        
    elif (mm == 'ndir') or (mm == 'infrared_absorption'):
        mm = 'non-dispersive infrared spectroscopy'
        
    elif (mm == 'GC-HgO'):
        mm = 'gas chromatography reduction gas detection'
    
    elif (mm == 'tracegas_monitor'):
        mm = 'cavity attenuated phase shift spectroscopy'
    
    elif (mm == 'filter_1pack') or (mm == 'filter_2pack') or (mm == 'filter_3pack'):
        if species == 'NO2':
            mm = 'griess saltzman colorimetric'
        elif species == 'CO':
            mm = 'ion chromatography'
        
    elif (mm == 'steel_canister'):
        mm = 'gas chromatography flame ionisation detection'
        
    elif (mm == 'online_gc'):
        mm = 'online gas chromatography'
    
    elif (mm == 'glass_sinter') or (mm == 'abs_solution') or (mm == 'filter_abs_solution') or (mm == 'abs_tube') or (mm == 'continuous_colorimetric'):
        mm = 'griess saltzman colorimetric'
        
    elif (mm == 'NaJ_solution'):
        mm = 'flame ionisation detection'
        
    elif (mm == 'doas'):
        mm = 'differential optical absorption spectroscopy'
    
    elif (mm == 'diffusion_tube'):
        mm = 'diffusive sampler'
    
    elif (mm == 'NA') or (mm == ''):
        if species == 'O3':
            mm = 'ultraviolet photometry'
        if species == 'CO':
            mm = 'non-dispersive infrared spectroscopy'
        if species == 'NO2':
            mm = 'chemiluminescence'
        if species == 'NO':
            mm = 'chemiluminescence'
        if species == 'ISOP':
            mm = 'gas chromatography flame ionisation detection'
        
    else:
        print 'Unknown measurement method: %s'%mm
        raise ValueError('Unknown measurement method')
    
    #do data quality checks        
    full_data,data_valid = modules.quality_check(full_data,data_valid,data_resolution,alt,grid_dates,start_year,end_year)

    #convert file res to standard format
    if file_res == '1h':
        file_res = 'H'
    elif file_res == '1d':
        file_res = 'D'
    elif file_res == '1mo':
        file_res = 'M'

    #no raw class so set as na
    raw_class_name = 'na'
    
    #set sampling as average
    st = 'average'

    return c,full_data,data_valid,lat,lon,alt,raw_class_name,anthrome_class_name,mm,st,file_res
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    ref = valid_refs[c]
    print 'ref = ', ref, c

    #get site instrument for species
    met_i = file_refs.index(ref)
    file_name = met_refs[met_i]
    site_name = met_sitenames[met_i]
    print site_name
    site_species = list(met_species[met_i])
    print site_species
    site_instruments = list(met_instruments[met_i])
    m_method = site_instruments[site_species.index(species)]

    site_resolutions = []
    data_valid = True

    s_files = insensitive_glob(
        '/work/home/db876/observations/surface/%s/EANET/*%s.csv' %
        (fname_species, file_name))
    site_files = []
    for y in year_array:
        for f in s_files:
            if str(y)[-2:] in f:
                site_files.append(f)

    site_files = modules.natsorted(site_files)

    years = []
    months = []
    days = []
    hours = []

    vals = []
    yyyymmdd = []
    hhmm = []

    n_dup_array = []

    last_year_index = len(site_files)
    for y in year_array:
        got_year = False
        for file in site_files:
            last_file_split = file.split('/')[-1]
            if str(y)[2:] in last_file_split:
                got_year = True
                break
        if got_year == False:
            timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(
                y, 1, 1)
            ndays_missing = timedelta_diff.days
            continue

        print file

        valid = True
        with open(file, 'rb') as f:
            reader = csv.reader(f, delimiter=',')
            counter = 0

            #get resolution
            for row in reader:
                if counter == 0:
                    all_units = row

                elif counter == 1:
                    file_res = 'H'

                    try:
                        hour_index = row.index('Hour')
                    except:
                        file_res = 'D'
                    try:
                        day_index = row.index('Day')
                    except:
                        file_res = 'M'
                    month_index = row.index('Month')
                    year_index = row.index('Year')

                    try:
                        spec_index = row.index(species.upper())
                        unit = all_units[spec_index]
                    except:
                        valid = False
                        break

                    #make sure each year's units are ppb
                    if unit != 'ppb':
                        print 'Units not ppb!'
                        raise ValueError('Units must be ppb, got %s'%unit)

                if counter == 2:
                    if file_res == 'H':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = row[hour_index]
                    elif file_res == 'D':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = 1
                    elif file_res == 'M':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = 1
                        hh = 1

                    start_datetime = datetime.datetime(int(yyyy), int(mm),
                                                       int(dd), int(hh))

                if counter == 3:
                    if file_res == 'H':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = row[hour_index]
                    elif file_res == 'D':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = 1
                    elif file_res == 'M':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = 1
                        hh = 1

                    present_datetime = datetime.datetime(
                        int(yyyy), int(mm), int(dd), int(hh))

                    time_delt = present_datetime - start_datetime
                    hour_delt = datetime.timedelta(hours=1)
                    day_delt = datetime.timedelta(hours=24)
                    week_delt = datetime.timedelta(hours=24 * 7)
                    month_delt = datetime.timedelta(hours=24 * 28)

                    print time_delt

                    if (time_delt < day_delt):
                        print 'Hourly Data'
                        file_res = 'H'
                        site_resolutions.append(file_res)

                    elif (time_delt > hour_delt) & (time_delt < week_delt):
                        print 'Daily Data'
                        file_res = 'D'
                        site_resolutions.append(file_res)

                    elif (time_delt > week_delt):
                        print 'Monthly Data'
                        file_res = 'M'
                        site_resolutions.append(file_res)

                counter += 1
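        #the resolution check above infers file frequency from the gap
        #between the first two data rows: e.g. rows at 2010-01-01 00:00 and
        #2010-01-01 01:00 give time_delt = 1:00:00 (< 1 day -> 'H'), while a
        #24 h gap falls between 1 h and 1 week -> 'D'.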

        #READ IN DATA
        if valid == True:
            #limit to sites with hourly data files, if required
            if output_res == 'H':
                if file_res != 'H':
                    print 'Not processing as only want hourly files'
                    continue
            if output_res == 'HD':
                if file_res == 'M':
                    print 'Not processing as only want hourly and daily files'
                    continue
            with open(file, 'rb') as f:
                reader = csv.reader(f, delimiter=',')
                counter = 0
                val_count = 0
                for row in reader:

                    if counter >= 2:
                        yyyy = row[year_index]
                        mm = row[month_index]

                        #add to n_obs_all
                        n_all += 1
                        n_after_nometa += 1

                        if file_res == 'H':
                            try:
                                vals = np.append(vals,
                                                 np.float64(row[spec_index]))
                            except:
                                vals = np.append(vals, -99999)

                            current_datetime = present_datetime + relativedelta(
                                hours=val_count)
                            yyyymmdd.append(
                                current_datetime.strftime("%Y%m%d"))
                            hhmm.append(current_datetime.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)

                        elif file_res == 'D':
                            try:
                                vals = np.append(
                                    vals, [np.float64(row[spec_index])] * 24)
                            except:
                                vals = np.append(vals, [-99999] * 24)

                            current_datetime = present_datetime + relativedelta(
                                days=val_count)
                            next_datetime = present_datetime + relativedelta(
                                days=val_count + 1)
                            all_datetimes = pd.date_range(current_datetime,
                                                          next_datetime,
                                                          freq='H')[:-1]
                            for d in all_datetimes:
                                yyyymmdd.append(d.strftime("%Y%m%d"))
                                hhmm.append(d.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)
                            n_dup_array = np.append(n_dup_array, [1] * 23)

                        elif file_res == 'M':
                            month_days = monthrange(int(yyyy), int(mm))[1]
                            try:
                                vals = np.append(
                                    vals, [np.float64(row[spec_index])] *
                                    (month_days * 24))
                            except:
                                vals = np.append(vals,
                                                 [-99999] * (month_days * 24))

                            current_datetime = present_datetime + relativedelta(
                                months=int(mm) - 1)
                            next_datetime = present_datetime + relativedelta(
                                months=int(mm))
                            all_datetimes = pd.date_range(current_datetime,
                                                          next_datetime,
                                                          freq='H')[:-1]
                            for d in all_datetimes:
                                yyyymmdd.append(d.strftime("%Y%m%d"))
                                hhmm.append(d.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)
                            n_dup_array = np.append(n_dup_array, [1] *
                                                    ((month_days * 24) - 1))
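                        #n_dup_array marks forward-filled hours with 1 and
                        #real observations with 0, so filled hours can be
                        #subtracted from the valid-observation counts later.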

                        val_count += 1
                    counter += 1

        else:
            print 'Species is not in file header. Skipping Year'
            timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(
                y, 1, 1)
            ndays_missing = timedelta_diff.days
            print 'ndays missing = ', ndays_missing

    #if no data remains (e.g. files were not at the required resolution), exit
    if len(vals) == 0:
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate = 0, 0, 0, 0, 0
        inv_anyvaliddata, n_obs_after_anyvaliddata, inv_nokeymeta, n_obs_after_nokeymeta = 0, 0, 0, 0
        inv_resolution, n_obs_after_resolution, inv_badmeasurementmethod, n_obs_after_badmeasurementmethod = 0, 0, 0, 0
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
            'na'
        ]
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, \
            n_c_list, unknown_list, 'nothourly', np.zeros(0)

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #vals is already float64 here (blank entries became -99999 in the
    #np.float64 try/excepts above), so only numeric invalids need masking
    #convert number invalids to -99999
    test_inv = vals < 0
    vals[test_inv] = -99999

    #if all site resolutions are the same, take the first file_res
    all_same = all(x == site_resolutions[0] for x in site_resolutions)
    if all_same == True:
        file_res = site_resolutions[0]
    else:
        #otherwise take lowest frequency res as file_res
        if 'M' in site_resolutions:
            file_res = 'M'
        elif 'D' in site_resolutions:
            file_res = 'D'
        else:
            file_res = 'H'

    #get meta
    i_ref = file_refs.index(ref)
    site_ref = ref
    data_tz = np.float32(met_tz[i_ref])
    all_tz = [data_tz]
    lat = np.float32(met_lats[i_ref])
    lon = np.float32(met_lons[i_ref])
    alt = np.float32(met_alts[i_ref])
    raw_class_name = met_class[i_ref]
    country = met_country[i_ref]
    unit = str(unit)
    contact = 'Ayako Aoyagi, Asia Center for Air Pollution Research, [email protected]'

    #adjust dates and times if tz is not equal to 0
    tz = int(data_tz)
    if tz != 0:
        for i in range(len(yyyymmdd)):
            #create datetime
            dt = datetime.datetime(int(yyyymmdd[i][:4]), int(yyyymmdd[i][4:6]),
                                   int(yyyymmdd[i][6:]), int(hhmm[i][:2]),
                                   int(hhmm[i][2:]))
            if tz > 0:
                dt = dt - datetime.timedelta(hours=int(tz))
            elif tz < 0:
                dt = dt + datetime.timedelta(hours=np.abs(int(tz)))
            yyyymmdd[i] = dt.strftime("%Y%m%d")
            hhmm[i] = dt.strftime("%H%M")

    #put vals into full grid
    date_con = np.array(yyyymmdd).astype(int)
    time_con = np.array(hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    vals = vals[test_inds]
    n_dup_array = n_dup_array[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(vals)
    mm_big = [m_method] * len(vals)

    #get obs valid
    test = vals >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(vals[test]) - valid_hours_dup)
    n_after_flagsandlod += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    vals = np.array(vals)
    full_data_after_flagsandlod[raw_indices] = vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, vals, mm_big, st_big, n_dup_array = modules.remove_duplicate_points(
        site_ref, converted_time, vals, mm_big, st_big, n_dup_array,
        output_res)
    test = vals >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(vals[test]) - valid_hours_dup)
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = vals
    big_n_dup_array[indices] = n_dup_array
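    #modules.remove_duplicate_points (assumed) has dropped repeated and
    #overlapping timestamps, so each hourly slot in full_data is written
    #at most once here.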

    key_meta = [lat, lon, alt]

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, valid_hours_dup, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, \
            n_c_list, unknown_list, exit_r, np.zeros(1)

    #recast tz as float32 after checks
    data_tz = np.float32(data_tz)

    #set processed unit
    p_unit = 'ppbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)
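    #timedelta.seconds is always non-negative, so negative offsets need the
    #unwrapping above: e.g. a UTC-5 zone gives utcoffset = timedelta(-1, 68400),
    #and 68400/3600 = 19, so local_tz = -(24 - 19) = -5.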

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array