Example #1
    def to_dwca(self, file_path):
        if self.df.empty:
            return

        def concatenateIssue(row):
            if row._error:
                try:
                    is_nan = math.isnan(row.issue)
                except TypeError:
                    is_nan = False
                if row.issue and not is_nan:
                    row.issue += f';{row._error}'
                else:
                    row.issue = row._error

            return row.issue

        df = self.df.copy()
        df['dynamicProperties'] = np.nan

        df['dynamicProperties'] = df.apply(
            lambda row: {
                'vessel': self.vessel,
                'distance': row['_distance'],
                'voyagerInferrences': row['_inferred_on']
            }, axis=1)

        df['issue'] = df.apply(concatenateIssue, axis=1)
        df.drop(self.additional_columns, axis=1, inplace=True)
        # These columns have been merged into dynamicProperties
        df.drop(['vessel', 'expedition', 'datetime'], axis=1, inplace=True)
        df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
        df.to_csv(file_path, index=False)
        logger.info('Saved DWC-A %s', file_path)
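Note that `to_csv` writes the `dynamicProperties` dicts using their Python repr. If the exported archive needs that column as a JSON string (dynamicProperties is commonly stored as JSON in Darwin Core), a minimal sketch - assuming only the `df` built above - could serialise it just before the export:

    import json

    # Hypothetical extra step: serialise the dict column to JSON strings
    df['dynamicProperties'] = df['dynamicProperties'].apply(json.dumps)
    df.to_csv(file_path, index=False)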
Example #2
    def _parse_imma(self, vessel_name, years):

        cache_key = 'icoads_{0}_{1}'.format(
            vessel_name,
            '-'.join(years)
        ).lower()

        cache_path = CACHE_DIR / f'{cache_key}.csv'

        try:
            logger.info(f'Loading {cache_key} from cache')
            df = pd.read_csv(
                cache_path
            )
        except FileNotFoundError:
            logger.info(f'Cached {cache_key} not found - parsing data')
        else:
            return df

        re_search = re.compile(vessel_name, re.IGNORECASE)

        data = []
        for year in years:
            for record in self._read_imma(year):
                supd = record.get('SUPD')
                if not supd:
                    continue

                try:
                    rid = record['ID'].strip()
                except AttributeError:
                    # No record ID??
                    continue

                m = re_search.search(supd)

                if m:
                    data.append({
                        'ship_id': rid,
                        'year': record['YR'],
                        'month': record['MO'],
                        'day': record['DY'],
                        'lat': record['LAT'],
                        'lon': record['LON']
                    })

        df = pd.DataFrame(data)

        df['datetime'] = pd.to_datetime(df[['day', 'year', 'month']])
        df = df.sort_values(by=['datetime'])

        df.to_csv(cache_path)
        return df
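One thing to watch on the cache-hit path above: `pd.read_csv` will return the cached `datetime` column as plain strings, plus the unnamed index column written by `to_csv`. If downstream code relies on real timestamps, a hedged adjustment to the read (a suggestion, not the project's code) is:

    from pathlib import Path
    import pandas as pd

    cache_path = Path('/tmp/icoads_example.csv')  # illustrative cache file

    # Restore the datetime dtype and skip the index column written by to_csv()
    df = pd.read_csv(cache_path, index_col=0, parse_dates=['datetime'])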
Example #3
    def _get_occurences_by_geotemporal_proximity(self):

        # Records must have a date, so filter on the voyage date range
        df = self.gbif[
            (self.gbif.datetime >= self.route.date_from) &
            (self.gbif.datetime <= self.route.date_to)
        ]

        df = self._add_distance_to_route(df)
        df = df[df['_distance'] < self.MAX_KM_TO_ROUTE]
        df['_inferred_on'] = 'route_proximity'

        logger.info('%s occurrences found within %skm of route',
                    df.shape[0], self.MAX_KM_TO_ROUTE)

        self._frames.append(df)
Example #4
    def _get_occurences(self):
        self._get_occurences_by_vessel()
        if self.expedition:
            self._get_occurences_by_expedition()
        if self.collectors:
            self._get_occurences_by_collector()
        if self.inferred_collectors:
            self._get_occurences_by_inferred_collector()
        self._get_occurences_by_geotemporal_proximity()

        if self._frames:
            df = pd.concat(self._frames)
            df = df.drop_duplicates(subset='gbifID', keep="last")
            logger.info('TOTAL: %s Occurrences with complete data',
                        df.shape[0])
            return df
        else:
            logger.error('No occurrences found for %s - %s',
                         self.vessel, self.route.year_from)

            return pd.DataFrame()
Example #5
File: gbif.py  Project: benscott/voyager
    def get_cached(self, func, **kwargs):
        cache_key = 'gbif'

        if kwargs:
            kwargs_key = '-'.join(map(str, kwargs.values()))
            cache_key = f'{cache_key}-{kwargs_key}'

        cache_path = CACHE_DIR / f'{cache_key}.csv'

        try:
            logger.info(f'Loading {cache_key} from cache')
            df = pd.read_csv(cache_path)
        except FileNotFoundError:
            logger.info(f'Cached {cache_key} not found')

            # If we haven't yet loaded the main GBIF file, load it now,
            # but only if kwargs are specified to prevent an infinite loop
            if self.dwca is None and kwargs:
                logger.info('Loading DWCA')
                self.dwca = self.get_cached(self.parse_dwca)

            df = func(**kwargs)
            df.to_csv(cache_path)

        return df
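The pattern here - derive a cache key, try the CSV cache, otherwise compute and persist - is easy to lift out on its own. A minimal, self-contained sketch (the cache directory and builder function are illustrative, not part of the project):

    from pathlib import Path
    import pandas as pd

    CACHE_DIR = Path('/tmp/voyager-cache')  # illustrative location
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    def cached_frame(cache_key, build_func):
        # Return the cached CSV if present, otherwise build it and cache it
        cache_path = CACHE_DIR / f'{cache_key}.csv'
        try:
            return pd.read_csv(cache_path)
        except FileNotFoundError:
            df = build_func()
            df.to_csv(cache_path, index=False)
            return df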
Example #6
File: route.py  Project: benscott/voyager
    def get_location_by_date(self, date):

        location = self.interpolated.loc[self.interpolated.index == date,
                                         ['lat', 'lon']]

        try:
            location = location.iloc[0]
        except IndexError:
            if date > self.date_to:
                logger.debug(
                    f'Occurrence dated {date} is after voyage end date {self.date_to}'
                )
            elif date < self.date_from:
                logger.debug(
                    f'Occurrence dated {date} is before voyage start date {self.date_from}'
                )
            else:
                # Downgraded from error to info, as we no longer expect a
                # location for every date between journey start and end
                logger.info(f'Could not find location for date {date}')
        else:
            return location.tolist()
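The lookup above relies on `self.interpolated` having one row per date in its index. Assuming the interpolation step produces a daily DatetimeIndex (an assumption - that code isn't shown here), it can be built from sparse fixes with plain pandas:

    import pandas as pd

    # Hypothetical sparse positions keyed by date
    positions = pd.DataFrame(
        {'lat': [50.4, 28.5], 'lon': [-4.1, -16.3]},
        index=pd.to_datetime(['1831-12-27', '1832-01-06'])
    )

    # Resample to one row per day and linearly interpolate the gaps
    interpolated = positions.resample('D').mean().interpolate(method='linear')

    # A date inside the voyage now resolves to an interpolated position
    print(interpolated.loc['1832-01-01', ['lat', 'lon']].tolist())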
Example #7
    def _get_occurences_by_vessel(self):

        # Match data by expedition and vessel data
        for field in ['vessel', 'expedition']:
            if self.gbif[field].any():

                if '+' in self.vessel:
                    vessels = self.vessel.split('+')
                    df = self.gbif[self.gbif[field].str.contains(
                        '|'.join(vessels), case=False, na=False, regex=True)]
                else:
                    df = self.gbif[self.gbif[field].str.contains(
                        self.vessel, case=False, na=False)]

                df['_inferred_on'] = field

                logger.info('Found %s occurrences for %s.', df.shape[0], field)

                self._process_occurrences_with_inferences(df)

                # As we have matched on vessel / expedition, we can be sure
                # these collectors are correct, so update the property
                self._update_inferred_collectors(df)
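One caveat with the alternation built from `'|'.join(vessels)`: vessel names containing regex metacharacters would be interpreted as patterns. If that matters, escaping each name first is a small, hedged adjustment (the data here is illustrative):

    import re
    import pandas as pd

    gbif = pd.DataFrame({'vessel': ['HMS Beagle', 'H.M.S. Challenger', None]})
    vessels = ['Beagle', 'H.M.S. Challenger']

    # Escape regex metacharacters (the dots) before building the alternation
    pattern = '|'.join(re.escape(v) for v in vessels)
    matches = gbif[gbif['vessel'].str.contains(
        pattern, case=False, na=False, regex=True)]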
Example #8
File: route.py  Project: benscott/voyager
    def interpolate(self):
        df = self.df.copy()

        df['date_diff'] = (df['datetime'] - df['datetime'].shift(1)).dt.days
        df.reset_index(drop=True, inplace=True)

        interpolation_break_points = []
        for index, row in df.iterrows():

            if row['date_diff'] > self.INTERPOLATION_MAX_DAYS:
                try:
                    previous_point = (previous_row.lat, previous_row.lon)
                except NameError:
                    # previous_row is not defined on the first iteration
                    pass
                else:
                    point = (row.lat, row.lon)
                    distance = geodesic(previous_point, point).kilometers
                    if distance > self.INTERPOLATION_MAX_DISTANCE:
                        interpolation_break_points.append(index)

            previous_row = row

        if interpolation_break_points:
            logger.info('Splitting voyage into %s stages.',
                        len(interpolation_break_points))

            frames = self._split_df_into_frames(df, interpolation_break_points)
            frames = [self._interpolate(f) for f in frames]
            df = pd.concat(frames)

        else:
            # Perform interpolation on the whole data frame
            df = self._interpolate(df)

        return df
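`_split_df_into_frames` isn't included in this snippet. A plausible sketch - written as a standalone function, its body an assumption - slices the frame at each break index so every slice becomes one stage that is interpolated separately:

    def _split_df_into_frames(df, break_points):
        # Slice at each break index; each slice is one voyage stage
        frames = []
        start = 0
        for break_point in break_points:
            frames.append(df.iloc[start:break_point])
            start = break_point
        frames.append(df.iloc[start:])
        return frames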
Example #9
    def _add_distance_to_route(self, df):
        # Matching on date can include records that are too distant from the
        # route - remove these. Records must also have lat/lon.
        df = df[
            df['decimalLatitude'].notnull()
            & df['decimalLongitude'].notnull()
        ]

        mask = df['_distance'].isnull()

        logger.info('Calculating distance for %s occurrences.',
                    mask.sum())

        df['_distance'] = df.apply(
            lambda row: row['_distance'] if row['_distance'] >= 0 else self._calc_geodesic_distance(
                row['datetime'],
                row['decimalLatitude'],
                row['decimalLongitude']
            ),
            axis=1
        )

        return df
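`_calc_geodesic_distance` is not part of this snippet. Given that the route exposes `get_location_by_date` (Example #6) and the project already uses geopy's `geodesic`, a hedged sketch of the method - its body here is an assumption - is to look up the vessel's position for the record's date and measure the great-circle distance:

    from geopy.distance import geodesic

    def _calc_geodesic_distance(self, date, lat, lon):
        # Where was the vessel on this date? (returns [lat, lon] or None)
        route_location = self.route.get_location_by_date(date)
        if route_location is None:
            return None
        # geodesic() returns a Distance object; .kilometers gives the number
        return geodesic(tuple(route_location), (lat, lon)).kilometers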
Example #10
    def _process_occurrences_with_inferences(self, df):
        if not df.empty:

            has_date_mask = df.datetime.notnull()
            has_location_mask = (df['decimalLatitude'].notnull()
                                 & df['decimalLongitude'].notnull())

            # If they have both lat/lon and date, no further processing required
            with_date_location = df[has_date_mask & has_location_mask]
            logger.info('%s occurrences with both date and location.',
                        with_date_location.shape[0])

            self._frames.append(with_date_location)

            # If they have a date & no lat/lng
            date_not_location = df[has_date_mask &
                                   np.logical_not(has_location_mask)].copy()

            if not date_not_location.empty:

                date_not_location['_error'] = 'COORDINATES_INFERRED'
                # We're calculating the lat/lon from the date, so distance will always be 0
                date_not_location['_distance'] = 0

                date_not_location[['decimalLatitude', 'decimalLongitude']] = date_not_location['datetime'].apply(
                    self._get_location_by_date)

                # Make sure we don't have any NaNs
                date_not_location = date_not_location[
                    date_not_location['decimalLatitude'].notna()
                    & date_not_location['decimalLongitude'].notna()
                ]

                logger.info('%s occurrences with date and inferred location.',
                            date_not_location.shape[0])

                self._frames.append(date_not_location)

            location_not_date = df[has_location_mask &
                                   np.logical_not(has_date_mask)].copy()

            if not location_not_date.empty:

                location_not_date['_error'] = 'RECORDED_DATE_INFERRED'

                location_not_date['datetime'] = location_not_date.apply(
                    lambda row: self._get_date_by_location(
                        row['decimalLatitude'], row['decimalLongitude']), axis=1)

                logger.info('%s occurrences with location and inferred date.',
                            location_not_date.shape[0])

                self._frames.append(location_not_date)
Example #11
File: cli.py  Project: benscott/voyager
def app(limit, vessel_name):

    imma_files = _cli_imma_files(vessel_name)

    counter = 0

    js_voyages_file = APP_DATA_DIR / 'voyages.js'
    js_metadata_file = APP_DATA_DIR / 'metadata.js'
    js_occurrences_file = APP_DATA_DIR / 'occurrences.js'

    map_metadata = {'timestamp': {'min': 0, 'max': 0}}

    new_line = False

    logger.info(f'Exporting to {js_voyages_file}')

    def _update_map_metadata(coordinates):
        min_timestamp = coordinates.timestamp.min()
        max_timestamp = coordinates.timestamp.max()

        if (min_timestamp < map_metadata['timestamp']['min']
                or not map_metadata['timestamp']['min']):
            map_metadata['timestamp']['min'] = min_timestamp

        if (max_timestamp > map_metadata['timestamp']['max']
                or not map_metadata['timestamp']['max']):
            map_metadata['timestamp']['max'] = max_timestamp

    occurrence_count = 0

    with js_voyages_file.open('w') as f:
        f.write('export default [\n')

        occurence_records = {}

        for vessel, route, dwca in _cli_get_routes(vessel_name):

            logger.info(
                f'Exporting {vessel} {route.year_from} - {route.year_to}')

            if new_line:
                f.write(',\n')

            coordinates = route.get_coordinates()

            df = dwca.copy()

            df['datetime'] = pd.to_datetime(df['eventDate'],
                                            infer_datetime_format=True)

            df['timestamp'] = df.datetime.astype('int64') // 10**9

            df.rename(columns={
                'gbifID': 'id',
                'scientificName': 'name',
                'decimalLatitude': 'lat',
                'decimalLongitude': 'lon',
            },
                      inplace=True)

            # Ensure the occurrences occur only within the published time frame
            df = df[(df['datetime'] > route.date_from)
                    & (df['datetime'] <= route.date_to)]

            df = df[['timestamp', 'id', 'name', 'lat', 'lon']]

            df = df.sort_values(by=['timestamp'])
            df['timestamp'] = df['timestamp'].astype(str)

            occurrence_count += df.shape[0]
            occurence_records[vessel] = df.values.tolist()

            _update_map_metadata(coordinates)

            coordinates['timestamp'] = coordinates['timestamp'].astype(str)

            voyage = {
                'coordinates': coordinates.values.tolist(),
                'metadata': {
                    'vessel': vessel,
                    'year_from': route.year_from,
                    'year_to': route.year_to,
                    'count': df.shape[0]
                }
            }

            f.write(json.dumps(voyage))
            new_line = True

            counter += 1

            if limit and counter >= limit:
                break

        f.write('\n]')

    # Give the map tiles time to load before rendering any lines
    dt = datetime.fromtimestamp(map_metadata['timestamp']['min'])
    timestamp_min = datetime.timestamp(dt + relativedelta(months=-24))

    # Output the metadata file
    with js_metadata_file.open('w') as f:
        f.write('export default {\n')
        f.write(f"\tminTimestamp: {timestamp_min},\n")
        f.write(f"\tmaxTimestamp: {map_metadata['timestamp']['max']}")
        f.write('\n}')

    with js_occurrences_file.open('w') as f:
        f.write('export default \n')
        f.write(json.dumps(occurence_records))
        f.write('\n')

    click.secho(f'Occurrence count: {occurrence_count}', fg='green')
Example #12
    def _update_inferred_collectors(self, df):
        for name, count in df['recordedBy'].value_counts().to_dict().items():
            self.inferred_collectors.setdefault(name, 0)
            self.inferred_collectors[name] += count

    def _get_occurences_by_collector(self):

        df = self.gbif[self.gbif['recordedBy'].str.contains(
            '|'.join(self.collectors), case=False, na=False)]

        df['_inferred_on'] = 'collector'

        logger.info('Found %s occurrences by collector name.', df.shape[0])

        self._process_occurrences_with_inferences(df)

    def _get_occurences_by_inferred_collector(self):

        name_blacklist = ['Anonymous', 'Unknown',
                          'Unnamed', 'Unidentified', 'Anon']

        # Number of times we should have seen a collector name for it to be included
        occurrence_threshold = 50

        collectors = [name for name, count in self.inferred_collectors.items()
                      if count > occurrence_threshold]

        collectors = [extract_surname(c) for c in collectors]