def scrape(self, timeout=30):
    """Scrape the NDBC station page for ``self.stationID`` and store it.

    Downloads the station page, keeps the parsed HTML on ``self.tree`` and,
    when ``self.isValidStation()`` is truthy, reads every labelled value
    through ``self.grabFromTree``, builds a Buoy and a Reading from the
    results and upserts both into ``self.db`` keyed on ``station_id``.

    Args:
        timeout: Seconds handed to ``requests.get`` so that a stalled
            server cannot block the scrape forever. Defaults to 30.

    Returns:
        None. Nothing is written when ``self.isValidStation()`` is falsy.
    """
    page = requests.get(
        'https://www.ndbc.noaa.gov/station_page.php?station='
        + self.stationID,
        timeout=timeout)
    self.tree = html.fromstring(page.content)
    if not self.isValidStation():
        return

    # (reading key, name of the parser method on self, page label), listed
    # in the order the values are scraped.
    # NOTE(review): two differently named direction parsers are used below;
    # they are kept exactly as found -- confirm both exist on the class.
    fields = (
        # Main box variables
        ('wind_direction', 'grabDirectionTupleFromString',
         '"Wind Direction (WDIR):"'),
        ('wind_speed', 'grabNumberFromString', '"Wind Speed (WSPD):"'),
        ('wind_gust', 'grabNumberFromString', '"Wind Gust (GST):"'),
        # this also seems to track 'Significant Wave Height'
        ('wave_height', 'grabNumberFromString', '"Wave Height (WVHT):"'),
        ('dominant_period', 'grabNumberFromString',
         '"Dominant Wave Period (DPD):"'),
        ('average_period', 'grabNumberFromString',
         '"Average Period (APD):"'),
        ('wave_direction', 'grabDirectionTupleFromString',
         '"Mean Wave Direction (MWD):"'),
        ('air_temperature', 'grabNumberFromString',
         '"Air Temperature (ATMP):"'),
        # Secondary box variables
        ('significant_wave_height', 'grabNumberFromString',
         '"Significant Wave Height (WVHT):"'),
        ('swell_height', 'grabNumberFromString', '"Swell Height (SwH):"'),
        ('swell_period', 'grabNumberFromString', '"Swell Period (SwP):"'),
        ('swell_direction', 'constructDirectionTupleFromString',
         '"Swell Direction (SwD):"'),
        ('wind_wave_height', 'grabNumberFromString',
         '"Wind Wave Height (WWH):"'),
        ('wind_wave_period', 'grabNumberFromString',
         '"Wind Wave Period (WWP):"'),
        ('wind_wave_direction', 'constructDirectionTupleFromString',
         '"Wind Wave Direction (WWD):"'),
        ('average_wave_period', 'grabNumberFromString',
         '"Average Wave Period (APD):"'),
    )

    reading = {'station_id': self.stationID}
    for key, parserName, label in fields:
        # Resolve the parser per field so each lookup happens at the same
        # point it did when the assignments were written out one by one.
        parse = getattr(self, parserName)
        reading[key] = parse(self.grabFromTree(label))

    # Grab the time
    reading['first_time'] = self.grabLocalTime(True)
    reading['second_time'] = self.grabLocalTime(False)

    # Grab the buoy name
    buoyName = self.grabBuoyName()
    buoy = {'station_id': self.stationID, 'name': buoyName}
    reading['buoy_name'] = buoyName

    # Remove nulls so only the values that were obtained reach Reading(...)
    reading = {k: v for k, v in reading.items() if v is not None}

    # Instantiate model objects
    buoyObject = Buoy(**buoy)
    readingObject = Reading(**reading)

    # Upsert both documents, matching on this station's id.
    self.db.readings.update({'station_id': self.stationID},
                            readingObject.mongoDB(),
                            upsert=True)
    self.db.buoys.update({'station_id': self.stationID},
                         buoyObject.mongoDB(),
                         upsert=True)
def update(self, stationID):
    """Refresh the stored buoy and reading for ``stationID`` from NDBC.

    Looks the station up in ``self.db.potentialBuoys``, downloads the head
    of its realtime ``.txt`` (first box) and ``.spec`` (second box) files,
    picks one data row from each, converts the selected values through the
    ``*FromReading`` helpers on ``self`` and upserts a Reading and a Buoy
    document keyed on ``station_id``.

    Args:
        stationID: Station identifier string; it is concatenated into the
            download URLs and used as the ``station_id`` filter.

    Returns:
        None. Returns early, without writing, when the station is not in
        ``potentialBuoys``, when a download raises ``urllib2.HTTPError``,
        or when either file holds no complete data row.
    """
    potentialBuoy = self.db.potentialBuoys.find_one(
        {'station_id': stationID})
    if potentialBuoy is None:
        # This is not a valid stationID
        return

    buoyName = potentialBuoy['name']
    buoy = {'station_id': potentialBuoy['station_id'], 'name': buoyName}

    def fetchHead(extension):
        # Read at most the first 2000 bytes of a realtime2 file and always
        # close the connection, even when the read itself fails.
        response = urllib2.urlopen(
            'https://www.ndbc.noaa.gov/data/realtime2/' + stationID
            + extension)
        try:
            return response.read(2000)
        finally:
            response.close()

    def latestRow(text, preferredMinute=None):
        # Return {column label: value} for one data row, or None when the
        # text has no complete row. Line 0 is the label header, line 1 the
        # units, data rows follow. Because only the head of the file is
        # read, the last line can be blank or cut short; rows with fewer
        # values than labels are skipped instead of being indexed into.
        lines = text.split('\n')
        labels = lines[0].split()
        rows = [line.split() for line in lines[2:]]
        rows = [row for row in rows if len(row) >= len(labels)]
        if not rows:
            return None
        # Default to the first complete row; if a minute is preferred, take
        # the first row reported at that minute instead.
        chosen = rows[0]
        if preferredMinute is not None:
            minuteIndex = labels.index('mm')
            chosen = next(
                (row for row in rows
                 if row[minuteIndex] == preferredMinute),
                chosen)
        return dict(zip(labels, chosen))

    # gets you the first box:
    # https://www.ndbc.noaa.gov/data/realtime2/41013.txt
    # then this for second box:
    # https://www.ndbc.noaa.gov/data/realtime2/44013.spec
    try:
        firstBoxData = fetchHead('.txt')
        secondBoxData = fetchHead('.spec')
    except urllib2.HTTPError:
        print('error caught trying to fetch ' + stationID)
        # TODO make this still update the reading as empty or something
        return

    # For some stations (e.g. 44008) the first box data is taken every
    # 10 min, but only 1 per hour (usually on the 50 min) has all the data.
    # First box columns and units:
    # YY MM DD hh mm WDIR WSPD GST WVHT DPD APD MWD PRES ATMP WTMP DEWP VIS PTDY TIDE
    # yr mo dy hr mn degT m/s  m/s m    sec sec degT hPa  degC degC degC nmi hPa  ft
    # Second box columns and units:
    # YY MM DD hh mm WVHT SwH SwP WWH WWP SwD WWD STEEPNESS APD MWD
    # yr mo dy hr mn m    m   sec m   sec -   degT -        sec degT
    firstBoxDictionary = latestRow(firstBoxData, preferredMinute='50')
    secondBoxDictionary = latestRow(secondBoxData)
    if firstBoxDictionary is None or secondBoxDictionary is None:
        # Same best-effort handling as a failed download: report and stop.
        print('no complete reading found for ' + stationID)
        return
    # NOTE(review): the first-box row may be an older 50-minute row while
    # the second-box row is always the first one, so the two can carry
    # different timestamps -- confirm this is acceptable.

    waveHeight = self.feetFromReading(firstBoxDictionary, 'WVHT')
    wavePeriod = self.floatFromReading(firstBoxDictionary, 'DPD')
    waveDirection = self.floatFromReading(firstBoxDictionary, 'MWD')

    swellHeight = self.feetFromReading(secondBoxDictionary, 'SwH')
    swellPeriod = self.floatFromReading(secondBoxDictionary, 'SwP')
    swellDirection = self.compassToDegreesFromReading(
        secondBoxDictionary, 'SwD')

    # Every value is a str produced by str.split(), so plain string
    # formatting yields e.g. 2019-10-20T17:50:00Z
    try:
        wavesDatetime = '%s-%s-%sT%s:%s:00Z' % (
            firstBoxDictionary['#YY'],
            firstBoxDictionary['MM'],
            firstBoxDictionary['DD'],
            firstBoxDictionary['hh'],
            firstBoxDictionary['mm'])
    except KeyError:
        wavesDatetime = None

    windDirection = self.floatFromReading(firstBoxDictionary, 'WDIR')
    windSpeed = self.knotsFromReading(firstBoxDictionary, 'WSPD')

    reading = {
        'station_id': stationID,
        'buoy_name': buoyName,
        'wind_direction': windDirection,
        'wind_speed': windSpeed,
        'wave_height': waveHeight,
        'dominant_period': wavePeriod,
        'wave_direction': waveDirection,
        'swell_height': swellHeight,
        'swell_period': swellPeriod,
        'swell_direction': swellDirection,
        'datetime': wavesDatetime,
    }

    # Remove nulls so only the values that were obtained reach Reading(...)
    reading = {k: v for k, v in reading.items() if v is not None}

    # Instantiate model objects
    buoyObject = Buoy(**buoy)
    readingObject = Reading(**reading)

    # Upsert both documents, matching on this station's id.
    self.db.readings.update({'station_id': stationID},
                            readingObject.mongoDB(),
                            upsert=True)
    self.db.buoys.update({'station_id': stationID},
                         buoyObject.mongoDB(),
                         upsert=True)