def fetchLatestArticlesFromAPI(startIdx, lastArticleDate): APP_ROOT_DIR=utils.getAppRootDir() config = ConfigParser.ConfigParser() config.read(APP_ROOT_DIR + 'python/globe.config') articles_at_a_time=config.getint('boston_globe','articles_at_a_time') api_fields=config.get('boston_globe','api_fields') today = date.today() todayStr = today.strftime('%Y%m%d') allArticles = [] req = urllib2.Request( config.get('boston_globe','api_host') + "s?key=" + config.get('boston_globe','api_key') + "&bq=printpublicationdate:" + lastArticleDate +".."+todayStr+"&return-fields="+ api_fields+ "&size="+str(articles_at_a_time)+"&"+ "start="+str(startIdx)+ "&rank=printpublicationdate") print req.get_full_url() opener = urllib2.build_opener() #try and then basically try again #sometimes server returns 500 error, so just try the same request again to see if that will catch these cases try: f = opener.open(req) except: f = opener.open(req) try: data = simplejson.load(f) except: print "JSONDecodeError\nEither there was no data found or some other parsy thing happened" print str(data["hits"]["found"]) + " articles found" allArticles.extend(data["hits"]["hit"]) return allArticles
def fetchLatestArticlesFromAPI(startIdx, lastArticleDate): APP_ROOT_DIR = utils.getAppRootDir() config = ConfigParser.ConfigParser() config.read(APP_ROOT_DIR + 'python/globe.config') articles_at_a_time = config.getint('boston_globe', 'articles_at_a_time') api_fields = config.get('boston_globe', 'api_fields') today = date.today() todayStr = today.strftime('%Y%m%d') allArticles = [] req = urllib2.Request( config.get('boston_globe', 'api_host') + "s?key=" + config.get('boston_globe', 'api_key') + "&bq=printpublicationdate:" + lastArticleDate + ".." + todayStr + "&return-fields=" + api_fields + "&size=" + str(articles_at_a_time) + "&" + "start=" + str(startIdx) + "&rank=printpublicationdate") print req.get_full_url() opener = urllib2.build_opener() #try and then basically try again #sometimes server returns 500 error, so just try the same request again to see if that will catch these cases try: f = opener.open(req) except: f = opener.open(req) try: data = simplejson.load(f) except: print "JSONDecodeError\nEither there was no data found or some other parsy thing happened" print str(data["hits"]["found"]) + " articles found" allArticles.extend(data["hits"]["hit"]) return allArticles
################################################################################ # Adds new fulltext articles to DB # Matches articles to those that exist with metadata already in DB ################################################################################ import ConfigParser import couchdb import utils import os import lxml from lxml import etree from DBManager import DBManager conn = DBManager() APP_ROOT_DIR = utils.getAppRootDir() config = ConfigParser.ConfigParser() config.read(APP_ROOT_DIR + 'python/globe.config') docDir = config.get('boston_globe', 'local_ftp_dir') listing = os.listdir(docDir) errors = 0 docs = 0 newFile = config_file = open('uuids.txt', 'w') matches = 0 missingUUID = 0 noMatches = 0 for infile in listing: if "DS_Store" in infile: continue
class Geoprocessor: APP_ROOT_DIR = utils.getAppRootDir() ################################################################################ # These things are not happening in an init method because GDAL throws segfaults # when the layers & transforms are instance variables. Not sure why. ################################################################################ config = ConfigParser.ConfigParser() config.read(APP_ROOT_DIR + 'python/globe.config') path_to_census_tracts_dir = APP_ROOT_DIR + config.get( 'geo', 'path_to_census_tracts_dir') census_shape_name = config.get('geo', 'census_shape_name') path_to_towns_dir = APP_ROOT_DIR + config.get('geo', 'path_to_towns_dir') towns_shape_name = config.get('geo', 'towns_shape_name') excluded_lat1k_long1k = json.loads( config.get('geo', 'excluded_lat1k_long1k')) censusTracts = csv.reader( open(APP_ROOT_DIR + config.get('geo', 'census_tracts_to_neighborhood'), "rU")) censusTracts_list = [] censusTracts_list.extend(censusTracts) census_tracts_to_neighborhood = {} for row in censusTracts_list: census_tracts_to_neighborhood[row[0]] = row[1] ################################################################################ #LOAD CENSUS SHP FILE ################################################################################ os.chdir(path_to_census_tracts_dir) drv = ogr.GetDriverByName('ESRI Shapefile') neighborhoodShape = drv.Open(census_shape_name) neighborhoodLayer = neighborhoodShape.GetLayer(0) spatialRef = osr.SpatialReference() #set projection & geo coord system - this comes from metadata in neighborhood files spatialRef.SetWellKnownGeogCS("NAD83") #translate to WGS84 which is lat/long spatialRef2 = osr.SpatialReference() spatialRef2.SetWellKnownGeogCS("WGS84") neighborhoodTransformObject = osr.CoordinateTransformation( spatialRef, spatialRef2) ################################################################################ #LOAD TOWNS SHP FILE ################################################################################ os.chdir(path_to_towns_dir) drv = ogr.GetDriverByName('ESRI Shapefile') townShape = drv.Open(towns_shape_name) townsLayer = townShape.GetLayer(0) spatialRef3 = osr.SpatialReference() #set projection & geo coord system - this comes from metadata in neighborhood files spatialRef3.SetLCC(41.716667, 42.683333, 41.000000, -71.500000, 200000.000000, 750000.000000) spatialRef3.SetWellKnownGeogCS("NAD83") #translate to WGS84 which is lat/long spatialRef4 = osr.SpatialReference() spatialRef4.SetWellKnownGeogCS("WGS84") townTransformObject = osr.CoordinateTransformation(spatialRef3, spatialRef4) def getNeighborhoodFromLatLong(self, latitude, longitude): for x in range(0, self.neighborhoodLayer.GetFeatureCount()): feature = self.neighborhoodLayer.GetFeature(x) poly = feature.GetGeometryRef() tractNumber = feature.GetFieldAsString(2) poly.Transform(self.neighborhoodTransformObject) WGSPoint = ogr.Geometry(ogr.wkbPoint) WGSPoint.SetPoint(0, longitude, latitude) if poly.Contains(WGSPoint): try: neighborhood = self.census_tracts_to_neighborhood[ tractNumber] except KeyError: print "censusTract # " + tractNumber + " doesn't exist in the mapping file" return "" print "Point is in " + neighborhood return neighborhood return "" def getCityFromLatLong(self, latitude, longitude): for x in range(0, self.townsLayer.GetFeatureCount()): feature = self.townsLayer.GetFeature(x) poly = feature.GetGeometryRef() city = feature.GetFieldAsString(1) poly.Transform(self.townTransformObject) WGSPoint = ogr.Geometry(ogr.wkbPoint) WGSPoint.SetPoint(0, longitude, latitude) if poly.Contains(WGSPoint): city = titlecase(city) print "Point is in " + city return city return "" def filterAndCleanArticles(self, allArticles, conn): db = conn.db cleanedArticles = [] for article in allArticles: if conn.documentExists(article["id"]): print "document id " + article[ 'id'] + " already exists - discarding..." continue conn.db_metadata["last_article_date"] = article["data"][ "printpublicationdate"] conn.db_metadata["last_article_id"] = article["id"] #create a document and insert it into the db: article["_id"] = article["id"] #If article has geocoding AND it doesn't have excluded lat longs then include in data set if len(article["data"]["latitude"]) > 0: latlong1k = [ article["data"]["latitude_1k"][0], article["data"]["longitude_1k"][0] ] if latlong1k in self.excluded_lat1k_long1k: conn.db_metadata["filtered_articles_bad_geodata"] += 1 print "Filtering the point " + str( article["data"]["latitude"][0]) + ", " + str( article["data"]["longitude"][0]) continue else: #Now apply various processing to data, add metadata for neighborhoods to record and save to DB #GET NEIGHBORHOOD neighborhood = self.getNeighborhoodFromLatLong( float(article["data"]["latitude"][0]), float(article["data"]["longitude"][0])) article["data"]["neighborhood"] = neighborhood #GET CITY & change it if it's entered improperly from Globe cityFromLatLong = self.getCityFromLatLong( float(article["data"]["latitude"][0]), float(article["data"]["longitude"][0])) if article["data"]["city"] == None or len( article["data"]["city"]) == 0: article["data"]["city"] = [""] #Fix up state declarations to MA instead of Mass, Massachusetts and all manner of whatnot if len(cityFromLatLong) > 0: if article["data"]["state"] == None or len( article["data"]["state"]) == 0: article["data"]["state"] = [""] oldState = article["data"]["state"][0] if "MA" != article["data"]["state"][0]: article["data"]["state"][0] = "MA" print "Changed state to MA from Globe entered state " + oldState + " for city " + cityFromLatLong if cityFromLatLong != article["data"]["city"][0] and len( cityFromLatLong) > 0: print "Changing Globe entered city " + article["data"][ "city"][0] + " to verified city " + cityFromLatLong conn.db_metadata[ "number_of_updated_MA_city_names"] += 1 #save the city data as entered by the Globe article["data"]["city_OLD"] = article["data"]["city"][ 0] article["data"]["city"][0] = cityFromLatLong if (len(article["data"]["city"]) > 0): city = str(article["data"]["city"][0]) else: city = "" article["type"] = "article" #change this line later when full text is actually working article["data"]["fulltext"] = article["data"][ "catherine_dignazio"] cleanedArticles.append(article) conn.db_metadata["total_articles_added"] += 1 else: conn.db_metadata["filtered_articles_no_geodata"] += 1 print "Filtering this article because there's no geodata. " continue return cleanedArticles