def getnytimes(self, url): community_key = '35fb5c7778409b89b6869e2ac8f79838:0:65748530' article_key = '8870ced3a7299320d2efd36791e636c8:5:65748530' expanded_content = expandURL(url) self.url = exp_url = re.sub(r'(http://www)\d*(\..*)\?.*', r'\1\2', expanded_content['long-url']) q = 'url:' + '"' + exp_url + '"' article_api_query = 'http://api.nytimes.com/svc/search/v1/article?format=json&query={0}&api-key={1}'.format(urllib.quote_plus(q), urllib.quote_plus(article_key)) http = httplib2.Http() (response, content) = http.request(article_api_query, 'GET') try: if response['status'] == '200': content = json.loads(content) self.body = content['results'][0]["body"] print self.body except: print "body not initialized: ", self.url try: #redirection failure (res1, cont1) = http.request(exp_url, 'GET') if res1['status'] == '200': print cont1 except: print 'could not get data by crawling', exp_url comm_api_query = 'http://api.nytimes.com/svc/community/v2/comments/url/exact-match.json?&url={0}&api-key={1}'.format(urllib.quote_plus(exp_url), urllib.quote_plus(community_key)) (response, content) = http.request(comm_api_query, 'GET') if response['status'] == '200': content = json.loads(content) self.comments = content['results']['comments']
def processrequest(self):
    """Classify self.url into a supported blog domain, crawl it, and
    populate self.domain, self.title and self.items.

    Returns an error response (via geterrorresponse) when the URL does
    not match any supported site.
    """
    def _detect_domain():
        # Ordered pattern table; first match wins, mirroring the
        # original elif chain.
        simple_checks = (
            (r'(eng*\.co)|(engadget\.com.*)', 'engadget'),
            (r'(mash*\.to)|(mashable\.*)', 'mashable'),
            (r'ndtv', 'ndtv'),
            (r'fakingnews', 'fakingnews'),
        )
        for pattern, name in simple_checks:
            if re.search(pattern, self.url) is not None:
                return name
        # treehugger also matches via the expanded long URL; the regex
        # test short-circuits so expandURL is only hit when needed.
        if re.search(r'treehugger', self.url) is not None or 'treehugger' in expandURL(self.url)['long-url']:
            return 'treehugger'
        if re.search(r'news\.cnet', self.url) is not None:
            return 'cnetnews'
        return ""

    domain = _detect_domain()
    if domain == "":
        return self.geterrorresponse("Url Not Valid...")
    self.domain = domain
    bh = crawler.BlogHtml(self.url, domain)
    self.title = bh.title
    self.items = util.getitems(bh.title, bh.blogparas)