def analyzeAll():
    ''' create a view to find articles that are not analyzed,
    iterate over those, single threaded '''
    view = dblayer.view("article/notAnalized")
    for u in view:
        analyzeArticle.perform(u.id)
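# A hedged sketch (not part of the original file): a fault-tolerant variant of
# the same loop, so one failing article does not abort the whole batch. dblayer
# and analyzeArticle are the modules used above; the per-article error handling
# is an assumption, not original behaviour.
def analyzeAllSafe():
    view = dblayer.view("article/notAnalized")
    for u in view:
        try:
            analyzeArticle.perform(u.id)
        except Exception, e:
            print 'ERROR: analysis failed for article %s: %s' % (u.id, e)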
def findById(self):
    ''' return the question that matches this id '''
    view = dblayer.view("question/id", self.id)
    if len(view) == 0:
        return None
    elif len(view) == 1:
        for q in view:
            return Question.load(getDb(), q.id)
    else:
        print 'ERROR: more than one question for this ID'
        raise IntegrityConstraintException
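# A hedged usage sketch (not part of the original file): look up a question by
# id and handle the three outcomes findById can produce. How the Question
# instance gets its id attribute is an assumption; only findById, Question and
# IntegrityConstraintException come from the code above.
q = Question()
q.id = 'some-question-id'            # hypothetical id value
try:
    match = q.findById()
    if match is None:
        print 'no question stored under id', q.id
    else:
        print 'loaded question', match.id
except IntegrityConstraintException:
    print 'ERROR: the question/id view holds duplicate rows for this id'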
def findByQuestion(self):
    ''' return the timeLineEvent/question view rows for this question '''
    view = dblayer.view("timeLineEvent/question", self.question)
    return view
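# A hedged usage sketch (not part of the original file): iterate the rows that
# findByQuestion returns for one question. The TimeLineEvent class name is
# inferred from the "timeLineEvent/question" view and is an assumption, as is
# the way the question attribute is populated.
event = TimeLineEvent()
event.question = 'some-question-id'   # hypothetical question reference
for row in event.findByQuestion():
    print row.id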
url = urlList[i].get('href')
url = url[:url.find('?')]
article._id = url
article.link = article._id
article.extract = summaryList[i].text
article.keyword = keyword
article.source = 'nyt'
print article._id
print article.title
print article.extract
print article.date
article.create()


# return the number of search results reported by the page
def resultNum(soup):
    list = soup('span', {'class': 'sortText'})
    if len(list) <= 0:
        return 0
    else:
        num = list[-1].text.split()[-2]
        return int(num)


#################TEST####################
if __name__ == '__main__':
    view = dblayer.view("article/test")
    for u in view:
        a = Article(u.id)
        a = a.findById()
        getDb().delete(a)
    wrapNYTimes('laden', 1, pastDay=3)
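# A hedged example (not part of the original file) for resultNum: the span
# markup below is fabricated to mimic the NYT result counter it parses, and the
# import assumes BeautifulSoup 3 (use `from bs4 import BeautifulSoup` if bs4 is
# the installed version).
from BeautifulSoup import BeautifulSoup
sample = BeautifulSoup('<span class="sortText">1 - 10 of 42 Results</span>')
print resultNum(sample)   # prints 42, the second-to-last token of the span text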
url = 'http://www.washingtonpost.com/newssearch/search.html?sa=as&sd=%s&ed=%s&st=%s&cp=%d' % (
    sd, sd, keyword, j + 1)
url += '&fa_1_sourcenavigator=%22The+Washington+Post%22&fa_1_sourcenavigator=washingtonpost.com&fa_1_mediatypenavigator=^Articles%24'
try:
    text = urlopen(url).read()
except:
    print 'error occurred while connecting to %s and reading its contents' % url
    continue
try:
    # note: str.decode() takes no keyword arguments in Python 2,
    # so the errors handler is passed positionally
    wp.feed(text.decode('cp949', 'replace'))
except:
    print 'error occurred while parsing %s' % url
    continue
print 'wrapping WashingtonPost : ' + str(searchDate) + ', page ' + str(j + 1)
print url
wp.storeArticle(keyword, searchDate)
wp.close()
searchDate -= oneDay
print 'done'


#################TEST####################
if __name__ == '__main__':
    view = dblayer.view("article/test")
    for u in view:
        a = Article(u.id)
        a = a.findById()
        getDb().delete(a)
    keyword = 'laden'
    wrapWPost(keyword, 1, 3)
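# A hedged sketch (not part of the original file) of the loop scaffolding the
# fragment above appears to sit in: an outer walk over the last pastDay days and
# an inner walk over result pages, which is what the `continue` statements and
# `searchDate -= oneDay` imply. It is named wrapWPostOutline to avoid clashing
# with the real wrapWPost; the argument names follow the test call and the body
# is an assumption, not the original implementation.
import datetime

def wrapWPostOutline(keyword, pageNum, pastDay):
    searchDate = datetime.date.today()
    oneDay = datetime.timedelta(days=1)
    for _ in range(pastDay):
        for j in range(pageNum):
            pass   # build the search url, fetch it, feed wp and store articles (see above)
        searchDate -= oneDay
    print 'done'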