Example #1
from datetime import date, timedelta
from urllib2 import urlopen  # Python 2; urllib.request.urlopen in Python 3

def wrapWPost(keyword, maxPage=1, pastDay=7):
    # Crawl Washington Post search results for `keyword`, stepping back one
    # day at a time over the past `pastDay` days and fetching up to `maxPage`
    # result pages per day. WPostParser is a project-specific HTML parser.
    searchDate = date.today()
    oneDay = timedelta(days=1)
    for i in range(pastDay):
        # Build a zero-padded YYYYMMDD string for the search URL.
        y = str(searchDate.year)
        m = str(searchDate.month)
        d = str(searchDate.day)
        if len(m) == 1:
            m = '0' + m
        if len(d) == 1:
            d = '0' + d
        sd = y + m + d
        for j in range(maxPage):
            wp = WPostParser()
            url = 'http://www.washingtonpost.com/newssearch/search.html?sa=as&sd=%s&ed=%s&st=%s&cp=%d' % (sd, sd, keyword, j + 1)
            url += '&fa_1_sourcenavigator=%22The+Washington+Post%22&fa_1_sourcenavigator=washingtonpost.com&fa_1_mediatypenavigator=^Articles%24'
            try:
                text = urlopen(url).read()
            except IOError:
                print 'error occurred while connecting to %s and reading its contents' % url
                continue
            try:
                # Positional errors arg: str.decode() rejects keyword arguments
                # in Python 2. cp949 is the Korean codepage the project used.
                wp.feed(text.decode('cp949', 'replace'))
            except Exception:
                print 'error occurred while parsing %s' % url
                continue
            print 'wrapping WashingtonPost : ' + str(searchDate) + ', page ' + str(j + 1)
            print url
            wp.storeArticle(keyword, searchDate)
            wp.close()
        searchDate -= oneDay
    print 'done'
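
The manual zero-padding of the month and day above can be collapsed into a single strftime call; a minimal sketch, assuming the same datetime import:

from datetime import date

searchDate = date.today()
sd = searchDate.strftime('%Y%m%d')  # e.g. '20100215': year plus zero-padded month and day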
Example #2
def wrapWPost(keyword, maxPage=1, pastDay=7):
    # Same Washington Post crawler as Example #1; the imports listed there
    # apply here as well.
    searchDate = date.today()
    oneDay = timedelta(days=1)
    for i in range(pastDay):
        y = str(searchDate.year)
        m = str(searchDate.month)
        d = str(searchDate.day)
        if len(m) == 1:
            m = '0' + m
        if len(d) == 1:
            d = '0' + d
        sd = y + m + d
        for j in range(maxPage):
            wp = WPostParser()
            url = 'http://www.washingtonpost.com/newssearch/search.html?sa=as&sd=%s&ed=%s&st=%s&cp=%d' % (
                sd, sd, keyword, j + 1)
            url += '&fa_1_sourcenavigator=%22The+Washington+Post%22&fa_1_sourcenavigator=washingtonpost.com&fa_1_mediatypenavigator=^Articles%24'
            try:
                text = urlopen(url).read()
            except IOError:
                print 'error occurred while connecting to %s and reading its contents' % url
                continue
            try:
                wp.feed(text.decode('cp949', 'replace'))
            except Exception:
                print 'error occurred while parsing %s' % url
                continue
            print 'wrapping WashingtonPost : ' + str(searchDate) + ', page ' + str(j + 1)
            print url
            wp.storeArticle(keyword, searchDate)
            wp.close()
        searchDate -= oneDay
    print 'done'
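
A hypothetical invocation (the keyword and limits below are illustrative, not from the source); note that wrapWPost does not URL-escape the keyword, so a single-word keyword is safest:

wrapWPost('economy', maxPage=2, pastDay=3)  # up to 2 result pages per day over the last 3 days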
Example #3
from datetime import date, timedelta
from urllib2 import urlopen  # Python 2; urllib.request.urlopen in Python 3
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3

def wrapNYTimes(keyword, maxPage=1, pastDay=7):
    # Crawl New York Times search results for `keyword` over the past
    # `pastDay` days, fetching up to `maxPage` pages of 10 results per day.
    # resultNum and storeArticles are project-specific helpers.
    searchDate = date.today()
    oneDay = timedelta(days=1)
    # Escape spaces in the keyword as %20 (see the sketch after this example).
    while 1:
        index = keyword.find(' ')
        if index == -1:
            break
        keyword = keyword[:index] + '%20' + keyword[index + 1:]
    for i in range(pastDay):
        # Zero-pad the month and day for the query string.
        y = searchDate.year
        m = str(searchDate.month)
        d = str(searchDate.day)
        if len(m) == 1:
            m = '0' + m
        if len(d) == 1:
            d = '0' + d
        url = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s' % (keyword, y, m, d, y, m, d)
        try:
            req = urlopen(url)
            page = req.read()
        except IOError:
            print 'error occurred while connecting to %s and reading its contents' % url
            continue
        soup = BeautifulSoup(page)
        n = resultNum(soup)  # total result count reported by the search page
        if n > maxPage * 10:
            pageNum = maxPage
        else:
            pageNum = (n + 9) / 10  # ceiling division: full pages of 10 results
        for j in range(pageNum):
            url = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s&frow=%d' % (keyword, y, m, d, y, m, d, j * 10)
            try:
                req = urlopen(url)
                page = req.read()
            except IOError:
                print 'error occurred while connecting to %s and reading its contents' % url
                continue
            print 'wrapping NYTimes : ' + str(searchDate) + ', page ' + str(j + 1)
            print url
            soup = BeautifulSoup(page.decode('utf8', 'replace'))
            storeArticles(soup, keyword, searchDate)
        searchDate -= oneDay
    print 'done'
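
The while loop that rewrites spaces as '%20' can be replaced by the standard library; a sketch, assuming Python 2's urllib (urllib.parse.quote in Python 3):

from urllib import quote

keyword = quote('barack obama')  # -> 'barack%20obama'; also escapes other reserved characters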
Example #4
def wrapNYTimes(keyword, maxPage=1, pastDay=7):
    # Same New York Times crawler as Example #3; the imports listed there
    # apply here as well.
    searchDate = date.today()
    oneDay = timedelta(days=1)
    while 1:
        index = keyword.find(' ')
        if index == -1:
            break
        keyword = keyword[:index] + '%20' + keyword[index + 1:]
    for i in range(pastDay):
        y = searchDate.year
        m = str(searchDate.month)
        d = str(searchDate.day)
        if len(m) == 1:
            m = '0' + m
        if len(d) == 1:
            d = '0' + d
        url = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s' % (keyword, y, m, d, y, m, d)
        try:
            req = urlopen(url)
            page = req.read()
        except IOError:
            print 'error occurred while connecting to %s and reading its contents' % url
            continue
        soup = BeautifulSoup(page)
        n = resultNum(soup)
        if n > maxPage * 10:
            pageNum = maxPage
        else:
            pageNum = (n + 9) / 10  # ceiling division over pages of 10 results
        for j in range(pageNum):
            url = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s&frow=%d' % (keyword, y, m, d, y, m, d, j * 10)
            try:
                req = urlopen(url)
                page = req.read()
            except IOError:
                print 'error occurred while connecting to %s and reading its contents' % url
                continue
            print 'wrapping NYTimes : ' + str(searchDate) + ', page ' + str(j + 1)
            print url
            soup = BeautifulSoup(page.decode('utf8', 'replace'))
            storeArticles(soup, keyword, searchDate)
        searchDate -= oneDay
    print 'done'
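
Both crawlers repeat the same fetch-and-skip-on-error pattern; a minimal helper that factors it out might look like the sketch below (fetchPage is hypothetical, not part of the source project):

from urllib2 import urlopen

def fetchPage(url):
    # Return the raw page body, or None when the request fails (hypothetical helper).
    try:
        return urlopen(url).read()
    except IOError:  # urllib2.URLError subclasses IOError in Python 2
        print 'error occurred while connecting to %s and reading its contents' % url
        return None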