Example #1
    def test_parse_reviews_nbwithin(self):
        wid = ['7571386']

        # mock a spider with specific last-harvest and launch dates
        spider = self.mock_spider(spiderreviews.LibraryThing, wid, to_date=datetime.date(2013, 8, 7))

        spider.works_to_harvest[0]['last_harvest_date'] = datetime.date(2013, 4, 4)
        meta = {'work_index': 0, 'passed_item': spider.build_review_item(work_refid=wid[0], review_lang=u'eng')}
        review_items = spider.parse_reviews(mock_from_file("mockobject/LTRespReviews_%s.html" % wid[0],
                                                           response_type="Html",
                                                           meta=meta))
        nb = 0
        # only the 3 duplicate reviews posted on the last harvest date (2013-04-04) are expected
        for rev in review_items:
            nb += 1
        self.assertEqual(3, nb)

        # an initial load (last_harvest_date == min_harvest_date) must yield all reviews
        meta = {'work_index': 0, 'passed_item': spider.build_review_item(work_refid=wid[0], review_lang=u'und')}
        spider = self.mock_spider(spiderreviews.LibraryThing, wid)
        spider.works_to_harvest[0]['last_harvest_date'] = spider.min_harvest_date
        review_items = spider.parse_reviews(mock_from_file("mockobject/LTRespReviews_%s.html" % wid[0],
                                                           response_type="Html",
                                                           meta=meta))
        nb = 0
        for item in review_items:
            nb += 1
            self.assert_not_none(item)
            # validate automatic review lang detection
            self.assertEqual(item['review_lang'], u'eng')

        self.assertEqual(127, nb)
Example #2
    def test_start_request_and_search(self):
        spider = self.mock_spider(spiderreviews.Goodreads, [1000, 2000], [['9780590406406', '1111111111111'], ['1111111111112']])
        start_reqs = spider.start_requests()

        work_req = start_reqs.next()
        self.assertEqual(work_req.meta['work_index'], 0)
        self.assertEqual(work_req.meta['nb_try'], 1)
        self.assertEqual(work_req.url, spider.url_search + '9780590406406')

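        # no results for the first ISBN: the spider should retry with the next ISBN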
        r = spider.parse_search_resp(mock_from_file("mockobject/GR_noresults.html", response_type="Html", meta=work_req.meta))
        sec_req = r.next()
        self.assertEqual(sec_req.meta['work_index'], 0)
        self.assertEqual(sec_req.meta['nb_try'], 2)
        self.assertEqual(sec_req.url, spider.url_search + '1111111111111')

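        # the second ISBN also yields no results: a placeholder item with work_uid '-1' is emitted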
        finish_nores = spider.parse_search_resp(mock_from_file("mockobject/GR_noresults.html", response_type="Html", meta=sec_req.meta))
        item = finish_nores.next()
        self.assertEqual(item['work_refid'], 1000)
        self.assertEqual(item['work_uid'], '-1')
        self.assertEqual(item['site_logical_name'], 'goodreads')

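        # second work: its single ISBN produces a single search request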
        work_req = start_reqs.next()
        meta = work_req.meta
        self.assertEqual(meta['work_index'], 1)
        self.assertEqual(meta['nb_try'], 1)
        self.assertEqual(work_req.url, spider.url_search + '1111111111112')

        try:
            start_reqs.next()
            self.fail("only one request per work_refid")
        except StopIteration:
            pass
Example #3
    def test_parse_reviews(self):
        spider = self.mock_spider(spiderreviews.Amazon, [10])
        asin = '0439358078'
        spider.works_to_harvest[0]['last_harvest_date'] = spider.min_harvest_date
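        # parse a mocked review page (774 of 775) for this ASIN; an initial harvest keeps every review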
        results = spider.parse_reviews(mock_from_file("mockobject/AZ_HP_0439358078_reviews_p774_of_775.html",
                                                      url=spider.review_url % asin,
                                                      response_type="Html",
                                                      meta={'work_index': 0}))
        i = 0
        for item in results:
            if type(item) == dict:
                self.assert_not_none(item)
                self.assertEqual(item['site_logical_name'], 'amazon.com')
                self.assertEqual(item['authors'], u'J.K. Rowling')
                self.assertEqual(item['title'], u"Harry Potter And The Order Of The Phoenix")
                self.assertEqual(item['work_refid'], 10)
                self.assertEqual(item['work_uid'], asin)
                self.assertEqual(item['review_lang'], u'eng')
                if i == 0:
                    self.assertEqual(item['username'], u'Amazon Customer')
                    self.assertEqual(item['user_uid'], u'A3FVDEFHGRLLYH')
                    self.assertEqual(item['parsed_rating'], 10)
                    self.assertEqual(item['parsed_likes'], 2)
                    self.assertEqual(item['review'], u"This book is simply amazing. You will never guess who dies :( Can't wait for book 6!!")
                    self.assertEqual(item['parsed_review_date'], datetime.date(2003, 6, 21))
                i += 1
            else:
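                # a non-dict item should be the Request for the next (last) review page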
                self.assertEqual(type(item), scrapy.Request)
                self.assertEqual(item.url, "http://www.amazon.com/Harry-Potter-And-Order-Phoenix/product-reviews/0439358078/ref=cm_cr_arp_d_paging_btm_775?ie=UTF8&pageNumber=775&sortBy=recent" )
        self.assertEqual(i, 10)

        asin = '0199537828'
        # test limited time period (1 review should be filtered out)
        spider.works_to_harvest[0]['last_harvest_date'] = datetime.date(1997, 8, 12)
        results = spider.parse_reviews(mock_from_file("mockobject/AZ_reviews_9036_asin0199537828.html",
                                                      url=spider.review_url % asin,
                                                      response_type="Html",
                                                      meta={'work_index': 0}))
        i = 0
        for item in results:
            if type(item) == dict:
                self.assert_not_none(item)
                self.assertEqual(item['site_logical_name'], 'amazon.com')
                self.assertEqual(item['authors'], u'Saint Augustine')
                self.assertEqual(item['title'], u"Confessions (Oxford World's Classics)")
                self.assertEqual(item['work_refid'], 10)
                self.assertEqual(item['work_uid'], asin)
                self.assertEqual(item['review_lang'], u'eng')
                if i == 3:
                    self.assertEqual(item['username'], u'*****@*****.**')
                    self.assertEqual(item['user_uid'], u'A2OOFX8HTKNDE8')
                    self.assertEqual(item['parsed_rating'], 8)
                    self.assertEqual(item['parsed_likes'], 6)
                    self.assertEqual(item['review'][0:100], u"An in depth look at the nature of evil through the personal struggle of the great Augustine of Hippo")
                    self.assertEqual(item['parsed_review_date'], datetime.date(1997, 8, 12))
                i += 1
            else:
                self.fail("last page should not trigger a new request")
        self.assertEqual(i, 4)
Example #4
    def test_mainpage_and_review(self):
        wid = 1000
        wuid = "Green-Nos-etoiles-contraires/436732"
        spider = self.mock_spider(spiderreviews.Babelio, [wid], [['dummyISBN']])
        # mock-up a response with reviews
        res_withrev = spider.parse_search_resp(mock_from_file("mockobject/BO_found_reviews_green.htm",
                                               response_type="Html",
                                               meta={'nb_try': 1, 'work_index': 0},
                                               url=spider.url_main + "dummy/wuid"))
        # 1st request is to main page
        main_pag_req = res_withrev.next()
        url_exp = spider.url_main % wuid
        self.assertEqual(main_pag_req.url, url_exp)
        self.assertEqual(main_pag_req.meta['work_index'], 0)
        self.assertEqual(main_pag_req.meta['main_page'], True)
        e_item = ReviewItem(work_refid=1000, work_uid=wuid, site_logical_name='babelio',
                            title=u'Nos étoiles contraires', authors=u'John Green')
        self.assertEqual(main_pag_req.meta['pass_item'], e_item)
        try:
            res_withrev.next()
            self.fail("no more request expected")
        except StopIteration:
            pass

        # mock-up a response to main page
        res_mainpage = spider.parse_reviews(mock_from_file("mockobject/BO_mainpage_withTags_green-436732.htm",
                                            response_type="Html",
                                            meta={'work_index': 0, 'pass_item': main_pag_req.meta['pass_item'], 'main_page': True},
                                            url=spider.url_main + "dummy/wuid"))
        # trigger a request to review page-1 with Tag added
        page_rev_req = res_mainpage.next()
        url_exp = (spider.url_main + spider.param_review) % (wuid, 1)
        self.assertEqual(page_rev_req.url, url_exp)
        self.assertEqual(page_rev_req.meta['work_index'], 0)
        self.assertIsNone(page_rev_req.meta.get('main_page'))
        self.assertNotEqual(page_rev_req.meta['pass_item'], e_item)
        self.assertEqual(page_rev_req.meta['pass_item']['tags_lang'], u'fre')
        self.assertEqual(page_rev_req.meta['pass_item']['tags_t'],
                         u"adapté au cinéma__&__roman__&__littérature jeunesse__&__jeune adulte__&__littérature pour adolescents__&__jeunesse__&__roman d'amour__&__deuil__&__drame__&__amitié__&__triste__&__adolescence__&__maladie__&__mort__&__espoir__&__cancer__&__humour__&__romance__&__amour__&__littérature américaine")
        self.assertEqual(page_rev_req.meta['pass_item']['tags_n'], u"15;19;18;15;17;18;20;15;17;17;14;23;25;19;15;24;16;17;27;19")

        try:
            res_mainpage.next()
            self.fail("no more request expected")
        except StopIteration:
            pass

        # mock-up a response to review page1
        spider.works_to_harvest[0]['last_harvest_date'] = spider.min_harvest_date
        rev_page = spider.parse_reviews(mock_from_file("mockobject/BO_reviews_green_p1_126.htm",
                                            response_type="Html",
                                            meta={'work_index': 0, 'pass_item': page_rev_req.meta['pass_item']},
                                            url=url_exp))
        i = 0
Example #5
    def test_mainpage_onereview_nolang(self):
        wids = ['5266585']
        spider = self.mock_spider(spiderreviews.LibraryThing, wids)
        meta = {'work_index': 0}
        req_gen = spider.parse_mainpage(mock_from_file("mockobject/LTWorkmain_%s_onerev_nolang.html" % wids[0],
                                                        url=spider.url_mainpage % wids[0],
                                                        response_type="Html",
                                                        meta=meta))
        form_req = req_gen.next()
        self.assertEqual(spider.works_to_harvest[0]['last_harvest_date'], spider.min_harvest_date)
        self.assertEqual(form_req.meta['work_index'], 0)
        self.assertEqual(form_req.meta['passed_item']['tags_n'], u'1;2;2;1;1;2;1;2;1;1;1;7;1;2;1;1;1;4;1;4;1;3;1;1;4;1;1;7;1;1')
        self.assertEqual(form_req.meta['passed_item']['tags_t'], u'2008__&__21st century__&___global_reads__&__AF__&__BTBA longlist 2012__&__Canada__&__Canadian__&__Canadian literature__&__contemporary literature__&__diaspora__&__ethnic identity__&__fiction__&__French literature__&__Haiti__&__Haitian/Canadian/Japanese__&__home__&__importjan__&__Japan__&__Japan - Novel__&__littérature québécoise__&__my library__&__novel__&__postmodern__&__Q4 12__&__Quebec__&__stream of consciousness__&__to-buy__&__to-read__&__world fiction__&__writing')
        self.assertEqual(form_req.meta['passed_item']['tags_lang'], u'eng')
        self.assertEqual(form_req.meta['passed_item']['work_refid'], wids[0])
        self.assertIsNone(form_req.meta['passed_item'].get('review_lang'))
        self.assertIsNone(form_req.meta['passed_item'].get('dup_refid'))

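        # verify the review-form POST parameters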
        form_body = form_req.body
        self.assertTrue(form_body.find('workid=%s' % wids[0]) != -1)
        self.assertTrue(form_body.find('sort=0') != -1)
        self.assertTrue(form_body.find('languagePick=all') != -1)
        self.assertTrue(form_body.find('showCount=21') != -1)
        try:
            req_gen.next()
            self.fail("no more request expected")
        except StopIteration:
            pass
Example #6
    def test_mainpage_revs_nolang(self):
        wids = ['413508']
        spider = self.mock_spider(spiderreviews.LibraryThing, wids)
        meta = {'work_index': 0}
        req_gen = spider.parse_mainpage(mock_from_file("mockobject/LTWorkmain_%s_revs_nolang.html" % wids[0],
                                                       url=spider.url_mainpage % wids[0],
                                                       response_type="Html",
                                                       meta=meta))
        form_req = req_gen.next()
        self.assertEqual(spider.works_to_harvest[0]['last_harvest_date'], spider.min_harvest_date)
        self.assertEqual(form_req.meta['work_index'], 0)

        self.assertTrue(form_req.body.find('workid=413508') != -1)
        self.assertTrue(form_req.body.find('sort=0') != -1)
        self.assertTrue(form_req.body.find('languagePick=all') != -1)
        self.assertTrue(form_req.body.find('showCount=29') != -1)

        passed_item = form_req.meta['passed_item']
        self.assertIsNone(passed_item.get('review_lang'))
        self.assertEqual(passed_item['work_refid'], wids[0])
        self.assertIsNone(passed_item.get('dup_refid'))
        self.assertIsNone(passed_item.get('tags_t'))
        self.assertIsNone(passed_item.get('tags_n'))
        try:
            req_gen.next()
            self.fail("no more request expected")
        except StopIteration:
            pass
Example #7
    def test_parse_2authors_ref(self):
        spider = spiderref.WorkReference(works_to_harvest=None, dump_filepath=None)
        meta = {'wid': '1'}
        # validate parse_work()
        ref_items = spider.parse_work(
                mock_from_file("mockobject/Workdetails_2authors_work.html",
                               response_type="Html",
                               url="https://www.librarything.com/work/1/workdetails",
                               meta=meta))

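        # a single reference item is expected, carrying both authors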
        item = ref_items.next()
        self.assertEqual(item['title'], u"Nick & Norah's Infinite Playlist")
        self.assertEqual(item['authors'], u'Rachel Cohn;David Levithan')
        self.assertEqual(item['authors_code'], u'cohnrachel;levithandavid')
        self.assertEqual(item['original_lang'], u'English')
        self.assertEqual(item['ori_lang_code'], u'ENG')
        self.assertEqual(item['mds_code'], u'813.6')
        self.assertEqual(item['mds_text'], u'Literature-->American And Canadian-->Fiction-->21st Century')
        self.assertEqual(item['lc_subjects'], u'Love->Fiction__&__New York (N.Y.)->Fiction__&__Rock groups->Fiction')
        self.assertEqual(item['popularity'], u'2,552')
        self.assertEqual(item['other_lang_title'],
                         u'Czech : Nick a Norah: až do ochraptění__&__'
                         u'Dutch : Nick en Norah__&__'
                         u'French : Une nuit a New York__&__'
                         u'German : Nick & Norah - Soundtrack einer Nacht__&__'
                         u'Swedish : Nick & Norahs oändliga låtlista')
Example #8
    def test_parse_ref(self):
        spider = spiderref.WorkReference(works_to_harvest=None, dump_filepath=None)
        meta = {'wid': '1'}
        # validate parse_work()
        ref_items = spider.parse_work(
                mock_from_file("mockobject/Workdetails_Rien_de_grave_LibraryThing.html",
                               response_type="Html",
                               url="https://www.librarything.com/work/1/workdetails",
                               meta=meta))

        item = ref_items.next()
        self.assertEqual(item['work_refid'], '1')
        self.assertIsNone(item.get('dup_refid'))
        self.assertEqual(item['title'], u'Rien de grave')
        self.assertEqual(item['authors'], u'Justine Lévy')
        self.assertEqual(item['authors_code'], u'levyjustine')
        self.assertEqual(item['original_lang'], u'French')
        self.assertEqual(item['ori_lang_code'], u'FRE')
        self.assertEqual(item['mds_code'], u'848')
        self.assertEqual(item['mds_text'], u'Literature-->French-->Authors, French and French miscellany')
        self.assertIsNone(item.get('lc_subjects'))
        self.assertEqual(item['popularity'], u'97,109')
        self.assertEqual(item['other_lang_title'],
                         u'French : Rien de grave__&__'
                         u'German : Nicht so tragisch__&__'
                         u'Italian : Niente di grave__&__'
                         u'Swedish : Inget allvarligt')
        try:
            ref_items.next()
            self.fail("no more item expected")
        except StopIteration:
            pass
Example #9
    def test_start_request_and_search(self):
        spider = self.mock_spider(spiderreviews.Babelio, [1000, 2000], [['0141345632', '1111111111111'], ['1111111111112']])
        start_reqs = spider.start_requests()

        form_req = start_reqs.next()
        self.assertEqual(form_req.meta['work_index'], 0)
        self.assertEqual(form_req.meta['nb_try'], 1)
        self.assertEqual(form_req.url, spider.form_search)
        self.assertTrue(form_req.body.find('Recherche=0141345632') != -1)
        self.assertTrue(form_req.body.find('item_recherche=isbn') != -1)

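        # no results for the first ISBN: the spider retries with the next one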
        r = spider.parse_search_resp(mock_from_file("mockobject/BO_nores_searchISBN_9783727297489.htm", response_type="Html", meta=form_req.meta))
        sec_form_req = r.next()
        self.assertEqual(sec_form_req.meta['work_index'], 0)
        self.assertEqual(sec_form_req.meta['nb_try'], 2)
        self.assertEqual(sec_form_req.url, spider.form_search)
        self.assertTrue(sec_form_req.body.find('Recherche=1111111111111') != -1)
        self.assertTrue(sec_form_req.body.find('item_recherche=isbn') != -1)
        try:
            r.next()
            self.fail("no more isbn to trigger request")
        except StopIteration:
            pass

        form_req = start_reqs.next()
        self.assertEqual(form_req.meta['work_index'], 1)
        self.assertEqual(form_req.meta['nb_try'], 1)
        self.assertEqual(form_req.url, spider.form_search)
        self.assertTrue(form_req.body.find('Recherche=1111111111112') != -1)
        self.assertTrue(form_req.body.find('item_recherche=isbn') != -1)

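        # a work found without reviews yields an item carrying the work identifiers, title and authors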
        res_norev = spider.parse_search_resp(mock_from_file("mockobject/BO_found_noreviews_searchISBN_9782010174629.htm",
                                                            response_type="Html",
                                                            meta=form_req.meta,
                                                            url=spider.url_main + "dummy/wuid"))
        item = res_norev.next()
        self.assertEqual(item['work_refid'], 2000)
        self.assertEqual(item['work_uid'], u'Nougier-Les-Celtes--La-vie-privee-des-hommes--Au-temps-d/135501')
        self.assertEqual(item['site_logical_name'], 'babelio')
        self.assertEqual(item['title'], u'Les Celtes : La vie privée des hommes : Au temps des Gaulois')
        self.assertEqual(item['authors'], u'Louis-René Nougier')
        try:
            res_norev.next()
            self.fail("no more item expected")
        except StopIteration:
            pass
Example #10
    def test_parse_reviews(self):
        # do not truncate long strings in assertion-failure diagnostics
        self.maxDiff = None
        wid = 1000
        wuid = "2776527-traffic"
        spider = self.mock_spider(spiderreviews.Goodreads, [wid], [['9780590406406']])
        # mock-up
        spider.works_to_harvest[0]['last_harvest_date'] = spider.min_harvest_date
        meta = {'work_index': 0, 'item': spider.build_review_item(work_refid=wid,
                                                                  work_uid=wuid)}
        resp = spider.parse_reviews(mock_from_file("mockobject/GR_%s_reviews_p2of32.html" % wuid,
                                                   response_type="Html", meta=meta,
                                                   url=spider.url_review % (wuid, 2, 'newest')))
        #self.assertEqual(spider.current_page, 2)
        i = 0
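        # reviews occupy indexes 0-29; index 30 should be the Request for the next page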
        for item in resp:
            if i < 30:
                self.assert_not_none(item)
                self.assertEqual(item['site_logical_name'], 'goodreads')
                # self.assertEqual(item['authors'], u'Tom Vanderbilt')
                # self.assertEqual(item['title'], u'Traffic: Why We Drive the Way We Do (and What It Says About Us)')
                self.assertEqual(item['work_refid'], 1000)
                self.assertEqual(item['work_uid'], wuid)
                if i != 4:
                    self.assertEqual(item['review_lang'], u'eng')
                # check the tags (genre)
                self.assertEqual(item['tags_n'], u'386 ;92 ;83 ;52 ;20 ;20 ;17 ;13 ;9 ;7 ')
                self.assertEqual(item['tags_t'], u'Non Fiction__&__Psychology__&__Science__&__Sociology__&__Cities > Urban Planning__&__Culture__&__Social Science__&__History__&__Adult__&__Business')
                # these two validate long reviews that the page only shows partially
                if i == 0:
                    self.assertEqual(len(item['review']), 1624)
                if i == 1:
                    self.assertEqual(item['review'], u"As I said in an update to this book yesterday: Ugh. Seriously, p. 8 and I'm about to give up. "
                                                     u"In a book that I hoped to provide, well, science about traffic, he starts out repeating answers he "
                                                     u"got about traffic from asking on an internet forum. It really looks bleak right now that he's going to "
                                                     u"turn this ship around.... I read a few more pages and it was clear that it was going to go down hill. "
                                                     u"What I wanted was a book about traffic science, highway design, round-a-bout theory, etc. "
                                                     u"This is just a guy talking about \"doesn't it always seem like when you driving X happens? Well I asked "
                                                     u"some people on the internet. Some of them said X, but some of them said Y.\"")
                elif i == 3:
                    self.assertEqual(item['username'], u'Rebecca')
                    self.assertEqual(item['likes'], u'1 like')
                    self.assertEqual(item['parsed_likes'], 1)
                elif i == 5:
                    self.assertEqual(item['username'], u'Rayfes Mondal')
                    self.assertEqual(item['user_uid'], u'35461072-rayfes-mondal')
                    self.assertEqual(item['parsed_rating'], 8)
                    self.assertEqual(item['review'], u"A fascinating look at how traffic works and how we drive. Some controversial viewpoints but this was a fun book for me. There's so much idiocy in how we drive and set up our road system.")
                    self.assertEqual(item['parsed_review_date'], datetime.date(2015, 7, 14))
            # the request to next page
            if i == 30:
                self.assertEqual(type(item), scrapy.http.request.Request)
                self.assertEqual(item.url, "https://www.goodreads.com/book/show/2776527-traffic?page=3&sort=oldest")
                self.assertEqual(item.meta['lastpage_no'], 32)
            i += 1
Example #11
    def test_many_search_res(self):
        spider = self.mock_spider(spiderreviews.Amazon, [1000], [['theisbn']])
        meta = {'work_index': 0}
        iterato = spider.parse_search_resp(mock_from_file("mockobject/AZ_9036_resp_searchISBN.html",
                                                          response_type="Html",
                                                          meta=meta))
        all_wuids = list()
        for req in iterato:
            u = req.url
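            # the ASIN sits between '/product-reviews/' and '/ref=' in the review url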
            work_uid = u[u.index('/product-reviews/') + 17:u.index('/ref=')]
            all_wuids.append(work_uid)
            self.assertTrue(work_uid in ['014044114X', '0143039512', '0883683822', '1591094003', '1565481542'])
        self.assertEqual(len(all_wuids), 5)

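        # a search page with works found but no reviews yields a single bare item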
        items = spider.parse_search_resp(mock_from_file("mockobject/AZ_821501_worksFound_noreview.html",
                                                        response_type="Html",
                                                        meta=meta))
        item = items.next()
        self.assertEqual(item['work_refid'], 1000)
        try:
            items.next()
            self.fail("no more item expected")
        except StopIteration:
            pass
Example #12
    def test_parse_reviews(self):
        wid = ['2371329']
        spider = self.mock_spider(spiderreviews.LibraryThing, wid)
        # mock dependency
        spider.works_to_harvest[0]['last_harvest_date'] = spider.min_harvest_date
        # validate parse_reviews(), passing an undetermined language (u'und') to exercise automatic language detection

        meta = {'work_index': 0, 'passed_item': spider.build_review_item(work_refid=wid[0], review_lang=u'und')}
        review_items = spider.parse_reviews(mock_from_file("mockobject/LTRespReviews_%s.html" % wid[0],
                                                           response_type="Html",
                                                           meta=meta))
        usernames = []
        userids = []
        parsed_dates = []
        parsed_ratings = []
        parsed_likes = []
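        # collect per-review fields to compare against the expected lists below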
        for item in review_items:
            usernames.append(item['username'])
            userids.append(item['user_uid'])
            parsed_dates.append(item['parsed_review_date'])
            if item.get('parsed_likes'):
                parsed_likes.append(item.get('parsed_likes'))
            if item.get('parsed_rating'):
                parsed_ratings.append(item.get('parsed_rating'))
            if item['username'] == u'briconcella':
                self.assertIsNone(item.get('parsed_rating'))
                self.assertEqual(item['review_text'], u"Le meilleur d'Amélie Nothomb. J'ai découvert qu'il est au programme des terminales!")

            if item['username'] == u'yermat':
                self.assertEqual(1, item['parsed_likes'])
            else:
                self.assertIsNone(item.get('parsed_likes'))
            # language is detected automatically
            self.assertEqual(item['review_lang'], u'fre')

        all_users = u"yermat==soniaandree==grimm==briconcella==Cecilturtle"
        self.assertEqual(all_users, u"==".join(usernames))
        self.assertEqual(all_users, u"==".join(userids))
        self.assertEqual([1], parsed_likes)
        self.assertEqual([10, 8, 6, 8], parsed_ratings)
        self.assertEqual([datetime.date(2012, 11, 22),
                          datetime.date(2009, 2, 25),
                          datetime.date(2007, 3, 14),
                          datetime.date(2007, 3, 4),
                          datetime.date(2006, 5, 22)], parsed_dates)
Example #13
    def test_mainpage_revs_manylang(self):
        wids = ['2371329']
        spider = self.mock_spider(spiderreviews.LibraryThing, wids)
        meta = {'work_index': 0}

        req_gen = spider.parse_mainpage(mock_from_file("mockobject/LTWorkmain_%s_manylang.html" % wids[0],
                                                       url=spider.url_mainpage % wids[0],
                                                       response_type="Html",
                                                       meta=meta))
        form_req = req_gen.next()
        self.assertTrue(form_req.body.find('workid=2371329') != -1)
        self.assertTrue(form_req.body.find('languagePick=all') != -1)
        self.assertTrue(form_req.body.find('showCount=77') != -1)

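        # no tags and no review_lang should be preset on the passed item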
        passed_item = form_req.meta['passed_item']
        self.assertIsNone(passed_item.get('tags_t'))
        self.assertIsNone(passed_item.get('tags_n'))
        self.assertIsNone(passed_item.get('review_lang'))
Example #14
    def test_mainpage_noreview(self):
        wids = ['3440235', 'dummy']
        spider = self.mock_spider(spiderreviews.LibraryThing, wids)
        meta = {'work_index': 0}
        item_gen = spider.parse_mainpage(mock_from_file("mockobject/LTWorkmain_%s_norev.html" % wids[0],
                                                        url=spider.url_mainpage % "100",
                                                        response_type="Html",
                                                        meta=meta))
        item = item_gen.next()
        self.assertEqual(item['site_logical_name'], 'librarything')
        self.assertEqual(item['work_refid'], "100")
        self.assertEqual(item['dup_refid'], wids[0])
        self.assertEqual(item['tags_n'], u'1;1;1;1;1;1;1;1')
        self.assertEqual(item['tags_t'],
                         u'21st century__&__biography__&__fiction__&__FRA 848 LAF 2001__&__French__&__Haiti__&__littérature québécoise__&__Quebec')
        self.assertEqual(item['tags_lang'], u'eng')
        try:
            item_gen.next()
            self.fail("no more item expected")
        except StopIteration:
            pass
Example #15
    def test_startreq_and_nores(self):
        more_than_100 = [str(i) for i in range(200)]
        spider = self.mock_spider(spiderreviews.Amazon, [1], [more_than_100])
        reqs = spider.start_requests()
        r = reqs.next()
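        # only the first 100 ISBNs should be packed into the single search request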
        self.assertEqual(spider.search_url % "|".join(more_than_100[0:100]), r.url)
        m = r.meta
        self.assertEqual(m['work_index'], 0)
        try:
            reqs.next()
            self.fail("no more request expected")
        except StopIteration:
            pass

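        # a no-results page yields the placeholder item with work_uid '-1'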
        itera = spider.parse_search_resp(mock_from_file("mockobject/AZ_noresults.html", response_type="Html", meta=m))
        nores = itera.next()
        self.assertEqual(nores, ReviewItem(work_refid=1, work_uid='-1', site_logical_name='amazon.com'))
        try:
            itera.next()
            self.fail("no more item")
        except StopIteration:
            pass