Example #1
    def parse(self, response):
        # for each author block on the page, create a FOSItem per field-of-study label
        for ids in response.xpath(
                '//*[@id="gsc_ccl"]/div/div/div[@class="gsc_1usr_int"]'):
            full = ids.extract()
            fos = re.findall('=label:([^"]+)"', full)
            if fos:
                for f in fos:
                    it = ItemLoader(item=FOSItem(), response=response)
                    self.logger.debug(f)
                    it.add_value('field_name', f)
                    yield it.load_item()

        # generate the next-page URL from the pagination button
        new1 = response.xpath(
            '//*[@id="gsc_authors_bottom_pag"]/span/button[2]').extract_first(
            )
        if new1:
            new2 = re.search('mauthors(.*)\'"', new1)
            if new2:
                newUrl = str(new2.group(1)).replace('\\x3d',
                                                    '=').replace('\\x26', '&')
                newUrl = self.base_url + newUrl
                self.container.append(newUrl)
        # proceed with another random URL to randomize the access pattern to Google Scholar
        next = utils.pop_random(self.container)
        if next:
            yield Request(url=next)
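
Every example on this page relies on utils.pop_random, which is project code and not shown here. A minimal sketch of what it presumably does, assuming it removes and returns a random element and yields None for an empty container:

import random

def pop_random(container):
    # Remove and return a random element, or None when the container is empty.
    # This is an assumption about the project's utils helper, not its actual source.
    if not container:
        return None
    return container.pop(random.randrange(len(container)))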
Example #2
File: org_detail.py Project: shoraj/grespa
    def next_label_from_db(self):
        next_label = utils.pop_random(self.fields)
        if not next_label:
            return None
        enc = urllib2.quote(next_label.name.encode('utf-8')).encode('ASCII')
        self.logger.debug('Choosing existing org %s.' % enc)
        self.curr = next_label.id
        return self.pattern.format(enc)
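
A hedged illustration of the quoting step above (Python 2, to match the snippets); the pattern URL and the organisation name are made-up placeholders, not values from the project:

import urllib2

pattern = 'https://scholar.google.com/citations?view_op=view_org&mauthors={0}'  # hypothetical
name = u'Universit\xe9 de Montr\xe9al'        # made-up organisation name
enc = urllib2.quote(name.encode('utf-8'))     # percent-encode the UTF-8 bytes
url = pattern.format(enc)                     # ...mauthors=Universit%C3%A9%20de%20Montr%C3%A9al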
Example #3
    def choose_next(self):
        if random.random() > 0.5:
            if len(self.container) == 0:
                l = self.next_label_from_db()
                return l
            else:
                u = utils.pop_random(self.container)
                self.logger.debug('Choosing existing url %s.' % u)
                return u
        else:
            next_url = self.next_label_from_db()
            if next_url:
                return next_url

            next_url = utils.pop_random(self.container)
            self.logger.debug('Choosing existing url %s.' % next_url)
            return next_url
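
A sketch of how choose_next might be consumed from a Scrapy callback to keep the crawl going; the spider class and its name are assumptions, not part of this listing:

import scrapy

class AuthorSpider(scrapy.Spider):            # placeholder spider
    name = 'authors'                          # placeholder

    def parse(self, response):
        # ... extract items from the current page as in Example #1 ...
        next_url = self.choose_next()         # choose_next as defined above
        if next_url:
            yield scrapy.Request(url=next_url)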
Example #4
    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(*args, **kwargs)
        settings = get_project_settings()
        with open(settings['SEED_NAME_LIST'], mode='r') as f:
            self.container = [(self.base_url + '={0}').format(urllib.quote(i))
                              for i in f.readlines() if len(i) > 0]

        self.logger.info('Starting with %d surnames.', len(self.container))

        start = utils.pop_random(self.container)
        if start:
            self.start_urls = [start]
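
Note that readlines() keeps each line's trailing newline, so the quoted surname ends up with a trailing %0A in the URL. A variant of the same construction that strips it first; the base_url value and the seed file name are placeholders:

import urllib

base_url = 'https://scholar.google.com/citations?view_op=search_authors&mauthors'  # hypothetical
with open('names.txt') as f:                  # placeholder for SEED_NAME_LIST
    container = [(base_url + '={0}').format(urllib.quote(line.strip()))
                 for line in f if line.strip()]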
Example #5
    def choose_next(self):
        # do not choose from database if we only want to scrape the given authors
        if self.scrape_given:
            return utils.pop_random(self.container)

        if random.random() > 0.5:
            if len(self.container) == 0:
                l = self.next_author_from_db()
                return l
            else:
                u = utils.pop_random(self.container)
                self.logger.debug('Choosing existing url %s.' % u)
                return u
        else:
            next_author = self.next_author_from_db()
            if next_author:
                return next_author

            next_author = utils.pop_random(self.container)
            self.logger.debug('Choosing existing url %s.' % next_author)
            return next_author
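
The scrape_given flag checked above would typically be set from a spider argument; a sketch under that assumption (the spider and argument names are placeholders), invoked as: scrapy crawl authors -a scrape_given=1

import scrapy

class AuthorSpider(scrapy.Spider):            # placeholder spider
    name = 'authors'                          # placeholder

    def __init__(self, scrape_given=False, *args, **kwargs):
        super(AuthorSpider, self).__init__(*args, **kwargs)
        # Scrapy passes -a arguments to __init__ as strings, so any non-empty
        # value enables the "only scrape the given authors" behaviour.
        self.scrape_given = bool(scrape_given)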
Example #6
File: org_detail.py Project: shoraj/grespa
    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(*args, **kwargs)

        # fields from the database
        self.fields = self.all_fields()

        # select an organisation to start with
        if self.fields:
            start_org = utils.pop_random(self.fields)
            print 'starting with org %s ' % start_org.name
            enc = urllib2.quote(start_org.name.encode('utf-8')).encode('ASCII')
            self.curr = start_org.id
            self.start_urls = [self.pattern.format(enc)]
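
The all_fields() helper is not shown on this page; judging from the attributes used above, it presumably returns database rows exposing .id and .name. A stand-in sketch using a namedtuple instead of the project's real ORM rows:

import collections

OrgRow = collections.namedtuple('OrgRow', ['id', 'name'])   # stand-in for a database row

def all_fields():
    # In the spider this is a method backed by a database query;
    # a static list stands in for the sketch.
    return [OrgRow(1, u'Example University'), OrgRow(2, u'Another Institute')]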
Example #7
    def choose_next(self):
        # do not choose from database if we only want to scrape the given authors
        return utils.pop_random(self.container)