示例#1
0
    def parse(self, response):
        """Yield one ``PsyscraperItem`` per queried title heading.

        Headings are looked up by position below every element with
        ``id="Content"``: ``h3[1]``..``h3[7]`` inside ``table//table[1]`` and
        ``h3[8]``..``h3[152]`` inside ``table//table[2]``.

        Args:
            response: Object exposing ``xpath()`` (presumably a Scrapy
                response -- confirm against the spider class).

        Yields:
            PsyscraperItem: Item whose ``'title'`` field holds whatever
            ``extract()`` returns for the heading's text nodes.
        """
        # (nested table position, h3 positions queried inside that table).
        title_sources = (
            (1, range(1, 8)),
            (2, range(8, 153)),
        )
        # The container query does not depend on the position: run it once.
        containers = response.xpath('//*[@id="Content"]')
        for table_pos, heading_positions in title_sources:
            for heading_pos in heading_positions:
                for container in containers:
                    item = PsyscraperItem()
                    item['title'] = container.xpath(
                        'table//table[%s]//h3[%s]/text()' %
                        (table_pos, heading_pos)).extract()
                    # TODO: link extraction is disabled. The draft read
                    # 'table//table[N]//p[4 * position]/a/@href', kept the
                    # single-quoted part of the result and prefixed it with
                    # 'http://www.icml2010.org/'.
                    yield item
示例#2
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per queried title heading.

     ``h3[2]``..``h3[161]`` are looked up by position directly below every
     element with ``id="right_column"``.

     Args:
         response: Object exposing ``xpath()`` (presumably a Scrapy
             response -- confirm against the spider class).

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds whatever
         ``extract()`` returns for the heading's text nodes.
     """
     # The title XPath is the same for the whole run, so one range covers
     # both halves of the paper list (2..82 and 83..161).
     heading_positions = range(2, 162)
     # The container query does not depend on the position: run it once.
     containers = response.xpath('//*[@id="right_column"]')
     for heading_pos in heading_positions:
         for container in containers:
             item = PsyscraperItem()
             item['title'] = container.xpath('h3[%s]/text()' %
                                             (heading_pos)).extract()
             # TODO: link extraction is disabled. The draft read
             # 'a[K]/@href' with K = 2 * position - 3 for positions 2..82
             # and K = 2 * position - 2 for positions 83..161 (original
             # note: "XPath naming scheme changed halfway through"), kept
             # the single-quoted part of the result and prefixed it with
             # 'http://machinelearning.org/archive/icml2009/'.
             yield item
示例#3
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` for every ``li`` whose class has "title".

     ``contains()`` is deliberate: per the original author's note, Elsevier
     switched attribute names halfway through, so the match is generalized
     to any ``li`` whose ``class`` attribute contains "title".

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the anchors below the matched ``li``.
     """
     title_entries = response.xpath('//li[contains(@class,"title")]')
     for entry in title_entries:
         record = PsyscraperItem()
         anchor_text = entry.xpath('.//a/text()')
         record['title'] = anchor_text.extract()
         yield record
示例#4
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` for every ``h3`` whose class has "title".

     Per the original author's note, Springer uses an "embed" tag for papers
     that reference other papers in their title. That case is ignored for
     now and may be revisited if many partial or cut-off titles show up.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the anchors below the matched ``h3``.
     """
     headings = response.xpath('//h3[contains(@class,"title")]')
     for heading in headings:
         record = PsyscraperItem()
         anchor_text = heading.xpath('.//a/text()')
         record['title'] = anchor_text.extract()
         yield record
示例#5
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per queried paragraph position.

     For every position from 15 to 504 the text of the first anchor in
     ``p[position]`` is extracted, once per ``/html/body/table[2]`` match.

     Args:
         response: Object exposing ``xpath()`` (presumably a Scrapy
             response -- confirm against the spider class).

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds whatever
         ``extract()`` returns for the anchor's text nodes.
     """
     # The container query does not depend on the position: run it once.
     containers = response.xpath('/html/body/table[2]')
     for paragraph_pos in range(15, 505):
         for container in containers:
             item = PsyscraperItem()
             # NOTE(review): the expression starts with '//', which XPath
             # resolves from the document root rather than from
             # ``container`` -- confirm whether './/p[...]' was intended.
             item['title'] = container.xpath('//p[%i]/a[1]/text()' %
                                             (paragraph_pos)).extract()
             yield item
示例#6
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per queried title heading.

     ``h3[1]``..``h3[158]`` are looked up by position directly below every
     ``/html/body/div/div[3]`` match.

     Args:
         response: Object exposing ``xpath()`` (presumably a Scrapy
             response -- confirm against the spider class).

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds whatever
         ``extract()`` returns for the heading's text nodes.
     """
     # The container query does not depend on the position: run it once.
     containers = response.xpath('/html/body/div/div[3]')
     for heading_pos in range(1, 159):
         for container in containers:
             item = PsyscraperItem()
             item['title'] = container.xpath('h3[%s]/text()' %
                                             (heading_pos)).extract()
             # TODO: link extraction is disabled. The draft read
             # 'p[4 * position]/a[1]/@href' from the same container.
             yield item
示例#7
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per queried anchor position.

     For every position from 9 to 160 the ``h3`` text inside
     ``tr[1]/td[2]/a[position]`` is extracted, once per element whose
     ``style`` attribute equals ``"width:900px"``.

     Args:
         response: Object exposing ``xpath()`` (presumably a Scrapy
             response -- confirm against the spider class).

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds whatever
         ``extract()`` returns for the heading's text nodes.
     """
     # The container query does not depend on the position: run it once.
     containers = response.xpath('//*[@style="width:900px"]')
     for anchor_pos in range(9, 161):
         for container in containers:
             item = PsyscraperItem()
             item['title'] = container.xpath('tr[1]/td[2]/a[%s]/h3/text()' %
                                             (anchor_pos)).extract()
             # TODO (low priority, dropped due to time constraints): link
             # extraction. The draft read 'p[position]/a[2]/@href', kept the
             # double-quoted part of the result and prefixed it with
             # 'http://www.icml-2011.org/'.
             yield item
示例#8
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per ``div`` under ``#content/dl``.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the first ``p`` child of the matched ``div``.
     """
     entry_blocks = response.xpath('//*[@id="content"]/dl/div')
     for block in entry_blocks:
         record = PsyscraperItem()
         first_paragraph_text = block.xpath('p[1]/text()')
         record['title'] = first_paragraph_text.extract()
         yield record
示例#9
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per first anchor of each list entry.

     The anchors are selected with the absolute path
     ``/html/body/div[2]/div/ul/li/a[1]``.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the matched anchor.
     """
     first_anchors = response.xpath('/html/body/div[2]/div/ul/li/a[1]')
     for anchor in first_anchors:
         record = PsyscraperItem()
         own_text = anchor.xpath('./text()')
         record['title'] = own_text.extract()
         yield record
示例#10
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per element with ``class="detail"``.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds every extracted
         text node found at any depth below ``h3/a`` of the matched element.
     """
     detail_nodes = response.xpath('//*[@class="detail"]')
     for detail in detail_nodes:
         record = PsyscraperItem()
         heading_text = detail.xpath('h3/a//text()')
         record['title'] = heading_text.extract()
         # TODO: link extraction is disabled; the draft read 'h3/a/@href'.
         yield record
示例#11
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per ``td`` element in the document.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the anchors directly below the matched cell.
     """
     cells = response.xpath('//td')
     for cell in cells:
         record = PsyscraperItem()
         anchor_text = cell.xpath('a/text()')
         record['title'] = anchor_text.extract()
         # TODO: link extraction is disabled; the draft read 'a/@href'.
         yield record
示例#12
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per element with ``class="doctitle"``.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         text nodes of the anchors directly below the matched element.
     """
     title_nodes = response.xpath('//*[@class="doctitle"]')
     for node in title_nodes:
         record = PsyscraperItem()
         anchor_text = node.xpath('a/text()')
         record['title'] = anchor_text.extract()
         yield record
示例#13
0
 def parse(self, response):
     """Yield one ``PsyscraperItem`` per ``div`` under ``#content/div``.

     The title is read from the ``content`` attribute of the direct child
     ``meta`` element whose ``name`` is ``"citation_title"``.

     Yields:
         PsyscraperItem: Item whose ``'title'`` field holds the extracted
         attribute values.
     """
     entry_blocks = response.xpath('//*[@id="content"]/div/div')
     for block in entry_blocks:
         record = PsyscraperItem()
         citation_title = block.xpath(
             './meta[@name="citation_title"]/@content')
         record['title'] = citation_title.extract()
         yield record