def parse(self, response):
    """Yield one ``PsyscraperItem`` per indexed ``<h3>`` title on the page.

    Titles are read relative to every node matching ``//*[@id="Content"]``:
    ``h3[1]`` .. ``h3[7]`` inside ``table//table[1]`` first, then
    ``h3[8]`` .. ``h3[152]`` inside ``table//table[2]``.

    Args:
        response: Object exposing ``xpath(query)``; the nodes it returns
            must support ``xpath(query).extract()``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field holds whatever
        ``extract()`` returned for that position.
    """
    # (nested table position, h3 positions to read from that table)
    sections = (
        (1, range(1, 8)),
        (2, range(8, 153)),
    )
    # The container query does not depend on the title position, so it is
    # evaluated once instead of once per position.
    containers = response.xpath('//*[@id="Content"]')
    for table_pos, title_positions in sections:
        for title_pos in title_positions:
            title_query = 'table//table[%s]//h3[%s]/text()' % (table_pos, title_pos)
            for container in containers:
                item = PsyscraperItem()
                item['title'] = container.xpath(title_query).extract()
                # TODO: link extraction is disabled. The retired code read
                # 'table//table[N]//p[4 * title_pos]/a/@href', pulled the
                # single-quoted part out with a regex, and prefixed it with
                # 'http://www.icml2010.org/'.
                yield item
def parse(self, response):
    """Yield one ``PsyscraperItem`` per paper title in the right column.

    Titles are the text of ``h3[2]`` .. ``h3[161]`` relative to every node
    matching ``//*[@id="right_column"]``, in ascending position order.

    Args:
        response: Object exposing ``xpath(query)``; the nodes it returns
            must support ``xpath(query).extract()``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field holds whatever
        ``extract()`` returned for that position.
    """
    # The container query does not depend on the title position, so it is
    # evaluated once instead of once per position.
    columns = response.xpath('//*[@id="right_column"]')
    # Positions 2-82 and 83-161 used to be walked by two separate loops
    # because the link numbering changes halfway through the page. The
    # title query is the same for both halves, so one range covers them.
    for title_pos in range(2, 162):
        title_query = 'h3[%s]/text()' % title_pos
        for column in columns:
            item = PsyscraperItem()
            item['title'] = column.xpath(title_query).extract()
            # TODO: link extraction is disabled. The retired code read
            # 'a[2 * title_pos - 3]/@href' for positions below 83 and
            # 'a[2 * title_pos - 2]/@href' from 83 onwards, then prefixed
            # the result with 'http://machinelearning.org/archive/icml2009/'.
            yield item
def parse(self, response):
    """Yield a ``PsyscraperItem`` for every ``<li>`` whose class mentions "title".

    ``contains(@class, "title")`` is used rather than an exact class match
    because Elsevier switched attribute names partway through; matching on
    the substring covers any ``<li>`` class containing "title".

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the anchors found beneath the matched ``<li>``.
    """
    title_entries = response.xpath('//li[contains(@class,"title")]')
    for entry in title_entries:
        anchor_text = entry.xpath('.//a/text()')
        record = PsyscraperItem()
        record['title'] = anchor_text.extract()
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` for every ``<h3>`` whose class mentions "title".

    Known limitation: Springer uses an "embed" tag for papers that reference
    other papers in their title. That case is ignored for now and may need
    revisiting if many partial or cut-off titles show up.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the anchors found beneath the matched heading.
    """
    headings = response.xpath('//h3[contains(@class,"title")]')
    for heading in headings:
        anchor_text = heading.xpath('.//a/text()')
        record = PsyscraperItem()
        record['title'] = anchor_text.extract()
        yield record
def parse(self, response):
    """Yield one ``PsyscraperItem`` per paragraph position 15 .. 504.

    For each position the text of the first anchor in ``p[position]`` is
    extracted via every node matching ``/html/body/table[2]``.

    Args:
        response: Object exposing ``xpath(query)``; the nodes it returns
            must support ``xpath(query).extract()``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field holds whatever
        ``extract()`` returned for that position.
    """
    # The container query does not depend on the paragraph position, so it
    # is evaluated once instead of once per position.
    tables = response.xpath('/html/body/table[2]')
    for para_pos in range(15, 505):
        # NOTE(review): per the Scrapy selector docs an XPath starting with
        # '/' is evaluated against the whole document rather than relative
        # to `table`; './/p[...]' may have been intended. Left unchanged
        # because the position range was presumably tuned to this query.
        title_query = '//p[%i]/a[1]/text()' % para_pos
        for table in tables:
            item = PsyscraperItem()
            item['title'] = table.xpath(title_query).extract()
            yield item
def parse(self, response):
    """Yield one ``PsyscraperItem`` per heading position 1 .. 158.

    Titles are the text of ``h3[position]`` relative to every node matching
    ``/html/body/div/div[3]``.

    Args:
        response: Object exposing ``xpath(query)``; the nodes it returns
            must support ``xpath(query).extract()``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field holds whatever
        ``extract()`` returned for that position.
    """
    # The container query does not depend on the heading position, so it is
    # evaluated once instead of once per position.
    containers = response.xpath('/html/body/div/div[3]')
    for title_pos in range(1, 159):
        title_query = 'h3[%s]/text()' % title_pos
        for container in containers:
            item = PsyscraperItem()
            item['title'] = container.xpath(title_query).extract()
            # TODO: link extraction is disabled. The retired code read
            # 'p[4 * title_pos]/a[1]/@href' from the same container.
            yield item
def parse(self, response):
    """Yield one ``PsyscraperItem`` per anchor position 9 .. 160.

    Titles are the text of ``tr[1]/td[2]/a[position]/h3`` relative to every
    node matching ``//*[@style="width:900px"]``.

    Args:
        response: Object exposing ``xpath(query)``; the nodes it returns
            must support ``xpath(query).extract()``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field holds whatever
        ``extract()`` returned for that position.
    """
    # The container query does not depend on the anchor position, so it is
    # evaluated once instead of once per position.
    containers = response.xpath('//*[@style="width:900px"]')
    for anchor_pos in range(9, 161):
        title_query = 'tr[1]/td[2]/a[%s]/h3/text()' % anchor_pos
        for container in containers:
            item = PsyscraperItem()
            item['title'] = container.xpath(title_query).extract()
            # TODO (low priority): link extraction was dropped due to time
            # constraints. The retired code read 'p[anchor_pos]/a[2]/@href',
            # pulled the double-quoted part out with a regex, and prefixed
            # it with 'http://www.icml-2011.org/'.
            yield item
def parse(self, response):
    """Yield a ``PsyscraperItem`` for each ``<div>`` under the content ``<dl>``.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the first ``<p>`` child of the matched ``<div>``.
    """
    entries = response.xpath('//*[@id="content"]/dl/div')
    for entry in entries:
        first_paragraph = entry.xpath('p[1]/text()')
        record = PsyscraperItem()
        record['title'] = first_paragraph.extract()
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` for the first anchor of each list entry.

    Anchors are located with the fixed path
    ``/html/body/div[2]/div/ul/li/a[1]``.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the matched anchor.
    """
    anchors = response.xpath('/html/body/div[2]/div/ul/li/a[1]')
    for anchor in anchors:
        anchor_text = anchor.xpath('./text()')
        record = PsyscraperItem()
        record['title'] = anchor_text.extract()
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` for every element with class ``detail``.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of all text nodes beneath ``h3/a`` in the matched element.
    """
    detail_nodes = response.xpath('//*[@class="detail"]')
    for detail in detail_nodes:
        heading_text = detail.xpath('h3/a//text()')
        record = PsyscraperItem()
        record['title'] = heading_text.extract()
        # Link extraction from 'h3/a/@href' is currently disabled.
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` for every ``<td>`` in the document.

    An item is produced for each cell matched by ``//td``, whatever the
    anchor query returns for it.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the cell's direct ``<a>`` children.
    """
    cells = response.xpath('//td')
    for cell in cells:
        anchor_text = cell.xpath('a/text()')
        record = PsyscraperItem()
        record['title'] = anchor_text.extract()
        # Link extraction from 'a/@href' is currently disabled.
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` for every element with class ``doctitle``.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted text
        of the element's direct ``<a>`` children.
    """
    title_nodes = response.xpath('//*[@class="doctitle"]')
    for title_node in title_nodes:
        anchor_text = title_node.xpath('a/text()')
        record = PsyscraperItem()
        record['title'] = anchor_text.extract()
        yield record
def parse(self, response):
    """Yield a ``PsyscraperItem`` per result block, titled from its metadata.

    Result blocks are the nodes matching ``//*[@id="content"]/div/div``; the
    title comes from the ``content`` attribute of each block's
    ``<meta name="citation_title">`` child.

    Args:
        response: Object exposing ``xpath(query)``.

    Yields:
        PsyscraperItem: Item whose ``'title'`` field is the extracted
        ``content`` attribute value(s).
    """
    result_blocks = response.xpath('//*[@id="content"]/div/div')
    for block in result_blocks:
        citation_title = block.xpath('./meta[@name="citation_title"]/@content')
        record = PsyscraperItem()
        record['title'] = citation_title.extract()
        yield record