def parse(self, response):
    """Crawl a LinkedIn people-directory page or scrape a public profile.

    This is a generator used as the spider callback:

    * A directory page (``<body id="pagekey-seo_people_directory">``) yields
      one ``Request`` per listed link, each calling back into ``parse``.
    * A profile page (one carrying a ``pageImpressionID`` meta tag) yields a
      single populated ``linkedInItem``, provided it passes the location
      filter.
    * Any other page yields nothing.

    Besides ``response`` the method reads the names ``randomSampling``,
    ``samplingProbability`` and ``filterForUS`` and calls ``striplist`` and
    ``checkLocation``, all of which are defined outside this method.
    """

    def take(values):
        # Remove and return the first remaining entry of `values`; an empty
        # list is the marker stored for every slot that has no data.
        return values.pop(0) if values else []

    def texts(xpath):
        # Cleaned-up text results of a page-level XPath query.
        return striplist(hxs.select(xpath).extract())

    def nested_texts(xpath, relative_paths):
        # For every node matched by `xpath`, gather the results of each
        # relative path in turn (the wanted text may sit directly in the
        # node or inside a nested element), then clean the combined list.
        found = []
        for scope in hxs.select(xpath):
            for relative in relative_paths:
                found += scope.xpath(relative).extract()
        return striplist(found)

    def education_columns(div_xpath):
        # School, degree, major, start and end lists (in that order) for
        # the education entries rooted at `div_xpath`.
        return [
            nested_texts(div_xpath + '/h3[@class="summary fn org"]',
                         ('a/text()', 'text()')),
            texts(div_xpath + '/h4/span[@class="degree"]/text()'),
            texts(div_xpath + '/h4/span[@class="major"]/text()'),
            texts(div_xpath + '/p[@class="period"]/abbr[@class="dtstart"]/text()'),
            texts(div_xpath + '/p[@class="period"]/abbr[@class="dtend"]/text()'),
        ]

    hxs = HtmlXPathSelector(response)

    # if it is a directory
    if hxs.select('//body[@id="pagekey-seo_people_directory"]'):
        # take all of the subdirectories that show up and request them
        for url in hxs.select(
                '//ul[@class="column dual-column"]/li/a/@href').extract():
            url = url.encode('utf-8')
            # Links that do not mention the host get the site prefix.
            if "linkedin.com" not in url:
                url = "https://www.linkedin.com" + url
            # Random sampling: the link is skipped when the draw exceeds
            # samplingProbability.
            if randomSampling and random.random() > samplingProbability:
                continue
            yield Request(url, callback=self.parse)

    # if it is not a directory (its a regular page)
    elif hxs.select('//meta[@name="pageImpressionID"]'):
        # Debug dump of the page being parsed; "w+" truncates the file, so
        # it only ever holds the most recent profile.  The context manager
        # guarantees the handle is closed even if a write fails.
        with open("html.txt", "w+") as dump:
            dump.write(response.url)
            dump.write("\n\n")
            dump.write(response.body)

        item = linkedInItem()
        item['url'] = response.url
        item['headlineTitle'] = texts('//p[@class="headline title"]/text()')
        HTMLtitle = texts('//title/text()')
        # The name is the part of <title> before the first '|'.  A page
        # without title text gets an empty list instead of an IndexError.
        item['name'] = [HTMLtitle[0].split('|')[0].strip()] if HTMLtitle else []
        item['location'] = texts('//dd/span/text()')

        if not filterForUS or checkLocation(item['location']):
            item['industry'] = texts('//dd[@class="descriptor"]/text()')
            item['overviewCurrent'] = texts('//li/span[@class="org"]/text()')
            item['overviewPast'] = texts("//td/ol/li/text()")
            # TODO: overviewEducation and overviewPast have the same xpath...
            item['overviewEducation'] = texts('//td/ol/li/text()')
            # 'recommendations' and 'websites' are not scraped.
            item['connections'] = texts(
                '//div[@class="member-connections"]/strong/text()')
            item['descriptionSummary'] = texts(
                '//section/div[@class="description"]/p/text()')
            # TODO: broken from here on down.
            item['summarySpecialties'] = texts(
                '//div[@id="profile-specialties"]/p/text()')

            # ----------------------------------------------------------
            # Education
            # ----------------------------------------------------------
            edu_keys = ('educationSchoolName', 'educationDegree',
                        'educationMajor', 'eduTimeStart', 'eduTimeEnd')
            first_entry = education_columns(
                '//div[@class="position first education vevent vcard"]')
            other_entries = education_columns(
                '//div[@class="position education vevent vcard"]')

            # Slot 1 comes from the "first" education div when it names a
            # school, otherwise from the head of the remaining entries.
            if first_entry[0]:
                primary = first_entry
            elif other_entries[0]:
                primary = other_entries
            else:
                primary = [[], [], [], [], []]
            for key, column in zip(edu_keys, primary):
                item[key + '1'] = take(column)

            # Slots 2 and 3 consume whatever is left of the other entries.
            for key, column in zip(edu_keys, other_entries):
                item[key + '2'] = take(column)
                item[key + '3'] = take(column)

            # ----------------------------------------------------------
            # Work Experience
            # ----------------------------------------------------------
            # Work Experience: Title
            item['experienceHeads'] = texts('//h3/span[@class="title"]/text()')

            # Work Experience: Company (bare or wrapped in a link)
            item['expCompany'] = nested_texts(
                '//h4/strong',
                ('span[@class="org summary"]/text()',
                 'a/span[@class="org summary"]/text()'))

            # Work Experience: Time started
            item['expTimeStarts'] = texts(
                '//div[contains(@class, "experience")]/p/abbr[@class="dtstart"]/text()')

            # Work Experience: Time ended ("dtstamp" entries listed first)
            timePresent = texts(
                '//div[contains(@class, "experience")]/p/abbr[@class="dtstamp"]/text()')
            expTimeEnds = texts(
                '//div[contains(@class, "experience")]/p/abbr[@class="dtend"]/text()')
            item['expTimeEnds'] = timePresent + expTimeEnds

            # Work Experience: Time duration
            item['expTimeDurations'] = texts(
                '//div[contains(@class, "experience")]/p/span[@class="duration"]/text()')

            # Position descriptions are not collected.
            yield item
def parse(self, response):
    """Scrape a profile page, or follow directory links, from ``response``.

    This is a generator used as the spider callback.  When ``filterForUS``
    is truthy the response is scraped as a profile and one ``linkedInItem``
    is yielded; otherwise the response is treated as a directory page and a
    ``Request`` (calling back into ``parse``) is yielded for each sampled
    link.

    NOTE(review): the profile/directory decision hangs on ``filterForUS``
    alone -- the page itself is never inspected.  That looks unintended;
    confirm before relying on it.

    The method reads ``filterForUS``, ``randomSampling`` and
    ``samplingProbability`` and calls ``striplist``, ``find_between`` and
    ``strip_tags``, all defined outside this method.
    """

    def take(values):
        # Remove and return the first remaining entry of `values`; an empty
        # list is the marker stored for every slot that has no data.
        return values.pop(0) if values else []

    def take_school(values):
        # Like take(), but school entries may still contain markup, so the
        # tags are removed and the result trimmed.
        return strip_tags(values.pop(0)).strip() if values else []

    def texts(xpath):
        # Cleaned-up text results of a page-level XPath query.
        return striplist(hxs.select(xpath).extract())

    def texts_either(xpath, fallback_xpath):
        # Results of `xpath`, or of `fallback_xpath` when the first query
        # finds nothing (the same data appears under two class names).
        return texts(xpath) or texts(fallback_xpath)

    def between(start, end):
        # Text between two markers in the raw body, cut at the first '|'.
        return find_between(response.body, start, end).strip().split('|')[0].strip()

    hxs = HtmlXPathSelector(response)
    item = linkedInItem()
    item['url'] = response.url
    item['headlineTitle'] = texts('//p[@class="headline-title title"]/text()')
    item['name'] = between("<title>", "</title>")
    item['location'] = between('<span class="locality">', '</span>')

    if filterForUS:
        item['industry'] = between('<dd class="industry">', '</dd>')
        item['overviewCurrent'] = texts(
            '//dd[@class="summary-current"]/ul[@class="current"]/li/text()')
        item['overviewPast'] = texts(
            '//dd[@class="summary-past"]/ul[@class="past"]/li/text()')
        item['overviewEducation'] = texts(
            '//dd[@class="summary-education"]/ul/li/text()')
        # 'recommendations' and 'websites' are not scraped.
        item['connections'] = texts(
            '//dd[@class="overview-connections"]/p/strong/text()')
        item['descriptionSummary'] = texts(
            '//p[@class=" description summary"]/text()')
        item['summarySpecialties'] = texts(
            '//div[@id="profile-specialties"]/p/text()')

        # --------------------------------------------------------------
        # Education
        # --------------------------------------------------------------
        first_div = '//div[@class="position first education vevent vcard"]'
        other_div = '//div[@class="position education vevent vcard"]'
        detail_paths = (
            ('educationDegree', '/h4/span[@class="degree"]/text()'),
            ('educationMajor', '/h4/span[@class="major"]/text()'),
            ('eduTimeStart', '/p[@class="period"]/abbr[@class="dtstart"]/text()'),
            ('eduTimeEnd', '/p[@class="period"]/abbr[@class="dtend"]/text()'),
        )

        # Education: School Names.  The first school is cut out of the raw
        # body; the others are extracted as whole <h3> elements (hence the
        # tag stripping in take_school).
        # NOTE(review): this list always has exactly one element (possibly
        # an empty string), so the fallbacks below are never taken.
        firstEducationSchool = [between('<h3 class="summary fn org">', '</h3>')]
        schoolNames = texts(other_div + '/h3[@class="summary fn org"]')

        first_details = [texts(first_div + path) for _, path in detail_paths]
        other_details = [texts(other_div + path) for _, path in detail_paths]

        if firstEducationSchool:
            school_source, detail_source = firstEducationSchool, first_details
        elif schoolNames:
            school_source, detail_source = schoolNames, other_details
        else:
            school_source, detail_source = [], [[], [], [], []]

        item['educationSchoolName1'] = take_school(school_source)
        for (key, _), column in zip(detail_paths, detail_source):
            item[key + '1'] = take(column)

        # Slots 2 and 3 consume whatever is left of the non-first entries.
        item['educationSchoolName2'] = take_school(schoolNames)
        item['educationSchoolName3'] = take_school(schoolNames)
        for (key, _), column in zip(detail_paths, other_details):
            item[key + '2'] = take(column)
            item[key + '3'] = take(column)

        # Summary list with one record per education slot that has a
        # degree.  A missing degree is stored as [], which has no .strip(),
        # so test for emptiness before trimming.
        education = []
        for slot in ('1', '2', '3'):
            degree = item['educationDegree' + slot]
            if degree and degree.strip() != '':
                education.append({
                    'title': degree,
                    'industry': item['educationMajor' + slot],
                    'start': item['eduTimeStart' + slot],
                    'end': item['eduTimeEnd' + slot],
                })
        item['education'] = education

        # --------------------------------------------------------------
        # Work Experience
        # --------------------------------------------------------------
        # Work Experience: title
        item['experienceHeads'] = texts(
            '//h3[@class="position-title anet"]/span[@class="title"]/text()')

        # Work Experience: Time started
        currentExpTimeStart = texts_either(
            '//div[@class="position first experience vevent vcard summary-current"]/p/abbr[@class="dtstart"]/text()',
            '//div[@class="position first experience vevent vcard current-position"]/p/abbr[@class="dtstart"]/text()')
        moreExpTimeStart = texts_either(
            '//div[@class="position experience vevent vcard summary-current"]/p/abbr[@class="dtstart"]/text()',
            '//div[@class="position experience vevent vcard current-position"]/p/abbr[@class="dtstart"]/text()')
        expTimeStarts = texts_either(
            '//div[@class="position experience vevent vcard summary-past"]/p/abbr[@class="dtstart"]/text()',
            '//div[@class="position experience vevent vcard past-position"]/p/abbr[@class="dtstart"]/text()')
        item['expTimeStarts'] = currentExpTimeStart + moreExpTimeStart + expTimeStarts

        # Work Experience: Time ended ("dtstamp" entries listed first)
        present = texts('//p[@class="period"]/abbr[@class="dtstamp"]/text()')
        expTimeEnds = texts_either(
            '//div[@class="position experience vevent vcard summary-past"]/p[@class="period"]/abbr[@class="dtend"]/text()',
            '//div[@class="position experience vevent vcard past-position"]/p[@class="period"]/abbr[@class="dtend"]/text()')
        item['expTimeEnds'] = present + expTimeEnds

        # Work Experience: Description
        currentDescription = texts(
            '//p[@class=" description current-position"]/text()')
        expDescriptions = texts(
            '//p[@class=" description past-position"]/text()')

        # Debug output of any <br> found below the past-position text nodes.
        divs = hxs.select('//p[@class=" description past-position"]/text()')
        for p in divs.select('.//br'):
            print(p.extract())

        # Slot 1 prefers the current position's description and falls back
        # to the first past-position description.
        if currentDescription:
            item['expDescription1'] = take(currentDescription)
        else:
            item['expDescription1'] = take(expDescriptions)

        # Only the first description is kept; the remaining slots are
        # always stored empty.
        item['expDescription2'] = []
        item['expDescription3'] = []
        item['expDescription4'] = []
        item['expDescription5'] = []

        yield item
    else:
        # if it is a directory:
        # take all of the subdirectories that show up and request them
        for url in hxs.select('//ul[@class="directory"]/li/a/@href').extract():
            # random sampling.
            if not randomSampling or random.random() < samplingProbability:
                yield Request('http://www.linkedin.com' + url, callback=self.parse)
def parse(self, response):
    """Scrape a public profile page or follow the links of a directory page.

    This is a generator used as the spider callback:

    * A page whose body is not ``class="guest directory"`` and that carries
      a ``pageImpressionID`` meta tag is scraped into one ``linkedInItem``,
      provided it passes the location filter.
    * A ``guest directory`` page yields a ``Request`` (calling back into
      ``parse``) for each sampled link.
    * Any other page yields nothing.

    The method reads ``filterForUS``, ``randomSampling`` and
    ``samplingProbability`` and calls ``striplist`` and ``checkLocation``,
    all defined outside this method.
    """

    def take(values):
        # Remove and return the first remaining entry of `values`; an empty
        # list is the marker stored for every slot that has no data.
        return values.pop(0) if values else []

    def texts(xpath):
        # Cleaned-up text results of a page-level XPath query.
        return striplist(hxs.select(xpath).extract())

    def texts_either(xpath, fallback_xpath):
        # Results of `xpath`, or of `fallback_xpath` when the first query
        # finds nothing (the same data appears under two class names).
        return texts(xpath) or texts(fallback_xpath)

    def education_columns(div_xpath):
        # School, degree, major, start and end lists (in that order) for
        # the education entries rooted at `div_xpath`.
        return [
            texts(div_xpath + '/h3[@class="summary fn org"]/text()'),
            texts(div_xpath + '/h4/span[@class="degree"]/text()'),
            texts(div_xpath + '/h4/span[@class="major"]/text()'),
            texts(div_xpath + '/p[@class="period"]/abbr[@class="dtstart"]/text()'),
            texts(div_xpath + '/p[@class="period"]/abbr[@class="dtend"]/text()'),
        ]

    # Rebuild the response as an HtmlResponse before handing it to the
    # selector.
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    hxs = HtmlXPathSelector(response)

    if not hxs.select('//body[@class="guest directory"]'):
        # if it is not a directory (its a regular page)
        if hxs.select('//meta[@name="pageImpressionID"]'):
            item = linkedInItem()
            item['url'] = response.url
            # I found that linkedIn changed around their HTML, so the name
            # is taken from <title> rather than from the page heading.
            item['headlineTitle'] = texts(
                '//p[@class="headline-title title"]/text()')
            HTMLtitle = texts('//title/text()')
            # The name is the part of <title> before the first '|'.  A page
            # without title text gets an empty list instead of an
            # IndexError.
            item['name'] = [HTMLtitle[0].split('|')[0].strip()] if HTMLtitle else []
            item['location'] = texts('//dd/span/text()')

            # Scrape everything when filtering is off; otherwise only
            # profiles whose location passes checkLocation.
            if not filterForUS or checkLocation(item['location']):
                item['industry'] = texts('//dd[@class="industry"]/text()')
                item['overviewCurrent'] = texts(
                    '//dd[@class="summary-current"]/ul/li/text()')
                item['currentPosition'] = texts(
                    '//div[@class="position first experience vevent vcard summary-current"]/div/h3/span/text()')
                item['currentCompany'] = texts(
                    '//div[@class="position first experience vevent vcard summary-current"]/div/h4/strong/a/span/text()')
                item['overviewPast'] = texts(
                    '//dd[@class="summary-past"]/ul[@class="past"]/li/text()')
                item['overviewEducation'] = texts(
                    '//dd[@class="summary-education"]/ul/li/text()')
                # 'recommendations' and 'websites' are not scraped.
                item['connections'] = texts(
                    '//dd[@class="overview-connections"]/p/strong/text()')
                item['descriptionSummary'] = texts(
                    '//p[@class=" description summary"]/text()')
                item['summarySpecialties'] = texts(
                    '//ol[@id="skills-list"]/li/span/text()')

                # ------------------------------------------------------
                # Education
                # ------------------------------------------------------
                edu_keys = ('educationSchoolName', 'educationDegree',
                            'educationMajor', 'eduTimeStart', 'eduTimeEnd')
                first_entry = education_columns(
                    '//div[@class="position first education vevent vcard"]')
                other_entries = education_columns(
                    '//div[@class="position education vevent vcard"]')

                # Slot 1 comes from the "first" education div when it names
                # a school, otherwise from the head of the other entries.
                if first_entry[0]:
                    primary = first_entry
                elif other_entries[0]:
                    primary = other_entries
                else:
                    primary = [[], [], [], [], []]
                for key, column in zip(edu_keys, primary):
                    item[key + '1'] = take(column)

                # Slots 2 and 3 consume whatever is left of the other
                # entries.
                for key, column in zip(edu_keys, other_entries):
                    item[key + '2'] = take(column)
                    item[key + '3'] = take(column)

                # ------------------------------------------------------
                # Work Experience
                # ------------------------------------------------------
                # Work Experience: title
                item['experienceHeads'] = texts(
                    '//h3[@class="position-title anet"]/span[@class="title"]/text()')

                # Work Experience: Time started
                currentExpTimeStart = texts_either(
                    '//div[@class="position first experience vevent vcard summary-current"]/p/abbr[@class="dtstart"]/text()',
                    '//div[@class="position first experience vevent vcard current-position"]/p/abbr[@class="dtstart"]/text()')
                moreExpTimeStart = texts_either(
                    '//div[@class="position experience vevent vcard summary-current"]/p/abbr[@class="dtstart"]/text()',
                    '//div[@class="position experience vevent vcard current-position"]/p/abbr[@class="dtstart"]/text()')
                expTimeStarts = texts_either(
                    '//div[@class="position experience vevent vcard summary-past"]/p/abbr[@class="dtstart"]/text()',
                    '//div[@class="position experience vevent vcard past-position"]/p/abbr[@class="dtstart"]/text()')
                item['expTimeStarts'] = (currentExpTimeStart + moreExpTimeStart
                                         + expTimeStarts)

                # Work Experience: Time ended ("dtstamp" entries first)
                present = texts(
                    '//p[@class="period"]/abbr[@class="dtstamp"]/text()')
                expTimeEnds = texts_either(
                    '//div[@class="position experience vevent vcard summary-past"]/p[@class="period"]/abbr[@class="dtend"]/text()',
                    '//div[@class="position experience vevent vcard past-position"]/p[@class="period"]/abbr[@class="dtend"]/text()')
                item['expTimeEnds'] = present + expTimeEnds

                # Durations and position descriptions are not collected.
                yield item
    else:
        # if it is a directory:
        # take all of the subdirectories that show up and request them
        for url in hxs.select('//ul[@class="directory"]/li/a/@href').extract():
            # random sampling.
            if not randomSampling or random.random() < samplingProbability:
                yield Request('http://www.linkedin.com' + url, callback=self.parse)
def parse(self, response):
    """Scrape one response: either a directory listing or a profile page.

    Directory pages (``<body class="guest directory">``) yield one
    ``Request`` per sub-directory link, each routed back to this method.
    When ``randomSampling`` is on, a link is followed only if a random draw
    falls below ``samplingProbability``.

    Profile pages (those carrying a ``pageImpressionID`` meta tag) yield a
    single ``linkedInItem``.  When ``filterForUS`` is on, profiles whose
    location is rejected by ``checkLocation`` are dropped.  Any other page
    yields nothing.

    Education is stored in three numbered slots per field
    (``educationSchoolName1`` .. ``eduTimeEnd3``); a slot with no data is
    set to ``[]``.  ``name`` is ``[]`` when the page has no usable title.
    """
    # Debug aid: keep a copy of the most recently parsed page in html.txt.
    # The context manager closes the handle even if the write fails.
    with open("html.txt", "w+") as dump:
        dump.write(response.body)

    hxs = HtmlXPathSelector(response)

    def texts(xpath):
        """Run ``xpath`` against the page and pass the matches to striplist."""
        return striplist(hxs.select(xpath).extract())

    def scoped_texts(scope_xpath, relative_xpaths):
        """Collect, per node matched by ``scope_xpath``, the text found by
        each relative xpath (in the given order), then striplist the lot."""
        collected = []
        for scope in hxs.select(scope_xpath):
            for relative in relative_xpaths:
                collected += scope.xpath(relative).extract()
        return striplist(collected)

    def take_first(values):
        """Pop and return the first entry of ``values``, or ``[]`` if empty."""
        return values.pop(0) if values else []

    # Directory page: follow (a sample of) the sub-directory links.
    if hxs.select('//body[@class="guest directory"]'):
        for url in hxs.select('//ul[@class="directory"]/li/a/@href').extract():
            if not randomSampling or random.random() < samplingProbability:
                yield Request('http://www.linkedin.com' + url,
                              callback=self.parse)
        return

    # Anything that is not a profile page is ignored.
    if not hxs.select('//meta[@name="pageImpressionID"]'):
        return

    item = linkedInItem()
    item['url'] = response.url
    item['headlineTitle'] = texts('//p[@class="headline-title title"]/text()')

    # The name is the part of the <title> before the first '|'.  Guard
    # against pages without a title instead of raising IndexError.
    title_parts = texts('//title/text()')
    item['name'] = [title_parts[0].split('|')[0].strip()] if title_parts else []

    item['location'] = texts('//dd/span/text()')
    if filterForUS and not checkLocation(item['location']):
        return

    # ------------------------------------------------------------------
    # Overview
    # ------------------------------------------------------------------
    item['industry'] = texts('//dd[@class="industry"]/text()')
    item['overviewCurrent'] = texts(
        '//dd[@class="summary-current"]/ul[@class="current"]/li/text()')
    item['overviewPast'] = texts(
        '//dd[@class="summary-past"]/ul[@class="past"]/li/text()')
    item['overviewEducation'] = texts(
        '//dd[@class="summary-education"]/ul/li/text()')
    item['connections'] = texts(
        '//dd[@class="overview-connections"]/p/strong/text()')
    item['descriptionSummary'] = texts(
        '//p[@class=" description summary"]/text()')
    item['summarySpecialties'] = texts(
        '//div[@id="profile-specialties"]/p/text()')
    # NOTE: recommendations, websites and position descriptions are not
    # scraped.

    # ------------------------------------------------------------------
    # Education
    # ------------------------------------------------------------------
    first_entry = '//div[@class="position first education vevent vcard"]'
    other_entries = '//div[@class="position education vevent vcard"]'
    school_heading = '/h3[@class="summary fn org"]'
    # The school name is either linked (<a>) or plain text in the heading.
    name_parts = ('a/text()', 'text()')
    detail_suffixes = (
        '/h4/span[@class="degree"]/text()',
        '/h4/span[@class="major"]/text()',
        '/p[@class="period"]/abbr[@class="dtstart"]/text()',
        '/p[@class="period"]/abbr[@class="dtend"]/text()',
    )
    # Item key prefixes, in the same order as the lists built below.
    prefixes = ('educationSchoolName', 'educationDegree', 'educationMajor',
                'eduTimeStart', 'eduTimeEnd')

    first = [scoped_texts(first_entry + school_heading, name_parts)]
    first += [texts(first_entry + suffix) for suffix in detail_suffixes]
    rest = [scoped_texts(other_entries + school_heading, name_parts)]
    rest += [texts(other_entries + suffix) for suffix in detail_suffixes]

    # Slot 1 comes from the "first" entry when it names a school; otherwise
    # it is taken from the remaining entries; otherwise it stays empty.
    if first[0]:
        slot_one = first
    elif rest[0]:
        slot_one = rest
    else:
        slot_one = [[] for _ in prefixes]
    for prefix, values in zip(prefixes, slot_one):
        item[prefix + '1'] = take_first(values)

    # Slots 2 and 3 consume whatever is left of the remaining entries.
    for prefix, values in zip(prefixes, rest):
        for slot in ('2', '3'):
            item[prefix + slot] = take_first(values)

    # ------------------------------------------------------------------
    # Work experience
    # ------------------------------------------------------------------
    experience = '//div[contains(@class, "experience")]'
    item['experienceHeads'] = texts('//h3/span[@class="title"]/text()')
    # The company name is either plain or wrapped in a link.
    item['expCompany'] = scoped_texts(
        '//h4/strong',
        ('span[@class="org summary"]/text()',
         'a/span[@class="org summary"]/text()'))
    item['expTimeStarts'] = texts(
        experience + '/p/abbr[@class="dtstart"]/text()')
    # "dtstamp" entries are listed ahead of the explicit end dates.
    item['expTimeEnds'] = (
        texts(experience + '/p/abbr[@class="dtstamp"]/text()')
        + texts(experience + '/p/abbr[@class="dtend"]/text()'))
    item['expTimeDurations'] = texts(
        experience + '/p/span[@class="duration"]/text()')

    yield item
def parse(self, response):
    """Scrape one response: either a directory listing or a profile page.

    Directory pages (``<body class="guest directory">``) yield one
    ``Request`` per sub-directory link, each routed back to this method.
    When ``randomSampling`` is on, a link is followed only if a random draw
    falls below ``samplingProbability``.

    Profile pages (those carrying a ``pageImpressionID`` meta tag) yield a
    single ``linkedInItem``.  Profiles are kept when ``filterForUS`` is off,
    or when ``checkLocation`` accepts their location.  Any other page
    yields nothing.

    Education is stored in three numbered slots per field
    (``educationSchoolName1`` .. ``eduTimeEnd3``); a slot with no data is
    set to ``[]``.  ``name`` is ``[]`` when the page has no usable title.
    """
    hxs = HtmlXPathSelector(response)

    def texts(xpath):
        """Run ``xpath`` against the page and pass the matches to striplist."""
        return striplist(hxs.select(xpath).extract())

    def take_first(values):
        """Pop and return the first entry of ``values``, or ``[]`` if empty."""
        return values.pop(0) if values else []

    # Directory page: follow (a sample of) the sub-directory links.
    if hxs.select('//body[@class="guest directory"]'):
        for url in hxs.select('//ul[@class="directory"]/li/a/@href').extract():
            if not randomSampling or random.random() < samplingProbability:
                yield Request('http://www.linkedin.com' + url,
                              callback=self.parse)
        return

    # Anything that is not a profile page is ignored.
    if not hxs.select('//meta[@name="pageImpressionID"]'):
        return

    item = linkedInItem()
    item['url'] = response.url
    item['headlineTitle'] = texts('//p[@class="headline-title title"]/text()')

    # The markup around the profile name is not relied on; the name is taken
    # from the part of the <title> before the first '|'.  Guard against
    # pages without a title instead of raising IndexError.
    title_parts = texts('//title/text()')
    item['name'] = [title_parts[0].split('|')[0].strip()] if title_parts else []

    item['location'] = texts('//dd/span/text()')
    # Only filter on location when the US filter is enabled.  The previous
    # test (`filterForUS and checkLocation(...)`) dropped every profile
    # whenever the filter was switched off.
    if filterForUS and not checkLocation(item['location']):
        return

    # ------------------------------------------------------------------
    # Overview
    # ------------------------------------------------------------------
    item['industry'] = texts('//dd[@class="industry"]/text()')
    item['overviewCurrent'] = texts(
        '//dd[@class="summary-current"]/ul[@class="current"]/li/text()')
    item['overviewPast'] = texts(
        '//dd[@class="summary-past"]/ul[@class="past"]/li/text()')
    item['overviewEducation'] = texts(
        '//dd[@class="summary-education"]/ul/li/text()')
    item['connections'] = texts(
        '//dd[@class="overview-connections"]/p/strong/text()')
    item['descriptionSummary'] = texts(
        '//p[@class=" description summary"]/text()')
    item['summarySpecialties'] = texts(
        '//div[@id="profile-specialties"]/p/text()')
    # NOTE: recommendations, websites, durations and position descriptions
    # are not scraped.

    # ------------------------------------------------------------------
    # Education
    # ------------------------------------------------------------------
    first_entry = '//div[@class="position first education vevent vcard"]'
    other_entries = '//div[@class="position education vevent vcard"]'
    field_suffixes = (
        '/h3[@class="summary fn org"]/text()',
        '/h4/span[@class="degree"]/text()',
        '/h4/span[@class="major"]/text()',
        '/p[@class="period"]/abbr[@class="dtstart"]/text()',
        '/p[@class="period"]/abbr[@class="dtend"]/text()',
    )
    # Item key prefixes, in the same order as field_suffixes.
    prefixes = ('educationSchoolName', 'educationDegree', 'educationMajor',
                'eduTimeStart', 'eduTimeEnd')

    first = [texts(first_entry + suffix) for suffix in field_suffixes]
    rest = [texts(other_entries + suffix) for suffix in field_suffixes]

    # Slot 1 comes from the "first" entry when it names a school; otherwise
    # it is taken from the remaining entries; otherwise it stays empty.
    if first[0]:
        slot_one = first
    elif rest[0]:
        slot_one = rest
    else:
        slot_one = [[] for _ in prefixes]
    for prefix, values in zip(prefixes, slot_one):
        item[prefix + '1'] = take_first(values)

    # Slots 2 and 3 consume whatever is left of the remaining entries.
    for prefix, values in zip(prefixes, rest):
        for slot in ('2', '3'):
            item[prefix + slot] = take_first(values)

    # ------------------------------------------------------------------
    # Work experience
    # ------------------------------------------------------------------
    first_position = '//div[@class="position first experience vevent vcard %s"]'
    position = '//div[@class="position experience vevent vcard %s"]'
    start = '/p/abbr[@class="dtstart"]/text()'
    end = '/p[@class="period"]/abbr[@class="dtend"]/text()'

    def texts_either(template, preferred, fallback, suffix):
        """Query the ``preferred`` class variant; if it matches nothing,
        query the ``fallback`` variant instead."""
        found = texts((template % preferred) + suffix)
        if not found:
            found = texts((template % fallback) + suffix)
        return found

    item['experienceHeads'] = texts(
        '//h3[@class="position-title anet"]/span[@class="title"]/text()')

    # Start dates: first current position, further current positions, then
    # past positions.  Each group is looked up under two class spellings.
    item['expTimeStarts'] = (
        texts_either(first_position, 'summary-current', 'current-position',
                     start)
        + texts_either(position, 'summary-current', 'current-position', start)
        + texts_either(position, 'summary-past', 'past-position', start))

    # End dates: "dtstamp" entries first, followed by the end dates of past
    # positions.
    item['expTimeEnds'] = (
        texts('//p[@class="period"]/abbr[@class="dtstamp"]/text()')
        + texts_either(position, 'summary-past', 'past-position', end))

    yield item