示例#1
0
    def _get_url(self):
        """Build an absolute URL from the first tag matching ``self._REGEX``.

        The slice of the matched tag between the ``href='/`` marker and the
        ``trk=`` marker is passed through ``ph.clean_data`` and appended to
        ``LINKEDIN_URL``.

        NOTE(review): assumes ``ph.match_first_pattern`` returns a string;
        ``str.index`` then raises ``ValueError`` if either marker is absent.
        """
        href_marker = "href='/"
        matched_tag = ph.match_first_pattern(self._html, self._REGEX)

        # Look up the href marker first, then the tracking marker.
        path_start = matched_tag.index(href_marker) + len(href_marker)
        path_end = matched_tag.index("trk=")

        return LINKEDIN_URL + ph.clean_data(matched_tag[path_start:path_end])
示例#2
0
 def body(self):
     """Return the concatenated text gathered from every <body> element.

     Each <body> node found in ``self.dom`` is handed to a fresh
     ``ParserHelper`` through ``rectext(node, 'p')``; the helper's ``rtext``
     values are joined in document order.  An empty string is returned
     when there is no <body> element.

     Document path (per the original note):
     article -> body -> sec(recursive) -> p
     """
     # author : ajbharani
     chunks = []
     for body_node in self.dom.getElementsByTagName('body'):
         helper = ParserHelper()
         helper.rectext(body_node, 'p')
         chunks.append(helper.rtext)
     return ''.join(chunks)
示例#3
0
 def abstract(self):
     """Return the concatenated text gathered from every <abstract> element.

     Each <abstract> node found in ``self.dom`` is handed to a fresh
     ``ParserHelper`` through ``rectext(node, 'p')``; the helper's ``rtext``
     values are joined in document order.  An empty string is returned
     when there is no <abstract> element.

     Document path (per the original note): article -> front -> abstract
     """
     # author : ajbharani
     chunks = []
     for abstract_node in self.dom.getElementsByTagName('abstract'):
         helper = ParserHelper()
         helper.rectext(abstract_node, 'p')
         chunks.append(helper.rtext)
     return ''.join(chunks)
示例#4
0
	def body(self):
		"""Return the concatenated text gathered from every <body> element.

		Each <body> node found in ``self.dom`` is handed to a fresh
		``ParserHelper`` through ``rectext(node, 'p')``; the helper's
		``rtext`` values are joined in document order.  An empty string is
		returned when there is no <body> element.

		Document path (per the original note):
		article -> body -> sec(recursive) -> p
		"""
		# author : ajbharani
		pieces = []
		for body_node in self.dom.getElementsByTagName('body'):
			helper = ParserHelper()
			helper.rectext(body_node, 'p')
			pieces.append(helper.rtext)
		return ''.join(pieces)
示例#5
0
	def abstract(self):
		"""Return the concatenated text gathered from every <abstract> element.

		Each <abstract> node found in ``self.dom`` is handed to a fresh
		``ParserHelper`` through ``rectext(node, 'p')``; the helper's
		``rtext`` values are joined in document order.  An empty string is
		returned when there is no <abstract> element.

		Document path (per the original note): article -> front -> abstract
		"""
		# author : ajbharani
		pieces = []
		for abstract_node in self.dom.getElementsByTagName('abstract'):
			helper = ParserHelper()
			helper.rectext(abstract_node, 'p')
			pieces.append(helper.rtext)
		return ''.join(pieces)
示例#6
0
 def pubdates(self):
     """Collect the <pub-date> entries found under front/article-meta.

     Walks article -> front -> article-meta -> pub-date and returns a list
     holding one dict per <pub-date> element:

         {'pub-type': <pub-type attribute>, 'pub-date': year-month-day}

     Parts that are absent default to year '1900', month '01' and day
     '01'.  Month and day values read from the document are passed through
     ``ParserHelper.date_format_helper``; the year is used as read.
     The original note describes the resulting string as 'yyyy-mm-dd'.
     """
     # author : saranya
     helper = ParserHelper()

     def text_or(node, fallback):
         # Only the text lookup is guarded: an AttributeError here (e.g.
         # firstChild is None for an empty element) selects the fallback.
         try:
             return node.firstChild.data
         except AttributeError:
             return fallback

     collected = []
     for front in self.dom.getElementsByTagName('front'):
         for meta in front.childNodes:
             if meta.nodeName != 'article-meta':
                 continue
             for pub_date in meta.childNodes:
                 if pub_date.nodeName != 'pub-date':
                     continue
                 parts = {'year': '1900', 'month': '01', 'day': '01'}
                 entry = {'pub-type': pub_date.getAttribute('pub-type')}
                 for part in pub_date.childNodes:
                     kind = part.nodeName
                     if kind == 'month' or kind == 'day':
                         parts[kind] = helper.date_format_helper(
                             text_or(part, '01'))
                     elif kind == 'year':
                         parts[kind] = text_or(part, '1900')
                 entry['pub-date'] = (
                     parts['year'] + '-' + parts['month'] + '-' + parts['day'])
                 collected.append(entry)
     return collected
示例#7
0
	def pubdates(self):
		"""Collect the <pub-date> entries found under front/article-meta.

		Walks article -> front -> article-meta -> pub-date and returns a
		list holding one dict per <pub-date> element:

			{'pub-type': <pub-type attribute>, 'pub-date': year-month-day}

		Parts that are absent default to year '1900', month '01' and day
		'01'.  Month and day values read from the document are passed
		through ``ParserHelper.date_format_helper``; the year is used as
		read.  The original note describes the resulting string as
		'yyyy-mm-dd'.
		"""
		# author : saranya
		helper = ParserHelper()

		def text_or(node, fallback):
			# Only the text lookup is guarded: an AttributeError here (e.g.
			# firstChild is None for an empty element) selects the fallback.
			try:
				return node.firstChild.data
			except AttributeError:
				return fallback

		collected = []
		for front in self.dom.getElementsByTagName('front'):
			for meta in front.childNodes:
				if meta.nodeName != 'article-meta':
					continue
				for pub_date in meta.childNodes:
					if pub_date.nodeName != 'pub-date':
						continue
					parts = {'year': '1900', 'month': '01', 'day': '01'}
					entry = {'pub-type': pub_date.getAttribute('pub-type')}
					for part in pub_date.childNodes:
						kind = part.nodeName
						if kind == 'month' or kind == 'day':
							parts[kind] = helper.date_format_helper(
								text_or(part, '01'))
						elif kind == 'year':
							parts[kind] = text_or(part, '1900')
					entry['pub-date'] = (
						parts['year'] + '-' + parts['month'] + '-' + parts['day'])
					collected.append(entry)
		return collected
示例#8
0
    def _profile_url(self):
        """Build a profile URL from the first profile anchor in the HTML.

        The anchor is located with ``ph.match_first_pattern``.  Two link
        shapes are recognised:

        * ``view?id`` links: the value between ``id=`` and
          ``&amp;authType`` is used;
        * ``viewProfile=&amp`` links: the value between ``key=`` and
          ``&amp;authToken`` is used.

        The extracted value is passed through ``ph.clean_data`` and appended
        to ``<LINKEDIN_URL>profile/view?id=``.

        NOTE(review): the ``key=`` value is appended after ``view?id=`` just
        like an id; confirm that is the intended URL for key-style links.

        Raises:
            ValueError: from ``str.index`` when the expected markers are not
                present in the matched tag (assuming the tag is a string).
        """
        pattern = "<a href=(.*)/profile[^>]*>"
        url = "{0}profile/view?id=".format(LINKEDIN_URL)
        tag = ph.match_first_pattern(self._html, pattern)

        # Membership tests instead of ``if tag.find(...)``: find() returns -1
        # (truthy) on a miss and 0 (falsy) on a hit at index 0, so the old
        # test practically always selected the id branch and the key branch
        # was never reached for viewProfile links.
        # The id form stays the default so tags that parsed before still do.
        if "view?id" not in tag and "viewProfile=&amp" in tag:
            start_marker, end_marker = "key=", "&amp;authToken"
        else:
            start_marker, end_marker = "id=", "&amp;authType"

        start_index = tag.index(start_marker) + len(start_marker)
        end_index = tag.index(end_marker)

        return url + ph.clean_data(tag[start_index:end_index])
示例#9
0
 def _name(self):
     """Extract the text of the "View profile" link found in the HTML.

     ``ph.match_first_pattern`` is run over ``self._html`` with the
     "View profile" anchor pattern, and its result is handed to
     ``ph.extract_tag_text``, whose return value is returned unchanged.
     """
     view_profile_regex = " title='View profile'>.*</a>"
     return ph.extract_tag_text(
         ph.match_first_pattern(self._html, view_profile_regex))
示例#10
0
    def _get_description(self):
        """Extract the tag text for the first match of ``self._REGEX``.

        ``ph.match_first_pattern`` is run over ``self._html`` and its result
        is handed to ``ph.extract_tag_text``, whose return value is returned
        unchanged.
        """
        return ph.extract_tag_text(
            ph.match_first_pattern(self._html, self._REGEX))
示例#11
0
 def _extract_metric(self, pattern_value):
     """Extract the tag text of the <span> whose class is ``pattern_value``.

     ``pattern_value`` is substituted into a ``<span class='...'>`` pattern,
     ``ph.match_first_pattern`` is run over ``self._html`` with it, and the
     result is handed to ``ph.extract_tag_text``.

     NOTE(review): ``pattern_value`` is inserted into the pattern verbatim;
     presumably callers pass plain class names - confirm.
     """
     span_regex = "<span class='{0}'>.*</span>".format(pattern_value)
     matched_span = ph.match_first_pattern(self._html, span_regex)
     return ph.extract_tag_text(matched_span)
示例#12
0
 def _job_title(self):
     """Extract the tag text of the ``<dd class='title'>`` element.

     ``ph.match_first_pattern`` is run over ``self._html`` with the title
     pattern, and its result is handed to ``ph.extract_tag_text``, whose
     return value is returned unchanged.
     """
     title_regex = "<dd class='title'>.*</dd>"
     return ph.extract_tag_text(
         ph.match_first_pattern(self._html, title_regex))