def _get_url(self):
    """Build an absolute URL from the tag matched by ``self._REGEX``.

    The tag is obtained from ``ph.match_first_pattern(self._html,
    self._REGEX)``. The slice between the end of the ``href='/`` marker and
    the start of the ``trk=`` marker is passed through ``ph.clean_data`` and
    appended to ``LINKEDIN_URL``.

    Assumes the matched tag is a ``str``; in that case ``str.index`` raises
    ``ValueError`` when either marker is absent.
    """
    href_marker = "href='/"
    matched_tag = ph.match_first_pattern(self._html, self._REGEX)
    # Skip past the marker itself so the slice starts at the relative path.
    path_start = matched_tag.index(href_marker) + len(href_marker)
    path_end = matched_tag.index("trk=")
    # TODO: Clean this up
    return LINKEDIN_URL + ph.clean_data(matched_tag[path_start:path_end])
def body(self):
    """Return the concatenated text gathered from every <body> element.

    For each <body> element in ``self.dom`` a fresh ``ParserHelper`` is
    asked to ``rectext(node, 'p')`` and its ``rtext`` is appended to the
    result. Presumably ``rectext`` walks the subtree collecting <p> text
    (see the path note below) -- confirm against ParserHelper.

    Returns an empty string when the document has no <body> element.
    """
    # author : ajbharani
    # article -> body -> sec(recursive) -> p
    # 'body'
    pieces = []
    for body_node in self.dom.getElementsByTagName('body'):
        helper = ParserHelper()
        helper.rectext(body_node, 'p')
        pieces.append(helper.rtext)
    return ''.join(pieces)
def abstract(self):
    """Return the concatenated text gathered from every <abstract> element.

    For each <abstract> element in ``self.dom`` a fresh ``ParserHelper`` is
    asked to ``rectext(node, 'p')`` and its ``rtext`` is appended to the
    result. Presumably ``rectext`` collects the text of <p> descendants --
    confirm against ParserHelper.

    Returns an empty string when the document has no <abstract> element.
    """
    # author : ajbharani
    # article -> front -> abstract
    # 'abstract'
    pieces = []
    for abstract_node in self.dom.getElementsByTagName('abstract'):
        helper = ParserHelper()
        helper.rectext(abstract_node, 'p')
        pieces.append(helper.rtext)
    return ''.join(pieces)
def body(self):
    """Return the concatenated text gathered from every <body> element.

    Each <body> element found in ``self.dom`` is handed to a new
    ``ParserHelper`` via ``rectext(node, 'p')``; the helper's ``rtext``
    values are joined in document order. What ``rectext`` collects is
    defined by ParserHelper (presumably <p> text, per the path note below).

    Returns an empty string when there is no <body> element.
    """
    # author : ajbharani
    # article -> body -> sec(recursive) -> p
    # 'body'
    collected = []
    for body_node in self.dom.getElementsByTagName('body'):
        helper = ParserHelper()
        helper.rectext(body_node, 'p')
        collected.append(helper.rtext)
    return ''.join(collected)
def abstract(self):
    """Return the concatenated text gathered from every <abstract> element.

    Each <abstract> element found in ``self.dom`` is handed to a new
    ``ParserHelper`` via ``rectext(node, 'p')``; the helper's ``rtext``
    values are joined in document order. What ``rectext`` collects is
    defined by ParserHelper (presumably <p> text -- confirm there).

    Returns an empty string when there is no <abstract> element.
    """
    # author : ajbharani
    # article -> front -> abstract
    # 'abstract'
    collected = []
    for abstract_node in self.dom.getElementsByTagName('abstract'):
        helper = ParserHelper()
        helper.rectext(abstract_node, 'p')
        collected.append(helper.rtext)
    return ''.join(collected)
def pubdates(self):
    """Collect the publication dates found under front/article-meta.

    Returns a list with one dict per <pub-date> element, in document order:
    ``{'pub-type': <pub-type attribute>, 'pub-date': 'year-month-day'}``.

    Components default to year '1900', month '01', day '01' when the
    matching child element is absent, or when reading ``firstChild.data``
    raises ``AttributeError`` (e.g. the element has no children). Month and
    day are passed through ``ParserHelper.date_format_helper``; the year is
    used as read.
    """
    # author : saranya
    # article -> front -> article-meta -> pub-date
    # [date1, date2]
    # date:
    # {'pub-type':'val','pub-date':'yyyy-mm-dd'}

    def text_or(node, fallback):
        # firstChild is None for an element without children, so the
        # attribute access fails and the fallback is used instead.
        try:
            return node.firstChild.data
        except AttributeError:
            return fallback

    helper = ParserHelper()
    collected = []
    for front_node in self.dom.getElementsByTagName('front'):
        for meta_node in front_node.childNodes:
            if meta_node.nodeName != 'article-meta':
                continue
            for pub_node in meta_node.childNodes:
                if pub_node.nodeName != 'pub-date':
                    continue
                entry = {'pub-type': pub_node.getAttribute('pub-type')}
                year, month, day = '1900', '01', '01'
                for part in pub_node.childNodes:
                    part_name = part.nodeName
                    if part_name == 'month':
                        month = helper.date_format_helper(text_or(part, '01'))
                    elif part_name == 'day':
                        day = helper.date_format_helper(text_or(part, '01'))
                    elif part_name == 'year':
                        year = text_or(part, '1900')
                entry['pub-date'] = year + '-' + month + '-' + day
                collected.append(entry)
    return collected
def pubdates(self):
    """Collect the publication dates found under front/article-meta.

    Walks ``self.dom``: every <front> element, its direct <article-meta>
    children, and their direct <pub-date> children. Each <pub-date> yields
    ``{'pub-type': <pub-type attribute>, 'pub-date': 'year-month-day'}``.

    A component keeps its default (year '1900', month '01', day '01') when
    its child element is missing or when ``firstChild.data`` cannot be read
    (``AttributeError``). Month and day go through
    ``ParserHelper.date_format_helper``; the year does not.

    Returns the list of those dicts in document order.
    """
    # author : saranya
    # article -> front -> article-meta -> pub-date
    # [date1, date2]
    # date:
    # {'pub-type':'val','pub-date':'yyyy-mm-dd'}

    def read_text(element, default):
        # An element with no children has firstChild None, which has no
        # ``data`` attribute; fall back to the default in that case.
        try:
            return element.firstChild.data
        except AttributeError:
            return default

    formatter = ParserHelper()
    dates = []
    for front_element in self.dom.getElementsByTagName('front'):
        for front_child in front_element.childNodes:
            if front_child.nodeName != 'article-meta':
                continue
            for meta_child in front_child.childNodes:
                if meta_child.nodeName != 'pub-date':
                    continue
                record = {'pub-type': meta_child.getAttribute('pub-type')}
                year, month, day = '1900', '01', '01'
                for component in meta_child.childNodes:
                    kind = component.nodeName
                    if kind == 'month':
                        month = formatter.date_format_helper(
                            read_text(component, '01'))
                    elif kind == 'day':
                        day = formatter.date_format_helper(
                            read_text(component, '01'))
                    elif kind == 'year':
                        year = read_text(component, '1900')
                record['pub-date'] = year + '-' + month + '-' + day
                dates.append(record)
    return dates
def _profile_url(self):
    """Build the profile URL from the first profile anchor in the HTML.

    The anchor tag is obtained from ``ph.match_first_pattern`` and the
    profile identifier is sliced out of it according to the link form:

    * tag contains ``view?id``       -> text between ``id=`` and ``&authType``
    * tag contains ``viewProfile=&`` -> text between ``key=`` and ``&authToken``
    * neither                        -> empty identifier

    The identifier is passed through ``ph.clean_data`` and appended to
    ``LINKEDIN_URL + "profile/view?id="``.

    Assumes the matched tag is a ``str``; ``str.index`` then raises
    ``ValueError`` if a recognised link form lacks its start or end marker.
    """
    pattern = "<a href=(.*)/profile[^>]*>"
    url = "{0}profile/view?id=".format(LINKEDIN_URL)
    tag = ph.match_first_pattern(self._html, pattern)

    # Membership must be tested with ``in``: str.find() returns -1 (truthy)
    # on a miss and 0 (falsy) on a hit at position 0, so using its result as
    # a condition made the first branch run for nearly every tag and left
    # the second branch effectively unreachable.
    if "view?id" in tag:
        start_marker, end_marker = "id=", "&authType"
    elif "viewProfile=&" in tag:
        start_marker, end_marker = "key=", "&authToken"
    else:
        # Unrecognised link form: empty markers make both indexes 0, so the
        # slice below is empty and only the URL prefix is returned.
        start_marker = end_marker = ''

    start_index = tag.index(start_marker) + len(start_marker)
    end_index = tag.index(end_marker)
    return url + ph.clean_data(tag[start_index:end_index])
def _name(self):
    """Return the text extracted from the "View profile" anchor.

    Looks up the first match of the ``title='View profile'`` anchor pattern
    in ``self._html`` via ``ph.match_first_pattern`` and returns whatever
    ``ph.extract_tag_text`` yields for it (presumably the anchor's inner
    text -- confirm against the helper).
    """
    view_profile_pattern = " title='View profile'>.*</a>"
    return ph.extract_tag_text(
        ph.match_first_pattern(self._html, view_profile_pattern)
    )
def _get_description(self):
    """Return the text extracted from the tag matched by ``self._REGEX``.

    The tag comes from ``ph.match_first_pattern(self._html, self._REGEX)``
    and is handed to ``ph.extract_tag_text`` (presumably returning the
    tag's inner text -- confirm against the helper).
    """
    return ph.extract_tag_text(
        ph.match_first_pattern(self._html, self._REGEX)
    )
def _extract_metric(self, pattern_value):
    """Return the text extracted from the <span> with the given class.

    ``pattern_value`` is substituted verbatim (no regex escaping) into the
    ``<span class='...'>`` pattern. The first match in ``self._html`` is
    looked up with ``ph.match_first_pattern`` and passed to
    ``ph.extract_tag_text`` (presumably returning the span's inner text --
    confirm against the helper).
    """
    span_pattern = "<span class='{0}'>.*</span>".format(pattern_value)
    matched_span = ph.match_first_pattern(self._html, span_pattern)
    return ph.extract_tag_text(matched_span)
def _job_title(self):
    """Return the text extracted from the ``<dd class='title'>`` element.

    The first match of the pattern in ``self._html`` is looked up with
    ``ph.match_first_pattern`` and passed to ``ph.extract_tag_text``
    (presumably returning the element's inner text -- confirm against the
    helper).
    """
    title_pattern = "<dd class='title'>.*</dd>"
    return ph.extract_tag_text(
        ph.match_first_pattern(self._html, title_pattern)
    )