예제 #1
0
 def body(self):
     """Return the concatenated text gathered from every <body> element.

     Document path: article -> body -> sec (recursive) -> p

     A fresh ParserHelper is created for each <body> element found in
     ``self.dom``; its ``rectext(node, 'p')`` is invoked and whatever it
     leaves in ``rtext`` is appended to the output, in document order.
     NOTE(review): presumably ``rectext`` walks the subtree collecting the
     text of 'p' elements -- confirm against ParserHelper.

     Returns an empty string when the document has no <body> element.
     """
     # author : ajbharani
     fragments = []
     for body_node in self.dom.getElementsByTagName('body'):
         collector = ParserHelper()
         collector.rectext(body_node, 'p')
         fragments.append(collector.rtext)
     return ''.join(fragments)
예제 #2
0
 def abstract(self):
     """Return the concatenated text gathered from every <abstract> element.

     Document path: article -> front -> abstract

     Each <abstract> element found in ``self.dom`` is handed to its own
     ParserHelper via ``rectext(node, 'p')``; the helper's ``rtext`` values
     are joined in document order.
     NOTE(review): presumably ``rectext`` collects the text of 'p'
     descendants -- confirm against ParserHelper.

     Returns an empty string when the document has no <abstract> element.
     """
     # author : ajbharani
     fragments = []
     for abstract_node in self.dom.getElementsByTagName('abstract'):
         collector = ParserHelper()
         collector.rectext(abstract_node, 'p')
         fragments.append(collector.rtext)
     return ''.join(fragments)
예제 #3
0
 def pubdates(self):
     """Return the publication dates found in the article metadata.

     Document path: article -> front -> article-meta -> pub-date

     Only <pub-date> elements that are direct children of an
     <article-meta> element, itself a direct child of a <front> element,
     are considered.

     Returns a list with one dict per <pub-date>, shaped as
     ``{'pub-type': 'val', 'pub-date': 'yyyy-mm-dd'}``. 'pub-type' is the
     element's ``pub-type`` attribute. Missing parts fall back to year
     '1900', month '01' and day '01'. Month and day values read from the
     document are passed through ``ParserHelper.date_format_helper``; the
     year is used exactly as read.
     NOTE(review): ``date_format_helper`` presumably zero-pads its
     argument -- confirm against ParserHelper.
     """
     # author : saranya

     def _first_text(node, fallback):
         # An element with no child (or whose first child carries no
         # ``data``) raises AttributeError; use the fallback in that case.
         try:
             return node.firstChild.data
         except AttributeError:
             return fallback

     helper = ParserHelper()
     dates = []
     for front in self.dom.getElementsByTagName('front'):
         for meta in front.childNodes:
             if meta.nodeName != 'article-meta':
                 continue
             for pub in meta.childNodes:
                 if pub.nodeName != 'pub-date':
                     continue
                 entry = {'pub-type': pub.getAttribute('pub-type')}
                 # Defaults apply when the corresponding tag is absent.
                 parts = {'year': '1900', 'month': '01', 'day': '01'}
                 for field in pub.childNodes:
                     tag = field.nodeName
                     if tag == 'month' or tag == 'day':
                         raw = _first_text(field, '01')
                         parts[tag] = helper.date_format_helper(raw)
                     elif tag == 'year':
                         parts['year'] = _first_text(field, '1900')
                 entry['pub-date'] = '-'.join(
                     (parts['year'], parts['month'], parts['day']))
                 dates.append(entry)
     return dates