Exemplo n.º 1
0
 def parse_tree_to_articles(self, tree):
     """Yield one article per ``doc`` element found in ``tree``.

     Each document is routed to a source-specific parser based on its
     ``docid``, ``source`` and ``prdsrvid`` child elements:

     * ``docid`` containing ``nyt``   -> ``self.parse_nyt``
     * ``docid`` containing ``latwp`` -> ``self.parse_latwp``
     * ``docid`` containing ``reu``   -> ``self.parse_reu``
     * ``source`` containing ``WJ``, or ``prdsrvid`` containing ``WJ`` or
       ``WA``                         -> ``self.parse_wj``

     The ``docid`` checks are case-insensitive; the ``WJ``/``WA`` checks are
     case-sensitive. Documents matching none of the rules are turned into
     an ``Article`` with only headline, dateline and text filled in.

     Args:
         tree: Parsed document exposing ``find_all('doc')``.

     Yields:
         The object built by the selected parser for each document.

     Raises:
         Exception: If any document fails to parse. The message contains
             the offending document and the original error is chained as
             ``__cause__``.
     """
     for doc in tree.find_all('doc'):
         try:
             docid = doc.docid.text if doc.docid else None
             docsource = doc.source.text if doc.source else None
             prdsrvid = doc.prdsrvid.text if doc.prdsrvid else None

             # An absent docid becomes '' so the substring tests below are
             # simply False instead of needing a guard on every branch.
             docid_lower = docid.lower() if docid else ''
             is_wsj = bool(docsource and 'WJ' in docsource)
             if not is_wsj and prdsrvid:
                 # Both membership tests must sit behind the None check:
                 # "x in None" raises TypeError.
                 is_wsj = 'WJ' in prdsrvid or 'WA' in prdsrvid

             if 'nyt' in docid_lower:
                 yield self.parse_nyt(doc)
             elif 'latwp' in docid_lower:
                 yield self.parse_latwp(doc)
             elif 'reu' in docid_lower:
                 yield self.parse_reu(doc)
             elif is_wsj:
                 yield self.parse_wj(doc)
             else:
                 # Unknown provenance: no date or source can be derived.
                 date = None
                 source = None
                 other = None
                 headline = doc.headline.text.strip() if doc.headline else None
                 dateline = doc.dateline.text if doc.dateline else None
                 text_tag = doc.find('text')
                 text = text_tag.text.strip() if text_tag else None
                 yield Article(headline, date, text, source, other, dateline)
         except Exception as err:
             raise Exception('Failed on: ' + str(doc)) from err
Exemplo n.º 2
0
    def parse_nyt(self, doc):
        """Build an ``Article`` from a document routed to the NYT parser.

        The date is parsed (as ``%y%m%d``) from the first run of digits in
        the ``docid`` text. The source defaults to ``Source.NYT`` and is
        refined from the ``preamble`` element when one is present:

        1. The second preamble line is split on ``-``; the first
           space-delimited word of the last piece is looked up in
           ``self.SOURCE_DEFAULTS``.
        2. Otherwise the whole preamble is searched, case-insensitively,
           for the publication names in ``TEXT_AND_SOURCE``. When several
           names occur, the one listed last wins.

        Args:
            doc: Document element exposing ``docid``, ``headline``,
                ``dateline``, ``preamble`` and ``find('text')``.

        Returns:
            ``Article(headline, date, text, source, other, dateline)`` with
            ``other`` always ``None``.

        Raises:
            IndexError: If ``docid`` contains no digits.
            ValueError: If the digits are not a valid ``%y%m%d`` date.
        """
        TEXT_AND_SOURCE = [('Los Angeles Daily News', Source.LA_DAILY),
                           ('N.Y. Times', Source.NYT),
                           ('Cox News', Source.COX),
                           ('Economist', Source.ECO)]
        docid = doc.docid.text

        headline = doc.headline.text.strip() if doc.headline else None
        dateline = doc.dateline.text if doc.dateline else None
        date = datetime.datetime.strptime(re.findall(r'\d+', docid)[0], '%y%m%d')
        text_tag = doc.find('text')
        text = text_tag.text.strip() if text_tag else None
        source = Source.NYT
        other = None

        preamble = doc.preamble.text if doc.preamble else None
        if preamble:
            preamble_lines = preamble.split('\n')
            # A single-line preamble has no slug line; skip straight to the
            # publication-name search instead of failing on the index.
            slug_code = None
            if len(preamble_lines) > 1:
                slug_code = preamble_lines[1].split('-')[-1].split(' ')[0]

            if slug_code is not None and slug_code in self.SOURCE_DEFAULTS:
                source = self.SOURCE_DEFAULTS[slug_code]
            else:
                preamble_lower = preamble.lower()
                for name, candidate in TEXT_AND_SOURCE:
                    if name.lower() in preamble_lower:
                        source = candidate

        return Article(headline, date, text, source, other, dateline)
Exemplo n.º 3
0
    def parse_reu(self, doc):
        """Build an ``Article`` from a document routed to the Reuters parser.

        The source is always ``Source.REUTE``. A date is only attempted when
        the document has a non-empty ``header``; the strategies below are
        tried in order and the first one yielding a valid ``%y%m%d`` date
        wins:

        1. First two digits of the first digit run in ``docid`` plus the
           second space-separated token of the header, with dashes removed.
        2. The last six characters of the part of ``docid`` before its
           first ``.``.
        3. First two digits of the first digit run in ``docid`` plus the
           stripped text of the ``keyword`` element.

        If every strategy fails, the date stays ``None``.

        Args:
            doc: Document element exposing ``docid``, ``headline``,
                ``dateline``, ``header``, ``keyword`` and ``find('text')``.

        Returns:
            ``Article(headline, date, text, source, other, dateline)`` with
            ``other`` always ``None``.
        """
        docid = doc.docid.text

        headline = doc.headline.text.strip() if doc.headline else None
        dateline = doc.dateline.text if doc.dateline else None
        source = Source.REUTE
        date = None
        text_tag = doc.find('text')
        text = text_tag.text.strip() if text_tag else None
        other = None

        header = doc.header.text.strip() if doc.header else None
        if header:
            # Each strategy is a thunk so that a failure while *building*
            # the candidate string is handled the same way as a failure
            # while parsing it.
            def year_prefix():
                return re.findall(r'\d+', docid)[0][:2]

            strategies = (
                lambda: (year_prefix() + '-' + header.split(' ')[1]).replace('-', ''),
                lambda: docid.split('.')[0][-6:],
                # NOTE(review): assumes ``keyword`` is an element like the
                # other children, so its text is read via ``.text``.
                lambda: year_prefix() + doc.keyword.text.strip(),
            )
            for build_candidate in strategies:
                try:
                    date = datetime.datetime.strptime(build_candidate(), '%y%m%d')
                    break
                except (AttributeError, IndexError, TypeError, ValueError):
                    # Missing element, missing token or malformed date:
                    # fall through to the next strategy.
                    continue

        return Article(headline, date, text, source, other, dateline)
Exemplo n.º 4
0
 def parse_tree_to_articles(self, tree):
     """Yield one ``Article`` per ``DOC`` element in ``tree``.

     For every document:

     * ``headline`` / ``dateline`` are the space-joined text nodes of the
       ``HEADLINE`` / ``DATELINE`` children, or ``None`` when absent.
     * ``text`` is the space-joined text nodes of the ``TEXT`` child.
     * ``date`` is parsed as ``%Y%m%d`` from the ``id`` attribute: the
       part after the last ``_`` and before the following ``.``.
     * ``source`` is looked up in ``self.SOURCE_DEFAULTS`` using the part
       of ``id`` before the first ``_``. For ``Source.NYT`` documents with
       a dateline, the text inside the last parentheses of the dateline is
       split on ``-`` and its final piece is matched case-insensitively
       against the keys of ``self.source_slug_mapping`` to refine the
       source.
     * ``other`` is ``{'type': <type attribute>}``.

     Args:
         tree: Element tree exposing ``iter(tag=...)``; its elements must
             support ``find``, ``items`` and ``xpath``.

     Yields:
         ``Article(headline, date, text, source, other, dateline)``.

     Raises:
         Exception: If any document fails to parse. The message contains
             the serialized document and the original error is chained as
             ``__cause__``.
     """
     # ``iter`` replaces the deprecated ``getiterator``.
     for doc in tree.iter(tag='DOC'):
         try:
             doc_attrs = dict(doc.items())
             doc_id = doc_attrs['id']

             headline = doc.find('HEADLINE')
             if headline is not None:
                 headline = ' '.join(headline.xpath('.//text()'))
             dateline = doc.find('DATELINE')
             if dateline is not None:
                 dateline = ' '.join(dateline.xpath('.//text()'))
             text = ' '.join(doc.find('TEXT').xpath('.//text()'))

             date_string = doc_id.split('_')[-1].split('.')[0]
             date = datetime.datetime.strptime(date_string, '%Y%m%d')
             other = {'type': doc_attrs['type']}
             source = self.SOURCE_DEFAULTS[doc_id.split('_')[0]]

             if dateline and source == Source.NYT:
                 # Text between the last '(' and the last ')' of the
                 # dateline; the piece after its final '-' is the slug.
                 slug_line = dateline[dateline.rfind('(') + 1:dateline.rfind(')')]
                 wanted = slug_line.split('-')[-1].upper().strip()
                 for slug in self.source_slug_mapping:
                     if slug.upper() == wanted:
                         source = self.source_slug_mapping[slug]
             yield Article(headline, date, text, source, other, dateline)
         except Exception as err:
             raise Exception('Failed on: ' + etree.tostring(doc).decode()) from err
Exemplo n.º 5
0
    def parse_wj(self, doc):
        """Build an ``Article`` from a document routed to the WSJ parser.

        The headline comes from the ``hl`` element and the source is always
        ``Source.WSJ``. The date is read from ``dspdate``, falling back to
        ``msgdate``; the first two characters of the stripped value are
        dropped and the remainder is parsed as ``%y%m%d``. When neither
        element is present the date is ``None``.

        Args:
            doc: Document element exposing ``hl``, ``dateline``,
                ``dspdate``, ``msgdate`` and ``find('text')``.

        Returns:
            ``Article(headline, date, text, source, other, dateline)`` with
            ``other`` always ``None``.
        """
        headline_tag = doc.hl
        headline = headline_tag.text.strip() if headline_tag else None

        dateline_tag = doc.dateline
        dateline = dateline_tag.text if dateline_tag else None

        body_tag = doc.find('text')
        text = body_tag.text.strip() if body_tag else None

        # Prefer the display date; use the message date only as a fallback.
        stamp = doc.dspdate or doc.msgdate
        date = None
        if stamp:
            date = datetime.datetime.strptime(stamp.text.strip()[2:], '%y%m%d')

        return Article(headline, date, text, Source.WSJ, None, dateline)
Exemplo n.º 6
0
    def parse_latwp(self, doc):
        """Build an ``Article`` from a document routed to the LATWP parser.

        The headline is the stripped ``headline`` text split on single
        spaces, with the first token and the last two tokens removed. The
        date is parsed (as ``%y%m%d``) from the first run of digits in the
        ``docid`` text. The source defaults to ``Source.LATW`` and is
        refined by searching the ``cpyright`` text, case-insensitively, for
        the publication names below; when several names occur, the one
        listed last wins.

        Args:
            doc: Document element exposing ``docid``, ``headline``,
                ``dateline``, ``cpyright`` and ``find('text')``.

        Returns:
            ``Article(headline, date, text, source, other, dateline)`` with
            ``other`` always ``None``.
        """
        publications = (
            ('Newsday', Source.NEWSDAY),
            ('Courant', Source.HARTC),
            ('Sun', Source.BSUN),
            ('Times', Source.LAT),
            ('Post', Source.WAPO),
        )
        docid = doc.docid.text

        if doc.headline:
            tokens = doc.headline.text.strip().split(' ')
            headline = ' '.join(tokens[1:-2])
        else:
            headline = None

        dateline = doc.dateline.text if doc.dateline else None

        first_digits = re.findall(r'\d+', docid)[0]
        date = datetime.datetime.strptime(first_digits, '%y%m%d')

        body_tag = doc.find('text')
        text = body_tag.text.strip() if body_tag else None

        source = Source.LATW
        notice = doc.cpyright.text if doc.cpyright else None
        if notice:
            notice_lower = notice.lower()
            hits = [pub for name, pub in publications
                    if name.lower() in notice_lower]
            if hits:
                # Later table entries take precedence over earlier ones.
                source = hits[-1]

        return Article(headline, date, text, source, None, dateline)
Exemplo n.º 7
0
 def parse_tree_to_articles(self, tree):
     """Yield one ``Article`` per ``DOC`` element in ``tree``.

     For every document:

     * The ``DOCNO`` text is split around its first run of digits. The
       digits are parsed as a ``%Y%m%d`` date, and the text before them is
       searched for the keys of ``self.SOURCE_DEFAULTS`` to pick the
       source (the last matching key wins). Without a ``DOCNO`` or a
       matching key, ``date`` / ``source`` are ``None``.
     * ``other`` is ``{'type': <stripped DOCTYPE text>}`` or ``None``.
     * When the source is the ``'NYT'`` default and the ``BODY`` has a
       ``SLUG``, the part of the slug text after its last ``-`` is
       searched for the keys of ``self.SOURCE_DEFAULTS`` to refine the
       source.
     * ``headline`` and ``text`` are the space-joined text nodes of the
       ``HEADLINE`` and ``TEXT`` children of ``BODY``; ``dateline`` is the
       space-joined text nodes of the document's ``DATELINE`` child.
       ``headline`` and ``dateline`` are ``None`` when absent.

     Args:
         tree: Element tree exposing ``iter(tag=...)``; its elements must
             support ``find`` and ``xpath``.

     Yields:
         ``Article(headline, date, text, source, other, dateline)``.

     Raises:
         Exception: If any document fails to parse. The message contains
             the serialized document and the original error is chained as
             ``__cause__``.
     """
     # ``iter`` replaces the deprecated ``getiterator``.
     for doc in tree.iter(tag='DOC'):
         try:
             # Reset per document so values never leak from the previous
             # iteration when DOCNO is missing or matches no known key.
             date = None
             source = None

             docno = doc.find('DOCNO')
             if docno is not None:
                 doc_str = re.split(r'(\d+)',
                                    ''.join(docno.xpath('.//text()')))
                 date = datetime.datetime.strptime(doc_str[1], '%Y%m%d')
                 for abrv in self.SOURCE_DEFAULTS:
                     if abrv in doc_str[0]:
                         source = self.SOURCE_DEFAULTS[abrv]

             other = doc.find('DOCTYPE')
             if other is not None:
                 other = {'type': ''.join(other.xpath('.//text()')).strip()}

             body = doc.find('BODY')
             slug = body.find('SLUG')
             if slug is not None and source == self.SOURCE_DEFAULTS['NYT']:
                 slug_suffix = ''.join(slug.xpath('.//text()')).split('-')[-1]
                 for abrv in self.SOURCE_DEFAULTS:
                     if abrv in slug_suffix:
                         source = self.SOURCE_DEFAULTS[abrv]

             headline = body.find('HEADLINE')
             if headline is not None:
                 headline = ' '.join(headline.xpath('.//text()'))
             # Pass the dateline on as text, like the headline, rather
             # than as a raw element.
             dateline = doc.find('DATELINE')
             if dateline is not None:
                 dateline = ' '.join(dateline.xpath('.//text()'))
             text = ' '.join(body.find('TEXT').xpath('.//text()'))
             yield Article(headline, date, text, source, other, dateline)
         except Exception as err:
             raise Exception('Failed on: ' + etree.tostring(doc).decode()) from err