def parse_tree_to_articles(self, tree):
    """Yield one parsed article for every ``doc`` element found in *tree*.

    Each document is routed to a source-specific parser according to the
    text of its ``docid``, ``source`` and ``prdsrvid`` children.  Documents
    that match none of the known patterns are turned into an ``Article``
    with no date and no source.

    Args:
        tree: Parsed markup exposing ``find_all('doc')``.
            NOTE(review): looks like a BeautifulSoup tree -- confirm.

    Yields:
        Whatever the matching ``parse_*`` method returns, or a generic
        ``Article`` for unrecognised documents.

    Raises:
        Exception: If handling a document fails.  The message contains the
            offending document and the original error is chained as the
            cause.
    """
    for doc in tree.find_all('doc'):
        try:
            docid = doc.docid.text if doc.docid else None
            docsource = doc.source.text if doc.source else None
            prdsrvid = doc.prdsrvid.text if doc.prdsrvid else None
            docid_lower = docid.lower() if docid else ''

            if 'nyt' in docid_lower:
                yield self.parse_nyt(doc)
            elif 'latwp' in docid_lower:
                yield self.parse_latwp(doc)
            elif 'reu' in docid_lower:
                yield self.parse_reu(doc)
            elif (docsource and 'WJ' in docsource) or (
                # Both membership tests must be guarded by the None check;
                # otherwise a missing prdsrvid raises TypeError on
                # "'WA' in None" instead of reaching the generic branch.
                prdsrvid and ('WJ' in prdsrvid or 'WA' in prdsrvid)
            ):
                yield self.parse_wj(doc)
            else:
                headline = doc.headline.text.strip() if doc.headline else None
                dateline = doc.dateline.text if doc.dateline else None
                text_node = doc.find('text')
                text = text_node.text.strip() if text_node else None
                # Unknown origin: no date, no source, no extra metadata.
                yield Article(headline, None, text, None, None, dateline)
        except Exception as err:
            raise Exception('Failed on: ' + str(doc)) from err
def parse_nyt(self, doc):
    """Build an ``Article`` from *doc*, defaulting its source to ``Source.NYT``.

    The date is parsed (``%y%m%d``) from the first run of digits in the
    docid.  When a preamble is present the source is refined: first by
    looking up a slug taken from the preamble's second line in
    ``self.SOURCE_DEFAULTS``, and otherwise by searching the preamble for
    known publisher names.

    Args:
        doc: Document element exposing ``docid``, ``headline``,
            ``dateline``, ``preamble`` and ``find('text')``.
            NOTE(review): presumably a BeautifulSoup tag -- confirm.

    Returns:
        Article: ``Article(headline, date, text, source, None, dateline)``.

    Raises:
        IndexError: If the docid contains no digits.
        ValueError: If those digits are not a valid ``%y%m%d`` date.
    """
    TEXT_AND_SOURCE = [
        ('Los Angeles Daily News', Source.LA_DAILY),
        ('N.Y. Times', Source.NYT),
        ('Cox News', Source.COX),
        ('Economist', Source.ECO),
    ]

    docid = doc.docid.text
    headline = doc.headline.text.strip() if doc.headline else None
    dateline = doc.dateline.text if doc.dateline else None
    date = datetime.datetime.strptime(re.findall(r'\d+', docid)[0], '%y%m%d')
    text_node = doc.find('text')
    text = text_node.text.strip() if text_node else None
    source = Source.NYT
    other = None

    preamble = doc.preamble.text if doc.preamble else None
    if preamble:
        preamble_lines = preamble.split('\n')
        # The slug is the first word after the last '-' on the second line.
        # A single-line preamble has no such line; fall back to the text
        # search below instead of raising IndexError.
        slug = None
        if len(preamble_lines) > 1:
            slug = preamble_lines[1].split('-')[-1].split(' ')[0]

        if slug is not None and slug in self.SOURCE_DEFAULTS:
            source = self.SOURCE_DEFAULTS[slug]
        else:
            preamble_lower = preamble.lower()
            # No break on purpose: when several names match, the last
            # entry of TEXT_AND_SOURCE wins, as before.
            for needle, candidate in TEXT_AND_SOURCE:
                if needle.lower() in preamble_lower:
                    source = candidate

    return Article(headline, date, text, source, other, dateline)
def parse_reu(self, doc):
    """Build an ``Article`` from *doc* with source ``Source.REUTE``.

    The date is determined on a best-effort basis and only when the
    document has a non-empty header.  Three candidate strings are tried in
    order, each parsed as ``%y%m%d``:

    1. the first two digits of the docid followed by the second
       space-separated token of the header (dashes removed);
    2. the last six characters of the docid part before the first ``'.'``;
    3. the first two digits of the docid followed by the keyword text.

    If none of them parses, the date is left as ``None``.

    Args:
        doc: Document element exposing ``docid``, ``headline``,
            ``dateline``, ``header``, ``keyword`` and ``find('text')``.
            NOTE(review): presumably a BeautifulSoup tag -- confirm.

    Returns:
        Article: ``Article(headline, date, text, Source.REUTE, None, dateline)``.
    """
    docid = doc.docid.text
    headline = doc.headline.text.strip() if doc.headline else None
    dateline = doc.dateline.text if doc.dateline else None
    source = Source.REUTE
    text_node = doc.find('text')
    # Was "else Nonei", which raised NameError for documents without text.
    text = text_node.text.strip() if text_node else None
    other = None
    date = None

    header = doc.header.text.strip() if doc.header else None
    if header:
        candidates = (
            lambda: re.sub(
                '-', '',
                re.findall(r'\d+', docid)[0][:2] + '-' + re.split(r' ', header)[1],
            ),
            lambda: docid.split('.')[0][-6:],
            # Was doc.keyword.strip(): strip must be applied to the
            # element's text, not to the element itself.
            lambda: re.findall(r'\d+', docid)[0][:2] + doc.keyword.text.strip(),
        )
        for build_candidate in candidates:
            try:
                date = datetime.datetime.strptime(build_candidate(), '%y%m%d')
                break
            except Exception:
                # Deliberately best-effort: any failure just means this
                # candidate is unusable, so move on to the next one.
                continue

    return Article(headline, date, text, source, other, dateline)
def parse_tree_to_articles(self, tree):
    """Yield an ``Article`` for every ``DOC`` element in *tree*.

    The date and the default source are both derived from the element's
    ``id`` attribute: the date is the ``%Y%m%d`` string between the last
    ``'_'`` and the following ``'.'``, and the source is looked up in
    ``self.SOURCE_DEFAULTS`` by the part before the first ``'_'``.  For
    ``Source.NYT`` documents with a dateline, the text inside the last
    pair of parentheses is matched case-insensitively against
    ``self.source_slug_mapping`` to refine the source.

    Args:
        tree: Element tree supporting ``iter('DOC')``; elements must
            support ``xpath``.
            NOTE(review): presumably an lxml tree -- confirm.

    Yields:
        Article: ``Article(headline, date, text, source, {'type': ...}, dateline)``.

    Raises:
        Exception: If a document cannot be parsed.  The message contains
            the serialised document and the original error is chained as
            the cause.
    """
    # iter() replaces the deprecated getiterator().
    for doc in tree.iter('DOC'):
        try:
            doc_attrs = dict(doc.items())

            headline = doc.find('HEADLINE')
            if headline is not None:
                headline = ' '.join(headline.xpath('.//text()'))

            dateline = doc.find('DATELINE')
            if dateline is not None:
                dateline = ' '.join(dateline.xpath('.//text()'))

            text = ' '.join(doc.find('TEXT').xpath('.//text()'))

            doc_id = doc_attrs['id']
            date_string = doc_id.split('_')[-1].split('.')[0]
            date = datetime.datetime.strptime(date_string, '%Y%m%d')
            other = {'type': doc_attrs['type']}
            source = self.SOURCE_DEFAULTS[doc_id.split('_')[0]]

            if dateline and source == Source.NYT:
                # The slug is whatever follows the last '-' inside the
                # last parenthesised group of the dateline.
                slug_line = dateline[dateline.rfind('(') + 1:dateline.rfind(')')]
                slug_key = slug_line.split('-')[-1].upper().strip()
                for slug in self.source_slug_mapping:
                    if slug.upper() == slug_key:
                        source = self.source_slug_mapping[slug]

            yield Article(headline, date, text, source, other, dateline)
        except Exception as err:
            raise Exception('Failed on: ' + etree.tostring(doc).decode()) from err
def parse_wj(self, doc):
    """Build an ``Article`` from *doc* with source ``Source.WSJ``.

    The headline comes from the ``hl`` child.  The date is read from
    ``dspdate``, falling back to ``msgdate``; the first two characters of
    its stripped text are dropped and the rest is parsed as ``%y%m%d``.
    When neither element is present the date is ``None``.

    Returns:
        Article: ``Article(headline, date, text, Source.WSJ, None, dateline)``.
    """
    if doc.hl:
        headline = doc.hl.text.strip()
    else:
        headline = None

    if doc.dateline:
        dateline = doc.dateline.text
    else:
        dateline = None

    source = Source.WSJ

    # Prefer dspdate; use msgdate only when dspdate is missing.
    stamp = doc.dspdate
    body = doc.find('text')
    text = body.text.strip() if body else None
    if not stamp:
        stamp = doc.msgdate

    parsed_date = None
    if stamp:
        two_digit_year_form = stamp.text.strip()[2:]
        parsed_date = datetime.datetime.strptime(two_digit_year_form, '%y%m%d')

    return Article(headline, parsed_date, text, source, None, dateline)
def parse_latwp(self, doc):
    """Build an ``Article`` from *doc*, defaulting its source to ``Source.LATW``.

    The headline keeps only the space-separated tokens between the first
    one and the last two.  The date is parsed (``%y%m%d``) from the first
    run of digits in the docid.  If a ``cpyright`` element is present, the
    source is replaced by the last entry of the publisher table whose name
    occurs (case-insensitively) in its text.

    Returns:
        Article: ``Article(headline, date, text, source, None, dateline)``.
    """
    publisher_table = [
        ('Newsday', Source.NEWSDAY),
        ('Courant', Source.HARTC),
        ('Sun', Source.BSUN),
        ('Times', Source.LAT),
        ('Post', Source.WAPO),
    ]

    raw_id = doc.docid.text

    if doc.headline:
        tokens = doc.headline.text.strip().split(' ')
        headline = (' ').join(tokens[1:-2])
    else:
        headline = None

    dateline = doc.dateline.text if doc.dateline else None

    digits = re.findall(r'\d+', raw_id)[0]
    date = datetime.datetime.strptime(digits, '%y%m%d')

    source = Source.LATW
    body = doc.find('text')
    text = body.text.strip() if body else None

    notice = doc.cpyright.text if doc.cpyright else None
    if notice:
        notice_lower = notice.lower()
        hits = [
            candidate
            for marker, candidate in publisher_table
            if marker.lower() in notice_lower
        ]
        if hits:
            # Later table entries take precedence over earlier ones.
            source = hits[-1]

    return Article(headline, date, text, source, None, dateline)
def parse_tree_to_articles(self, tree):
    """Yield an ``Article`` for every ``DOC`` element in *tree*.

    The date and the default source come from the ``DOCNO`` text, which is
    split around its first run of digits: the digits are parsed as
    ``%Y%m%d`` and the prefix is searched for the keys of
    ``self.SOURCE_DEFAULTS``.  For documents whose source equals
    ``self.SOURCE_DEFAULTS['NYT']``, the part of the ``SLUG`` text after
    its last ``'-'`` is searched the same way to refine the source.

    A document without a ``DOCNO`` gets ``date=None`` and ``source=None``;
    a ``DOCNO`` whose prefix matches no key gets ``source=None``.

    Args:
        tree: Element tree supporting ``iter('DOC')``; elements must
            support ``xpath``.
            NOTE(review): presumably an lxml tree -- confirm.

    Yields:
        Article: ``Article(headline, date, text, source, other, dateline)``
        where ``other`` is ``{'type': <DOCTYPE text>}`` or ``None``.

    Raises:
        Exception: If a document cannot be parsed.  The message contains
            the serialised document and the original error is chained as
            the cause.
    """
    # iter() replaces the deprecated getiterator().
    for doc in tree.iter('DOC'):
        try:
            # Reset per document so that values from the previous
            # iteration can never leak into this article.
            date = None
            source = None

            # The date is taken from DOCNO only.
            docno = doc.find('DOCNO')
            if docno is not None:
                doc_str = re.split(r'(\d+)', ''.join(docno.xpath('.//text()')))
                date = datetime.datetime.strptime(doc_str[1], '%Y%m%d')
                for abrv in self.SOURCE_DEFAULTS:
                    if abrv in doc_str[0]:
                        source = self.SOURCE_DEFAULTS[abrv]

            other = doc.find('DOCTYPE')
            if other is not None:
                other = {'type': ''.join(other.xpath('.//text()')).strip()}

            body = doc.find('BODY')
            slug = body.find('SLUG')
            if slug is not None and source == self.SOURCE_DEFAULTS['NYT']:
                slug_suffix = ''.join(slug.xpath('.//text()')).split('-')[-1]
                for abrv in self.SOURCE_DEFAULTS:
                    if abrv in slug_suffix:
                        source = self.SOURCE_DEFAULTS[abrv]

            headline = body.find('HEADLINE')
            if headline is not None:
                headline = ' '.join(headline.xpath('.//text()'))

            # NOTE(review): dateline is passed on as the element itself (or
            # None), not as joined text like the headline -- confirm this
            # is what Article expects.
            dateline = doc.find('DATELINE')
            text = ' '.join(body.find('TEXT').xpath('.//text()'))

            yield Article(headline, date, text, source, other, dateline)
        except Exception as err:
            raise Exception('Failed on: ' + etree.tostring(doc).decode()) from err