def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract the metadata of one article page.

    Reads the publication and modification dates, the keywords, the title, the
    authors and the source from the parsed page; the section is always SECTION.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the ``site-content`` div is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    article_root = bs.find('div', class_='site-content')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE ROOT NOT FOUND/UNKNOWN ARTICLE SCHEME!')
        return None

    # Publication date: the text of the date block once its inner <span> is removed.
    date_block = bs.find('div', class_='m-author__wrapCatDateTitulus')
    if date_block is None:
        tei_logger.log('WARNING', f'{url}: DATE NOT FOUND IN URL!')
    else:
        inner_span = date_block.find('span')
        if inner_span is not None:
            inner_span.decompose()
        published = parse_date(date_block.text.strip(), '%Y. %m. %d. %H:%M')
        if published is None:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
        else:
            data['sch:datePublished'] = published

    # Modification date: first 19 characters (date and time, no offset) of the meta content.
    modified_meta = bs.find('meta', property='article:modified_time')
    if modified_meta is None:
        tei_logger.log('DEBUG', f'{url}: MODIFIED DATE NOT FOUND IN URL!')
    else:
        modified = parse_date(modified_meta.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
        if modified is None:
            tei_logger.log('WARNING', f'{url}: MODIFIED DATE FORMAT ERROR!')
        else:
            data['sch:dateModified'] = modified

    # Keywords: comma separated meta content without the entries equal to SECTION.
    keywords_meta = bs.find('meta', {'name': 'keywords', 'content': True})
    if keywords_meta is None:
        tei_logger.log('WARNING', f'{url}: KEYWORDS NOT FOUND!')
    else:
        data['sch:keywords'] = [kw for kw in keywords_meta['content'].split(',') if kw != SECTION]

    headline = article_root.find('h1', class_='o-post__title')
    if headline is None:
        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')
    else:
        data['sch:name'] = headline.text.strip()

    # Authors: alt texts of the portrait images; SOURCE is recorded as source, not as author.
    author_links = article_root.find_all('a', class_='m-author__imgLink')
    if len(author_links) == 0:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
    else:
        portraits = [link.find('img', {'alt': True}) for link in author_links]
        authors = [img['alt'] for img in portraits if img is not None]
        if SOURCE in authors:
            data['sch:source'] = [SOURCE]
            authors.remove(SOURCE)
        if len(authors) > 0:
            data['sch:author'] = authors

    data['sch:articleSection'] = SECTION
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract the title and the publication date of one article page.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``
    :return: the metadata mapping, or None when the ``post_in`` div is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    article_root = bs.find('div', class_='post_in')
    if not article_root:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND OR UNKNOWBN ARTICLE SCHEME!')
        return None

    title = article_root.find('h2')
    if title:
        data['sch:name'] = title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    date_tag = article_root.find('h3', class_='date')
    if date_tag is not None:
        # Expected text, e.g.: 2012. július 20. 02:07
        date_text = date_tag.find(text=True)
        # An empty <h3> has no text node (find returns None); report it as a format error
        # instead of failing on None.strip().
        parsed_date = None
        if date_text is not None:
            parsed_date = parse_date(date_text.strip(), '%Y. %B %d. %H:%M')
        if parsed_date:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, tags, authors and section of one article page.

    The section is looked up in SECTION_DICT by the first path segment of the URL.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping (always returned, possibly with only a few fields)
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    date_tag = bs.find('time')
    if date_tag is not None:
        # Expected text, e.g.: 2021. május 25., kedd, 19:49
        parsed_date = parse_date(date_tag.text.strip(), '%Y. %B %d., %A, %H:%M')
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    title_and_tags = bs.find('div', class_='article_title')
    if title_and_tags is not None:
        title = title_and_tags.find('h1')
        # The container may lack an <h1>; log it instead of failing on None.text.
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        tags = [a.text.strip() for a in title_and_tags.find_all('a')]
        if len(tags) > 0:
            data['sch:keywords'] = tags
        else:
            tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    else:
        tei_logger.log('WARNING', f'{url}: TITLE AND TAGS NOT FOUND IN URL!')

    author = bs.find('span', class_='author')
    if author is not None:
        data['sch:author'] = author.text.split('/')
    else:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    # 'scheme://host/section/...' -> index 3 is the first path segment.
    # A URL without a path has no such element, so guard the lookup.
    url_parts = url.split('/')
    section = url_parts[3] if len(url_parts) > 3 else None
    if section in SECTION_DICT:
        data['sch:articleSection'] = SECTION_DICT[section]
    else:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract the publication/modification dates and the title of one page.

    The first ``post-time`` element gives the publication date, the last one
    the modification date.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the page, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the ``main-outlet`` div is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    # The attribute filter has to be an argument of find(). A misplaced parenthesis used to build
    # the tuple (bs.find('div'), {...}) here, which is always truthy, so the id was never checked
    # and the "body not found" branch could never run.
    article_root = bs.find('div', {'id': 'main-outlet'})
    if not article_root:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    date_tags = bs.find_all('time', {'class': 'post-time', 'datetime': True})
    if len(date_tags) > 0:
        # Only the first 19 characters (date and time without offset) are parsed.
        parsed_date = parse_date(date_tags[0].attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:datePublished'] = parsed_date
        parsed_mod_date = parse_date(date_tags[-1].attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:dateModified'] = parsed_mod_date
    else:
        tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')

    title_tag = bs.find('meta', {'property': 'og:title'})
    if title_tag:
        data['sch:name'] = title_tag.attrs['content']
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, section, authors and keywords of one article page.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the page has no ``<article>`` element
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    if not bs.find('article'):
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY CONTAINER NOT FOUND!')
        return None

    # e.g. <time class="updated" datetime="2019-12-27T13:31:40+01:00">
    time_tag = bs.find('time')
    if time_tag is None or 'datetime' not in time_tag.attrs.keys():
        tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        data['sch:datePublished'] = parse_date(time_tag.attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')

    headline = bs.find('h1', class_='entry-title')
    if headline is None:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    else:
        data['sch:name'] = headline.text.strip()

    # Section: first breadcrumb link carrying a title attribute.
    breadcrumbs = bs.find('div', class_='breadcrumbs-plus')
    if breadcrumbs is None:
        tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')
    else:
        section_link = breadcrumbs.find('a', title=True)
        if section_link is None:
            tei_logger.log('WARNING', f'{url}: SECTION NOT FOUND!')
        else:
            data['sch:articleSection'] = section_link.text.strip()

    # Authors: non-empty text nodes that are direct children of the meta heading.
    meta_heading = bs.find('h6', class_='entry-meta')
    if meta_heading is None:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
    else:
        stripped_nodes = [node.strip() for node in meta_heading.find_all(text=True, recursive=False)]
        authors = [name for name in stripped_nodes if len(name) > 0]
        if len(authors) == 0:
            tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
        else:
            data['sch:author'] = authors

    tags_section = bs.find('section', class_='tags-section')
    if tags_section is None:
        tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
    else:
        keywords = [link.text.strip() for link in tags_section.find_all('a', rel='tag')]
        if len(keywords) > 0:
            data['sch:keywords'] = keywords
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, authors, section, subsection and keywords of one article.

    The ``tag`` links are read in order: the first is the section, the second
    the subsection, the rest are keywords.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the ``article`` section is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    meta_root = bs.find('section', class_='article')
    if meta_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND OR UNKNOWN ARTICLE SCHEME!')
        return None

    date_tag = bs.find('span', class_='date')
    if date_tag is not None:
        parsed_date = parse_date(date_tag.text.strip(), '%Y. %B %d., %A %H:%M')
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    title = meta_root.find('h1', itemprop='headline')
    if title is not None:
        # Replace tab characters inside the title; the former '/t' was a mistyped '\t' escape
        # and only touched a literal slash followed by the letter t.
        data['sch:name'] = title.text.strip().replace('\t', ' ')
    else:
        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')

    author_tag = meta_root.find('span', itemprop='author')
    if author_tag is not None:
        author_text = author_tag.text.strip()
        # ' és ' ("and") is actually rare, but the branches below handle the plain
        # single-author case as well.
        # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184
        if ' – ' in author_text:
            author_text = author_text.split(' – ')
        elif ' és ' in author_text:
            author_text = author_text.split(' és ')
        else:
            author_text = [author_text]
        data['sch:author'] = author_text
    else:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    keywords_list = [t.text.strip() for t in meta_root.find_all('a', class_='tag')]
    if len(keywords_list) > 0:
        data['sch:articleSection'] = keywords_list[0]
        if len(keywords_list) > 1:
            data['subsection'] = keywords_list[1]
        if len(keywords_list) > 2:
            data['sch:keywords'] = keywords_list[2:]
    else:
        tei_logger.log('WARNING', f'{url}: SUBJECT TAG NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract dates, title, subtitle, authors, section and keywords of one article.

    The first ``article:tag`` meta value is used as the section, the remaining
    ones as keywords.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the ``cikk-content`` div is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    article_root = bs.find('div', id='cikk-content')  # <div id="cikk-content" class="">
    if not article_root:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    date_tag = bs.find('meta', {'name': 'article:published_time'})
    if date_tag is not None and 'content' in date_tag.attrs:
        parsed_date = parse_date(date_tag.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:datePublished'] = parsed_date
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    date_mod_tag = bs.find('meta', {'name': 'article:modified_time'})
    # Inspect the modification tag itself: the check used to look at date_tag, which crashed
    # when only the modification meta was present and could read a missing 'content' attribute.
    if date_mod_tag is not None and 'content' in date_mod_tag.attrs:
        parsed_mod_date = parse_date(date_mod_tag.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:dateModified'] = parsed_mod_date

    title = article_root.find('h1')
    if title:
        data['sch:name'] = title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    subtitle = article_root.find('h2')
    if subtitle is not None:
        subtitle_text = subtitle.text.strip()
        if len(subtitle_text) > 0:
            data['sch:alternateName'] = subtitle_text

    authors = [author.text.strip() for author in article_root.find_all('a', class_='author__name')]
    post_authors = []  # authors of the individual news feed posts
    for p_auth_tag in article_root.find_all('div', class_='article_author'):
        p_auth = p_auth_tag.find('em')
        if p_auth is not None:
            post_authors.append(p_auth.text.strip())
    # dict.fromkeys drops duplicates but keeps document order (set() made the order arbitrary).
    authors.extend(dict.fromkeys(post_authors))
    if len(authors) > 0:
        data['sch:author'] = authors
    else:
        # Was 'elif len(authors) > 1', which can never hold here, so the warning was never logged.
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    tags = [a.attrs['content'] for a in bs.find_all('meta', {'name': 'article:tag'})]
    if len(tags) > 0:
        data['sch:articleSection'] = tags[0]
        if len(tags) > 1:
            data['sch:keywords'] = tags[1:]
    else:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract dates, title, author, section and keywords of one post.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the post, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the ``post-inner group`` div is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    post_root = bs.find('div', class_='post-inner group')
    if post_root is None:
        tei_logger.log('WARNING', f'{url}: METADATA CONTAINER NOT FOUND!')
        return None

    iso_format = '%Y-%m-%dT%H:%M:%S'

    published_tag = post_root.find('time', class_='published')
    if published_tag is None or 'datetime' not in published_tag.attrs.keys():
        tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        data['sch:datePublished'] = parse_date(published_tag.attrs['datetime'][0:19], iso_format)

    # The modification date is optional, so its absence is not logged.
    updated_tag = post_root.find('time', class_='updated')
    if updated_tag is not None and 'datetime' in updated_tag.attrs.keys():
        data['sch:dateModified'] = parse_date(updated_tag.attrs['datetime'][0:19], iso_format)

    headline = post_root.find('h1', class_='post-title')
    if headline is None:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    else:
        data['sch:name'] = headline.text.strip()

    author_link = post_root.find('a', rel='author')
    if author_link is None:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
    else:
        data['sch:author'] = [author_link.text.strip()]

    category_item = bs.find('li', class_='category')
    if category_item is None:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    else:
        data['sch:articleSection'] = category_item.text.strip()

    tags_paragraph = bs.find('p', class_='post-tags')
    if tags_paragraph is None:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    else:
        keywords = [link.text.strip() for link in tags_paragraph.find_all('a', rel='tag')]
        if len(keywords) > 0:
            data['sch:keywords'] = keywords
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Template for a site-specific metadata extractor.

    Nothing is extracted yet: every field is filled with an empty placeholder
    and no message is logged.

    :param tei_logger: logger object (unused by the template)
    :param url: address of the article, stored as ``sch:url``
    :param bs: parsed page; only a bare ``find()`` call is made on it
    :return: the metadata mapping filled with placeholders
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    placeholder = ''
    article_root = bs.find()
    # Field -> placeholder, in output order. A real extractor should log the WARNING
    # noted beside each field when that field cannot be found.
    template_fields = (
        ('sch:datePublished', placeholder),    # '{url}: DATE FORMAT ERROR!'
        ('sch:dateModified', placeholder),     # '{url}: MODIFIED DATE TEXT FORMAT ERROR!'
        ('sch:name', placeholder),             # '{url}: TITLE NOT FOUND IN URL!'
        ('sch:author', []),                    # '{url}: AUTHOR TAG NOT FOUND!'
        ('sch:articleSection', placeholder),   # '{url}: SECTION TAG NOT FOUND!'
        ('sch:keywords', []),                  # '{url}: TAGS NOT FOUND!'
    )
    for field, value in template_fields:
        data[field] = value
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, author and keywords of one article page.

    The section is fixed to 'Élménybeszámolók' for every article.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when no class-less ``<article>`` is found
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    article_root = bs.find('article', class_=False)
    if not article_root:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    header = article_root.find('header')
    if header is None:
        tei_logger.log('WARNING', f'{url}: HEADER TAG NOT FOUND!')
    else:
        pub_date = header.find('time')
        if pub_date:
            # Expected text, e.g.: 2021. augusztus 10. kedd - 16:59
            parsed_date = parse_date(pub_date.text.strip(), '%Y. %B %d. %A - %H:%M')
            if parsed_date:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
        else:
            tei_logger.log('WARNING', f'{url}: DATE NOT FOUND IN URL!')

        title_tag = header.find('h1', class_='typo-h1')
        if title_tag:
            data['sch:name'] = title_tag.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

        author_tag = header.find('span', class_='typo-weight-bold')
        if author_tag:
            author_name = author_tag.text.strip()
            data['sch:author'] = [author_name]
            # Diagnostic for author fields that may hold several names. It used to be a bare
            # print() whose ' ' / ',' tests ran against the one-element list, not the name.
            # The space test is dropped because nearly every name contains a space.
            if ',' in author_name or len(author_tag) > 1:
                tei_logger.log('DEBUG', f'{url}: AUTHOR TAG MAY CONTAIN MULTIPLE AUTHORS: {author_tag}')
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    data['sch:articleSection'] = 'Élménybeszámolók'

    keyword_root = bs.find('div', class_='my-5')
    if keyword_root:
        data['sch:keywords'] = [a.text.strip() for a in keyword_root.find_all('a', class_='button')]
    else:
        tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract dates, title, section and keywords of one article page.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when the page has no ``<article>`` element
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    if not bs.find('article'):
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    date_tag = bs.find('meta', {'property': 'article:published_time'})
    if date_tag is not None and 'content' in date_tag.attrs:
        parsed_date = parse_date(date_tag.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:datePublished'] = parsed_date
    else:
        # Report through the logger like every other message (this was a bare print()).
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    date_mod_tag = bs.find('meta', {'property': 'article:modified_time'})
    # Inspect the modification tag itself: the check used to look at date_tag, which crashed
    # when only the modification meta was present and could read a missing 'content' attribute.
    if date_mod_tag is not None and 'content' in date_mod_tag.attrs:
        parsed_mod_date = parse_date(date_mod_tag.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
        data['sch:dateModified'] = parsed_mod_date

    # Title and section are two spans inside the same <h1>.
    title_root = bs.find('h1')
    if title_root:
        title = title_root.find('span', class_='bt_bb_headline_content')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        section = title_root.find('span', class_='bt_bb_headline_superheadline')
        if section is not None:
            data['sch:articleSection'] = section.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    else:
        # A missing <h1> used to pass silently.
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    keywords_root = bs.find('div', class_='btTagsRow')
    if keywords_root:
        keywords = [kw.text.strip() for kw in keywords_root.find_all('a', {'href': True})]
        if len(keywords) > 0:
            data['sch:keywords'] = keywords
    else:
        tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, authors and keywords of one post.

    Pages without a ``post-header`` are only accepted when they carry a
    ``matrix-item-title`` paragraph, whose text becomes the title.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the post, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when neither header nor fallback title exists
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    post_header = bs.find('header', class_='post-header')

    if post_header is None:
        # Single occurrence: https://abcug.hu/kozeposztaly/
        fallback_title = bs.find('p', class_='matrix-item-title')
        if fallback_title is None:
            return None
        raw_title = fallback_title.text.strip()
        data['sch:name'] = raw_title.encode('raw_unicode_escape').decode('UTF-8')
        return data

    # e.g. <time class="updated" datetime="2019-12-27T13:31:40+01:00">
    time_tag = post_header.find('time')
    if time_tag is None or 'datetime' not in time_tag.attrs.keys():
        tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        data['sch:datePublished'] = parse_date(time_tag.attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')

    # Title: the header's own <h1> first, then the page-level fallback paragraph.
    headline = post_header.find('h1', class_='entry-title')
    if headline is None:
        headline = bs.find('p', class_='matrix-item-title')
    if headline is None:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    else:
        data['sch:name'] = headline.text.strip()

    byline = post_header.find(class_='byline author')
    if byline is None:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
    else:
        names = []
        for link in byline.find_all('a'):
            if ' és ' in link.text:
                # Two authors joined with "és" (and) inside a single link.
                names.extend(link.text.strip().split(' és '))
            else:
                names.append(link.text.strip())
        data['sch:author'] = names

    info_box = post_header.find('div', class_='post-information')
    if info_box is not None:
        data['sch:keywords'] = [link.text.strip() for link in info_box.find_all('a', rel='tag')]
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract the publication/modification dates and the title of one article.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping (always returned)
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    # e.g. <time datetime="2020-12-23T14:13:31+01:00" itemprop="datePublished">
    time_tags = bs.find_all('time')
    if len(time_tags) == 0:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND ERROR!')
    for time_tag in time_tags:
        if not time_tag.get('datetime'):
            continue
        parsed = parse_date(time_tag.attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')
        # The itemprop attribute tells which kind of date this element holds.
        role = time_tag.get('itemprop')
        if role == 'datePublished':
            data['sch:datePublished'] = parsed
        elif role == 'dateModified':
            data['sch:dateModified'] = parsed

    headline = bs.find('h2', itemprop='headline')
    if not headline:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    else:
        data['sch:name'] = headline.text
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract title, date, authors, sources, section and subsection of one article.

    Names found in the author and source boxes are split on '/' and ','; those
    whose lower-cased form is in SOURCE_LIST are stored as sources, the rest as authors.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``
    :return: the metadata mapping (always returned, even without an article body)
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    title = bs.find('meta', {'property': 'og:title', 'content': True})
    # Flag passed on to every encoding_correction() call; it is switched off when correcting
    # the title raises UnicodeDecodeError (presumably meaning the page needs no correction).
    encodingerror = True
    if title is not None:
        title = title.attrs['content'].strip()
        try:
            title = encoding_correction(title, encodingerror)
        except UnicodeDecodeError:
            encodingerror = False
        data['sch:name'] = title
    else:
        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')

    article_root = bs.find('div', class_='bodywrapper')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND OR UNKNOWN ARTICLE SCHEME!')
        return data

    date_tag = article_root.find('div', class_='artTime')
    if date_tag is not None:
        # Test the parse result, not the stripped text: the text is never None, so the
        # format-error branch used to be dead and a failed parse was stored as the date.
        # NOTE(review): assumes parse_date returns None on failure, as the sibling extractors do.
        parsed_date = parse_date(date_tag.text.strip(), '%Y. %m. %d. - %H:%M')
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    source_author_list = []
    author = article_root.find('div', class_='artAuthor')
    source = article_root.find('div', class_='artSource')
    for name_box in (author, source):
        if name_box is None:
            continue
        # Drop the leading <span> before reading the names; it may be absent.
        if name_box.span is not None:
            name_box.span.decompose()
        source_author_list += [encoding_correction(t.strip(), encodingerror)
                               for t in re.split(r'[/,]', name_box.text)]

    if len(source_author_list) > 0:
        author_list = []
        source_list = []
        for name in source_author_list:
            if name.lower() in SOURCE_LIST:
                source_list.append(name)
            else:
                author_list.append(name)
        if len(author_list) > 0:
            data['sch:author'] = author_list
        if len(source_list) > 0:
            data['sch:source'] = source_list
    else:
        tei_logger.log('WARNING', f'{url}: NEITHER SOURCE NOR AUTHOR TAG WAS FOUND!')

    section_tag = article_root.find('div', class_='breadCrumbs')
    if section_tag is not None:
        # Breadcrumb text is '/'-separated: the second item is the section and,
        # when there are more than two, the last item is the subsection.
        section_tree = section_tag.text.split('/')
        if len(section_tree) > 1:
            data['sch:articleSection'] = encoding_correction(section_tree[1].strip(), encodingerror)
            if len(section_tree) > 2:
                data['subsection'] = encoding_correction(section_tree[-1].strip(), encodingerror)
        else:
            # A breadcrumb without '/' has no section item; do not index past the end.
            tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    else:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, subtitle, authors, source, section and subsection.

    The article root is searched with each (args, kwargs) pair of
    ARTICLE_ROOT_PARAMS_SPEC in turn; the first match wins.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when no article root is found
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    # This default only survives when ARTICLE_ROOT_PARAMS_SPEC is empty; otherwise the loop
    # overwrites it with the result of the last attempted lookup.
    article_root = bs.find('article', class_='cikk-body')
    for args, kwargs in ARTICLE_ROOT_PARAMS_SPEC:
        article_root = bs.find(*args, **kwargs)
        if article_root is not None:
            break
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    # Date: <span class="datum">/ 2014.12.15., hétfő 11:45 /</span>
    date_tag = bs.find('span', class_='datum')
    # Check the tag before reading its text: .text used to be read first, so a missing
    # span crashed and the "not found" branch was unreachable.
    if date_tag is not None:
        date_raw = date_tag.text.strip()
        # Real dates start with the year right after the two leading separator characters.
        # Slicing (not indexing) keeps short or empty text from raising IndexError.
        if date_raw[2:3] == '2':
            parsed_date = parse_date(date_raw[2:-2], '%Y.%m.%d., %A %H:%M')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
        else:
            tei_logger.log('WARNING', f'{url}: NOT REAL DATE ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    article_title = article_root.find('h1', itemprop='name')
    if article_title is not None:
        data['sch:name'] = article_title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    article_subtitle = article_root.find('h2')
    if article_subtitle is not None:
        article_subtitle_text = article_subtitle.text.strip()
        if len(article_subtitle_text) > 0 and article_subtitle_text != '-':
            data['sch:alternateName'] = article_subtitle_text

    # <span class="szerzo"><a class="szerzo" href="/szerzo/ujsagiro/-1656" rel="author"></a></span>
    # <span class="datum"> | 2014.10.12., vasárnap 19:10 |</span>
    # <span class="forras">Hírforrás: Válasz.hu</span>
    author_tags = article_root.find_all('a', rel='author')
    if len(author_tags) > 0:
        if any(len(elem.text.strip()) == 0 for elem in author_tags):
            # An empty author link (see the sample above): use the source span instead.
            source = article_root.find('span', class_='forras')
            if source is not None:
                data['sch:source'] = source.text.strip()
        else:
            data['sch:author'] = [a.text for a in author_tags]
    else:
        # The source and author fields can co-exist
        article_source = article_root.find('span', class_='forras')
        article_author2 = article_root.find('span', class_='szerzo')
        if article_source is not None:
            data['sch:source'] = article_source.text.strip()
        if article_author2 is not None:
            data['sch:author'] = [article_author2.text.strip()]

    keyword_root = bs.find('aside', class_='breadcrumb')
    if keyword_root is not None:
        a_list = [a.text.strip() for a in keyword_root.find_all('a')]
        # a_list[0] contains 'Főoldal' (home page), therefore it is omitted
        if len(a_list) > 1:
            data['sch:articleSection'] = a_list[1]
        else:
            # A breadcrumb holding only the home link has no section; do not index past the end.
            tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
        if len(a_list) == 4:
            data['subsection'] = a_list[2]
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract date, title, authors, places, architects, keywords and section.

    The side-bar paragraphs are classified by how their text starts; links of
    unclassified paragraphs are collected as keywords.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping (always returned)
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    if bs.find('div', class_='dateAndSocials'):
        date_div = bs.find('div', class_='date')
        if date_div:
            published = parse_date(date_div.text.strip(), '%Y.%m.%d. %H:%M')
            if published:
                data['sch:datePublished'] = published
            else:
                tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE NOT FOUND IN URL!')

    headline = bs.find('h1')
    if headline:
        data['sch:name'] = headline.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    other_links = []
    info_root = bs.find('div', class_='sidebarInfos')
    if info_root is None:
        info_root = bs.find('div', class_='textCols clearfix')  # gallery articles
    if info_root:
        # Paragraph texts that end the metadata part of the side bar.
        stop_prefixes = ('Vélemények:', 'További', 'Letölthető')
        for block in info_root.find_all(['p', 'h4']):
            block_text = block.text.strip()
            if len(block_text) == 0:
                continue
            if block_text.startswith(stop_prefixes):
                break
            link_texts = [link.text.strip() for link in block.find_all('a')]
            if block_text.startswith('Szerzők'):
                data['sch:author'] = [text.replace('Fotók: ', '') for text in link_texts]
            elif block_text.startswith('Földrajzi'):
                data['sch:contentLocation'] = link_texts
            elif block_text.startswith('Építészek'):
                data['sch:artist'] = link_texts
            else:
                other_links.extend(text for text in link_texts if len(text) > 0)
        # Further metadata categories that could be extracted later:
        #   Cég, szervezet (sourceOrganization):
        #     https://epiteszforum.hu/irodahazak-ejszakaja-vi-well-iroda
        #   Termék, technológia:
        #     https://epiteszforum.hu/tetoablak-trend-energiamegtakaritas-es-ujrahasznosithatosag
        #   Letölthető dokumentumok:
        #     https://epiteszforum.hu/belteri-falfelulet-megformalasa-a-nyiregyhazi-foiskola-tanszeki
        #     -epulet-beruhazasahoz-kapcsolodva17
        #   Térkép:
        #     https://epiteszforum.hu/alairtak-a-zeneakademia-epuletenek-rekonstrukciojat-celzo-108-milliard-forintrol
        #     -szolo-tamogatasi-szerzodest
        #   címkék:
        #     https://epiteszforum.hu/club-aliga-ahogyan-mar-sosem-fogjuk-latni
        #   dosszié, projektinfó/földrajzi hely:
        #     https://epiteszforum.hu/elso-napunk-a-velencei-epiteszeti-biennalen
        if len(other_links) > 0:
            data['sch:keywords'] = other_links
    else:
        tei_logger.log('WARNING', f'{url}: AUTHOR AND KEYWORD TAG ROOT NOT FOUND!')

    section_div = bs.find('div', class_='data clearfix')
    if not section_div:
        section_div = bs.find('div', class_='type')  # gallery articles
    if section_div:
        # 'section / subsection'
        section_parts = section_div.text.strip().split('/')
        data['sch:articleSection'] = section_parts[0].strip()
        if len(section_parts) > 1:
            data['subsection'] = section_parts[1].strip()
    else:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract the metadata of one article, choosing the page scheme by subdomain.

    The subdomain (text between '://' and the first '.') selects the scheme:
    'tech' and 'sarm' use the old magazine layout, 'penz' is not processed,
    every other subdomain uses the header-based layout.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None for the 'penz' subdomain
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    section = url[url.find('://') + 3:url.find('.')]
    if section in SECTION_DICT:
        data['sch:articleSection'] = SECTION_DICT[section]
    else:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')

    if section not in {'tech', 'sarm', 'penz'}:
        # The article header is the second <header> of the page. With fewer than two headers
        # the code used to call find() on the result list itself and crashed; now every
        # header-based field is simply treated as missing.
        headers = bs.find_all('header')
        header = headers[1] if len(headers) > 1 else None

        date_tag = header.find('span', class_='date') if header is not None else None
        if date_tag is not None:
            # 2021. május 27. 12:00, utolsó frissítés: 12:02 - a Transindex.ro portálról
            dates = date_tag.text.strip().split(',')
            parsed_date = parse_date(dates[0], '%Y. %B %d. %H:%M')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
            # The "last update" part after the comma is optional; without it there is
            # nothing to parse (indexing dates[1] used to raise IndexError).
            if len(dates) > 1:
                mod_date = dates[1].strip()
                has_year = any(decade in mod_date for decade in ('201', '202', '200'))
                if mod_date.startswith('utolsó') and not has_year:
                    # Only a time is given: reuse the day of the publication date
                    # (everything before its HH:MM) and append the update time.
                    mod_date = (dates[0][:(dates[0].find(':') - 2)]) + (
                        mod_date[mod_date.find(': ') + 2:mod_date.find(' -')])
                else:
                    # Skip the 18-character 'utolsó frissítés: ' prefix.
                    mod_date = mod_date[18:]
                parsed_moddate = parse_date(mod_date, '%Y. %B %d. %H:%M')
                if parsed_moddate is not None:
                    data['sch:dateModified'] = parsed_moddate
                else:
                    tei_logger.log('WARNING', f'{url}, {mod_date}: MODIFIED DATE TEXT FORMAT ERROR!')
        else:
            tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

        # Title: the header's <h1> first, then any <h1> of the page.
        title = header.find('h1') if header is not None else None
        if title is None:
            title = bs.find('h1')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

        # The text of the 'writer' span is stored as the single keyword.
        author = header.find('span', class_='writer') if header is not None else None
        if author is not None:
            data['sch:keywords'] = [author.text.strip()]
        else:
            tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
        return data

    if section in {'tech', 'sarm'}:
        date_tag = bs.find('ul', {'id': 'UtolsoModP1'})
        # <li>Utolsó frissítés: 16:54 GMT +2, <b>2007. március 22.</b></li>
        date_day = date_tag.find('b') if date_tag is not None else None
        date_line = date_tag.find('li') if date_tag is not None else None
        if date_day is not None and date_line is not None:
            hour_min = date_line.text.strip()[18:23]  # the HH:MM after 'Utolsó frissítés: '
            date = f'{date_day.text.strip()} {hour_min}'
            parsed_date = parse_date(date, '%Y. %B %d. %H:%M')
            if parsed_date is None:
                # Broken dates occur, so fall back to the year alone:
                # <ul id="UtolsoModP1"><li>Utolsó frissítés: 12: 1 GMT +2, <b>2007. 2-.</b></li></ul>
                parsed_date = parse_date(date_day.text.strip()[:4], '%Y')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url}: DATE/YEAR TEXT FORMAT ERROR!')
        else:
            tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

        title = bs.find('span', class_='MagazinCim')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

        subtitle_tag = bs.find('span', class_='MagazinAlcim')
        if subtitle_tag is not None:
            data['sch:alternateName'] = subtitle_tag.text.strip()
        else:
            tei_logger.log('DEBUG', f'{url}: TITLE NOT FOUND IN URL!')

        author = bs.find('span', class_='MagazinSzerzo')
        if author is not None:
            data['sch:author'] = [author.text.strip()]
        else:
            tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
        return data

    # Only 'penz' reaches this point. Its links redirect to the main page, so nothing is
    # extracted (archiving that column is under construction).
    # NOTE(review): the original comment named 'sport' as unprocessable too, but 'sport' is
    # handled by the first branch above; the former 'UNKNOW ARTICLE SCHEMA' branch was unreachable.
    return None
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Extract dates, authors, title, section and keywords of one article page.

    The article root is searched with each (args, kwargs) pair of
    ARTICLE_ROOT_PARAMS_SPEC in turn. The ``author`` div holds the author,
    the publication date and optionally the last modification, separated by '•'.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: address of the article, stored as ``sch:url`` and quoted in log messages
    :param bs: parsed page, queried through ``find``/``find_all``
    :return: the metadata mapping, or None when no article root is found
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    for args, kwargs in ARTICLE_ROOT_PARAMS_SPEC:
        article_root = bs.find(*args, **kwargs)
        if article_root is not None:
            break
    else:
        # Report through the logger like every other message (this was a bare print()).
        tei_logger.log('WARNING', f'{url} ARTICLE BODY ROOT NOT FOUND!')
        return None

    date_and_author = article_root.find('div', class_='author')
    if date_and_author is not None:
        # Some pages separate the fields with this newline/tab run instead of '•'.
        dates_and_author_parts = [
            part.strip()
            for part in date_and_author.text.replace('\n\n\t\t\t\t\t\t', '•').split('•')
        ]
    else:
        # A missing div used to crash on None.text.
        dates_and_author_parts = []

    # Field 0 is the author and field 1 the date, so at least two fields are needed
    # (the old '> 0' check allowed an IndexError on dates_and_author_parts[1]).
    if len(dates_and_author_parts) > 1:
        parsed_date = parse_date(dates_and_author_parts[1], '%Y. %B %d., %H:%M')
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
        if len(dates_and_author_parts) > 2 and dates_and_author_parts[2].startswith('utolsó'):
            mod = dates_and_author_parts[2]
            mod = mod[mod.find(': ') + 2:]  # keep only what follows the 'utolsó ...: ' label
            parsed_moddate = parse_date(mod, '%Y. %B %d., %H:%M')
            if parsed_moddate is not None:
                data['sch:dateModified'] = parsed_moddate
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    if len(dates_and_author_parts) > 0:
        is_author = dates_and_author_parts[0]
        # 'Székelyhon' in the author position is not stored as an author.
        if is_author != 'Székelyhon':
            data['sch:author'] = [author.strip() for author in is_author.split(',')]

    title = article_root.find('h1')  # , class_='maintitle'
    if title is not None:
        data['sch:name'] = title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    # 'scheme://host/section/...' -> index 3 is the first path segment.
    # A URL without a path has no such element, so guard the lookup.
    url_parts = url.split('/')
    section = url_parts[3] if len(url_parts) > 3 else None
    if section in SECTION_DICT:
        data['sch:articleSection'] = SECTION_DICT[section]
    else:
        tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')

    tags = article_root.find('div', class_='tags_con1')
    # len() of the tag is the number of its child nodes, i.e. an empty container counts as missing.
    if tags is not None and len(tags) > 0:
        tags = [tag.text.strip() for tag in tags.find_all('div', class_='tags_item')]
        if len(tags) > 0:
            data['sch:keywords'] = tags
    else:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect article metadata from a page built around ``<div class="maincontent8">``.

    Reads the publication date, section, keywords, author/source and title from the
    ``main-content`` / ``cikkholder`` containers, and the modification date from the
    ``application/ld+json`` script. Every lookup is guarded, so a missing tag is logged
    instead of raising.

    :param tei_logger: logger exposing ``log(level, message)``
    :param url: URL of the article; stored under ``sch:url``
    :param bs: parsed page (queried with ``find``/``find_all``)
    :return: the metadata mapping, or None when the article root is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    article_root = bs.find('div', class_='maincontent8')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: UNKNOWN ARTICLE SCHEMA!')
        return None

    article_main_content = bs.find('div', class_='main-content')
    if article_main_content is not None:
        # DATE: <p>2021.11.29. 13:10</p>
        # NOTE(review): 'recursion' is not a BeautifulSoup option (the child-only switch is
        # 'recursive'), so it acts as an attribute filter here. Kept unchanged so the same <p>
        # is matched as before - confirm the intent.
        date_tag = article_main_content.find('p', recursion=False)
        date_tag_text = date_tag.get_text(strip=True) if date_tag is not None else ''
        if date_tag_text:
            parsed_date = parse_date(date_tag_text, '%Y.%m.%d. %H:%M')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('DEBUG', f'{url}: DATE FORMAT ERROR!')
        else:
            tei_logger.log('DEBUG', f'{url}: DATE TAG NOT FOUND!')

        # ARTICLESECTION, KEYWORDS, AUTHOR(S), and NAME
        cikkholder_tag = article_main_content.find('div', {'id': 'cikkholder'})
        if cikkholder_tag is not None:
            # ARTICLESECTION
            plugin_holder_tag = cikkholder_tag.find('div', class_='plugin-holder')
            section_link_tag = None
            if plugin_holder_tag is not None:
                section_link_tag = plugin_holder_tag.find('a', class_='btn-link')
            section_text = section_link_tag.get_text(strip=True) if section_link_tag is not None else ''
            if section_text:
                data['sch:articleSection'] = section_text
            else:
                tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')

            # KEYWORDS
            # there is duplication of 'a' tags without this level
            keyword_container_tag = cikkholder_tag.find('div', class_='text')
            if keyword_container_tag is not None:
                keyword_texts = (tag.get_text(strip=True)
                                 for tag in keyword_container_tag.find_all('a', {'rel': 'tag'}))
                # get_text() never returns None; an empty string is the "missing" case.
                keywords_list = [keyword for keyword in keyword_texts if keyword]
                if len(keywords_list) > 0:
                    data['sch:keywords'] = keywords_list
                else:
                    tei_logger.log('DEBUG', f'{url}: KEYWORD TAGS NOT FOUND!')
            else:
                tei_logger.log('DEBUG', f'{url}: KEYWORD CONTAINER TAG EMPTY!')

            # AUTHOR(S)
            note_block_tag = article_main_content.find('div', class_='note-block')
            if note_block_tag is not None:
                text_wrap_tag = note_block_tag.find('div', class_='text-wrap')
                author_or_source = text_wrap_tag.get_text(strip=True) if text_wrap_tag is not None else ''
                if not author_or_source:
                    tei_logger.log('DEBUG', f'{url}: AUTHOR TAG TEXT EMPTY!')
                elif author_or_source in SOURCE or author_or_source in SOURCE_SECONDARY:
                    data["sch:source"] = [author_or_source]
                else:
                    # split by: ANY OF THESE ',-–' CHARACTERS FOLLOWED BY WHITESPACE '\s' AND NOT
                    # 'a ', 'az ', 'A ' or 'Az '
                    # regex solution may be over complicated
                    split_list = re.split(r"[,\-\–]\s(?!a\s|az\s|A\s|Az\s)", author_or_source)
                    if split_list[0] != '':
                        source_list, author_list = [], []
                        for name in split_list:
                            # Strip before the membership test so padded names are still
                            # recognised as sources.
                            name = name.strip()
                            if not name:
                                continue
                            if name in SOURCE or name in SOURCE_SECONDARY:
                                source_list.append(name)
                            else:
                                author_list.append(name)
                        if len(author_list) > 0:
                            data['sch:author'] = author_list
                        if len(source_list) > 0:
                            data['sch:source'] = source_list

            # NAME(title):
            title_tag = cikkholder_tag.find('h1')
            article_name_tag_text = title_tag.get_text(strip=True) if title_tag is not None else ''
            if article_name_tag_text:
                data['sch:name'] = article_name_tag_text
            else:
                tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        else:
            tei_logger.log('DEBUG', f'{url}: AUTHOR-KEYWORDS-ARTICLESECTION-NAME TAG NOT FOUND!')

    # DATEMODIFIED: <script type='application/ld+json'...
    date_modified = None
    meta_script_tag = bs.find('script', {'type': 'application/ld+json'})
    if meta_script_tag is not None:
        # tag text is written as str of dict - needs converting to dict /w json
        try:
            date_modified = json.loads(meta_script_tag.get_text(strip=True))["@graph"][-1]["dateModified"]
        except (ValueError, KeyError, IndexError, TypeError):
            # Malformed JSON or an unexpected structure: treat it as "no modification date".
            date_modified = None
    if date_modified is not None:
        parsed_modification_date = parse_date(date_modified, '%Y-%m-%dT%H:%M:%S%z')  # only works python 3.6<
        if parsed_modification_date is not None:
            data['sch:dateModified'] = parsed_modification_date
        else:
            tei_logger.log('DEBUG', f'{url}: MODIFICATION DATE FORMAT ERROR!')
    else:
        tei_logger.log('DEBUG', f'{url}: MODIFICATION DATE NOT FOUND!')
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect article metadata for the index.hu family of sites.

    The layout is chosen from the markup (Mindeközben posts) or from the URL
    (index.hu, velvet.hu, totalcar.hu, totalbike.hu, femina.hu, divany.hu).

    :param tei_logger: logger exposing ``log(level, message)``
    :param url: URL of the article; stored under ``sch:url`` and used to select the layout
    :param bs: parsed page (queried with ``find``/``find_all``)
    :return: the metadata mapping, or None when no known layout matches
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    def set_published_from_og_meta(format_error_message):
        """Fall back to <meta property="og:updated_time"> for the publication date."""
        # <meta property="og:updated_time" content="2021-05-15T08:47:31+02:00
        date_tag = bs.find('meta', {'property': 'og:updated_time'})
        # <time class="updated" datetime="2019-12-27T13:31:40+01:00">
        if date_tag is not None and 'content' in date_tag.attrs:
            # The first 19 characters are the date and time without the UTC offset.
            fallback_date = parse_date(date_tag.attrs['content'][:19], '%Y-%m-%dT%H:%M:%S')
            if fallback_date:
                data['sch:datePublished'] = fallback_date
            else:
                tei_logger.log('WARNING', format_error_message)
        else:
            tei_logger.log('WARNING', f'{url} MISSING DATE')

    if bs.find('div', class_='mindenkozben_post_content content') is not None:
        # MINDEKÖZBEN: 'https://index.hu/mindekozben/poszt/2020/12/21/virusbiztos_pulcsi_karacsonyra/'
        data['sch:articleSection'] = 'Mindeközben'
        pub_date = bs.find('meta', {'name': 'i:publication'})
        pub_date_text = pub_date.get('content') if pub_date is not None else None
        if pub_date_text is not None:
            parsed_date = parse_date(pub_date_text.strip(), '%Y. %m. %d.')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url} UNKNOWN DATE FORMAT 1')
        else:
            tei_logger.log('WARNING', f'{url} MISSING DATE')
        keyword = bs.find('div', class_='heading')
        if keyword is not None:
            data['sch:keywords'] = [keyword.text.strip()]
        author = bs.find('div', class_='name')
        if author is not None:
            data['sch:author'] = [author.text.strip()]
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
        title = bs.find('h3', class_='title')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        return data

    index_family = ('https://index.hu/', 'https://velvet.hu/', 'https://totalcar.hu/', 'https://totalbike.hu/')
    if any(site in url for site in index_family):
        # 'https://index.hu/techtud/2020/03/19/koronavirus_netflix_korlatozas_karanten_europai_unio/'
        # 'https://totalcar.hu/magazin/kozelet/2020/04/05/autozas_a_jarvany_utan_semmi_nem_lesz_olyan_mint_elotte/'
        # <div class="datum"><span>2020.03.19. 20:29</span>
        # <span class="modositas-datuma-gomb" title="Módosítás dátuma">Módosítva: 2020.03.19. 21:16</span></div>
        dates_tag = bs.find('div', class_='datum')
        if dates_tag is not None and len(dates_tag.text.strip()) > 0:
            for span in dates_tag.find_all('span'):
                if span.get('class') == ['modositas-datuma-gomb']:
                    # Skip the 11-character 'Módosítva: ' prefix.
                    parsed_mod_date = parse_date(span.text.strip()[11:], '%Y.%m.%d. %H:%M')
                    if parsed_mod_date is not None:
                        data['sch:dateModified'] = parsed_mod_date
                    else:
                        tei_logger.log('WARNING', f'{url} UNKNOWN MODIFIED DATE FORMAT')
                else:
                    parsed_date = parse_date(span.text.strip(), '%Y.%m.%d. %H:%M')
                    if parsed_date is not None:
                        data['sch:datePublished'] = parsed_date
                    else:
                        set_published_from_og_meta(f'{url} UNKNOWN DATE FORMAT 2')
        else:
            post_pub_date = bs.find('span', class_='ido')
            if post_pub_date is not None:
                # 2020. április 1., 11:53
                parsed_date = parse_date(post_pub_date.text.strip(), '%Y. %B %d., %H:%M')
                if parsed_date is not None:
                    data['sch:datePublished'] = parsed_date
                else:
                    # <span class="ido" data-timestamp="1585734801000"></span>
                    set_published_from_og_meta(f'{url} UNKNOWN DATE FORMAT 3b')
            else:
                tei_logger.log('WARNING', f'{url} MISSING DATE')

        if 'index' in url:
            if bs.find('div', class_='pp-list') is not None:
                # Reports: https://index.hu/belfold/2020/04/01/koronavirus_hirek_aprilis_1/
                title_container = bs.find('div', class_='content-title')
                if title_container is not None:
                    subtitle = title_container.find('h1', class_='alcim')
                    if subtitle is not None:
                        data['sch:alternateName'] = subtitle.text.strip()
                    # Prefer the <h2>; fall back to the first <h1> when there is none.
                    main_title = title_container.find('h2')
                    if main_title is None:
                        main_title = title_container.find('h1')
                    if main_title is not None:
                        data['sch:name'] = main_title.text.strip()
                    else:
                        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
            else:
                # Simple index.hu article
                title = bs.find('div', class_='content-title')
                if title is not None:
                    main_title = title.find('h1')
                    if main_title is not None:
                        data['sch:name'] = main_title.text.strip()
                    subtitle = title.find(class_='alcim')
                    if subtitle is not None:
                        data['sch:alternateName'] = subtitle.text.strip()
                else:
                    # <h3 class="title default"> post címe
                    title = bs.find('h3', class_=['podcast-title', 'title default'])
                    if title is None:
                        title = bs.find('div', class_='_8z50')  # the warc does not contain it
                    if title is not None:
                        data['sch:name'] = title.text.strip()
                    else:
                        # https://index.hu/belfold/2020/05/15/kibeszelo_home_office_koronavirus_elo_adas_facebook_live/
                        # <meta property="og:title" content="Majdnem 12 ezren vannak még karanténban" />
                        title = bs.find('meta', {'property': 'og:title', 'content': True})
                        if title is not None:
                            data['sch:name'] = title.attrs['content'].strip()
                        else:
                            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        else:
            # VELVET
            # 'https://velvet.hu/gumicukor/2020/03/08/lakatos_mark_milanoi_divathet/'
            title = bs.find('h1')
            if title is not None:
                data['sch:name'] = title.text.strip()
            else:
                tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

        # Try the author markups in order; an EMPTY result must fall through to the next one.
        authors = [a.text.strip() for a in bs.find_all('a', rel='author')]
        if len(authors) == 0:
            authors = [a.text.strip() for a in bs.find_all('div', class_='szerzo')]
        if len(authors) == 0:
            authors = [a.text.strip() for a in bs.find_all('div', class_='c-human_details_infos')]
        if len(authors) == 0:
            post_author = bs.find('div', class_='name')  # közvetítés post
            if post_author is not None:
                authors = [post_author.text.strip()]
        if len(authors) > 0:
            data['sch:author'] = authors
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

        cimkek = bs.find('ul', class_=["cikk-cimkek", "m-tag-list"])
        if cimkek is not None:
            tags = [a.text.strip() for a in cimkek.find_all('a', class_='cimke-rovat-light')]
            if len(tags) > 0:
                data['sch:keywords'] = tags
            else:
                tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
            if 'index.hu' in url:
                section = cimkek.find('a', class_='cimke-rovat')
                section_text = section.text.strip() if section is not None else None
            else:
                # Velvet
                # velvet: <meta name="news_keywords" content="Élet" />
                section = bs.find('meta', {'name': 'news_keywords'})
                section_text = section.get('content') if section is not None else None
            if section_text is not None:
                data['sch:articleSection'] = section_text
            else:
                tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
        else:
            tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')
        return data

    if 'https://femina.hu/' in url:
        # bs.find('header', class_='m-femina-header')
        # https://femina.hu/egeszseg/koronavirus-immunitas/
        title = bs.find('div', class_='cim')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
        # The first meta item is used as the author, the second as the publication date.
        author_and_date = [li.text.strip() for li in bs.find_all('li', class_='article-meta-item')]
        if len(author_and_date) > 0:
            data['sch:author'] = [author_and_date[0]]
            if len(author_and_date) > 1:
                parsed_date = parse_date(author_and_date[1], '%Y.%m.%d.')
                if parsed_date is not None:
                    data['sch:datePublished'] = parsed_date
                else:
                    tei_logger.log('WARNING', f'{url} UNKNOWN DATE FORMAT 4')
            else:
                tei_logger.log('WARNING', f'{url} MISSING DATE')
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR TAG AND DATE CONTAINER NOT FOUND!')
        cimkek = bs.find('ul', class_="cikk-cimkek-list")
        if cimkek is not None:
            # The first light tag is skipped - presumably it repeats the section; verify.
            tags = [a.text.strip() for a in cimkek.find_all('a', class_='cimke-rovat-light')][1:]
            if len(tags) > 0:
                data['sch:keywords'] = tags
            section = cimkek.find('a', class_='cimke-rovat')
            if section is not None:
                data['sch:articleSection'] = section.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TAGS AND SECTION NOT FOUND!')
        return data

    if 'https://divany.hu/' in url:
        # bs.find('div', class_='e-article-body'):
        # index/divany: 'https://divany.hu/szuloseg/2020/04/16/jarvany-visszaeles-zaklatas'
        title = bs.find('h1', class_='t-article-head_text_title')
        if title is None:
            title = bs.find('h2', class_='t-article-head_text_title')
        if title is not None:
            data['sch:name'] = title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND!')
        authors = [a.text.strip() for a in bs.find_all('a', rel='author')]
        if len(authors) == 0:
            # <a class="c-human_details_infos_name
            single_author = bs.find('a', class_='c-human_details_infos_name')
            if single_author is not None:
                authors = [single_author.text.strip()]
        if len(authors) > 0:
            data['sch:author'] = authors
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
        # datetime="2021-01-20T08:00:06+01:00
        date_tag = bs.find('time', class_='t-asd_share-date_date')
        parsed_date = None
        if date_tag is not None and date_tag.get('datetime'):
            parsed_date = parse_date(date_tag.attrs['datetime'][:19], '%Y-%m-%dT%H:%M:%S')
        # Only store a date that actually parsed; a failed parse is reported like a missing tag.
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url} MISSING DATE OR UNKONOWN DATE FORMAT')
        cimkek = bs.find('ul', class_="m-tags")
        if cimkek is not None:
            tags = [a.text.strip() for a in cimkek.find_all('a', class_='cimke-rovat-light')]
            if len(tags) > 0:
                data['sch:keywords'] = tags
            section = cimkek.find('a', class_='cimke-rovat')
            if section is not None:
                data['sch:articleSection'] = section.text.strip()
            else:
                tei_logger.log('WARNING', f'{url}: SECTION NOT FOUND!')
        else:
            tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
        return data

    # No known layout matched: callers receive None, exactly as with the former implicit return.
    return None
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect article metadata from a page whose body is an ``<article>`` element.

    Reads the publication and modification dates, title, subtitle, author or source,
    section and keywords. Links in the header column line that are not known main columns
    are kept as keywords, together with the links of the tag box.

    :param tei_logger: logger exposing ``log(level, message)``
    :param url: URL of the article; stored under ``sch:url``
    :param bs: parsed page (queried with ``find``/``find_all``)
    :return: the metadata mapping, or None when there is no ``<article>`` element
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    article_root = bs.find('article')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    date_tag = bs.find('span', class_='en-article-dates-main')
    if date_tag is not None:
        # 2019. augusztus 23. péntek 14:22
        parsed_date = parse_date(date_tag.text.strip(), '%Y. %B %d. %A %H:%M')
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    modified_date_tag = bs.find('span', class_='en-article-dates-updated')
    if modified_date_tag is not None:
        # 2018. 05. 09. 07:46
        parsed_modified_date = parse_date(modified_date_tag.text.strip(), '%Y. %m. %d. %H:%M')
        if parsed_modified_date is not None:
            data['sch:dateModified'] = parsed_modified_date
        else:
            tei_logger.log('WARNING', f'{url}: MODIFIED DATE TEXT FORMAT ERROR!')

    # The title is the <h1> inside the title container; either of them may be missing.
    title = article_root.find('div', class_='et_main_title')
    article_title = title.find('h1') if title is not None else None
    if article_title is not None:
        data['sch:name'] = article_title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

    subtitle = article_root.find('div', class_='en-article-subtitle')
    if subtitle is not None:
        data['sch:alternateName'] = subtitle.text.strip()

    author = article_root.find('div', class_='en-article-author')
    source = article_root.find('div', class_='en-article-source col-sm')
    if author is not None:
        data['sch:author'] = [author.text.strip()]
    elif source is not None:
        # In case if not an author, only source (MTI)
        # NOTE(review): stored as a plain string while 'sch:author' is a list - confirm
        # which shape the consumers expect.
        data['sch:source'] = source.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    article_tags = []
    section_line = article_root.find('span', class_='en-article-header-column')
    if section_line is not None:
        for col in (a.text for a in section_line.find_all('a')):
            if col in KNOWN_MAIN_COLUMNS:
                data['sch:articleSection'] = col
            elif col:
                article_tags.append(col)
    else:
        tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')

    keywords_root = article_root.find('div', class_='en-article-tags')
    if keywords_root is not None:
        article_tags.extend(a.text.strip() for a in keywords_root.find_all('a', rel='tag'))
    else:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    # Store the keywords even when only the header line supplied them, and never store an
    # empty list.
    if len(article_tags) > 0:
        data['sch:keywords'] = article_tags
    return data
def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect article metadata from a page built around ``<div class="article-content-elements">``.

    Reads the publication date, title, author(s), keywords and source. Pages that also have
    a ``<div class="group-right">`` block get their authors and sources from the ``<h4>``
    headings. The source is otherwise taken from a dedicated field or guessed from the
    closing paragraphs of the body.

    :param tei_logger: logger exposing ``log(level, message)``
    :param url: URL of the article; stored under ``sch:url``
    :param bs: parsed page (queried with ``find``/``find_all``)
    :return: the metadata mapping, or None when the article root is missing
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    article_root = bs.find('div', class_='article-content-elements')
    percrol_root = bs.find('div', class_='group-right')
    author_root = bs.find('div', class_='field--items')
    tag_root = bs.find('div', class_='field field--name-field-tags'
                                     ' field--type-entity-reference field--label-hidden field--items')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND OR UNKNOWN ARTICLE SCHEME!')
        return None

    if percrol_root is not None:
        # Every <h4> that is not an event title names an author or a source.
        event_titles = set(bs.find_all('h4', class_='esemeny-title'))
        known_sources = set(SOURCE_LIST)
        # dict.fromkeys de-duplicates while keeping document order, so the output is
        # deterministic.
        names = list(dict.fromkeys(
            text for text in (h4.text.strip() for h4 in bs.find_all('h4') if h4 not in event_titles) if text))
        if len(names) > 0:
            perc_authors = [name for name in names if name not in known_sources]
            perc_sources = [name for name in names if name in known_sources]
            if len(perc_authors) > 0:
                data['sch:author'] = perc_authors
            if len(perc_sources) > 0:
                # A list (not a set) keeps the value ordered.
                data['sch:source'] = perc_sources
        else:
            tei_logger.log('WARNING', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')

    date_tag = bs.find('div', class_='article-dates')
    if date_tag is not None:
        parsed_date = parse_date(date_tag.text.strip().replace(' |', ''), '%Y. %B %d. %H:%M')
        # Only store a date that actually parsed.
        if parsed_date is not None:
            data['sch:datePublished'] = parsed_date
        else:
            tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    title = bs.find('h1', class_='page-title')
    if title is not None:
        data['sch:name'] = title.text.strip()
    else:
        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')

    if author_root is not None:
        author_list = [h4.text.strip() for h4 in author_root.find_all('h4')]
        # An empty result must not overwrite authors that were already collected above.
        if len(author_list) > 0:
            data['sch:author'] = author_list
    else:
        tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')

    if tag_root is not None:
        keywords_list = [a.text.strip() for a in tag_root.find_all('a')]
        if len(keywords_list) > 0:
            data['sch:keywords'] = keywords_list
    else:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')

    # SOURCE: a dedicated field wins; otherwise guess from the closing paragraphs.
    # NOTE(review): the nesting of the paragraph heuristics below was reconstructed from
    # whitespace-mangled text - confirm it against the intended behaviour.
    source_field = article_root.find(
        'div', class_='field field--name-field-forras field--type-string field--label-inline')
    paragraphs = article_root.find_all('p')
    if source_field is not None:
        source_item = source_field.find('div', class_='field--item')
        if source_item is not None:
            data['sch:source'] = source_item.text.strip()
    elif len(paragraphs) > 0:
        last_paragraph = paragraphs[-1].text.strip()
        if len(last_paragraph) > 0:
            if last_paragraph[0] == '(' and last_paragraph[-1] == ')':
                # '(MTI)' -> 'MTI'
                data['sch:source'] = last_paragraph[1:-1]
            elif len(last_paragraph) < 40:
                # A short closing paragraph is taken to be the source line.
                data['sch:source'] = last_paragraph
        else:
            # The last paragraph is empty: with exactly three paragraphs, try the one before it.
            body_field = article_root.find('div', class_='field field--name-body field--type-text-with-'
                                                         'summary field--label-hidden field--item')
            if body_field is not None and len(paragraphs) == 3:
                previous_paragraph = paragraphs[-2].text.strip()
                if 0 < len(previous_paragraph) < 40:
                    data['sch:source'] = previous_paragraph
    else:
        tei_logger.log('WARNING', f'{url}: SOURCE TAG NOT FOUND!')
    return data