async def read_article(self, url, parse_headline=True, parse_date=True):
    article_html = await self._get(url)

    headline = ""
    date = ""
    text = []

    if parse_headline:
        headline_match = re.search(r">([^<]+)<\/h1>", article_html)
        if not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))

    if parse_date:
        date_match = re.search(
            r'date">\s+(\w+ \d+, \d+ \d+:\d+\w\w\s+)<\/span>', article_html)
        if not date_match:
            return None
        date = text_to_datetime(date_match.group(1))

    for p_match in re.finditer(r"<p>([\s\S]+?)<\/p>", article_html):
        paragraph = clean_html_text(p_match.group(1)).replace(
            "Already have an account? Login", "")
        if paragraph.count(" ") <= 1 or string_contains(
                paragraph, IGNORE_TEXT):
            continue
        text.append(paragraph)

    if len(text) == 0:
        return None

    return ("benzinga", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True):
    article_html = await self._get(url)

    text = []
    headline = ""
    date = ""

    if parse_headline:
        headline_match = re.search(r'itemprop="headline">([^<]+)<\/h1>',
                                   article_html)
        if not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))

    if parse_date:
        date_match = re.search(r"(\w+ \d+, \w+ \d+:\d+ \w+ \w+)\s+<\/time>",
                               article_html)
        if not date_match:
            return None
        date = text_to_datetime(date_match.group(1))

    for p_match in re.finditer(r"<p>([\s\S]+?)<\/p>", article_html):
        paragraph = clean_html_text(p_match.group(1))
        if len(paragraph) == 0 or string_contains(paragraph, IGNORE_TEXT):
            continue
        text.append(paragraph)

    if len(text) == 0:
        return None

    return ("barrons", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True): article_html = await self._get(url) text = [] headline = "" date = "" if parse_headline: headline_match = re.search(r">([^<]+)<\/h1>", article_html) if not headline_match: return None headline = clean_html_text(headline_match.group(1)) if parse_date: date_match = re.search(r'datetime="([\d\-T:Z\.]+)" itemprop="datePublished"', article_html) if date_match: date = text_to_datetime(date_match.group(1)) else: return None for p_match in re.finditer(r"<(span|p) [^>]+>([\s\S]+?)<\/(span|p)>", article_html): paragraph = clean_html_text(p_match.group(2)) if paragraph.count(" ") <= 2 or string_contains(paragraph, IGNORE_TEXT) or paragraph[0] == ")": continue if "list is empty" in paragraph: break text.append(paragraph) if len(text) == 0: return None return ("yahoo", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url): if "video/" in url: return None article_html = await self._get(url) headline_match = re.search(r">([^<]+)<\/h1>", article_html) if not headline_match: return None headline = clean_html_text(headline_match.group(1)) date_match = re.search(r'Publish Date" datetime="([\d\-:T+]+)">', article_html) if date_match: date = text_to_datetime(date_match.group(1)) else: return None text = [] for p_match in re.finditer(r"<p>([\s\S]+?)<\/p>", article_html): paragraph = clean_html_text(p_match.group(1)) if paragraph.count(" ") <= 2 or string_contains( paragraph, IGNORE_TEXT): continue text.append(paragraph) if len(text) == 0: return None return ("thestreet", headline, date, "\n\n\n".join(text), self.url + url)
async def read_prs_with_regex(self,
                              regex,
                              url_path,
                              type_to_group={
                                  "date": 1,
                                  "url": 2,
                                  "title": 3
                              },
                              full_url_path=False,
                              article_url_base=None,
                              **kwargs):
    # type_to_group maps each field to its regex group index; a date index
    # of -1 means the page carries no dates and the current time is used.
    req_url = self._url + url_path
    if full_url_path:
        req_url = url_path

    resp = await self._get(req_url, **kwargs)

    releases = []
    for match in re.finditer(regex, resp):
        if type_to_group["date"] != -1:
            date = text_to_datetime(
                match.group(type_to_group["date"]).strip())
        else:
            date = pendulum.now()
        if article_url_base is None:
            url = self._url + match.group(type_to_group["url"]).strip()
        else:
            url = article_url_base + match.group(
                type_to_group["url"]).strip()
        url = url.replace(" ", "%20")
        title = clean_html_text(match.group(type_to_group["title"]))
        if len(title) == 0:
            continue
        releases.append((self.NAME.lower(), title, date, "", url))

    return self.SYMBOL, self.NAME, releases
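# A hypothetical call illustrating how read_prs_with_regex is meant to be
# used; the regex and URL path below are invented for illustration, not
# taken from any real press-release page:
#
#     symbol, name, releases = await self.read_prs_with_regex(
#         r'<span class="date">([^<]+)</span>\s*<a href="([^"]+)">([^<]+)</a>',
#         "/news/press-releases",
#     )
#
# With the default type_to_group mapping, group 1 is parsed as the release
# date, group 2 becomes the article URL (prefixed with self._url unless
# article_url_base is given), and group 3 becomes the title.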
async def read_prs(self):
    resp = await self._get(
        self._url + "/phpSide/index.php",
        method="POST",
        form_params={"url": "News"},
        headers={
            "referer": "https://www.selectabio.com/investors&media/news&events/",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            + " (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
        },
    )

    releases = []
    try:
        items = json.loads(resp)["data"]
    except Exception:
        items = []

    for item in items:
        date = text_to_datetime(item["releaseDate"]["dateUTC"])
        url = item["link"]["hostedUrl"]
        title = clean_html_text(item["title"])
        releases.append((self.NAME.lower(), title, date, "", url))

    return self.SYMBOL, self.NAME, releases
async def read_article(self, url, parse_headline=True, parse_date=True): article_html = await self._get(url) text = [] headline = "" date = "" if parse_headline or parse_date: date_match = re.search(r"(\w+ \d+, \d{4}) \/\s+(\d+:\d+ \w+) ", article_html) headline_match = re.search( r'ArticleHeader_headline">([^<]+)<\/h1>', article_html) if not date_match or not headline_match: return None headline = clean_html_text(headline_match.group(1)) date = text_to_datetime( date_match.group(1) + " " + date_match.group(2)) start_idx = article_html.index("StandardArticleBody_body") try: end_idx = article_html.index("Attribution_container") except ValueError: end_idx = len(article_html) content_html = article_html[start_idx:end_idx] for paragraph_match in re.finditer(r"<p>([^<]+)<\/p>", content_html): paragraph = clean_html_text(paragraph_match.group(1)) if paragraph.count(" ") > 1: text.append(paragraph) if len(text) == 0: return None return ("reuters", headline, date, "\n\n\n".join(text), self.url + url)
async def read_prs_from_api(self, path, method="GET", params=None):
    releases = []

    try:
        resp = await self._get(self._url + path,
                               method=method,
                               json_params=params)
        data = json.loads(resp)["GetPressReleaseListResult"]
    except Exception:
        return self.SYMBOL, self.NAME, releases

    for item in data:
        date = text_to_datetime(item["PressReleaseDate"])
        url = self._url + item["LinkToDetailPage"]
        title = clean_html_text(item["Headline"])
        releases.append((self.NAME.lower(), title, date, "", url))

    return self.SYMBOL, self.NAME, releases
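# Illustrative shape of the payload read_prs_from_api expects; the field
# names come from the code above, but the values are invented:
#
#     {"GetPressReleaseListResult": [
#         {"PressReleaseDate": "05/04/2020 08:00:00",
#          "LinkToDetailPage": "/news/news-details/2020/Example-Release/",
#          "Headline": "Example Co. Reports First Quarter Results"}]}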
async def read_article(self,
                       url,
                       parse_headline=True,
                       parse_date=True,
                       parse_text=True):
    article_html = await self._get(url)

    text = []
    headline = ""
    date = None

    if parse_headline:
        headline_match = re.search(r'itemprop="headline">([^<]+)<',
                                   article_html)
        if not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))
        if string_contains(headline, IGNORE_HEADLINE):
            return None

    if parse_date:
        date_match = re.search(r'content="([\d\-T:Z]+)" itemprop="datePub',
                               article_html)
        if not date_match:
            return None
        date = text_to_datetime(date_match.group(1))

    if parse_text:
        for bullet_match in re.finditer(
                r'<p class="bullets_li">([^<]+?)<\/p>', article_html):
            bullet_text = clean_html_text(bullet_match.group(1))
            if len(bullet_text) == 0 or string_contains(
                    bullet_text, IGNORE_TEXT):
                continue
            text.append(bullet_text)
        for p_match in re.finditer(r'<p class="p p1">([^<]+?)<\/p>',
                                   article_html):
            p_text = clean_html_text(p_match.group(1))
            if len(p_text) == 0 or string_contains(p_text, IGNORE_TEXT):
                continue
            text.append(p_text)
        if len(text) < 2:
            return None

    return ("seekingalpha", headline, date, "\n\n\n".join(text),
            self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True): article_html = await self._get(url) text = [] headline = "" date = "" if parse_headline: headline_match = re.search(r'pg-headline">([^<]+)<\/h1>', article_html) if not headline_match: return None headline = clean_html_text(headline_match.group(1)) if parse_date: update_date_match = re.search( r"Updated (\d+:\d+ \w+ \w+), \w+ (\w+ \d+, \d+)", article_html) if update_date_match: date = text_to_datetime( update_date_match.group(2) + " " + update_date_match.group(1)) else: return None for p_match in re.finditer( r'<(\w{1,3}) class="zn-body__paragraph[ a-z]*">([\s\S]+?)<\/\1>', article_html): paragraph = clean_html_text(p_match.group(2)) paragraph = re.sub(r"\)([A-Z])", r") - \1", paragraph) if paragraph.count(" ") <= 1 or string_contains( paragraph, IGNORE_TEXT): continue text.append(paragraph) if len(text) == 0: return None return ("cnn", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True): article_html = await self._get(url) text = [] headline = "" date = "" if parse_headline: headline_match = re.search(r'itemprop="headline">([\s\S]+?)<\/h1>', article_html) if not headline_match: return None headline = clean_html_text(headline_match.group(1)) if parse_date: date_match = re.search(r"Published: ([^<]+?)<\/time>", article_html) if not date_match: return None date = text_to_datetime(date_match.group(1)) try: start_idx = article_html.index("articleBody") except: return None try: end_idx = article_html.index("author-commentPromo") except ValueError: end_idx = len(article_html) content_html = article_html[start_idx:end_idx] for paragraph_match in re.finditer(r"<p>([\s\S]+?)<\/p>", content_html): p = clean_html_text(paragraph_match.group(1)) if len(p) >= 30 and not string_contains(p, IGNORE_TEXT): text.append(p) return ("marketwatch", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url):
    article_html = await self._get(url)

    headline_match = re.search(r">([^<]+)<\/h1>", article_html)
    if not headline_match:
        return None
    headline = clean_html_text(headline_match.group(1))
    if headline == "Whoops!":
        return None

    date_match = re.search(r'data-timestamp="([\-+\dTZ:]+)"', article_html)
    if not date_match:
        return None
    date = text_to_datetime(date_match.group(1))

    article_match = re.search(
        r'<script type="application\/ld\+json">([\s\S]+?)<\/script>',
        article_html)
    if not article_match:
        return None
    article_text = json.loads(article_match.group(1).strip())["articleBody"]

    if ">>" in article_text:
        article_text = article_text[:article_text.index(">>")]
    # The JSON-LD body loses spacing between sentences; re-insert it at
    # lowercase-to-uppercase and period-to-uppercase boundaries.
    article_text = re.sub(r"([a-z])([A-Z])", r"\1. \2", article_text)
    article_text = re.sub(r"\.([A-Z])", r". \1", article_text)
    for dtext in DELETE_TEXT:
        article_text = article_text.replace(dtext, "")

    text = clean_html_text(article_text).split("\n")
    text = [p for p in text if not string_contains(p, IGNORE_TEXT)]

    return ("businessinsider", headline, date, "\n\n\n".join(text),
            self.url + url)
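# The readers above assume three shared helpers: clean_html_text,
# string_contains, and text_to_datetime. Minimal sketches follow, assuming
# the simplest plausible behavior consistent with how the readers call them
# (the real implementations may normalize more aggressively):

import html
import re

import pendulum


def clean_html_text(text):
    # Drop any remaining tags, decode entities, and collapse runs of spaces
    # and tabs; newlines are kept, since businessinsider splits on them.
    text = re.sub(r"<[^>]+>", "", text)
    text = html.unescape(text)
    return re.sub(r"[ \t]+", " ", text).strip()


def string_contains(text, fragments):
    # True if any of the given fragments appears in the text; IGNORE_TEXT,
    # IGNORE_HEADLINE, and DELETE_TEXT are assumed to be lists of strings.
    return any(fragment in text for fragment in fragments)


def text_to_datetime(text):
    # Parse a loosely formatted date string into a pendulum datetime;
    # strict=False lets pendulum fall back to flexible dateutil parsing.
    return pendulum.parse(text.strip(), strict=False)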