def collectData():
    data = []
    # Cycle through every page on the website
    for i in range(1, 4):
        print(i)
        sauce = urllib.request.urlopen(base_url + str(i)).read()
        soup = bs.BeautifulSoup(sauce, 'html.parser', parse_only=only_tr_tags)
        articles = soup.find_all('tr', {'style': ''})
        for article in articles:
            dataPart = {}
            link_title = article.find('a', {'class': 'press-link'})
            if link_title is not None:
                # Title of the article
                dataPart['title'] = link_title.get_text()
                # Link of the article
                dataPart['link'] = link_title.get('href')
                # Date of the article
                date = article.find('td', {'style': ''})
                if date is not None:
                    dataPart['date'] = date.get_text()
                # Add the dataPart to the data, skipping rows without a title
                if dataPart and dataPart['title'] != '':
                    data.append(dataPart)
    return data
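# Assumed module-level setup for collectData(); a minimal sketch, not the original
# configuration. base_url and only_tr_tags are referenced above but not defined in
# this snippet, so the values below are placeholders.
import urllib.request
import bs4 as bs
from bs4 import SoupStrainer

base_url = 'https://example.com/press?page='  # hypothetical press-release listing URL
only_tr_tags = SoupStrainer('tr')             # parse only <tr> elements for speed

if __name__ == '__main__':
    for item in collectData():
        print(item.get('date'), item['title'], item['link'])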
def checkDate(cur_url):
    # Check whether the page currently being crawled carries today's news
    html_content = gethtml(cur_url)
    soup = BeautifulSoup(html_content, 'html.parser')
    date = soup.find('span')
    return date.get_text() == todayDate()
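# gethtml() and todayDate() are not defined in this snippet; the sketch below shows
# one plausible shape for them, assuming requests is available and the target page
# prints its date in a '%Y-%m-%d'-style <span> (both are assumptions).
import datetime
import requests

def gethtml(url):
    # Fetch the raw HTML for a page; error handling kept minimal on purpose.
    return requests.get(url, timeout=10).text

def todayDate():
    # Return today's date in the format the target site is assumed to use.
    return datetime.date.today().strftime('%Y-%m-%d')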
def get_fin_stat_links():
    """
    Get URLs of new financial-statement news.
    Source: https://zse.hr/default.aspx?id=36774&ticker1=&Page=1
    """
    url = 'https://zse.hr/default.aspx?id=36774&ticker1=&Page=1'
    res = get_request(url)
    bs = BeautifulSoup(res.text, features="lxml")
    link = [a['href'] for a in bs.find_all("a", href=True)]
    link = ["http://www.zse.hr/" + l for l in link if "UserDocsImages/financ" in l]
    dtime = [date.get_text().strip() for date in bs.select('.vijestRowDatumDat')]
    dtime = pd.to_datetime(dtime)
    return dtime, link
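# Usage sketch: pair the scraped dates with their links, assuming the page lists one
# date per statement link (an assumption). get_request() is not defined here and is
# assumed to behave like requests.get.
import pandas as pd

dates, links = get_fin_stat_links()
statements = pd.DataFrame({'date': dates, 'link': links})
# Keep only today's announcements, if any.
today_only = statements[statements['date'].dt.date == pd.Timestamp.today().date()]
print(today_only)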
def read_posts_bitcointalk(self, response):
    url_post_string = ['topic']
    if any(substring in response.url for substring in url_post_string):
        self.pages_crawled += 1
        self.check_max_pages()
        soup = BeautifulSoup(response.body, "html.parser")
        texts_raw = soup.find_all('div', class_="post")
        dates_raw = soup.find_all('div', class_="smalltext")
        dates = []
        for date in dates_raw:
            date = date.get_text()
            if any(substring in date for substring in date_word_list) and len(date) < 30:
                date = convert_date_to_unix_time(date)
                dates.append(date)
        texts = []
        for text in texts_raw:
            text = text.get_text().encode('utf-8')
            if not text.isdigit():
                texts.append(text)
        filename_date = "temp_date_output.txt"
        filename_text = "temp_text_output.txt"
        for filename in (filename_date, filename_text):
            try:
                os.remove(filename)
            except OSError:
                pass
        # pickle requires binary mode; the original text/append mode fails on Python 3
        with open(filename_date, "wb") as f1:
            pickle.dump(dates, f1)
        with open(filename_text, "wb") as f2:
            pickle.dump(texts, f2)
    url_board_string = ["board=5", "board=7", "board=8"]
    if any(substring in response.url for substring in url_board_string):
        self.parse(response)
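# date_word_list and convert_date_to_unix_time() are external to this snippet. A
# minimal sketch of what they might look like, assuming bitcointalk timestamps such
# as 'January 05, 2020, 10:30:15 AM' (the exact format is an assumption):
import time
from datetime import datetime

date_word_list = ['January', 'February', 'March', 'April', 'May', 'June',
                  'July', 'August', 'September', 'October', 'November', 'December']

def convert_date_to_unix_time(date_string):
    # Parse the forum timestamp and return it as seconds since the epoch.
    parsed = datetime.strptime(date_string.strip(), '%B %d, %Y, %I:%M:%S %p')
    return int(time.mktime(parsed.timetuple()))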
def news_sentiments(self):
    # Returns news articles curated via Finviz, Yahoo, and Google News.
    # TODO: also pull unusual option activity.
    BASE_URL = f'https://finviz.com/quote.ashx?t={self.ticker}'
    soup = self._get_soup(BASE_URL)
    table = soup.find('table', {'class': 'fullview-news-outer'})
    rows = table.find_all('tr')
    df_data = []
    for row in rows:
        date = row.find('td', {'align': 'right'})
        article = row.find('td', {'align': 'left'})
        link = article.find('a')['href']
        df_data.append((date.get_text(), article.get_text(), link))
    df = pd.DataFrame(df_data, columns=['Time', 'Headline', 'Link'])

    YAHOO_NEWS_CLASS = ('js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) '
                        'LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 '
                        'LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) '
                        'C(#0078ff):h C(#000) LineClamp(2,46px) '
                        'LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled')

    BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/news?p={self.ticker}'
    soup = self._get_soup(BASE_URL)
    links = soup.find_all('a', {'class': YAHOO_NEWS_CLASS})
    news = [(link.get_text(), 'yahoo.com' + link['href']) for link in links]

    BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/press-releases?p={self.ticker}'
    soup = self._get_soup(BASE_URL)
    links = soup.find_all('a', {'class': YAHOO_NEWS_CLASS})
    press_releases = [(link.get_text(), 'yahoo.com' + link['href']) for link in links]

    # Look for keywords in the news? Any showcases, investor/analyst days, analyst
    # revisions, management transitions, product launches, significant stock-buyback changes.

    # Getting news from a Google News search
    googlenews = GoogleNews(lang='en', period='14d')  # specify period for news
    googlenews.get_news(f'${self.ticker} stock')
    stock_news = googlenews.results()
    # print([(i, j) for i, j in zip(googlenews.get_texts(), googlenews.get_links())])
    # To get other pages, do googlenews.get_page(2), etc.
    # Have a whitelist of websites to search articles from. Maybe have keywords to filter out noise.

    sectors = self.find_competition()
    sector_news = []
    if sectors:
        for sector in sectors:
            googlenews = GoogleNews(lang='en', period='14d')
            googlenews.get_news(f'{sector} sector stocks')
            sector_news.append(googlenews.results())
    return df, news, press_releases, sector_news, stock_news
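# self._get_soup() is defined elsewhere in the same class; a plausible sketch is
# shown below, assuming it simply fetches the page with a browser-like User-Agent
# and parses it with BeautifulSoup (both assumptions).
import requests
from bs4 import BeautifulSoup

def _get_soup(self, url):
    # Finviz and Yahoo both reject requests that carry no User-Agent header.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(response.text, 'html.parser')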
def get_bouhgt_house(config, source):
    url = 'http://sh.lianjia.com/chengjiao/'
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Work out how many result pages exist from the numeric pager links
    house_pages = soup.select('body > div.wrapper > div.main-box.clear > div > div.page-box.house-lst-page-box > a')
    pages = '1'  # fall back to a single page if no numeric pager link is found
    for page in house_pages:
        if page.get_text().isdigit():
            pages = page.get_text()
        else:
            break
    url_base = 'http://sh.lianjia.com/chengjiao/'
    for page in range(1, int(pages) + 1):
        print('present page is------------------', page, '------------------', '\n')
        more_page = 'd' + str(page)
        url = url_base + more_page
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        house_name = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > h2 > a')
        prices_per_area = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div:nth-of-type(2) > div')
        bought_date = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div:nth-of-type(1) > div')
        prices = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div.fr > div')
        for name, price_per_area, date, price in zip(house_name, prices_per_area, bought_date, prices):
            names = name.get_text()
            name_layout_area = names.split(' ')
            name = name_layout_area[0].encode('UTF-8', 'ignore')
            layout = name_layout_area[1]
            area = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*', name_layout_area[2])
            price_per_area = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*', price_per_area.get_text())
            date = date.get_text()
            price = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*', price.get_text())
            connection = pymysql.connect(**config)
            try:
                with connection.cursor() as cursor:
                    # Execute the SQL statement to insert one record
                    sql = ('INSERT INTO house_bought (name, price, area, layout, source, '
                           'price_per_area, bought_date, import_date) '
                           'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                    # re.findall returns lists, so take the first match of each numeric field
                    cursor.execute(sql, (name, price[0], area[0], layout, source,
                                         price_per_area[0], date, present_date))
                    # Autocommit is not enabled, so commit explicitly to persist the insert
                    connection.commit()
            finally:
                connection.close()
            time.sleep(1)
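# Usage sketch: a minimal pymysql config and call. The database credentials and the
# present_date global are assumptions; the table columns are taken from the INSERT
# statement above.
import datetime

present_date = datetime.date.today().isoformat()  # assumed global used by the insert
db_config = {
    'host': 'localhost',
    'user': 'scraper',
    'password': 'secret',
    'db': 'housing',
    'charset': 'utf8mb4',
}
get_bouhgt_house(db_config, source='lianjia')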
        'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    if date[0].lower() in monthDict:
        month = monthDict[date[0].lower()]
    else:
        month = date[0].lower()
    day = date[1]
    year = date[2]
    fullDate = date[0] + ' ' + date[1] + ' ' + date[2]
    return fullDate, day, month, year


date, day, month, year = treat_date(removeDC(date.get_text()))
text.get_text()

newsDict = {
    'content': [], 'date': [], 'source': [], 'categoria': [],
    'day': [], 'month': [], 'year': [], 'link': []
}

categorias = [
    'politics', 'technology', 'health', 'education', 'immigration',
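# removeDC() is referenced above but not defined in this fragment; a minimal sketch,
# assuming it only normalizes the scraped date text into the month/day/year tokens
# that treat_date() indexes (name and behavior are assumptions):
def removeDC(raw_text):
    # Collapse whitespace, drop commas, and return the date tokens as a list.
    return raw_text.strip().replace(',', '').split()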
driver.HideCommandPromptWindow = True
print('URL:')
print(URL)
driver.get(URL)
# Wait for the page to load
sleep(5)
keepLoading = True
iterations = 0
print('')
print('*******' + Name + '*******')
while keepLoading:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for days in soup.find_all('div', {'class': 'trn-gamereport-list__group'}):
        date = days.find('h3', {'class': 'trn-gamereport-list__title'})
        if date is not None:
            date = date.get_text().strip()
            if date == 'Matches Today':
                # TODO: the site doesn't expose the year, so this breaks beyond 2020
                date = today.strftime("2020-%m-%d")
            else:
                month = date[:3]
                month = str(strptime(month, '%b').tm_mon).zfill(2)
                day = date[-2:].zfill(2)
                # TODO: the site doesn't expose the year, so this breaks beyond 2020
                date = '2020-' + str(month) + '-' + str(day)
            if date < startDate:
                keepLoading = False
            print(date)
    if keepLoading:
        print('load')
        # Scroll down to reveal the "load more matches" button
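        # A sketch of how the loop might continue from the comment above, assuming a
        # Selenium WebDriver and a button whose text contains 'Load More' (the
        # selector is an assumption, not taken from the original code).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)
        try:
            load_more = driver.find_element("xpath", "//button[contains(., 'Load More')]")
            load_more.click()
            iterations += 1
        except Exception:
            # No button left to click, so stop paging.
            keepLoading = False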
def getContent(self, list, wordlist):
    """
    Extract the contents of the article URLs collected with BS4.
    Return type: list of Information objects
    words  : list of [category, keyword, count] entries
    result : collected results
    """
    words = []
    result = []
    for word in wordlist:
        words.append([word[0], word[1], 0])
    count = 0
    for index, url in enumerate(list):
        if url.count("sid1=105") > 0:
            high_tag = 'IT'
        elif url.count("sid1=101") > 0:
            high_tag = '경제'
        else:
            continue
        news_url = url
        print(news_url)
        response = Request(news_url)
        html_content = urlopen(response).read()
        navigator = bs4.BeautifulSoup(html_content, 'html5lib')
        contents = navigator.find("div", id="main_content")

        # Extract the article publication date
        datetext = ""
        date = navigator.find("span", {"class": "t11"})
        if date is not None:
            datetext = self.getDateInNews(date.get_text()).strip().replace("\"\r\n\t", '')

        # Extract the article title
        header = contents.h3.get_text().strip().replace("\"\r\n\t", '')

        # Extract the article body
        text = ""
        content = contents.find("div", id="articleBodyContents")
        if content.find("table") is None:
            text = content.get_text()
        else:
            # Skip bot-written articles (table-based bodies)
            continue
        # else:
        #     tables = content.find_all("table")
        #     for table in tables:
        #         tbodies = table.find_all("tbody")
        #         for tbody in tbodies:
        #             trs = tbody.find_all("tr")
        #             for tr in trs:
        #                 tds = tr.find_all("td")
        #                 tds = [ele.text for ele in tds]
        #                 tds = [ele for ele in tds if ele]
        #                 for td in tds:
        #                     text += td
        print(text)
        text = text.strip().replace("\"\r\n\t", '')
        total = header.upper() + " " + text.upper()

        # Match keywords against the article content and count occurrences (tags)
        trigger = False
        tags = "["
        for word in words:
            if word[0] == high_tag:
                word[2] = total.count("" + word[1].upper())
                if word[2] != 0:
                    tags += "{\"" + word[1] + "\":" + str(word[2]) + "},"
                    trigger = True
        if trigger is True:
            tags = tags[:-1]
            count += 1
        else:
            continue
        tags += "]"
        if high_tag == '사회':
            high_tag = '경제'

        # Extract the og meta tags used to present the article
        og_title = navigator.find("meta", property="og:title")
        og_type = navigator.find("meta", property="og:type")
        og_url = navigator.find("meta", property="og:url")
        og_image = navigator.find("meta", property="og:image")
        og_description = navigator.find("meta", property="og:description")
        metas = str(og_title) + str(og_type) + str(og_url) + str(og_image) + str(og_description)

        # Populate the Information object
        info = Information()
        info.url(news_url.replace('&', '%26'))
        info.title(header)
        info.content(text)
        info.pDate(datetext)
        info.high(high_tag)
        info.low(tags)
        info.meta(metas.replace('&', '%26'))
        result.append(info)
        print('[%d items] ' % (count) + str(info) + ' Original')
    return result
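# The Information class is defined elsewhere; a minimal sketch matching the
# setter-style calls used above (the method names come from the code, the internal
# storage is an assumption):
class Information:
    def __init__(self):
        self.data = {}

    def url(self, value): self.data['url'] = value
    def title(self, value): self.data['title'] = value
    def content(self, value): self.data['content'] = value
    def pDate(self, value): self.data['pDate'] = value
    def high(self, value): self.data['high'] = value
    def low(self, value): self.data['low'] = value
    def meta(self, value): self.data['meta'] = value

    def __str__(self):
        return self.data.get('title', '')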
page = requests.get('https://www.9news.com.au/coronavirus/1')
soup = BeautifulSoup(page.content, 'html.parser')
feed = soup.find('div', {'class': 'feed__stories'})
story = feed.find_all('div', {'class': 'story__wrapper'})
links = [link.a.attrs['href'] for link in story]

contentPage = requests.get(links[0])
contentSoup = BeautifulSoup(contentPage.content, 'html.parser')
date = contentSoup.find('time', {'class': 'text--byline'})
date = date.get_text()
splt = date.split(" ")
splt = splt[1] + ' ' + re.sub(",", '', splt[2]) + ' ' + splt[3]
splt


def treat_date(date):
    splt = date.split(" ")
    splt = splt[1] + ' ' + re.sub(",", '', splt[2]) + ' ' + splt[3]
    return splt


text = contentSoup.find('div', {'class': 'article__body-croppable'})
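# Usage sketch: pull the paragraph text out of the article body found above and pair
# it with the cleaned-up date. Extracting <p> elements is an assumption about the
# 9news markup, not taken from the original code.
paragraphs = [p.get_text() for p in text.find_all('p')] if text else []
article_text = ' '.join(paragraphs)
article_date = treat_date(date)
print(article_date, article_text[:200])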