Example #1
def collectData():
    data = []
    #Cycle through every page on the website
    for i in range(1, 4):
        print(i)
        sauce = urllib.request.urlopen(base_url + str(i)).read()
        soup = bs.BeautifulSoup(sauce, 'html.parser', parse_only=only_tr_tags)

        articles = soup.find_all('tr', {'style': ''})
        for article in articles:
            dataPart = {}

            link_title = article.find('a', {'class': 'press-link'})
            if link_title is not None:
                #Find all the titles of the articles
                dataPart['title'] = link_title.get_text()
                #Find all the links of the articles
                dataPart['link'] = link_title.get('href')

            #Find all the dates of the articles
            date = article.find('td', {'style': ''})
            if date is not None:
                dataPart['date'] = date.get_text()

            #Add the dataPart to the data
            if dataPart.get('title'):  # skip rows without a title (avoids a KeyError on partial rows)
                data.append(dataPart)

    return data
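The snippet assumes a module-level base_url and an only_tr_tags SoupStrainer that are not shown; a minimal sketch of that setup, with a purely hypothetical URL:

import urllib.request
import bs4 as bs

# Hypothetical listing URL; the real value is not part of the example.
base_url = 'https://example.com/press?page='
# Parse only <tr> elements, matching the parse_only argument used above.
only_tr_tags = bs.SoupStrainer('tr')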
Example #2
def checkDate(cur_url):
    # Check whether the page currently being crawled is today's news
    html_content = gethtml(cur_url)
    soup = BeautifulSoup(html_content, 'html.parser')
    date = soup.find('span')
    # Guard against pages without a date <span> to avoid an AttributeError
    if date is None:
        return False
    return date.get_text() == todayDate()
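checkDate relies on two helpers the snippet never defines, gethtml and todayDate; a plausible sketch, assuming requests and a page date formatted as YYYY-MM-DD:

import requests
from datetime import date

def gethtml(url):
    # Fetch raw HTML for a page; the original helper is not shown.
    return requests.get(url, timeout=10).text

def todayDate():
    # Assumed format; it must match the text of the date <span> on the target site.
    return date.today().strftime('%Y-%m-%d')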
Example #3
def get_fin_stat_links():
    """
    Get urls of new financial statements news from
    Source: https://zse.hr/default.aspx?id=36774&ticker1=&Page=1
    """
    url = 'https://zse.hr/default.aspx?id=36774&ticker1=&Page=1'
    res = get_request(url)
    bs = BeautifulSoup(res.text, features="lxml")
    link = [a['href'] for a in bs.find_all("a", href=True)]
    link = ["http://www.zse.hr/" + l for l in link if "UserDocsImages/financ" in l]
    dtime = [date.get_text().strip() for date in bs.select('.vijestRowDatumDat')]
    dtime = pd.to_datetime(dtime)
    
    return dtime, link
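get_request, BeautifulSoup, and pd are not defined in this excerpt; a minimal sketch of the assumed imports and helper:

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_request(url):
    # Thin wrapper around requests.get; the original helper (headers, retries) is not shown.
    return requests.get(url, timeout=10)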
Example #4
    def read_posts_bitcointalk(self, response):
        url_post_string = [
            'topic',
        ]

        if any(substring in response.url for substring in url_post_string):
            self.pages_crawled += 1
            self.check_max_pages()

            soup = BeautifulSoup(response.body, "html.parser")
            texts_raw = soup.find_all('div', class_="post")
            dates_raw = soup.find_all('div', class_="smalltext")

            dates = []
            for date in dates_raw:
                date = date.get_text()
                if any(substring in date
                       for substring in date_word_list) and len(date) < 30:
                    date = convert_date_to_unix_time(date)
                    dates.append(date)

            texts = []
            for text in texts_raw:
                text = text.get_text().encode('utf-8')
                if not text.isdigit():
                    texts.append(text)

            filename_date = "temp_date_output.txt"
            filename_text = "temp_text_output.txt"

            # pickle requires a binary file handle; opening with "wb" also replaces any
            # previous dump, so separate os.remove() calls are unnecessary.
            with open(filename_date, "wb") as f1:
                pickle.dump(dates, f1)

            with open(filename_text, "wb") as f2:
                pickle.dump(texts, f2)

        url_board_string = ["board=5", "board=7", "board=8"]
        if any(substring in response.url for substring in url_board_string):
            self.parse(response)
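The crawler persists its intermediate results with pickle; a minimal sketch of how a downstream step could read the dumps back (file names taken from the snippet):

import pickle

# Load the temporary dumps written by read_posts_bitcointalk.
with open("temp_date_output.txt", "rb") as f1:
    dates = pickle.load(f1)
with open("temp_text_output.txt", "rb") as f2:
    texts = pickle.load(f2)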
Example #5
    def news_sentiments(self):
        # Returns news articles curated via Finviz, Yahoo, and Google News.
        # TODO: also get unusual option activity.
        BASE_URL = f'https://finviz.com/quote.ashx?t={self.ticker}'
        soup = self._get_soup(BASE_URL)

        table = soup.find('table', {'class': 'fullview-news-outer'})
        rows = table.find_all('tr')
        df_data = []
        for row in rows:
            date = row.find('td', {'align': 'right'})
            article = row.find('td', {'align': 'left'})
            link = article.find('a')['href']
            df_data.append((date.get_text(), article.get_text(), link))
        df = pd.DataFrame(df_data, columns=['Time', 'Headline', 'Link'])


        BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/news?p={self.ticker}'
        soup = self._get_soup(BASE_URL)

        links = soup.find_all('a', {'class': 'js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'})
        news = [(link.get_text(), str('yahoo.com' + link['href'])) for link in links]

        BASE_URL = f'https://finance.yahoo.com/quote/{self.ticker}/press-releases?p={self.ticker}'
        soup = self._get_soup(BASE_URL)

        links = soup.find_all('a', {'class': 'js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'})
        press_releases = [(link.get_text(), str('yahoo.com' + link['href'])) for link in links]
        # Look for keywords in the news: showcases, investor/analyst days, analyst revisions,
        # management transitions, product launches, significant stock buyback changes.

        # Get news from a Google News search.
        googlenews = GoogleNews(lang='en', period='14d')  # Specify the look-back period
        googlenews.get_news(f'${self.ticker} stock')
        stock_news = googlenews.results()

        # print([(i, j) for i, j in zip(googlenews.get_texts(), googlenews.get_links())])
        # To get other pages, call googlenews.get_page(2), etc.

        # Keep a whitelist of websites to pull articles from; maybe filter out noise with keywords.

        sectors = self.find_competition()
        sector_news = []
        if sectors:
            for sector in sectors:
                googlenews = GoogleNews(lang='en', period='14d')
                googlenews.get_news(f'{sector} sector stocks')
                sector_news.append(googlenews.results())
    
        return df, news, press_releases, sector_news, stock_news
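The in-code notes suggest scanning the collected headlines for catalysts (investor days, buybacks, product launches); a hedged sketch of such a filter over the (headline, link) tuples returned above, with a purely hypothetical keyword list:

# Hypothetical catalyst keywords; tune to taste.
CATALYST_KEYWORDS = ('buyback', 'analyst day', 'investor day',
                     'product launch', 'guidance', 'acquisition')

def filter_catalysts(items):
    # Keep only (headline, link) pairs whose headline mentions a catalyst keyword.
    return [(headline, link) for headline, link in items
            if any(kw in headline.lower() for kw in CATALYST_KEYWORDS)]

# Example: catalysts = filter_catalysts(news + press_releases)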
Example #6
def get_bouhgt_house(config,source):
    url = 'http://sh.lianjia.com/chengjiao/'
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text,'lxml')
    #print(soup)
    house_pages = soup.select('body > div.wrapper > div.main-box.clear > div > div.page-box.house-lst-page-box > a')
    #print(pages)
    for page in house_pages:
        if page.get_text().isdigit():
            pages = page.get_text()
        else:
            break
    url_base = 'http://sh.lianjia.com/chengjiao/'
    for page in range(1,int(pages)+1):
        print('present page is------------------',page,'------------------','\n')
        more_page = 'd'+str(page)
        url = url_base + more_page
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text,'lxml')
        house_name = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > h2 > a')
        prices_per_area = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div:nth-of-type(2) > div')
        bought_date = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div:nth-of-type(1) > div')
        prices = soup.select('body > div.wrapper > div.main-box.clear > div > div.list-wrap > ul > li > div.info-panel > div > div.col-2.fr > div > div.fr > div')
        for name,price_per_area,date,price in zip(house_name,prices_per_area,bought_date,prices):
            names = name.get_text()
            #print('names',names,'-----------------','\n')
            #print('prices',prices,'---------------','\n')
            name_layout_area = names.split(' ')
            name = name_layout_area[0].encode('UTF-8','ignore')
            layout = name_layout_area[1]
            area = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*',name_layout_area[2])
            #print(name,'---------------',layout,'---------------------',area,'-------------------','\n')
            price_per_area = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*',price_per_area.get_text())
            date = date.get_text()
            price = re.findall(r'(\w*[0-9]+\.*[0-9]+)\w*',price.get_text())
            #print(type(price_per_area),price_per_area)
            #print('house----------',name,layout,area,price_per_area,date,price)
            connection = pymysql.connect(**config)
            try:
                with connection.cursor() as cursor:
                    # Execute the SQL statement to insert one record
                    sql = 'INSERT INTO house_bought (name, price, area, layout, source, price_per_area, bought_date, import_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                    cursor.execute(sql, (name, price, area, layout, source, price_per_area[0], date, present_date))
                    # Autocommit is not enabled, so commit explicitly to persist the executed statement
                connection.commit()
            finally:
                connection.close()
    time.sleep(1)
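get_bouhgt_house expects a pymysql config dict, a source label, and a module-level present_date that the excerpt never defines; a sketch of the assumed setup, with placeholder credentials:

import datetime
import pymysql

# Placeholder connection settings; the real credentials are not part of the example.
config = {
    'host': 'localhost',
    'user': 'crawler',
    'password': 'secret',
    'db': 'housing',
    'charset': 'utf8mb4',
}
# Import date stamped on every inserted row.
present_date = datetime.date.today().strftime('%Y-%m-%d')

# get_bouhgt_house(config, source='lianjia')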
Example #7
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12
    }
    if date[0].lower() in monthDict:
        month = monthDict[date[0].lower()]
    else:
        month = date[0].lower()
    day = date[1]
    year = date[2]
    fullDate = date[0] + ' ' + date[1] + ' ' + date[2]
    return fullDate, day, month, year


date, day, month, year = treat_date(removeDC(date.get_text()))

text.get_text()

newsDict = {
    'content': [],
    'date': [],
    'source': [],
    'categoria': [],
    'day': [],
    'month': [],
    'year': [],
    'link': []
}
categorias = [
    'politics', 'technology', 'health', 'education', 'immigration',
Example #8
    driver.HideCommandPromptWindow = True
    print('URL:')
    print(URL)
    driver.get(URL)
    #wait for it to load
    sleep(5)
    keepLoading = True
    iterations = 0
    print('')
    print('*******' + Name + '*******')
    while keepLoading:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for days in soup.find_all('div', {'class' : 'trn-gamereport-list__group'}):
            date = days.find('h3', {'class' : 'trn-gamereport-list__title'})
            if date is not None:
                date = date.get_text().strip()
                if date == 'Matches Today':
                    # TODO: the site doesn't expose the year, so who knows what will happen beyond 2020
                    date = today.strftime("2020-%m-%d")
                else:
                    month = date[:3]
                    month = str(strptime(month, '%b').tm_mon).zfill(2)
                    day = date[-2:].zfill(2)
                    # TODO: the site doesn't expose the year, so who knows what will happen beyond 2020
                    date = '2020-' + month + '-' + day
                if date < startDate:
                    keepLoading = False
                print(date)
        if keepLoading:
            print('load')
            #scroll down to see the load more matches button
Example #9
    def getContent(self, list, wordlist):
        """
        Extract article content from the article URLs collected with BS4.
        Return type: list of Information objects
        words : list of tags
        result : result value
        """
        words = []
        result = []
        for word in wordlist:
            words.append([word[0], word[1], 0])
        # print(words)
        count = 0
        for index, url in enumerate(list):
            if url.count("sid1=105") > 0:
                high_tag = 'IT'
            elif url.count("sid1=101") > 0:
                high_tag = '경제'  # economy
            else:
                continue
            news_url = url
            print(news_url)
            response = Request(news_url)
            html_content = urlopen(response).read()
            navigator = bs4.BeautifulSoup(html_content, 'html5lib')
            contents = navigator.find("div", id="main_content")
            # Extract the article's publication date
            date = navigator.find("span", {"class": "t11"})
            if date is not None:
                datetext = self.getDateInNews(date.get_text()).strip().replace("\"\r\n\t", '')
                # Extract the article title
                header = contents.h3.get_text().strip().replace("\"\r\n\t", '')
                # Extract the article body

                text = ""
                content = contents.find("div", id="articleBodyContents")
                if content.find("table") is None:
                    text = content.get_text()
                else:
                    # Skip articles written by bots
                    continue
                # else:
                #     tables = content.find_all("table")
                #     for table in tables:
                #         tbodies = table.find_all("tbody")
                #         for tbody in tbodies:
                #             trs = tbody.find_all("tr")
                #             for tr in trs:
                #                 tds = tr.find_all("td")
                #                 tds = [ele.text for ele in tds]
                #                 tds = [ele for ele in tds if ele]
                #                 for td in tds:
                #                     text += td
                print(text)
                text = text.strip().replace("\"\r\n\t", '')
                total = header.upper() + " " + text.upper()
                # Match keywords against the article text and count occurrences (tags)
                trigger = False
                tags = "["
                for word in words:
                    if word[0] == high_tag:
                        word[2] = total.count("" + word[1].upper())
                        if word[2] != 0:
                            tags += "{\"" + word[1] + "\":" + str(word[2]) + "},"
                            trigger = True
                if trigger is True:
                    tags = tags[:-1]
                    count += 1
                else:
                    continue
                tags += "]"
                if high_tag == '사회':  # society
                    high_tag = '경제'
                # Extract og meta tags for rendering the article
                og_title = navigator.find("meta", property="og:title")
                og_type = navigator.find("meta", property="og:type")
                og_url = navigator.find("meta", property="og:url")
                og_image = navigator.find("meta", property="og:image")
                og_description = navigator.find("meta", property="og:description")
                metas = str(og_title) + str(og_type) + str(og_url) + str(og_image) + str(og_description)
                # Populate the result container
                info = Information()
                info.url(news_url.replace('&', '%26'))
                info.title(header)
                info.content(text)
                info.pDate(datetext)
                info.high(high_tag)
                info.low(tags)
                info.meta(metas.replace('&', '%26'))
                result.append(info)
                print('[%d articles] ' % count + str(info) + ' Original')
        return result
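The method fills an Information object through setter-style calls (url, title, content, pDate, high, low, meta); the class itself is not shown, so this is only a hypothetical container mirroring that interface:

class Information:
    # Hypothetical stand-in for the Information class used above.
    def __init__(self):
        self.data = {}

    def url(self, value): self.data['url'] = value
    def title(self, value): self.data['title'] = value
    def content(self, value): self.data['content'] = value
    def pDate(self, value): self.data['date'] = value
    def high(self, value): self.data['high_tag'] = value
    def low(self, value): self.data['tags'] = value
    def meta(self, value): self.data['meta'] = value

    def __str__(self):
        return self.data.get('title', '')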
Example #10
page = requests.get('https://www.9news.com.au/coronavirus/1')
soup = BeautifulSoup(page.content, 'html.parser')

feed = soup.find('div', {'class': 'feed__stories'})

story = feed.find_all('div', {'class': 'story__wrapper'})

links = [link.a.attrs['href'] for link in story]

contentPage = requests.get(links[0])
contentSoup = BeautifulSoup(contentPage.content, 'html.parser')

date = contentSoup.find('time', {'class': 'text--byline'})

date = date.get_text()

splt = date.split(" ")

splt = splt[1] + ' ' + re.sub(",", '', splt[2]) + ' ' + splt[3]

splt


def treat_date(date):
    splt = date.split(" ")
    splt = splt[1] + ' ' + re.sub(",", '', splt[2]) + ' ' + splt[3]
    return splt


text = contentSoup.find('div', {'class': 'article__body-croppable'})