def test_news_perm():
    n1 = News("asdf", 0)
    n2 = News("asdf_23", 0)
    n3 = News("asdf_", 1)
    n4 = News("asdf_23", 1)
    nv1 = NewsVector()
    nv1.add(n1)
    nv1.add(n2)
    nv1.label = n1.label
    nv2 = NewsVector()
    nv2.add(n3)
    nv2.add(n4)
    nv2.label = n3.label
    news_vecs = [[nv1, nv1], [nv2, nv2]] * 10
    num_agents = 2
    X, y, union = helpers.get_feature_vectors(news_vecs, num_agents)
    classifier, y_pred, y_true = ml.train_and_test(X, y, verbose=True)
    test_accuracy = (y_pred == y_true).sum() / sum(map(len, y_pred))
    print(f"Test acc: {test_accuracy}")
    test_stat = correctly_classified
    p_value = permutation_test.blocked_sampled_test(y_pred, y_true, test_stat)
    return p_value

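# The `correctly_classified` statistic passed to
# permutation_test.blocked_sampled_test above is referenced but not defined in
# this file. A minimal sketch, assuming it simply counts correct predictions
# across per-block prediction arrays (the project's real implementation may
# differ in name and signature):
import numpy as np


def correctly_classified(y_pred, y_true):
    """Hypothetical test statistic: total number of correct predictions."""
    return sum(int(np.sum(np.asarray(p) == np.asarray(t)))
               for p, t in zip(y_pred, y_true))
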
def fetch_penpai_news():
    news_list = []  # collected news items
    # Extract news data from the front page
    index_resp = r.get(penpai_url).text
    index_html = etree.HTML(index_resp)
    news_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a')  # news links
    imgs_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a/img')  # news images
    overviews = index_html.xpath('//div[@class="news_li"]/p')  # news summaries
    times = index_html.xpath('//div[@class="pdtt_trbs"]/span[1]')  # publish times
    origins = index_html.xpath('//div[@class="pdtt_trbs"]/a')  # news sources
    for i in range(0, int(len(news_urls) / 2)):
        news_list.append(
            News(_id=news_urls[i].get('href').split('_')[-1],
                 title=imgs_urls[i].get('alt'),
                 overview=overviews[i].text.replace('\n', '').replace(' ', ''),
                 url=penpai_url + news_urls[i].get('href'),
                 image='http:' + imgs_urls[i].get('src'),
                 publish_time=times[i].text,
                 origin=origins[i].text).to_dict())
    # Extract topCids with a regex
    topCids = ''
    ids = cids_pattern.search(index_resp)
    if ids is not None:
        topCids = ids.group(1)
    # Ajax request parameters
    ajax_params = {
        'nodeids': 25949,
        'topCids': topCids,  # pinned-article ids extracted from the front page
    }
    pageidx = 2
    while True:
        ajax_params['pageidx'] = pageidx
        ajax_params['lastTime'] = int(round(time.time() * 1000))
        resp = r.get(penpai_ajax_url, params=ajax_params, headers=penpai_headers)
        resp_content = resp.text
        print("Crawling:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:  # "... hours ago"
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    # Stop once articles are more than 12 hours old
                    if int(hours_before.group(1)) > 12:
                        return news_list
            else:
                news_list.append(
                    News(_id=result[0].split('_')[-1],
                         title=result[2],
                         overview=result[3].replace('\n', '').replace(' ', ''),
                         url=penpai_url + result[0],
                         image='http:' + result[1],
                         publish_time=result[5],
                         origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))

def get_domain_news(self, domain, now_time):
    '''
    domain: str, the domain of the source (e.g. 'bbc.com')
    now_time: Datetime, the current datetime in ISO form
    return: domain_news_result: list<News>
        The news from the last query until now
    '''
    overall_count, first_page_content = self.get_domain_news_count(domain, now_time)
    domain_result = []
    domain_result.extend(first_page_content)
    page_number = math.ceil(overall_count / 20)  # total number of pages (20 results per page)
    if page_number >= 2:
        for page in range(2, min(page_number + 1, 5)):
            page_result = self.get_domain_news_at_page_n(domain, now_time, page)
            domain_result.extend(page_result)
    domain_news_result = []
    for r in domain_result:
        news_r = News(domain, r["title"], r["description"], r["publishedAt"],
                      r["urlToImage"], r["url"])
        domain_news_result.append(news_r)
    return domain_news_result

def dump_clusters():
    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])
    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging word vectors
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title'])
                    for article in articles]
    # Sentence vectorization by the "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''
    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))
    if args['-cluster'] == 'agg':
        prune = args['-prune'].lower() == 'true'
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune,
                                        int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO: dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)

def storyteller():
    form = ReusableForm(request.form)
    if request.method == "POST":
        if form.validate():
            email = request.form["email"]
            password = str(request.form["password"])
            login = firebase.login(email, password)
            if login == 0:
                news = News()
                news.title = request.form["title"]
                news.message = message_with_signature(request.form["message"], email)
                news.url = request.form["url"]
                news.date = time.strftime("%Y-%m-%d")
                news.is_private = False
                firebase.fcm(news, True)
                print(news.message)
                flash("Messaggio inviato con successo")  # "Message sent successfully"
            elif login == 1:
                flash("Errore: nome utente o password errata")  # "Error: wrong username or password"
            elif login == 2:
                flash("Errore: chiave API non definita")  # "Error: API key not defined"
            elif login == 3:
                flash("Errore: account non valido")  # "Error: invalid account"
        else:
            flash("Compila tutti i campi")  # "Fill in all fields"
    return render_template("storyteller.html", form=form)

def news_test(companies):
    t = News(
        companies,
        output_root=r'C:\Users\zleirdahl\Desktop\PythonScripts\iex\Data\News\\',
        header_fields=['Date', 'Headline', 'Source', 'URL', 'Summary'],
        file_suffix='news')
    t.run()

def crawler_PBOC():
    with open(news_list_indexes_file, "r") as fr:
        with open(out_file, "w") as fw:
            csvwriter = csv.writer(fw)
            csvwriter.writerow(["title", "href", "date", "content"])
            for index_url in fr.readlines():
                html = urlopen(index_url)
                bsObj = BeautifulSoup(html, "lxml")
                news_objs = bsObj.find("div", {"class": "mainw950"})\
                    .find("div", {"opentype": "page"}).find("td", {"colspan": "2"})\
                    .find("div", {"id": "r_con"}).find("div", {"class": "portlet"})\
                    .find("div", {"style": "height:480px"}).find("table")\
                    .find("td").findAll("table")
                for news_obj in news_objs:
                    try:
                        news = News()
                        news.date = news_obj.find("span", {"class": "hui12"}).text
                        news.href = url_domain_pboc + news_obj.find("a").attrs['href']
                        news.title = news_obj.find("a").text
                        news.content = get_content(news.href)
                        r = [news.title, news.href, news.date, news.content]
                        csvwriter.writerow(r)
                    except Exception:
                        print("except..")

def crawler_FRB():
    html = urlopen(url_frb_2016)
    bsObj = BeautifulSoup(html, "html.parser")
    events_list_obj = bsObj.find("div", {"class": "row eventlist"})\
        .find("div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    event_rows_obj = events_list_obj.findAll("div", {"class": "row"})
    # news_list = list()
    with open(base_dir + "csv_frb.csv", "a") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "type", "content"])
        for event_row_obj in event_rows_obj:
            try:
                news = News()
                date_obj = event_row_obj.find(
                    "div", {"class": "col-xs-3 col-md-2 eventlist__time"})
                news.date = date_obj.find("time").text
                event_obj = event_row_obj.find(
                    "div", {"class": "col-xs-9 col-md-10 eventlist__event"})
                news.href = url_domain_frb + event_obj.find("a").attrs['href']
                news.title = event_obj.find("p").find("a").find("em").text
                news.type = event_obj.find("p", {"class": "eventlist__press"})\
                    .find("em").find("strong").text
                news.content = get_content(news.href)
                r = [news.title, news.href, news.date, news.type, news.content]
                csvwriter.writerow(r)
                # news_list.append(news)
            except Exception:
                print("except..")

def fetch_news(category):
    news_list = []
    for i in range(0, 2):
        resp = r.get(data_base_url,
                     params={
                         "cre": "tianyi",
                         "mod": category,
                         "_": int(round(time.time() * 1000)),
                         "offset": 20 * i
                     },
                     headers=headers)
        print('Crawling:', resp.url)
        if resp is not None:
            resp_json = resp.json()
            data = resp_json['data']
            for d in data:
                news_list.append(
                    News(_id=d['uuid'],
                         title=d['title'],
                         overview=d['intro'],
                         image=d['thumb'],
                         publish_time=d['ctime'],
                         origin=d['author'],
                         url=d['url_https']).to_dict())
        time.sleep(random.randint(0, 2))
    return news_list

def fetch_gd_news():
    news_list = []
    xhs_headers['Host'] = xhs_gd_host
    resp = r.get(xhs_gd_url, headers=xhs_headers)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={'class': 'l-foot-par'}).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with the current time and keep only
            # news published within the last 12 hours (43200 s)
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time, "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list

def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url, params={'page': 2}, headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            if text_result:  # findall() returns a (possibly empty) list, never None
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list

def scrap_news_company(comp):
    num = comp.stock
    global count_fail, count_suc
    url = news_url + str(num).zfill(5)
    html = scrap_html(url)
    response_soup = BeautifulSoup(html, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    stamp_now = datetime.now().timestamp()
    if list_node:
        h1 = response_soup.find("h1", class_="tf")
        if h1:
            comp.name = h1.get("title")
        up = response_soup.find("div", class_="div002 up")
        if not up:
            up = response_soup.find("div", class_="div002 down")
        if up:
            spans = up.find_all("span")
            if spans:
                comp.up = spans[-1].text
        items = list_node.find_all("li")
        count_suc += 1
        hot_news = []
        comp.ishot = len(items) > 3
        for li in items:
            if not li.find("a"):
                continue
            if not li.find("div", class_="bar01"):
                continue
            txt = li.find("a").text
            link = li.find("a").get("href")
            date = li.find("div", class_="bar01").text
            date = date.split(":").pop()
            cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")
            stamp_new = cdate.timestamp()
            # Keep only news published within the last two days
            if stamp_now - stamp_new < 24 * 60 * 60 * 2:
                hot_news.append(News(txt, date, link))
        return hot_news
    else:
        print("error happened", num)
        count_fail += 1

def news(self):
    """Lazily construct and cache the News helper bound to this object."""
    if self._news is None:
        from news import News
        self._news = News(self)
    return self._news

def fetch_iheima_news():
    page = 1
    news_list = []
    while True:
        resp = r.get(iheima_url,
                     params={
                         'page': page,
                         'pagesize': 20
                     },
                     headers=iheima_headers)
        print("Crawling:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Stop once an article is older than 24 hours (86400 s)
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
        page += 1

def fetch_news(page):
    news_list = []
    resp = r.get(ajax_url,
                 params={
                     'm': 'lists',
                     'a': 'ajaxNews',
                     'cid': 4,
                     'page': page
                 },
                 headers=headers)
    print('Crawling:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        news_item = pq('div.item-news')
        for item in news_item.items():
            a_url = item('div > p > a').attr('href')
            item_main = title_extract_pattern.search(item('div.item-main').text())
            if item_main is not None:
                news_list.append(
                    News(_id=a_url.split('/')[-1].replace('.html', ''),
                         url=a_url,
                         title=item_main.group(1),
                         overview=item_main.group(2),
                         publish_time=item('div.item-date').text()).to_dict())
    return news_list

def fetch_web_news_more(start_id):
    global data_list
    headers['Referer'] = web_news_url
    resp = r.get(load_more_base_url,
                 params={
                     'type': 'web_latest_article',
                     'b_id': start_id,
                     'per_page': 30
                 },
                 headers=headers)
    print("Crawling:", resp.url)
    if resp is not None:
        resp_json = resp.json()
        items = resp_json['data']['items']
        for item in items:
            post = item['post']
            motifs = post['motifs']
            motifs_name = motifs[0]['name'] if motifs else ''
            data_list.append(
                News(_id=str(item['id']),
                     title=post['title'],
                     url=news_detail_base_url + str(post['id']),
                     image=post['cover'],
                     publish_time=post['published_at'],
                     overview=post['summary'],
                     origin=post['user']['name'] + '|' + motifs_name).to_dict())
        # Stop recursing once the oldest item on this page is more than
        # 24 hours (86400 s) old; otherwise fetch the next batch.
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(items[-1]['post']['published_at'],
                                  "%Y-%m-%d %H:%M:%S"))) > 86400:
            return None
        else:
            return fetch_web_news_more(items[-1]['id'])

def fetch_more_news(min_id):
    news_list = []
    sort_field = ''
    resp = r.get(load_more_url,
                 params={
                     '_render': '',
                     'min_id': min_id,
                     '_': count_time
                 },
                 headers=headers)
    print("Crawling:", resp.url)
    if resp is not None:
        data_result = more_data_extract_pattern.search(resp.text)
        if data_result is not None:
            data_json = data_result.group(1)
            data_dict = json.loads(data_json)
            for data in data_dict['data']['list']:
                news_list.append(
                    News(_id=data['id'],
                         title=data['title'],
                         overview=data['brief'],
                         image=data['thumb'],
                         publish_time=data['time'],
                         url=data['url'],
                         origin=data['columnName']).to_dict())
                sort_field = data['sort_field']
    return news_list, sort_field

def spider2(startDate, endDate):
    startDateArray = startDate.split("-")
    endDateArray = endDate.split("-")
    # Years to query
    years = list(set((startDateArray[0], endDateArray[0])))
    # Starting pages
    start_urls = [
        "https://www.ids.ac.uk/news-and-opinion/news/?select-year%5B0%5D={}&hidden-current-page=1&hidden-sort-by=ndate&current-page=1#listing"
        .format(years[0])
    ]
    if len(years) > 1:
        start_urls.append(
            "https://www.ids.ac.uk/news-and-opinion/news/?select-year%5B0%5D={}&hidden-current-page=1&hidden-sort-by=ndate&current-page=1#listing"
            .format(years[1]))
    # Results
    results = []
    for start_url in start_urls:
        driver.get(start_url)
        time.sleep(0.5)
        # Set once a date falls outside the requested range
        dateFlag = False
        while True:
            html = etree.HTML(driver.page_source)
            dates = dateConver(
                html.xpath(
                    "//article[@class='c-content-item c-content-item--news c-listing__item']//p[@class='c-content-item__date ts-caption']/text()"
                ))
            links = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//a/@href"
            )
            titles = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//a/text()"
            )
            summarys = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//p[@class='c-content-item__description ts-body ts-body--small']/text()"
            )
            for i in range(len(links)):
                # Date within the requested range
                if checkDateRange(startDate, endDate, dates[i]):
                    news = News(dates[i], links[i], titles[i].strip(),
                                summarys[i].strip(), "")
                    print(news.date)
                    results.append(news)
                elif dates[i] < startDate:
                    dateFlag = True
                    break
            # Date out of range: stop paging
            if dateFlag:
                break
            # Next-page element
            try:
                next_page_btn = driver.find_element_by_xpath(
                    "//a[@title='Next page']")
            except:
                # Last page reached
                break
            driver.execute_script("arguments[0].click();", next_page_btn)
            time.sleep(5)
    return results

def welcome():
    if request.wants_json():
        links = navs
        links.append(build_link('/', 'self', 'application/json'))
        root = {'version': '0.1', 'title': 'VT Bash', 'links': links}
        return jsonify(root, 'application/json')
    news = News()
    return render_template('index.html', nav=navs, news=news.news)

def testNews():
    # Try to add a test url
    fetchWeb.test_parse_url()
    news = News()
    testurl = 'http://www.appledaily.com.tw/realtimenews/article/new/20150822/675760/'
    result = news.loadfromdb(testurl)
    return "Result: " + str(news)

def news_func(self):
    self.speak('Opening News.')
    from news import News
    self.news_win = News()
    self.news_win.show()
    self.speak(
        'Welcome to News.\nThese are the latest international headlines according to BBC News Network.'
    )

def _test_get_from_source(self, source, count):
    articles = News().get_from_source(source, count)
    self.assertEqual(count, len(articles), "Result length is correct")
    for article in articles:
        self.assertIsNotNone(article.url, "Article url is not None")
        self.assertIsNotNone(article.title, "Article title is not None")
        self.assertIsNotNone(article.snippet, "Article snippet is not None")

def send_news(message):
    news = News()
    news.find_supermain()
    msg = f"<b>Main News on <a href=\"https://www.zakon.kz/\">zakon.kz</a> for {today_modified}</b>\n\n"
    msg += f'<a href="{news.url}">{news.title}</a>\n\n'
    for i in range(4):
        news.find_main(i)
        msg += f"<a href=\"{news.url}\">{news.title}</a>\n\n"
    bot.send_message(message.chat.id, msg, parse_mode='HTML')

def _parse(self, file_path: str) -> News:
    with open(file_path) as f:
        lines = [line.replace(' ', '').strip() for line in f if line is not None]
    url = lines[0]
    date_time = lines[1]
    title = lines[2]
    content = ''.join(lines[3:])
    label = file_path.split('/')[1]
    return News(url=url, date_time=date_time, title=title, content=content,
                label=label)

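# A short usage sketch for _parse, assuming the directory layout implied by
# file_path.split('/')[1] (the label is the first path component under the
# working directory) and a hypothetical file name and owner class:
#
#   fake/20211005_article.txt
#       line 1: URL
#       line 2: date/time
#       line 3: title
#       remaining lines: article body
#
#   parser = NewsParser()                              # assumed owner of _parse
#   news = parser._parse("fake/20211005_article.txt")
#   news.label                                         # -> "fake"
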
def test_to_dict():
    sample = News("title", "description", "published", "url", "full_text")
    assert sample.to_dict() == {
        "title": "title",
        "description": "description",
        "url": "url",
        "published": "published",
        "full_text": "full_text",
    }

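# For context, a minimal sketch of a News value object that would satisfy
# test_to_dict above, assuming a plain data class; the project's real News
# class may carry extra fields or behaviour.
from dataclasses import asdict, dataclass


@dataclass
class News:
    title: str
    description: str
    published: str
    url: str
    full_text: str

    def to_dict(self) -> dict:
        # Key order is irrelevant to the equality check in the test.
        return asdict(self)
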
def news():
    response = request.json['details']
    news_objects = []
    for news_piece in response:
        news_objects.append(News(news_piece))
    audio_file = text_to_speech(request.json['message'])
    return render_template('news.html',
                           title='News',
                           news=news_objects,
                           audio_file=audio_file)

def anylisor(resp):
    timecounter.updateprogress()
    if not resp:
        return
    response_soup = BeautifulSoup(resp, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    if list_node:
        stamp_now = datetime.now().timestamp()
        comp = Company(stock="")
        h1 = response_soup.find("h1", class_="tf")
        if h1:
            comp.name = h1.get("title")
            components = h1.text.split(".")
            if len(components) > 0:
                comp.stock = components[0]
            else:
                return None
        up = response_soup.find("div", class_="div002 up")
        if not up:
            up = response_soup.find("div", class_="div002 down")
        if up:
            spans = up.find_all("span")
            if spans:
                comp.up = spans[-1].text
        items = list_node.find_all("li")
        hot_news = []
        comp.ishot = len(items) > 3
        for li in items:
            if not li.find("a"):
                continue
            if not li.find("div", class_="bar01"):
                continue
            txt = li.find("a").text
            link = li.find("a").get("href")
            date = li.find("div", class_="bar01").text
            date = date.split(":").pop()
            cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")
            stamp_new = cdate.timestamp()
            # Keep only news published within the last two days
            if stamp_now - stamp_new < 24 * 60 * 60 * 2:
                hot_news.append(News(txt, date, link))
        comp.news = hot_news
        return comp

def prep_news_data(self):
    if not self.news_market_data:
        print('Preparing news and stock data...\n')
        news = News('Resources/articles.db')
        raw = news.db_articles()
        train_raw, test_raw = divide_list_by_ratio(raw)
        # prep_news_articles returns a tuple of (vectors, labels)
        self.train_vecs, self.train_labs = self.prep_news_articles(train_raw, fit=True)
        self.test_vecs, self.test_labs = self.prep_news_articles(test_raw)
        self.news_market_data = True
        self.movie_review_data = False

def parse_user(self):
    with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
        for lin in lines:
            lin = lin.strip().split()
            userid, newsid, scan_time, title, create_time = (
                int(lin[0]), int(lin[1]), lin[2], lin[3], lin[-1])
            news = News(userid, newsid, title, scan_time, [], create_time)
            self.AllNews.append(news)

def parse_single_url(url):
    content = urllib2.urlopen(url).read()
    if "該則即時新聞不存在" in content:  # "this real-time news item does not exist"
        return False
    else:
        soup = BeautifulSoup(
            content,
            from_encoding='utf-8',
        )
        title = str(soup.find("h1", {"id": "h1"}).string)
        contents = soup.find("p", {"id": "summary"})
        # Strip embedded iframes from the summary before rendering it
        while "</iframe>" in contents.renderContents():
            if contents.iframe.decompose() is None:
                break
        desc_contents = contents.renderContents()
        popularity_data = soup.find("a", attrs={"class": "function_icon clicked"})
        if popularity_data is None:
            popularity = 0
        else:
            popularity = parse_string_to_popularity(popularity_data.string)
        news_datetime = parse_string_to_datetime(soup.find("time").string)
        news_url = soup.find("meta", {"property": "og:url"})['content']
        news_source = soup.find("meta", {"property": "og:site_name"})['content']
        img_url1 = soup.find("a", attrs={"class": "t1"})
        img_url2 = soup.find("figure", attrs={"class": "lbimg sgimg sglft"})
        if img_url1 is not None:
            img_url = img_url1.img['src']
        elif img_url2 is not None:
            img_url = img_url2.a.img['src']
        else:
            img_url = ""
        logging.debug("news_url: " + str(news_url))
        logging.debug("title: " + str(title))
        logging.debug("content: " + str(desc_contents))
        logging.debug("popularity: " + str(popularity))
        logging.debug("news_datetime: " + str(news_datetime))
        logging.debug("news_first_image_url: " + str(img_url))
        logging.debug("news_source: " + str(news_source))
        news = News(news_url=news_url,
                    title=title,
                    content=desc_contents,
                    popularity=popularity,
                    news_datetime=news_datetime,
                    news_first_image_url=img_url,
                    news_source=news_source)
        logging.info("Add news: " + str(news))
        news.writetodb()
        return True