def load_previous(data_base):
    # Reload the articles already saved in news.bank so they are not scraped
    # again. The key is read from line i+1 and the URL from line i+4 of each
    # record; records are 10 lines long, hence the step of 10.
    previous = []
    try:
        with open("./documents/web/news.bank", "r", encoding='utf8') as file:
            for line in file:
                previous.append(line)
        i = 0
        while i < len(previous):
            url = regex.get_data('>\s(.+?)\s<', previous[i + 4])[0]
            key = regex.get_data('>\s(.+?)\s<', previous[i + 1])[0]
            #date = regex.get_data('>\s(.+?)\s<', previous[i + 5])[0]
            data_base[key].append(url)
            #data_base[url][key] = date
            #data_base[url] = defaultdict(str)
            #data_base[id]['id'] = previous[i]
            #data_base[key]['key'] = previous[i]
            #data_base[url]['title'] = previous[i+1]
            #data_base[url]['source'] = previous[i+2]
            #data_base[url]['url'] = previous[i+3]
            #data_base[url]['date'] = previous[i+4]
            #data_base[url]['author'] = previous[i+5]
            #data_base[url]['content1'] = previous[i+6]
            #data_base[url]['content2'] = previous[i+7]
            i += 10
    except FileNotFoundError:
        pass
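# Assumed news.bank record layout (inferred from the "# line N" comments in the
# scrapers below, not stated explicitly in the original): each article is
# assumed to occupy 10 lines, covering ID, key, title, source, URL, date,
# author, and content fields plus a separator, which is why load_previous reads
# the key from previous[i + 1] and the URL from previous[i + 4], then steps i
# by 10.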
def update_key(data_base, url, kkey):
    # Add kkey to the article's stored key line unless it is already there.
    # Returns True when the entry was modified, False otherwise.
    keys_saved = regex.get_data('<key>\s(.+?)\s<', data_base[url]['key'])
    if kkey not in keys_saved:
        # Strip the trailing newline, append the new key tag, restore it.
        data_base[url]['key'] = data_base[url]['key'][:-1]
        data_base[url]['key'] += ' <key> ' + kkey + ' <\key>\n'
        return True
    return False
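# Illustrative key-line update (the key names are invented; the stored format
# is assumed to match what update_key itself appends):
#   before:  ' <key> tax_reform <\key>\n'
#   after update_key(data_base, url, 'gun_control') returns True:
#            ' <key> tax_reform <\key> <key> gun_control <\key>\n'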
def check_last_update(url, date):
    # Scan a newest-first list of URLs and return the index of the first
    # article whose /YYYY/MM/DD/ path segment is older than `date`
    # (an int in YYYYMMDD form). Returns -1 if every article is new.
    count = 0
    for u in url:
        d = regex.get_data('\S+\/(\d+\/\d+\/\d+)\S+', u)[0]
        d = int(re.sub(r'/', '', d))
        if d < date:
            return count
        count += 1
    return -1
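# Example (illustrative URL and cutoffs, not values taken from the scrapers):
#   urls = ['https://www.washingtonpost.com/opinions/2018/02/17/example-slug/']
#   check_last_update(urls, 20180301)   # -> 0, the first URL is older than the cutoff
#   check_last_update(urls, 20180101)   # -> -1, no URL is older than the cutoff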
def washington_post(data_base, data_print, key, date):
    # Scrape Washington Post opinion pieces matching `key`, skipping articles
    # older than `date` (an int in YYYYMMDD form; 0 disables the cutoff).
    hp.login("*****@*****.**", "qazxdr12")
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "washington_post." + kkey + " -----")
    print("Start loading Urls...")
    #case for exact keyword search
    url1 = 'https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query="'
    url2 = '"&sort=&startat='
    url3 = '&callback=angular.callbacks._0'
    baseurl = url1 + kkkey + url2 + '0' + url3
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Washington Post website is not correct, please check the code!")
        return -1
    article_number = regex.get_data('"total"\S(\S+?),"documents', page)[0]
    if int(article_number) == 0:
        #fall back to a non-exact keyword search
        url1 = 'https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query='
        url2 = '&sort=&startat='
        url3 = '&callback=angular.callbacks._0'
        baseurl = url1 + kkkey + url2 + '0' + url3
        try:
            page = hp.getHtml(baseurl)
        except urllib.error.URLError:
            print("Washington Post website is not correct, please check the code!")
            return -1
        article_number = regex.get_data('"total"\S(\S+?),"documents', page)[0]
        if int(article_number) == 0:
            print("No Washington Post article was found by this key word")
            return -1
    #get all urls
    count = 0
    index = 0
    urls = []
    page_total = int(int(article_number) / 20 + 1)
    while count < page_total:
        currenturl = url1 + key + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_data('"contenturl"\S"(https:\/\/www.washingtonpost.com\/opinions/\S+?)"\S"', page)
        if date != 0:
            a_num = check.check_last_update(url, date)
            if a_num != -1:
                # Keep only the articles newer than the cutoff and stop paging.
                urls += url[:a_num]
                break
        urls += url
        index += 20
        count += 1
    print(str(len(urls)) + " Urls loaded...")
    print("There are " + str(len(data_base) + len(data_print)) + " loaded files...")
    print("Now starting updating...")
    count = 0
    #count2 = 0
    for url in urls:
        if url in data_base and kkey in data_base[url]:
            #if check.update_key(data_base, url, kkey):
            #    count2 += 1
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('"headline":"(.*?)",', html)
        #<meta content="Julian Zelizer, CNN Political Analyst" name="author">
        author = regex.get_data('this.props.author="(.*?)";', html)
        #<meta content="2018-02-17T00:19:47Z" name="pubdate">
        # Use a separate name so the `date` cutoff parameter is not clobbered.
        pub_date = regex.get_data('"datePublished":"(\S+?)T', html)
        text2 = regex.get_data('<article.*?>(.*?)<\/p>\s<\/article>', html)
        if text2 != []:
            text = regex.get_data('<p.*?>(.*?)<\/p>', text2[0])
        else:
            text = regex.get_data('<p.*?>(.*?)<\/p>', html)
        if text == [] or title == []:
            continue
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base) + len(data_print) - 1)
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 2
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 3
        data_print[url]['source'] = fmt.formatted_source("Washington Post")
        # line 4
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 5
        data_print[url]['date'] = fmt.formatted_date(pub_date)
        # line 6
        data_print[url]['author'] = fmt.formatted_author(author, ';')
        # line 7
        data_print[url]['content1'] = fmt.formatted_content_with_symbol(text)
        # line 8
        data_print[url]['content2'] = fmt.formatted_content(text)
        count += 1
    print("Updated " + str(count) + " articles...")
    #if count2 > 0:
    #    print("Updated " + str(count2) + " keys...")
    print("There are " + str(len(data_base) + len(data_print)) + " articles...")
def cbs(data_base, data_print, key, date_, previous_len):
    # Scrape CBS News opinion search results for `key`, stopping once an
    # article older than `date_` (an int in YYYYMMDD form) is reached.
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "cbs." + kkey + " -----")
    print("Start loading Urls...")
    kkkey = re.sub(r'/', '%2F', kkkey)
    #kkkey = re.sub(r'+', '%2B', kkkey)
    kkkey = re.sub(r'%', '%25', kkkey)
    #case for exact keyword search
    url1 = 'https://www.cbsnews.com/search/?q='
    url2 = '&o=1&p='
    url3 = '&t=opinion'
    baseurl = url1 + kkkey + url2 + '1' + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CBS website is not correct, please check the code!")
        return -1
    try:
        article_number = regex.get_text('<h1\sclass="result-title">(\d+)\s', page)[0]
    except IndexError:
        article_number = '0'
    if int(article_number) == 0:
        print("No CBS article was found by this key word")
        return -1
    #get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")
    while count < page_total:
        currenturl = url1 + key + url2 + str(page_num) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('<a\shref="(\S+?)"><h3\sclass="title"', page)
        date = regex.get_text('<span\sclass="date">(\S+?\s\d+,\s\d+?)\s\S+\s\S+?\s\S+<\/span>', page)
        # Convert the dates to YYYYMMDD ints and find where the cutoff falls.
        cnt = -1  # guard: no dates parsed on this page
        for cnt in range(0, len(date)):
            date[cnt] = fmt.convert_date(date[cnt])
            if date_ > date[cnt]:
                reach_updated = True
                break
        for i in range(0, cnt + 1):
            try:
                urls['https://www.cbsnews.com' + url[i]] = str(date[i])[0:4] + '-' + str(date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break
        if reach_updated:
            break
        index += 10
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^<]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('"author":{".type":"[^"]+?","name":"([^"]+?)"}', html)
        text1 = []
        text1.append(regex.get_data('<div\sdata-page=[^>]+?><[^>]*?>\n?([^\n]+?)<.?p>', html))
        text2 = regex.get_text('<p>([^\n]+?)<\/p>', html)
        text = text1 + text2
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CBS")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        if len(author) != 0:
            # Keep everything before the final comma of the byline (the rest
            # is usually an affiliation).
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
            elif len(aa) == 0:
                author = 'Noun Noun'
        else:
            author = 'Noun Noun'
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def cnn(data_base, data_print, key, date_, previous_len):
    # Scrape CNN opinion pieces matching `key` via the search API, stopping
    # once an article older than `date_` (an int in YYYYMMDD form) is reached.
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "cnn." + kkey + " -----")
    #case for exact keyword search
    url1 = 'https://search.api.cnn.io/content?size=10&q=%22'
    url2 = '%22&category=opinion'
    baseurl = url1 + key + url2
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CNN website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('"meta":{\S+?"of":(\d+?),"maxScore', page)[0]
    if int(article_number) == 0:
        print("No CNN article was found by this key word")
        return -1
    #get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading URLs...")
    while count < page_total:
        currenturl = url1 + key + url2 + '&from=' + str(index) + '&page=' + str(page_num)
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('"url":"([^,]+?.html)"\S"', page)
        #title = regex.get_text('"headline":"([^{]*?)"',page)
        #author = regex.get_text('"byLine":(.*?),',page)
        for i in range(0, len(url)):
            try:
                d = regex.get_data('\/(\d+?\/\d+?\/\d+?)\/', url[i])
            except IndexError:
                break
            # The /YYYY/MM/DD/ path segment doubles as the publication date.
            d_int = int(re.sub(r'/', '', d))
            if date_ > d_int:
                reach_updated = True
                break
            urls[url[i]] = re.sub(r'/', '-', d)
        if reach_updated:
            break
        index += 10
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^<]+?)\s-\s\w+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('<meta\scontent\S"([^"]+?)"\sname="author">', html)
        text2 = []
        text2.append(regex.get_data('<cite\sclass="el-editorial-source">\s\S\S\S\S\S</cite>([^=]*?)<\/p><\/div>', html))
        text1 = regex.get_text('<div\sclass="zn-body__paragraph\s*?\w*?">([^=]+?)</div>?', html)
        text = text2 + text1
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CNN")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        if len(author) > 5:
            # Strip a leading "By " and the trailing segment of the byline.
            if author[0:3] == "By ":
                author = author[3:]
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
        else:
            author = 'Noun Noun'
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def politico(data_base, data_print, key, date_, previous_len):
    # Scrape Politico search results for `key`, stopping once an article
    # older than `date_` (an int in YYYYMMDD form) is reached.
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "politico." + kkey + " -----")
    # https://www.politico.com/search/2?s=newest&q=tax%20reform&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000
    url1 = 'https://www.politico.com/search/'
    url2 = '?s=newest&q="'
    url3 = '"&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000'
    baseurl = url1 + '1' + url2 + key + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Politico website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('<h1>Results[^<]+?<\/h1>[^<]+?<p>(\d+?)\sSearch\sResults<\/p>', page)[0]
    if int(article_number) == 0:
        print("No Politico article was found by this key word")
        return -1
    #get all urls
    count = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 20 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading URLs...")
    while count < page_total:
        currenturl = url1 + str(page_num) + url2 + key + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('<a\shref="([^"]+?)"\s[^<]+?<\/a><\/h3>', page)
        date = regex.get_text('<time datetime=.(\d+?\-\d+?\-\d+?)T\S+.>[^<]+?<\/time><\/p>', page)
        #title = regex.get_data('"title":"([^{]*?)",',page)
        # Convert the dates to YYYYMMDD ints and find where the cutoff falls.
        cnt = -1  # guard: no dates parsed on this page
        for cnt in range(0, len(date)):
            date[cnt] = int(re.sub(r'-', '', date[cnt]))
            if date_ > date[cnt]:
                reach_updated = True
                break
        for i in range(0, cnt + 1):
            try:
                urls[url[i]] = str(date[i])[0:4] + '-' + str(date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break
        if reach_updated:
            break
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^-]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('<div\sitemprop="author"[^>]+?>[^<]+?<meta\s[^<]+?\s[^"]+?="([^>]+?)"\/>', html)
        text = regex.get_text('<p>([^\n]*?)</p>', html)
        if text != []:
            # Drop the last <p>, which is typically not article body text.
            text = text[:-1]
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Politico")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def foxnews(data_base, data_print, key, date_, previous_len):
    # Scrape Fox News opinion pieces matching `key` via the content API,
    # stopping once an article older than `date_` (YYYYMMDD int) is reached.
    kkey = fmt.file_name(key, '_')
    print("----- " + "foxnews." + kkey + " -----")
    print("Start loading Urls...")
    #case for exact keyword search
    url1 = 'http://api.foxnews.com/v1/content/search?q="'
    url2 = '"&fields=date,description,title,url,image,type,taxonomy&sort=latest&section.path=fnc/opinion&type=article&start='
    url3 = '&callback=angular.callbacks._0'
    baseurl = url1 + key + url2 + '0' + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Foxnews website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('"response"\S\S"numFound":(\S+),"docs":\S', page)[0]
    if int(article_number) == 0:
        print("No Foxnews article was found by this key word")
        return -1
    #get all urls
    count = 0
    index = 0
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")
    while count < page_total:
        currenturl = url1 + key + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('url":\S"(\S+?)"\S', page)
        #title = regex.get_data('"title":"([^{]*?)",',page)
        for i in range(0, len(url)):
            try:
                d = regex.get_data('\/(\d+\/\d+\/\d+)', url[i])
            except IndexError:
                break
            # The /YYYY/MM/DD path segment doubles as the publication date.
            d_int = int(re.sub(r'/', '', d))
            if date_ > d_int:
                reach_updated = True
                break
            urls[url[i]] = re.sub(r'/', '-', d)
        if reach_updated:
            break
        index += 10
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<meta\sname="dc.title"\scontent="([^=]+?)">', html)
        author = regex.get_data('<meta\sname="dc.creator"\scontent="([^"]+?)">', html)
        text = regex.get_text('<p[^>]*?>([^\n]*?)</p>[^<]*?<[^/]', html)
        if text != []:
            # Drop the last <p>, which is typically not article body text.
            text = text[:-1]
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Foxnews")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
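# A minimal driver sketch, not part of the original module, showing how the
# scrapers above might be run together. Assumptions (inferred from the code
# above, not stated in the source): `from collections import defaultdict` is
# already imported at the top of this file, data_base maps a formatted key to
# the list of URLs already saved in news.bank (as load_previous builds it),
# previous_len is the number of articles already saved, and the date cutoff is
# an int in YYYYMMDD form. washington_post() is omitted because it uses an
# older signature and database layout.
def run_all_scrapers(key, cutoff):
    data_base = defaultdict(list)   # formatted key -> URLs already saved
    data_print = {}                 # URL -> formatted fields of new articles
    load_previous(data_base)
    previous_len = sum(len(v) for v in data_base.values())
    for scraper in (cbs, cnn, politico, foxnews):
        scraper(data_base, data_print, key, cutoff, previous_len)
    return data_base, data_print

# Example call (illustrative values): run_all_scrapers('tax reform', 20180101)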