def web_scraper(input):
    key_words = input.split(',')
    keys = []
    #dates = []
    time = str(datetime.datetime.now().date())
    for key_word in key_words:
        try:
            while (key_word[0] == ' '):
                key_word = key_word[1:]
            while (key_word[len(key_word) - 1] == ' '):
                key_word = key_word[:-1]
        except IndexError:
            continue
        if (len(key_word) == 0):
            continue
        inputs = key_word.split(' ')
        key = ''
        for i in inputs:
            key += i
            key += '%20'
        key = key[:-3].lower()
        key = re.sub(r'[ ]+', ' ', key)
        keys.append(key)
        #dates.append(check.check_date(data_base, kkeys[i]))
    for i in range(0, len(keys)):
        previous_len = 0
        try:
            for lines in open("./documents/web/Articles.bank", 'r', encoding='utf-8'):
                previous_len += 1
        except FileNotFoundError:
            pass
        stored_keys = check.load_keywords_info()
        data_base = check.load_url_info()
        data_print = {}
        kkey = fmt.file_name(keys[i], '_')
        for kk in stored_keys:
            if abs(len(kkey) - len(kk)) < 2 and check.MinEditDist(kkey, kk) == 1:
                kkey = kk
        if kkey in stored_keys:
            date = int(re.sub(r'-', '', stored_keys[kkey]))
        else:
            date = 0
        stored_keys[kkey] = time
        if kkey not in data_base:
            data_base[kkey] = []
        cnn.cnn(data_base, data_print, keys[i], date, previous_len)
        foxnews.foxnews(data_base, data_print, keys[i], date, previous_len)
        cbs.cbs(data_base, data_print, keys[i], date, previous_len)
        politico.politico(data_base, data_print, keys[i], date, previous_len)
        #washington_post.washington_post(data_base,data_print,key,date)
        if len(data_print) > 1:
            check.save_keywords_info(stored_keys)
            check.save_url_info(data_base)
            output_file.output(data_print, data_base, previous_len)
    print("Update date: " + time)
def washington_post(data_base, data_print, key, date):
    hp.login("*****@*****.**", "qazxdr12")
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "washington_post." + kkey + " -----")
    print("Start loading Urls...")
    # case for exact keyword search
    url1 = 'https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query="'
    url2 = '"&sort=&startat='
    url3 = '&callback=angular.callbacks._0'
    baseurl = url1 + kkkey + url2 + '0' + url3
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Washington Post website is not correct, please check the code!")
        return -1
    article_number = regex.get_data('"total"\S(\S+?),"documents', page)[0]
    if int(article_number) == 0:
        # fall back to a loose (non-exact) keyword search
        url1 = 'https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query='
        url2 = '&sort=&startat='
        url3 = '&callback=angular.callbacks._0'
        baseurl = url1 + kkkey + url2 + '0' + url3
        try:
            page = hp.getHtml(baseurl)
        except urllib.error.URLError:
            print("Washington Post website is not correct, please check the code!")
            return -1
        article_number = regex.get_data('"total"\S(\S+?),"documents', page)[0]
        if int(article_number) == 0:
            print("No Washington Post article was found by this key word")
            return -1
    # get all urls
    count = 0
    index = 0
    urls = []
    page_total = int(article_number) / 20 + 1
    while (count < page_total):
        currenturl = url1 + key + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_data('"contenturl"\S"(https:\/\/www.washingtonpost.com\/opinions/\S+?)"\S"', page)
        if date != 0:
            # stop once we reach articles older than the last stored update
            a_num = check.check_last_update(url, date)
            if a_num != -1:
                urls += url[:-(len(url) - a_num)]
                break
        urls += url
        index += 20
        count += 1
    print(str(len(urls)) + " Urls loaded...")
    print("There are " + str(len(data_base) + len(data_print)) + " loaded files...")
    print("Now starting updating...")
    count = 0
    #count2 = 0
    for url in urls:
        if url in data_base and kkey in data_base[url]:
            #if check.update_key(data_base, url, kkey):
            #    count2 += 1
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('"headline":"(.*?)",', html)
        #<meta content="Julian Zelizer, CNN Political Analyst" name="author">
        author = regex.get_data('this.props.author="(.*?)";', html)
        #<meta content="2018-02-17T00:19:47Z" name="pubdate">
        date = regex.get_data('"datePublished":"(\S+?)T', html)
        text2 = regex.get_data('<article.*?>(.*?)<\/p>\s<\/article>', html)
        if text2 != []:
            text = regex.get_data('<p.*?>(.*?)<\/p>', text2[0])
        else:
            text = regex.get_data('<p.*?>(.*?)<\/p>', html)
        if text == [] or title == []:
            continue
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base) + len(data_print) - 1)
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 2
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 3
        data_print[url]['source'] = fmt.formatted_source("Washington Post")
        # line 4
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 5
        data_print[url]['date'] = fmt.formatted_date(date)
        # line 6
        data_print[url]['author'] = fmt.formatted_author(author, ';')
        # line 7
        data_print[url]['content1'] = fmt.formatted_content_with_symbol(text)
        # line 8
        data_print[url]['content2'] = fmt.formatted_content(text)
        count += 1
    print("Updated " + str(count) + " articles...")
    #if count2 > 0:
    #    print("Updated " + str(count2) + " keys...")
    print("There are " + str(len(data_base) + len(data_print)) + " articles...")
def cbs(data_base, data_print, key, date_, previous_len):
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "cbs." + kkey + " -----")
    print("Start loading Urls...")
    kkkey = re.sub(r'/', '%2F', kkkey)
    #kkkey = re.sub(r'+', '%2B', kkkey)
    kkkey = re.sub(r'%', '%25', kkkey)
    # case for exact keyword search
    url1 = 'https://www.cbsnews.com/search/?q='
    url2 = '&o=1&p='
    url3 = '&t=opinion'
    baseurl = url1 + kkkey + url2 + '1' + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CBS website is not correct, please check the code!")
        return -1
    try:
        article_number = regex.get_text('<h1\sclass="result-title">(\d+)\s', page)[0]
    except IndexError:
        article_number = '0'
    if int(article_number) == 0:
        print("No CBS article was found by this key word")
        return -1
    # get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")
    while (count < page_total):
        currenturl = url1 + key + url2 + str(page_num) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('<a\shref="(\S+?)"><h3\sclass="title"', page)
        date = regex.get_text('<span\sclass="date">(\S+?\s\d+,\s\d+?)\s\S+\s\S+?\s\S+<\/span>', page)
        for cnt in range(0, len(date)):
            date[cnt] = fmt.convert_date(date[cnt])
            if date_ > date[cnt]:
                reach_updated = True
                break
        for i in range(0, cnt + 1):
            try:
                urls['https://www.cbsnews.com' + url[i]] = str(date[i])[0:4] + '-' + str(date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break
        if reach_updated:
            break
        index += 10
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^<]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('"author":{".type":"[^"]+?","name":"([^"]+?)"}', html)
        text1 = []
        text1.append(regex.get_data('<div\sdata-page=[^>]+?><[^>]*?>\n?([^\n]+?)<.?p>', html))
        text2 = regex.get_text('<p>([^\n]+?)<\/p>', html)
        text = text1 + text2
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CBS")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        if len(author) != 0:
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
            elif len(aa) == 0:
                author = 'Noun Noun'
        else:
            author = 'Noun Noun'
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def cnn(data_base, data_print, key, date_, previous_len):
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "cnn." + kkey + " -----")
    # case for exact keyword search
    url1 = 'https://search.api.cnn.io/content?size=10&q=%22'
    url2 = '%22&category=opinion'
    baseurl = url1 + key + url2
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CNN website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('"meta":{\S+?"of":(\d+?),"maxScore', page)[0]
    if int(article_number) == 0:
        print("No CNN article was found by this key word")
        return -1
    # get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading URLs...")
    while (count < page_total):
        currenturl = url1 + key + url2 + '&from=' + str(index) + '&page=' + str(page_num)
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('"url":"([^,]+?.html)"\S"', page)
        #title = regex.get_text('"headline":"([^{]*?)"',page)
        #author = regex.get_text('"byLine":(.*?),',page)
        for i in range(0, len(url)):
            try:
                d = regex.get_data('\/(\d+?\/\d+?\/\d+?)\/', url[i])
            except IndexError:
                break
            d_int = int(re.sub(r'/', '', d))
            if date_ > d_int:
                reach_updated = True
                break
            urls[url[i]] = re.sub(r'/', '-', d)
        if reach_updated:
            break
        index += 10
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^<]+?)\s-\s\w+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('<meta\scontent\S"([^"]+?)"\sname="author">', html)
        text2 = []
        text2.append(regex.get_data('<cite\sclass="el-editorial-source">\s\S\S\S\S\S</cite>([^=]*?)<\/p><\/div>', html))
        text1 = regex.get_text('<div\sclass="zn-body__paragraph\s*?\w*?">([^=]+?)</div>?', html)
        text = text2 + text1
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CNN")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        if len(author) > 5:
            if author[0:3] == "By ":
                author = author[3:]
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
        else:
            author = 'Noun Noun'
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def web_scraper(input):
    key_words = input.split(',')
    keys = []
    #dates = []
    symbols = ['(', ')', ';', '[', ']', '{', '}', '£', '!', '?', '$', '#', '&',
               '@', '^', '*', '+', '\\', '<', '>', '“', '”', '~']
    time = str(datetime.datetime.now().date())
    # Normalize each comma-separated keyword: trim spaces and join words with %20.
    for key_word in key_words:
        try:
            while (key_word[0] == ' '):
                key_word = key_word[1:]
            while (key_word[len(key_word) - 1] == ' '):
                key_word = key_word[:-1]
        except IndexError:
            continue
        if (len(key_word) == 0):
            continue
        inputs = key_word.split(' ')
        key = ''
        for i in inputs:
            key += i
            key += '%20'
        key = key[:-3].lower()
        key = re.sub(r'[ ]+', ' ', key)
        keys.append(key)
        #dates.append(check.check_date(data_base, kkeys[i]))
    # Keep only letters and digits so the keyword is safe to embed in a query URL.
    for i in range(0, len(keys)):
        s_key = fmt.file_name(keys[i], ' ')
        l_key = ''
        for letter in s_key:
            if letter.isdigit() or letter.isalpha():
                l_key += letter
            else:
                l_key += ' '
        keys[i] = re.sub(r'[ ]+', '%20', l_key)
    for i in range(0, len(keys)):
        # Count the lines already stored so new article IDs continue the sequence.
        previous_len = 0
        try:
            for lines in open("Articles.bank", 'r', encoding='utf-8'):
                previous_len += 1
        except FileNotFoundError:
            pass
        stored_keys = check.load_keywords_info()
        data_base = check.load_url_info()
        data_print = {}
        kkey = fmt.file_name(keys[i], '_')
        # Reuse a stored keyword that is within edit distance 1 of the new one.
        for kk in stored_keys:
            if abs(len(kkey) - len(kk)) <= 2 and check.MinEditDist(kkey, kk) <= 1:
                kkey = kk
        keys[i] = re.sub(r'_', '%20', kkey)
        if kkey in stored_keys:
            date = int(re.sub(r'-', '', stored_keys[kkey]))
        else:
            date = 0
        stored_keys[kkey] = time
        if kkey not in data_base:
            data_base[kkey] = []
        cnn.cnn(data_base, data_print, keys[i], date, previous_len)
        foxnews.foxnews(data_base, data_print, keys[i], date, previous_len)
        cbs.cbs(data_base, data_print, keys[i], date, previous_len)
        politico.politico(data_base, data_print, keys[i], date, previous_len)
        #washington_post.washington_post(data_base,data_print,key,date)
        # Persist the keyword/url records only when something new was scraped.
        if len(data_print) > 0:
            check.save_keywords_info(stored_keys)
            check.save_url_info(data_base)
            output_file.output(data_print, data_base, previous_len)
    print("Update date: " + time)
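# A minimal usage sketch, assuming web_scraper is the module's entry point and takes a
# comma-separated keyword string. The command-line handling below is an illustration
# only; the project's real driver script is not shown in this file.
if __name__ == "__main__":
    import sys
    # e.g.  python scraper.py "tax reform, gun control"
    raw_keywords = sys.argv[1] if len(sys.argv) > 1 else input("Key words (comma separated): ")
    web_scraper(raw_keywords)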
def politico(data_base, data_print, key, date_, previous_len):
    kkey = fmt.file_name(key, '_')
    kkkey = fmt.file_name(key, '+')
    print("----- " + "politico." + kkey + " -----")
    # https://www.politico.com/search/2?s=newest&q=tax%20reform&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000
    url1 = 'https://www.politico.com/search/'
    url2 = '?s=newest&q="'
    url3 = '"&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000'
    baseurl = url1 + '1' + url2 + key + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Politico website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('<h1>Results[^<]+?<\/h1>[^<]+?<p>(\d+?)\sSearch\sResults<\/p>', page)[0]
    if int(article_number) == 0:
        print("No Politico article was found by this key word")
        return -1
    # get all urls
    count = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 20 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading URLs...")
    while (count < page_total):
        currenturl = url1 + str(page_num) + url2 + key + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('<a\shref="([^"]+?)"\s[^<]+?<\/a><\/h3>', page)
        date = regex.get_text('<time datetime=.(\d+?\-\d+?\-\d+?)T\S+.>[^<]+?<\/time><\/p>', page)
        #title = regex.get_data('"title":"([^{]*?)",',page)
        for cnt in range(0, len(date)):
            date[cnt] = int(re.sub(r'-', '', date[cnt]))
            if date_ > date[cnt]:
                reach_updated = True
                break
        for i in range(0, cnt + 1):
            try:
                urls[url[i]] = str(date[i])[0:4] + '-' + str(date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break
        if reach_updated:
            break
        page_num += 1
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<title>([^-]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'None':
            title = regex.get_data('<title>([^<]+?)<\/title>', html)
        author = regex.get_data('<div\sitemprop="author"[^>]+?>[^<]+?<meta\s[^<]+?\s[^"]+?="([^>]+?)"\/>', html)
        text = regex.get_text('<p>([^\n]*?)</p>', html)
        if text != []:
            text = text[:-1]
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Politico")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
def foxnews(data_base, data_print, key, date_, previous_len):
    kkey = fmt.file_name(key, '_')
    print("----- " + "foxnews." + kkey + " -----")
    print("Start loading Urls...")
    # case for exact keyword search
    url1 = 'http://api.foxnews.com/v1/content/search?q="'
    url2 = '"&fields=date,description,title,url,image,type,taxonomy&sort=latest&section.path=fnc/opinion&type=article&start='
    url3 = '&callback=angular.callbacks._0'
    baseurl = url1 + key + url2 + '0' + url3
    article_number = '0'
    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Foxnews website is not correct, please update the scraper!")
        return -1
    article_number = regex.get_text('"response"\S\S"numFound":(\S+),"docs":\S', page)[0]
    if int(article_number) == 0:
        print("No Foxnews article was found by this key word")
        return -1
    # get all urls
    count = 0
    index = 0
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)
    reach_updated = False
    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")
    while (count < page_total):
        currenturl = url1 + key + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue
        url = regex.get_text('url":\S"(\S+?)"\S', page)
        #title = regex.get_data('"title":"([^{]*?)",',page)
        for i in range(0, len(url)):
            try:
                d = regex.get_data('\/(\d+\/\d+\/\d+)', url[i])
            except IndexError:
                break
            d_int = int(re.sub(r'/', '', d))
            if date_ > d_int:
                reach_updated = True
                break
            urls[url[i]] = re.sub(r'/', '-', d)
        if reach_updated:
            break
        index += 10
        count += 1
    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")
    for url in urls:
        if url in data_base[kkey]:
            continue
        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue
        title = regex.get_data('<meta\sname="dc.title"\scontent="([^=]+?)">', html)
        author = regex.get_data('<meta\sname="dc.creator"\scontent="([^"]+?)">', html)
        text = regex.get_text('<p[^>]*?>([^\n]*?)</p>[^<]*?<[^/]', html)
        if text != []:
            text = text[:-1]
        if text == [] or title == "Noun":
            continue
        data_base[kkey].append(url)
        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Foxnews")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)
    print("\nThere are " + str(len(data_print) + previous_len) + " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
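# Every scraper above depends on a small hp.getHtml(url) helper that returns the page
# body as a string and raises urllib.error.URLError on failure. Its real implementation
# lives elsewhere in the project; the sketch below is only one plausible way to satisfy
# that contract with the standard library (the User-Agent header is an assumption, added
# because some sites reject urllib's default one).
import urllib.error
import urllib.request

def getHtml(url, timeout=20):
    """Fetch `url` and return the decoded response body."""
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        # Ignore undecodable bytes instead of failing on unusual encodings.
        return response.read().decode('utf-8', errors='ignore')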