def is_valid_url(self, response, url, text, custom_url_pattern):
    # Require at least five words of link text, plus either a match against the
    # caller-supplied URL pattern or a same-domain URL that newspaper accepts.
    word_count = len(re.split('\\s+', text.strip())) if text else 0
    if custom_url_pattern:
        return word_count >= 5 and custom_url_pattern.search(url)
    else:
        site_domain = urltools.parse(response.url).domain
        url_domain = urltools.parse(url).domain
        return word_count >= 5 and url_domain == site_domain and urls.valid_url(url)

def filterLinksForArticles(self, urls):
    validArticleUrls = []
    for url in urls:
        # some links on source web pages only contain the relative path and not the
        # full domain, therefore prepend the base url to the url
        if "http" not in url:
            url = self.baseUrl + url
        # if the url doesn't contain a certain number of sub folders, it's
        # definitely not an article url
        urlSplit = url.split("/")
        if len(urlSplit) < 5:
            continue
        # validate the url to make sure it is an article url before appending it
        # to the validArticleUrls list
        if urlChecker.valid_url(url):
            validArticleUrls.append(url)
    return validArticleUrls

def test_valid_urls(self):
    """Prints out a list of urls with our heuristic guess if it is a
    valid news url purely based on the url
    """
    from newspaper.urls import valid_url

    with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f:
        lines = f.readlines()
        test_tuples = [tuple(l.strip().split(' ')) for l in lines]
        # tuples are ('1', 'url_goes_here') form, '1' means valid,
        # '0' otherwise

    for lst, url in test_tuples:
        truth_val = bool(int(lst))
        try:
            self.assertEqual(truth_val, valid_url(url, test=True))
        except AssertionError:
            print('\t\turl: %s is supposed to be %s' % (url, truth_val))
            raise

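The fixture format is the one the comment describes: a 1/0 label, a single space, then the URL. The two lines below are illustrative only, not taken from the real data/test_urls.txt:

1 http://www.example.com/2014/05/07/some-news-story/
0 http://www.example.com/about
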
# Python 2 variant of the same test (note the old-style except and print syntax).
def test_valid_urls(self):
    """Prints out a list of urls with our heuristic guess if it is a
    valid news url purely based on the url
    """
    from newspaper.urls import valid_url

    with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f:
        lines = f.readlines()
        test_tuples = [tuple(l.strip().split(' ')) for l in lines]
        # tuples are ('1', 'url_goes_here') form, '1' means valid,
        # '0' otherwise

    for tup in test_tuples:
        lst = int(tup[0])
        url = tup[1]
        assert len(tup) == 2
        truth_val = True if lst == 1 else False
        try:
            assert truth_val == valid_url(url, test=True)
        except AssertionError, e:
            print '\t\turl: %s is supposed to be %s' % (url, truth_val)
            raise

def test_valid_urls(self):
    """Prints out a list of urls with our heuristic guess if it is a
    valid news url purely based on the url
    """
    from newspaper.urls import valid_url

    with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f:
        lines = f.readlines()
        test_tuples = [tuple(l.strip().split(' ')) for l in lines]
        # tuples are ('1', 'url_goes_here') form, '1' means valid,
        # '0' otherwise

    for tup in test_tuples:
        lst = int(tup[0])
        url = tup[1]
        assert len(tup) == 2
        truth_val = True if lst == 1 else False
        try:
            assert truth_val == valid_url(url, test=True)
        except AssertionError:
            print('\t\turl: %s is supposed to be %s' % (url, truth_val))
            raise

def process_links(self, links):
    # drop extracted links whose URLs newspaper's heuristic does not flag as articles
    return [link for link in links if news_urls.valid_url(link.url)]

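This reads like a process_links hook on a Scrapy CrawlSpider rule. Below is a minimal sketch of how such a hook is typically wired up, assuming Scrapy; the spider name, start URL, and parse_item callback are hypothetical, not taken from the snippet itself.

# Sketch only: assumes Scrapy; spider name, domain, and callback are hypothetical.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from newspaper import urls as news_urls


class NewsSpider(CrawlSpider):
    name = 'news'                              # hypothetical
    start_urls = ['http://www.example.com/']   # hypothetical

    # process_links receives each batch of extracted links before they are scheduled
    rules = (
        Rule(LinkExtractor(), process_links='process_links',
             callback='parse_item', follow=True),
    )

    def process_links(self, links):
        # keep only links whose URL newspaper's heuristic flags as an article
        return [link for link in links if news_urls.valid_url(link.url)]

    def parse_item(self, response):
        yield {'url': response.url}
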
#from newspaper import
import json
import sys

from newspaper import urls

url = sys.argv[1]
is_news = urls.valid_url(url)
is_news_data = {}
is_news_data["is_news"] = is_news
print(json.dumps(is_news_data))

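Saved as a standalone script, the snippet above takes the URL to test as its only command-line argument and prints a one-key JSON object, e.g. {"is_news": true} or {"is_news": false}, depending on how newspaper's valid_url heuristic classifies the URL.
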
# (continuation of an earlier loop over links collected the same way as below)
for url in links:
    url = url['href']
    if checkers.is_url(url):
        count = count + 1
        print(url)

print(str(count) + " of " + str(len(links)) + " links are articles")

##########################################################################################

page = requests.get(
    "http://www.marion-press.com/2020/01/asian-rivers-riddled-with-plastic-trash/"
)
soupPage = soup(page.content, 'html.parser')
links = soupPage.find_all('a', href=True)
count = 0
for url in links:
    url = url['href']
    urlSplit = url.split("/")
    if len(urlSplit) < 5:
        continue
    if urlSplit[-2:-1][0].isnumeric() and urlSplit[-3:-2][0].isnumeric():
        continue
    if urls.valid_url(url):
        print(url)
        count = count + 1

print(str(count) + " of " + str(len(links)) + " links are articles")

##########################################################################################
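A quick illustration of the two path checks in the second loop, using the article URL fetched above and a hypothetical month-archive URL on the same site:

# Article permalink: the segment before the trailing slash is the slug, which is
# not numeric, so the URL is kept and passed on to urls.valid_url.
"http://www.marion-press.com/2020/01/asian-rivers-riddled-with-plastic-trash/".split("/")
# -> ['http:', '', 'www.marion-press.com', '2020', '01',
#     'asian-rivers-riddled-with-plastic-trash', '']

# Month-archive page (hypothetical URL): the last two non-empty segments are both
# numeric ('2020' and '01'), so the loop skips it via the isnumeric() check.
"http://www.marion-press.com/2020/01/".split("/")
# -> ['http:', '', 'www.marion-press.com', '2020', '01', '']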