Example #1
    # requires: import re, urltools; from newspaper import urls
    def is_valid_url(self, response, url, text, custom_url_pattern):
        # the anchor text must contain at least five words
        word_count = len(re.split(r'\s+', text.strip())) if text else 0

        if custom_url_pattern:
            return word_count >= 5 and bool(custom_url_pattern.search(url))
        else:
            # otherwise, keep only same-domain links that newspaper's
            # url heuristic also accepts
            site_domain = urltools.parse(response.url).domain
            url_domain = urltools.parse(url).domain

            return (word_count >= 5 and url_domain == site_domain
                    and urls.valid_url(url))
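For context, a self-contained sketch of the same heuristic (a word-count threshold on the anchor text plus a same-site check) using only the standard library; urllib.parse stands in for urltools here, so hosts are compared literally rather than reduced to their registered domain:

    import re
    from urllib.parse import urlparse

    def looks_like_article(site_url, url, anchor_text, min_words=5):
        # enough anchor-text words, and the link stays on the page's own host
        word_count = len(re.split(r'\s+', anchor_text.strip())) if anchor_text else 0
        same_host = urlparse(url).netloc == urlparse(site_url).netloc
        return word_count >= min_words and same_host

    print(looks_like_article("https://example.com/",
                             "https://example.com/2024/01/some-story",
                             "Five words of anchor text"))  # True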
Example #2
    def filterLinksForArticles(self, urls):
        validArticleUrls = []
        for url in urls:
            # some links on source web pages are relative (no scheme or
            # domain), so prepend the base url to them
            if "http" not in url:
                url = self.baseUrl + url

            # if the url doesn't contain a certain number of path segments,
            # it's definitely not an article url
            urlSplit = url.split("/")
            if len(urlSplit) < 5:
                continue

            # validate the url to make sure it is an article url before
            # appending it to the validArticleUrls list
            if urlChecker.valid_url(url):
                validArticleUrls.append(url)

        return validArticleUrls
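To make the path-segment check concrete: splitting a typical article url on '/' yields at least five parts, while a bare section page does not. A quick sketch (both urls are made up):

    article = "http://example.com/2020/01/some-story/"
    section = "http://example.com/news"
    print(len(article.split("/")))  # 7 -> passes the len >= 5 check
    print(len(section.split("/")))  # 4 -> filtered out as non-article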
Example #3
    def test_valid_urls(self):
        """Checks our heuristic guess of whether each url is a valid news
        url (based purely on the url) against its expected label, printing
        any mismatch
        """
        from newspaper.urls import valid_url

        with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f:
            lines = f.readlines()
            test_tuples = [tuple(line.strip().split(' ')) for line in lines]
            # tuples are ('1', 'url_goes_here') form, '1' means valid,
            # '0' otherwise

        for label, url in test_tuples:
            truth_val = bool(int(label))
            try:
                self.assertEqual(truth_val, valid_url(url, test=True))
            except AssertionError:
                print('\t\turl: %s is supposed to be %s' % (url, truth_val))
                raise
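The fixture file pairs a 0/1 label with a url, one pair per line. A couple of illustrative rows in that format (not taken from the real data/test_urls.txt):

    1 http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html
    0 http://www.cnn.com/politics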
Example #4
    def test_valid_urls(self):
        """Checks our heuristic guess of whether each url is a valid news
        url (based purely on the url) against its expected label, printing
        any mismatch
        """
        from newspaper.urls import valid_url

        with open(os.path.join(TEST_DIR, 'data/test_urls.txt'), 'r') as f:
            lines = f.readlines()
            test_tuples = [tuple(line.strip().split(' ')) for line in lines]
            # tuples are ('1', 'url_goes_here') form, '1' means valid,
            # '0' otherwise

        for tup in test_tuples:
            assert len(tup) == 2
            label, url = tup
            truth_val = (int(label) == 1)
            try:
                assert truth_val == valid_url(url, test=True)
            except AssertionError:
                print('\t\turl: %s is supposed to be %s' % (url, truth_val))
                raise
Example #5
    def process_links(self, links):
        # keep only links that newspaper's url heuristic recognises as articles
        return [link for link in links if news_urls.valid_url(link.url)]
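The method signature above matches Scrapy's process_links hook on a CrawlSpider rule; a minimal sketch of how such a filter might be registered (the spider name and urls are made up):

    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from newspaper import urls as news_urls

    class NewsSpider(CrawlSpider):
        name = "news"  # hypothetical spider
        start_urls = ["https://example.com/"]
        rules = (
            # process_links receives the extracted links before requests are made
            Rule(LinkExtractor(), callback="parse_item",
                 process_links="process_links"),
        )

        def process_links(self, links):
            return [link for link in links if news_urls.valid_url(link.url)]

        def parse_item(self, response):
            yield {"url": response.url}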
Example #6
import json
import sys
from newspaper import urls

# the url to classify is passed as the first command-line argument
url = sys.argv[1]

is_news = urls.valid_url(url)

is_news_data = {}
is_news_data["is_news"] = is_news

print(json.dumps(is_news_data))
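Run from the command line, the script emits a one-key JSON object (the filename and url here are illustrative; the boolean depends on the heuristic's verdict):

    $ python is_news.py https://example.com/2024/01/some-story.html
    {"is_news": true}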
Example #7
# (setup truncated in the source: `links` is a list of <a> tags and `checkers`
# is an external url validator, likely validator_collection.checkers)
count = 0
for url in links:
    url = url['href']
    if checkers.is_url(url):
        count = count + 1
        print(url)

print(str(count) + " of " + str(len(links)) + " links are articles")

##########################################################################################

# imports implied by the snippet (assumption: `soup` is BeautifulSoup)
import requests
from bs4 import BeautifulSoup as soup
from newspaper import urls

page = requests.get(
    "http://www.marion-press.com/2020/01/asian-rivers-riddled-with-plastic-trash/"
)
soupPage = soup(page.content, 'html.parser')
links = soupPage.find_all('a', href=True)

count = 0
for url in links:
    url = url['href']
    urlSplit = url.split("/")
    # too few path segments: definitely not an article url
    if len(urlSplit) < 5:
        continue
    # skip date-archive links (e.g. .../2020/01/) whose last two non-empty
    # segments are both numeric
    if urlSplit[-2].isnumeric() and urlSplit[-3].isnumeric():
        continue
    if urls.valid_url(url):
        print(url)
        count = count + 1

print(str(count) + " of " + str(len(links)) + " links are articles")

##########################################################################################
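To see when the numeric-segment guard in the loop above fires, split the fetched page's own url (the values follow directly from the url in the snippet):

    url = "http://www.marion-press.com/2020/01/asian-rivers-riddled-with-plastic-trash/"
    parts = url.split("/")
    print(parts[-2], parts[-3])  # asian-rivers-riddled-with-plastic-trash 01
    # parts[-2] is the slug, not numeric, so this url is kept; a bare archive
    # link like .../2020/01/ has two numeric trailing segments and is skipped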