Example #1
def get_articles(folder, sitemap):
    """Download each article listed in the sitemap and save its text and images under crawling/<folder>."""
    last_index = get_last_index('crawling/%s' % folder)
    total = len(sitemap)
    for index, a in enumerate(sitemap):
        key = hash(a['link'])
        if key not in loaded:  # `loaded` caches already-crawled links (persisted to cached.pkl on errors)
            loaded[key] = 1
            article = get_article_name(index + last_index)
            base = 'crawling/%s/%s' % (folder, article)
            try: 
                r = requests.get(a['link'], timeout=p.url_timeout)
                # r = urllib2.urlopen(a['link'])
                html = Soup(r.text)
                title = html.find('h1')
                if title:
                    title = getText(title)
                else:
                    title = ''
                content = content_extractor.analyze(r.content)
                if len(content.split(' ')) >= p.min_length:
                    # print([])
                    content = title.encode('utf-8') + '\n' + a['link'].encode('utf-8') + '\n' + content
                    utils.save_file(base + '.txt', content, False)
                    #get images
                get_images(base, a['images'])
            except requests.exceptions.Timeout:
                utils.save_file('cached.pkl', loaded)
                print("Timeout url: %s" % a['link'])
            except Exception as e:
                utils.save_file('cached.pkl', loaded)
                print("Error occured", e)
        utils.update_progress((index + 1) * 1.0 / total)
def getDragnet(url):
    # fetch HTML
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        r = requests.get(url)
        # get main article without comments
        content = content_extractor.analyze(r.content).replace('\n', '')
        return '', content
    except Exception:
        return '', ''
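Both functions above follow the same fetch-and-extract pattern. The sketch below distills it, assuming only requests and the legacy dragnet.content_extractor API used throughout these examples; the function name and URL are placeholders.

import requests
from dragnet import content_extractor

def fetch_main_text(url, timeout=10):
    # Fetch the page and let dragnet strip the boilerplate, returning plain article text.
    r = requests.get(url, timeout=timeout)
    return content_extractor.analyze(r.content)

# Placeholder URL, for illustration only.
print(fetch_main_text('http://example.com/some-article'))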
Example #3
def clean_content(url_content):
    # `url_content` is a (url, raw HTML) pair.
    url, content = url_content
    try:
        blocks = content_extractor.analyze(content, blocks=True)
        content = ''.join(
            etree.tostring(b.features['block_start_element'])
            for b in blocks)
        if len(content) < CommonCrawlArticles.THRESHOLD_CONTENT_SZ:
            yield url, content
    except (BlockifyError, etree.SerialisationError):
        pass
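A hypothetical caller for clean_content: it is a generator that yields at most one (url, content) pair, so results are obtained by iterating. The URL and HTML string below are placeholders.

sample_url = 'http://example.com/article'                             # placeholder
sample_html = '<html><body><p>Example body text.</p></body></html>'   # placeholder HTML

for url, cleaned in clean_content((sample_url, sample_html)):
    print(url, len(cleaned))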
Example #4
def try_dragnet():
    # fetch HTML
    r = requests.get(ARTICLE)
    # get main article without comments
    content = content_extractor.analyze(r.content)
    print("======")
    print(content)
    # get article and comments
    content_comments = content_comments_extractor.analyze(r.content)
    print("======")
    print(content_comments)
Example #5
def dragnet_test():
    from dragnet import content_extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'dragnet/' + str(i) + '.txt'
        with open(input_filename, 'r') as input_file:
            s = input_file.read()
        article = content_extractor.analyze(s)
        with open(output_filename, 'wb') as output_file:
            output_file.write(article)
Example #6
def download_abstracts_scholar(db, start, num_results, keyword, time_delay=1200):
    """ Download abstracts from google scholar
    :param db: Mongodb database
    :param start: The start page
    :param num_results: Number of results
    :param keyword: Keyword to search for
    :param time_delay: time delay
    :return: None
    """
    querier = ScholarQuerier()
    settings = ScholarSettings()
    querier.apply_settings(settings)
    query = SearchScholarQuery()
    query.set_phrase(keyword)
    query.set_num_page_results(min(20, num_results))
    total = start
    while total < num_results:
        try:
            query.set_start(total)
            querier.send_query(query)
            # querier.save_cookies()
            items = csv(querier)
            for index, item in enumerate(items):
                url = item.strip().split('|')[1]
                content = ''
                try:
                    r = requests.get(url)
                    try:
                        content = content_extractor.analyze(r.content)
                    except Exception as e:
                        sys.stderr.write('Error fetching content: ' + str(e) + '\n')
                except requests.packages.urllib3.exceptions.ProtocolError as e:
                    sys.stderr.write('Error: ' + str(e) + '\n')
                except requests.exceptions.RequestException as e:
                    sys.stderr.write('Error fetching URL ' + url + ': ' + str(e) + '\n')
                except Exception as e:
                    sys.stderr.write('Error fetching URL ' + url + ': ' + str(e) + '\n')
                print(" --------- Abstract %d  ------------ " % (index + 1 + total))
                print(content)
                hash_value = hashlib.md5(content).hexdigest()
                item = db.papers.find_one({'hash': hash_value})
                text = str()
                acm_msg = 'Did you know the ACM DL App is now available? Did you know your Organization can subscribe to the ACM Digital Library?'
                if item is None and content != acm_msg:
                    d = {'keyword': keyword, 'abstract': content, 'text': text, 'hash': hash_value}
                    db.papers.insert(d)
            delay = random.randint(time_delay, time_delay + 600)
            print('Sleeping for %d seconds ... ' % delay)
            time.sleep(delay)
            total += 20
        except KeyboardInterrupt:
            break
    database.dump_db()
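For context, a hypothetical invocation of download_abstracts_scholar, assuming a local MongoDB reachable via pymongo; the connection details and database name are placeholders.

from pymongo import MongoClient

# Placeholder connection and database name; adjust to the actual deployment.
client = MongoClient('localhost', 27017)
db = client['scholar_abstracts']
download_abstracts_scholar(db, start=0, num_results=100,
                           keyword='content extraction', time_delay=1200)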
Example #7
def populate(redis_client):
    """Populates the entries in the database with fields such as headline,
    body, html and url

    # Arguments
        redis_client: Redis client for the language-specific database (provides keys(), get() and lang)

    # Returns
        news: news objects populated with required fields
    """
    keys = redis_client.keys()
    folder = 'docs/{}/'.format(redis_client.lang)
    for key in keys:
        value = redis_client.get(key)
        f = folder + value['id'] + '.json'
        if os.path.isfile(f):
            logging.info('Skipping existing document: {}'.format(f))
            continue
        if value['wayback_url'] == 'None':
            html = fetch(value['url'])
        else:
            html = fetch(value['wayback_url'])
        time.sleep(1)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
        else:
            continue
        headline_elems = soup.select(value['headline_selector'], None)
        if len(headline_elems) > 0:
            headline = headline_elems[0].text.strip()
        else:
            logging.debug(
                'Headline could not be re-found: url={}, selector={}'.format(
                    value['url'], value['headline_selector']))
            continue
        news = OrderedDict()
        news['id'] = value['id']
        news['timestamp'] = value['timestamp']
        news['lang'] = redis_client.lang
        news['url'] = value['url']
        news['wayback_url'] = value['wayback_url']
        news['headline'] = headline.strip()
        news['body'] = content_extractor.analyze(html).strip()
        yield news
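Since populate is a generator, a caller has to consume it; the hypothetical driver below writes each record to the docs/<lang>/<id>.json path that the function checks for. The helper name is an assumption.

import json
import os

def dump_news(redis_client):
    # Hypothetical consumer: persist each populated record as JSON under docs/<lang>/.
    folder = 'docs/{}/'.format(redis_client.lang)
    if not os.path.isdir(folder):
        os.makedirs(folder)
    for news in populate(redis_client):
        with open(folder + news['id'] + '.json', 'w') as fp:
            json.dump(news, fp, indent=2)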
Example #9
# https://github.com/seomoz/dragnet

import sys
import requests
from dragnet import content_extractor

# fetch HTML
# https://github.com/seomoz/dragnet
# https://moz.com/devblog/dragnet-content-extraction-from-diverse-feature-sets/
# http://antonioleiva.com/collapsing-toolbar-layout/
url = sys.argv[1]
r = requests.get(url, timeout=10)

# get main article without comments
content = content_extractor.analyze(r.content)

print(content)
Example #10
    def process_item(self, item, spider):
        fullHTML = item['content']
        content = content_extractor.analyze(fullHTML)
        item['content'] = content

        return item
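For this Scrapy pipeline to run, it has to be registered in the project settings; a minimal sketch follows, where the module path, class name and priority are placeholders.

# settings.py (hypothetical module path, class name and priority)
ITEM_PIPELINES = {
    'myproject.pipelines.DragnetContentPipeline': 300,
}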
Example #11
def extract_by_dragnet(html):
    content = content_extractor.analyze(html)

    return {
        'body': content.decode('utf-8'),
    }
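A minimal usage sketch for extract_by_dragnet, assuming requests for fetching; the URL is a placeholder.

import requests

# Placeholder URL; extract_by_dragnet expects raw HTML.
r = requests.get('http://example.com/article', timeout=10)
print(extract_by_dragnet(r.content)['body'])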