Example #1
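All of these scrape examples share the same signature and assume a common set of module-level imports in the files they come from. A minimal sketch of those imports, inferred from the calls used below (the utils module is project-specific and assumed):

import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import utils  # project helpers: clean_soup(), clean_whitespaces()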
def scrape(curr_url, hash, soup, results):
    print('Found gaceta.es...')

    # article
    for t in soup.find_all('div', class_='article-post-content'):
        if len(t.find_all('h1', class_='entry-title')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('time', class_='date'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)

            dt["meta"] = dm
            for c in t.find_all('div', class_='post-content'):
                dt["text"] = utils.clean_soup(c)

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
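utils.clean_soup and utils.clean_whitespaces are used throughout but never shown. A plausible minimal implementation, assuming clean_soup extracts a tag's visible text and clean_whitespaces collapses whitespace runs (the real project helpers may do more):

import re

def clean_soup(tag):
    # Visible text of a BeautifulSoup tag, with single-space separators.
    return tag.get_text(separator=' ', strip=True)

def clean_whitespaces(text):
    # Collapse any run of whitespace (including newlines) to one space.
    return re.sub(r'\s+', ' ', text).strip()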
Example #2
def scrape(curr_url, hash, soup, results):
    print('Found ilpopulista.it...')

    for t in soup.find_all('body', class_='news'):
        if len(soup.find_all('div', class_='vc_article_body')) > 0:
            print('Getting custom article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('p', class_='autore_articolo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('p', class_='data_articolo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='titolo_articolo'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='vc_article_body'):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #3
def scrape(curr_url, hash, soup, results):
    print('Found mediterraneodigital.com...')

    # article
    for t in soup.find_all('div', class_='item-page'):
        print('Getting joomla article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('dl', class_='article-info'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='article-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('section', class_='article-content'):
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
Example #4
def scrape(curr_url, hash, soup, results):
    print('Found arxaiaithomi.gr...')

    # article
    for t in soup.find_all('div', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='post-footer'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-headline > h2'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='post-bodycopy'):
                for d in c.find_all(recursive=False):
                    if d.name != 'div':
                        result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
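Examples like the one above build the JSON string by hand, which yields invalid output whenever the scraped text contains a double quote or backslash, and dangling keys when a selector matches nothing. The dict-based examples sidestep this by letting json.dumps do the escaping; a minimal equivalent of the hand-built pattern, reusing the names from the example above:

dt = {'meta': {'id': str(hash), 'type': 'article',
               'source': curr_url, 'meta': '', 'title': ''},
      'text': ''}
for c in t.find_all('div', class_='post-footer'):
    dt['meta']['meta'] += utils.clean_soup(c)
for c in t.select('div.post-headline > h2'):
    dt['meta']['title'] += utils.clean_soup(c)
# json.dumps escapes quotes/backslashes that would break the string version
result = json.dumps(dt, ensure_ascii=False)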
Example #5
def scrape(curr_url, hash, soup, results):
    print('Found periodistadigital.com...')

    for t in soup.find_all('div', id='m4p-post-detail'):
        print('Getting wordpress article...')

        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='m4p-author_time'):
            result = result + '\"meta\":\"' + utils.clean_soup(c)
        result = result + '\",'
        for c in t.find_all('h1', class_='m4p-size-1'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
        result = result + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='m4p-post-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
Example #6
def scrape(curr_url, hash, soup, results):
    print('Found la7.it...')

    for t in soup.find_all('body', class_='node-type-la7-video'):
        print('Getting drupal article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='infoVideoRow'):
            for d in c.find_all('div', class_='dateVideo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find_all('div', class_='infoVideoRow'):
            for d in c.find_all('h1'):
                dm["title"] = dm["title"] + utils.clean_soup(d) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('div', class_='occhiello'):
            for d in c.find_all('p'):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
Example #7
def scrape(curr_url, hash, soup, results):
    print('Found rainews.it...')

    # article
    for t in soup.find_all('div', class_='boxArticle'):
        if len(t.find_all('article', class_='')) > 0:
            print('Getting custom article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='text'):
                for d in c.find_all('time', class_='articleDate'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
            dm["title"] = ''
            for c in t.find_all('div', class_='title'):
                for d in c.find_all('h1', class_=''):
                    dm["title"] = dm["title"] + utils.clean_soup(d) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='text'):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #8
def scrape(curr_url, hash, soup, results):
    print('Found ilprimatonazionale.it...')

    # article
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='td-post-content')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='td-module-meta-info'):
                for d in c.find_all('span', class_='td-post-date'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
                for d in c.find_all('div', class_='td-post-author-name'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
                break
            dm["title"] = ''
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='td-post-content'):
                for d in c.find_all('p', class_=''):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #9
def scrape(curr_url, hash, soup, results):
    print('Found okdiario.com...')

    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('address', class_='autor'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            for c in t.find_all('div', class_='entry-content'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #10
def scrape(curr_url, hash, soup, results):
    print('Found ekklisiaonline.gr...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.select('h6.entry-date'):
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.select('div#article > article'):
                for d in c.find_all('p', class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #11
def scrape(curr_url, hash, soup, results):
    print('Found fratelli-italia.it...')

    for t in soup.find_all('body', class_='single-post'):
        print('Getting wordpress article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('ul', class_='post-options'):
            for d in c.find_all('time'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        for c in t.find_all('div', class_='post-tags'):
            for d in c.find_all('a'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find('div', id='wrappermain-cs').find(
                'div', class_='breadcrumb').find_all('h1',
                                                     class_='cs-page-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find('article', class_='type-post').find(
                'div', class_='detail_text').find_all('p'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
Example #12
def scrape(curr_url, hash, soup, results):
    print('Found lasvocesdelpueblo.com...')

    for t in soup.find_all('div', class_='wrap-content'):
        if len(t.find_all('div', class_='entry-content-inner')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('div', class_='avatar-meta'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)

            dt["meta"] = dm
            for c in t.find_all('div', class_='entry-content-inner'):
                dt["text"] = utils.clean_soup(c)

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #13
def scrape(curr_url, hash, soup, results):
    print('Found diarioya.es...')

    # article
    for t in soup.find_all('div', class_='node-content'):
        if len(t.find_all('h1', class_='title')) > 0:
            print('Getting custom article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('span', class_='article-header__time'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='title'):
                dm["title"] = utils.clean_soup(c)

            dt["meta"] = dm
            for c in t.find_all('div', class_='content'):
                dt["text"] = utils.clean_soup(c)

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #14
def scrape(curr_url, hash, soup, results):
    print('Found alertadigital.com...')

    for t in soup.find_all('div', id='homepost'):
        if len(t.find_all('div', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', id='datemeta'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h2', class_=''):
                result = result + '\"title\":\"' + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            for c in t.find_all('div', class_='entry'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #15
def scrape(curr_url, hash, soup, results):
    print('Found elikoncc.info...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='post-category'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #16
def scrape(curr_url, hash, soup, results):
    print('Found defencereview.gr...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='newsmag-post-meta'):
                for d in c.select('a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.newsmag-date'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.newsmag-custom-header'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #17
def scrape(curr_url, hash, soup, results):
    print('Found destra.it...')

    # article
    for t in soup.find_all('div', class_='single'):
        print('Getting wordpress article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('span', class_='post-author'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('span', class_='post-date'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('h1', class_=''):
                dm["title"] = dm["title"] + utils.clean_soup(d) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('div', class_='post-content'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-text'):

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url

            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #18
def scrape(curr_url, hash, soup, results):
    print('Found identità.it...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='single')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='entry-meta'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('header', class_=''):
                for d in c.find_all('h2', class_=''):
                    dm["title"] = dm["title"] + utils.clean_soup(d) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all('p', class_=''):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-text'):

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url

            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #19
def scrape(curr_url, hash, soup, results):
    print('Found olympia.gr...')

    # reload with selenium
    with webdriver.Firefox() as driver:

        try:
            driver.implicitly_wait(5)
            driver.maximize_window()
            driver.get(curr_url)
            driver.find_element_by_tag_name('body').send_keys(Keys.END)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'disqus_thread')))
            content = driver.page_source
            # for i in cont.find_all('iframe'):
            #     if i.has_attr('src') and i['src'].find('disqus.com/embed') >= 0:
            #         ds_url = i['src']
            #         print('found discus thread with url:', ds_url)
            #         break
        except Exception:
            content = ''
            print('webdriver timeout... ')

        driver.close()

    # article
    for t in BeautifulSoup(content, "html.parser").find_all('article', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.select('div.tdb-block-inner > time.entry-date'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.select('ul.tdb-tags > li > a'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.select('h1.tdb-title-text'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.select('div.wpb_wrapper > div.tdb_single_content > div.tdb-block-inner'):
                for d in c.find_all('p', class_=None):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #20
def scrape(curr_url, hash, soup, results):
    print('Found disidentia.com...')
    counter = 0

    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')

        counter += 1
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='td-module-meta-info'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            break
        for c in t.find_all('h1', class_='entry-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
            break
        result = result + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='td-post-content tagdiv-type'):
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)

    # comments
    if len(soup.find_all('ol', class_='comment-list')) > 0:
        print('Getting custom comments...')
        for t in soup.find_all('div', class_='comment-content'):

            counter += 1
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #21
def scrape(curr_url, hash, soup, results):
    print('Found espana2000.es...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        # if t.has_attr('id') and t['id'].find('post') >= 0:
        print('Getting wordpress article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='post-meta-wrapper'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='entry-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('div', class_='entry-content'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

    # comments
    for t in soup.find_all('div', class_='comments-wrapper'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url

            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #22
def scrape(curr_url, hash, soup, results):
    print('Found somatemps.me...')

    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('p', class_='postmetadata'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='posttitle'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('section', class_='entry'):
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

    # comments
    for t in soup.find_all('ol', class_='commentlist'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-wrapper'):

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url

            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #23
def scrape(curr_url, hash, soup, results):
    print('Found katohika.gr...')

    # article
    for t in soup.find_all('div', id='content'):
        if len(soup.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', class_='entry-author'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments-section'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #24
def scrape(curr_url, hash, soup, results):
    print('Found makeleio.gr...')

    # article
    for t in soup.find_all('div', class_='single-style1-wrap'):
        print('Getting wordpress article...')

        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='single-style1-meta-tag'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
        for c in t.find_all('div', class_='single-style1-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='single-style1-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)

    # comments
    for t in soup.find_all('div', class_='comments-area'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #25
def scrape(curr_url, hash, soup, results):
    print('Found sioeeu.wordpress.com...')

    # article
    for t in soup.select('article.type-post.format-standard'):
        if len(t.select('div.single-entry-content > p')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.select('header.single-entry-header > p'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)
            dt["meta"] = dm

            dt["text"] = ''
            for c in t.select('div.single-entry-content > p'):
                dt["text"] = dt["text"] + ' ' + utils.clean_soup(c)

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.select('div.comment-body > p'):

            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = utils.clean_soup(c)

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #26
def scrape(curr_url, hash, soup, results):
    print('Found youtube...')

    # load and manipulate the website
    with webdriver.Firefox(options=FirefoxOptions()) as driver:

        driver.maximize_window()
        driver.implicitly_wait(5)
        ac = ActionChains(driver)

        # load the website
        try:
            driver.get(curr_url)
            time.sleep(5)
            # WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "info-text"))).click()
            # WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.DOWN)
            # for item in driver.find_elements_by_tag_name('body'):
            #     ac.move_to_element(item).move_by_offset(5, 5).click().perform()
            #     time.sleep(1)
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.DOWN)
                time.sleep(1)
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.END)
                time.sleep(5)
            content = driver.page_source
        except Exception:
            print('webdriver timeout... ')
            content = ''

        # close the driver
        driver.close()

    # parse the comments
    for t in BeautifulSoup(content, "html.parser").find_all(
            'yt-formatted-string',
            id='content-text'):  # class_='post-message'):

        dt = {}
        dm = {}

        dm["id"] = str(hash)
        dm["type"] = 'yt_comment'
        dm["source"] = curr_url

        dt["meta"] = dm
        dt["text"] = utils.clean_soup(t)

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
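The Selenium examples call the legacy find_element_by_tag_name helper, which was deprecated in Selenium 4 and removed in later 4.x releases. A minimal sketch of the same scroll pattern under the current API, assuming a driver and the imports shown at the top:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4+: find_element(By.TAG_NAME, 'body') replaces
# find_element_by_tag_name('body').
for _ in range(10):
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.DOWN)
    time.sleep(1)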
Example #27
def scrape(curr_url, hash, soup, results):
    print('Found termometropolitico.it...')
    for t in soup.find_all('body', class_='single-post'):
        # if t.has_attr('id') and t['id'].find('post') >= 0:
        print('Getting wordpress article...')

        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        for c in t.find_all('div', class_='single_info'):
            dm["meta"] = utils.clean_soup(c)
        for c in t.find_all('h1', class_='single_title'):
            dm["title"] = utils.clean_soup(c)

        dt["meta"] = dm
        for c in t.find_all('div', class_='single_content'):
            dt["text"] = utils.clean_soup(c)

        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
Example #28
def scrape(curr_url, hash, soup, results):
    print('Found imolaoggi.it...')

    # article
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('h1', class_='entry-title')) > 0:
            print('Getting wordpress article...')

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('span', class_='post-author'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('span', class_='posted-on'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('span', class_='cat-links'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '

            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all(class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('ol', class_='commentlist'):
        print('Getting wordpress comments...')
        for c in t.find_all('li', class_='comment'):

            dt = {}
            dm = {}

            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url

            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '

            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
Example #29
def scrape(curr_url, hash, soup, results):
    print('Found a website...')

    # load and manipulate the website
    with webdriver.Firefox(options=FirefoxOptions()) as driver:

        driver.maximize_window()
        driver.implicitly_wait(5)

        # load the website
        try:
            driver.get(curr_url)
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.DOWN)
                time.sleep(1)
                driver.find_element_by_tag_name('body').send_keys(Keys.END)
                time.sleep(1)
            content = driver.page_source
        except Exception:
            print('webdriver timeout... ')
            content = ''

        # close the driver
        driver.close()

    # parse the data
    dt = {}
    dm = {}

    dm["id"] = str(hash)
    dm["type"] = 'web_unstructured'
    dm["source"] = curr_url

    dt["meta"] = dm
    dt["text"] = utils.clean_soup(BeautifulSoup(content, "html.parser"))

    result = json.dumps(dt, ensure_ascii=False)
    results.append(result)
    print(result)
Example #30
def scrape(curr_url, hash, soup, results):
    print('Found hellenicns.gr...')

    # articles
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='below-entry-meta'):
                for d in c.select('time.published'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.author > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.tag-links'):
                    result = result + utils.clean_soup(d) + ' '
                break
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
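None of the examples show how a handler is picked for a given URL. A hypothetical dispatch loop, assuming one scrape function per site is registered in a HANDLERS mapping (the per-site names here are illustrative, not from the original project):

from urllib.parse import urlparse

HANDLERS = {
    'gaceta.es': scrape_gaceta,            # hypothetical per-site handlers
    'ilpopulista.it': scrape_ilpopulista,
}

def dispatch(curr_url, hash, soup, results):
    # Match the URL's domain (without a leading www.) to a handler.
    domain = urlparse(curr_url).netloc
    if domain.startswith('www.'):
        domain = domain[4:]
    handler = HANDLERS.get(domain)
    if handler is not None:
        handler(curr_url, hash, soup, results)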