Example #1
def scrape_bbc_home(uReq, soup, keyword_list):
    
    logging = None #set truthy to enable verbose log output
    
    base_url = 'http://www.bbc.co.uk' #url to scrape
    init_path = "/news" #base url extension
    
    page_html = access_url(base_url + init_path, uReq)#make request for page
    
    if page_html is not None:
    
        page_soup = soup(page_html, "html.parser") #convert the html to a soup object
        tag_array = page_soup.findAll("div", {"class" : "gs-c-promo"}) #find tags in the soup object

        if len(tag_array) > 0: #only execute if tags have been found

            beef_objects = []
            
            #load saved urls
            saved_urls = get_saved_urls(base_url)
            
            percent_per_scrape = 100/len(tag_array)

            for x in range(0, len(tag_array)): #for each tag
                
                print(str(round(x * percent_per_scrape)) + "% complete.")

                if(tag_array[x].a): #ensure the element has an anchor tag

                    if("http://" in tag_array[x].a["href"]): #check if the a href is an absolute url or an absolute path
                        sub_page_url = tag_array[x].a["href"]

                    else:
                        sub_page_url = base_url + tag_array[x].a["href"]
                        
                    path_split_1 = sub_page_url.split("/")#split path by /
                    path_split_2 = path_split_1[len(path_split_1) - 1 ].split("-")#get final field in path_split_1 and split by -
                    
                    if path_split_2[0] != "blogs": #ensure we are not scraping a blog page
                        
                        if any(url_obj["url"] == sub_page_url for url_obj in saved_urls): #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")
                        
                        else:
                            if logging:
                                print("preloaded url not found, initiating scrape.")
                                
                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events, 
                            save_url(base_url, sub_page_url)
                            
                            beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list) #scrape this article
                            
                            if beef_object != None:
                                beef_objects.append(beef_object)
                                #beef_object.print_beef()

            return beef_objects

        return [] #no promo tags found on the page
    else:
        return []
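These home scrapers depend on an access_url helper and expect the caller to pass in uReq and soup; neither is shown in the examples. A minimal sketch of how they might be wired up, assuming uReq is urllib.request.urlopen and soup is bs4.BeautifulSoup (the helper below is illustrative, not the original implementation):

def access_url(url, uReq):
    #fetch the raw html for a url; callers treat None as "request failed"
    try:
        client = uReq(url)
        page_html = client.read()
        client.close()
        return page_html
    except Exception:
        return None

#example call, assuming the standard-library urlopen and BeautifulSoup
#from urllib.request import urlopen as uReq
#from bs4 import BeautifulSoup as soup
#beef_objects = scrape_bbc_home(uReq, soup, ["feud", "diss"])
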
def scrape_hiphopdx_home(uReq, soup, keyword_list):

    logging = None

    base_url = 'https://hiphopdx.com'  #url to scrape
    initial_suffix = "/news"

    raw_page_html = access_url(base_url + initial_suffix,
                               uReq)  #make request for page

    if raw_page_html is not None:

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object

        news_tag = page_soup.find(
            "div",
            {"class": "wire"})  #find the news wire container in the soup object

        beef_objects = []

        #load saved urls
        saved_urls = get_saved_urls(base_url)

        if news_tag:  #only execute if the news container has been found

            anchor_tags = news_tag.findAll("a")
            percent_per_scrape = 100 / len(anchor_tags) if anchor_tags else 0

            for x, a in enumerate(anchor_tags):

                print(str(round(x * percent_per_scrape)) + "% complete.")

                if a and a["href"] and a["class"][0] != "next":

                    sub_page_url = base_url + a["href"]

                    if any(
                            url_obj["url"] == sub_page_url
                            for url_obj in saved_urls
                    ):  #check through pre loaded urls to ensure url has not already been scraped
                        if logging:
                            print("preloaded url found, aborting scrape.")

                    else:
                        if logging:
                            print(
                                "preloaded url not found, initiating scrape.")

                        #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                        save_url(base_url, sub_page_url)

                        beef_object = scrape_article(sub_page_url, uReq, soup,
                                                     keyword_list)

                        if beef_object != None:
                            beef_objects.append(beef_object)

        return beef_objects
    else:
        return []
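Both home scrapers also lean on get_saved_urls and save_url to avoid re-scraping an article. Their storage backend is not shown; from the call sites, get_saved_urls(base_url) returns a list of dicts carrying a "url" key and save_url(base_url, url) records a newly visited link. A minimal JSON-file-backed sketch under those assumptions (the file name is hypothetical):

import json
import os

URL_STORE = "saved_urls.json"  #hypothetical store; the real persistence layer is not shown

def get_saved_urls(base_url):
    #return previously scraped urls for this site as [{"url": ...}, ...]
    if not os.path.exists(URL_STORE):
        return []
    with open(URL_STORE) as store_file:
        store = json.load(store_file)
    return store.get(base_url, [])

def save_url(base_url, url):
    #append the url under its site key and persist the whole store
    store = {}
    if os.path.exists(URL_STORE):
        with open(URL_STORE) as store_file:
            store = json.load(store_file)
    store.setdefault(base_url, []).append({"url": url})
    with open(URL_STORE, "w") as store_file:
        json.dump(store, store_file)
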
def scrape_article(path, uReq, soup, keyword_list):
    
    
    sub_page_html = access_url(path, uReq)
    
    if sub_page_html is not None:
    
        sub_page_soup = soup(sub_page_html, "html.parser")

        content_tag_array = sub_page_soup.findAll("div", {"class" : "page-content"}) #find tags in the soup object
        
        relevant_story = None
        
        if(len(content_tag_array) > 0):
            
            content_string = "" #init content string
            img_link = "" #init content string

            #check each p tag found for words from the keyword list
            for p in content_tag_array[0].findAll('p'):

                if p.a is None:
                    content_string += p.text
                elif p.a.img is not None and p.a.img.get("src"):
                    img_link = p.a.img["src"]

                if(len(keyword_list) > 0): #if keyword list has values, use them to filter stories, if it is empty, automatically approve story

                    #check if any text from page contains key words stored in list, if keyword found, print page text
                    if(any(keyword in p.text for keyword in keyword_list)):
                        relevant_story = True

                else:
                    relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title = sub_page_soup.findAll("div", {"class" : "page_header"})[0].h2.text #find tags in the soup object
            
            #article is relevant, build a beef record
            if(relevant_story): #execute if a story contains a keyword

                store_event_classification(title, content_string) #classify event and store the classification for later use
                
                media_tag_array = sub_page_soup.findAll("iframe") #find embedded media iframes in the soup object

                date_string = sub_page_soup.find("span", {"class" : "date"}).text.replace("Posted ", "") #find tags in the soup object
                date_split = date_string.split("/")
                date_string = date_split[1] + "/" + date_split[0] + "/" + date_split[2]

                actors_list = extract_names(content_string) #extract actors from content_string
                highlights = extract_quotes(content_string) #extract quotes from content_string
                categories = [1]

                media_link = {
                    "link": "",
                    "type": ""                    
                }

                if len(media_tag_array) > 0:
                    link = media_tag_array[0]["src"]
                    link_type = ""

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"

                    media_link = {
                        "link": link,
                        "type": link_type 
                    }

                #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string, date_string, highlights, path, categories, img_link, media_link) #create beefObject 

                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
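The iframe-to-media_link classification in scrape_article is repeated almost verbatim in the examples that follow. If the scrapers were refactored, the branching could live in one helper; a sketch (classify_media_link is a hypothetical name, not part of the original code):

def classify_media_link(link, default_type=""):
    #map an embed url to the {"link", "type"} dict expected by BeefObject
    for service in ("youtube", "spotify", "soundcloud", "twitter"):
        if service in link:
            return {"link": link, "type": service}
    return {"link": link, "type": default_type}

Call sites that currently fall back to "video_embed" could pass that string as default_type.
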
def scrape_article(path, uReq, soup, keyword_list):

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        body_tag = sub_page_soup.find("div",
                                      {"class": "article-content-container"
                                       })  #find tags in the soup object

        relevant_story = None

        if body_tag:

            content_string = ""  #init content string

            #check each p tag found for words from the keyword list
            for p in body_tag.section.findAll("p"):

                content_string += p.text

                if p is not None and len(
                        keyword_list
                ) > 0:  #if keyword list has values, use them to filter stories, if it is empty, automatically approve story

                    #check if any text from page contains key words stored in list, if keyword found, print page text
                    if (any(keyword in p.text for keyword in keyword_list)):
                        relevant_story = True
                        break

                else:
                    relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title = body_tag.h2.text.strip(
            )  #find tags in the soup object for beef object title

            #article is relevant, build a beef record
            if (relevant_story):  #execute if a story contains a keyword

                store_event_classification(
                    title, content_string
                )  #classify event and store the classification for later use

                img_link = "" #init image link so the BeefObject call below never sees an undefined name

                img_tag_array = sub_page_soup.findAll(
                    "img", {"class": "article-gallery-cover"})

                if len(img_tag_array) > 0 and img_tag_array[0].get("src"):
                    img_link = img_tag_array[0]["src"]

                #relevant_story = None;

                date_string = sub_page_soup.find("div", {
                    "class": "editorBlock-date"
                }).text.replace("\n", "")  #find tags in the soup object
                date_split = date_string.lstrip().split(
                    ", "
                )  #split to get month and day in slot [0] and year and rest of string in [1]
                secondary_date_split = date_split[0].split(
                    " ")  #split to separate month and day
                tertiary_date_split = date_split[1].split(
                    " ")  #split to separate year from rest of string

                final_date_string = secondary_date_split[1] + "/" + str(
                    globals.get_month_number(secondary_date_split[0])
                ) + "/" + tertiary_date_split[0]

                actors_list = extract_names(
                    content_string)  #extract actors from content_string
                highlights = extract_quotes(
                    content_string)  #extract quotes from content_string
                categories = [1]

                link_raw = body_tag.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}

                if len(link_raw) > 0:
                    link = link_raw[0]["src"]

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"

                    media_link = {"link": link, "type": link_type}

                #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link,
                                      media_link)  #create beefObject

                #beef_obj.print_beef()

                return beef_obj

            else:
                return None
        else:
            return None

    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        #body_tag = sub_page_soup.find("div", {"class" : "article-content-container"}) #find tags in the soup object

        relevant_story = None

        if sub_page_soup:

            content_string = ""  #init content string

            #check each p tag found for words from the keyword list
            for p in sub_page_soup.findAll("p"):

                if p is not None and (
                        p.a is None or "bossip" in p.a.get("href", "")
                ) and "Bossip Newsletter" not in p.text and "WENN" not in p.text:
                    content_string += p.text

                if p is not None and len(
                        keyword_list
                ) > 0:  #if keyword list has values, use them to filter stories, if it is empty, automatically approve story

                    #check if any text from page contains key words stored in list, if keyword found, print page text
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True

                else:
                    relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title_tag = sub_page_soup.find("h1")

            if title_tag and title_tag.text:
                title = title_tag.text.split("[")[0]

            #article is relevant, build a beef record
            if (relevant_story):  #execute if a story contains a keyword

                store_event_classification(
                    title, content_string
                )  #classify event and store the classification for later use

                img_tag_array = sub_page_soup.findAll(
                    "img", {"class": ["size-large", "size-full"]})

                if len(img_tag_array) > 0 and img_tag_array[0].get("src"):
                    img_link = img_tag_array[0]["src"]
                else:
                    return None

                date_string = sub_page_soup.find(
                    "time", {"class": "date"
                             })["datetime"]  #find tags in the soup object
                date_split = date_string.lstrip().split(
                    "-"
                )  #split the ISO-style datetime into year [0], month [1], and day plus time [2]

                final_date_string = date_split[2].split(
                    " ")[0] + "/" + date_split[1] + "/" + date_split[0]

                actors_list = extract_names(
                    content_string)  #extract actors from content_string
                highlights = extract_quotes(
                    content_string)  #extract quotes from content_string
                categories = [1]

                link_raw = sub_page_soup.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}

                if len(link_raw) > 0:
                    link = link_raw[0]["src"]

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"

                    media_link = {"link": link, "type": link_type}

                #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link,
                                      media_link)  #create beefObject

                #beef_obj.print_beef()

                return beef_obj

            else:
                return None
        else:
            return None

    else:
        return None
Example #6
def scrape_cnn_home(uReq, soup, keyword_list):

    logging = None

    base_url = 'http://edition.cnn.com'  #url to scrape

    raw_page_html = access_url(base_url, uReq)  #make request for page

    if raw_page_html is not None:

        if logging:
            print(raw_page_html) #dump the raw page only when verbose logging is enabled

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object
        tag_array = page_soup.findAll(
            "script")  #find script tags in the soup object

        if len(tag_array) > 0:  #only execute if tags have been found

            if (tag_array[10].text):  #ensure the script tag that holds the content model has text

                beef_objects = []

                #load saved urls
                saved_urls = get_saved_urls(base_url)

                script_text = tag_array[10].text

                if logging:
                    print(script_text)

                result = re.search('CNN.contentModel = (.*);', script_text)
                if result: #only continue if the content model was actually matched
                    script_json = demjson.decode(result.group(1))

                    percent_per_scrape = 100 / len(
                        script_json['siblings']['articleList'])

                    for x in range(0,
                                   len(script_json['siblings']
                                       ['articleList'])):  #for each tag

                        print(
                            str(round(x * percent_per_scrape)) + "% complete.")

                        sub_page_url = base_url + script_json['siblings'][
                            'articleList'][x]['uri']

                        if any(
                                url_obj["url"] == sub_page_url
                                for url_obj in saved_urls
                        ):  #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")

                        else:
                            if logging:
                                print(
                                    "preloaded url not found, initiating scrape."
                                )

                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                            save_url(base_url, sub_page_url)

                            beef_object = scrape_article(
                                sub_page_url, uReq, soup, keyword_list)

                            if beef_object != None:
                                beef_objects.append(beef_object)

                    return beef_objects
                else:
                    return []
    else:
        return []
def scrape_article(path, uReq, soup, keyword_list):

    logging = None

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        body_tag = sub_page_soup.find(
            "div", {"id": "OutputBody"})  #find tags in the soup object

        relevant_story = None

        if body_tag:

            content_string = ""  #init content string

            #check each p tag found for words from the keyword list
            for p in body_tag.findAll("p"):

                if p is not None and "Do YOU want to write for GiveMeSport?" not in p.text and "Have your say in the comments section below." not in p.text:

                    content_string += p.text

                    if len(
                            keyword_list
                    ) > 0:  #if keyword list has values, use them to filter stories, if it is empty, automatically approve story

                        #check if any text from page contains key words stored in list, if keyword found, print page text
                        if (any(keyword in p.text
                                for keyword in keyword_list)):
                            relevant_story = True

                    else:
                        relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title = sub_page_soup.find("h1", {
                "class": "gms-article-title"
            }).text

            #article is relevant, build a beef record
            if (relevant_story):  #execute if a story contains a keyword

                store_event_classification(
                    title, content_string
                )  #classify event and store the classification for later use

                img_tag = sub_page_soup.find("img", {"id": "EdImg"})
                img_link = img_tag["src"] if img_tag else "" #fall back to an empty link if no header image exists

                date_string = sub_page_soup.find("p", {
                    "class": "gms-article-data"
                }).span.time["datetime"]  #find date in the soup object
                date_split = date_string.lstrip().split(
                    "-"
                )  #split the ISO-style datetime into year [0], month [1], and day plus time [2]

                final_date_string = date_split[2].split(
                    "T")[0] + "/" + date_split[1] + "/" + date_split[0]

                actors_list = extract_names(
                    content_string)  #extract actors from content_string
                highlights = extract_quotes(
                    content_string)  #extract quotes from content_string
                categories = [4]

                link_raw = body_tag.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}

                if len(link_raw) > 0:
                    link = link_raw[0]["data-url"]

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"

                    media_link = {"link": link, "type": link_type}

                if logging:
                    print(content_string)

                #frame = BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link,
                                      media_link)  #create beefObject

                #beef_obj.print_beef()

                return beef_obj
            else:
                return None
        else:
            return None

    else:
        return None
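The manual splitting of the ISO-style datetime attribute above (and in the Bossip example) could also be done with datetime.strptime, which raises a clear error if the site ever changes its format instead of silently producing a malformed date. A sketch, assuming the attribute always starts with YYYY-MM-DD:

from datetime import datetime

def iso_to_display_date(date_string):
    #"2018-03-07T12:34:56" -> "07/03/2018", matching the day/month/year order used above
    parsed = datetime.strptime(date_string.split("T")[0], "%Y-%m-%d")
    return parsed.strftime("%d/%m/%Y")
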
Example #8
def scrape_article(path, uReq, soup, keyword_list):

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        content_tag_array = sub_page_soup.findAll(
            "div",
            {"class": "story-body__inner"})  #find tags in the soup object

        relevant_story = None

        if len(content_tag_array) > 0:

            content_string = ""  #init content string

            #check each p tag found for words from the keyword list
            for p in content_tag_array[0].findAll('p'):

                content_string += p.text

                if (
                        len(keyword_list) > 0
                ):  #if keyword list has values, use them to filter stories, if it is empty, automatically approve story

                    #check if any text from page contains key words stored in list, if keyword found, print page text
                    if (any(keyword in p.text for keyword in keyword_list)):
                        relevant_story = True
                        #break

                else:
                    relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title = sub_page_soup.findAll("h1", {
                "class": "story-body__h1"
            })[0].text  #find tags in the soup object for beef object title

            #article is relevant, build a beef record
            if relevant_story:  #execute if a story contains a keyword

                #store_event_classification(title, content_string) #classify event and store the classification for later use

                mini_info_panel_tag_array = sub_page_soup.findAll(
                    "li",
                    {"class": "mini-info-list__item"
                     })  #find tags in the soup object for beef object date
                date_string_split = mini_info_panel_tag_array[0].div[
                    "data-datetime"].split(" ")  #format date
                date_string = str(date_string_split[0]) + "/" + str(
                    globals.get_month_number(
                        date_string_split[1])) + "/" + str(
                            date_string_split[2])

                actors_list = extract_names(
                    content_string)  #extract actors from content_string
                highlights = extract_quotes(
                    content_string)  #extract quotes from content_string

                categories = []

                if len(mini_info_panel_tag_array
                       ) > 1 and mini_info_panel_tag_array[
                           1].a is not None and mini_info_panel_tag_array[
                               1].a.text is not None:

                    category = mini_info_panel_tag_array[1].a.text

                    if "politics" in category.lower():
                        categories.append(2)

                    if "sport" in category.lower():
                        categories.append(4)

                    if "technology" in category.lower():
                        categories.append(6)

                img_tag_array = sub_page_soup.findAll(
                    "span", {"class": "image-and-copyright-container"
                             })  #find tags in the soup object

                img_link = ""

                if len(img_tag_array) > 0:
                    if img_tag_array[
                            0].div:  #if article contains references to images, extract the first one
                        img_link = img_tag_array[0].div["data-src"]
                    elif img_tag_array[0].img:
                        img_link = img_tag_array[0].img["src"]

                media_link = {"link": "", "type": ""}

                media_tag_array = sub_page_soup.findAll(
                    "figure", {"class": "media-player"})

                if len(media_tag_array) == 1:
                    link_json = demjson.decode(
                        media_tag_array[0]["data-playable"])

                    link = link_json["settings"]["externalEmbedUrl"]

                    link_type = ""

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    elif "bbc" in link:
                        link_type = "bbc_embed"

                    media_link = {"link": link, "type": link_type}

                #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      date_string, highlights, path,
                                      categories, img_link,
                                      media_link)  #create beefObject

                return beef_obj

            else:
                return None
        else:
            return None
    else:
        return None
def scrape_give_me_sport_home(uReq, soup, keyword_list):

    logging = None

    base_url = 'http://www.givemesport.com/'  #url to scrape

    raw_page_html = access_url(base_url, uReq)  #make request for page

    if raw_page_html is not None:

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object

        news_tag_array = page_soup.find("section", {"id": "gms-trending"})

        if news_tag_array:
            news_tag_array = news_tag_array.findAll(
                "article",
                {"class": "gms-feature"})  #find featured article tags in the soup object

            beef_objects = []

            #load saved urls
            saved_urls = get_saved_urls(base_url)

            percent_per_scrape = 100 / len(news_tag_array) if news_tag_array else 0

            if len(news_tag_array) > 0:  #only execute if tags have been found

                for x, news_tag in enumerate(news_tag_array):

                    print(str(round(x * percent_per_scrape)) + "% complete.")

                    if news_tag and news_tag.a and news_tag.a["href"]:

                        sub_page_url = base_url + news_tag.a["href"]

                        if any(
                                url_obj["url"] == sub_page_url
                                for url_obj in saved_urls
                        ):  #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")

                        else:
                            if logging:
                                print(
                                    "preloaded url not found, initiating scrape."
                                )

                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                            save_url(base_url, sub_page_url)

                            beef_object = scrape_article(
                                sub_page_url, uReq, soup, keyword_list)
                            if beef_object != None:
                                beef_objects.append(beef_object)

            return beef_objects
        else:
            return []
    else:
        return []
Example #10
def scrape_video(path, uReq, soup, keyword_list):

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        title_tag_array = sub_page_soup.findAll(
            "div", {"class": "page_header"})  #find the page header tags in the soup object

        if len(title_tag_array) == 0 or title_tag_array[0].h1 is None:
            return None #no usable header found, abandon this video page

        title = title_tag_array[0].h1.text

        store_event_classification(
            title,
            title)  #classify event and store the classification for later use

        media_tag_array = sub_page_soup.findAll(
            "iframe")  #find tags in the soup object

        content_string = title_tag_array[0].h1.text
        img_link = ""

        if len(title_tag_array) > 0 and title_tag_array[0] and title_tag_array[
                0].div and title_tag_array[0].div.img:
            img_link = title_tag_array[0].div.img["src"]

        #relevant_story = None;

        date_string = sub_page_soup.find("span", {
            "class": "date"
        }).text.replace("Posted on: ", "")  #find tags in the soup object

        actors_list = extract_names(
            title_tag_array[0].h1.text)  #extract actors from title
        highlights = extract_quotes(
            title_tag_array[0].h1.text)  #extract quotes from title
        categories = [1]

        media_link = {"link": "", "type": ""}

        if len(media_tag_array) > 0:
            link = media_tag_array[0]["src"]
            link_type = ""

            if "youtube" in link:
                link_type = "youtube"
            elif "spotify" in link:
                link_type = "spotify"
            elif "soundcloud" in link:
                link_type = "soundcloud"
            elif "twitter" in link:
                link_type = "twitter"

            media_link = {"link": link, "type": link_type}

        #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
        beef_obj = BeefObject(title_tag_array[0].h1.text, actors_list,
                              content_string, date_string, highlights, path,
                              categories, img_link,
                              media_link)  #create beefObject

        return beef_obj
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):

    sub_page_html = access_url(path, uReq)

    if sub_page_html is not None:

        sub_page_soup = soup(sub_page_html, "html.parser")

        content_tag_array = sub_page_soup.findAll(
            "div",
            {"class": "zn-body__paragraph"})  #find tags in the soup object

        relevant_story = None

        if (len(content_tag_array) > 0):

            content_string = ""  #init content string

            #check each p tag found for words from the keyword list
            for p in content_tag_array:

                content_string += p.text + " "

                if (
                        len(keyword_list) > 0
                ):  #if keyword list has values, use them to filter stories, if it is empty, automatically approve story
                    #check if any text from page contains key words stored in list, if keyword found, print page text
                    if (any(keyword in p.text for keyword in keyword_list)):
                        relevant_story = True

                else:
                    relevant_story = True

            #clean content string
            globals.scrub_content_text(content_string)

            title = sub_page_soup.findAll(
                "h1", {"class": "pg-headline"
                       })[0].text  #find tags in the soup object

            #article is relevant, build a beef record
            if (relevant_story):  #execute if a story contains a keyword

                store_event_classification(
                    title, content_string
                )  #classify event and store the classification for later use

                date_tag_array = sub_page_soup.findAll(
                    "p",
                    {"class": "update-time"})  #find tags in the soup object

                split_date = date_tag_array[0].text.split(
                    " ")  #split the date string into parts
                date_string = split_date[6].split(",")[0] + "/" + str(
                    globals.get_month_number(
                        split_date[5])) + "/" + split_date[
                            7]  #rebuild date string with only relevant parts

                actors_list = extract_names(
                    content_string)  #extract actors from content_string

                highlights = extract_quotes(
                    content_string)  #extract quotes from content_string

                categories = []

                if "politics" in path:
                    categories.append(2)

                if "sport" in path:
                    categories.append(4)

                if "technology" in path:
                    categories.append(6)

                img_tag_array = sub_page_soup.findAll(
                    "div", {"class": "el__image--fullwidth"
                            })  #find tags in the soup object

                img_link = ""

                if (img_tag_array is not None) and (
                        len(img_tag_array) > 0
                ) and (img_tag_array[0].div) and (
                        img_tag_array[0].div.img
                ) and (
                        img_tag_array[0].div.img.get('data-src-large')
                ):  #if the article references images, extract the first one
                    img_link = img_tag_array[0].div.img['data-src-large']

                media_tag_array = sub_page_soup.findAll(
                    "div", {"class": "media__video--thumbnail-wrapper"
                            })  #find tags in the soup object

                media_link = {"link": "", "type": ""}

                if len(media_tag_array
                       ) > 0 and media_tag_array[0] and media_tag_array[
                           0].script and media_tag_array[0].script.text:

                    json_video_data = demjson.decode(
                        media_tag_array[0].script.text)
                    link = json_video_data["embedUrl"]
                    link_type = ""

                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"

                    media_link = {"link": link, "type": link_type}

                #frame BeefObject( title, relevant_actors, content, date, highlights, data_source, categories, img_title)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      date_string, highlights, path,
                                      categories, img_link,
                                      media_link)  #create beefObject

                return beef_obj
    else:
        return None
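Taken together, the home scrapers can be driven from a single entry point. A minimal sketch, assuming they all live in one module alongside the helpers above and that the standard-library urlopen and BeautifulSoup are used (run_all_scrapers is a hypothetical name):

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

def run_all_scrapers(keyword_list):
    #run each home scraper in turn and collect every BeefObject returned
    beef_objects = []
    for scraper in (scrape_bbc_home, scrape_hiphopdx_home,
                    scrape_give_me_sport_home, scrape_cnn_home):
        beef_objects.extend(scraper(uReq, soup, keyword_list) or [])
    return beef_objects

#beefs = run_all_scrapers(["feud", "diss", "beef"])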