예제 #1
0
def get_headline_details(obj):
    """Build a headline record from a listing-page anchor tag.

    Parameters
    ----------
    obj : bs4 element
        Anchor tag with an ``href`` attribute; its grandparent is searched
        for the ``span.time-dt`` timestamp.

    Returns
    -------
    dict or None
        Record with content/link/scraped_at/published_at/title, or ``None``
        when an expected attribute (e.g. ``href``) is missing.
    """
    import logging
    from datetime import datetime
    try:
        timestamp_tag = obj.parent.parent.find(
            "span", {"class": "time-dt"}
        )
        if timestamp_tag is None:
            # No timestamp on the page: fall back to "now" (assumed IST,
            # since it is fed through ist_to_utc below).
            timestamp = datetime.now()
        else:
            content = timestamp_tag.contents[0].strip()
            timestamp = datetime.strptime(content, "%b %d, %Y %H:%M")
        return {
            "content": "NA",
            # Drop the query string so links deduplicate cleanly.
            "link": obj["href"].split("?")[0],
            "scraped_at": datetime.utcnow().isoformat(),
            "published_at": ist_to_utc(timestamp).isoformat(),
            # Title: the tag's direct string children, stripped and joined.
            "title": "\n".join(
                filter(str_is_set,
                       map(str.strip, filter(is_string, obj.children)))
            ),
        }
    except KeyError:
        # Was `pdb.set_trace()` — a debug leftover that hangs unattended
        # scrapes. Log and skip this headline instead (original flow also
        # ended up returning None after the trace).
        logging.getLogger(__name__).warning(
            "Skipping headline missing an expected attribute", exc_info=True)
        return None
예제 #2
0
def get_chronological_headlines(url):
    """Scrape the Times of India chronological listing at *url*.

    Parameters
    ----------
    url : str
        Listing-page URL to fetch.

    Returns
    -------
    list[dict] or None
        One record per headline span, or ``None`` when the HTTP status
        is not 200 (implicit, preserved from the original).
    """
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove the widget block so its links are not scraped as articles.
        soup.find("div", id="c_articlelist_widgets_1").decompose()

        data = []
        objs = soup.find("div", {
            "class": "main-content"
        }).find_all("span", {"class": "w_tle"})

        for obj in objs:
            dt = obj.find_next("span").find("span").get("rodate")
            # BUG FIX: clean_dt was previously assigned only when rodate
            # existed — the first missing date raised NameError and later
            # missing dates silently reused the previous iteration's value.
            clean_dt = None
            if dt is not None:
                clean_dt = ist_to_utc(datetime.strptime(
                    dt, "%d %b %Y, %H:%M")).isoformat()

            data.append({
                "link":
                "https://timesofindia.indiatimes.com" +
                obj.find("a").get("href"),
                "content":
                "NA",
                "scraped_at":
                datetime.utcnow().isoformat(),
                "published_at":
                clean_dt,
                "title":
                obj.find("a").get("title")
            })

        return data
예제 #3
0
 def get_content(obj):
     """Fetch the article at ``obj["link"]`` and fill *obj* in place.

     Sets ``obj["published_at"]`` (IST parsed then converted to UTC) and
     ``obj["content"]`` (article body text) on success.

     NOTE(review): always returns the string "NA" — results are delivered
     by mutating *obj*; presumably the caller stores "NA" as a fallback
     content value. Confirm against call sites.
     """
     from time import sleep
     # Throttle so the target site is not hammered between article fetches.
     sleep(0.7)
     response = requests.get(obj["link"])
     if response.status_code == 200:
         soup = BeautifulSoup(response.text, "html.parser")
         # The publish date is embedded in a <script> inside span#pub_date,
         # quoted as ...'<strong>First Published:</strong> <date>'...
         pub_tag = soup.find("span", id="pub_date")
         str_time = pub_tag.find("script").text.split("'")[1].split(
             "<strong>First Published:</strong> ")[1]
         obj["published_at"] = ist_to_utc(
             datetime.strptime(str_time,
                               "%B %d, %Y, %I:%M %p %Z")).isoformat()
         # Strip inline <style> blocks from the article body ...
         for i in soup.find("div", id="article_body").find_all("style"):
             i.decompose()
         # ... and the trailing tag-cloud div, before extracting plain text.
         soup.find("div", id="article_body").find("div", {
             "class": "tag"
         }).decompose()
         obj["content"] = soup.find("div", id="article_body").text
     return "NA"
예제 #4
0
def get_headline_details(obj):
    """Build a headline record for a Deccan Chronicle listing entry.

    Parameters
    ----------
    obj : bs4 element
        Anchor tag with an ``href`` attribute, containing the
        ``span.SunChDt2`` timestamp and an ``h3``/``h2`` title.

    Returns
    -------
    dict or None
        Headline record, or ``None`` when an expected attribute is missing.
    """
    import logging
    from datetime import datetime
    try:
        timestamp_tag = obj.find("span", {"class": "SunChDt2"})
        if timestamp_tag is None:
            # Missing timestamp: fall back to "now" (assumed IST, since it
            # is converted via ist_to_utc below).
            timestamp = datetime.now()
        else:
            content = timestamp_tag.contents[0].strip()
            timestamp = datetime.strptime(content, "%d %b %Y %I:%M %p")
        return {
            "content": "NA",
            "link": "https://www.deccanchronicle.com" + obj["href"],
            "scraped_at": datetime.utcnow().isoformat(),
            "published_at": ist_to_utc(timestamp).isoformat(),
            "title": obj.find(['h3', 'h2']).contents[0].strip()
        }
    except KeyError:
        # Was `pdb.set_trace()` — a debug leftover that hangs unattended
        # scrapes. Log and skip this headline instead.
        logging.getLogger(__name__).warning(
            "Skipping headline missing an expected attribute", exc_info=True)
        return None
예제 #5
0
def get_trending_headline_details(obj):
    """Build a headline record for a DD News trending-list entry.

    Parameters
    ----------
    obj : bs4 element
        Container holding an anchor (link + title) and an optional
        ``p.date`` timestamp.

    Returns
    -------
    dict or None
        Headline record, or ``None`` when an expected attribute is missing.
    """
    import logging
    from datetime import datetime
    try:
        timestamp_tag = obj.find("p", {"class": "date"})
        if timestamp_tag is None:
            # Missing timestamp: fall back to "now" (assumed IST, since it
            # is converted via ist_to_utc below).
            timestamp = datetime.now()
        else:
            content = timestamp_tag.contents[0].strip()
            timestamp = datetime.strptime(content, "%d-%m-%Y | %I:%M %p")
        return {
            "content": "NA",
            "link": "http://ddnews.gov.in" + obj.find("a")["href"],
            "scraped_at": datetime.utcnow().isoformat(),
            "published_at": ist_to_utc(timestamp).isoformat(),
            "title": obj.find("a").contents[0].strip()
        }
    except KeyError:
        # Was `pdb.set_trace()` — a debug leftover that hangs unattended
        # scrapes. Log and skip this headline instead.
        logging.getLogger(__name__).warning(
            "Skipping headline missing an expected attribute", exc_info=True)
        return None
예제 #6
0
def get_headline_details(obj):
    """Build a headline record from an anchor whose ``title`` attribute
    embeds the publish time as "... Published: <date> IST ...".

    Parameters
    ----------
    obj : bs4 element
        Anchor tag with ``href`` and ``title`` attributes.

    Returns
    -------
    dict or None
        Headline record, or ``None`` when an expected attribute is missing
        or the ``title`` attribute lacks the "Published: " marker.
    """
    import logging
    try:
        timestamp = datetime.strptime(
            obj["title"].split("Published: ")[1].split(" IST")[0],
            "%B %d, %Y %H:%M")
        return {
            "content":
            "NA",
            "link":
            obj["href"],
            "scraped_at":
            datetime.utcnow().isoformat(),
            "published_at":
            ist_to_utc(timestamp).isoformat(),
            "title":
            "\n".join(
                filter(str_is_set,
                       map(str.strip, filter(is_string, obj.children))))
        }
    # IndexError added: split(...)[1] raises it when the "Published: "
    # marker is absent; the old handler never caught that case.
    except (KeyError, IndexError):
        # Was `pdb.set_trace()` — a debug leftover that hangs unattended
        # scrapes. Log and skip this headline instead.
        logging.getLogger(__name__).warning(
            "Skipping headline with malformed attributes", exc_info=True)
        return None
예제 #7
0
    def get_content(url, obj):
        """Fetch the article page at *url*, set ``obj["published_at"]``
        in place, and return the article text.

        Parameters
        ----------
        url : str
            Article page URL.
        obj : dict
            Record to receive the parsed ``published_at`` timestamp.

        Returns
        -------
        str
            Space-joined article text (with a trailing space, preserved
            from the original), or "NA" when the HTTP status is not 200.
        """
        response = requests.get(url)
        if response.status_code == 200:
            html_content = BeautifulSoup(response.text, "html.parser")

            # extracting time here
            timestamp_tag = html_content.find('ul', {
                'class': 'rowUl'
            }).find('li')
            if timestamp_tag is None:
                # Missing timestamp: fall back to "now" (assumed IST).
                timestamp = datetime.now()
            else:
                content = timestamp_tag.contents[2].strip()
                timestamp = datetime.strptime(content, "%d.%m.%y, %I:%M %p")
            obj["published_at"] = ist_to_utc(timestamp).isoformat()

            contents = html_content.find(
                'div', {'class': 'padiingDetails story-advertise'})
            # join() instead of the old quadratic `text += cont + ' '` loop;
            # produces byte-identical output (each piece followed by a space).
            return "".join(
                "{} ".format(cont) for cont in contents.stripped_strings)
        return "NA"
예제 #8
0
def get_headline_details(obj):
    """Build a headline record from an anchor whose grandparent holds a
    ``div.nstory_dateline`` like "... | Weekday Month DD, YYYY[, extra]".

    Parameters
    ----------
    obj : bs4 element
        Anchor tag with an ``href`` attribute.

    Returns
    -------
    dict or None
        Headline record, or ``None`` when an expected attribute is missing.
    """
    import logging
    from datetime import datetime
    try:
        timestamp_tag = obj.parent.parent.find("div",
                                               {"class": "nstory_dateline"})
        if timestamp_tag is None:
            # Missing dateline: fall back to "now" (assumed IST).
            timestamp = datetime.now()
        else:
            content = timestamp_tag.contents[-1].strip()
            # Keep the part after the last "| " and split on commas, e.g.
            # ["Monday January 1", "2024", "trailing text..."].
            date = content.split("| ")[-1].split(", ")
            if date[-1].isdigit():
                # Last piece is already the year — use everything.
                date = " ".join(date)
            else:
                # Scan backwards for the year, then drop the trailing
                # non-date pieces after it.
                for i in range(1, 10):
                    if date[-i].isdigit():
                        break
                i -= 1
                date = " ".join(date[:-i])
            # No time on the dateline: midnight IST expressed as 05:30,
            # matching the IST offset consumed by ist_to_utc below.
            timestamp = datetime.strptime(date + " 05:30", "%A %B %d %Y %H:%M")
        return {
            "content":
            "NA",
            "link":
            obj["href"].split("?")[0],
            "scraped_at":
            datetime.utcnow().isoformat(),
            "published_at":
            ist_to_utc(timestamp).isoformat(),
            "title":
            "\n".join(
                filter(str_is_set,
                       map(str.strip, filter(is_string, obj.children))))
        }
    except KeyError:
        # Was `pdb.set_trace()` — a debug leftover that hangs unattended
        # scrapes. Log and skip this headline instead.
        logging.getLogger(__name__).warning(
            "Skipping headline missing an expected attribute", exc_info=True)
        return None