示例#1
0
def main():
    """Scrape the goodyfoodies blog front page and print every post.

    For each post container on the front page the headline and body text are
    read, the comment page linked from the post is downloaded, and one
    ``Comment`` is built per ``comment-block`` element found there. The
    resulting ``Post`` objects are printed after all posts were collected.
    """
    front_page = requests.get('http://goodyfoodies.blogspot.com').text
    front_soup = BeautifulSoup(front_page, 'lxml')

    collected = []

    for entry in front_soup.find_all(
            'div', class_='post hentry uncustomized-post-template'):
        headline = entry.h3.text
        summary = entry.find('div', class_='post-body entry-content').text

        # Comments are read from the separate page linked by the post.
        link_holder = entry.find('span', class_='post-comment-link')
        comment_page = requests.get(link_holder.a['href']).text
        comment_soup = BeautifulSoup(comment_page, 'lxml')

        thread = []
        blocks = comment_soup.find_all('div', class_='comment-block')

        # Comment ids are 1-based and restart for every post.
        for number, block in enumerate(blocks, start=1):
            author = block.div.cite.text

            stamp_holder = block.find('span',
                                      class_='datetime secondary-text')
            when = parse(stamp_holder.a.text)
            epoch = datetime.datetime.timestamp(when)

            body = block.p.text

            # Every comment is stored with a reference id of 1.
            thread.append(Comment(number, 1, epoch, author, body))

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
def main():
    """Scrape the journeyofanitaliancook blog front page and print every post.

    Each ``post hentry`` container supplies a headline and body text. The
    comment page linked from the post is downloaded and one ``Comment`` is
    built per ``dl`` element whose id is ``comments-block``. All ``Post``
    objects are printed once the front page has been processed.
    """
    front_page = requests.get(
        'http://journeyofanitaliancook.blogspot.com/').text
    front_soup = BeautifulSoup(front_page, 'lxml')

    collected = []

    for entry in front_soup.find_all('div', class_='post hentry'):
        headline = entry.h3.text
        summary = entry.find('div', class_='post-body entry-content').text

        link_holder = entry.find('span', class_='post-comment-link')
        comment_page = requests.get(link_holder.a['href']).text
        comment_soup = BeautifulSoup(comment_page, 'lxml')

        # NOTE(review): each match is a whole ``dl`` element, so ``.dt`` and
        # ``.dd`` below pick only the first such tag inside it - confirm that
        # this really captures every comment of the post.
        containers = comment_soup.find_all('dl', id='comments-block')

        thread = []

        # Comment ids are 1-based and restart for every post.
        for number, container in enumerate(containers, start=1):
            author = container.dt.span.text

            stamp = container.find('p', class_='comment-timestamp').text
            epoch = datetime.datetime.timestamp(parse(stamp))

            body = container.dd.p.text

            # Every comment is stored with a reference id of 1.
            thread.append(Comment(number, 1, epoch, author, body))

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
示例#3
0
def main():
    """Scrape the theworldaccordingtoeggface blog front page and print posts.

    The headline and body text of every ``post hentry`` container are read
    from the front page. The page behind the headline link is downloaded and
    parsed as well, but nothing is extracted from it; each ``Post`` is built
    with an empty string in place of comments.
    """
    home_html = requests.get(
        'http://theworldaccordingtoeggface.blogspot.com/').text
    home = BeautifulSoup(home_html, 'lxml')

    gathered = []

    for entry in home.find_all('div', class_='post hentry'):
        headline = entry.h3.text
        summary = entry.find('div', class_='post-body entry-content').text

        # The headline link is used because the home page holds no reference
        # to the comments.
        post_url = entry.h3.a['href']

        post_html = requests.get(post_url).text
        # There are no comments on this blog, so the parsed page is not
        # inspected any further.
        BeautifulSoup(post_html, 'lxml')

        gathered.append(Post(headline, summary, ""))

    for post in gathered:
        print(post)
示例#4
0
def main():
    """Scrape the mainlymacro blog and print every post with its comments.

    For each post on the front page the post's own page is downloaded. The
    summary is assembled from the ``span`` elements of the post body, and the
    ``li`` items of the ``top-ra`` list become ``Comment`` objects. Replies
    found under a comment's ``comment-replies`` container reference the id of
    that comment. Authors named ``Unknown`` or ``Anonymous`` are replaced by
    a masked, numbered label.
    """
    anon_seq = 1

    def squash(text):
        """Return *text* with newline and tab characters removed."""
        return text.replace('\n', '').replace('\t', '')

    def read_comment(node):
        """Return ``(author, timestamp, message)`` read from one ``li``."""
        nonlocal anon_seq

        author = squash(node.find('div', class_='comment-header').cite.text)
        if author in ('Unknown', 'Anonymous'):
            author = '******' + str(anon_seq)
            anon_seq += 1

        stamp = squash(
            node.find('span', class_='datetime secondary-text').text)
        epoch = datetime.datetime.timestamp(parse(stamp))

        body = squash(node.find('p', class_='comment-content').text)
        return author, epoch, body

    home_html = requests.get('https://mainlymacro.blogspot.com/').text
    home = BeautifulSoup(home_html, 'lxml')

    collected = []

    for entry in home.find_all(
            'div', class_='post hentry uncustomized-post-template'):
        title_link = entry.h3.a
        headline = title_link.text

        page_html = requests.get(title_link['href']).text
        page = BeautifulSoup(page_html, 'lxml')
        spans = page.find(
            'div', class_='post-body entry-content').find_all('span')
        summary = "".join(" " + squash(span.text) for span in spans)

        # A post page without the comment list simply has no comments.
        try:
            top_level = page.find('ol', id='top-ra').find_all('li')
        except Exception:
            top_level = []

        next_id = 0
        thread = []
        anon_seq = 1  # masked-author numbering restarts for every post

        for item in top_level:
            next_id += 1

            author, epoch, body = read_comment(item)
            thread.append(Comment(next_id, 1, epoch, author, body))

            # Replies are best effort: a missing reply container or a reply
            # that cannot be parsed ends reply handling for this comment.
            try:
                nested = item.find(
                    'div', class_='comment-replies').find_all('li')
                parent_id = next_id

                for reply in nested:
                    next_id += 1
                    author, epoch, body = read_comment(reply)
                    thread.append(
                        Comment(next_id, parent_id, epoch, author, body))
            except Exception:
                pass

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
示例#5
0
def main():
    """Scrape the honeywerehome blog front page and print every post.

    For each ``entry-summary`` container on the front page the headline is
    taken from the ``title`` attribute of the first link inside the
    container's first ``div``, and the summary from the container's
    paragraphs. The linked post page is downloaded and the ``li`` items of
    its ``comment-list`` become ``Comment`` objects; replies found under a
    comment's ``children`` list reference the id of that comment.

    A post page without a ``comment-list`` yields a post with no comments
    instead of aborting the whole run.
    """
    def read_comment(node):
        """Return ``(user, timestamp, message)`` read from one ``li``."""
        user = node.find('div', class_='comment-author').cite.text

        date = node.find('span', class_='comment-date').text
        timestamp = datetime.datetime.timestamp(parse(date))

        msg = node.find('div', class_='comment-content').p.text
        return user, timestamp, msg

    source = requests.get('https://www.honeywerehome.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []

    for article in soup.find_all('div', class_='entry-summary'):
        headline = article.div.a['title']
        summary = "".join(" " + p.text for p in article.find_all('p'))

        comment_link = article.a['href']
        comment_source = requests.get(comment_link).text
        comment_soup = BeautifulSoup(comment_source, 'lxml')

        # find() returns None when nothing matches; treat a missing comment
        # list as "no comments" rather than failing on None.find_all.
        comment_list = comment_soup.find('ol', class_='comment-list')
        # NOTE(review): find_all() searches all descendants, so reply items
        # nested inside a comment may also show up here as top-level
        # comments - confirm against the page markup.
        comments = [] if comment_list is None else comment_list.find_all('li')

        comment_id = 0
        comm_list = []

        for comment in comments:
            comment_id += 1

            user, timestamp, msg = read_comment(comment)
            # Top-level comments are stored with a reference id of 1.
            comm_list.append(Comment(comment_id, 1, timestamp, user, msg))

            children = comment.find('ul', class_='children')
            if children is None:
                continue  # this comment has no replies

            parent_id = comment_id

            # Reply parsing stays best effort, as before: a reply that cannot
            # be parsed ends reply handling for this comment only.
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    user, timestamp, msg = read_comment(reply)
                    comm_list.append(
                        Comment(comment_id, parent_id, timestamp, user, msg))
            except Exception:
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
示例#6
0
def main():
    """Scrape the thinkmarkets blog and print every post with its comments.

    For each ``article`` on the front page the post's own page is downloaded.
    The summary is assembled from the paragraphs of ``entry-content`` and the
    ``li`` items of ``comment-list`` become ``Comment`` objects. Replies
    found under a comment's ``children`` list reference the id of that
    comment. Authors named ``Unknown`` or ``Anonymous`` are replaced by a
    masked, numbered label.
    """
    anon_seq = 1

    def squash(text):
        """Return *text* with newline and tab characters removed."""
        return text.replace('\n', '').replace('\t', '')

    def read_comment(node):
        """Return ``(author, timestamp, message)`` read from one ``li``."""
        nonlocal anon_seq

        author = squash(
            node.find('div', class_='comment-author vcard').b.text)
        if author == '':
            # Fall back to the link nested inside the bold author tag.
            author = squash(
                node.find('div', class_='comment-author vcard').b.a.text)
        if author in ('Unknown', 'Anonymous'):
            author = '******' + str(anon_seq)
            anon_seq += 1

        stamp = squash(
            node.find('div', class_='comment-metadata').a.time.text)
        epoch = datetime.datetime.timestamp(parse(stamp))

        paragraphs = node.find('div', class_='comment-content').find_all('p')
        body = "".join(squash(p.text) for p in paragraphs)
        return author, epoch, body

    home_html = requests.get('https://thinkmarkets.wordpress.com').text
    home = BeautifulSoup(home_html, 'lxml')

    collected = []

    for entry in home.find_all('article'):
        title_link = entry.header.h1.a
        headline = title_link.text

        page_html = requests.get(title_link['href']).text
        page = BeautifulSoup(page_html, 'lxml')
        paragraphs = page.find('div', class_='entry-content').find_all('p')
        summary = "".join(" " + squash(p.text) for p in paragraphs)

        # A post page without the comment list simply has no comments.
        try:
            top_level = page.find('ol', class_='comment-list').find_all('li')
        except Exception:
            top_level = []

        next_id = 0
        thread = []
        anon_seq = 1  # masked-author numbering restarts for every post

        for item in top_level:
            next_id += 1

            author, epoch, body = read_comment(item)
            thread.append(Comment(next_id, 1, epoch, author, body))

            # Replies are best effort: a missing ``children`` list or a reply
            # that cannot be parsed ends reply handling for this comment.
            try:
                nested = item.find('ul', class_='children').find_all('li')
                parent_id = next_id

                for reply in nested:
                    next_id += 1
                    author, epoch, body = read_comment(reply)
                    thread.append(
                        Comment(next_id, parent_id, epoch, author, body))
            except Exception:
                pass

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
示例#7
0
def main():
    """Scrape the warnewsupdates blog and print every post with its comments.

    Posts are the ``post hentry`` containers inside the ``widget Blog``
    element of the front page. For each one the post's own page is
    downloaded; the summary is the post body text without newlines and the
    ``li`` items of ``commentlist`` become ``Comment`` objects. Replies found
    under a comment's ``children`` list reference the id of that comment.
    Paragraphs containing ``Like`` are left out of the message text, and
    authors named ``Unknown`` or ``Anonymous`` get a masked, numbered label.
    """
    anon_seq = 1

    def read_comment(node):
        """Return ``(author, timestamp, message)`` read from one ``li``."""
        nonlocal anon_seq

        author = node.find('div', class_='comment-author vcard').cite.text
        if author in ('Unknown', 'Anonymous'):
            author = '******' + str(anon_seq)
            anon_seq += 1

        stamp = node.find('div', class_='comment-meta commentmetadata').a.text
        epoch = datetime.datetime.timestamp(parse(stamp))

        paragraphs = node.find('div', class_='comment-body').find_all('p')
        body = "".join(
            p.text + " " for p in paragraphs if 'Like' not in p.text)
        return author, epoch, body

    home_html = requests.get('http://warnewsupdates.blogspot.com/').text
    home = BeautifulSoup(home_html, 'lxml')

    collected = []

    blog_widget = home.find('div', class_='widget Blog')

    for entry in blog_widget.find_all('div', class_='post hentry'):
        title_link = entry.h3.a
        headline = title_link.text

        page_html = requests.get(title_link['href']).text
        page = BeautifulSoup(page_html, 'lxml')

        body_text = page.find('div', class_='post-body entry-content').text
        summary = body_text.replace('\n', "")

        # A post page without the comment list simply has no comments.
        try:
            top_level = page.find('ol', class_='commentlist').find_all('li')
        except Exception:
            top_level = []

        next_id = 0
        thread = []
        anon_seq = 1  # masked-author numbering restarts for every post

        for item in top_level:
            next_id += 1

            author, epoch, body = read_comment(item)
            thread.append(Comment(next_id, 1, epoch, author, body))

            # Replies are best effort: a missing ``children`` list or a reply
            # that cannot be parsed ends reply handling for this comment.
            try:
                nested = item.find('ul', class_='children').find_all('li')
                parent_id = next_id

                for reply in nested:
                    next_id += 1
                    author, epoch, body = read_comment(reply)
                    thread.append(
                        Comment(next_id, parent_id, epoch, author, body))
            except Exception:
                pass

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
示例#8
0
def main():
    """Scrape the 2politicaljunkies blog and print every post with comments.

    For each ``post hentry`` container on the front page the post's own page
    is downloaded. The summary is the post body text without newlines. Inside
    the element with id ``comments`` the author (``dt.comment-author``), body
    (``dd.comment-body``) and footer (``dd.comment-footer``) tags are read in
    parallel and combined into ``Comment`` objects, all stored with a
    reference id of 1. Authors named ``Unknown`` or ``Anonymous`` are
    replaced by a masked, numbered label.

    A post page without a ``comments`` element yields a post with no
    comments. If the three tag lists differ in length, only the leading
    entries present in all of them are used.
    """
    source = requests.get('http://2politicaljunkies.blogspot.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []

    for article in soup.find_all('div', class_='post hentry'):
        title_link = article.h3.a
        headline = title_link.text

        page_source = requests.get(title_link['href']).text
        page = BeautifulSoup(page_source, 'lxml')
        summary = page.find(
            'div', class_='post-body entry-content').text.replace('\n', '')

        # find() returns None when the page has no comment section. All three
        # lists must then be empty; previously only an unrelated name was
        # reset, leaving these undefined or stale from the previous post.
        section = page.find('div', id='comments')
        if section is None:
            authors, bodies, footers = [], [], []
        else:
            authors = section.find_all('dt', class_='comment-author')
            bodies = section.find_all('dd', class_='comment-body')
            footers = section.find_all('dd', class_='comment-footer')

        comm_list = []
        anonymous = 1

        # Comment ids are 1-based and restart for every post.
        rows = zip(authors, bodies, footers)
        for comment_id, (author, body, footer) in enumerate(rows, start=1):
            user = author.text.replace('said...', '').replace('\n', '')
            if user in ('Unknown', 'Anonymous'):
                user = '******' + str(anonymous)
                anonymous += 1

            date = footer.find('span', class_='comment-timestamp').a.text
            timestamp = datetime.datetime.timestamp(parse(date))

            # A comment body without a paragraph is stored as an empty text.
            paragraph = body.find('p')
            msg = '' if paragraph is None else paragraph.text.replace(
                '\n', ' ')

            comm_list.append(Comment(comment_id, 1, timestamp, user, msg))

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
示例#9
0
def main():
    """Scrape the inspiringscience blog and print every post with comments.

    Posts are the ``article`` elements inside the element with id
    ``content``. For each one the post's own page is downloaded; the summary
    is assembled from the paragraphs of ``post-entry`` and the ``li`` items
    of ``commentlist`` become ``Comment`` objects. Replies found under a
    comment's ``children`` list reference the id of that comment. Authors
    named ``Unknown`` or ``Anonymous`` get a masked, numbered label.
    """
    anon_seq = 1

    def read_comment(node):
        """Return ``(author, timestamp, message)`` read from one ``li``."""
        nonlocal anon_seq

        author = node.find('p', class_='comment-author').span.text
        if author in ('Unknown', 'Anonymous'):
            author = '******' + str(anon_seq)
            anon_seq += 1

        stamp = node.find('p', class_='comment-date').a.time.text
        epoch = datetime.datetime.timestamp(parse(stamp))

        paragraphs = node.find('div', class_='comment-text').find_all('p')
        # The final paragraph is dropped before the message is assembled
        # (presumably it is not part of the comment text - TODO confirm).
        del paragraphs[-1]
        body = "".join(p.text + " " for p in paragraphs)
        return author, epoch, body

    home_html = requests.get('http://inspiringscience.net').text
    home = BeautifulSoup(home_html, 'lxml')

    collected = []

    for entry in home.find('div', id='content').find_all('article'):
        title_link = entry.header.h1.a
        headline = title_link.text

        page_html = requests.get(title_link['href']).text
        page = BeautifulSoup(page_html, 'lxml')
        paragraphs = page.find('div', class_='post-entry').find_all('p')
        summary = "".join(" " + p.text for p in paragraphs)

        # A post page without the comment list simply has no comments.
        try:
            top_level = page.find('ol', class_='commentlist').find_all('li')
        except Exception:
            top_level = []

        next_id = 0
        thread = []
        anon_seq = 1  # masked-author numbering restarts for every post

        for item in top_level:
            next_id += 1

            author, epoch, body = read_comment(item)
            thread.append(Comment(next_id, 1, epoch, author, body))

            # Replies are best effort: a missing ``children`` list or a reply
            # that cannot be parsed ends reply handling for this comment.
            try:
                nested = item.find('ul', class_='children').find_all('li')
                parent_id = next_id

                for reply in nested:
                    next_id += 1
                    author, epoch, body = read_comment(reply)
                    thread.append(
                        Comment(next_id, parent_id, epoch, author, body))
            except Exception:
                pass

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)
示例#10
0
def main():
    """Scrape the fullymyelinated blog and print every post with comments.

    Posts are the ``entry`` containers inside the element with id
    ``content-left``. For each one the post's own page is downloaded; the
    summary is assembled from the paragraphs of the page's ``entry`` element
    after discarding the first one, and the ``li`` items of
    ``commentlist snap_preview`` become ``Comment`` objects. Both top-level
    comments and replies are parsed on a best-effort basis. Authors named
    ``Unknown`` or ``Anonymous`` get a masked, numbered label.
    """
    anon_seq = 1

    def read_comment(node):
        """Return ``(author, timestamp, message)`` read from one ``li``."""
        nonlocal anon_seq

        author = node.find('div', class_='comment-author vcard').cite.text
        if author in ('Unknown', 'Anonymous'):
            author = '******' + str(anon_seq)
            anon_seq += 1

        stamp = node.find('div', class_='comment-meta commentmetadata').a.text
        epoch = datetime.datetime.timestamp(parse(stamp))

        paragraphs = node.find('div', class_='comment-body').find_all('p')
        body = "".join(p.text.replace('\n', '') + ' ' for p in paragraphs)
        return author, epoch, body

    home_html = requests.get('https://fullymyelinated.wordpress.com/').text
    home = BeautifulSoup(home_html, 'lxml')

    collected = []

    left_column = home.find('div', id='content-left')

    for entry in left_column.find_all('div', class_='entry'):
        title_link = entry.h2.a
        headline = title_link.text

        page_html = requests.get(title_link['href']).text
        page = BeautifulSoup(page_html, 'lxml')
        paragraphs = page.find('div', class_='entry').find_all('p')
        paragraphs.pop(0)  # remove 'leave a comment' text
        summary = "".join(p.text.replace('\n', '') for p in paragraphs)

        # A post page without the comment list simply has no comments.
        try:
            top_level = page.find(
                'ol', class_='commentlist snap_preview').find_all('li')
        except Exception:
            top_level = []

        next_id = 0
        thread = []
        anon_seq = 1  # masked-author numbering restarts for every post

        for item in top_level:
            # The id is consumed even when the comment cannot be parsed.
            next_id += 1

            # A top-level comment that cannot be parsed is skipped.
            try:
                author, epoch, body = read_comment(item)
                thread.append(Comment(next_id, 1, epoch, author, body))
            except Exception:
                pass

            # Replies are best effort: a missing ``children`` list or a reply
            # that cannot be parsed ends reply handling for this comment.
            try:
                nested = item.find('ul', class_='children').find_all('li')
                parent_id = next_id

                for reply in nested:
                    next_id += 1
                    author, epoch, body = read_comment(reply)
                    thread.append(
                        Comment(next_id, parent_id, epoch, author, body))
            except Exception:
                pass

        collected.append(Post(headline, summary, thread))

    for post in collected:
        print(post)