Example #1
def get_content(username):
    """Get data."""
    if username:
        tweets = [t for t in get_tweets(username, pages=25)]
    else:
        tweets = ['error', 'no username set properly']
    return tweets
Example #2
def main():
    """Main function that runs everything."""
    if not args.logo_off:  # print or not print logo
        print(logo)
    if args.test:
        #r = requests.get(base64.b64decode(that_url).decode('utf8')+'/jamescampbell')
        tweets = '\n'.join([t['text'] for t in get_tweets('jamescampbell', pages=20)])
        if args.verbose:
            text_model = markovify.Text(tweets)
            print(text_model.make_short_sentence(140))
            exit()
            #print(r.text)
        exit()
    else:
        tweets = get_content(args.username)
        if args.verbose:
            tweetbreak = []
            print(f"Total found: {len(tweets)}")
            print(f"First tweet {tweets[0]['time']}")
            for idx, tweet in enumerate(tweets):
                timeone = tweet['time']
                try:
                    timetwo = tweets[idx+1]['time']
                except IndexError:  # the last tweet has no successor
                    timetwo = tweet['time']
                #print(timetwo)
                tdelta = timeone - timetwo
                #print(tdelta.total_seconds())
                tweetbreak.append(tdelta.total_seconds())
            # print(tweetbreak[0])
            print("Average time between tweets: {} minutes".format(get_other_mean(tweetbreak)/60))
            exit()
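    # request_response is expected to come from an earlier requests call that is not shown in this snippet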
    jsondata = request_response.json()
    # [trend['name'] for trend in the_data[0]['trends']]
    print()
    if args.print_me:  # dump the raw JSON and the available fields, then exit
        print('json data:')
        pprint(jsondata)
        print('fields available:')
        for k in jsondata['results'][0]:
            print(k)
        exit('thanks for trying')
    average_price = get_mean(jsondata)
    print("The average price of the \033[94m{0}\033[0m items matching search term\033[92m {1}\033[0m: ${2:.2f}".format(jsondata['resultCount'], args['search_term'], average_price))
    if args.output_table:  # if we want to output a table instead of json
        print(pd.DataFrame(jsondata['results'], columns=["price", "artistName", "trackName"]))
    else:
        with open('{}.json'.format(args['search_term']), 'w') as f:
            f.write(''.join(str(x) for x in [request_response.json()]))
        exit('file saved as {}.json'.format(args['search_term']))
Example #3
 def test_languages(self):
     user = '******'
     tweets = list(get_tweets(user=user, pages=1))
     self.assertIn('likes', tweets[0])
     self.assertIsInstance(tweets[0]['replies'], int)
     self.assertGreaterEqual(tweets[1]['retweets'], 0)
Example #4
    def test_child(self):
        user = '******'
        tweets = list(get_tweets(user=user, pages=1))

        self.assertEqual(tweets[1]['text'],
                         'If I could, I would, but if I can’t, I wan’t.')
Example #5
def scrape(account, numPage):
    tweets = '\n'.join([t['text'] for t in get_tweets(account, pages=numPage)])
    return tweets
Example #6
import sys
import math
import json
from twitter_scraper import get_tweets

# Get username and number of tweets from command line arguments
username = str(sys.argv[1])
num_tweets = int(sys.argv[2])

# Compute number of pages to retrieve
# Each page has 20 tweets
num_pages = math.ceil(num_tweets / 20.0)

# Retrieve tweets by pages
json_tweet = {"tweets": []}
for tweet in get_tweets(username, pages=num_pages):
    json_tweet['tweets'].append(tweet['text'])

# Remove unwanted tweets
json_tweet['tweets'] = json_tweet['tweets'][0:num_tweets]

# Dumps
json_tweet_dumps = json.dumps(json_tweet)

# Return tweets to Node server
print(json_tweet_dumps)
Example #7
def scrape():
    browser = init_browser()

    #NASA
    url = "https://mars.nasa.gov/news.html/"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, 'html.parser')
    news = soup.find("div", class_="list_text")
    news_title = soup.find("div", class_="content_title").text
    first_paragraph = news.find("div", class_="article_teaser_body").text
    
    #JPL
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
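    # the featured image URL is embedded in the carousel article's inline background-image style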
    img_tag = soup.find("div", class_="carousel_items")
    img_tag = img_tag.find("article")['style'].split("('", 1)[1].split("')")[0]
    img_url = 'https://www.jpl.nasa.gov' + str(img_tag)
    
    #Weather
    tweet_url = "https://twitter.com/marswxreport"
    browser.visit(tweet_url)
    tweet_html = browser.html
    soup = bs(tweet_html, 'html')
    tweets = []
    for tweet in get_tweets('@MarsWxReport', pages=1):
        tweets.append(tweet['text'])
    
    #Facts
    mars_url = "https://space-facts.com/mars/"
    tables = pd.read_html(mars_url)
    tables = pd.DataFrame(tables[0])
    tables.columns = ["", "values"]
    tables = tables.to_html()   
    
    #Hemis
    hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    hemis_short_url = "https://astrogeology.usgs.gov/"
    browser.visit(hemis_url)
    hemis_html = browser.html
    soup = bs(hemis_html, 'html')
    hemisphere_imageurls = []
    for i in range(4):
        label = soup.find_all("img", class_="thumb")[i]["src"]
        title = soup.find_all("h3")[i].text
        pic_url = hemis_short_url + label
        hemis_pic = {"title": title, "img_url": pic_url}
        hemisphere_imageurls.append(hemis_pic)

    #Summary
    mars = {
        "title": news_title, 
        "content": first_paragraph,
        "jpl_pic": img_url,
        "tweet": tweets[0],
        "facts": tables,
        "hemispheres": hemisphere_imageurls
    }

    return mars
Example #8
from twitter_scraper import get_tweets

for tweet in get_tweets('kennethreitz', pages=1):
    print(tweet)
    print(dict(tweet))
    print(tweet['entries']['hashtags'])
Example #9
import urllib.request

import twitter_scraper as t  # used below as t.get_trends() / t.get_tweets()
trends = t.get_trends()
pages_to_search = 10
ids = []


def dl_img(url, name):
    path = f'./images/{name}.jpg'
    urllib.request.urlretrieve(url, path)


for trend in trends:
    try:
        for i in range(1, pages_to_search):
            for tweet in t.get_tweets(trend, pages=i):
                if len(tweet['entries']['photos']):
                    t_id = tweet['tweetId']
                    if t_id not in ids:
                        ids.append(t_id)
                        for j in range(0, len(tweet['entries']['photos'])):
                            try:
                                url = tweet['entries']['photos'][j]
                                name = f'{trend}_{t_id}_{j}'
                                dl_img(url, name)
                                print('image downloaded')
                            except Exception:
                                print('couldn\'t download image')
                    else:
                        print('image already downloaded')
                else:
                    continue  # tweet has no photos; nothing to download
Example #10
def getData(username, count):
    # Export global var
    global listStopword
    global additionalStopword

    # Try read CSV
    fileCsvExist = path.exists(f'{username}.csv')
    if fileCsvExist:
        # read from CSV
        print('read from CSV')
        dateParse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
        df = pd.read_csv(f'{username}.csv',
                         header='infer',
                         parse_dates=['date'],
                         date_parser=dateParse)
        df = df.sort_values(by='date')

        print(f'got {len(df)} tweets from @{username} from CSV')

        # define token
        tokenAll = []  # token all, alternative for set
        tokenNer = []

        # loop per tweet
        print('\nfound:')
        i = 1
        for index, row in df.iterrows():
            # print(index, row['date'], row['content'])

            # NER TAG
            doc, ner = nex.getDocNer(row['content'])

            # remove punctuation, number and to lowercase
            noNumbTweetContent = re.sub(r'\d+', '', row['content'])
            cleanTweetContent = noNumbTweetContent.translate(
                noNumbTweetContent.maketrans('', '', string.punctuation))

            # tokenize
            token = nltk.tokenize.word_tokenize(cleanTweetContent.lower())
            for t in token:
                if t not in listStopword \
                   and t not in additionalStopword:
                    tokenAll.append(lemmatizer.lemmatize(t))

            # fill token ner
            tokenNer = tokenNer + processTokenNer(ner)

            # print tweet
            printTweet(i, row['date'], row['content'], ner)
            i = i + 1
            print('')

        # create dataframe all token
        print('Calculate Frequency Distributions…')
        dfToken = pd.DataFrame(columns=['token', 'freq'])
        fd = FreqDist(tokenAll)
        for f in fd:
            entry = {'token': f, 'freq': fd[f]}
            dfToken = dfToken.append(entry, ignore_index=True)

        # create dataframe for ner
        dfNer = pd.DataFrame(columns=['ner', 'type', 'freq'])
        fdNer = FreqDist(tokenNer)
        for f in fdNer:
            entryNer = f.split('/')
            entry = {'ner': entryNer[0], 'type': entryNer[1], 'freq': fdNer[f]}
            dfNer = dfNer.append(entry, ignore_index=True)

        return df, dfToken, dfNer

    else:
        # get from Twitter
        print(f'try get {count} tweets from @{username}')
        page = int(round(count / TWEET_ONE_PAGE))

        # create dataframe
        df = pd.DataFrame(columns=[
            'date',  # Date tweet
            'year',
            'month',
            'day',
            'ner_person',  # PERSON : People, including fictional.
            'ner_norp',  # NORP : Nationalities or religious or political groups.
            'ner_fac',  # FAC : Buildings, airports, highways, bridges, etc.
            'ner_org',  # ORG : Companies, agencies, institutions, etc.
            'ner_gpe',  # GPE : Countries, cities, states.
            'ner_loc',  # LOC : Non-GPE locations, mountain ranges, bodies of water.
            'ner_product',  # PRODUCT : Objects, vehicles, foods, etc. (Not services.)
            'ner_event',  # EVENT : Named hurricanes, battles, wars, sports events, etc.
            'ner_work_of_art',  # WORK_OF_ART : Titles of books, songs, etc.
            'ner_law',  # LAW : Named documents made into laws.
            'ner_date',  # DATE : Absolute or relative dates or periods.
            'ner_time',  # TIME : Times smaller than a day.
            'ner_money',  # MONEY : Monetary values, including unit.
            'content'  # Original tweet content
        ])

        # define token
        tokenAll = []  # token all, alternative for set
        tokenNer = []

        # loop per fetched tweet
        print('\nfound:')
        i = 1
        for t in get_tweets(username, pages=page):
            tweetDate = t[CONST.IDX_DATE]
            tweetContent = t[CONST.IDX_TWEET]
            respReplies = t[CONST.IDX_REPLIES]
            respRetweet = t[CONST.IDX_RETWEET]
            respLikes = t[CONST.IDX_LIKES]

            # Remove any links and numbers
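            # give pic.twitter.com links a scheme so the URL-stripping regex below matches them too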
            noLinkTweetContent = tweetContent.replace(
                'pic.twitter.com', 'http://pic.twitter.com')
            noLinkTweetContent = re.sub(
                r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                noLinkTweetContent)
            noNumbTweetContent = re.sub(r'\d+', '', noLinkTweetContent)

            # remove punctuation
            cleanTweetContent = noNumbTweetContent.translate(
                noNumbTweetContent.maketrans('', '', string.punctuation))

            # NER TAG
            doc, ner = nex.getDocNer(noLinkTweetContent)

            # tokenize
            token = nltk.tokenize.word_tokenize(cleanTweetContent.lower())
            for t in token:
                if t not in listStopword \
                   and t not in additionalStopword:
                    tokenAll.append(lemmatizer.lemmatize(t))

            # fill token ner
            tokenNer = tokenNer + processTokenNer(ner)

            # add to dataframe
            entryDf = {
                'date': tweetDate,
                'year': int(tweetDate.year),
                'month': int(tweetDate.month),
                'day': int(tweetDate.day),
                'ner_person': ner[nex.PERSON],
                'ner_norp': ner[nex.NORP],
                'ner_fac': ner[nex.FAC],
                'ner_org': ner[nex.ORG],
                'ner_gpe': ner[nex.GPE],
                'ner_loc': ner[nex.LOC],
                'ner_product': ner[nex.PRODUCT],
                'ner_event': ner[nex.EVENT],
                'ner_work_of_art': ner[nex.WORK_OF_ART],
                'ner_law': ner[nex.LAW],
                'ner_date': ner[nex.DATE],
                'ner_time': ner[nex.TIME],
                'ner_money': ner[nex.MONEY],
                'content': noLinkTweetContent
            }
            df = df.append(entryDf, ignore_index=True)

            # print tweet
            printTweet(i, tweetDate, noLinkTweetContent, ner)
            i = i + 1
            print('')

        # print(df.head())

        # Save to CSV as buffer data
        if len(df) > 0:
            df = df.sort_values(by='date', ascending=False)
            df.to_csv(f'{username}.csv')

        # create dataframe all token
        print('Calculate Frequency Distributions…')
        dfToken = pd.DataFrame(columns=['token', 'freq'])
        fd = FreqDist(tokenAll)
        for f in fd:
            entry = {'token': f, 'freq': fd[f]}
            dfToken = dfToken.append(entry, ignore_index=True)

        # create dataframe for ner
        dfNer = pd.DataFrame(columns=['ner', 'type', 'freq'])
        fdNer = FreqDist(tokenNer)
        for f in fdNer:
            entryNer = f.split('/')
            entry = {'ner': entryNer[0], 'type': entryNer[1], 'freq': fdNer[f]}
            dfNer = dfNer.append(entry, ignore_index=True)

        return df, dfToken, dfNer
Example #11
def postMessage(userName, pages):
    # conn = mysql.connector.connect(user="******", password="******", host="localhost", database="test", buffered=True)
    conn = mysql.connector.connect(user="******",
                                   password="******",
                                   host="localhost",
                                   database="twitter_scrap",
                                   buffered=True)
    cursor = conn.cursor()
    try:
        for tweet in get_tweets(userName, pages=pages):
            # print(tweet)
            tweetId = tweet["tweetId"]
            username = tweet["username"]
            tweetUrl = tweet["tweetUrl"]
            isRetweet = tweet["isRetweet"]

            # For each scraped tweet, look it up by ID: if it already exists in the
            # database, do nothing; otherwise send a request to the bot and then save it.
            select_query = "SELECT count(1) FROM twitter_scraper WHERE tweetId = %s"
            try:
                cursor.execute(select_query, (tweetId,))
                c = cursor.fetchone()[0]
            except mysql.connector.Error as err:
                c = 0
            if c == 0:
                # print("发送机器人请求")
                text1 = tweet["text"]
                p = "pic\.twitter\.com.*"
                text = re.sub(p, "", text1)
                entries = tweet["entries"]
                title = ""
                if isRetweet:
                    title = userName + "转发了" + username + "的推特"
                else:
                    title = userName + "更新了推特"
                photoList = entries["photos"]
                joinPhotoList = []
                if len(photoList):
                    # Save each photo locally; the filename is the tweet ID plus the photo index
                    photoListNew = []
                    for i in range(len(photoList)):
                        data = requests.get(photoList[i])
                        filename = tweetId + "-" + str(i) + ".png"
                        with open(filename, 'wb') as fp:
                            fp.write(data.content)
                        photoListNew.append("http://45.32.75.149:8080/" +
                                            filename)

                    def joinPhoto(x):
                        return "![image](" + x + ")"

                    joinPhotoList = list(map(joinPhoto, photoListNew))

                markDownList = [
                    "#### " + title + "\n\n", text + "\n\n", joinPhotoList,
                    "[原文链接](https://twitter.com" + tweetUrl + ")"
                ]
                markdownText = ""
                for c in markDownList:
                    if isinstance(c, list):
                        for v in c:
                            markdownText = markdownText + v + "\n\n"
                    else:
                        markdownText = markdownText + c + "\n\n"
                # print(markdownText)
                dingTalkSing = getDingTailSign()
                boturl = "https://oapi.dingtalk.com/robot/send?" \
                         "access_token=5da1a623d549026fba616a3358e8557725af32f6ca681671b2930b0790b44bb0" \
                         "&timestamp=" + dingTalkSing['timestamp'] + \
                         "&sign=" + dingTalkSing['sign']
                headers = {'Content-Type': 'application/json;charset=utf-8'}

                data = {
                    "msgtype": "markdown",
                    "markdown": {
                        "title": title,
                        "text": markdownText
                    }
                }
                r = requests.post(boturl,
                                  data=json.dumps(data),
                                  headers=headers)

                if "ok" in r.text:
                        # After the bot request succeeds, insert the tweet into the database
                    insert_query = 'INSERT INTO twitter_scraper  (tweetId, username, tweetUrl, isRetweet ,text)'
                    insert_query += ' VALUES ( %s, %s, %s, %s, %s)'
                    try:
                        cursor.execute(
                            insert_query,
                            (tweetId, username, tweetUrl, isRetweet, text))
                    except mysql.connector.Error as err:
                        print(err.msg)
                    else:
                        # print(cursor.rowcount)
                        conn.commit()
    except Exception as e:
        print(e)
    finally:
        cursor.close()
        conn.close()
Example #12
!pip install twitter_scraper
import pandas as pd
from datetime import datetime, timedelta
from twitter_scraper import get_tweets

tweet_data = []
cutoff_date = datetime.today() - timedelta(days=7)  # keep only tweets from the last 7 days
groupLists = ['FactlyIndia', 'TimesFactCheck', 'QuintFactCheck']
dateTime = datetime.now()
fileName = 'data_twitter_'+str(dateTime)+'.csv'
for item in groupLists:
  for tweet in get_tweets(item, pages=20):
    post_date = tweet['time']
    if post_date > cutoff_date:
      temp_dict=dict(tweet)
      tweet_data.append(temp_dict)
tweet_df = pd.DataFrame(tweet_data)
tweet_df['scrapedDate'] = dateTime
tweet_df.head()
tweet_df.to_csv(fileName)
Example #13
def dump_user(username):
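    # pages_to_dump is assumed to be defined elsewhere (how many pages of tweets to fetch)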
    dump_username = get_tweets(username, pages=pages_to_dump)
    dump = []
    for i in dump_username:
        dump.append(i)
    return dump
Example #14
from twitter_scraper import get_tweets
#
# for tweet in get_tweets('kennethreitz', pages=1):
#     print(tweet['text'])
#

import markovify

tweets = '\n'.join([t['text'] for t in get_tweets('kennethreitz', pages=25)])
text_model = markovify.Text(tweets)

print(text_model.make_short_sentence(140))
Example #15
import re

import pandas as pd

#path = 'C:\\Users\\Jayendra Vadrevu\\Google Drive\\Darius\\1. DSA\\Course Material\\6. Text Analytics\\R and Python\R\\amazon_reviews.xlsx' # set reviews file path.
#raw_reviews = pd.read_excel(path,sheet_name="Reviews",names=['reviews']) # read reviews excel as pandas dataframe.
#url = "https://www.amazon.in/Avatar-Blu-ray-3D-Sam-Worthington/product-reviews/B01N9514ND/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2"
url = "https://www.amazon.com/Interstellar-Matthew-McConaughey/product-reviews/B00TU9UO1W/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2"
from amazon_scraper import amazon

reviews_list = amazon(url, 10)
len(reviews_list)
raw_reviews = pd.DataFrame({'reviews': reviews_list})
raw_reviews.shape  # examine dimensions/shape of dataframe.
raw_reviews.head(10)  # examine first n (i.e 10 in this case) rows of dataframe

from twitter_scraper import get_tweets

reviews_list_tw = [t['text'] for t in get_tweets("#interstellar", pages=50)]  # ~20 tweets per page
len(reviews_list_tw)
raw_reviews_tw = pd.DataFrame({'reviews': reviews_list_tw})
raw_reviews_tw.shape  # examine dimensions/shape of dataframe.
raw_reviews_tw.head(10)  # examine first n (i.e 10 in this case) rows of dataframe
############### text cleaning function #############################


def text_clean_one():  #regular expressions
    for i in range(0, len(raw_reviews.reviews), 1):
        raw_reviews['reviews'].iloc[i] = re.sub(
            "RT @[\w_]+: ", "",
            raw_reviews['reviews'].iloc[i])  #Removes RT @<username>:
        raw_reviews['reviews'].iloc[i] = re.sub(
            "<.*?>", "", raw_reviews['reviews'].iloc[i])  # Removes HTML tags.
Example #16
from twitter_scraper import get_tweets

for tweet in get_tweets('covid19', pages=1):
    print(tweet['text'])
Example #17
import pprint
import json
from twitter_scraper import get_tweets

pp = pprint.PrettyPrinter(indent=4)

all_tweets = []

users = ["realDonaldTrump", "barackobama", "usagsessions"]

for user in users:
    for tweet in get_tweets(user, pages=1):
        tweet["user"] = user
        tweet["time"] = tweet["time"].isoformat()
        all_tweets.append(tweet)

print(json.dumps(all_tweets))
Example #18
def scrape_mars():

    # News

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    response = requests.get(url)
    response = response.text
    soup = bs(response, 'html.parser')

    titles = soup.find_all('div', class_='content_title')
    title1 = titles[0].a.text.replace('\n', '')

    p = soup.find_all('div', class_='rollover_description_inner')
    p1 = p[0].text.replace('\n', '')

    # Images

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    browser.visit(
        'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')

    html = browser.html
    soup2 = bs(html, "html.parser")

    image = soup2.find('div', class_='carousel_container')
    image = image.article['style']
    image = image.replace("background-image: url('",
                          "https://www.jpl.nasa.gov")
    image = image.replace("');", "")

    # Weather

    insight_tweets = []
    for tweet in get_tweets('MarsWxReport', pages=1):
        if re.search('InSight sol.+', tweet['text']):
            insight_tweets.append(tweet)

    temp = insight_tweets[0]['text']

    # Facts

    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)

    facts = tables[0]

    # Hemispheres

    browser.visit(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    cerberus_img = soup3.find('li')
    cerberus_img = cerberus_img.a['href']
    cerberus_title = soup3.find('title')
    cerberus_title = cerberus_title.text

    browser.visit(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    schiaparelli_img = soup3.find('li')
    schiaparelli_img = schiaparelli_img.a['href']
    schiaparelli_title = soup3.find('title')
    schiaparelli_title = schiaparelli_title.text

    browser.visit(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    syrtis_img = soup3.find('li')
    syrtis_img = syrtis_img.a['href']
    syrtis_title = soup3.find('title')
    syrtis_title = syrtis_title.text

    browser.visit(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    valles_img = soup3.find('li')
    valles_img = valles_img.a['href']
    valles_title = soup3.find('title')
    valles_title = valles_title.text

    hemisphere_image_urls = [{
        'title': cerberus_title,
        'img_url': cerberus_img
    }, {
        'title': schiaparelli_title,
        'img_url': schiaparelli_img
    }, {
        'title': syrtis_title,
        'img_url': syrtis_img
    }, {
        'title': valles_title,
        'img_url': valles_img
    }]

    # Results

    results = {
        'dict1': {
            'key': 'news',
            'title': title1,
            'teaser': p1
        },
        'dict2': {
            'key': 'image',
            'image': image
        },
        'dict3': {
            'key': 'weather',
            'temp': temp
        },
        'dict4': {
            'key': 'facts',
            'facts': f'''{facts}'''
        },
        'dict5': {
            'key': 'hemispheres',
            'facts': hemisphere_image_urls
        }
    }

    return results
Example #19
    config_ini = configparser.ConfigParser()
    config_ini.read("config.ini",encoding="utf-8")
    
    # Set the launch time
    t_launch = datetime.datetime.strptime(config_ini["LAUNCH"]["Launch_Time"],"%Y-%m-%d %H:%M:%S")
    margin = config_ini["LAUNCH"]["Wake_Up_Minutes"]
    
    t_start = datetime.datetime.now()
    while t_launch-datetime.datetime.now()>datetime.timedelta(minutes=int(margin)):
        td = t_launch-datetime.datetime.now()
        print("T-",td)
        time.sleep(1)

    # GO/NO-GO check
    flg_nogo = False
    for tweet in get_tweets(config_ini["TWITTER"]["Target_Twitter_User"], pages=2):
        if tweet['isRetweet']:  # skip retweets
            continue
        
        if tweet['time'] < t_start:  # ignore tweets older than when the program started
            break
        
        print(tweet['text'])
        print("-----------------------------------\n")
    
        if tweet['text'].startswith(config_ini["TWITTER"]["Keyword"]):
            flg_nogo = True

    if not flg_nogo:
        winsound.PlaySound("alarm_clock.wav", winsound.SND_FILENAME|winsound.SND_LOOP|winsound.SND_ASYNC)
        print("打ち上げ"+margin+"分前")
Example #20
from twitter_scraper import get_tweets

for tweet in get_tweets('KonyTim946', pages=1):
    print('------------------')
    print(tweet['text'])
    print('------------------')
Example #21
from twitter_scraper import get_tweets
for t in get_tweets('jairbolsonaro'):
    print(t['text'], t['likes'], t['retweets'])
Example #22
import csv
import sys
from twitter_scraper import get_tweets

if len(sys.argv) >= 3:
    twitter_handle = sys.argv[1]
    twitter_pages = sys.argv[2]
else:
    print("Two command line arguments are expected")
    sys.exit()

tweets = get_tweets(twitter_handle, pages=int(twitter_pages))

filename = "tweet_data_" + twitter_handle + ".csv"
with open(filename, 'w', newline='') as csvfile:  # newline='' avoids blank rows on Windows
    writer = csv.writer(csvfile)
    writer.writerow(['handle', 'text'])
    for tweet in tweets:
        writer.writerow([twitter_handle, tweet['text']])
Example #23
with open(filename, 'r') as f:
    try:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        user_queue = f.read().splitlines()
    except IOError:
        print(traceback.format_exc())
    finally:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)

for screen_name in user_queue:
    if screen_name in user_done:
        continue
    now = datetime.now(timezone('Asia/Tokyo'))
    print('\n\n{} Start scraping @{}'.format(
        now.strftime('%Y-%m-%d %H:%M:%S'), screen_name))

    for tweet in get_tweets(screen_name):
        if not tweet['entries']['photos']:
            continue
        if tweet['likes'] < 50:
            continue
        handle_tweet(tweet)

    now = datetime.now(timezone('Asia/Tokyo'))
    print('\n\n{} Finish scraping @{}'.format(
        now.strftime('%Y-%m-%d %H:%M:%S'), screen_name))

    with open(file_done, 'a') as f_done:
        print(screen_name, file=f_done)
    break  # handle only one user per run
Example #24
import openpyxl
import pandas as pd
from datetime import datetime
from twitter_scraper import get_tweets
###

### PARAMETERS YOU SHOULD SET:
# Add @mentions or #hashtags as queries
searchqueries = ['@ladygaga', '@billieeilish', '@ritaora', '@beyonce']
# Set a cut off date: how far back in time...
cutoffdate = datetime(2020, 1, 1, 00, 00, 00)
###

### DO NOT CHANGE ANYTHING BEYOND THIS LINE
twitterdata = []
ignorepin = 0

for searchquery in searchqueries:
    ignorepin = 0  # reset the old-tweet counter for each query
    for tweet in get_tweets(searchquery):
        if cutoffdate < tweet['time']:
            tweet['query'] = searchquery
            print(searchquery, '- Harvested tweet posted on', tweet['time'])
            twitterdata.append(tweet)

        else:
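            # tolerate a few older tweets (such as a pinned or out-of-order tweet) before giving up on this query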
            ignorepin += 1
            if ignorepin >= 5:
                break

print('Successfully harvested', len(twitterdata), 'tweets')

df = pd.DataFrame(twitterdata)
df.to_excel('twitterdata.xlsx')
###
Example #25
from twitter_scraper import get_tweets
import pandas as pd 

# selected tags: '#coronavirus', '#Coronaoutbreak'

res_list = []
result_df = pd.DataFrame()
num_pages = 10

# get_tweets already walks through the requested number of pages, so one call is enough
for tweet in get_tweets('#CoronaVirus', pages=num_pages):
    tweet['time'] = str(tweet['time'])  # convert datetime object to str
    res_list.append(tweet)
Example #26
async def twit(event):
    hesap = event.pattern_match.group(1)
    if len(hesap) < 1:
        await event.edit(
            "`Lütfen bir Twitter hesabı belirtin. Örnek: ``.twit st4r_m0rn1ng`"
        )
        return
    try:
        twits = list(twitter_scraper.get_tweets(hesap, pages=1))
    except Exception as e:
        await event.edit(
            f"`Muhtemelen böyle bir hesap yok. Çünkü hata oluştu. Hata: {e}`")
        return

    if len(twits) > 2:
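        # compare the first two tweet IDs and post the newer one (the first entry can be a pinned tweet)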
        if twits[0]["tweetId"] < twits[1]["tweetId"]:
            twit = twits[1]
            fotolar = twit['entries']['photos']
            sonuc = []
            if len(fotolar) >= 1:
                i = 0
                while i < len(fotolar):
                    with open(f"{hesap}-{i}.jpg", 'wb') as load:
                        load.write(get(fotolar[i]).content)
                    sonuc.append(f"{hesap}-{i}.jpg")
                    i += 1
                await event.client.send_file(
                    event.chat_id,
                    sonuc,
                    caption=
                    f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
                )
                await event.delete()
                return
            await event.edit(
                f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
            )
        else:
            twit = twits[0]  # twits[0] is the newer tweet in this branch
            fotolar = twit['entries']['photos']
            sonuc = []
            if len(fotolar) >= 1:
                i = 0
                while i < len(fotolar):
                    with open(f"{hesap}-{i}.jpg", 'wb') as load:
                        load.write(get(fotolar[i]).content)
                    sonuc.append(f"{hesap}-{i}.jpg")
                    i += 1
                print(sonuc)
                await event.client.send_file(
                    event.chat_id,
                    sonuc,
                    caption=
                    f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
                )
                await event.delete()
                return
            await event.edit(
                f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
            )
        return
    else:
        twit = twits[0]
        fotolar = twit['entries']['photos']
        sonuc = []
        if len(fotolar) >= 1:
            i = 0
            while i < len(fotolar):
                with open(f"{hesap}-{i}.jpg", 'wb') as load:
                    load.write(get(fotolar[i]).content)
                sonuc.append(f"{hesap}-{i}.jpg")
                i += 1
            await event.client.send_file(
                event.chat_id,
                sonuc,
                caption=
                f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
            )
            await event.delete()
            return
        await event.edit(
            f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}"
        )
        return
Example #27
    def test_mother(self):
        user = '******'
        tweets = list(get_tweets(user=user, pages=1))

        self.assertIn('It is a gift to be alive in the time of Beyonce',
                      tweets[0]['text'])
Example #28
def scrape():
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import requests
    import pandas as pd
    from twitter_scraper import get_tweets

    # In[2]:

    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # # Nasa Mars News

    # In[3]:

    nasa_mars_news_url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    news_response = requests.get(nasa_mars_news_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(news_response.text, 'html.parser')

    # In[4]:

    #Display the result to figure out what you want to scrape
    print(soup.prettify())

    # In[5]:

    # results are returned as an iterable list
    results = soup.find_all(class_="slide")
    titles_list = []
    paragraphs_list = []
    # Loop through returned results
    for result in results:
        # Error handling
        try:
            #Find title and paragraph for each link. The title is found within the second link in each slide, the paragraph
            #is found inside an inner description div tag.
            links = result.find_all('a')
            title = links[1].text
            paragraph = result.find(class_="rollover_description_inner").text
            #Append both to a list
            titles_list.append(title)
            paragraphs_list.append(paragraph)
        except AttributeError as e:
            print(e)

    # In[6]:

    #Save the first title and body into variables for use later
    news_title = titles_list[0]
    news_p = paragraphs_list[0]
    print(news_title)
    print(news_p)

    # # JPL Mars Space Images

    # In[7]:

    #Second Web Scrape for Mars Image
    mars_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    # Retrieve page with the requests module
    image_response = requests.get(mars_image_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(image_response.text, 'html.parser')
    # Examine the results
    print(soup.prettify())

    # In[8]:

    # results are returned as an iterable list
    results = soup.find_all(class_="carousel_items")
    # Loop through returned results
    for result in results:
        # Error handling
        try:
            #Find article tag and note that the link is in the 'style' parameter
            article = result.find('article', class_="carousel_item")
            article_link = article['style']
            #Use modification to fix the link to be in the correct format
            cleaned_article_link = article['style'].lstrip(
                'background-image: url(')
            cleaned_article_link = cleaned_article_link.rstrip(');')
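            # note: lstrip()/rstrip() strip a set of characters rather than a literal prefix/suffix;
            # this happens to work here because the quote inside the style string stops the stripping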
        except AttributeError as e:
            print(e)

    # In[9]:

    #Remove single quotes from the start and end of the string and then construct the image url
    cleaned_article_link = cleaned_article_link.replace("'", "")
    featured_image_link = 'https://www.jpl.nasa.gov' + cleaned_article_link
    #Print image url as a test
    print(featured_image_link)

    # # Mars Weather

    # In[10]:

    #Third Web Scrape for Mars Weather Tweet
    mars_twitter = 'https://twitter.com/marswxreport?lang=en'
    # Retrieve page with the requests module
    weather_response = requests.get(mars_twitter)
    browser.visit(mars_twitter)

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(weather_response.text, 'html.parser')
    # Examine the results
    print(soup.prettify())

    # In[11]:

    # Scrap Tweets from MarsWxReport
    mars_tweets = []
    for tweet in get_tweets('MarsWxReport', pages=1):
        mars_tweets.append(tweet)  # Add values to the list

    # Extract the weather value of the latest MarsWxReport Tweet
    mars_weather_dict = {}
    mars_weather_dict = mars_tweets[0]
    mars_weather = mars_weather_dict.get('text')
    print('The latest Mars Weather Report is: ' + mars_weather)

    # # Mars Facts

    # In[12]:

    mars_facts_url = "https://space-facts.com/mars/"
    #Scrape using pandas
    facts_table = pd.read_html(mars_facts_url)
    facts_table

    # # Mars Hepispheres

    # In[13]:

    mars_hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemispheres_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_hemisphere = []

    products = soup.find("div", class_="result-list")
    hemispheres = products.find_all("div", class_="item")

    for hemisphere in hemispheres:
        title = hemisphere.find("h3").text
        title = title.replace("Enhanced", "")
        end_link = hemisphere.find("a")["href"]
        image_link = "https://astrogeology.usgs.gov/" + end_link
        browser.visit(image_link)
        html = browser.html
        soup = bs(html, "html.parser")
        downloads = soup.find("div", class_="downloads")
        image_url = downloads.find("a")["href"]
        mars_hemisphere.append({"title": title, "img_url": image_url})
        print(title)
        print(image_url)

    # assemble everything scraped above into one dict (the key names here are illustrative)
    scraped_dict = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": featured_image_link,
        "mars_weather": mars_weather,
        "mars_facts": facts_table,
        "hemispheres": mars_hemisphere
    }

    return scraped_dict
Example #29
 def test_25pages(self):
     """I don't know why but in some cases it only crawls 2~5 pages"""
     user = '******'
     tweets = list(get_tweets(user=user, pages=25))
     self.assertGreater(len(tweets), 486)
if __name__ == "__main__":
    # users = ['Youtube', 'Twitter', 'instagram',
    #          'BBCBreaking', 'Reuters', 'cnnbrk', 'nytimes',
    #          'ExpressTechie', 'techreview', 'hcltech', 'NASA_Technology',
    #          'Inspire_Us', 'BuddhaQuotes', 'wordstionary',
    #          'BarackObama', 'justinbieber', 'Cristiano',
    #          'realDonaldTrump', 'BillGates', 'jimmyfallon']
    users = ['Funny_Truth', 'ohteenquotes', 'wordstionary',
             'BuddhaQuotes', 'Inspire_Us', 'FactSoup', 'MrKeyNotes1',
             'IntThings', 'NASA_Technology', 'hcltech', 'techreview']
    
    tweets = []
    for user in users:
        print(f'Scraping @{user}...')
        t_list = []
        for tweet in get_tweets(user=user, pages=50):
            tweet['user'] = user
            t_list.append(tweet)
        tweets.extend(t_list)
    
    print('Creating dataframe...')
    df = pd.DataFrame(tweets)
    df = df[['tweetId', 'time', 'user', 'text', 'likes', 'retweets', 'replies']]

    print('Saving as CSV file...')
    path = './data/'
    if not os.path.exists(path):
        os.mkdir(path)
    df.to_csv('{}{}'.format(path, 'scraped_tweets.csv'), index=False)
Example #31
    def test_father(self):
        user = '******'
        tweets = list(get_tweets(user=user, pages=1))

        self.assertIn('Want to feel old?', tweets[0]['text'])
Example #32
from twitter_scraper import get_tweets

for tweet in get_tweets('TeamCoco', pages=2):
    print("\n\n *** TWEET ***")
    print("\n")
    print(tweet['text'])
    print("\n")
    print("**** THE END ****\n\n")