def get_content(username):
    """Return up to 25 pages of tweets for the given username, or an error marker list if none is set."""
    if username:
        tweets = [t for t in get_tweets(username, pages=25)]
    else:
        tweets = ['error', 'no username set properly']
    return tweets
def main():
    """Main function that runs everything."""
    if not args.logo_off:  # print the logo unless it is switched off
        print(logo)
    if args.test:
        tweets = '\n'.join([t['text'] for t in get_tweets('jamescampbell', pages=20)])
        if args.verbose:
            text_model = markovify.Text(tweets)
            print(text_model.make_short_sentence(140))
            exit()
        exit()
    else:
        tweets = get_content(args.username)
        if args.verbose:
            tweetbreak = []
            print(f"Total found: {len(tweets)}")
            print(f"First tweet {tweets[0]['time']}")
            for idx, tweet in enumerate(tweets):
                timeone = tweet['time']
                try:
                    timetwo = tweets[idx + 1]['time']
                except IndexError:  # last tweet has no successor
                    timetwo = tweet['time']
                tdelta = timeone - timetwo
                tweetbreak.append(tdelta.total_seconds())
            print("Average time between tweets: {} minutes".format(get_other_mean(tweetbreak) / 60))
            exit()
    jsondata = request_response.json()
    print()
    if args.print_me:  # dump the raw JSON and the available fields, then exit
        print('json data:')
        pprint(jsondata)
        print('fields available:')
        for k, v in jsondata['results'][0].items():
            print(k)
        exit('thanks for trying')
    average_price = get_mean(jsondata)
    print("The average price of the \033[94m{0}\033[0m items matching search term\033[92m {1}\033[0m: ${2:.2f}".format(
        jsondata['resultCount'], args.search_term, average_price))
    if args.output_table:  # output a table instead of json
        print(pd.DataFrame(jsondata['results'], columns=["price", "artistName", "trackName"]))
    else:
        with open('{}.json'.format(args.search_term), 'w') as f:
            f.write(''.join(str(x) for x in [request_response.json()]))
        exit('file saved as {}.json'.format(args.search_term))
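main() above relies on a get_other_mean() helper that is not defined in this snippet; the following is only a minimal sketch, assuming it simply averages the list of per-tweet gaps in seconds.

# Hypothetical helper assumed by main(): averages the per-tweet gaps (in seconds).
def get_other_mean(values):
    if not values:
        return 0
    return sum(values) / len(values)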
def test_languages(self):
    user = '******'
    tweets = list(get_tweets(user=user, pages=1))
    self.assertIn('likes', tweets[0])
    self.assertIsInstance(tweets[0]['replies'], int)
    self.assertGreaterEqual(tweets[1]['retweets'], 0)
def test_child(self):
    user = '******'
    tweets = list(get_tweets(user=user, pages=1))
    self.assertEqual(tweets[1]['text'],
                     'If I could, I would, but if I can’t, I wan’t.')
def scrape(account, numPage):
    # join the text of each tweet dict into a single newline-separated string
    tweets = '\n'.join([t['text'] for t in get_tweets(account, pages=numPage)])
    return tweets
import sys
import math
import json

from twitter_scraper import get_tweets

# Get username and number of tweets from command line arguments
username = str(sys.argv[1])
num_tweets = int(sys.argv[2])

# Compute number of pages to retrieve; each page has 20 tweets
num_pages = math.ceil(num_tweets / 20.0)

# Retrieve tweets by pages
json_tweet = {"tweets": []}
for tweet in get_tweets(username, pages=num_pages):
    json_tweet['tweets'].append(tweet['text'])

# Remove unwanted tweets beyond the requested count
json_tweet['tweets'] = json_tweet['tweets'][0:num_tweets]

# Dumps
json_tweet_dumps = json.dumps(json_tweet)

# Return tweets to Node server
print(json_tweet_dumps)
def scrape():
    browser = init_browser()

    # NASA
    url = "https://mars.nasa.gov/news.html/"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, 'html.parser')
    news = soup.find("div", class_="list_text")
    news_title = soup.find("div", class_="content_title").text
    first_paragraph = news.find("div", class_="article_teaser_body").text

    # JPL
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
    img_tag = soup.find("div", class_="carousel_items")
    img_tag = img_tag.find("article")['style'].split("('", 1)[1].split("')")[0]
    img_url = 'https://www.jpl.nasa.gov' + str(img_tag)

    # Weather
    tweet_url = "https://twitter.com/marswxreport"
    browser.visit(tweet_url)
    tweet_html = browser.html
    soup = bs(tweet_html, 'html.parser')
    tweets = []
    for tweet in get_tweets('@MarsWxReport', pages=1):
        tweets.append(tweet['text'])

    # Facts
    mars_url = "https://space-facts.com/mars/"
    tables = pd.read_html(mars_url)
    tables = pd.DataFrame(tables[0])
    tables.columns = ["", "values"]
    tables = tables.to_html()

    # Hemispheres
    hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    hemis_short_url = "https://astrogeology.usgs.gov/"
    browser.visit(hemis_url)
    hemis_html = browser.html
    soup = bs(hemis_html, 'html.parser')
    hemisphere_imageurls = []
    for i in range(4):
        label = soup.find_all("img", class_="thumb")[i]["src"]
        title = soup.find_all("h3")[i].text
        pic_url = hemis_short_url + label
        hemis_pic = {"title": title, "img_url": pic_url}
        hemisphere_imageurls.append(hemis_pic)

    # Summary
    mars = {
        "title": news_title,
        "content": first_paragraph,
        "jpl_pic": img_url,
        "tweet": tweets[0],
        "facts": tables,
        "hemispheres": hemisphere_imageurls,
    }

    return mars
from twitter_scraper import get_tweets

for tweet in get_tweets('kennethreitz', pages=1):
    print(tweet)
    print(dict(tweet))
    print(tweet['entries']['hashtags'])
import os
import urllib.request

import twitter_scraper as t  # assumed alias: get_trends() and get_tweets() come from twitter_scraper

trends = t.get_trends()
pages_to_search = 10
ids = []

# Make sure the output directory exists before downloading
os.makedirs('./images', exist_ok=True)


def dl_img(url, name):
    path = f'./images/{name}.jpg'
    urllib.request.urlretrieve(url, path)


for trend in trends:
    try:
        for i in range(1, pages_to_search):
            for tweet in t.get_tweets(trend, pages=i):
                if len(tweet['entries']['photos']):
                    t_id = tweet['tweetId']
                    if t_id not in ids:
                        ids.append(t_id)
                        for j in range(0, len(tweet['entries']['photos'])):
                            try:
                                url = tweet['entries']['photos'][j]
                                name = f'{trend}_{t_id}_{j}'
                                dl_img(url, name)
                                print('image downloaded')
                            except Exception:
                                print("couldn't download image")
                    else:
                        print('image already downloaded')
    except Exception as e:
        # report the error and move on to the next trend
        print(e)
def getData(username, count):
    # Export global var
    global listStopword
    global additionalStopword

    # Try to read from CSV cache first
    fileCsvExist = path.exists(f'{username}.csv')
    if fileCsvExist:
        # read from CSV
        print('read from CSV')
        dateParse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
        df = pd.read_csv(f'{username}.csv', header='infer',
                         parse_dates=['date'], date_parser=dateParse)
        df = df.sort_values(by='date')
        print(f'got {len(df)} tweets from @{username} from CSV')

        # define tokens
        tokenAll = []  # token all, alternative for set
        tokenNer = []

        # loop per tweet
        print('\nfound:')
        i = 1
        for index, row in df.iterrows():
            # NER tag
            doc, ner = nex.getDocNer(row['content'])

            # remove punctuation, numbers and lowercase
            noNumbTweetContent = re.sub(r'\d+', '', row['content'])
            cleanTweetContent = noNumbTweetContent.translate(
                noNumbTweetContent.maketrans('', '', string.punctuation))

            # tokenize
            token = nltk.tokenize.word_tokenize(cleanTweetContent.lower())
            for t in token:
                if t not in listStopword \
                        and t not in additionalStopword:
                    tokenAll.append(lemmatizer.lemmatize(t))

            # fill token ner
            tokenNer = tokenNer + processTokenNer(ner)

            # print tweet
            printTweet(i, row['date'], row['content'], ner)
            i = i + 1
        print('')

        # create dataframe of all tokens
        print('Calculate Frequency Distributions…')
        dfToken = pd.DataFrame(columns=['token', 'freq'])
        fd = FreqDist(tokenAll)
        for f in fd:
            entry = {'token': f, 'freq': fd[f]}
            dfToken = dfToken.append(entry, ignore_index=True)

        # create dataframe for NER
        dfNer = pd.DataFrame(columns=['ner', 'type', 'freq'])
        fdNer = FreqDist(tokenNer)
        for f in fdNer:
            entryNer = f.split('/')
            entry = {'ner': entryNer[0], 'type': entryNer[1], 'freq': fdNer[f]}
            dfNer = dfNer.append(entry, ignore_index=True)

        return df, dfToken, dfNer
    else:
        # get from Twitter
        print(f'try get {count} tweets from @{username}')
        page = int(round(count / TWEET_ONE_PAGE))

        # create dataframe
        df = pd.DataFrame(columns=[
            'date',              # Date of tweet
            'year',
            'month',
            'day',
            'ner_person',        # PERSON : People, including fictional.
            'ner_norp',          # NORP : Nationalities or religious or political groups.
            'ner_fac',           # FAC : Buildings, airports, highways, bridges, etc.
            'ner_org',           # ORG : Companies, agencies, institutions, etc.
            'ner_gpe',           # GPE : Countries, cities, states.
            'ner_loc',           # LOC : Non-GPE locations, mountain ranges, bodies of water.
            'ner_product',       # PRODUCT : Objects, vehicles, foods, etc. (Not services.)
            'ner_event',         # EVENT : Named hurricanes, battles, wars, sports events, etc.
            'ner_work_of_art',   # WORK_OF_ART : Titles of books, songs, etc.
            'ner_law',           # LAW : Named documents made into laws.
            'ner_date',          # DATE : Absolute or relative dates or periods.
            'ner_time',          # TIME : Times smaller than a day.
            'ner_money',         # MONEY : Monetary values, including unit.
            'content'            # Original tweet content
        ])

        # define tokens
        tokenAll = []  # token all, alternative for set
        tokenNer = []

        # loop per fetched tweet
        print('\nfound:')
        i = 1
        for t in get_tweets(username, pages=page):
            tweetDate = t[CONST.IDX_DATE]
            tweetContent = t[CONST.IDX_TWEET]
            respReplies = t[CONST.IDX_REPLIES]
            respRetweet = t[CONST.IDX_RETWEET]
            respLikes = t[CONST.IDX_LIKES]

            # Remove any links and numbers
            noLinkTweetContent = tweetContent.replace(
                'pic.twitter.com', 'http://pic.twitter.com')
            noLinkTweetContent = re.sub(
                r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                '', noLinkTweetContent)
            noNumbTweetContent = re.sub(r'\d+', '', noLinkTweetContent)

            # remove punctuation
            cleanTweetContent = noNumbTweetContent.translate(
                noNumbTweetContent.maketrans('', '', string.punctuation))

            # NER tag
            doc, ner = nex.getDocNer(noLinkTweetContent)

            # tokenize (use a separate loop name so the tweet variable `t` is not shadowed)
            token = nltk.tokenize.word_tokenize(cleanTweetContent.lower())
            for tok in token:
                if tok not in listStopword \
                        and tok not in additionalStopword:
                    tokenAll.append(lemmatizer.lemmatize(tok))

            # fill token ner
            tokenNer = tokenNer + processTokenNer(ner)

            # add to dataframe
            entryDf = {
                'date': tweetDate,
                'year': int(tweetDate.year),
                'month': int(tweetDate.month),
                'day': int(tweetDate.day),
                'ner_person': ner[nex.PERSON],
                'ner_norp': ner[nex.NORP],
                'ner_fac': ner[nex.FAC],
                'ner_org': ner[nex.ORG],
                'ner_gpe': ner[nex.GPE],
                'ner_loc': ner[nex.LOC],
                'ner_product': ner[nex.PRODUCT],
                'ner_event': ner[nex.EVENT],
                'ner_work_of_art': ner[nex.WORK_OF_ART],
                'ner_law': ner[nex.LAW],
                'ner_date': ner[nex.DATE],
                'ner_time': ner[nex.TIME],
                'ner_money': ner[nex.MONEY],
                'content': noLinkTweetContent
            }
            df = df.append(entryDf, ignore_index=True)

            # print tweet
            printTweet(i, tweetDate, noLinkTweetContent, ner)
            i = i + 1
        print('')

        # Save to CSV as buffer data
        if len(df) > 0:
            df = df.sort_values(by='date', ascending=False)
            df.to_csv(f'{username}.csv')

        # create dataframe of all tokens
        print('Calculate Frequency Distributions…')
        dfToken = pd.DataFrame(columns=['token', 'freq'])
        fd = FreqDist(tokenAll)
        for f in fd:
            entry = {'token': f, 'freq': fd[f]}
            dfToken = dfToken.append(entry, ignore_index=True)

        # create dataframe for NER
        dfNer = pd.DataFrame(columns=['ner', 'type', 'freq'])
        fdNer = FreqDist(tokenNer)
        for f in fdNer:
            entryNer = f.split('/')
            entry = {'ner': entryNer[0], 'type': entryNer[1], 'freq': fdNer[f]}
            dfNer = dfNer.append(entry, ignore_index=True)

        return df, dfToken, dfNer
def postMessage(userName, pages):
    conn = mysql.connector.connect(user="******", password="******",
                                   host="localhost", database="twitter_scrap",
                                   buffered=True)
    cursor = conn.cursor()
    try:
        for tweet in get_tweets(userName, pages=pages):
            tweetId = tweet["tweetId"]
            username = tweet["username"]
            tweetUrl = tweet["tweetUrl"]
            isRetweet = tweet["isRetweet"]
            # For each scraped tweet, look it up by ID: if it already exists in the
            # database, do nothing; otherwise send a request to the bot and save it.
            select_query = "SELECT count(1) FROM twitter_scraper WHERE tweetId = %s"
            try:
                cursor.execute(select_query, (tweetId,))
                c = cursor.fetchone()[0]
            except mysql.connector.Error as err:
                c = 0
            if c == 0:
                text1 = tweet["text"]
                p = r"pic\.twitter\.com.*"
                text = re.sub(p, "", text1)
                entries = tweet["entries"]
                if isRetweet:
                    title = userName + "转发了" + username + "的推特"
                else:
                    title = userName + "更新了推特"
                photoList = entries["photos"]
                joinPhotoList = []
                if len(photoList):
                    # Save each photo locally; the filename is the tweet ID plus the photo index.
                    photoListNew = []
                    for i in range(len(photoList)):
                        data = requests.get(photoList[i])
                        filename = tweetId + "-" + str(i) + ".png"
                        with open(filename, 'wb') as fp:
                            fp.write(data.content)
                        photoListNew.append("http://45.32.75.149:8080/" + filename)

                    def joinPhoto(x):
                        return "![image](" + x + ")"

                    joinPhotoList = list(map(joinPhoto, photoListNew))
                markDownList = [
                    "#### " + title + "\n\n",
                    text + "\n\n",
                    joinPhotoList,
                    "[原文链接](https://twitter.com" + tweetUrl + ")"
                ]
                markdownText = ""
                for c in markDownList:
                    if isinstance(c, list):
                        for v in c:
                            markdownText = markdownText + v + "\n\n"
                    else:
                        markdownText = markdownText + c + "\n\n"
                dingTalkSing = getDingTailSign()
                boturl = "https://oapi.dingtalk.com/robot/send?" \
                         "access_token=5da1a623d549026fba616a3358e8557725af32f6ca681671b2930b0790b44bb0" \
                         "&timestamp=" + dingTalkSing['timestamp'] + \
                         "&sign=" + dingTalkSing['sign']
                headers = {'Content-Type': 'application/json;charset=utf-8'}
                data = {
                    "msgtype": "markdown",
                    "markdown": {
                        "title": title,
                        "text": markdownText
                    }
                }
                r = requests.post(boturl, data=json.dumps(data), headers=headers)
                if "ok" in r.text:
                    # After the bot request succeeds, insert the record into the database.
                    insert_query = ('INSERT INTO twitter_scraper '
                                    '(tweetId, username, tweetUrl, isRetweet, text) '
                                    'VALUES (%s, %s, %s, %s, %s)')
                    try:
                        cursor.execute(
                            insert_query,
                            (tweetId, username, tweetUrl, isRetweet, text))
                    except mysql.connector.Error as err:
                        print(err.msg)
                    else:
                        conn.commit()
    except Exception as e:
        print(e)
    finally:
        cursor.close()
        conn.close()
!pip install twitter_scraper

import pandas as pd
from datetime import datetime, timedelta
from twitter_scraper import get_tweets

tweet_data = []
duration = datetime.today() - timedelta(days=7)
groupLists = ['FactlyIndia', 'TimesFactCheck', 'QuintFactCheck']
dateTime = datetime.now()
fileName = 'data_twitter_' + str(dateTime) + '.csv'

for item in groupLists:
    for tweet in get_tweets(item, pages=20):
        post_date = tweet['time']
        if post_date > duration:
            temp_dict = dict(tweet)
            tweet_data.append(temp_dict)

tweet_df = pd.DataFrame(tweet_data)
tweet_df['scrapedDate'] = dateTime
tweet_df.head()
tweet_df.to_csv(fileName)
def dump_user(username):
    dump_username = get_tweets(username, pages=pages_to_dump)
    dump = []
    for i in dump_username:
        dump.append(i)
    return dump
from twitter_scraper import get_tweets
#
# for tweet in get_tweets('kennethreitz', pages=1):
#     print(tweet['text'])
#
import markovify

tweets = '\n'.join([t['text'] for t in get_tweets('kennethreitz', pages=25)])

text_model = markovify.Text(tweets)
print(text_model.make_short_sentence(140))
# path = 'C:\\Users\\Jayendra Vadrevu\\Google Drive\\Darius\\1. DSA\\Course Material\\6. Text Analytics\\R and Python\R\\amazon_reviews.xlsx'  # set reviews file path.
# raw_reviews = pd.read_excel(path, sheet_name="Reviews", names=['reviews'])  # read reviews excel as pandas dataframe.

# url = "https://www.amazon.in/Avatar-Blu-ray-3D-Sam-Worthington/product-reviews/B01N9514ND/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2"
url = "https://www.amazon.com/Interstellar-Matthew-McConaughey/product-reviews/B00TU9UO1W/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2"

from amazon_scraper import amazon

reviews_list = amazon(url, 10)
len(reviews_list)

raw_reviews = pd.DataFrame({'reviews': reviews_list})
raw_reviews.shape  # examine dimensions/shape of dataframe.
raw_reviews.head(10)  # examine first n (i.e. 10 in this case) rows of dataframe

from twitter_scraper import get_tweets

# twitter_scraper's get_tweets(query, pages=...) paginates roughly 20 tweets per page,
# so about 50 pages approximates the 1000 tweets originally requested.
reviews_list_tw = [tweet['text'] for tweet in get_tweets("#interstellar", pages=50)]
len(reviews_list_tw)

raw_reviews_tw = pd.DataFrame({'reviews': reviews_list_tw})
raw_reviews_tw.shape  # examine dimensions/shape of dataframe.
raw_reviews_tw.head(10)  # examine first n (i.e. 10 in this case) rows of dataframe


############### text cleaning function #############################
def text_clean_one():  # regular expressions
    for i in range(0, len(raw_reviews.reviews), 1):
        raw_reviews['reviews'].iloc[i] = re.sub(
            r"RT @[\w_]+: ", "", raw_reviews['reviews'].iloc[i])  # Removes RT @<username>:
        raw_reviews['reviews'].iloc[i] = re.sub(
            r"<.*?>", "", raw_reviews['reviews'].iloc[i])  # Removes HTML tags.
from twitter_scraper import get_tweets

for tweet in get_tweets('covid19', pages=1):
    print(tweet['text'])
import pprint
import json

from twitter_scraper import get_tweets

pp = pprint.PrettyPrinter(indent=4)

all_tweets = []
users = ["realDonaldTrump", "barackobama", "usagsessions"]

for user in users:
    for tweet in get_tweets(user, pages=1):
        tweet["user"] = user
        tweet["time"] = tweet["time"].isoformat()
        all_tweets.append(tweet)

print(json.dumps(all_tweets))
def scrape_mars():
    # News
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    response = requests.get(url)
    response = response.text
    soup = bs(response, 'html.parser')
    titles = soup.find_all('div', class_='content_title')
    title1 = titles[0].a.text.replace('\n', '')
    p = soup.find_all('div', class_='rollover_description_inner')
    p1 = p[0].text.replace('\n', '')

    # Images
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    html = browser.html
    soup2 = bs(html, "html.parser")
    image = soup2.find('div', class_='carousel_container')
    image = image.article['style']
    image = image.replace("background-image: url('", "https://www.jpl.nasa.gov")
    image = image.replace("');", "")

    # Weather
    weather_tweets = []
    for tweet in get_tweets('MarsWxReport', pages=1):
        if re.search('InSight sol.+', tweet['text']):
            weather_tweets.append(tweet)
    temp = weather_tweets[0]['text']

    # Facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    facts = tables[0]

    # Hemispheres
    browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    cerberus_img = soup3.find('li').a['href']
    cerberus_title = soup3.find('title').text

    browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    schiaparelli_img = soup3.find('li').a['href']
    schiaparelli_title = soup3.find('title').text

    browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    syrtis_img = soup3.find('li').a['href']
    syrtis_title = soup3.find('title').text

    browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced")
    html = browser.html
    soup3 = bs(html, "html.parser")
    valles_img = soup3.find('li').a['href']
    valles_title = soup3.find('title').text

    hemisphere_image_urls = [
        {'title': cerberus_title, 'img_url': cerberus_img},
        {'title': schiaparelli_title, 'img_url': schiaparelli_img},
        {'title': syrtis_title, 'img_url': syrtis_img},
        {'title': valles_title, 'img_url': valles_img},
    ]

    # Results
    results = {
        'dict1': {'key': 'news', 'title': title1, 'teaser': p1},
        'dict2': {'key': 'image', 'image': image},
        'dict3': {'key': 'weather', 'temp': temp},
        'dict4': {'key': 'facts', 'facts': f'''{facts}'''},
        'dict5': {'key': 'hemispheres', 'facts': hemisphere_image_urls},
    }

    return results
config_ini = configparser.ConfigParser()
config_ini.read("config.ini", encoding="utf-8")

# Set the launch time
t_launch = datetime.datetime.strptime(config_ini["LAUNCH"]["Launch_Time"], "%Y-%m-%d %H:%M:%S")
margin = config_ini["LAUNCH"]["Wake_Up_Minutes"]
t_start = datetime.datetime.now()

while t_launch - datetime.datetime.now() > datetime.timedelta(minutes=int(margin)):
    td = t_launch - datetime.datetime.now()
    print("T-", td)
    time.sleep(1)

# GO/NOGO check
flg_nogo = False
for tweet in get_tweets(config_ini["TWITTER"]["Target_Twitter_User"], pages=2):
    if tweet['isRetweet']:
        # Skip retweets
        continue
    if tweet['time'] < t_start:
        # Ignore tweets older than when the program started
        break
    print(tweet['text'])
    print("-----------------------------------\n")
    if tweet['text'].startswith(config_ini["TWITTER"]["Keyword"]):
        flg_nogo = True

if not flg_nogo:
    winsound.PlaySound("alarm_clock.wav",
                       winsound.SND_FILENAME | winsound.SND_LOOP | winsound.SND_ASYNC)
print("打ち上げ" + margin + "分前")  # "<margin> minutes before launch"
from twitter_scraper import get_tweets

for tweet in get_tweets('KonyTim946', pages=1):
    print('------------------')
    print(tweet['text'])
    print('------------------')
from twitter_scraper import get_tweets

for t in get_tweets('jairbolsonaro'):
    print(t['text'], t['likes'], t['retweets'])
import csv
import sys

from twitter_scraper import get_tweets

if len(sys.argv) >= 3:
    twitter_handle = sys.argv[1]
    twitter_pages = sys.argv[2]
else:
    print("Two command line arguments are expected")
    sys.exit()

tweets = get_tweets(twitter_handle, pages=int(twitter_pages))
filename = "tweet_data_" + twitter_handle + ".csv"

with open(filename, 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['handle', 'text'])
    for tweet in tweets:
        writer.writerow([twitter_handle, tweet['text']])
with open(filename, 'r') as f:
    try:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        user_queue = f.read().splitlines()
    except IOError:
        print(traceback.format_exc())
    finally:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)

for screen_name in user_queue:
    if screen_name in user_done:
        continue
    now = datetime.now(timezone('Asia/Tokyo'))
    print('\n\n{} Start scraping @{}'.format(
        now.strftime('%Y-%m-%d %H:%M:%S'), screen_name))
    for tweet in get_tweets(screen_name):
        if not tweet['entries']['photos']:
            continue
        if tweet['likes'] < 50:
            continue
        handle_tweet(tweet)
    now = datetime.now(timezone('Asia/Tokyo'))
    print('\n\n{} Finish scraping @{}'.format(
        now.strftime('%Y-%m-%d %H:%M:%S'), screen_name))
    with open(file_done, 'a') as f_done:
        print(screen_name, file=f_done)
    break
import openpyxl

###
### PARAMETERS YOU SHOULD SET:

# Add @mentions or #hashtags as queries
searchqueries = ['@ladygaga', '@billieeilish', '@ritaora', '@beyonce']

# Set a cut off date: how far back in time...
cutoffdate = datetime(2020, 1, 1, 00, 00, 00)

###
### DO NOT CHANGE ANYTHING BEYOND THIS LINE

twitterdata = []
ignorepin = 0

for searchquery in searchqueries:
    for tweet in get_tweets(searchquery):
        if cutoffdate < tweet['time']:
            tweet['query'] = searchquery
            print(searchquery, '- Harvested tweet posted on', tweet['time'])
            twitterdata.append(tweet)
        else:
            ignorepin += 1
            if ignorepin == 5:
                break

print('Successfully harvested', len(twitterdata), 'tweets')

df = pd.DataFrame(twitterdata)
df.to_excel('twitterdata.xlsx')
###
from twitter_scraper import get_tweets
import pandas as pd

# selected tags = ['#coronavirus', '#Coronaoutbreak']
res_list = []
result_df = pd.DataFrame()
num_pages = 10

for i in range(1, num_pages + 1):
    for tweet in get_tweets('#CoronaVirus', pages=i):
        tweet['time'] = str(tweet['time'])  # convert datetime object to str
        res_list.append(tweet)
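Because the loop above re-requests overlapping page ranges, the collected list can contain duplicates; a short follow-up sketch that fills the otherwise unused result_df and de-duplicates on the tweetId field carried by twitter_scraper tweet dicts:

# Build the DataFrame declared above and drop duplicates picked up by overlapping page requests.
result_df = pd.DataFrame(res_list)
if not result_df.empty:
    result_df = result_df.drop_duplicates(subset='tweetId')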
async def twit(event):
    hesap = event.pattern_match.group(1)
    if len(hesap) < 1:
        await event.edit(
            "`Lütfen bir Twitter hesabı belirtin. Örnek: ``.twit st4r_m0rn1ng`"
        )
        return
    try:
        twits = list(twitter_scraper.get_tweets(hesap, pages=1))
    except Exception as e:
        await event.edit(
            f"`Muhtemelen böyle bir hesap yok. Çünkü hata oluştu. Hata: {e}`")
        return

    # With more than two tweets the first entry is typically a pinned tweet,
    # so use the second one; otherwise fall back to the first.
    if len(twits) > 2:
        twit = twits[1]
    else:
        twit = twits[0]

    caption = (f"**{hesap}**\n{twit['time']}\n\n`{twit['text']}`\n\n"
               f"💬{twit['replies']} 🔁{twit['retweets']} ❤️{twit['likes']}")

    fotolar = twit['entries']['photos']
    sonuc = []
    if len(fotolar) >= 1:
        # Download each photo and send them all together with the caption.
        i = 0
        while i < len(fotolar):
            with open(f"{hesap}-{i}.jpg", 'wb') as load:
                load.write(get(fotolar[i]).content)
            sonuc.append(f"{hesap}-{i}.jpg")
            i += 1
        await event.client.send_file(event.chat_id, sonuc, caption=caption)
        await event.delete()
        return
    await event.edit(caption)
    return
def test_mother(self):
    user = '******'
    tweets = list(get_tweets(user=user, pages=1))
    self.assertIn('It is a gift to be alive in the time of Beyonce',
                  tweets[0]['text'])
def scrape():
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import requests
    import pandas as pd
    from twitter_scraper import get_tweets

    # In[2]:

    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # # Nasa Mars News

    # In[3]:

    nasa_mars_news_url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    news_response = requests.get(nasa_mars_news_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(news_response.text, 'html.parser')

    # In[4]:

    # Display the result to figure out what you want to scrape
    print(soup.prettify())

    # In[5]:

    # results are returned as an iterable list
    results = soup.find_all(class_="slide")
    titles_list = []
    paragraphs_list = []

    # Loop through returned results
    for result in results:
        # Error handling
        try:
            # Find title and paragraph for each link. The title is found within the second
            # link in each slide, the paragraph inside an inner description div tag.
            links = result.find_all('a')
            title = links[1].text
            paragraph = result.find(class_="rollover_description_inner").text
            # Append both to a list
            titles_list.append(title)
            paragraphs_list.append(paragraph)
        except AttributeError as e:
            print(e)

    # In[6]:

    # Save the first title and body into variables for use later
    news_title = titles_list[0]
    news_p = paragraphs_list[0]
    print(news_title)
    print(news_p)

    # # JPL Mars Space Images

    # In[7]:

    # Second Web Scrape for Mars Image
    mars_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    # Retrieve page with the requests module
    image_response = requests.get(mars_image_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(image_response.text, 'html.parser')
    # Examine the results
    print(soup.prettify())

    # In[8]:

    # results are returned as an iterable list
    results = soup.find_all(class_="carousel_items")

    # Loop through returned results
    for result in results:
        # Error handling
        try:
            # Find the article tag; note that the link is in the 'style' parameter
            article = result.find('article', class_="carousel_item")
            article_link = article['style']
            # Strip the CSS wrapper so only the image path remains
            cleaned_article_link = article['style'].lstrip('background-image: url(')
            cleaned_article_link = cleaned_article_link.rstrip(');')
        except AttributeError as e:
            print(e)

    # In[9]:

    # Remove single quotes from the start and end of the string, then construct the image url
    cleaned_article_link = cleaned_article_link.replace("'", "")
    featured_image_link = 'https://www.jpl.nasa.gov' + cleaned_article_link
    # Print image url as a test
    print(featured_image_link)

    # # Mars Weather

    # In[10]:

    # Third Web Scrape for Mars Weather Tweet
    mars_twitter = 'https://twitter.com/marswxreport?lang=en'
    # Retrieve page with the requests module
    weather_response = requests.get(mars_twitter)
    browser.visit(mars_twitter)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(weather_response.text, 'html.parser')
    # Examine the results
    print(soup.prettify())

    # In[11]:

    # Scrape tweets from MarsWxReport
    mars_tweets = []
    for tweet in get_tweets('MarsWxReport', pages=1):
        mars_tweets.append(tweet)  # Add values to the list

    # Extract the weather value of the latest MarsWxReport tweet
    mars_weather_dict = mars_tweets[0]
    mars_weather = mars_weather_dict.get('text')
    print('The latest Mars Weather Report is: ' + mars_weather)

    # # Mars Facts

    # In[12]:

    mars_facts_url = "https://space-facts.com/mars/"
    # Scrape using pandas
    facts_table = pd.read_html(mars_facts_url)
    facts_table

    # # Mars Hemispheres

    # In[13]:

    mars_hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemispheres_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    mars_hemisphere = []
    products = soup.find("div", class_="result-list")
    hemispheres = products.find_all("div", class_="item")
    for hemisphere in hemispheres:
        title = hemisphere.find("h3").text
        title = title.replace("Enhanced", "")
        end_link = hemisphere.find("a")["href"]
        image_link = "https://astrogeology.usgs.gov/" + end_link
        browser.visit(image_link)
        html = browser.html
        soup = bs(html, "html.parser")
        downloads = soup.find("div", class_="downloads")
        image_url = downloads.find("a")["href"]
        mars_hemisphere.append({"title": title, "img_url": image_url})
        print(title)
        print(image_url)

    # Assemble the scraped results (key names chosen here for illustration; adjust to whatever the caller expects)
    scraped_dict = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image": featured_image_link,
        "weather": mars_weather,
        "facts": facts_table,
        "hemispheres": mars_hemisphere,
    }
    return scraped_dict
def test_25pages(self):
    """I don't know why, but in some cases it only crawls 2~5 pages."""
    user = '******'
    tweets = list(get_tweets(user=user, pages=25))
    self.assertGreater(len(tweets), 486)
if __name__ == "__main__":
    # users = ['Youtube', 'Twitter', 'instagram',
    #          'BBCBreaking', 'Reuters', 'cnnbrk', 'nytimes',
    #          'ExpressTechie', 'techreview', 'hcltech', 'NASA_Technology',
    #          'Inspire_Us', 'BuddhaQuotes', 'wordstionary',
    #          'BarackObama', 'justinbieber', 'Cristiano',
    #          'realDonaldTrump', 'BillGates', 'jimmyfallon']
    users = ['Funny_Truth', 'ohteenquotes', 'wordstionary', 'BuddhaQuotes',
             'Inspire_Us', 'FactSoup', 'MrKeyNotes1', 'IntThings',
             'NASA_Technology', 'hcltech', 'techreview']

    tweets = []
    for user in users:
        print(f'Scraping @{user}...')
        t_list = []
        for tweet in get_tweets(user=user, pages=50):
            tweet['user'] = user
            t_list.append(tweet)
        tweets.extend(t_list)

    print('Creating dataframe...')
    df = pd.DataFrame(tweets)
    df = df[['tweetId', 'time', 'user', 'text', 'likes', 'retweets', 'replies']]

    print('Saving as CSV file...')
    path = './data/'
    if not os.path.exists(path):
        os.mkdir(path)
    df.to_csv('{}{}'.format(path, 'scraped_tweets.csv'), index=False)
def test_father(self):
    user = '******'
    tweets = list(get_tweets(user=user, pages=1))
    self.assertIn('Want to feel old?', tweets[0]['text'])
from twitter_scraper import get_tweets

for tweet in get_tweets('TeamCoco', pages=2):
    print("\n\n *** TWEET ***")
    print("\n")
    print(tweet['text'])
    print("\n")
    print("**** THE END ****\n\n")