def searchWordpress(raw_keyword):
    keyword = urllib2.quote(raw_keyword)
    opsecHeader.writeLastCheckedTime('wordpress')

    ############### WORDPRESS ##################
    #
    # See http://en.search.wordpress.com/?q=obama&s=date&f=json
    #
    # Arguments:
    #   q = keyword to search for
    #   s = sort by; we want date, not relevance
    #   f = format; we want JSON

    wordpressQueryString = 'http://en.search.wordpress.com/?q=' + keyword + '&s=date&f=json'
    opsecHeader.queryWebsiteJSON("wordpress", wordpressQueryString)

    wordpressLatestEpoch = getLatestWordpress()
    wordpressResults = opsecHeader.readResultsJSON('wordpress')
    epochTime = wordpressResults[0]['epoch_time']

    if str(wordpressLatestEpoch) == str(epochTime):
        print "No new blog posts since last query."
    else:
        for x in wordpressResults:
            epochTime = x['epoch_time']
            if int(wordpressLatestEpoch) < int(epochTime):
                title = x['title'].encode('utf-8')
                author = x['author'].encode('utf-8')
                content = x['content'].encode('utf-8')
                link = x['link'].encode('utf-8')
                writeLatestWordpress(epochTime, title, author, content, link, keyword)
                opsecHeader.sendEmail(keyword, "Wordpress")

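# The functions in this listing lean on a pair of opsecHeader helpers whose
# behaviour is only implied by their call sites: queryWebsiteJSON(name, url)
# fetches a URL and caches the JSON response under `name`, and
# readResultsJSON(name) loads that cached response back as parsed JSON. The
# sketch below is an assumption about their shape for readers without the
# opsecHeader source (cache location, api_key handling, and error handling are
# guesses, and `json`/`urllib2` are assumed to be imported at module level).


def queryWebsiteJSON(name, url, api_key=None):
    # Sketch only: fetch the URL and cache the raw JSON body under <name>.json.
    # Some call sites pass an API key as a third argument; how opsecHeader
    # actually applies it is not visible here, so it is accepted but unused.
    response = urllib2.urlopen(url).read()
    with open(name + '.json', 'w') as cache:
        cache.write(response)


def readResultsJSON(name):
    # Sketch only: return the cached response parsed with the json module.
    with open(name + '.json') as cache:
        return json.load(cache)
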
def searchTwitter(raw_keyword):
    keyword = urllib2.quote(raw_keyword)
    opsecHeader.writeLastCheckedTime('twitter')

    # See https://dev.twitter.com/docs/api/1/get/search
    tweetSinceDate = str(getLatestTweet(None, keyword)[0])

    searchQueryString = 'http://search.twitter.com/search.json?q=' + keyword + '&rpp=10&result_type=recent'
    if tweetSinceDate != '0':  # Twitter does not play nice with invalid since_id's
        searchQueryString += '&since_id=' + tweetSinceDate

    opsecHeader.queryWebsiteJSON("twitter", searchQueryString)
    twitterResults = opsecHeader.readResultsJSON('twitter')
    twitterAllResults = twitterResults['results']

    if not twitterAllResults:
        print "No results."
    else:
        existingEpochTime = getLatestTweet(None, keyword)[1]
        for x in twitterAllResults:
            created_at = x['created_at'].encode('utf-8')
            epochTimeFound = calendar.timegm(time.strptime(created_at, '%a, %d %b %Y %H:%M:%S +0000'))
            if int(epochTimeFound) > int(existingEpochTime):
                twitterID = x['id']
                from_user = x['from_user'].encode('utf-8')
                text = x['text'].encode('utf-8')
                created_at = x['created_at'].encode('utf-8')
                profile_image_url_https = x['profile_image_url_https'].encode('utf-8')
                location, lat, lng = genGeo(from_user)
                writeTweet(twitterID, from_user, text, created_at, keyword, location, lat, lng, epochTimeFound, profile_image_url_https)
                opsecHeader.sendEmail(keyword, "Twitter")

def getPastes():
    global pasteIDsfound, pasteMaxSize

    # Keep the list of seen paste IDs from growing without bound.
    if len(pasteIDsfound) >= (pasteMaxSize * 2):
        print "[-] cleaning list"
        for i in range(0, len(pasteIDsfound) - pasteMaxSize):
            pasteIDsfound.pop(0)

    print "[-] Pulling archive list..."
    try:
        page = urllib2.urlopen("http://www.pastebin.com/archive.php").read()
        regex = re.compile('<td><img src="/i/t.gif" .*?<a href="/(.*?)">(.*?)</a></td>.*?<td>(.*?)</td>', re.S)
        pastes = regex.findall(page)

        for p in pastes:
            pasteID = p[0]
            pasteTitle = p[1]
            fetchAttempt = 0
            opsecHeader.writeLastCheckedTime('pastebin')

            if pasteID not in pasteIDsfound:
                print "[-] New paste(", pasteID, ")"
                pasteIDsfound.append(pasteID)
                print len(pasteIDsfound)
                pastePage = ''

                while pastePage == '':
                    print "[+] Pulling Raw paste"
                    sock = urllib2.urlopen("http://pastebin.com/raw.php?i=" + pasteID)
                    pastePage = sock.read()
                    encoding = sock.headers['Content-type'].split('charset=')[1]  # e.g. iso-8859-1
                    try:
                        pastePage = pastePage.decode(encoding).encode('utf-8')
                        if pastePage == '':
                            pastePage = 'empty paste from http://pastebin.com/raw.php?i=' + pasteID
                        if "requesting a little bit too much" in pastePage:
                            # Pastebin's rate-limit page, not the paste itself.
                            pastePage = ''
                            print "[-] hitting pastebin too quickly, sleeping for 2 seconds and trying again.."
                            time.sleep(2)
                    except:
                        print "[!] couldn't decode page to utf-8"
                        print "[-] Sleeping for 1 second"
                        time.sleep(1)
                    fetchAttempt = fetchAttempt + 1
                    if fetchAttempt > 1:
                        print "[+] Couldn't fetch " + "http://pastebin.com/raw.php?i=" + pasteID + " after 2 tries"
                        pastePage = ' '

                addPaste(pasteTitle, pasteID, pastePage)
            else:
                print "[-] Already seen ", pasteID

        # Throttle how often the archive page itself is polled.
        sleeptime = random.randint(15, 45)
        print "[-] sleeping for", sleeptime, "seconds.."
        time.sleep(sleeptime)
        return 1
    except IOError:
        print "[!] Error fetching list of pastes, sleeping for 10 seconds and trying again"
        time.sleep(10)
        return 0

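# Driver sketch (assumed, not part of the original source): getPastes() sleeps
# internally between archive passes and returns 1 on a successful pass or 0
# when the archive fetch fails with IOError, so a minimal poller just calls it
# in an endless loop.


def pollPastebin():
    while True:
        getPastes()
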
def getPost(account_id, site, user_id, content_type):
    latest_epoch_time = getLatestPost(user_id, site, content_type)

    queryString = 'http://api.stackexchange.com/2.1/users/' + str(user_id) + '/' + str(content_type) + 's?fromdate=' + str(latest_epoch_time) + '&order=desc&sort=creation&site=' + site + '&key=' + opsecHeader.stackexchange_api_key
    opsecHeader.queryWebsiteJSON(str(site) + str(user_id) + str(content_type), queryString)
    opsecHeader.writeLastCheckedTime('stackexchange')

    results = opsecHeader.readResultsJSON(str(site) + str(user_id) + str(content_type))
    items = results['items']

    for x in items:
        creation_date = x['creation_date']
        # fromdate is inclusive, so the newest item already stored comes back; skip it.
        if latest_epoch_time != creation_date:
            if content_type == 'question':
                question_id = x['question_id']
                url = x['link']
                html = urllib2.urlopen(url).read()
                soup = BeautifulSoup(html)
                dirty_content = soup.find('div', {'class': 'post-text', 'itemprop': 'description'})
                content = ''.join(dirty_content.findAll(text=True))
            elif content_type == 'answer':
                answer_id = x['answer_id']
                url = "http://" + str(site) + ".com/a/" + str(answer_id)
                html = urllib2.urlopen(url).read()
                soup = BeautifulSoup(html)
                answer_id = 'answer-' + str(answer_id)
                div_content = soup.find('div', {'id': answer_id})
                dirty_content = div_content.find('div', {'class': 'post-text'})
                content = ''.join(dirty_content.findAll(text=True))
            elif content_type == 'comment':
                comment_id = x['comment_id']
                post_id = x['post_id']
                short_url = 'http://' + str(site) + '.com/q/' + str(post_id)
                long_url = str(urllib2.urlopen(short_url).geturl())
                long_url = long_url.split("#")[0]
                url = long_url + '#comment' + str(comment_id) + '_' + str(post_id)
                html = urllib2.urlopen(url).read()
                soup = BeautifulSoup(html)
                comment_id_format = 'comment-' + str(comment_id)
                try:
                    # Will fail if comments need to be loaded via AJAX.
                    comment_tr = soup.find('tr', {'id': comment_id_format})
                    dirty_content = comment_tr.find('span', {'class': 'comment-copy'})
                    content = ''.join(dirty_content.findAll(text=True))
                except AttributeError:
                    content = 'See website'

            profile_image = x['owner']['profile_image']
            display_name = x['owner']['display_name']
            writeDisplayName(account_id, display_name)
            writeLatestPost(account_id, user_id, site, content_type, creation_date, profile_image, url, content, display_name)

            keywords = opsecHeader.getUserKeywords(account_id, 'stackexchange')
            for keyword in keywords:
                if keyword in content:
                    opsecHeader.sendEmail(keyword, "Stack Exchange", display_name)

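# Convenience sketch (hypothetical, not in the original source): getPost()
# handles a single Stack Exchange content type per call, so sweeping one
# tracked account means calling it once per type.


def sweepStackExchangeAccount(account_id, site, user_id):
    for content_type in ('question', 'answer', 'comment'):
        getPost(account_id, site, user_id, content_type)
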
def getUserComments(user):
    # http://www.reddit.com/dev/api
    user = urllib2.quote(user)
    redditQueryString = 'http://www.reddit.com/user/' + user + '/overview.json'
    opsecHeader.queryWebsiteJSON("reddit", redditQueryString, opsecHeader.reddit_api_key)
    opsecHeader.writeLastCheckedTime('reddit')

    redditResults = opsecHeader.readResultsJSON('reddit')
    try:
        redditAllResults = redditResults['data']['children']
    except KeyError:
        redditAllResults = None

    epoch_time_existing = getLatestUserEpoch(user)

    if not redditAllResults:
        print "No results."
    else:
        for x in redditAllResults:
            # created_utc is a float; drop the trailing ".0" from its string form.
            epoch_time_found = str(x['data']['created_utc']).encode('utf-8')[:-2]
            if int(epoch_time_found) > int(epoch_time_existing):
                try:
                    # Strip the "t3_" fullname prefix from the link id.
                    link_id = x['data']['link_id'].encode('utf-8')[3:]
                except KeyError:
                    link_id = ''
                comment_id = x['data']['id'].encode('utf-8')
                author = x['data']['author'].encode('utf-8')
                try:
                    body = x['data']['body'].encode('utf-8')
                except KeyError:
                    body = ''
                try:
                    link_title = x['data']['link_title'].encode('utf-8')
                except KeyError:
                    link_title = ''
                subreddit = x['data']['subreddit'].encode('utf-8')
                permalink = 'http://www.reddit.com/r/' + subreddit + '/comments/' + link_id + '/' + urllib2.quote(link_title) + '/' + comment_id
                writeLatestPost(author, body, link_id, comment_id, link_title, subreddit, epoch_time_found, permalink)

                keywords = opsecHeader.getUserKeywords(author, 'reddit')
                for keyword in keywords:
                    if keyword in body:
                        opsecHeader.sendEmail(keyword, "Reddit", author)

def getUserTweets(user):
    screen_name = urllib2.quote(user)
    opsecHeader.writeLastCheckedTime('twitter')

    # See https://dev.twitter.com/docs/api/1/get/statuses/user_timeline
    tweetSinceDate = str(getLatestTweet(screen_name, None)[0])
    epochTimeExisting = getLatestTweet(screen_name, None)[1]

    twitterQueryString = 'https://api.twitter.com/1/statuses/user_timeline.json?screen_name=' + screen_name + '&count=10'
    if tweetSinceDate != '0':  # Twitter does not play nice with invalid since_id's
        twitterQueryString += '&since_id=' + tweetSinceDate

    opsecHeader.queryWebsiteJSON("twitterUserTweets", twitterQueryString)
    twitterResults = opsecHeader.readResultsJSON('twitterUserTweets')
    twitterAllResults = twitterResults

    if not twitterAllResults:
        print "No results."
    else:
        for x in twitterAllResults:
            created_at = x['created_at'].encode('utf-8')
            epochTimeFound = calendar.timegm(email.utils.parsedate(created_at))
            if int(epochTimeFound) > int(epochTimeExisting):
                twitterID = x['id']
                text = x['text'].encode('utf-8')
                from_user = x['user']['screen_name'].encode('utf-8')
                created_at = x['created_at'].encode('utf-8')
                profile_image_url_https = x['user']['profile_image_url_https'].encode('utf-8')
                location, lat, lng = genGeo(from_user)
                writeTweet(twitterID, from_user, text, created_at, '', location, lat, lng, epochTimeFound, profile_image_url_https)

                keywords = opsecHeader.getUserKeywords(from_user, 'twitter')
                for keyword in keywords:
                    if keyword in text:
                        opsecHeader.sendEmail(keyword, "Twitter", from_user)

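# Illustrative helper (not in the original source): the two Twitter code paths
# above parse different created_at formats. The v1 Search API returns dates
# like "Mon, 01 Oct 2012 12:34:56 +0000" (handled with time.strptime in
# searchTwitter), while user_timeline puts the year last, e.g.
# "Mon Oct 01 12:34:56 +0000 2012" (handled with email.utils.parsedate in
# getUserTweets). email.utils.parsedate copes with both shapes, so a single
# conversion helper along these lines would cover both endpoints.


def twitterDateToEpoch(created_at):
    # Parse either Twitter date format and convert the UTC struct_time to
    # epoch seconds, matching the comparisons made against stored epochs above.
    return calendar.timegm(email.utils.parsedate(created_at))
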
def searchFacebook(raw_keyword):
    opsecHeader.writeLastCheckedTime('facebook')
    keyword = urllib2.quote(raw_keyword)

    # See https://developers.facebook.com/docs/reference/api/
    #
    # Arguments:
    #   q = keyword we are searching for
    #   type = kind of object we are searching for, e.g. post
    #
    # Returns:
    #   name; id (facebook.com/id for their profile)

    facebookLatestEpoch = getLatestPostTime()
    facebookQueryString = 'https://graph.facebook.com/search?q=' + keyword + '&type=post'
    opsecHeader.queryWebsiteJSON("facebook", facebookQueryString)

    print "Parsing Facebook data..."
    facebookResults = opsecHeader.readResultsJSON('facebook')
    facebookAllResults = facebookResults['data']

    if facebookAllResults:
        for x in facebookAllResults:
            if 'message' in x:
                message = x['message'].encode('utf-8')
                name = x['from']['name'].encode('utf-8')
                user_id = x['from']['id'].encode('utf-8')
                updated_time = x['updated_time'].encode('utf-8')
                epoch_time = calendar.timegm(time.strptime(updated_time, '%Y-%m-%dT%H:%M:%S+0000'))
                if int(epoch_time) > int(facebookLatestEpoch):
                    profilePicture = getProfilePicture(user_id)
                    writeLatestPost(name, user_id, message, profilePicture, updated_time, keyword, epoch_time)
                    opsecHeader.sendEmail(keyword, "Facebook")
                    print "Updated Time: " + updated_time
                else:
                    print "Post too old."
