# The following test_* methods exercise checkFilamentBlacklist.checkBlacklist().
# They are unittest.TestCase methods; the enclosing class and the setUp that
# provides self.user_id and self.account_id are not shown in this fragment
# (a sketch of that scaffolding appears after test_blocked below).
def test_badparams(self):
    # missing or partial params should return False rather than raise
    self.assertFalse(checkFilamentBlacklist.checkBlacklist())
    self.assertFalse(
        checkFilamentBlacklist.checkBlacklist(user_id=self.user_id))
def test_unblocked(self):
    # use acct id -1 to test an account that can't be in the blacklist
    self.assertFalse(
        checkFilamentBlacklist.checkBlacklist(user_id=self.user_id,
                                              account_id=-1))
def test_nodb(self):
    # stash the live db link (so tearDown can restore it), then verify the
    # check returns False when no database connection is available
    self.dblink = infoModule.info.site['dblink']
    infoModule.info.site['dblink'] = None
    self.assertFalse(
        checkFilamentBlacklist.checkBlacklist(user_id=self.user_id,
                                              account_id=self.account_id))
import json
import pprint
import re
import time

# project-local modules used below; import paths are assumed from this fragment
import accountFinder
import addFollowing
import checkFilamentBlacklist
import hashtagUpdate
import infoModule
import mysql_tools
import storeFilament
import store_discussion_element
import storeUrlContents


def scanStreams(user_id):
    ## scanStreams:
    ## params: user_id
    ## returns: bool
    ## base function for reading all the streams from a user's accounts
    ## and importing them into the db.
    ## relies on module-level helpers defined elsewhere in this file:
    ## twitterThread, facebookThread, homepageThread, URLInStories,
    ## linkFilamentToStory.
    global totalStreamsRead
    global streamList
    streamList = []
    totalStreamsRead = 0
    dblink = infoModule.info.site['dblink']
    # fetch accounts, and last_read id
    sql = ("SELECT account_id, account_user_name, last_read_id, streams.name, "
           "public_key, private_key, external_id, user_id "
           "FROM peepbuzz.accounts "
           "INNER JOIN peepbuzz.streams ON accounts.stream_id=streams.stream_id "
           "WHERE user_id=" + str(user_id))
    accountsQ = mysql_tools.mysqlQuery(sql, dblink)
    try:
        accounts = accountsQ.fetch_row(0, 1)
    except AttributeError:
        # the query failed and returned no result object; bail out here
        # rather than hit a NameError on `accounts` below
        print "accountsQ error: " + sql
        return False
    if accounts == ():
        infoModule.info.errorList.append('no accounts found for user_id ' + str(user_id))
        return False
    # pprint.pprint(accounts)
    # sys.exit()
    # for each account type, launch a thread to request the stream.
    # also use this loop to create a dict for calculating the most recent
    # update to that account.
    account_latest_ids = {}
    account_latest_date = {}
    for account in accounts:
        print account['name']
        account_latest_ids[account['account_id']] = ''
        account_latest_date[account['account_id']] = 0
        # update last time read for this account
        sql = ("UPDATE peepbuzz.accounts SET last_scanned=now() "
               "WHERE account_id=" + str(account['account_id']))
        mysql_tools.mysqlQuery(sql, dblink)
        authDict = {'site_key': account['public_key'],
                    'account_password': account['private_key'],
                    'account_id': account['account_id'],
                    'last_read_id': account['last_read_id'],
                    'external_id': account['external_id'],
                    'user_id': account['user_id']}
        if account['name'] == 'twitter':
            twitterThread(authDict).start()
        elif account['name'] == 'facebook':
            facebookThread(authDict).start()
        elif account['name'] == 'homepage':
            homepageThread(account['user_id']).start()
    # wait for threads to finish
    # for tr in threading.enumerate():
    #     pprint.pprint(tr)
    # total timeout after 120 secs
    disasterTimeout = 120.0
    startTime = time.time()
    while True:
        if len(accounts) == totalStreamsRead:
            break
        # disaster timeout: give up on any thread that never reports back
        if time.time() - startTime > disasterTimeout:
            print "scanStreams timeout"
            break
        time.sleep(0.5)  # avoid a busy-wait while the reader threads run
    # urlList is the list of all urls found in this pass of reading the stream
    urlList = []
    print "streamList"
    pprint.pprint(streamList)
    # sys.exit()
    ## now we have all filaments; go through the list and process each one
    for filament in streamList:
        print '-------------------'
        print "FILAMENT"
        print filament['created']
        print filament['stream_name']
        # if filament['external_id'] == '142283309153694_175236912525000':
        #     print "pig radio"
        ## find promoter ID
        currentStreamID = infoModule.info.site['stream_name_to_id'][filament['stream_name']]
        (filament['promoter_account_id'],
         filament['promoter_account_table']) = accountFinder.accountFinder(
            currentStreamID, filament['promoter_id'], filament['promoter'],
            filament['thumbnail'])
        ## check that the promoter isn't in the blacklist; mark the record
        ## blocked if they are
        blackListed = checkFilamentBlacklist.checkBlacklist(
            user_id, filament['promoter_account_id'])
        if blackListed:
            filament['status'] = 'blocked'
        else:
            filament['status'] = 'active'
        # pprint.pprint(filament)
        # sys.exit()
        ## insert filament info into the filament table, checking to make
        ## sure it's not a dupe
        filamentJSON = json.dumps(filament)
        print "storing filament"
        filament_id = storeFilament.storeFilament(user_id, filamentJSON)
        if filament_id is False:
            continue
        filament['filament_id'] = filament_id
        # keep track of the latest item for each account. if the stream
        # isn't homepage, the timestamp is likely in facebook format.
        if filament['stream_name'] != 'homepage':
            time_in_struct = time.strptime(filament['created'][:-5],
                                           '%Y-%m-%dT%H:%M:%S')
            time_in_secs = int(time.mktime(time_in_struct))
            if time_in_secs > account_latest_date[filament['account_id']]:
                # this is the latest filament from this account
                account_latest_date[filament['account_id']] = time_in_secs
                account_latest_ids[filament['account_id']] = filament['external_id']
        else:
            # created timestamp is "now" for homepage
            current_time_in_secs = time.mktime(time.localtime())
            account_latest_date[filament['account_id']] = current_time_in_secs
            account_latest_ids[filament['account_id']] = filament['external_id']
        ## add following
        addFollowing.addFollowing(user_id, account_id=filament['promoter_account_id'])
        ## update hashtag stats if appropriate
        if 'hashtags' in filament:
            for hashtag in filament['hashtags']:
                hashtagUpdate.hashtagUpdate(hashtag)
        ## insert discussions
        pprint.pprint(filament['discussion'])
        if len(filament['discussion']) > 0:
            discussion_json = json.dumps(filament['discussion'])
            store_discussion_element.store_discussion(discussion_json, filament_id)
        ## is there a url in the filament? append it to a url list structure
        ## that contains the account ID and the url
        if filament['urls'] is not None:
            urlsInFilament = filament['urls']
            if len(urlsInFilament) > 0:
                print "urls to read:"
                pprint.pprint(urlsInFilament)
                # only add the uniques
                for URL in urlsInFilament:
                    try:
                        urlList.index(URL)
                    except ValueError:
                        # check against failed_urls. NB: the URL is
                        # interpolated into the SQL unescaped, as in the
                        # original code.
                        sql = ("SELECT original_url FROM peepbuzz.failed_urls "
                               "WHERE original_url='" + URL + "'")
                        check_failed_query = mysql_tools.mysqlQuery(sql, dblink)
                        if check_failed_query.num_rows() == 0:
                            # and test against the db
                            story_id = URLInStories(URL)
                            if story_id > 0:
                                # story already exists; just link it to the filament
                                linkFilamentToStory(filament['filament_id'], story_id)
                            else:
                                ## add url to original_url in stories, then
                                ## link the filament, after stripping
                                ## utm_source tracking params
                                URL = re.sub(r'\?utm_source.*$', '', URL)
                                urlList.append((URL, filament['account_id']))
                                sql = ("INSERT INTO peepbuzz.stories SET "
                                       "original_url='" + str(URL) + "'")
                                mysql_tools.mysqlQuery(sql, dblink)
                                story_id = dblink.insert_id()
                                linkFilamentToStory(filament['filament_id'], story_id)
    # now, outside of this loop, fetch all urls in parallel
    print "urls to fetch"
    pprint.pprint(urlList)
    if not urlList:
        print "no urls to read"
    else:
        # urlStoringThread(urlList).start()
        print "storing exploded URLs"
        storiesFound = storeUrlContents.storeUrlContents(json.dumps(urlList))
        print "stories read: " + str(storiesFound)
    # record the newest external id seen for each account
    for key in account_latest_ids:
        sql = ("UPDATE peepbuzz.accounts SET last_read_id='" +
               str(account_latest_ids[key]) + "' WHERE account_id=" + str(key))
        mysql_tools.mysqlQuery(sql, dblink)
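# A minimal driver sketch, not part of the original module: it assumes the
# surrounding application has already initialised infoModule.info.site
# (in particular 'dblink' and 'stream_name_to_id'), and the user_id value
# below is purely illustrative.
if __name__ == '__main__':
    example_user_id = 42  # hypothetical id, for illustration only
    if scanStreams(example_user_id) is False:
        print "scanStreams failed: " + ", ".join(infoModule.info.errorList)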
def test_blocked(self):
    # self.account_id is expected to have been seeded into the blacklist
    # by the test fixture
    self.assertTrue(
        checkFilamentBlacklist.checkBlacklist(user_id=self.user_id,
                                              account_id=self.account_id))
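# A sketch of the scaffolding the test_* methods above assume; the four
# methods would live inside a class like this one. The class name and
# fixture values are hypothetical; only the attributes the tests actually
# read (user_id, account_id, dblink) are grounded in the code above.
import unittest

class CheckBlacklistTests(unittest.TestCase):

    def setUp(self):
        self.user_id = 1     # hypothetical fixture value
        self.account_id = 1  # a real suite would seed this id into the
                             # blacklist table so test_blocked passes

    def tearDown(self):
        # restore the db link that test_nodb nulls out
        if getattr(self, 'dblink', None) is not None:
            infoModule.info.site['dblink'] = self.dblink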