def __init__(self, db_address, load_model_name=None, model_config_name=None):
    """Load the author blacklist and, optionally, the spam-detection model.

    Args:
        db_address: Mongo connection string used for all collection reads.
        load_model_name: path/name of saved model weights; when falsy, no
            model is loaded and the model-related attributes stay unset.
        model_config_name: name of the training config describing the model
            (alphabet, max sequence length, output size).
    """
    self.posts = set()
    self.past_votes_len = 0
    self.db_address = db_address

    # Blacklist entries are keyed by Mongo '_id' (the author identifier).
    db_blacklist, _ = load_data(self.db_address, BLACK_LIST_COL)
    self.blacklist = set(x['_id'] for x in db_blacklist)
    logger.info("Blacklist loaded: %d", len(self.blacklist))

    if load_model_name:
        # Imported lazily so blacklist-only usage works without the
        # model stack installed.
        from steevebase.model import models

        model_config = load_training_config(model_config_name)
        self.output_size = model_config['MODEL']['OUTPUT_SIZE']

        data_config = model_config['DATA']
        self.max_seq_len = data_config['MAX_SEQ_LEN']
        # +1: one embedding index reserved for unknown characters.
        self.embedding_size = len(data_config['ALPHABET']) + 1

        self.model_class = models.create_cnn_class(model_config)
        self.model_class.load_weights(load_model_name)
        # Lazy %-args (fix): matches the logging style used above and
        # defers formatting to the logging framework.
        logger.info("Model %s loaded...", load_model_name)

        self.char2ind, _ = get_char_ind_dicts(data_config['ALPHABET'])
        self.probability = 0.
def __init__(self, db_address=None, load_model_name=None, model_config_name=None):
    """Load the author blacklist and, optionally, SpamCheck plus the model.

    Args:
        db_address: Mongo connection string; when falsy, the blacklist and
            post-tracking state are not initialized.
        load_model_name: path/name of saved model weights; when falsy, no
            SpamCheck and no model are created (``self.spam_check`` is None).
        model_config_name: name of the training config describing the model.
    """
    if db_address:
        self.posts = set()
        self.past_votes_len = 0
        self.db_address = db_address

        db_blacklist, _ = load_data(self.db_address, BLACK_LIST_COL)
        self.blacklist = set(x['_id'] for x in db_blacklist)
        logger.info("Blacklist loaded: %d", len(self.blacklist))

    self.spam_check = None
    if load_model_name:
        # init spam check over the most recent 24h of posts.
        # NOTE(review): reaches here even when db_address is None —
        # confirm SpamCheck tolerates a missing db_address.
        init_time_interval_hrs = 24
        self.spam_check = SpamCheck(
            db_address=db_address,
            init_time_interval_hrs=init_time_interval_hrs)
        logger.info('SpamCheck init on last %d hours', init_time_interval_hrs)

        # Imported lazily so blacklist-only usage works without the
        # model stack installed.
        from steevebase.model import models

        model_config = load_training_config(model_config_name)
        self.output_size = model_config['MODEL']['OUTPUT_SIZE']

        data_config = model_config['DATA']
        self.max_seq_len = data_config['MAX_SEQ_LEN']
        # +1: one embedding index reserved for unknown characters.
        self.embedding_size = len(data_config['ALPHABET']) + 1

        self.model_class = models.create_cnn_class(model_config)
        self.model_class.load_weights(load_model_name)
        # Lazy %-args (fix): matches the logging style used above and
        # defers formatting to the logging framework.
        logger.info("Model %s loaded...", load_model_name)

        self.char2ind, _ = get_char_ind_dicts(data_config['ALPHABET'])
        self.probability = 0.
def __init__(self, args):
    """Set up checks, reload the last 24h of past votes, and open the stream.

    Args:
        args: parsed CLI namespace providing ``db_address``,
            ``load_model_name``, ``model_config_name`` and ``steem_address``.
    """
    assert args.db_address.startswith("mongodb://")
    self.checks = Checks(args.db_address, args.load_model_name,
                         args.model_config_name)
    self.posts = set()
    self.df_current_posts = pd.DataFrame({'post': [], 'pending_times': []})
    self.db_address = args.db_address

    # Reload votes cast in the last 24 hours so a restart cannot re-vote
    # on posts we already handled.
    time_now = datetime.utcnow()
    minusdelta24 = time_now - timedelta(hours=24)
    query = {"vote_time": {"$gt": minusdelta24}}
    old_voted_posts_iter, _ = load_data(self.db_address, PAST_VOTES, query)

    # Fix: per-row DataFrame.append in a loop was O(n^2) and the method was
    # removed in pandas 2.0 — collect rows first, then concat once.
    # NOTE(review): epoch ints are stored into a column declared
    # datetime64[ns]; preserved as-is, but confirm downstream comparisons.
    past_rows = [{
        'vote_time': int(post["vote_time"].timestamp()),
        '_id': post['_id'],
    } for post in old_voted_posts_iter]
    self.df_past_votes = pd.DataFrame({
        'vote_time': pd.Series(dtype='datetime64[ns]'),
        '_id': []
    })
    if past_rows:
        self.df_past_votes = pd.concat(
            [self.df_past_votes, pd.DataFrame(past_rows)],
            ignore_index=True)
    logger.info("Past votes loaded: %d", len(self.df_past_votes))

    self.steem_address = args.steem_address
    self.wif = WIF
    # Start streaming VOTING_DELAY worth of blocks in the past so posts
    # become eligible for voting as soon as we first see them.
    start_block_number = get_steem_info(
        self.steem_address)['head_block_number'] - int(
            VOTING_DELAY / BLOCK_INTERVAL)
    self.stream = stream_ops('comment',
                             steem_address=self.steem_address,
                             wif=self.wif,
                             start_block_number=start_block_number)
    self.a_lock = allocate_lock()
def check_author_not_spamming(self, post):
    """Return True when *post*'s author is not posting excessively.

    A post older than the processing delay is waved through (it was already
    screened on a previous pass). Otherwise the author's post count over the
    last 24 hours is compared against SPAM_LIMIT; an author far over the
    limit (more than twice) is also added to the blacklist.

    Returns False if the post's fields cannot be read at all.
    """
    try:
        post_author = post['author']
        post_created = post['created']
    except PostDoesNotExist:
        return False

    # Check whether called in Voter or in Wrangler - not nice :(
    post_age = datetime.utcnow() - post_created
    if post_age >= timedelta(hours=DELAY / (3600 / 3)):  # DELAY in number of blocks
        return True

    # Count the author's posts created within the trailing 24-hour window.
    window_start = datetime.utcnow() - timedelta(hours=24)
    recent_query = {'author': post_author, 'created': {'$gte': window_start}}
    _, recent_count = load_data(self.db_address, RAW_POSTS_COL, recent_query)

    if recent_count > 2 * SPAM_LIMIT:
        self.add_to_blacklist(post_author, recent_count)

    return recent_count < SPAM_LIMIT
def wrangler_iterator(input_name, blacklist_db_address, block_step=BLOCK_STEP, query=None):
    """Yield batches of cleaned, filtered, feature-extracted raw posts.

    Args:
        input_name: Mongo address of the raw-posts database.
        blacklist_db_address: Mongo address used to initialize ``Checks``.
        block_step: maximum number of posts per yielded batch.
        query: optional Mongo query restricting which raw posts are read.

    Yields:
        Lists of transformed posts, one list per batch (possibly empty after
        filtering).
    """
    # Local import keeps this fix self-contained within the block.
    from itertools import islice

    posts_count = 0
    filtered_count = 0
    checks = Checks(db_address=blacklist_db_address)

    raw_posts_iter, raw_posts_count = load_data(
        input_name, col_name=RAW_POSTS_COLLECTION_NAME, query=query)

    num_batches = math.ceil(raw_posts_count / block_step)
    for batch in range(num_batches):
        # Last batch may be shorter than block_step.
        step = min(block_step, raw_posts_count - block_step * batch)

        # Fix: islice instead of repeated next() — if the cursor yields
        # fewer documents than the reported count, next() would raise
        # StopIteration, which PEP 479 converts into RuntimeError inside
        # a generator; islice simply stops early.
        posts = [clean_post(raw) for raw in islice(raw_posts_iter, step)]
        posts_count += len(posts)

        # filter
        filtered_posts = [
            post for post in posts
            if checks.check_conditions(post, Checks.checklist_all)
        ]
        filtered_count += len(filtered_posts)

        # transform
        transformed_posts = [extract_features(post) for post in filtered_posts]

        logger.info("Number of posts processed: %d / %d", posts_count,
                    raw_posts_count)
        yield transformed_posts

    logger.info("Total clean posts: %d / %d", filtered_count, raw_posts_count)
def fetch_training_data(mongo_address, clean_posts_col_name, start_time):
    """Load clean posts created at or after *start_time*.

    Thin wrapper around ``load_data`` with a '$gte' filter on 'created';
    returns whatever ``load_data`` returns for that collection/query.
    """
    created_filter = {'created': {'$gte': start_time}}
    return load_data(mongo_address, clean_posts_col_name, created_filter)