def _split_from_epoch(time_range: TimeRange) -> (TimeRange, TimeRange):
    last_epoch = api_settings.get_last_epoch()
    # The range lies entirely before the last epoch.
    if time_range.high <= last_epoch:
        return time_range, None
    # The range lies entirely after the last epoch.
    if time_range.low >= last_epoch:
        return None, time_range
    # The range straddles the epoch: split it into two disjoint parts.
    before_epoch = TimeRange(time_range.low, last_epoch)
    after_epoch = TimeRange(last_epoch + 1, time_range.high)
    return before_epoch, after_epoch
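# A minimal usage sketch of _split_from_epoch (not part of the module). It assumes
# api_settings.get_last_epoch() returns 1000; the helper name _demo_split_from_epoch
# is hypothetical and added only for illustration.
def _demo_split_from_epoch():
    # Entirely before the epoch: (TimeRange(500, 900), None).
    before, after = _split_from_epoch(TimeRange(500, 900))
    # Entirely after the epoch: (None, TimeRange(1200, 1500)).
    before, after = _split_from_epoch(TimeRange(1200, 1500))
    # Straddling the epoch: (TimeRange(800, 1000), TimeRange(1001, 1500)).
    before, after = _split_from_epoch(TimeRange(800, 1500))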
def collect_posts_from_subreddit(self, subreddit: str, coin: CoinType, time_range: TimeRange, limit: int):
    print("RealtimeRedditCrawler:", "Collecting from", subreddit, "with time range", time_range)
    posts = []
    coin_subreddit = self.spider.subreddit(subreddit)
    # Submissions are iterated from newest to oldest.
    for submission in coin_subreddit.new(limit=limit):
        created_time = int(submission.created_utc)
        # Too new: skip and keep scanning for older submissions.
        if time_range.is_higher(created_time):
            continue
        # Too old: no later submission can fall within the range.
        if time_range.is_lower(created_time):
            break
        print("RealtimeRedditCrawler:", "Found post", submission.title, "with time",
              time_to_str(created_time))
        interaction_score = calculate_interaction_score(submission.num_comments, submission.score)
        subreddit_source = "reddit/" + submission.subreddit.display_name
        # Concatenate the title and the contents of the post.
        submission_text = submission.title + submission.selftext
        submission_model = Post(unique_id="rs" + submission.id,
                                user=(submission.author.name if submission.author is not None else "deleted"),
                                content=submission_text,
                                interaction=interaction_score,
                                source=subreddit_source.lower(),
                                time=created_time,
                                coin_type=coin)
        posts.append(submission_model)
        if not self.settings.collect_comments:
            continue
        # Re-fetch the submission and expand the comments.
        submission = self.spider.submission(id=submission.id)
        submission.comments.replace_more(limit=3)
        # Iterate over all the comments.
        for top_comment in submission.comments.list():
            if isinstance(top_comment, MoreComments):
                continue
            # Discard the comments with no content and deleted comments.
            if top_comment.body is None or top_comment.author is None or top_comment.body.strip() == '':
                continue
            comment_interaction_score = calculate_interaction_score(len(top_comment.replies), top_comment.score)
            comment_model = Post(unique_id="rc" + top_comment.id,
                                 user=(top_comment.author.name if top_comment.author is not None else "deleted"),
                                 content=top_comment.body,
                                 interaction=comment_interaction_score,
                                 source=subreddit_source.lower(),
                                 time=int(top_comment.created_utc),
                                 coin_type=coin)
            posts.append(comment_model)
    return posts
def _example_pull_request():
    """An example price history pull for reference and debugging purposes."""
    plt.plot(
        list(
            pull_coin_history(CoinType.doge, TimeRange(1559347200, 1612137600),
                              "1h")["Price"]))
    plt.show()
def read_uncached(self, time_range: TimeRange):
    collector_state = self.collector.state()
    if self.dynamic_low:
        last_collected = db.session.query(func.max(self.model.time))\
            .filter(self.model.type == collector_state)\
            .scalar()
        if last_collected is None or last_collected < time_range.low:
            new_low = api_settings.GENESIS if last_collected is None else last_collected
            print("UncachedReader: Adjusting the time range start from", time_range.low,
                  "to", new_low, "for", collector_state)
            time_range.low = new_low
    pre_query = self.model.query\
        .filter(self.model.time <= time_range.high)\
        .filter(self.model.time > time_range.low)\
        .filter(self.model.type == collector_state)
    # First, remove the old data.
    print("UncachedReader: Found", pre_query.count(), "old rows.")
    if self.replace_old:
        print("UncachedReader: Removing the old rows...")
        pre_query.delete()
        db.session.commit()
    interval_generator = (time_range, )
    if self.save_interval is not None:
        interval_generator = closed_distinct_intervals(time_range, self.save_interval)
    # Then, collect the new data.
    for tr in interval_generator:
        print("UncachedReader: Initiating the collection within", tr)
        while True:
            try:
                collected = list(tqdm(self.collector.collect(tr), "Collecting..."))
                break
            except Exception as e:
                print("UncachedReader: Encountered an error", e)
                if not self.retry_on_error:
                    print("UncachedReader: Discarding...")
                    collected = []
                    break
                print("UncachedReader: Retrying...")
        print("UncachedReader: Successfully collected", len(collected),
              "points. Saving into the database.")
        # Set the type of the model to the operation description/collector state.
        for c in collected:
            c.type = collector_state
        db.session.bulk_save_objects(collected)
        db.session.commit()
    # Now, read the data back from the database.
    inserted = db.session.query(self.model)\
        .filter(self.model.time <= time_range.high)\
        .filter(self.model.time >= time_range.low)\
        .filter(self.model.type == collector_state)\
        .all()
    return inserted
def prepare_change_map(self, before_epoch_model, after_epoch_model, curr_time):
    for time_window_str, time_window_seconds in api_settings.TRIGGER_TIME_WINDOW_TO_SECONDS.items():
        self.post_count_change_map[time_window_str] = {}
        effective_time_range = TimeRange(curr_time - time_window_seconds, curr_time)
        for aggr_source in itertools.chain(map(lambda c: "coin:" + c.value, self.coins),
                                           map(lambda s: "source:" + s, self.sources)):
            coin_change = self._calculate_aggregate_percent_change(effective_time_range,
                                                                   before_epoch_model,
                                                                   after_epoch_model,
                                                                   aggr_source)
            self.post_count_change_map[time_window_str][aggr_source] = coin_change
def update_stream():
    from_time = api_settings.get_last_aggr_stream_time(default=GENESIS)
    to_time = api_settings.get_last_streamed_post_time(default=None)
    if to_time is None or from_time >= to_time:
        return "no new streamed posts"
    effective_time_range = TimeRange(from_time + 1, to_time)
    print("Update stream endpoint: Updating within", effective_time_range)
    create_streamed_aggregate_post_counts(COINS, [], effective_time_range)
    # TODO: Deploy notifications.
    db.session.commit()
    return "ok"
def update_posts():
    from_time = api_settings.get_last_aggr_post_time(default=GENESIS)
    to_time = api_settings.get_last_crawled_post_time(default=None)
    if to_time is None or from_time >= to_time:
        return "no new posts"
    effective_time_range = TimeRange(from_time + 1, to_time)
    print("Update posts endpoint: Updating within", effective_time_range)
    # groups = list(filter(lambda s: s.startswith("*"), get_all_sources()))
    # create_aggregate_post_impacts(coin_types, groups, effective_time_range)
    create_aggregate_post_counts(COINS, [], effective_time_range)
    update_post_impacts(effective_time_range)
    return "ok"
def collect(self, time_range: TimeRange) -> iter:
    divisor = time.time() - self.settings.realtime_threshold
    archived_range, realtime_range = time_range.split(divisor)
    print("RedditMultiplexedCrawler: Archived range is", archived_range,
          "and realtime range is", realtime_range)
    if archived_range is not None:
        print("RedditMultiplexedCrawler: Collecting from the archived crawler.")
        yield from self.archived.collect(archived_range)
    if realtime_range is not None:
        print("RedditMultiplexedCrawler: Collecting from the realtime crawler.")
        yield from self.realtime.collect(realtime_range)
def collect_prices():
    curr_time = request.form.get("time", type=int, default=None)
    if curr_time is None:
        print("Collect prices endpoint: Invalid time. Using current time.")
        curr_time = time.time()
    from_time = api_settings.get_last_price_time(default=GENESIS)
    effective_time_range = TimeRange(from_time + 1, curr_time)
    print("Collect prices endpoint: Collecting new prices within", effective_time_range)
    old_threshold = int(time.time()) - delta_time.days(5)
    if effective_time_range.in_range(old_threshold):
        old_effective_time_range = TimeRange(effective_time_range.low, old_threshold + 5)
        effective_time_range = TimeRange(old_threshold + 5, effective_time_range.high)
        print("old effective", old_effective_time_range)
        print("new effective", effective_time_range)
        old_price_reader = UncachedReader(YahooPriceCrawler(resolution="1h"), Price, dynamic_low=False)
        for coin in COINS:
            old_price_reader.collector.update_coin(coin)
            old_price_reader.read_uncached(old_effective_time_range)
    price_reader = UncachedReader(YahooPriceCrawler(resolution="1m"), Price, dynamic_low=False)
    for coin in COINS:
        price_reader.collector.update_coin(coin)
        price_reader.read_uncached(effective_time_range)
    return "ok"
def read(self, time_range: TimeRange, price_window: int) -> (list, list):
    print("DataReader: Invoked to run within", time_range)
    # Collect all the posts within the time range.
    posts = sorted(reduce(list.__add__,
                          map(lambda c: c.read_cached(time_range), self.cached_post_readers),
                          []),
                   key=lambda x: x.time)
    # Collect all the possible prices according to the window.
    min_price_time = time_range.low - price_window
    max_price_time = time_range.high + price_window
    prices = self.cached_price_reader.read_cached(TimeRange(min_price_time, max_price_time))
    # Sort and return.
    return posts, sorted(prices, key=lambda x: x.time)
def __init__(self, **kwargs):
    if len(kwargs) == 3:
        post = kwargs["post"]
        prices = kwargs["prices"]
        vectorizer = kwargs["vectorizer"]
        self.X = CryptoSpeculationX(post=post, vectorizer=vectorizer)
        # Keep only the prices within a 15-day window around the post time.
        self.y = CryptoSpeculationY(prices=list(
            filter(
                lambda price: TimeRange(post.time - 60 * 60 * 24 * 15,
                                        post.time + 60 * 60 * 24 * 15).in_range(price.time),
                prices)))
    elif len(kwargs) == 6:
        self.X = CryptoSpeculationX(content=kwargs["content"],
                                    user=kwargs["user"],
                                    source=kwargs["source"],
                                    coin=kwargs["coin"],
                                    interaction=kwargs["interaction"])
        self.y = CryptoSpeculationY(impact=kwargs["impact"])
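# A minimal construction sketch for the class defining the __init__ above. The class
# name "CryptoSpeculationDataPoint" is an assumed placeholder and the field values are
# purely illustrative; only the two keyword-argument forms mirror the code above.
def _demo_data_point(post, prices, vectorizer, impact):
    # Vectorized form (3 kwargs): built from a crawled Post, the surrounding prices and a vectorizer.
    from_post = CryptoSpeculationDataPoint(post=post, prices=prices, vectorizer=vectorizer)
    # Raw form (6 kwargs): built directly from already-extracted fields and a precomputed impact.
    from_fields = CryptoSpeculationDataPoint(content="To the moon!",
                                             user="example_user",
                                             source="reddit/bitcoin",
                                             coin=CoinType.btc,
                                             interaction=42,
                                             impact=impact)
    return from_post, from_fields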
def _example():
    from data.collector.yahoo import YahooPriceCrawler
    from data.collector.reddit import ArchivedRedditCrawler
    from data.collector.twitter import TwitterCrawler
    dataset = CryptoSpeculationDataset(
        "sample_set_2020_2021",
        social_media_crawlers=[
            ArchivedRedditCrawler(interval=60 * 60 * 24 * 60,
                                  api_settings={
                                      'limit': 100,
                                      'score': '>7'
                                  },
                                  collect_comments=True)
        ],
        price_crawler=YahooPriceCrawler(resolution="1h"),
        coin_types=[CoinType.btc],
        time_range=TimeRange(1577836800, 1578836800))
    dataset.save()
def start(mailer: Mailer):
    social_media_crawlers = [
        TwitterCrawler(),
        ArchivedRedditCrawler(interval=60 * 60 * 24 * 7,
                              api_settings={
                                  'limit': 2000,
                                  'score': '>4'
                              }),
        RealtimeRedditCrawler()
    ]
    price_crawler = YahooPriceCrawler(resolution="1h")
    data_reader = DataReader(social_media_crawlers=[], price_crawler=price_crawler)
    coin_types = [CoinType.btc, CoinType.eth, CoinType.doge]
    while True:
        # Wait until aligned with sleep_interval
        t = int(time.time())
        # while t % SLEEP_INTERVAL != 0:
        #     t = int(time.time())
        #     time.sleep(1)
        effective_time_range = TimeRange(t - SLEEP_INTERVAL + 1, t)
        print("Novel data collection initiated...")
        new_posts = []
        for c in coin_types:
            data_reader.update_coin_type(c)
            posts, _ = data_reader.read(effective_time_range, SLEEP_INTERVAL)
            # new_posts += posts
        # Post-processing...
        # update_impacts(new_posts)
        groups = list(filter(lambda s: s.startswith("*"), get_all_sources()))
        # create_aggregate_post_impacts(coin_types, groups, effective_time_range)
        create_aggregate_post_counts(coin_types, [], effective_time_range)
        # Deploy the web-site notifications.
        # affected_triggers = deploy_notifications(t, coin_types, groups)
        # Find the affected triggers that should be notified by e-mail.
        # mail_triggers = list(filter(lambda t: t.follow.notify_email, affected_triggers))
        # Send the appropriate e-mails...
        # mailer.deploy_mails(mail_triggers)
        time.sleep(SLEEP_INTERVAL)
        break
def collect_posts():
    curr_time = request.form.get("time", type=int, default=None)
    if curr_time is None:
        print("Collect posts endpoint: Invalid time. Using current time.")
        curr_time = time.time()
    from_time = api_settings.get_last_crawled_post_time(default=api_settings.GENESIS)
    effective_time_range = TimeRange(from_time + 1, curr_time)
    print("Collect posts endpoint: Collecting new posts within", effective_time_range)
    archived_reddit_crawler = ArchivedRedditCrawler(interval=delta_time.days(1),
                                                    api_settings={'limit': 2000})
    realtime_reddit_crawler = RealtimeRedditCrawler()
    social_media_crawlers = [
        TwitterCrawler(),
        RedditMultiplexedCrawler(delta_time.days(2), realtime_reddit_crawler,
                                 archived_reddit_crawler)
    ]
    cached_post_readers = list(
        map(lambda c: UncachedReader(c, Post, save_interval=delta_time.days(10)),
            social_media_crawlers))
    new_posts = []
    for coin in COINS:
        print("Collect posts endpoint: Switching coin to", coin.value)
        for cr in cached_post_readers:
            cr.collector.update_coin(coin)
            new_posts += cr.read_uncached(effective_time_range)
    return "ok"
def collect(self, time_range: TimeRange) -> iter: print("ArchivedRedditCrawler: Initiated collection within", time_range) for t in range(time_range.low, time_range.high + 1, self.settings.interval): tr = TimeRange(t, min(t + self.settings.interval, time_range.high)) print("ArchivedRedditCrawler: Collecting within", tr) for subreddit in COIN_SUBREDDITS[self.settings.coin.value]: sbm = self.api.search_submissions(subreddit=subreddit, before=tr.high, after=tr.low, **self.settings.api_settings) for p in sbm: content = p.title + (" " + p.selftext if hasattr( p, 'selftext') else "") yield Post(coin_type=self.settings.coin, user=p.author, content=content, source="reddit/" + subreddit.lower(), interaction=calculate_interaction_score( p.num_comments, p.score), time=p.created_utc, unique_id="rs" + p.id) # Skip collecting the comments. if not self.settings.collect_comments: continue cmt = self.api.search_comments(subreddit=subreddit, before=tr.high, after=tr.low, **self.settings.api_settings) for p in cmt: yield Post(coin_type=self.settings.coin, user=p.author, content=p.body, source="reddit/" + subreddit.lower(), interaction=calculate_interaction_score( 0, p.score), time=p.created_utc, unique_id="rc" + p.id)
def _collect_dataset():
    from data.collector.yahoo import YahooPriceCrawler
    from data.collector.reddit import ArchivedRedditCrawler
    from data.collector.twitter import TwitterCrawler
    print("Collecting dataset")
    dataset = CryptoSpeculationDataset(
        "Jun19_May21_Big",
        social_media_crawlers=[
            ArchivedRedditCrawler(interval=60 * 60 * 24 * 7,
                                  api_settings={
                                      'limit': 1000,
                                      'score': '>7'
                                  }),
            TwitterCrawler(only_users=True)
        ],
        price_crawler=YahooPriceCrawler(resolution="1h"),
        coin_types=[
            CoinType.btc, CoinType.eth, CoinType.doge, CoinType.ada,
            CoinType.link, CoinType.ltc, CoinType.omg, CoinType.xlm,
            CoinType.xrp
        ],
        time_range=TimeRange(1560556800, 1619827200))
    dataset.save()
def cached_range_reader(range_type: str):
    cached_ranges = CachedRange.query.filter_by(type=range_type)
    return list(map(lambda cr: TimeRange(cr.low, cr.high), cached_ranges))
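# A small usage sketch built on cached_range_reader. The range_type value passed in is
# illustrative; the real keys are whatever the CachedRange rows were stored with
# (e.g. a collector state string), and _demo_is_fully_cached is a hypothetical helper.
def _demo_is_fully_cached(range_type: str, time_range: TimeRange) -> bool:
    # True if a single cached range fully covers the requested time range.
    return any(cr.low <= time_range.low and time_range.high <= cr.high
               for cr in cached_range_reader(range_type))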