def _split_from_epoch(time_range: TimeRange) -> (TimeRange, TimeRange):
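     # Split the given time range into the part at or before the last epoch and the part
     # strictly after it. Either part may be None if the range lies wholly on one side.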
     last_epoch = api_settings.get_last_epoch()
     if time_range.high <= last_epoch:
         return time_range, None
     if time_range.low >= last_epoch:
         return None, time_range
     before_epoch = TimeRange(time_range.low, last_epoch)
     after_epoch = TimeRange(last_epoch + 1, time_range.high)
     return before_epoch, after_epoch
 def collect_posts_from_subreddit(self, subreddit: str, coin: CoinType,
                                  time_range: TimeRange, limit: int):
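      # Iterate over the newest submissions of the subreddit and convert the ones that fall
      # within the requested time range (and optionally their comments) into Post models.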
     print("RealtimeRedditCrawler:", "Collecting from", subreddit,
           "with time range", time_range)
     posts = []
     coin_subreddit = self.spider.subreddit(subreddit)
     for submission in coin_subreddit.new(limit=limit):
         created_time = int(submission.created_utc)
         if time_range.is_higher(created_time):
             continue
         if time_range.is_lower(created_time):
             break
         print("RealtimeRedditCrawler:", "Found post", submission.title,
               "with time", time_to_str(created_time))
         interaction_score = calculate_interaction_score(
             submission.num_comments, submission.score)
         subreddit_source = "reddit/" + submission.subreddit.display_name
         # Concatenate the title and the contents of the post.
          submission_text = submission.title + " " + submission.selftext
         submission_model = Post(
             unique_id="rs" + submission.id,
             user=(submission.author.name
                   if submission.author is not None else "deleted"),
             content=submission_text,
             interaction=interaction_score,
             source=subreddit_source.lower(),
             time=created_time,
             coin_type=coin)
         posts.append(submission_model)
          # Skip fetching and expanding the comments if comment collection is disabled.
          if not self.settings.collect_comments:
              continue
          # Re-fetch the submission and expand its comment tree.
          submission = self.spider.submission(id=submission.id)
          submission.comments.replace_more(limit=3)
         # Iterate over all the comments.
         for top_comment in submission.comments.list():
             if isinstance(top_comment, MoreComments):
                 continue
             # Discard the comments with no content and deleted comments.
              if (top_comment.body is None or top_comment.author is None
                      or top_comment.body.strip() == ''):
                  continue
             comment_interaction_score = calculate_interaction_score(
                 len(top_comment.replies), top_comment.score)
             comment_model = Post(
                 unique_id="rc" + top_comment.id,
                 user=(top_comment.author.name
                       if top_comment.author is not None else "deleted"),
                 content=top_comment.body,
                 interaction=comment_interaction_score,
                 source=subreddit_source.lower(),
                 time=top_comment.created_utc,
                 coin_type=coin)
             posts.append(comment_model)
     return posts
def _example_pull_request():
    """
    An example pull request for reference and debugging purposes.
    """
    plt.plot(
        list(
            pull_coin_history(CoinType.doge, TimeRange(1559347200, 1612137600),
                              "1h")["Price"]))
    plt.show()
 def read_uncached(self, time_range: TimeRange):
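     # Collect fresh data for the given range through the underlying collector, persist it
     # (optionally replacing the old rows first), then read the saved rows back from the database.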
     collector_state = self.collector.state()
     if self.dynamic_low:
         last_collected = db.session.query(func.max(self.model.time))\
             .filter(self.model.type == collector_state)\
             .scalar()
          if last_collected is None or last_collected < time_range.low:
              # Start from the genesis time if nothing was collected before; otherwise,
              # extend the range down to the last collected point.
              new_low = api_settings.GENESIS if last_collected is None else last_collected
              print("UncachedReader: Adjusting the time range start from",
                    time_range.low, "to", new_low, "for", collector_state)
              time_range.low = new_low
     pre_query = self.model.query\
         .filter(self.model.time <= time_range.high)\
         .filter(self.model.time > time_range.low)\
         .filter(self.model.type == collector_state)
     # First, remove the old data.
     print("UncachedReader: Found", pre_query.count(), "many old rows.")
     if self.replace_old:
         print("UncachedReader: Removing the old rows...")
         pre_query.delete()
         db.session.commit()
     interval_generator = (time_range, )
     if self.save_interval is not None:
         interval_generator = closed_distinct_intervals(
             time_range, self.save_interval)
     # Then, collect the new data.
     for tr in interval_generator:
         print("UncachedReader: Initiating the collection within", tr)
         while True:
             try:
                 collected = list(
                     tqdm(self.collector.collect(tr), "Collecting..."))
                 break
             except Exception as e:
                 print("UncachedReader: Encountered an error", e)
                 if not self.retry_on_error:
                     print("UncachedReader: Discarding...")
                     collected = []
                     break
                 print("UncachedReader: Retrying...")
         print("UncachedReader: Successfully collected", len(collected),
               "points. Saving into the database.")
         # Set the type of the model to the operation description/collector state.
         for c in collected:
             c.type = collector_state
         db.session.bulk_save_objects(collected)
         db.session.commit()
     # Now, read the data back from the database.
     inserted = db.session.query(self.model)\
         .filter(self.model.time <= time_range.high)\
         .filter(self.model.time >= time_range.low)\
         .filter(self.model.type == collector_state)\
         .all()
     return inserted
 def prepare_change_map(self, before_epoch_model, after_epoch_model, curr_time):
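      # For each trigger time window, compute the percent change in post counts for every
      # coin and source aggregation, keyed first by window and then by aggregation source.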
     for time_window_str, time_window_seconds in api_settings.TRIGGER_TIME_WINDOW_TO_SECONDS.items():
         self.post_count_change_map[time_window_str] = {}
         effective_time_range = TimeRange(curr_time - time_window_seconds, curr_time)
         for aggr_source in itertools.chain(map(lambda c: "coin:" + c.value, self.coins),
                                            map(lambda s: "source:" + s, self.sources)):
             coin_change = self._calculate_aggregate_percent_change(effective_time_range, before_epoch_model,
                                                                    after_epoch_model, aggr_source)
             self.post_count_change_map[time_window_str][aggr_source] = coin_change
def update_stream():
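    # Aggregate post counts for the posts streamed since the last stream aggregation.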
    from_time = api_settings.get_last_aggr_stream_time(default=GENESIS)
    to_time = api_settings.get_last_streamed_post_time(default=None)
    if to_time is None or from_time >= to_time:
        return "no new streamed posts"
    effective_time_range = TimeRange(from_time + 1, to_time)
    print("Update stream endpoint: Updating within", effective_time_range)
    create_streamed_aggregate_post_counts(COINS, [], effective_time_range)
    # TODO: Deploy notifications.
    db.session.commit()
    return "ok"
def update_posts():
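    # Aggregate post counts and recompute post impacts for the posts crawled since the last aggregation.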
    from_time = api_settings.get_last_aggr_post_time(default=GENESIS)
    to_time = api_settings.get_last_crawled_post_time(default=None)
    if to_time is None or from_time >= to_time:
        return "no new posts"
    effective_time_range = TimeRange(from_time + 1, to_time)
    print("Update posts endpoint: Updating within", effective_time_range)
    # groups = list(filter(lambda s: s.startswith("*"), get_all_sources()))
    # create_aggregate_post_impacts(coin_types, groups, effective_time_range)
    create_aggregate_post_counts(COINS, [], effective_time_range)
    update_post_impacts(effective_time_range)
    return "ok"
 def collect(self, time_range: TimeRange) -> iter:
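      # Split the requested range at the realtime threshold: the older part is delegated to
      # the archived crawler and the newer part to the realtime crawler.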
     divisor = time.time() - self.settings.realtime_threshold
     archived_range, realtime_range = time_range.split(divisor)
     print("RedditMultiplexedCrawler: Archived range is", archived_range, "and realtime range is", realtime_range)
      if archived_range is not None:
          print("RedditMultiplexedCrawler: Collecting from the archived crawler.")
          yield from self.archived.collect(archived_range)
      if realtime_range is not None:
          print("RedditMultiplexedCrawler: Collecting from the realtime crawler.")
          yield from self.realtime.collect(realtime_range)
def collect_prices():
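    # Collect new price points from the last saved price time up to the requested (or current) time.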
    curr_time = request.form.get("time", type=int, default=None)
    if curr_time is None:
        print("Collect prices endpoint: Invalid time. Using current time.")
        curr_time = time.time()
    from_time = api_settings.get_last_price_time(default=GENESIS)
    effective_time_range = TimeRange(from_time + 1, curr_time)
    print("Collect prices endpoint: Collecting new prices within", effective_time_range)

    # Prices older than the threshold below are collected at a coarser hourly resolution,
    # while more recent prices are collected at minute resolution.
    old_threshold = int(time.time()) - delta_time.days(5)
    if effective_time_range.in_range(old_threshold):
        old_effective_time_range = TimeRange(effective_time_range.low, old_threshold + 5)
        effective_time_range = TimeRange(old_threshold + 5, effective_time_range.high)
        print("Collect prices endpoint: Old effective range is", old_effective_time_range)
        print("Collect prices endpoint: New effective range is", effective_time_range)

        old_price_reader = UncachedReader(YahooPriceCrawler(resolution="1h"), Price, dynamic_low=False)
        for coin in COINS:
            old_price_reader.collector.update_coin(coin)
            old_price_reader.read_uncached(old_effective_time_range)
    price_reader = UncachedReader(YahooPriceCrawler(resolution="1m"), Price, dynamic_low=False)
    for coin in COINS:
        price_reader.collector.update_coin(coin)
        price_reader.read_uncached(effective_time_range)
    return "ok"
 def read(self, time_range: TimeRange, price_window: int) -> (list, list):
     print("DataReader: Invoked to run within", time_range)
     # Collect all the posts within the time range.
      posts = sorted(
          reduce(list.__add__,
                 map(lambda c: c.read_cached(time_range), self.cached_post_readers),
                 []),
          key=lambda x: x.time)
     # Collect all the possible prices according to the window.
     min_price_time = time_range.low - price_window
     max_price_time = time_range.high + price_window
     prices = self.cached_price_reader.read_cached(
         TimeRange(min_price_time, max_price_time))
     # Sort and return.
     return posts, sorted(prices, key=lambda x: x.time)
 def __init__(self, **kwargs):
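      # Two construction modes: from a raw post, its surrounding prices and a vectorizer
      # (three kwargs), or from pre-extracted fields and a known impact value (six kwargs).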
     if len(kwargs) == 3:
         post = kwargs["post"]
         prices = kwargs["prices"]
         vectorizer = kwargs["vectorizer"]
         self.X = CryptoSpeculationX(post=post, vectorizer=vectorizer)
          price_window = TimeRange(post.time - 60 * 60 * 24 * 15,
                                   post.time + 60 * 60 * 24 * 15)
          self.y = CryptoSpeculationY(prices=list(
              filter(lambda price: price_window.in_range(price.time), prices)))
     elif len(kwargs) == 6:
         self.X = CryptoSpeculationX(content=kwargs["content"],
                                     user=kwargs["user"],
                                     source=kwargs["source"],
                                     coin=kwargs["coin"],
                                     interaction=kwargs["interaction"])
         self.y = CryptoSpeculationY(impact=kwargs["impact"])
def _example():
    from data.collector.yahoo import YahooPriceCrawler
    from data.collector.reddit import ArchivedRedditCrawler
    from data.collector.twitter import TwitterCrawler

    dataset = CryptoSpeculationDataset(
        "sample_set_2020_2021",
        social_media_crawlers=[
            ArchivedRedditCrawler(interval=60 * 60 * 24 * 60,
                                  api_settings={
                                      'limit': 100,
                                      'score': '>7'
                                  },
                                  collect_comments=True)
        ],
        price_crawler=YahooPriceCrawler(resolution="1h"),
        coin_types=[CoinType.btc],
        time_range=TimeRange(1577836800, 1578836800))
    dataset.save()
def start(mailer: Mailer):
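    # Read novel social media data for each configured coin within the last sleep interval
    # and aggregate the post counts.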
    social_media_crawlers = [
        TwitterCrawler(),
        ArchivedRedditCrawler(interval=60 * 60 * 24 * 7,
                              api_settings={
                                  'limit': 2000,
                                  'score': '>4'
                              }),
        RealtimeRedditCrawler()
    ]
    price_crawler = YahooPriceCrawler(resolution="1h")
    data_reader = DataReader(social_media_crawlers=social_media_crawlers,
                             price_crawler=price_crawler)
    coin_types = [CoinType.btc, CoinType.eth, CoinType.doge]
    while True:
        # Wait until aligned with sleep_interval
        t = int(time.time())
        # while t % SLEEP_INTERVAL != 0:
        #     t = int(time.time())
        #     time.sleep(1)
        effective_time_range = TimeRange(t - SLEEP_INTERVAL + 1, t)
        print("Novel data collection initiated...")
        new_posts = []
        for c in coin_types:
            data_reader.update_coin_type(c)
            posts, _ = data_reader.read(effective_time_range, SLEEP_INTERVAL)
            # new_posts += posts
        # Post-processing...
        # update_impacts(new_posts)
        groups = list(filter(lambda s: s.startswith("*"), get_all_sources()))
        # create_aggregate_post_impacts(coin_types, groups, effective_time_range)
        create_aggregate_post_counts(coin_types, [], effective_time_range)
        # Deploy the web-site notifications.
        # affected_triggers = deploy_notifications(t, coin_types, groups)
        # Find the affected triggers that should be notified by e-mail.
        # mail_triggers = list(filter(lambda t: t.follow.notify_email, affected_triggers))
        # Send the appropriate e-mails...
        # mailer.deploy_mails(mail_triggers)
        time.sleep(SLEEP_INTERVAL)
        break
def collect_posts():
    curr_time = request.form.get("time", type=int, default=None)
    if curr_time is None:
        print("Collect posts endpoint: Invalid time. Using current time.")
        curr_time = time.time()
    from_time = api_settings.get_last_crawled_post_time(default=api_settings.GENESIS)
    effective_time_range = TimeRange(from_time + 1, curr_time)
    print("Collect posts endpoint: Collecting new posts within", effective_time_range)
    archived_reddit_crawler = ArchivedRedditCrawler(interval=delta_time.days(1), api_settings={'limit': 2000})
    realtime_reddit_crawler = RealtimeRedditCrawler()
    social_media_crawlers = [TwitterCrawler(), RedditMultiplexedCrawler(delta_time.days(2), realtime_reddit_crawler,
                                                                        archived_reddit_crawler)]
    uncached_post_readers = list(map(lambda c: UncachedReader(c, Post, save_interval=delta_time.days(10)),
                                     social_media_crawlers))
    new_posts = []
    for coin in COINS:
        print("Collect posts endpoint: Switching coin to", coin.value)
        for cr in uncached_post_readers:
            cr.collector.update_coin(coin)
            new_posts += cr.read_uncached(effective_time_range)
    return "ok"
 def collect(self, time_range: TimeRange) -> iter:
     print("ArchivedRedditCrawler: Initiated collection within", time_range)
     for t in range(time_range.low, time_range.high + 1,
                    self.settings.interval):
         tr = TimeRange(t, min(t + self.settings.interval, time_range.high))
         print("ArchivedRedditCrawler: Collecting within", tr)
         for subreddit in COIN_SUBREDDITS[self.settings.coin.value]:
             sbm = self.api.search_submissions(subreddit=subreddit,
                                               before=tr.high,
                                               after=tr.low,
                                               **self.settings.api_settings)
             for p in sbm:
                 content = p.title + (" " + p.selftext if hasattr(
                     p, 'selftext') else "")
                 yield Post(coin_type=self.settings.coin,
                            user=p.author,
                            content=content,
                            source="reddit/" + subreddit.lower(),
                            interaction=calculate_interaction_score(
                                p.num_comments, p.score),
                            time=p.created_utc,
                            unique_id="rs" + p.id)
             # Skip collecting the comments.
             if not self.settings.collect_comments:
                 continue
             cmt = self.api.search_comments(subreddit=subreddit,
                                            before=tr.high,
                                            after=tr.low,
                                            **self.settings.api_settings)
             for p in cmt:
                 yield Post(coin_type=self.settings.coin,
                            user=p.author,
                            content=p.body,
                            source="reddit/" + subreddit.lower(),
                            interaction=calculate_interaction_score(
                                0, p.score),
                            time=p.created_utc,
                            unique_id="rc" + p.id)
def _collect_dataset():
    from data.collector.yahoo import YahooPriceCrawler
    from data.collector.reddit import ArchivedRedditCrawler
    from data.collector.twitter import TwitterCrawler

    print("Collecting dataset")
    dataset = CryptoSpeculationDataset(
        "Jun19_May21_Big",
        social_media_crawlers=[
            ArchivedRedditCrawler(interval=60 * 60 * 24 * 7,
                                  api_settings={
                                      'limit': 1000,
                                      'score': '>7'
                                  }),
            TwitterCrawler(only_users=True)
        ],
        price_crawler=YahooPriceCrawler(resolution="1h"),
        coin_types=[
            CoinType.btc, CoinType.eth, CoinType.doge, CoinType.ada,
            CoinType.link, CoinType.ltc, CoinType.omg, CoinType.xlm,
            CoinType.xrp
        ],
        time_range=TimeRange(1560556800, 1619827200))
    dataset.save()
def cached_range_reader(range_type: str):
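    # Map the cached ranges stored for the given range type back into TimeRange objects.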
    cached_ranges = CachedRange.query.filter_by(type=range_type)
    return list(map(lambda cr: TimeRange(cr.low, cr.high), cached_ranges))