示例#1
0
 def rules_setup(self):
     """Replace whatever filter rules are currently active with this
     instance's own rule set.

     Fetches the existing rules, deletes them all, installs the new ones
     via ``set_rules``, then logs the rules actually in effect.
     """
     print("Get original rules...")
     existing = self.get_all_rules()
     print("Delete original rules...")
     self.delete_all_rules(existing)
     print("Set new rules...")
     self.set_rules()
     print("Current rules:")
     active = self.get_all_rules()
     write_to_log(self.log_file, f"Streaming with filter rules: {str(active)}...")
    def __init__(self, query, start_time, end_time, bearer_token, db_conn,
                 db_cur):
        """Set up a recent-search session.

        Args:
            query: search query string passed to the API.
            start_time: datetime for the window start; stored as ISO-8601.
            end_time: datetime for the window end; stored as ISO-8601.
            bearer_token: API bearer token used for authentication.
            db_conn: open sqlite3 connection (used for commits).
            db_cur: sqlite3 cursor (used for inserts).
        """
        iso_fmt = "%Y-%m-%dT%H:%M:%SZ"
        self.auth = bearer_token
        self.query = query
        self.start_time = start_time.strftime(iso_fmt)
        self.end_time = end_time.strftime(iso_fmt)
        # Pagination token; None means "start from the first page".
        self.next_token = None
        self.db_conn = db_conn
        self.db_cur = db_cur

        self.log_file = f'../logs/recent_search_{LOGFILE_NAME}_{datetime.now().strftime("%Y%m%d_%H:%M:%S")}.txt'

        write_to_log(
            self.log_file,
            f"Started to search from {self.start_time} to {self.end_time} with query {self.query}..."
        )
    def save_next_batch(self):
        """Fetch the next page of premium retweet-search results and persist
        every retweet into the retweets table.

        Updates ``self.next_token`` so the following call continues from the
        next page.

        Raises:
            Exception: "Rate limit exceeded!" when the API answers 429; any
                other search error is re-raised unchanged.
        """
        print("Started new batch!")
        try:
            tweets_details, next_token = premium_search_retweet(
                product=self.product,
                label=self.label,
                query=self.query,
                from_date=self.start_time,
                to_date=self.end_time,
                bearer_token=self.auth,
                next_token=self.next_token)
        except Exception as e:
            if str(e).startswith("429"):
                raise Exception("Rate limit exceeded!")
            # BUG FIX: previously non-429 errors fell through this handler
            # without re-raising, leaving tweets_details/next_token unbound
            # and crashing with a NameError below. Propagate the original
            # error instead.
            raise

        r_obj_list = tweets_details
        self.next_token = next_token

        for r_obj in r_obj_list:
            # save only retweets
            if r_obj["is_retweet"]:
                try:
                    todb_values = [
                        r_obj["tweet_id"],
                        r_obj["author_id"],
                        r_obj["created_at"],
                        r_obj["parent_tweet_id"],
                        r_obj["parent_tweet_author_id"],
                        r_obj["like_count"],
                        r_obj["quote_count"],
                        r_obj["reply_count"],
                        r_obj["retweet_count"],
                        r_obj["text"],
                    ]
                    batch_insert(RETWEETS.name, RETWEETS.cols, [todb_values],
                                 self.db_cur)
                    self.db_conn.commit()
                    write_to_log(
                        self.log_file,
                        f'Saved to retweets! tweet_id: {r_obj["tweet_id"]}')
                except sqlite3.IntegrityError:
                    # Primary-key clash: this retweet is already stored.
                    print("Retweet already saved!")

        write_to_log(self.log_file, "Saved one batch!")
        print("Saved one batch!")
    def __init__(self, product, label, query, start_time, end_time,
                 bearer_token, db_conn, db_cur):
        """Set up a premium retweet-search session.

        Args:
            product: premium API product name.
            label: premium API environment label.
            query: search query string.
            start_time: datetime for the window start; stored as YYYYMMDDhhmm.
            end_time: datetime for the window end; stored as YYYYMMDDhhmm.
            bearer_token: API bearer token used for authentication.
            db_conn: open sqlite3 connection (used for commits).
            db_cur: sqlite3 cursor (used for inserts).
        """
        compact_fmt = "%Y%m%d%H%M"
        self.auth = bearer_token
        self.product = product
        self.label = label
        self.query = query
        self.start_time = start_time.strftime(compact_fmt)
        self.end_time = end_time.strftime(compact_fmt)
        # Pagination token; None means "start from the first page".
        self.next_token = None
        self.db_conn = db_conn
        self.db_cur = db_cur

        self.log_file = f'../logs/july_retweet_search_{LOGFILE_NAME}_{datetime.now().strftime("%Y%m%d_%H:%M:%S")}.txt'

        write_to_log(
            self.log_file,
            f"Started to search from {self.start_time} to {self.end_time} with query {self.query}..."
        )
    def save_next_batch(self):
        """Refresh engagement stats for the next slice of cached tweet ids.

        Walks ``self.tweets_id_list`` from ``self.next_id_ptr``, fetching each
        tweet and updating its like/quote/reply/retweet counts in the DB. On a
        429 the current index is remembered so the next call resumes there.

        Raises:
            Exception: "[Self Defined]Limit Exceeded!" when the API rate limit
                was hit mid-batch; "error!" (chained to the cause) on any other
                unexpected error.
        """
        if self.next_id_ptr == 0:
            write_to_log(
                self.log_file,
                f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting to update retweets-------'
            )

        print("Processing next batch...")

        limit_exceeded = False
        trange = tqdm(range(self.next_id_ptr, len(self.tweets_id_list)))
        for idx in trange:
            tweet_id = self.tweets_id_list[idx]
            try:
                r_obj = get_single_tweet_by_id_labs(tweet_id, self.auth)
                values = [
                    r_obj["like_count"], r_obj["quote_count"],
                    r_obj["reply_count"], r_obj["retweet_count"], tweet_id
                ]
                update_row(self.table_name, "tweet_id", [
                    "like_count", "quote_count", "reply_count", "retweet_count"
                ], [values], self.db_cur)
                self.db_conn.commit()
                write_to_log(self.log_file,
                             f"Updated stats for tweet: {tweet_id}")
            except Exception as e:
                error_msg = str(e)
                if error_msg == "[Self Defined]Error in response!":
                    # Tweet unavailable per the helper's sentinel: skip it.
                    pass
                elif error_msg.startswith("429"):
                    limit_exceeded = True
                    self.next_id_ptr = idx  # resume from this tweet next call
                    write_to_log(
                        self.log_file,
                        f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets! Next tweet idx: {self.next_id_ptr}'
                    )
                    trange.close()
                    break
                else:
                    # BUG FIX: log the real error and chain it as the cause
                    # instead of silently discarding it behind a bare
                    # "error!" message.
                    write_to_log(self.log_file, error_msg)
                    raise Exception("error!") from e

        write_to_log(
            self.log_file,
            f'**{datetime.now().strftime("%H:%M:%S")}**Finished one batch!')

        if limit_exceeded:
            raise Exception("[Self Defined]Limit Exceeded!")
        else:
            self.next_id_ptr = len(self.tweets_id_list)
示例#6
0
    def next_batch(self):
        """Collect up to ``tweets_per_day`` tweets for the current day.

        Pages through the premium search endpoint over a one-day window
        starting at ``self.next_datetime``, sleeping through rate limits,
        then advances the window by one day.

        Returns:
            A list of per-tweet value rows ready for DB insertion.
        """
        collected = []
        from_date_str = self.next_datetime.strftime("%Y%m%d%H%M")
        to_date_str = (self.next_datetime +
                       timedelta(days=1)).strftime("%Y%m%d%H%M")

        print(f"Searching for date {from_date_str}...")
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Start searching for date {from_date_str}-------'
        )

        total = 0
        next_token = None
        # "Not existed!" is the search helper's sentinel for "no more pages".
        while total < self.tweets_per_day and next_token != "Not existed!":
            try:
                tweets_list, next_token = premium_search(
                    product=self.product,
                    label=self.label,
                    query=self.query,
                    from_date=from_date_str,
                    to_date=to_date_str,
                    next_token=next_token)
                total += len(tweets_list)
                collected.extend([
                    obj["tweet_id"], obj["full_text"], obj["created_at"],
                    obj["language"], obj["hashtags_str"],
                    obj["mentions_str"], obj["favorite_count"],
                    obj["retweet_count"]
                ] for obj in tweets_list)
            except Exception as e:
                error_code = str(e).split(":")[0]
                if error_code in ("88", "429"):
                    # Rate limited: wait out the window and retry this page.
                    print("Rate limit exceeded!")
                    write_to_log(
                        self.log_file,
                        f'**[{datetime.now().strftime("%H:%M:%S")}]** Rate limit exceeded! Next date to search: {from_date_str}'
                    )
                    program_sleep(61)
                else:
                    write_to_log(self.log_file, f"{e}")
                    break

        self.next_datetime += timedelta(days=1)
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished searching for date {from_date_str}-------\n'
        )
        return collected
    def save_next_batch(self):
        """Pull one page of recent-search results and store non-reply tweets.

        Updates ``self.next_token`` so the following call continues from the
        next page. Duplicate tweets are skipped; rows that fail to insert for
        other sqlite reasons are logged together with their values.
        """
        results = recent_search_labs(self.query, self.start_time,
                                     self.end_time, self.next_token, self.auth)

        r_obj_list = results["r_obj_list"]
        self.next_token = results["next_token"]

        for r_obj in r_obj_list:
            if r_obj["is_reply"]:
                # Replies are intentionally not stored.
                continue
            try:
                todb_values = [
                    r_obj["tweet_id"],
                    r_obj["author_id"],
                    r_obj["created_at"],
                    r_obj["text"],
                    r_obj["expanded_urls"],
                    r_obj["hashtags_str"],
                    r_obj["mentions_str"],
                    r_obj["like_count"],
                    r_obj["quote_count"],
                    r_obj["reply_count"],
                    r_obj["retweet_count"],
                ]
                batch_insert(REGULAR_TWEETS.name, REGULAR_TWEETS.cols,
                             [todb_values], self.db_cur)
                self.db_conn.commit()
                write_to_log(
                    self.log_file,
                    f'Saved to regular_tweets! tweet_id: {r_obj["tweet_id"]}'
                )
            except sqlite3.IntegrityError:
                # Primary-key clash: this tweet is already stored.
                print("Tweet already saved!")
            except sqlite3.OperationalError:
                write_to_log(
                    self.log_file,
                    f'-------[ERROR] Cannot save to regular_tweets! tweet_id: {r_obj["tweet_id"]}-------\n'
                    + f"Tweet values: {str(todb_values)}\n" +
                    f"----------------------------------------------------------------------------------"
                )

        write_to_log(
            self.log_file,
            f'**{datetime.now().strftime("%H:%M:%S")}**Finished one batch! Next next_token: {self.next_token}'
        )
示例#8
0
    def next_batch(self):
        """Download full tweet objects for the next slice of cached ids.

        Walks ``self.tweets_id_list`` from ``self.next_id_ptr``, collecting
        tweet rows and media URLs grouped by media type. On a rate limit
        (Twitter error code 88) the current index is saved so the next call
        resumes from the same tweet.

        Returns:
            dict with ``limit_exceeded`` plus the collected ``tweets_todb``,
            ``images``, ``videos``, ``gifs`` and ``externals`` row lists.
        """
        if self.next_id_ptr == 0:
            write_to_log(
                self.log_file,
                f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting getting full tweets for {self.table_name}-------'
            )

        print(f"Getting next batch for {self.table_name}...")
        limit_exceeded = False
        tweets_todb = []
        images_todb = []
        videos_todb = []
        gifs_todb = []
        externals_todb = []

        trange = tqdm(range(self.next_id_ptr, len(self.tweets_id_list)))
        for idx in trange:
            tweet_id = self.tweets_id_list[idx]
            try:
                obj = get_single_tweet_by_id(tweet_id)
                if obj:
                    tweets_todb.append([
                        obj["tweet_id"], obj["full_text"], obj["created_at"],
                        obj["language"], obj["hashtags_str"],
                        obj["mentions_str"], obj["favorite_count"],
                        obj["retweet_count"]
                    ])
                    # Bucket media URLs by type for the per-type tables.
                    for media in obj["all_media_urls"]:
                        if media["media_type"] == "photo":
                            images_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "video":
                            videos_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "animated_gif":
                            gifs_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "other":
                            externals_todb.append(
                                [tweet_id, media["media_url"]])

            except Exception as e:
                # BUG FIX: the error-code parsing below can itself raise
                # IndexError for messages that don't match the expected
                # "...'code': NN}]:..." shape, and the original duplicate
                # `except Exception` clause was unreachable. Guard the
                # parsing and log unparseable errors instead (matching the
                # sibling implementation that reads from cached id files).
                try:
                    error_code = str(e).split("}]:")[0]
                    error_code = error_code.split("'code': ")[1]
                except IndexError:
                    write_to_log(self.log_file, f"{e}")
                    continue
                if error_code == "88":
                    limit_exceeded = True
                    self.next_id_ptr = idx  # resume here on the next call

                    write_to_log(
                        self.log_file,
                        f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets! Next tweet idx: {self.next_id_ptr}'
                    )
                    trange.close()
                    break
                else:
                    write_to_log(self.log_file, f"{e}")

        trange.close()
        time.sleep(1)
        if not limit_exceeded:
            # Whole list processed: mark the batch sequence as finished.
            self.next_id_ptr = len(self.tweets_id_list)
            write_to_log(
                self.log_file,
                f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished getting full tweets for {self.table_name}!-------\n'
            )

        return {
            "limit_exceeded": limit_exceeded,
            "tweets_todb": tweets_todb,
            "images": images_todb,
            "videos": videos_todb,
            "gifs": gifs_todb,
            "externals": externals_todb
        }
    def next_batch(self):
        """Fetch full tweets for the next slice of cached ids, refilling the
        id cache from the next id file when the current cache is exhausted.

        State used for resumption across calls:
          - next_cached_id_ptr: index of the next id to process in cached_ids.
          - next_file_ptr: index of the next id file to load from id_files.
          - is_first_batch: when True, the very first refill keeps the current
            pointer (resume mid-file); later refills restart at 0.

        Returns:
            dict with ``limit_exceeded`` plus the collected tweet and media
            row lists, keyed "covid19_tweets", "images", "videos", "gifs",
            "externals".
        """
        # Cache exhausted: load the next id file into memory.
        if self.next_cached_id_ptr >= len(self.cached_ids):
            self.cached_ids = []
            # On the first batch keep the saved pointer (resume mid-file);
            # afterwards each new file starts from index 0.
            self.next_cached_id_ptr = (self.next_cached_id_ptr
                                       if self.is_first_batch else 0)
            self.current_id_file = self.id_files[self.next_file_ptr]
            with open(self.current_id_file, "r") as infile:
                for line in infile:
                    tweet_id = line.strip()
                    self.cached_ids.append(tweet_id)

            write_to_log(
                self.log_file,
                f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting processing file with idx {self.next_file_ptr}: {self.current_id_file}-------',
            )
            self.next_file_ptr += 1
            self.is_first_batch = False

        print(f"Getting Tweets in {self.current_id_file}...")

        limit_exceeded = False
        covid19_tweets_todb = []
        images_todb = []
        videos_todb = []
        gifs_todb = []
        externals_todb = []

        trange = tqdm(range(self.next_cached_id_ptr, len(self.cached_ids)))
        for idx in trange:
            tweet_id = self.cached_ids[idx]
            try:
                obj = get_single_tweet_by_id(tweet_id)
                if obj:
                    covid19_tweets_todb.append([
                        obj["tweet_id"],
                        obj["full_text"],
                        obj["created_at"],
                        obj["language"],
                        obj["hashtags_str"],
                        obj["mentions_str"],
                        obj["favorite_count"],
                        obj["retweet_count"],
                    ])
                    # Bucket media URLs by type for the per-type tables.
                    for media in obj["all_media_urls"]:
                        if media["media_type"] == "photo":
                            images_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "video":
                            videos_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "animated_gif":
                            gifs_todb.append([tweet_id, media["media_url"]])
                        elif media["media_type"] == "other":
                            externals_todb.append(
                                [tweet_id, media["media_url"]])

            except Exception as e:
                # The message parsing below assumes the Twitter error shape
                # "...'code': NN}]:..."; the inner try guards against
                # messages that don't match (the split raising IndexError).
                try:
                    error_code = str(e).split("}]:")[0]
                    error_code = error_code.split("'code': ")[1]
                    if error_code == "88":
                        # Rate limited: remember where to resume, then stop.
                        limit_exceeded = True
                        self.next_cached_id_ptr = idx
                        write_to_log(
                            self.log_file,
                            f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets in file {self.next_file_ptr - 1}! Next tweet idx: {self.next_cached_id_ptr}',
                        )
                        trange.close()
                        break
                    else:
                        write_to_log(self.log_file, f"{e}")
                except Exception as e:
                    write_to_log(self.log_file, f"{e}")

        trange.close()
        time.sleep(1)
        if not limit_exceeded:
            # Whole cache processed: the next call will load the next file.
            self.next_cached_id_ptr = len(self.cached_ids)
            write_to_log(
                self.log_file,
                f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished processing file with idx {self.next_file_ptr - 1}: {self.current_id_file}-------\n',
            )

        return {
            "limit_exceeded": limit_exceeded,
            "covid19_tweets": covid19_tweets_todb,
            "images": images_todb,
            "videos": videos_todb,
            "gifs": gifs_todb,
            "externals": externals_todb,
        }
示例#10
0
 def add_text_column(self):
     """One-off schema migration: add a TEXT column named ``text`` to the
     retweets table and record the change in the log."""
     add_column(RETWEETS.name, "text", "TEXT", self.db_cur)
     self.db_conn.commit()
     write_to_log(self.log_file, "[TABLE CHANGED] Added column text")
示例#11
0
    def stream_connect(self):
        """Open the filtered-stream connection and persist incoming tweets.

        For each streamed tweet:
          - retweets are saved to the retweets table, then the parent tweet is
            fetched and the chain is followed until a non-retweet is reached;
          - replies are skipped over by walking up to the original tweet;
          - the resulting non-retweet tweet is saved to regular_tweets.

        Runs until the HTTP stream ends or the connection raises; a non-2xx
        response raises an Exception carrying the status code and body.
        """
        response = requests.get(self.stream_url, auth=self.auth, stream=True, params={"tweet.fields": "author_id,created_at,entities,text,public_metrics", "user.fields": "id", "expansions": "referenced_tweets.id,referenced_tweets.id.author_id"})

        if response.status_code > 201:
            raise Exception(f"{response.status_code}: {response.text}")

        # window_count = 0
        # checkpoint_timestamp = datetime.now()
        for response_line in response.iter_lines():
            # Keep-alive heartbeats arrive as empty lines; skip them.
            if response_line:
                tweet_dict = json.loads(response_line)
                r_obj = get_tweet_details_labs(tweet_dict, metric_fieldname="public_metrics")

                # find original tweet if is retweet
                while r_obj["is_retweet"]:
                    try:
                        todb_values = [
                            r_obj["tweet_id"],
                            r_obj["author_id"],
                            r_obj["created_at"],
                            r_obj["parent_tweet_id"],
                            r_obj["parent_tweet_author_id"],
                            r_obj["like_count"],
                            r_obj["quote_count"],
                            r_obj["reply_count"],
                            r_obj["retweet_count"],
                            r_obj["text"],
                        ]
                        batch_insert(RETWEETS.name, RETWEETS.cols, [todb_values], self.db_cur)
                        self.db_conn.commit()
                        write_to_log(self.log_file, f'Saved to retweets! tweet_id: {r_obj["tweet_id"]}. Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}')
                        # Walk up the retweet chain; loop re-checks is_retweet.
                        r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)

                    except sqlite3.IntegrityError:
                        # Already stored; stop walking this chain.
                        print("Retweet already saved!")
                        break
                    except sqlite3.OperationalError:
                        # Insert failed for another sqlite reason: log the row
                        # values but still continue up to the parent tweet.
                        write_to_log(
                            self.log_file,
                            f'-------[ERROR] Cannot save to retweets! tweet_id: {r_obj["tweet_id"]}-------\n'
                            + f"Tweet values: {str(todb_values)}\n"
                            + f'Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}\n'
                            + f"----------------------------------------------------------------------------",
                        )
                        r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)
                    except Exception as e:
                        # Parent fetch failed: give up on this chain.
                        write_to_log(self.log_file, e)
                        break

                if not r_obj["is_retweet"]:
                    # find original tweet if is reply
                    while r_obj["is_reply"]:
                        try:
                            write_to_log(self.log_file, f'Reply tweet! tweet_id: {r_obj["tweet_id"]}. Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}')
                            r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)
                        except Exception as e:
                            write_to_log(self.log_file, e)
                            break

                    try:
                        todb_values = [
                            r_obj["tweet_id"],
                            r_obj["author_id"],
                            r_obj["created_at"],
                            r_obj["text"],
                            r_obj["expanded_urls"],
                            r_obj["hashtags_str"],
                            r_obj["mentions_str"],
                            r_obj["like_count"],
                            r_obj["quote_count"],
                            r_obj["reply_count"],
                            r_obj["retweet_count"],
                        ]
                        batch_insert(REGULAR_TWEETS.name, REGULAR_TWEETS.cols, [todb_values], self.db_cur)
                        self.db_conn.commit()
                        write_to_log(self.log_file, f'Saved to regular_tweets! tweet_id: {r_obj["tweet_id"]}')

                    except sqlite3.IntegrityError:
                        print("Original tweet already saved!")
                    except sqlite3.OperationalError:
                        write_to_log(
                            self.log_file,
                            f'-------[ERROR] Cannot save to regular_tweets! tweet_id: {r_obj["tweet_id"]}-------\n' + f"Tweet values: {str(todb_values)}\n" + f"----------------------------------------------------------------------------------",
                        )
                    except Exception as e:
                        write_to_log(self.log_file, e)