def run_query():
    ''' Function to run a specific query using the Twitter API '''
    # See all available languages and their codes in ./utils/constants.py --> AVAILABLE_LANGS
    lang = AVAILABLE_LANGS['Spanish']

    # Dates in the format 'yyyy-MM-dd'. The maximum possible end date is today
    start_date = '2021-05-20'
    end_date = '2021-05-22'

    # Maximum number of tweets to be retrieved from Twitter
    max_tweets = 12

    # Combination of keywords and hashtags that the retrieved tweets must match
    keys = 'Ceuta (inmigrante OR migrante)'

    # (Partial) URL to be contained in the tweet
    url = ''  # 'youtube.com'

    query_params = {'hashtags': [],
                    'keywords': [],
                    'keys': keys,
                    'url': url,
                    'date_since': start_date,
                    'date_to': end_date,
                    'lang': lang,
                    'max_tweets': max_tweets,
                    'max_results': min(500, max_tweets)}

    searcher = Searcher()
    searcher.run_query(query_params, filename='raw_mode.csv')
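# --- Illustrative sketch only (not part of the repo) -------------------------
# How a paginated search such as Searcher.run_query could combine the two limits
# in 'query_params': 'max_results' is the per-request page size (capped at 500
# by the Twitter API v2 full-archive search endpoint, with a minimum of 10),
# while 'max_tweets' is the overall budget. 'fetch_page' is a hypothetical
# callable standing in for one API request; the actual Searcher internals are
# assumed, not shown.
def _paginated_search_sketch(fetch_page, max_tweets):
    collected, next_token = [], None
    while len(collected) < max_tweets:
        # Ask for at most 500 tweets per page, but never fewer than the API minimum of 10
        page = fetch_page(max_results=min(500, max(10, max_tweets - len(collected))),
                          next_token=next_token)
        collected.extend(page.get('data', []))
        next_token = page.get('meta', {}).get('next_token')
        if next_token is None:  # no further pages available
            break
    return collected[:max_tweets]  # trim any overshoot from the last page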
def conversation_search():
    ''' Function to search for specific conversation_ids '''
    # 'start_time' must be on or after 2006-03-21T00:00Z (Twitter constraint)
    query_params = {'lang': 'es',
                    'date_since': '2006-03-21',
                    'max_tweets': MAX_TWEETS,
                    'max_results': min(500, MAX_TWEETS)}

    filename = 'conversation_test.csv'
    if os.path.isfile(os.path.join(DATA_PATH, filename)):
        os.remove(os.path.join(DATA_PATH, filename))

    conversation_ids = [929343192272637952]
    searcher = Searcher()
    for conv_id in conversation_ids:
        print(f'\033[94m Searching conversation {conv_id}...\033[0m')
        query_params['conversation_id'] = conv_id
        searcher.run_query(query_params, filename, initial_header=True, rh_id=None)
        # Wait for the searcher to finish before launching the next query
        while searcher.is_running:
            pass
        time.sleep(searcher.sleep_time)
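# For reference: a 'conversation_id' entry in query_params maps to the Twitter
# API v2 search operator 'conversation_id:<id>'. A minimal sketch of the raw
# query string (the exact assembly performed by Searcher may differ):
def _conversation_query_sketch(conversation_id, lang):
    # e.g. 'conversation_id:929343192272637952 lang:es'
    return f'conversation_id:{conversation_id} lang:{lang}'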
def run_title_strategy(title, date, source, rh_id, keep_stopwords=True):
    ''' Method to retrieve the tweets that contain the racial hoax title in their
    text (partially, in case stopwords are removed). The tweets are filtered by
    date (within the 5 years prior to the racial hoax verification date) and by
    language. '''
    filename = f'{source}_tweets.csv'
    if not keep_stopwords:
        title = ' '.join([w for w in nltk.word_tokenize(title)
                          if w not in stopwords.words(LANG_MAPPER[SOURCE_LANG[source]])])
        title = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', title))

    query_params = {'keys': title,
                    'date_since': f'{date - 5}-01-01',
                    'date_to': '',  # min(f'{date}-12-31', time.strftime('%Y-%m-%d')),
                    'lang': SOURCE_LANG[source]}

    # Specify whether the column names must be initially written to the output file or not
    header = not os.path.isfile(f'./data/{filename}')

    searcher = Searcher(max_tweets=MAX_TWEETS)
    searcher.run_query(query_params, filename, header, rh_id)
    # Wait for the searcher to finish before moving on to another query
    while searcher.is_running:
        pass
    time.sleep(searcher.sleep_time)
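# Standalone illustration of the keep_stopwords=False branch above (hypothetical
# helper, not part of the repo; requires the nltk 'punkt' and 'stopwords' data).
# Note that stopwords.words() returns lowercase entries, so capitalised stopwords
# such as 'Los' pass the filter unless the title is lowercased beforehand.
def _normalise_title_example(title='Los inmigrantes llegan a Ceuta, según el bulo'):
    tokens = [w for w in nltk.word_tokenize(title) if w not in stopwords.words('spanish')]
    # Drop punctuation, then collapse the runs of whitespace left behind
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', ' '.join(tokens)))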
def multiple_search(queries, query_params, rh_id, filename):
    ''' Function to run several keyword queries in a row, appending all results to the same file. '''
    searcher = Searcher(max_tweets=MAX_TWEETS)
    # Write the column names only if the output file does not exist yet
    header = not os.path.isfile(f'./data/{filename}')
    for query in queries:
        print(f'\t+++ Searching the following keys: {query}')
        query_params['keys'] = query
        searcher.run_query(query_params, filename, header, rh_id)
        # Wait for the searcher to finish before launching the next query
        while searcher.is_running:
            pass
        time.sleep(searcher.sleep_time)
        header = False
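# Hypothetical usage of multiple_search (all values below are made up for
# illustration; MAX_TWEETS and the output conventions come from the surrounding
# module):
# queries = ['Ceuta (inmigrante OR migrante)', 'menas (bulo OR ayudas)']
# query_params = {'date_since': '2018-01-01', 'date_to': '', 'lang': 'es'}
# multiple_search(queries, query_params, rh_id='RH-ES-001', filename='maldita_tweets.csv')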
def first_tweets_retrieval(filename):
    ''' Function to do a tweet lookup using the Twitter API in search of the root
    tweets that are missing from several conversations (the tweet_id of the root
    tweet is the same as the conversation_id).
    :param filename: .csv file that contains the downloaded Twitter conversations
        with missing root tweets.
    '''
    searcher = Searcher(max_tweets=MAX_TWEETS)
    tweets = read_tweets(filename)
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)

    # Apparently, the Twitter API search by conversation_id does not retrieve the
    # first tweet in multiple cases: look up the conversations where every stored
    # tweet is a reply
    lookup_ids = tweets.groupby('conversation_id').apply(
        lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
    lookup_ids = lookup_ids[lookup_ids].index.tolist()

    print(f'Looking for {len(lookup_ids)} initial comments to add to the '
          f'{tweets.shape[0]} existing tweets...')
    searcher.tweet_lookup(lookup_ids, filename)
    while searcher.is_running:
        pass
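# Toy check of the missing-root detection above: a conversation needs a lookup
# when *every* stored tweet is a reply (no NaN in in_reply_to_tweet_id), i.e. the
# root tweet itself was never retrieved. With the made-up frame below, only
# conversation 2 qualifies:
# demo = pd.DataFrame({'conversation_id': [1, 1, 2, 2],
#                      'in_reply_to_tweet_id': [np.nan, 1.0, 2.0, 2.0]})
# flags = demo.groupby('conversation_id').apply(lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
# flags[flags].index.tolist()  # -> [2]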
def expand_conversations(fact_checker):
    filename = f'{fact_checker}_tweets.csv'
    tweets = read_tweets(filename)

    # Save a backup file before expanding
    tweets.to_csv(os.path.join(DATA_PATH, filename[:-4] + '_unexpanded.csv'),
                  mode='w', index=False, quoting=csv.QUOTE_ALL)

    # Expand only the conversation_ids of tweets either having replies (posterior thread)
    # or being replies themselves (previous thread)
    ixs_interest = np.bitwise_or(tweets.reply_count > 0, tweets.conversation_id != tweets.tweet_id)
    expand_ids = tweets[ixs_interest][['conversation_id', 'rh_id']].drop_duplicates()
    print(f'\033[94m Expanding {expand_ids.shape[0]} conversations for {fact_checker} fact checker...\033[0m')

    # 'start_time' must be on or after 2006-03-21T00:00Z (Twitter constraint)
    query_params = {'lang': SOURCE_LANG[fact_checker],
                    'date_since': '2006-03-21',
                    'max_tweets': MAX_TWEETS,
                    'max_results': min(500, MAX_TWEETS)}

    searcher = Searcher()
    for _, row in expand_ids.iterrows():
        print(f'\033[94m Expanding {row.conversation_id} corresponding to {row.rh_id}...\033[0m')
        if row.conversation_id in EXCLUDE_CONV_IDS.get(fact_checker, []):
            print('\t+++ This conversation id has been manually excluded from the expansion')
        else:
            query_params['conversation_id'] = row.conversation_id
            searcher.run_query(query_params, filename, initial_header=False, rh_id=row.rh_id)
            while searcher.is_running:
                pass
            time.sleep(searcher.sleep_time)

    # Deduplicate the retrieved tweets by tweet_id
    tweets = read_tweets(filename)
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)

    # Apparently, the Twitter API search by conversation_id does not retrieve the first tweet in multiple cases
    lookup_ids = tweets.groupby('conversation_id').apply(lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
    lookup_ids = lookup_ids[lookup_ids].index.tolist()
    if lookup_ids:
        print(f'\033[94m Looking for {len(lookup_ids)} initial comments to add to the {tweets.shape[0]} '
              f'existing tweets...\033[0m')
        searcher.tweet_lookup(lookup_ids, filename)
        while searcher.is_running:
            pass
    else:
        print('\033[94m All conversations have their initial comments...\033[0m')

    # Deduplicate tweets based on their clean text (without user tags or URLs)
    tweets = read_tweets(filename)
    tweets = clean_duplicates(tweets)

    # Populate the Racial Hoax ID of the starting tweets previously retrieved
    unexpanded_data = read_tweets(f'{fact_checker}_tweets_unexpanded.csv')
    mapper = unexpanded_data.drop_duplicates(subset='conversation_id')[['conversation_id', 'rh_id']]
    mapper = mapper.set_index('conversation_id').rh_id
    tweets.rh_id = tweets.conversation_id.map(mapper)

    # Overwrite the file with the expanded data
    tweets.to_csv(os.path.join(DATA_PATH, filename), mode='w', index=False, quoting=csv.QUOTE_ALL)
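# Toy check of the rh_id re-population step above, with made-up identifiers: the
# unexpanded backup yields a conversation_id -> rh_id Series, which is then mapped
# over the expanded conversations.
# backup = pd.DataFrame({'conversation_id': [10, 10, 20], 'rh_id': ['RH-1', 'RH-1', 'RH-2']})
# mapper = backup.drop_duplicates(subset='conversation_id').set_index('conversation_id').rh_id
# pd.Series([10, 20, 20]).map(mapper).tolist()  # -> ['RH-1', 'RH-2', 'RH-2']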
def run_url_tweets_strategy(url, fact_checker, rh_id):
    ''' Some racial hoax articles contain several embedded tweets. This method extracts
    those tweet identifiers and performs a tweet lookup using the Twitter API. '''
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    links = soup.find_all("blockquote", attrs={"class": "twitter-tweet"})
    # embed_tweets = [re.sub(r'\s+pic\.twitter\..*\s?', ' ', item.find('p').text) for item in links]
    embed_tweet_ids = [re.sub(r'.*/status/(.*)\?.*', r'\g<1>', item.find_all('a')[-1]['href'])
                       for item in links]
    filename = f'{fact_checker}_tweets.csv'
    # for i, (q, ix) in enumerate(zip(embed_tweets, embed_tweet_ids)):
    #     print(f'\t{i} --> {q} : {ix}')
    if embed_tweet_ids:
        print(f'\t+++ Searching for {len(embed_tweet_ids)} embedded tweet ids...')
        Searcher(max_tweets=MAX_TWEETS).tweet_lookup(embed_tweet_ids, filename=filename, rh_id=rh_id)
    else:
        print('\t+++ No embedded tweets were found...')
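# Quick check of the status-id extraction above on a made-up embed href. Note the
# pattern assumes the href carries a '?query' part; a bare status URL without
# parameters would not match, and re.sub would return it unchanged.
# href = 'https://twitter.com/user/status/1125295472908886016?ref_src=twsrc%5Etfw'
# re.sub(r'.*/status/(.*)\?.*', r'\g<1>', href)  # -> '1125295472908886016'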
def tweets_search():
    ''' In case the tweet_id is known a priori, this function implements the corresponding tweet lookup. '''
    Searcher(max_tweets=10).tweet_lookup(['1125295472908886016'], 'test.csv')
class DownloadWindow(QtWidgets.QMainWindow):

    def __init__(self, *args, **kwargs):
        super(DownloadWindow, self).__init__(*args, **kwargs)
        self.setWindowTitle("Twitter Data Downloader")

        self.status = QtWidgets.QStatusBar(self)
        self.default_styleSheet = self.status.styleSheet()
        self.setStatusBar(self.status)
        self.printer = Printer(self.status)
        self.searcher = Searcher(self.printer)

        # -------------- SETTING THE LAYOUTS -------------- #
        gen_layout = get_layout(QtWidgets.QVBoxLayout())
        upper_layout = get_layout(QtWidgets.QHBoxLayout())
        bottom_layout = get_layout(QtWidgets.QHBoxLayout())

        self.add_query_grid(upper_layout)
        self.add_buttons(bottom_layout)

        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()), 15)
        gen_layout.addLayout(upper_layout, 40)
        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()), 10)
        gen_layout.addLayout(bottom_layout, 25)
        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()), 10)

        widget = QtWidgets.QWidget()
        widget.setLayout(gen_layout)

        # Set the central widget of the Window. The widget will expand
        # to take up all the space in the window by default.
        self.setCentralWidget(widget)

    def set_windows(self, windows):
        self.home_window = windows['home']

    def add_query_grid(self, parent_layout):
        # Hashtags, Keywords, Start Date (inclusive), End Date (inclusive), Max Tweets, Language
        grid_layout = QtWidgets.QGridLayout()
        labels = []

        # Hashtags are concatenated with " OR " (the tweet only needs to contain one of the input hashtags)
        self.hashtags = QtWidgets.QTextEdit()
        helper_text = 'Hashtags (if multiple, enter a comma-separated sequence)\n' \
                      'Example: #hate, #stopMigration, #zeroBlack'
        self.hashtags.setPlaceholderText(helper_text)
        self.hashtags.setAcceptRichText(False)
        labels.append(QtWidgets.QLabel('HASHTAGS'))

        # All keywords must appear in the tweet, so they are concatenated with " AND "
        self.keywords = QtWidgets.QTextEdit()
        helper_text = 'Keywords (if multiple, enter a comma-separated sequence)\n' \
                      'Example: hate, migration, racist, black people'
        self.keywords.setPlaceholderText(helper_text)
        self.keywords.setAcceptRichText(False)
        labels.append(QtWidgets.QLabel('KEYWORDS'))

        # Both dates are inclusive (start-end)
        self.start_date = QtWidgets.QDateEdit(displayFormat='dd-MMM-yyyy', calendarPopup=True)
        self.start_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.start_date.dateChanged.connect(self.start_date_dateedit)
        self.start_date.setFixedWidth(200)
        self.start_date.setAlignment(Qt.AlignCenter)
        labels.append(QtWidgets.QLabel('START DATE'))

        self.end_date = QtWidgets.QDateEdit(displayFormat='dd-MMM-yyyy', calendarPopup=True)
        self.end_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.end_date.dateChanged.connect(self.end_date_dateedit)
        self.end_date.setFixedWidth(200)
        self.end_date.setAlignment(Qt.AlignCenter)
        labels.append(QtWidgets.QLabel('END DATE'))

        # Select the maximum number of tweets to be retrieved from Twitter
        self.max_tweets = QtWidgets.QLineEdit()
        self.max_tweets.setPlaceholderText('Minimum value: 10, Maximum value: 2.000.000')
        self.max_tweets.setValidator(QtGui.QIntValidator())
        self.max_tweets.setAlignment(Qt.AlignCenter)
        self.max_tweets.setMinimumSize(300, 0)
        labels.append(QtWidgets.QLabel('MAXIMUM NUMBER OF TWEETS'))

        # Select the language of the tweets
        self.language = QtWidgets.QComboBox()
        self.language.addItems(AVAILABLE_LANGS.keys())
        labels.append(QtWidgets.QLabel('LANGUAGE'))

        l_height = 50
        for l in labels:
            l.setStyleSheet('background-color:#FAEB7C;color:#000000;font-weight: bold;')
            l.setAlignment(Qt.AlignCenter)
            l.setFixedHeight(l_height)

        # Add items to the grid
        grid_layout.addWidget(labels[0], 0, 0)
        grid_layout.addWidget(self.hashtags, 1, 0)
        grid_layout.addWidget(labels[1], 0, 1)
        grid_layout.addWidget(self.keywords, 1, 1)
        grid_layout.addWidget(labels[2], 2, 0)
        grid_layout.addWidget(self.start_date, 3, 0, Qt.AlignCenter)
        grid_layout.addWidget(labels[3], 2, 1)
        grid_layout.addWidget(self.end_date, 3, 1, Qt.AlignCenter)
        grid_layout.addWidget(labels[4], 4, 0)
        grid_layout.addWidget(self.max_tweets, 5, 0, Qt.AlignCenter)
        grid_layout.addWidget(labels[5], 4, 1)
        grid_layout.addWidget(self.language, 5, 1, Qt.AlignCenter)

        parent_layout.addLayout(grid_layout, Qt.AlignCenter)

    def add_buttons(self, layout):
        # Submit Query, Cancel Query, Clear Fields, Back
        grid_layout = QtWidgets.QGridLayout()
        grid_layout.setHorizontalSpacing(20)
        grid_layout.setVerticalSpacing(20)
        size = (100, 40)
        submit_button = get_button(label='Submit', action=self.submit_query, size=size,
                                   shortcut=Qt.Key_Enter, color='6CFE87')
        cancel_button = get_button(label='Cancel', action=self.cancel_query, size=size, color='FF8585')
        clear_button = get_button(label='Clear', action=self.clear_fields, size=size, color='B9B9B9')
        back_button = get_button(label='Back', action=self.back_action, size=size, color='B9B9B9')
        grid_layout.addWidget(submit_button, 0, 0, Qt.AlignRight)
        grid_layout.addWidget(cancel_button, 0, 2, Qt.AlignLeft)
        grid_layout.addWidget(clear_button, 2, 0, Qt.AlignRight)
        grid_layout.addWidget(back_button, 2, 2, Qt.AlignLeft)
        layout.addLayout(grid_layout, stretch=True)

    def submit_query(self):
        if not self.searcher.is_running:
            # Clamp the requested number of tweets to the [10, 2000000] range
            max_tweets = min(max(int(self.max_tweets.text()) if self.max_tweets.text() else 10, 10), 2000000)
            hashtags = list(filter(None, re.split(r',\s*', self.hashtags.toPlainText())))
            hashtags = [re.sub(r'\s', '', h if h.startswith('#') else '#' + h) for h in hashtags]
            keywords = re.split(r'\s*,\s*', self.keywords.toPlainText())
            query_params = {'hashtags': hashtags,
                            'keywords': keywords,
                            'date_since': self.start_date.dateTime().toString('yyyy-MM-dd'),
                            'date_to': self.end_date.dateTime().addDays(1).toString('yyyy-MM-dd'),
                            'lang': AVAILABLE_LANGS[self.language.currentText()],
                            'max_tweets': max_tweets,
                            'max_results': min(500, max_tweets)}
            self.hashtags.setText(' OR '.join(hashtags))
            self.keywords.setText(' OR '.join(keywords))
            self.searcher.run_query(query_params)
        else:
            self.printer.show_message('Already running another query', 1500, 'error')

    def cancel_query(self):
        self.printer.show_message('Cancel button clicked', 1000, 'success')

    def clear_fields(self):
        self.hashtags.clear()
        self.keywords.clear()
        self.start_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.end_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.language.setCurrentIndex(0)
        self.max_tweets.clear()
        self.printer.show_message('Clear button clicked', 1000, 'success')

    def back_action(self):
        self.home_window.showMaximized()
        self.close()

    def start_date_dateedit(self):
        # Keep the range consistent: moving the start date past the end date drags the end date along
        if self.start_date.dateTime() > self.end_date.dateTime():
            self.end_date.setDateTime(self.start_date.dateTime())

    def end_date_dateedit(self):
        if self.start_date.dateTime() > self.end_date.dateTime():
            self.start_date.setDateTime(self.end_date.dateTime())
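# Minimal sketch of how DownloadWindow might be launched as a standalone app
# (standard PyQt5 boilerplate; the repo's real entry point may differ):
# import sys
# app = QtWidgets.QApplication(sys.argv)
# window = DownloadWindow()
# window.showMaximized()
# sys.exit(app.exec_())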