Example #1
def run_query():
    '''
    Function to run a specific query using the Twitter API
    '''
    # See all available languages and their code in ./utils/constants.py --> AVAILABLE_LANGS
    lang = AVAILABLE_LANGS['Spanish']
    # Date in the format 'yyyy-MM-dd'. Maximum possible end date is today
    start_date = '2021-05-20'
    end_date = '2021-05-22'
    # Maximum number of tweets to be retrieved from Twitter
    max_tweets = 12
    # Boolean combination of keywords and hashtags that the retrieved tweets must match
    # (in the Twitter query syntax a space means AND, while OR must be written explicitly)
    keys = 'Ceuta (inmigrante OR migrante)'
    # (Partial) URL to be contained in the tweet
    url = ''  # 'youtube.com'
    query_params = {
        'hashtags': [],
        'keywords': [],
        'keys': keys,
        'url': url,
        'date_since': start_date,
        'date_to': end_date,
        'lang': lang,
        'max_tweets': max_tweets,
        'max_results': min(500, max_tweets)  # page size per request; the full-archive endpoint allows at most 500
    }

    searcher = Searcher()
    searcher.run_query(query_params, filename='raw_mode.csv')
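
For context, AVAILABLE_LANGS lives in ./utils/constants.py (per the comment above) and is presumably a plain mapping from language names to Twitter language codes. A minimal sketch, assuming the Spanish entry resolves to 'es' as the later examples use:

# utils/constants.py (sketch; the actual contents may differ)
# Maps human-readable language names to Twitter API language codes
AVAILABLE_LANGS = {
    'Spanish': 'es',
    'English': 'en',
    'Italian': 'it',
}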
Example #2
def conversation_search():
    '''
    Function to search for specific conversation_ids
    '''
    query_params = {
        'lang': 'es',
        'date_since': '2006-03-21',  # earliest start date the Twitter API accepts
        'max_tweets': MAX_TWEETS,
        'max_results': min(500, MAX_TWEETS)
    }
    filename = 'conversation_test.csv'
    filepath = os.path.join(DATA_PATH, filename)
    # Start from a clean output file
    if os.path.isfile(filepath):
        os.remove(filepath)
    conversation_ids = [929343192272637952]
    searcher = Searcher()
    for conv_id in conversation_ids:
        print(f'\033[94m Searching conversation {conv_id}...\033[0m')
        query_params['conversation_id'] = conv_id
        searcher.run_query(query_params,
                           filename,
                           initial_header=True,
                           rh_id=None)
        while searcher.is_running:
            pass
        time.sleep(searcher.sleep_time)
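
The "while searcher.is_running: pass" idiom above (repeated in later examples) busy-waits and pins a CPU core for the whole query. A gentler polling helper, sketched here as a hypothetical wait_for function, would sleep between checks:

import time

def wait_for(searcher, poll_interval=0.5):
    # Poll the Searcher's is_running flag without burning CPU cycles
    while searcher.is_running:
        time.sleep(poll_interval)
    # Honour the pause the Searcher requests between queries
    time.sleep(searcher.sleep_time)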
Example #3
def run_title_strategy(title, date, source, rh_id, keep_stopwords=True):
    '''
    Method to retrieve the tweets that contain the racial hoax title in their text (partially, in case stopwords
    are removed). Results are filtered by date (within the 5 years prior to the racial hoax verification date) and by language.
    '''
    filename = f'{source}_tweets.csv'

    if not keep_stopwords:
        title = ' '.join([w for w in nltk.word_tokenize(title)
                          if w not in stopwords.words(LANG_MAPPER[SOURCE_LANG[source]])])
        title = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', title))

    query_params = {'keys': title,
                    'date_since': f'{date - 5}-01-01',  # 'date' is the verification year
                    'date_to': '',  # min(f'{date}-12-31', time.strftime('%Y-%m-%d')),
                    'lang': SOURCE_LANG[source]}

    # Specify whether the column names must be initially written to the output file or not
    header = not os.path.isfile(f'./data/{filename}')

    searcher = Searcher(max_tweets=MAX_TWEETS)
    searcher.run_query(query_params, filename, header, rh_id)

    # Wait for the searcher to finish before going to another query
    while searcher.is_running:
        pass

    time.sleep(searcher.sleep_time)
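
To see what the stopword-stripping branch does, here is a self-contained sketch of the same transformation on a made-up Spanish title (it assumes the NLTK 'punkt' and 'stopwords' resources are installed, and that LANG_MAPPER[SOURCE_LANG[source]] resolves to a name like 'spanish'):

import re
import nltk
from nltk.corpus import stopwords

# One-time downloads, if missing: nltk.download('punkt'); nltk.download('stopwords')
title = 'inmigrantes de Ceuta, el bulo de las ayudas'
# Drop Spanish stopwords, then strip punctuation and collapse whitespace,
# mirroring the branch in run_title_strategy above
title = ' '.join(w for w in nltk.word_tokenize(title, language='spanish')
                 if w not in stopwords.words('spanish'))
title = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', title))
print(title)  # inmigrantes Ceuta bulo ayudas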
Example #4
def multiple_search(queries, query_params, rh_id, filename):
    '''
    Run several keyword queries sequentially, appending all the results to the same output file.
    '''
    searcher = Searcher(max_tweets=MAX_TWEETS)
    # Write the column names only when the output file does not exist yet
    header = not os.path.isfile(f'./data/{filename}')
    for query in queries:
        print(f'\t+++ Searching the following keys: {query}')
        query_params['keys'] = query
        searcher.run_query(query_params, filename, header, rh_id)
        while searcher.is_running:
            pass
        time.sleep(searcher.sleep_time)
        header = False
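
A usage sketch (the query strings, rh_id, and filename below are illustrative values, not from the repo):

queries = ['Ceuta (inmigrante OR migrante)',
           'Ceuta (ayudas OR pagas)']
query_params = {'date_since': '2021-05-20',
                'date_to': '2021-05-22',
                'lang': 'es'}
multiple_search(queries, query_params, rh_id='RH-0001', filename='example_tweets.csv')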
Example #5
def first_tweets_retrieval(filename):
    '''
    Function to do a tweet lookup using the Twitter API in search of the root tweets that are missing from several
    conversations (the tweet_id of the root tweet is the same as the conversation_id).
    :param filename: .csv file that contains the downloaded Twitter conversations with missing root tweets.
    '''
    searcher = Searcher(max_tweets=MAX_TWEETS)
    tweets = read_tweets(filename)
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)

    # Apparently, the Twitter API search by conversation_id does not retrieve the first tweet in multiple cases.
    # Flag conversations where every tweet is a reply, i.e. the root tweet itself is missing
    lookup_ids = tweets.groupby('conversation_id').apply(
        lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
    lookup_ids = lookup_ids[lookup_ids].index.tolist()
    print(
        f'Looking for {len(lookup_ids)} initial comments to add to the {tweets.shape[0]} existing tweets...'
    )
    searcher.tweet_lookup(lookup_ids, filename)
    while searcher.is_running:
        pass
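
A self-contained sketch of the flagging logic on a toy frame (the values are made up): a conversation is flagged exactly when every one of its tweets has a non-null in_reply_to_tweet_id, i.e. when the root tweet is absent.

import pandas as pd

tweets = pd.DataFrame({
    'conversation_id':      [1, 1, 2, 2],
    'tweet_id':             [1, 10, 20, 21],
    'in_reply_to_tweet_id': [None, 1.0, 2.0, 20.0],  # conversation 2 lacks its root tweet
})
flags = tweets.groupby('conversation_id').apply(
    lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
print(flags[flags].index.tolist())  # [2]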
Example #6
def expand_conversations(fact_checker):
    '''
    Expand the conversations in a fact checker's tweet file: download the full threads, recover missing
    root tweets, deduplicate, and re-map the racial hoax IDs onto the expanded data.
    '''
    filename = f'{fact_checker}_tweets.csv'
    tweets = read_tweets(filename)

    # Save backup file
    tweets.to_csv(os.path.join(DATA_PATH, filename[:-4] + '_unexpanded.csv'),
                  mode='w', index=False, quoting=csv.QUOTE_ALL)

    # Expand only conversations whose tweets either have replies (posterior thread) or are replies themselves (previous thread)
    ixs_interest = (tweets.reply_count > 0) | (tweets.conversation_id != tweets.tweet_id)
    expand_ids = tweets[ixs_interest][['conversation_id', 'rh_id']].drop_duplicates()
    print(f'\033[94m Expanding {expand_ids.shape[0]} conversations for {fact_checker} fact checker...\033[0m')

    # 'start_time' must be on or after 2006-03-21T00:00Z (Twitter constraint)
    query_params = {'lang': SOURCE_LANG[fact_checker],
                    'date_since': '2006-03-21',
                    'max_tweets': MAX_TWEETS,
                    'max_results': min(500, MAX_TWEETS)}
    searcher = Searcher()
    for _, row in expand_ids.iterrows():
        print(f'\033[94m Expanding {row.conversation_id} corresponding to {row.rh_id}...\033[0m')
        if row.conversation_id in EXCLUDE_CONV_IDS.get(fact_checker, []):
            print('\t+++ This conversation id has been manually excluded from the expansion')
        else:
            query_params['conversation_id'] = row.conversation_id
            searcher.run_query(query_params, filename, initial_header=False, rh_id=row.rh_id)
            while searcher.is_running:
                pass
            time.sleep(searcher.sleep_time)

    # Deduplicate retrieved tweets by tweet_id
    tweets = read_tweets(filename)
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)

    # Apparently, the Twitter API search by conversation_id does not retrieve the first tweet in multiple cases;
    # flag conversations whose every tweet is a reply, i.e. the root tweet itself is missing
    lookup_ids = tweets.groupby('conversation_id').apply(lambda s: not any(pd.isna(s.in_reply_to_tweet_id)))
    lookup_ids = lookup_ids[lookup_ids].index.tolist()

    if lookup_ids:
        print(f'\033[94m Looking for {len(lookup_ids)} initial comments to add to the {tweets.shape[0]} '
              f'existing tweets...\033[0m')
        searcher.tweet_lookup(lookup_ids, filename)
        while searcher.is_running:
            pass
    else:
        print('\033[94m All conversations have their initial comments...\033[0m')

    # Deduplicate tweets based on clean text (without user tags or URLs)
    tweets = read_tweets(filename)
    tweets = clean_duplicates(tweets)

    # Populate Racial Hoax ID for the starting tweets previously retrieved
    unexpanded_data = read_tweets(f'{fact_checker}_tweets_unexpanded.csv')
    mapper = unexpanded_data.drop_duplicates(subset='conversation_id')[['conversation_id', 'rh_id']]
    mapper = mapper.set_index('conversation_id').rh_id
    tweets.rh_id = tweets.conversation_id.map(mapper)

    # Save to file with the new data
    tweets.to_csv(os.path.join(DATA_PATH, filename), mode='w', index=False, quoting=csv.QUOTE_ALL)
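
clean_duplicates is a repo helper not shown here; judging by the comment above, it deduplicates on a normalized text. A minimal sketch under that assumption (the column name 'text' and the exact normalization are guesses):

def clean_duplicates(tweets):
    # Assumed behaviour: drop tweets whose text is identical once
    # user tags (@handle) and URLs are stripped
    cleaned = (tweets.text
               .str.replace(r'@\w+', '', regex=True)
               .str.replace(r'https?://\S+', '', regex=True)
               .str.strip())
    return tweets[~cleaned.duplicated()]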
Example #7
def run_url_tweets_strategy(url, fact_checker, rh_id):
    '''
    Some racial hoax articles contain several embedded tweets. This method extracts those tweet identifiers and
    performs a tweet lookup using the Twitter API.
    '''
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    links = soup.find_all("blockquote", attrs={"class": "twitter-tweet"})
    # embed_tweets = [re.sub('\s+pic\.twitter\..*\s?', ' ', item.find('p').text) for item in links]
    embed_tweet_ids = [re.sub(r'.*/status/(.*)\?.*', r'\g<1>', item.find_all('a')[-1]['href']) for item in links]
    filename = f'{fact_checker}_tweets.csv'
    # for i, (q, ix) in enumerate(zip(embed_tweets, embed_tweet_ids)):
    #     print(f'\t{i} --> {q} : {ix}')
    if embed_tweet_ids:
        print(f'\t+++ Searching for {len(embed_tweet_ids)} embedded tweet ids...')
        Searcher(max_tweets=MAX_TWEETS).tweet_lookup(embed_tweet_ids, filename=filename, rh_id=rh_id)
    else:
        print('\t+++ No embedded tweets were found...')
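
A quick self-contained check of the extraction regex (the href is a made-up embed link; the pattern assumes a query string follows the status ID, which Twitter embed links normally carry):

import re

href = 'https://twitter.com/user/status/1125295472908886016?ref_src=twsrc%5Etfw'
print(re.sub(r'.*/status/(.*)\?.*', r'\g<1>', href))  # 1125295472908886016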
Example #8
def tweets_search():
    '''
    In case the tweet_id is known a priori, this function implements the corresponding tweet lookup.
    '''
    Searcher(max_tweets=10).tweet_lookup(['1125295472908886016'], 'test.csv')
Example #9
class DownloadWindow(QtWidgets.QMainWindow):
    def __init__(self, *args, **kwargs):
        super(DownloadWindow, self).__init__(*args, **kwargs)

        self.setWindowTitle("Twitter Data Downloader")

        self.status = QtWidgets.QStatusBar(self)
        self.default_styleSheet = self.status.styleSheet()
        self.setStatusBar(self.status)

        self.printer = Printer(self.status)
        self.searcher = Searcher(self.printer)

        # -------------- SETTING THE LAYOUTS -------------- #
        gen_layout = get_layout(QtWidgets.QVBoxLayout())
        upper_layout = get_layout(QtWidgets.QHBoxLayout())
        bottom_layout = get_layout(QtWidgets.QHBoxLayout())

        self.add_query_grid(upper_layout)
        self.add_buttons(bottom_layout)

        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()),  15)
        gen_layout.addLayout(upper_layout,  40)
        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()), 10)
        gen_layout.addLayout(bottom_layout, 25)
        gen_layout.addLayout(get_layout(QtWidgets.QHBoxLayout()), 10)

        widget = QtWidgets.QWidget()
        widget.setLayout(gen_layout)

        # Set the central widget of the Window. Widget will expand
        # to take up all the space in the window by default.
        self.setCentralWidget(widget)

    def set_windows(self, windows):
        self.home_window = windows['home']

    def add_query_grid(self, parent_layout):
        # Hashtags, Keywords, Start Date (inclusive), End Date (inclusive), Admit Replies
        grid_layout = QtWidgets.QGridLayout()

        labels = []
        # Hashtags are concatenated with " OR " (The tweet only needs to contain one of the input hashtags)
        self.hashtags = QtWidgets.QTextEdit()
        helper_text = 'Hashtags (if multiple, enter a comma-separated sequence)\n' \
                      'Example: #hate, #stopMigration, #zeroBlack'
        self.hashtags.setPlaceholderText(helper_text)
        self.hashtags.setAcceptRichText(False)
        labels.append(QtWidgets.QLabel('HASHTAGS'))
        # All Keywords must appear in the tweet so they are concatenated with " AND "
        self.keywords = QtWidgets.QTextEdit()
        helper_text = 'Keywords (if multiple, enter a comma-separated sequence)\n' \
                      'Example: hate, migration, racist, black people'
        self.keywords.setPlaceholderText(helper_text)
        self.keywords.setAcceptRichText(False)
        labels.append(QtWidgets.QLabel('KEYWORDS'))
        # Both dates are inclusive (start-end)
        self.start_date = QtWidgets.QDateEdit(displayFormat='dd-MMM-yyyy', calendarPopup=True)
        self.start_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.start_date.dateChanged.connect(self.start_date_dateedit)
        self.start_date.setFixedWidth(200)
        self.start_date.setAlignment(Qt.AlignCenter)
        labels.append(QtWidgets.QLabel('START DATE'))
        self.end_date = QtWidgets.QDateEdit(displayFormat='dd-MMM-yyyy', calendarPopup=True)
        self.end_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.end_date.dateChanged.connect(self.end_date_dateedit)
        self.end_date.setFixedWidth(200)
        self.end_date.setAlignment(Qt.AlignCenter)
        labels.append(QtWidgets.QLabel('END DATE'))
        # Select the maximum number of tweets to be retrieved from twitter
        self.max_tweets = QtWidgets.QLineEdit()
        self.max_tweets.setPlaceholderText('Minimum value: 10, Maximum value: 2,000,000')
        self.max_tweets.setValidator(QtGui.QIntValidator())
        self.max_tweets.setAlignment(Qt.AlignCenter)
        self.max_tweets.setMinimumSize(300, 0)
        labels.append(QtWidgets.QLabel('MAXIMUM NUMBER OF TWEETS'))
        # Select the language of the tweets
        self.language = QtWidgets.QComboBox()
        self.language.addItems(AVAILABLE_LANGS.keys())
        labels.append(QtWidgets.QLabel('LANGUAGE'))

        label_height = 50
        for label in labels:
            label.setStyleSheet('background-color:#FAEB7C;color:#000000;font-weight: bold;')
            label.setAlignment(Qt.AlignCenter)
            label.setFixedHeight(label_height)

        # Add items to the grid
        grid_layout.addWidget(labels[0], 0, 0)
        grid_layout.addWidget(self.hashtags, 1, 0)
        grid_layout.addWidget(labels[1], 0, 1)
        grid_layout.addWidget(self.keywords, 1, 1)
        grid_layout.addWidget(labels[2], 2, 0)
        grid_layout.addWidget(self.start_date, 3, 0, Qt.AlignCenter)
        grid_layout.addWidget(labels[3], 2, 1)
        grid_layout.addWidget(self.end_date, 3, 1, Qt.AlignCenter)
        grid_layout.addWidget(labels[4], 4, 0)
        grid_layout.addWidget(self.max_tweets, 5, 0, Qt.AlignCenter)
        grid_layout.addWidget(labels[5], 4, 1)
        grid_layout.addWidget(self.language, 5, 1, Qt.AlignCenter)

        parent_layout.addLayout(grid_layout, Qt.AlignCenter)

    def add_buttons(self, layout):
        # Submit query, Clean Query, Last Query
        grid_layout = QtWidgets.QGridLayout()
        grid_layout.setHorizontalSpacing(20)
        grid_layout.setVerticalSpacing(20)

        size = (100, 40)
        submit_button = get_button(label='Submit', action=self.submit_query, size=size, shortcut=Qt.Key_Enter,
                                   color='6CFE87')
        cancel_button = get_button(label='Cancel', action=self.cancel_query, size=size, color='FF8585')
        clear_button = get_button(label='Clear', action=self.clear_fields, size=size, color='B9B9B9')
        back_button = get_button(label='Back', action=self.back_action, size=size, color='B9B9B9')

        grid_layout.addWidget(submit_button, 0, 0, Qt.AlignRight)
        grid_layout.addWidget(cancel_button, 0, 2, Qt.AlignLeft)
        grid_layout.addWidget(clear_button, 2, 0, Qt.AlignRight)
        grid_layout.addWidget(back_button, 2, 2, Qt.AlignLeft)
        layout.addLayout(grid_layout, stretch=1)

    def submit_query(self):
        if not self.searcher.is_running:
            # Clamp the requested number of tweets to the [10, 2000000] range (default 10)
            max_tweets = min(max(int(self.max_tweets.text()) if self.max_tweets.text() else 10, 10), 2000000)
            hashtags = list(filter(None, re.split(r',\s*', self.hashtags.toPlainText())))
            hashtags = [re.sub(r'\s', '', h if h.startswith('#') else '#' + h) for h in hashtags]
            keywords = re.split(r'\s*,\s*', self.keywords.toPlainText())
            query_params = {'hashtags': hashtags,
                            'keywords': keywords,
                            'date_since': self.start_date.dateTime().toString('yyyy-MM-dd'),
                            'date_to': self.end_date.dateTime().addDays(1).toString('yyyy-MM-dd'),
                            'lang': AVAILABLE_LANGS[self.language.currentText()],
                            'max_tweets': max_tweets,
                            'max_results': min(500, max_tweets)}
            self.hashtags.setText(' OR '.join(hashtags))
            self.keywords.setText(' OR '.join(keywords))
            self.searcher.run_query(query_params)
        else:
            self.printer.show_message('Already running another query', 1500, 'error')

    def cancel_query(self):
        self.printer.show_message('Cancel button clicked', 1000, 'success')

    def clear_fields(self):
        self.hashtags.clear()
        self.keywords.clear()
        self.start_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.end_date.setDateTime(QtCore.QDateTime.currentDateTime().addDays(-1))
        self.language.setCurrentIndex(0)
        self.max_tweets.clear()
        self.printer.show_message('Clear button clicked', 1000, 'success')

    def back_action(self):
        self.home_window.showMaximized()
        self.close()

    def start_date_dateedit(self):
        if self.start_date.dateTime() > self.end_date.dateTime():
            self.end_date.setDateTime(self.start_date.dateTime())

    def end_date_dateedit(self):
        if self.start_date.dateTime() > self.end_date.dateTime():
            self.start_date.setDateTime(self.end_date.dateTime())
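
For completeness, a hedged sketch of how this window might be launched; it assumes the PyQt5 bindings and that DownloadWindow is importable (Printer, Searcher, and the get_* helpers come from repo modules whose paths are not shown here):

import sys
from PyQt5 import QtWidgets

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    window = DownloadWindow()
    window.showMaximized()  # matches the navigation style used by back_action()
    sys.exit(app.exec_())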