Example #1
def get_tweets(word, begin_date, end_date='today', limit=None):
    """
    :return: tuple of (tweets data as a pandas DataFrame, list of Tweet objects)
    """
    if end_date == 'today':
        list_of_tweets = twitterscraper.query_tweets(word,
                                                     limit=limit,
                                                     begindate=begin_date)
    else:
        list_of_tweets = twitterscraper.query_tweets(word,
                                                     begindate=begin_date,
                                                     enddate=end_date)

    tweets_data = []
    for tweet in list_of_tweets:
        tweet_data = {}
        tweet_data['tweetID'] = tweet.tweet_id
        tweet_data['time'] = tweet.timestamp.strftime("%d-%b-%Y (%H:%M)")
        tweet_data['text'] = tweet.text
        tweet_data['likes'] = tweet.likes
        tweet_data['userID'] = tweet.username
        tweet_data['URL'] = tweet.tweet_url
        tweet_data['timestamp'] = tweet.timestamp

        tweets_data.append(tweet_data)

    tweets_data = pd.DataFrame(tweets_data)
    return tweets_data, list_of_tweets
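A minimal usage sketch for get_tweets above; the search term, dates, and limit are placeholders, and the module is assumed to import twitterscraper and pandas as pd as in the original project.

import datetime as dt

# Hypothetical call: one day of "bitcoin" tweets as a DataFrame plus the raw Tweet objects.
df, raw_tweets = get_tweets("bitcoin",
                            begin_date=dt.date(2020, 3, 1),
                            end_date=dt.date(2020, 3, 2),
                            limit=200)
print(df.head())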
Example #2
def main(event, context):
    #a = np.arange(15).reshape(3, 5)

    print("Access twitter data")
    #print(a)
    list_of_tweets = query_tweets("Modi", 5)

    #print the retrieved tweets to the screen:
    for tweet in query_tweets("Modi", 5):
        print("**************************")
        print(tweet)
Example #3
def get_tweets(user):
    ed = dt.date.today()
    bd = ed - dt.timedelta(days=1)
    tweets = query_tweets(user, limit=20, begindate=bd)
    if not tweets:
        bd = ed - dt.timedelta(days=20)
        tweets = query_tweets(user, limit=20, begindate=bd)
        if not tweets:
            return ["\n"]
    # return [str(i.text).replace('\\','') for i in tweets]
    return [
        re.sub(r'[^A-Za-z0-9#@ .,$&(){}\[\]]+', ' ', str(i.text)) for i in tweets
    ]
Example #4
def load_tweets(d, m, y, country, limit=10):
    flat_tweets1 = query_tweets(country + " coronavirus", limit,
                                dt.date(y, m, d))
    flat_tweets2 = query_tweets(country + " covid", limit, dt.date(y, m, d))
    flat_tweets3 = query_tweets(country + " covid19", limit, dt.date(y, m, d))

    tweets = []

    load_in_tweets(tweets, flat_tweets1)
    load_in_tweets(tweets, flat_tweets2)
    load_in_tweets(tweets, flat_tweets3)

    return tweets
Example #5
def tw_scraper(hashtag, start_date, end_date):
    '''
    The Twitter API caps searches at 72,000 tweets per hour.
    The twitterscraper module can scrape without that limitation,
    but the maximum number of tweets per search on my machine is around 13,000.

    This function recursively scrapes tweets for a given hashtag with
    twitterscraper over smaller time periods to avoid those constraints.

    Inputs:
        hashtag: (str) a hashtag keyword
        start_date: (str) start date ("yyyy-mm-dd")
        end_date: (str) end date ("yyyy-mm-dd")

    Output:
        df: (pandas DataFrame) tweets in the time period
    '''
    
    
    assert len(start_date) == 10, "Wrong start date format, (yyyy-mm-dd)"
    assert len(end_date) == 10, "Wrong end date format, (yyyy-mm-dd)"

    start_ls = list(map(int, start_date.split("-")))
    end_ls = list(map(int, end_date.split("-")))
    
    assert start_ls[0] > 1989 and start_ls[0] < 2021, "Wrong start year, range: 1990~2020"
    assert end_ls[0] > 1989 and end_ls[0] < 2021, "Wrong end year, range: 1990~2020"
    assert start_ls[1] > 0 and start_ls[1] < 13, "Wrong start month, range: 1~12"
    assert end_ls[1] > 0 and end_ls[1] < 13, "Wrong end month, range: 1~12"
    assert start_ls[2] > 0 and start_ls[2] < 32, "Wrong start day, range: 1~31"
    assert end_ls[2] > 0 and end_ls[2] < 32, "Wrong end day, range: 1~31"
    
    
    start_date = dt.date(*start_ls)
    end_date = dt.date(*end_ls)
    hashtag = "#"+hashtag
    
    if end_date > start_date+dt.timedelta(days=5):
        med_date = start_date + dt.timedelta(days=5)
        tweets = twitterscraper.query_tweets(hashtag, begindate=start_date, enddate=med_date)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        return pd.concat([df, tw_scraper(hashtag.lstrip("#"),
                                         (med_date + dt.timedelta(days=1)).isoformat(),
                                         end_date.isoformat())])
    
    else:
        tweets = twitterscraper.query_tweets(hashtag, begindate=start_date, enddate=end_date)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        return df
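A hedged usage sketch for tw_scraper: the two-week window below is split into 5-day chunks by the recursion described in the docstring. The hashtag, dates, and output filename are placeholders; twitterscraper, pandas as pd, and datetime as dt are assumed to be imported at module level as above.

# Hypothetical call: scrape #covid19 tweets over two weeks and archive the combined DataFrame.
df = tw_scraper("covid19", "2020-03-01", "2020-03-14")
df.to_csv("covid19_tweets.csv", index=False)
print(len(df), "tweets scraped")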
Example #6
def return_data():
    if request.method == 'POST':
        today = dt.date.today()
        yesterday = today - dt.timedelta(days=1)
        begin_date = yesterday
        end_date = today
        limit = 100
        lang = 'en'
        tweets = query_tweets("corona",
                              begindate=begin_date,
                              enddate=end_date,
                              limit=limit,
                              lang=lang)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        i = df.text.str.strip()
        j = df.timestamp
        l = random.randint(0, len(i) - 1)
        tweet = i[l]
        tweet_vector = vectorize.transform([tweet])
        sent = classifier.predict(tweet_vector).tolist()
        mytext = tweet
        language = 'en'
        myobj = gTTS(text=mytext, lang=language, slow=False)
        myobj.save("audio.mp3")

        return render_template('index.html', tw=tweet, time=j[l], sent=sent[0])
    else:
        return render_template('index.html')
Example #7
def harvest_words(keyword_to_harvest, stopwords, limit=2000):
    words = []
    # Decode the text to support swedish characters
    keyword_to_harvest = keyword_to_harvest.encode('utf-8')
    # Query twitter with the "keyword"
    for query in query_tweets(keyword_to_harvest,
                              lang='sv',
                              limit=limit,
                              poolsize=20,
                              begindate=datetime.date(
                                  2014,
                                  1,
                                  1,
                              )):
        # Split the result to get the words
        new_list = query.text.split(u' ')
        for word in new_list:
            try:
                # Skip the stopwords --> not interesting!
                if word.encode('utf-8').lower() not in stopwords:
                    words.append(word)
            except UnicodeError:
                print(word, " is not unicode")
    return words
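A small usage sketch for harvest_words with a placeholder Swedish keyword; since the function compares UTF-8-encoded, lower-cased words against the stopword set, the stopwords are passed as byte strings here.

# Hypothetical call: the keyword and stopword set are illustrative only.
swedish_stopwords = {b"och", b"att", b"det"}
words = harvest_words("midsommar", swedish_stopwords, limit=500)
print(len(words), "words collected")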
Example #8
def retrieve_tweets():
    retrieved = 0
    for tweet in query_tweets('from:python_tip exclude:replies'):
        res = process_tweet(tweet)
        if res:
            retrieved += 1
    print(f'Stored {retrieved} new tweets')
Example #9
def tweets_scraper_inner(begin_date, end_date, keyword, limit, lang="en"):
    """
    Using the twitterscraper API. Github source: https://github.com/taspinar/twitterscraper
    return: dataframe.
    """
    tweets = query_tweets(keyword,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)
    text, timestamp, likes, retweets, replies = [], [], [], [], []

    for tweet in tweets:
        text.append(tweet.text)
        timestamp.append(tweet.timestamp)
        likes.append(tweet.likes)
        retweets.append(tweet.retweets)
        replies.append(tweet.replies)

    tweets = pd.DataFrame({
        "text": text,
        "timestamp": timestamp,
        "likes": likes,
        "retweets": retweets,
        "replies": replies
    })

    # Don't need the exact h-m-s, cast it to date object.
    tweets['timestamp'] = tweets['timestamp'].apply(
        lambda x: str(x.date()))
    return tweets
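A one-day usage sketch for tweets_scraper_inner; the keyword and limit are placeholders, and query_tweets and pandas as pd are assumed to be imported as in the original module.

import datetime as dt

# Hypothetical call: English "bitcoin" tweets for a single day, capped at 1,000.
df = tweets_scraper_inner(dt.date(2020, 3, 1), dt.date(2020, 3, 2),
                          "bitcoin", limit=1000)
print(df.head())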
Example #10
def scrapyTweets(keywords, book_name_xls, sheet_name_xls, datanumber,
                 startdate):
    start = time()
    totalNum = 0
    data = []
    value_title = [
        [
            "username", "fullname", "user_id", "tweet_id", "tweet_url", "text",
            "timestamp", "replies", "retweets", "is_retweet",
            "retweeter_username", "retweet_id"
        ],
    ]
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    for tweet in query_tweets(keywords, datanumber)[:datanumber]:
        tempList = []
        if (checkTime(str(tweet.timestamp), startdate)):
            totalNum = totalNum + 1
            appendToList(tweet, tempList)
            data.append(tempList)
        if (len(data) > 5000):
            write_excel_xls_append(book_name_xls, data)
            data = []

    write_excel_xls_append(book_name_xls, data)

    end = time()
    totalTime = end - start

    return totalNum, totalTime
Example #11
def twscrape_search(query,
                    *,
                    count=None,
                    begindate=datetime.date(2006, 3, 21),
                    enddate=datetime.date.today(),
                    poolsize=20,
                    lang=''):
    result = query_tweets(query,
                          limit=count,
                          begindate=begindate,
                          enddate=enddate,
                          poolsize=poolsize,
                          lang=lang)

    data = ({
        'id': item.id,
        'text': merge_whitespaces(item.text),
        'timestamp': item.timestamp,
        'likes': item.likes,
        'retweets': item.retweets,
        'replies': item.replies,
        'url': item.url,
        'html': merge_whitespaces(item.html),
        'user': item.user,
        'fullname': merge_whitespaces(item.fullname)
    } for item in result)

    return sorted(data, key=lambda x: int(x['id']), reverse=True)
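A brief usage sketch for twscrape_search; the query is a placeholder, and the result is the newest-first list of dictionaries built above.

# Hypothetical call with an illustrative search term.
rows = twscrape_search("climate strike", count=200, lang="en")
if rows:
    print(rows[0]["timestamp"], rows[0]["text"])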
Example #12
def main():
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    try:
        parser = ArgumentParser(
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. This may take a "
                                 "while but also activates parallel tweet "
                                 "gathering. The number of tweets however, "
                                 "will be capped at around 100000 per 10 "
                                 "days.")
        args = parser.parse_args()

        if isfile(args.output):
            logging.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            tweets = query_all_tweets(args.query)
        else:
            tweets = query_tweets(args.query, args.limit)

        with open(args.output, "w") as output:
            dump(tweets, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logging.info("Program interrupted by user. Quitting...")
Example #13
def run():
    data = load_data()
    if not data:
        download_s3()
        data = load_data()
    if not data:
        raise Exception('something went horribly wrong')
    print 'fetching tweets from Twitter'
    scraper_tweets = query_tweets(TWITTER_QUERY, 100)
    new_tweets_by_id = {
        st.id: st
        for st in [SimpleTweet.from_scraper(rt) for rt in scraper_tweets]
    }
    old_count = len(data['tweets_by_id'])
    data['tweets_by_id'].update(new_tweets_by_id)
    new_count = len(data['tweets_by_id'])
    print 'updates: %d' % (new_count - old_count)
    if new_count == old_count:
        print 'nothing new, skipping save'
    else:
        print 'saving tweets locally'
        out = {
            'blacklist': data['blacklist'],
            'tweets': data['tweets_by_id'].values(),
        }
        with open(DATA_PATH, 'w') as output:
            json.dump(out, output)
        upload_s3()
Example #14
def twitterscraperSearch():
    with open("Alost_user.ndjson", "a", newline='') as output:
        list_of_tweets = ts.query_tweets(
            "Aalst (from:HLN_BE) until:2020-03-01 since:2020-02-14", 30)
        for tweet in list_of_tweets:
            json_data = json.dumps({
                "tweet_id":
                int(tweet.tweet_id),
                "screen_name":
                tweet.screen_name,
                "user_id":
                int(tweet.user_id),
                "created_at":
                str(tweet.timestamp
                    ),  # otherwise it stays a date object, which json does not recognize
                "entities.hashtags": [{
                    "text": x
                } for x in tweet.hashtags],
                "entities.user_mentions": [],
                "lang":
                getLang(tweet.text_html),
                "full_text":
                remove_html_tags(tweet.text_html)
            })
            output.writelines(json_data + "\n")
Example #15
def top_results(ipstr):
    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months

    limit = 400
    lang = "english"

    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)

    #df=pd.DataFrame(tweets,columns = ['screen_name','username','user id','tweet id','tweet url','timestamp','var','text','text html','links','hashtags','has media','img urls','video url','likes','retweets','replies','is replied','parent tweet id','reply to users'])
    df = pd.DataFrame(t.__dict__ for t in tweets)
    #df.sort_values(by=['likes','retweets','replies'], inplace=True, ascending=False)
    #removing unwanted columns
    df.drop(df.columns[[1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 17, 18, 19, 20]],
            axis=1,
            inplace=True)
    #df = df[df.likes >=0]
    #df.set_index('username',inplace=True)
    #df.drop_duplicates(subset ="screen_name", inplace = True)
    return df
Example #16
def scrape_twitter(track_list, limit, poolsize, begindate, enddate, loc_near, radius):
    """ 
        Parameters
        ----------
        track_list : list of strings

        limit : integer

        poolsize : integer
            Number of parallel processes TwitterScraper should initiate while scraping for your query

        begindate : date object

        enddate : date object
            
        loc_near : string
            The location from which the tweets should come. This has to be a city name
            followed by the abbreviated state, in the form "City,ST"
            ex) "Grand Junction,CO" or "New York,NY" or "Colorado Springs,CO"

        radius : int
            The radius (in miles) around the location from which the tweets should come
        
        Returns
        -------
        tweets : twitterscraper.tweet.Tweet objects
    """
    tweets = []
    string_query = make_string_query(track_list=track_list, loc_near=loc_near, loc_within_mi=radius)

    for tweet in query_tweets(query=string_query, limit=limit, poolsize=poolsize, begindate=begindate, enddate=enddate, lang="en"):
        tweets.append(tweet)
    print("Number of Tweets fround for " + loc_near + ": ", len(tweets))
    return tweets
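A usage sketch for scrape_twitter following the "City,ST" location format described in the docstring; the track list, dates, and radius are placeholders, and make_string_query is assumed to live in the same module.

import datetime as dt

# Hypothetical call: flu-related tweets near Grand Junction, CO over two weeks.
tweets = scrape_twitter(track_list=["flu", "fever"],
                        limit=500,
                        poolsize=10,
                        begindate=dt.date(2020, 3, 1),
                        enddate=dt.date(2020, 3, 15),
                        loc_near="Grand Junction,CO",
                        radius=25)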
Example #17
	def get_tweet(app):
		return query_tweets(app, limit=None,
							begindate=dt.date.today()-dt.timedelta(days=1), 
							enddate=dt.date.today(), 
							poolsize=20, 
							lang='en'
							)
Example #18
    def collectData():
        from twitterscraper import query_tweets
        import datetime as dt
        import json

        #Build query string with all marker variations specified
        query_str = ""
        for marker in self.markers:
            query_str+=marker+" "

        #Collect data
        tweets = query_tweets(query_str, begindate=self.start_time, enddate=self.end_time, limit=self.limit, lang=self.lang)


        #Convert data into format that is jsonable
        list_of_json_tweets = []
        for tweet in tweets:
                tweet_time = tweet.timestamp
                tweet.timestamp = tweet_time.strftime('%Y-%m-%d %H:%M:%S')
                list_of_json_tweets.append(vars(tweet))

        #Write output
        file_name = self.start_time.strftime("%Y-%m-%d")
        json_file = open(self.destination + file_name + ".json", "w")
        length_of_tweet_list = len(list_of_json_tweets)
        length_of_tweet_list-=1
        for index,item in enumerate(list_of_json_tweets):
            json.dump(item,jsonfile)
            if length_of_tweet_list != index:
                json_file.write(",")

        json_file.close()
Example #19
def scrapingTweets(since, until):
    """
    Scrapes Tweets within a date range and writes them to a csv file. Scrapes Tweet text, Tweet ID, and Timestamp
    :param since: When you want to start. Should be in YYYY-MM-DD format.
    :param until: When you want to stop. Should be in YYYY-MM-DD format.
    :return: Filename of csv with tweets scraped
    """
    # upper bound and lower bound of queries. sometimes it is necessary to rescrape certain dates because twitter will
    # start blocking your queries

    startDate = dt.datetime.strptime(since, '%Y-%m-%d').date()
    endDate = dt.datetime.strptime(until, '%Y-%m-%d').date()

    scraped_tweets_filename = "Coachella" + startDate.strftime(
        "%Y" + "-" + "%m" + "-" + "%d") + "_" + endDate.strftime(
            ("%Y" + "-" + "%m" + "-" + "%d"))
    # queries tweets and writes each tweet to file

    collected_tweets = twitterscraper.query_tweets("Coachella",
                                                   limit=None,
                                                   begindate=startDate,
                                                   enddate=endDate)

    with open(scraped_tweets_filename, 'w') as file:

        for tweet in collected_tweets:
            tweet_writer = csv.writer(file,
                                      delimiter=';',
                                      quoting=csv.QUOTE_ALL)
            tweet_writer.writerow([
                tweet.id,
                tweet.timestamp.strftime("%m/%d/%Y %H:%M:%S"),
                tweet.text.replace('\n', ' '), tweet.user
            ])

    return scraped_tweets_filename
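A usage sketch for scrapingTweets with placeholder dates in the YYYY-MM-DD format the docstring requires; the return value is the name of the semicolon-delimited file written above.

# Hypothetical call over an illustrative date range.
filename = scrapingTweets("2019-04-12", "2019-04-15")
print("wrote", filename)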
Example #20
def scrape(start_date, end_date, keyword="Bitcoin"):

    for dt in rrule(DAILY, dtstart=start_date, until=end_date):
        df = pd.DataFrame(columns=[
            'ID', 'Tweet', 'Time', 'User', 'Likes', 'Replies', 'Retweet'
        ])

        yesterday = dt - timedelta(days=1)
        tomorrow = dt + timedelta(days=1)

        begin = yesterday.date()
        end = tomorrow.date()

        print("{} {}".format(begin, end))
        list_of_tweets = query_tweets("Bitcoin",
                                      1000,
                                      begindate=begin,
                                      enddate=end)

        for tweet in list_of_tweets:
            df = df.append(
                {
                    'ID': tweet.id,
                    'Tweet': tweet.text,
                    'Time': tweet.timestamp,
                    'User': tweet.user,
                    'Likes': tweet.likes,
                    'Replies': tweet.replies,
                    'Retweet': tweet.retweets
                },
                ignore_index=True)

        df.to_csv("{}\extracted\{}.csv".format(keyword.lower(),
                                               dt.strftime('%Y-%m-%d'),
                                               index=False))
Example #21
 def recoleccion(self):
     from twitterscraper import query_tweets
     # Start collection with the language selected by the user
     print("=== RECOLECCIÓN TWEETSCRAPPING INICIADO")
     list_of_tweets = query_tweets(query=self.palabra,
                                   limit=self.limite,
                                   begindate=self.fecha_inicio,
                                   enddate=self.fecha_final,
                                   lang=self.idioma,
                                   poolsize=self.bins)
     # Transform the collection into a list of dictionaries
     print("Transformando recolección a diccionarios...")
     tweets = self.get_tweets_info(list_of_tweets)
     # Create DataFrame
     dataframe = pd.DataFrame(tweets)
     # Filter to columns of interest
     tweets_df = dataframe[["username", "timestamp", "text"]]
     # Standardize the column names
     tweets_df = tweets_df.rename(columns={
         "username": "******",
         "timestamp": "Date",
         "text": "Text"
     })
     # Delete the file so it can be rewritten
     if self.ruta_extraccion.exists():
         os.remove(self.ruta_extraccion)
     # Export the extraction to CSV
     print("Exportando CSV: extraccion_tweets.csv")
     tweets_df.to_csv(self.ruta_extraccion, index=None, header=True)
Example #22
    def collect_data(self):
        start_time = time.time()
        print("INFO: starting to gather tweets...")
        list_of_tweets = query_tweets(
            query='',
            limit=2000,
            begindate=datetime.date(2017, 2, 1),
            lang='en')  #get list of tweets from twitterscraper API
        print("INFO: -----TOTAL AMOUNT OF TWEETS: {length}-----".format(
            length=len(list_of_tweets)))
        print("INFO: execution time for gathering of tweets: {time} seconds".
              format(time=time.time() - start_time))
        print('')

        count = 0

        print('INFO: *****ADDING NEW TWEETS*****')
        for tweet in list_of_tweets:
            if tweet.id not in self.idtotweet:
                self.idtotweet[tweet.id] = tweet.text
                count += 1

        print('INFO: {cnt} new tweets added to the corpus!'.format(cnt=count))

        with open('read_from.json', 'w') as fw:
            json.dump(self.idtotweet, fw)

        for key in self.idtotweet:
            self.corpus.append(self.idtotweet[key])

        print('INFO: new length of permanent corpus: {twts}'.format(
            twts=len(self.idtotweet)))
Example #23
File: views.py Project: five6fo3/UnMask
def get_name(request):
    tweetList = []
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = NameForm(request.POST)
        # check whether it's valid:
        if form.is_valid():
            # process the data in form.cleaned_data as required
            # ...
            # redirect to a new URL:
            y_name = form.cleaned_data['your_name']

            for tweet in query_tweets("from:%s" % y_name, 7)[:7]:
                print(tweet.text.encode('utf-8'))
                tweetList.append(unicode(tweet.text).replace("'", ""))

            scoreboard(tweetList)
            with open('test_sentences.csv', 'rb') as f:
                reader = csv.reader(f)
                your_list = list(reader)
            deepmojiTokenizer()
            hateSpeechTokenizer()
            average = wrap()

            #return HttpResponseRedirect('/thanks/')
            return render(request, 'name.html', {'form': average})

    # if a GET (or any other method) we'll create a blank form
    else:
        form = NameForm()

    return render(request, 'name.html', {'form': form})
Example #24
def twload(ipstr):
    ytload(ipstr)
    conn = create_connection()
    create_tabletw(conn)

    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months

    limit = 500
    lang = "english"

    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)

    df = pd.DataFrame(t.__dict__ for t in tweets)
    df.sort_values(by=['likes', 'retweets', 'replies'],
                   inplace=True,
                   ascending=False)
    #removing unwanted columns
    df.drop(df.columns[[3, 4, 6, 8, 9, 10, 11, 12, 13, 17, 18, 19, 20]],
            axis=1,
            inplace=True)
    df = df[df.likes >= 0]
    df.set_index('username', inplace=True)

    df.insert(0, 'id', range(0, len(df)))

    df = analyse_sentiment(df, "text")
    #print(df['sentiment'])

    df.to_sql('twsentiment', conn, if_exists='replace')
Example #25
File: main.py Project: kosciak9/bazarki
def find_bazarki():
    """
    scrapes twitter for bazarki-like tweets
    returns list of them (tweet-like objects)

    use memoize=True to... memoize!
    """
    list_of_tweets = query_tweets(
        "#wybory2019 OR #WyboryParlamentarne2019 #bazarek",
        1,
        begindate=date(2019, 10, 11),
        enddate=date(2019, 10, 14),
    )

    bazarki = []
    # print the retrieved tweets to the screen:
    for tweet in list_of_tweets:
        regex_tester = r"pis.+\s[0-9]+"
        match = re.search(regex_tester, tweet.text, flags=re.I)
        if match:
            bazarki.append({
                "text": tweet.text,
                "url": "https://twitter.com" + tweet.tweet_url
            })

    print(f"znaleziono {len(bazarki)} bazarków")
    return bazarki
Example #26
def query_and_output(query,
                     output_name,
                     limit=None,
                     begindate=dt.date(2006, 3, 21),
                     enddate=dt.date.today(),
                     poolsize=20,
                     lang=''):
    # perform query
    tweets = query_tweets(query, limit, begindate, enddate, poolsize, lang)
    # construct string csv
    output = None
    if tweets:
        si = StringIO()
        cw = csv.writer(si)
        cw.writerow([
            "timestamp", "user", "fullname", "text", "hashtags", "id", "url",
            "retweets", "favorites", "replies"
        ])
        for x in tweets:
            # parse text for hashtags
            tag_set = set(re.findall(r'#\w+', x.text))
            tag_values = " ".join(tag_set)
            # add row for tweet in csv
            cw.writerow([
                x.timestamp, x.user, x.fullname, x.text, tag_values, x.id,
                x.url, x.retweets, x.likes, x.replies
            ])
        output = make_response(si.getvalue())
        output.headers[
            'Content-Disposition'] = 'attachment; filename=export.csv'
        output.headers['Content-type'] = 'text/csv'
    return output
Example #27
def tweets_search(request):
    if request.method == 'POST':
        key_groups = []
        username = request.POST.get('username')
        word_group = request.POST.get('group').split('&')

        for group in word_group:
            key_groups.append(group.split(','))

        start_date = request.POST.get('startDate').split('-')
        end_date = request.POST.get('endDate').split('-')
        keywords = request.POST.get('words').split(',')

        begin_date = f'{start_date[2]}-{start_date[1]}-{start_date[0]}'
        end_date = f'{end_date[2]}-{end_date[1]}-{end_date[0]}'

        tweets = []
        for tweet in query_tweets(
                query=
                f"{query_constructor(keywords, key_groups)} from:{username}",
                limit=40,
                poolsize=1,
                begindate=date.fromisoformat(begin_date),
                enddate=date.fromisoformat(end_date)):
            tweets.append(tweet.text)
        return render(request, 'portal/tweets_list.html', {'tweets': tweets})
    else:
        return render(request, 'portal/tweets_search.html')
Example #28
    def getTweets(self, query, begin_date, end_date, limit=None, n_jobs=2):
        """A function that takes a query and some dates and scrapes all tweets
        from Twitter that satisfy the given criteria.

        Args:
            query (str): The search string.
            begin_date (string): The start date that tweets are going to be
                scraped.
            end_date (string): The end date up to which tweets are going to be
                scraped.
            limit (int): A threshold on the number of tweets downloaded.
            n_jobs (int): The degree of parallelism.

        Returns:
            text_list (list): A list of the scraped tweets.
        """
        # Date format is DD-MM-YYYY (string)
        twitterscraper.query.HEADER = {
            'User-Agent': random.choice(twitterscraper.query.HEADERS_LIST)
        }
        b_day, b_month, b_year = begin_date.split('-')
        e_day, e_month, e_year = end_date.split('-')
        tweets = query_tweets(query,
                              begindate=dt.date(int(b_year), int(b_month),
                                                int(b_day)),
                              enddate=dt.date(int(e_year), int(e_month),
                                              int(e_day)),
                              limit=limit,
                              lang='en',
                              poolsize=n_jobs)
        text_list = [(str(t.timestamp.strftime('%d-%m-%Y')),
                      t.text.strip().replace('\n', ' ')) for t in tweets]
        return text_list
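A usage sketch for getTweets; the dates follow the DD-MM-YYYY convention noted in the method, and `scraper` is a placeholder for an instance of whatever class defines this method.

# `scraper` is a hypothetical instance of the class that owns getTweets.
texts = scraper.getTweets("bitcoin", "01-03-2020", "07-03-2020",
                          limit=500, n_jobs=4)
for day, text in texts[:3]:
    print(day, text)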
Example #29
def twitter_scraper(query, quantity=10, outfile='twitter.json'):
    '''

    @query, may contain the following pattern:

        'happy hour' -> Finds tweets: containing the exact phrase 'happy hour'.

        'love OR hate' -> Finds tweets: containing either 'love' or 'hate' (or both).

        'beer -root' -> Finds tweets: containing 'beer' but not 'root.'

        '#haiku' -> Finds tweets: containing the hashtag 'haiku.'

        'from:alexiskold' -> Finds tweets: sent from person 'alexiskold.'

        'to:techcrunch' -> Finds tweets: sent to person 'techcrunch.'

        '@mashable' -> Finds tweets: Referencing person 'mashable.'

        '"happy hour" near:"san francisco"' -> Finds tweets: containing the exact phrase 'happy hour' and sent near 'san francisco.'

        'near:NYC within:15mi' -> Finds tweets: sent within 15 miles of 'NYC.'

        'superhero since:2010-12-27' -> Finds tweets: containing 'superhero' and sent since date '2010-12-27' (year-month-day).

        'ftw until:2010-12-27' -> Finds tweets: containing 'ftw' and sent up to date '2010-12-27.'

        'movie -scary :)' -> Finds tweets: containing 'movie', but not 'scary,' and with a positive attitude.

        'flight :(' -> Finds tweets: containing 'flight' and with a negative attitude.

        'traffic ?' -> Finds tweets: containing 'traffic' and asking a question.

        'hilarious filter:links' -> Finds tweets: containing 'hilarious' and linking to URLs.

        'news source:twitterfeed' -> Finds tweets: containing 'news' and entered via TwitterFeed

    Note: any advanced query supported by https://twitter.com/search-advanced?lang=en,
          can be implemented by the 'query_tweets' function.

    '''

    # local variables
    tweets = []

    # structure tweets
    for tweet in query_tweets(query, int(quantity)):
        tweets.append({
            'text': tweet.text,
            'likes': tweet.likes,
            'retweets': tweet.retweets,
            'replies': tweet.replies,
            'user': tweet.user,
            'timestamp': str(tweet.timestamp)
        })

    # write to file
    if len(tweets):
        with open(outfile, 'w') as file:
            json.dump(tweets, file, indent=4)
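A usage sketch applying one of the advanced-query patterns listed in the docstring (exact phrase plus near:); the phrase, quantity, and output file are placeholders.

# Hypothetical call: tweets containing "happy hour" sent near San Francisco.
twitter_scraper('"happy hour" near:"san francisco"',
                quantity=50,
                outfile='happy_hour.json')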
Example #30
def request_twitter(company, start_date, end_date):
    query = company + ' -filter:retweets -filter:replies'
    delta = dt.timedelta(days=7)
    twts_raw = pd.DataFrame()
    twts_df = pd.DataFrame()
    EndDT = start_date + delta
    BeginDT = start_date
    while EndDT <= end_date:
    
        tweets = query_tweets(query, limit=None, begindate=BeginDT,
                              enddate=EndDT, lang='en')
        tweets_df = pd.DataFrame(t.__dict__ for t in tweets)
        twts_df = twts_df.append(tweets_df)
    
    
        if (end_date - EndDT) > delta:
            BeginDT = BeginDT + delta
            EndDT = EndDT + delta
        else:
            BeginDT = BeginDT + delta
            EndDT = end_date
            if BeginDT > EndDT:
                break
    twts_raw = twts_df        
    twts_df = twts_df.drop_duplicates(subset=['timestamp'])
    twts_df = twts_df.set_index('timestamp')
    twts_df = twts_df.sort_index()
    
    return twts_df, twts_raw
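A usage sketch for request_twitter over roughly one month, split into the weekly chunks built above; the company name and dates are placeholders, and datetime is assumed to be imported as dt.

import datetime as dt

# Hypothetical call: January 2020 tweets about Tesla, deduplicated and raw.
clean_df, raw_df = request_twitter("Tesla",
                                   dt.date(2020, 1, 1),
                                   dt.date(2020, 2, 1))
print(clean_df.shape, raw_df.shape)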
Example #31
def top_results(ipstr):
    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months

    limit = 100
    lang = "english"

    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)

    df = pd.DataFrame(t.__dict__ for t in tweets)
    df.sort_values(by=['likes', 'retweets', 'replies'],
                   inplace=True,
                   ascending=False)

    df.drop(df.columns[[2, 3, 5, 6, 8, 11, 13, 17, 18, 19, 20]],
            axis=1,
            inplace=True)
    df = df[df.likes >= 30]
    df.set_index('username', inplace=True)
    return (df)
Example #32
# In[2]:

from twitterscraper import query_tweets
import re
import nltk
from string import digits
from datetime import datetime


# In[146]:

time = []
# docker to store results of query_tweets for one search
docker = []
for tweet in query_tweets("bitcoin", 20)[:40]:
    docker.append(tweet)
    text = tweet.text.encode('utf-8').decode('utf-8')
    text = re.sub(r"http\s+","",text)
    testdata = text.encode('utf-8')
    time.append(tweet.timestamp)


# In[3]:

docker[0].timestamp


# In[4]:

time # docker[0].timestamp = time[0] ?
Example #33
File: News_pre_l.py Project: ZanW/Python
# In[251]:

from twitterscraper import query_tweets
import re
import nltk
from nltk.corpus import stopwords
from string import digits
import string
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer


# In[247]:

textData = []
for tweet in query_tweets("bitcoin%20since%3A2017-09-01%20until%3A2017-09-13", 10)[:40]:
    text = tweet.text.encode('utf-8').decode('utf-8')
    # remove all http links included in the twitter
    text = re.sub(r"http\S+", "",text)
   
    # remove all digits included in the twitter

    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)    
    textData.append(text)
    #print(tweet.text.encode('utf-8'))


# In[248]:

def preprocess_document(data):
Example #34
#####Scrape tweets from Twitter to classify
##Example: 
#1) Scrape tweets from Twitter that have #bucs, #buccaneers, or #siegetheday in their text and are in English
#2) Save these tweets as a row to a .csv file
import csv
import twitterscraper
with open('nflbucs.csv','a',newline = '',encoding='utf-8') as fil:
    writer = csv.writer(fil)
    for tweet in twitterscraper.query_tweets("%23bucs%20OR%20%23buccaneers%20OR%20%23siegetheday%20lang%3Aen%20include%3Aretweets", 1000):
        writer.writerow(tweet)


####Train classifier based on tweet data
#0) Load data and setup
from nltk.corpus import twitter_samples

##take a sample of data
twitter_samples.strings('positive_tweets.json')[1]
twitter_samples.strings('negative_tweets.json')[1]

##create function word_feats() to turn string into a dictionary
def word_feats(words):
    return dict([(word, True) for word in words])
 
 
#1) a) Tokenize tweets from sample data
#b) Use word_feats() to create a dictionary out of the tokenized words
#c) Create list variable of positive and negative features using the dictionary from (b) and append 'pos' or 'neg'
import nltk
posfeats = [(word_feats(nltk.TweetTokenizer(preserve_case = False).tokenize(row)),'pos') for row in twitter_samples.strings('positive_tweets.json')]
len(posfeats) #check length - equivalent to number of tweets
negfeats = [(word_feats(nltk.TweetTokenizer(preserve_case = False).tokenize(row)),'neg') for row in twitter_samples.strings('negative_tweets.json')]