Example #1
def main(args):
    """Main entry point allowing external calls

    Args:
      args ([str]): command line parameter list
    """
    args = parse_args(args)
    setup_logging(args.loglevel)
    _logger.info("Scrapping tweets for {0}".format(args.username))
    if args.username is not None and args.username.startswith("@"):
        if args.pages is not None:
            ts = TweetScrapperProfile(args.username, args.pages)
        else:
            ts = TweetScrapperProfile(args.username)
        tweets = ts.get_profile_tweets(False)
        for tweet in tweets:
            print(str(tweet))
        return tweets

    elif args.search_term is not None:
        if args.pages is not None:
            ts = TweetScrapperSearch(args.search_term, args.pages)
        else:
            ts = TweetScrapperSearch(args.search_term)
        tweets = ts.get_search_tweets(False)
        for tweet in tweets:
            print(str(tweet))
        return tweets

    else:
        raise ValueError(
            "No matching argument. Provide a twitter username eg. -u @5hirish or"
            " a twitter hashtag eg. -s #Python or any search term.")
Example #2
def test_time_interval_tweets(test_until, test_since, test_from, test_page):
    ts = TweetScrapperSearch(search_from_accounts=test_from,
                             search_till_date=test_until,
                             search_since_date=test_since,
                             num_tweets=test_page,
                             tweet_dump_path='twitter.csv',
                             tweet_dump_format='csv')

    tweet_count, tweet_id, last_time, dump_path = ts.get_search_tweets()

    since_timestamp = datetime.strptime(test_since, '%Y-%m-%d').timestamp()
    until_timestamp = datetime.strptime(test_until, '%Y-%m-%d').timestamp()

    assert os.path.exists(dump_path)

    assert tweet_count == pytest.approx(test_page, abs=5)

    with open(dump_path, 'r') as csv_fp:
        csv_reader = csv.DictReader(csv_fp)
        for tweets in csv_reader:
            assert tweets.get('id') is not None
            assert tweets.get('text') is not None
            assert tweets.get('time') is not None
            # tweet_time = int(tweets.get('time')) / 1000
            # assert until_timestamp >= tweet_time >= since_timestamp

    os.remove(dump_path)
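The arguments to test_time_interval_tweets above are not defined in this snippet; one plausible setup is pytest parametrization along the following lines (the decorator and the sample values are assumptions, not taken from the project's conftest):

import pytest

# Hypothetical parametrization; the project may instead supply these via fixtures.
@pytest.mark.parametrize("test_until, test_since, test_from, test_page",
                         [("2020-04-09", "2019-10-19", "5hirish", 40)])
def test_time_interval_tweets(test_until, test_since, test_from, test_page):
    ...  # test body as defined above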
Example #3
def test_hashtag_tweets(test_tag, test_page):

    ts = TweetScrapperSearch(search_hashtags=test_tag,
                             num_tweets=test_page,
                             tweet_dump_path='twitter.csv',
                             tweet_dump_format='csv')

    tweet_count, tweet_id, last_time, dump_path = ts.get_search_tweets()

    assert os.path.exists(dump_path)

    # assert tweet_count >= (test_page - 1) * 20
    assert tweet_count >= 5

    with open(dump_path, 'r') as csv_fp:
        csv_reader = csv.DictReader(csv_fp)
        for tweets in csv_reader:
            assert tweets.get('id') is not None
            assert tweets.get('text') is not None
            assert tweets.get('time') is not None
            assert tweets.get('hashtags') is not None
            assert len(ast.literal_eval(tweets.get('hashtags'))) > 0
            # extracted_hashtags = [ht.lower() for ht in ast.literal_eval(tweets.get('hashtags'))]
            # assert test_tag.lower() in extracted_hashtags

    os.remove(dump_path)
Example #4
def test_search_tweets(test_term, test_page):

    ts = TweetScrapperSearch(test_term, test_page)
    extracted_tweets = ts.get_search_tweets(False)
    assert len(extracted_tweets) > 0
    for tweets in extracted_tweets:
        assert tweets.get_tweet_id() is not None
        assert tweets.get_tweet_text() is not None
Example #5
def test_hashtag_tweets(test_tag, test_page):

    ts = TweetScrapperSearch(test_tag, test_page)
    extracted_tweets = ts.get_search_tweets(False)
    assert len(extracted_tweets) > 0
    for tweets in extracted_tweets:
        assert tweets.get_tweet_id() is not None
        assert tweets.get_tweet_text() is not None
        extracted_hashtags = [ht.lower() for ht in tweets.get_tweet_hashtags()]
        assert test_tag.lower() in extracted_hashtags
Example #6
def get_tweets(word, date, number):

    # Gather the tweets and export them to a CSV
    tweet_scrapper = TweetScrapperSearch(
        search_all=word,
        search_till_date=date,
        num_tweets=number,
        tweet_dump_path=f'./data/{date}_{word}_tweets.csv',
        tweet_dump_format='csv')
    tweet_count, tweet_id, tweet_time, dump_path = tweet_scrapper.get_search_tweets()

    # Read the CSV back in as a dataframe
    tweets = pd.read_csv(f'./data/{date}_{word}_tweets.csv')
    tweets = tweets.drop_duplicates()

    tweets = tweets.reset_index()

    return tweets
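A short usage sketch for this helper, with assumed argument values (the date format follows the YYYY-MM-DD strings used elsewhere on this page):

# Hypothetical call; the search word and tweet count are arbitrary choices,
# and a ./data/ directory is assumed to exist for the CSV dump.
df = get_tweets(word="python", date="2020-04-09", number=40)
print(df.shape)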
Example #7
def test_search_tweets(test_term, test_page):

    ts = TweetScrapperSearch(search_all=test_term,
                             num_tweets=test_page,
                             tweet_dump_path='twitter.csv',
                             tweet_dump_format='csv')

    tweet_count, tweet_id, last_time, dump_path = ts.get_search_tweets()

    assert os.path.exists(dump_path)

    # assert tweet_count > (test_page - 1) * 20
    assert tweet_count >= 5

    with open(dump_path, 'r') as csv_fp:
        csv_reader = csv.DictReader(csv_fp)
        for tweets in csv_reader:
            assert tweets.get('id') is not None
            assert tweets.get('text') is not None
            assert tweets.get('time') is not None

    os.remove(dump_path)
Example #8
def main(args):
    """Main entry point allowing external calls

    Args:
      args ([str]): command line parameter list
    """
    args = parse_args(args)
    setup_logging(args.loglevel)
    _logger.info("Scrapping tweets")

    if args.username is not None:

        ts = TweetScrapperProfile(username=args.username,
                                  num_tweets=args.pages,
                                  tweet_dump_path=args.tweet_dump_path,
                                  tweet_dump_format=args.tweet_dump_format,
                                  request_proxies=args.request_proxies)

        l_tweet_count, l_tweet_id, l_tweet_time, l_dump_path = ts.get_profile_tweets()
        summary = "Extracted {0} tweets till {1} at {2}".format(
            l_tweet_count, l_tweet_time, l_dump_path)
        print(summary)
        return summary

    else:

        ts = TweetScrapperSearch(
            search_all=args.search_all,
            search_exact=args.search_exact,
            search_any=args.search_any,
            search_excludes=args.search_excludes,
            search_hashtags=args.search_hashtags,
            search_from_accounts=args.search_from_accounts,
            search_to_accounts=args.search_to_accounts,
            search_mentions=args.search_mentions,
            search_near_place=args.search_near_place,
            search_till_date=args.search_till_date,
            search_since_date=args.search_since_date,
            num_tweets=args.pages,
            language=args.language,
            tweet_dump_path=args.tweet_dump_path,
            tweet_dump_format=args.tweet_dump_format,
            request_proxies=args.request_proxies)

        l_tweet_count, l_tweet_id, l_tweet_time, l_dump_path = ts.get_search_tweets()
        summary = "Extracted {0} tweets till {1} at {2}".format(
            l_tweet_count, l_tweet_time, l_dump_path)
        print(summary)
        return summary
Example #9
    def get_profile_tweets(self, save_output=False):
        output_file_name = '/' + self.username + '_profile'
        # Search Profile since: until: from:
        if self.username is not None and self.username != "":
            # self.update_request_url(self.__twitter_profile_timeline_url__)
            self.username = self.username.replace("@", "")
            tweet_count, last_tweet_id, last_tweet_time, dump_path = \
                self.execute_twitter_request(username=self.username,
                                             log_output=save_output,
                                             log_file=output_file_name)

            if self.pages == -1 or ((self.pages - 1) * 20) > tweet_count:  # ~20 tweets per page
                logger.info(
                    "Switching to search mode. Profile Limit exhausted")
                ts = TweetScrapperSearch(
                    search_from_accounts=self.username,
                    search_since_date=TweetScrapperSearch.twitter_from_date,
                    search_till_date=last_tweet_time)
                append_tweet_count, last_tweet_id, last_tweet_time, dump_path = ts.get_search_tweets(
                    save_output)
                tweet_count += append_tweet_count

            return tweet_count, last_tweet_id, last_tweet_time, dump_path
        return 0, 0, 0, output_file_name
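A rough reading of the fallback condition above, assuming roughly 20 tweets per profile page (the same per-page estimate implied by the (test_page - 1) * 20 checks in the tests):

# Worked example of the threshold; illustrative only, not part of the library.
pages = 5
expected_minimum = (pages - 1) * 20   # 80 tweets expected from the profile timeline
# If execute_twitter_request returns fewer than this, get_profile_tweets
# falls back to TweetScrapperSearch to fetch the remaining, older tweets.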
Example #10
def zoo(myreq, file_list):

    con = sql.connect("my.db")
    con.execute("DROP TABLE IF EXISTS third")
    con.execute(
        'create table if not exists third("id" integer primary key autoincrement,"username" text not null, "email" text not null, "score" integer not null)'
    )
    con.close()

    check = str(myreq)
    data = pd.read_csv("mbti_1.csv")

    # print(data.head(10))
    def get_types(row):
        t = row['type']

        I = 0
        N = 0
        T = 0
        J = 0

        if t[0] == 'I':
            I = 1
        elif t[0] == 'E':
            I = 0
        else:
            print('I-E incorrect')

        if t[1] == 'N':
            N = 1
        elif t[1] == 'S':
            N = 0
        else:
            print('N-S incorrect')

        if t[2] == 'T':
            T = 1
        elif t[2] == 'F':
            T = 0
        else:
            print('T-F incorrect')

        if t[3] == 'J':
            J = 1
        elif t[3] == 'P':
            J = 0
        else:
            print('J-P incorrect')
        return pd.Series({'IE': I, 'NS': N, 'TF': T, 'JP': J})

    data = data.join(data.apply(lambda row: get_types(row), axis=1))
    # print(data.head(5))

    # print ("Introversion (I) /  Extroversion (E):\t", data['IE'].value_counts()[0], " / ", data['IE'].value_counts()[1])
    # print ("Intuition (N) – Sensing (S):\t\t", data['NS'].value_counts()[0], " / ", data['NS'].value_counts()[1])
    # print ("Thinking (T) – Feeling (F):\t\t", data['TF'].value_counts()[0], " / ", data['TF'].value_counts()[1])
    # print ("Judging (J) – Perceiving (P):\t\t", data['JP'].value_counts()[0], " / ", data['JP'].value_counts()[1])

    b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}
    b_Pers_list = [{0: 'I', 1: 'E'},
                   {0: 'N', 1: 'S'},
                   {0: 'F', 1: 'T'},
                   {0: 'J', 1: 'P'}]

    def translate_personality(personality):
        # transform mbti to binary vector

        return [b_Pers[l] for l in personality]

    def translate_back(personality):
        # transform binary vector to mbti personality

        s = ""
        for i, l in enumerate(personality):
            s += b_Pers_list[i][l]
        return s

    # Check ...
    d = data.head(4)
    list_personality_bin = np.array([translate_personality(p) for p in d.type])
    # print("Binarize MBTI list: \n%s" % list_personality_bin)

    unique_type_list = [
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP',
        'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
    ]

    unique_type_list = [x.lower() for x in unique_type_list]

    # Lemmatize
    stemmer = PorterStemmer()
    lemmatiser = WordNetLemmatizer()

    # Cache the stop words for speed
    cachedStopWords = stopwords.words("english")

    def pre_process_data(data,
                         remove_stop_words=True,
                         remove_mbti_profiles=True):
        list_personality = []
        list_posts = []
        len_data = len(data)
        i = 0

        for row in data.iterrows():
            i += 1
            # if (i % 500 == 0 or i == 1 or i == len_data):
            # print("%s of %s rows" % (i, len_data))

            ##### Remove and clean comments
            posts = row[1].posts
            temp = re.sub(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                ' ', posts)
            temp = re.sub("[^a-zA-Z]", " ", temp)
            temp = re.sub(' +', ' ', temp).lower()
            if remove_stop_words:
                temp = " ".join([
                    lemmatiser.lemmatize(w) for w in temp.split(' ')
                    if w not in cachedStopWords
                ])
            else:
                temp = " ".join(
                    [lemmatiser.lemmatize(w) for w in temp.split(' ')])

            if remove_mbti_profiles:
                for t in unique_type_list:
                    temp = temp.replace(t, "")

            type_labelized = translate_personality(row[1].type)
            list_personality.append(type_labelized)
            list_posts.append(temp)

        list_posts = np.array(list_posts)
        list_personality = np.array(list_personality)
        return list_posts, list_personality

    list_posts, list_personality = pre_process_data(data,
                                                    remove_stop_words=True)

    # Posts to a matrix of token counts
    cntizer = CountVectorizer(analyzer="word",
                              max_features=1500,
                              tokenizer=None,
                              preprocessor=None,
                              stop_words=None,
                              max_df=0.7,
                              min_df=0.1)

    # Learn the vocabulary dictionary and return term-document matrix
    print("CountVectorizer...")
    # test2=""
    # with open('in.csv', encoding='ISO-8859-2') as f:
    #   for text in f.read().split("\n"):
    #	    test2+=str((word_tokenize(text)))
    X_cnt = cntizer.fit_transform(list_posts)

    # Transform the count matrix to a normalized tf or tf-idf representation
    tfizer = TfidfTransformer()

    print("Tf-idf...")
    # Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
    X_tfidf = tfizer.fit_transform(X_cnt).toarray()
    # feature_names = list(enumerate(cntizer.get_feature_names()))
    # print(feature_names)

    # ----------------------------------------------------------------LOOP-----------------------------------------------
    # f = open("myfile.txt", "r")
    # f1 = f.readlines()
    r = []
    i = 0
    for x in file_list:
        con = sql.connect("my.db")
        cur = con.cursor()
        cur.execute("select * from first where twitter=?", (x, ))
        ans = cur.fetchall()
        for y in ans:
            p = y[4]
            em = y[2]
            name = y[1]
        scorepre = p
        ema = em
        con.close()
        # con = sql.connect("my.db")
        #cur = con.cursor()
        use = x.rstrip('\n')
        tweet_scrapper = TweetScrapperSearch(search_from_accounts=use,
                                             search_till_date="2020-04-09",
                                             search_since_date="2019-10-19",
                                             num_tweets=40,
                                             tweet_dump_path=use + '1.csv',
                                             tweet_dump_format='csv')
        tweet_count, tweet_id, tweet_time, dump_path = tweet_scrapper.get_search_tweets()
        if tweet_count == 0:
            print("NO")
            score = scorepre
            r += [(score, name, ema)]
        else:
            print("YES", tweet_count)
            # print(tweet_scrapper.text())
            # print("Extracted {0} tweets till {1} at {2}".format(tweet_count, tweet_time, dump_path))

            df1 = pd.read_csv(
                use + "1.csv")  # read csv file and store it in a dataframe
            # df.to_csv('hrdata_modified.csv')
            my_columns = DataFrame(df1["text"])
            # add in.csv instead of x.txt
            # my_columns.drop([0])
            my_columns.to_csv(use + ".csv")

            my_posts = ""
            with open(use + '.csv', encoding='ISO-8859-2') as f:
                for text in f.read().split("\n"):
                    my_posts += str((word_tokenize(text)))

            # my_posts  = "Getting started with data science and applying machine learning has never been as simple as it is now. There are many free and paid online tutorials and courses out there to help you to get started. I’ve recently started to learn, play, and work on Data Science & Machine Learning on Kaggle.com. In this brief post, I’d like to share my experience with the Kaggle Python Docker image, which simplifies the Data Scientist’s life."
            mydata = pd.DataFrame(data={'type': ['INFJ'], 'posts': [my_posts]})

            my_posts, dummy = pre_process_data(mydata, remove_stop_words=True)

            my_X_cnt = cntizer.transform(my_posts)
            my_X_tfidf = tfizer.transform(my_X_cnt).toarray()

            param = {}
            param['n_estimators'] = 200
            param['max_depth'] = 2
            param['nthread'] = 8
            param['learning_rate'] = 0.2

            result = []
            type_indicators = [
                "IE: Introversion (I) / Extroversion (E)",
                "NS: Intuition (N) – Sensing (S)",
                "FT: Feeling (F) - Thinking (T)",
                "JP: Judging (J) – Perceiving (P)"
            ]

            # Let's train type indicator individually
            X = X_tfidf
            for l in range(len(type_indicators)):
                print("%s ..." % (type_indicators[l]))

                Y = list_personality[:, l]

                # split data into train and test sets
                seed = 7
                test_size = 0.33
                X_train, X_test, y_train, y_test = train_test_split(
                    X, Y, test_size=test_size, random_state=seed)

                # fit model on training data
                model = XGBClassifier(**param)
                model.fit(X_train, y_train)

                # make predictions for my  data
                y_pred = model.predict(my_X_tfidf)
                # predictions = [round(value) for value in y_pred]
                # accuracy = accuracy_score(y_test, predictions)
                # print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
                result.append(y_pred[0])
            # print("* %s prediction: %s" % (type_indicators[l], y_pred))
            print("The result is: ", translate_back(result))

            x = translate_back(result)
            score = scorepre
            for p in check:
                if p in x:
                    score = score + 10
            print("MY SCORE----------", score)

            r += [(score, name, ema)]

    print("SORTED TWITTER IDs BASED ON THE SCORE")

    def Sort_Tuple(tup):
        # Sort by the first element of each tuple (the score), highest first
        return sorted(tup, key=lambda x: x[0], reverse=True)

    p = Sort_Tuple(r)
    for item in p:
        m.insertthird(item[1], item[2], item[0])
    con = sql.connect("my.db")
    cur = con.cursor()
    #x = '@EvaNamratha'
    cur.execute("select * from third")
    ans3 = cur.fetchall()
    #con.close()
    return ans3
#print("Tf-idf...")
# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
X_tfidf = tfizer.fit_transform(X_cnt).toarray()
#feature_names = list(enumerate(cntizer.get_feature_names()))
#print(feature_names)

#----------------------------------------------------------------LOOP-----------------------------------------------
f = open("id.txt", "r")
f1 = f.readlines()
r = []
i = 0
for x in f1:
    use = x.rstrip('\n')
    tweet_scrapper = TweetScrapperSearch(search_from_accounts=use,
                                         search_till_date="2020-03-10",
                                         search_since_date="2019-10-19",
                                         num_tweets=40,
                                         tweet_dump_path=use + '1.csv',
                                         tweet_dump_format='csv')
    tweet_count, tweet_id, tweet_time, dump_path = tweet_scrapper.get_search_tweets()
    #print(tweet_scrapper.text())
    #print("Extracted {0} tweets till {1} at {2}".format(tweet_count, tweet_time, dump_path))

    df1 = pd.read_csv(use +
                      "1.csv")  # read csv file and store it in a dataframe
    #df.to_csv('hrdata_modified.csv')
    my_columns = DataFrame(df1["text"])
    #add in.csv instead of x.txt
    #my_columns.drop([0])
    my_columns.to_csv(use + ".csv")