def label_data(tweets_file: str, output_file: str, num_samples: int, start_index: int):
    """Interactively labels tweets and writes the labeled samples to a gzipped JSON file."""
    dataset: List[Dict[str, Any]] = []

    for tweet_id, tweet in enumerate(get_tweets(tweets_file)):
        # Skip tweets before the starting index and stop once enough samples are collected
        if tweet_id < start_index:
            continue

        if len(dataset) >= num_samples:
            break

        print('Tweet {0}: {1}'.format(tweet_id, tweet.original))

        # Prompt until the user enters an integer
        while True:
            try:
                print('Enter a label: ', end=' ')
                label = int(input().strip())
                break
            except ValueError:
                pass

        # Only labels 0, 1, and 2 are kept; any other integer skips the tweet
        if label in (0, 1, 2):
            dataset.append(
                dict(tweet=tweet.original, label=label, url=tweet.url))

        print('Dataset Size: {0}'.format(len(dataset)))
        print('==========')

    write_as_json_gz(dataset, output_file)
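# The read_as_json_gz / write_as_json_gz helpers used in this file are defined
# elsewhere in the repo. The sketch below is a minimal, hypothetical version
# assuming they simply gzip a JSON dump; the real helpers may differ.
import gzip
import json
from typing import Any, Dict, List


def write_as_json_gz_sketch(dataset: List[Dict[str, Any]], output_file: str):
    # Serialize the labeled samples and compress them with gzip
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:
        json.dump(dataset, f)


def read_as_json_gz_sketch(input_file: str) -> List[Dict[str, Any]]:
    # Decompress and parse the gzipped JSON file back into a list of dicts
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:
        return json.load(f)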
def analyze_sentiments(input_file: str):
    """Computes TextBlob sentiment scores for every tweet and clusters the results."""
    sample_dict: Dict[int, SentimentTuple] = dict()
    polarities: List[float] = []
    subjectivities: List[float] = []

    for index, tweet in enumerate(get_tweets(input_file)):
        # get_tweets yields tweet objects, so pass the cleaned text field
        # (rather than the object itself) to TextBlob
        text = TextBlob(tweet.cleaned)
        sentiment = text.sentiment

        sample_dict[index] = SentimentTuple(text=tweet.cleaned,
                                            polarity=sentiment.polarity,
                                            subjectivity=sentiment.subjectivity)
        polarities.append(sentiment.polarity)
        subjectivities.append(sentiment.subjectivity)

    print('==== Sentiment Statistics ====')
    print('Average polarity: {0}'.format(np.average(polarities)))
    print('Std polarity: {0}'.format(np.std(polarities)))
    print('Average subjectivity: {0}'.format(np.average(subjectivities)))
    print('Std subjectivity: {0}'.format(np.std(subjectivities)))
    print('==============================')

    print('==== Clustering Based On Polarity ====')
    cluster(sample_dict, mode='polarity')
    print('======================================')

    print('==== Clustering Based on Subjectivity ====')
    cluster(sample_dict, mode='subjectivity')
    print('==========================================')
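# The cluster() helper is defined elsewhere. As a rough illustration, the sketch
# below clusters samples on a single sentiment dimension with scikit-learn's
# KMeans; the number of clusters and the printed summary are assumptions, not
# the repository's actual implementation.
import numpy as np
from sklearn.cluster import KMeans


def cluster_sketch(sample_dict, mode: str = 'polarity', num_clusters: int = 3):
    indices = list(sample_dict.keys())
    # Use the requested sentiment dimension as a single-feature input
    values = np.array([getattr(sample_dict[i], mode) for i in indices]).reshape(-1, 1)

    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(values)

    # Print the centroid and a few example tweets for each cluster
    for cluster_id in range(num_clusters):
        members = [i for i, lbl in zip(indices, kmeans.labels_) if lbl == cluster_id]
        center = kmeans.cluster_centers_[cluster_id][0]
        print('Cluster {0} (center {1:.3f}): {2} samples'.format(cluster_id, center, len(members)))
        for i in members[:3]:
            print('  {0}'.format(sample_dict[i].text))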
def get_dataset(tweets_path: str,
                labeled_data_paths: List[str]) -> Tuple[np.ndarray, np.ndarray, CountVectorizer]:
    # Create the tweet vectorizer using the full dataset
    tweets = [t.cleaned for t in get_tweets(tweets_path)]
    vectorizer, _ = count_vectorize(tweets, min_df=0.01)

    # Lists to hold inputs and outputs
    X: List[np.ndarray] = []
    y: List[int] = []

    # Fetch labeled tweets
    label_counter: Counter = Counter()
    labeled_tweets: Iterable[Dict[str, Any]] = chain(
        *(read_as_json_gz(path) for path in labeled_data_paths))

    for tweet_dict in labeled_tweets:
        cleaned_tweet: CleanedTweet = clean_tweet(tweet_dict['tweet'])
        input_features = vectorizer.transform([cleaned_tweet.text]).toarray()[0]

        label = int(tweet_dict['label'])
        label_counter[label] += 1

        X.append(input_features)
        y.append(label)

    print('Count distribution: 0 -> {0}, 1 -> {1}, 2 -> {2}'.format(
        label_counter[0], label_counter[1], label_counter[2]))

    return np.array(X), np.array(y), vectorizer
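# count_vectorize() is imported from elsewhere in the repo. A minimal sketch of
# what it likely does, assuming it wraps scikit-learn's CountVectorizer and
# returns both the fitted vectorizer and the document-term matrix:
from typing import Any, List, Tuple

from sklearn.feature_extraction.text import CountVectorizer


def count_vectorize_sketch(texts: List[str], min_df=1) -> Tuple[CountVectorizer, Any]:
    # Fit a bag-of-words vectorizer on the full corpus; min_df filters rare tokens
    # (an int is a document count, a float in (0, 1) is a document fraction)
    vectorizer = CountVectorizer(min_df=min_df)
    features = vectorizer.fit_transform(texts)
    return vectorizer, features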
def label_dataset(tweets_path: str, model: Any, vectorizer: CountVectorizer, output_file: str):
    labeled_dataset: List[Dict[str, Any]] = []

    for tweet in get_tweets(tweets_path):
        features = vectorizer.transform([tweet.cleaned]).toarray()
        label = model.predict(features)[0]
        labeled_dataset.append(
            dict(tweet=tweet.original, label=int(label), url=tweet.url))

    write_as_json_gz(labeled_dataset, output_file)
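# One possible end-to-end usage of get_dataset() and label_dataset(): train a
# simple classifier on the hand-labeled samples and then label the full tweet
# file. The file paths and the choice of LogisticRegression are illustrative
# assumptions, not values taken from this repository.
from sklearn.linear_model import LogisticRegression


def train_and_label_example():
    X, y, vectorizer = get_dataset('tweets.csv', ['labeled_0.json.gz', 'labeled_1.json.gz'])

    # Fit a basic multiclass classifier on the labeled features
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)

    # Use the trained model to label every tweet in the corpus
    label_dataset('tweets.csv', model, vectorizer, 'auto_labeled.json.gz')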
def topic_model(input_path: str, num_topics: int, num_words: int):
    """Fits an LDA topic model to the tweets and prints the top words per topic."""
    # Vectorize the cleaned tweet text (count_vectorize expects strings, not tweet objects)
    tweets = [t.cleaned for t in get_tweets(input_path)]
    vectorizer, features = count_vectorize(tweets)
    vocab = vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.2

    # Fit the topic model
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(features)

    # Print the highest-weighted words for each topic
    for index, component in enumerate(lda.components_):
        top_indices = np.argsort(component)[::-1][:num_words]
        topic_tokens = [vocab[i] for i in top_indices]
        print('Topic {0}: {1}'.format(index, ' '.join(topic_tokens)))
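# clean_tweet() and the CleanedTweet type referenced by get_dataset() above are
# defined elsewhere. A hypothetical sketch, assuming CleanedTweet wraps the
# normalized text and that cleaning lowercases the tweet and strips URLs,
# mentions, and extra whitespace:
import re
from collections import namedtuple

CleanedTweetSketch = namedtuple('CleanedTweetSketch', ['text'])


def clean_tweet_sketch(raw_text: str) -> CleanedTweetSketch:
    text = raw_text.lower()
    # Remove URLs and @mentions, then collapse repeated whitespace
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return CleanedTweetSketch(text=text)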
""" CRON JOB 2 """ from tweet_utils import get_tweets topic = "Full_Stack" keyword_list = ["frontend", "backend", "fullstack"] limit = 15 get_tweets(topic, keyword_list, limit)