def preprocess_tweets(nouns_only=False, pr=None):
    """
    Preprocesses Tweets for use in the tf-idf approach.

    Collects every distinct user that has non-null Tweet text, preprocesses
    each user's texts, and persists them as PreprocessedText rows inside a
    single transaction at the end.

    :param nouns_only: Optional. Default: False. Whether only nouns should be used.
    :param pr: Optional. Can be used to record progress for large datasets.
    :return: Nothing; results are stored as PreprocessedText instances.
    """
    users = set(
        Tweet.objects.filter(text__isnull=False)
        .select_related("user__name")
        .values_list("user__name", flat=True)
    )
    total = len(users)
    nlp = get_spacy_model(lang="en")
    pending = []
    for idx, user in enumerate(users, start=1):
        if pr is None:
            print(f"--> Processing Tweets of user {user} ({idx} of {total}) <--")
        else:
            pr.set_progress(idx, total, description=f"Processing Tweets of user {user}.")
        user_tweets = Tweet.objects.filter(text__isnull=False, user__name=user)
        raw_texts = [t.text for t in user_tweets if t.text]
        if not raw_texts:
            # Nothing usable for this user; skip without creating a record.
            continue
        processed = preprocess(raw_texts, nouns_only=nouns_only, nlp=nlp)
        record, _ = PreprocessedText.objects.get_or_create(name=user, nouns_only=nouns_only)
        record.texts = processed
        pending.append(record)
    # Persist all updated records atomically so a partial run leaves no
    # half-written state behind.
    with transaction.atomic():
        for record in pending:
            record.save()
def assign_kws_to_tweets_external(clusters: "list[ClusterResult]", name: str, lang="en", pr=None):
    """
    Assigns KWs to Tweets, uses the database.

    For each cluster result, every associated Tweet is re-preprocessed,
    vectorized with the stored tf-idf vectorizer, mapped onto a cluster by
    the stored clusterer, and linked to that cluster's keywords in a
    KlangRelationsFix structure, which is finally saved as a KlangInput.

    :param clusters: ClusterResult objects representing the topic clusters.
    :param name: The name of the output.
    :param lang: The language of the texts.
    :param pr: Can be used to record progress for long running tasks.
    :return: Nothing, results are stored in database as KlangInput instances.
    """
    nlp = get_spacy_model(lang=lang)
    relations = KlangRelationsFix()
    for i, cluster_obj in enumerate(clusters):
        if pr is not None:
            pr.set_progress(i + 1, len(clusters), description=f"Assigning keywords to Tweets.")
        kws = cluster_obj.keywords
        tweets = cluster_obj.texts.get_tweets()
        tfidf = cluster_obj.vectorizer
        clusterer = cluster_obj.clusterer
        for tweet in tweets:
            text = preprocess([tweet.text], nlp=nlp)
            text = tfidf.transform(text)
            cluster_id = clusterer.predict(text)[0]
            try:
                cluster_elems = kws[cluster_id].split(",")
            except IndexError:
                # Predicted cluster has no keyword entry; skip this tweet.
                continue
            # Fractional year: January maps to year + 0.0, December to
            # year + 11/12 (hoisted out of the keyword loop; value identical).
            d = tweet.created_at.year + ((tweet.created_at.month / 12) - 1 / 12)
            for elem in cluster_elems:
                relations.add_unquantified("hasKeyword", tweet.pk, elem, d)
    relations.relations = default_to_regular(relations.relations)
    # TODO: clear out old clusters m2m?
    try:
        m = KlangInput.objects.get(name=name)
    except ObjectDoesNotExist:
        m = KlangInput(name=name)
    m.model = relations
    m.save()
    m.clusters.clear()
    m.clusters.add(*clusters)
def preprocess(texts, nlp=None):
    """
    Preprocesses texts (nouns only) for the tf-idf approach.

    NOTE(review): a second ``preprocess`` definition with a ``nouns_only``
    parameter appears later in this module; whichever is defined last shadows
    the other — confirm which version is intended to survive.

    :param texts: A list of texts; falsy entries are dropped.
    :param nlp: Optional. The spaCy model used to tokenize; defaults to the
        English model when None.
    :return: The preprocessed texts, one space-joined string per input text.
    """
    texts = [x for x in texts if x]
    if nlp is None:
        nlp = get_spacy_model(lang="en")
    t = []
    for i, doc in enumerate(nlp.pipe(texts, batch_size=50)):
        if i > 0 and i % 100 == 0:
            print(f"--> Preprocessing text {i + 1} out of {len(texts)} <--")
        d = []
        for sentence in doc.sents:
            for token in sentence:
                term = process_term(token, nouns_only=True)
                # Skip empty results (the original indexed term[0] and would
                # raise IndexError on "") and #hashtag# placeholder tokens.
                if not term or (term.startswith("#") and term.endswith("#")):
                    continue
                d.append(term)
        t.append(" ".join(d))
    return t
def preprocess_tweets_ex(user, lang="en", nlp=None):
    """
    Preprocess external Tweets.

    :param user: The screen name of the user.
    :param lang: Optional. Default: "en". The language of the Tweets.
    :param nlp: Optional. Default: None. The spaCy model to use.
    :return: A PreprocessedText representing the texts of the user.
    :raises ValueError: If the user has no Tweets stored.
    """
    # Materialize once so the queryset is evaluated a single time.
    tweets = list(TweetEx.objects.filter(user__screen_name=user))
    if not tweets:
        # min()/max() below would raise a cryptic "empty sequence" ValueError;
        # fail with an explicit message instead.
        raise ValueError(f"No Tweets found for user {user!r}.")
    start = min(x.created_at for x in tweets)
    end = max(x.created_at for x in tweets)
    if nlp is None:
        nlp = get_spacy_model(lang=lang)
    texts = preprocess([x.text for x in tweets if x.text], nlp=nlp)
    m, _created = PreprocessedText.objects.get_or_create(
        name=user, nouns_only=True, external=True, start=start, end=end
    )
    m.texts = texts
    m.save()
    return m
def preprocess(texts, nouns_only=False, nlp=None):
    """
    Preprocesses texts used in the tf-idf approach.
    Removes hashtags, URLs, numbers, ...

    NOTE(review): another ``preprocess`` definition without the ``nouns_only``
    parameter appears earlier in this module; whichever is defined last
    shadows the other — confirm which version is intended to survive.

    :param texts: A list of texts; falsy entries are dropped.
    :param nouns_only: Optional. Default: False. Whether only nouns should be kept.
    :param nlp: The spaCy object used to split the texts into tokens;
        defaults to the English model when None.
    :return: The texts, but preprocessed.
    """
    texts = [x for x in texts if x]
    if nlp is None:
        nlp = get_spacy_model(lang="en")
    t = []
    for i, doc in enumerate(nlp.pipe(texts, batch_size=50)):
        if i % 100 == 0:
            print(f"--> Preprocessing text {i + 1} out of {len(texts)} <--")
        d = []
        for sentence in doc.sents:
            for token in sentence:
                term = process_term(token, nouns_only=nouns_only)
                # Skip empty results (the original indexed term[0] and would
                # raise IndexError on "") and #hashtag# placeholder tokens.
                if not term or (term.startswith("#") and term.endswith("#")):
                    continue
                d.append(term)
        t.append(" ".join(d))
    return t
def assign_kws_to_tweets(nouns_only): """ tf-idf approach for all Twitter users. Creates Klang output that can be used as input for Klang. :param nouns_only: Whether only nouns should be used. :return: Nothing. Writes a .pickle field containing the relations that can be used as input for the Klang algorithm to the disk. """ users = sorted( set( Tweet.objects.filter( text__isnull=False).select_related("user__name").values_list( "user__name", flat=True))) # users = ["London Business School"] nlp = get_spacy_model(lang="en") relations = KlangRelationsFix() for i, user in enumerate(users): print(f"{i+1} out of {len(users)}") tweets = Tweet.objects.filter(text__isnull=False, user__name=user) try: r = ClusterResult.objects.get(name=user, nouns_only=nouns_only) except ObjectDoesNotExist: continue clusters = r.keywords tfidf = r.vectorizer tfidf.tokenizer = tokenize_only m = r.clusterer # data_dir_path = os.path.join(DATA_DIR, "CLUSTER", "tweets", f"{user}") # try: # with open(os.path.join(data_dir_path, f"keywords_TWEETS_{user}_{nouns_only}.p"), "rb") as f: # clusters = dill.load(f) # except FileNotFoundError: # continue # with open(os.path.join(data_dir_path, f"tfidf_{user}_{nouns_only}.p"), "rb") as f: # tfidf = dill.load(f) # tfidf.tokenizer = tokenize_only # with open(os.path.join(data_dir_path, f"mbatch_{user}_{nouns_only}.p"), "rb") as f: # m = dill.load(f) for tweet in tweets: text = preprocess([tweet.text], nlp=nlp) text = tfidf.transform(text) cluster_id = m.predict(text)[0] try: cluster_elems = clusters[cluster_id].split(",") except IndexError: continue for elem in cluster_elems: month = (tweet.created_at.month / 12) - 1 / 12 year = tweet.created_at.year d = year + month relations.add_unquantified("hasKeyword", tweet.pk, elem, d) # relations.add_unquantified("userHasKeyword", user, elem, tweet.created_at.year) relations.relations = default_to_regular(relations.relations) data_dir = os.path.join(DATA_DIR, "KLANG") # with open(os.path.join(data_dir, 
f"titaness_cluster_all_{nouns_only}NEW_FIX.p"), "wb") as f: # dill.dump(relations.relations, f) with open( os.path.join(data_dir, f"titaness_cluster_all_{nouns_only}__THESIS.p"), "wb") as f: dill.dump(relations.relations, f)