Example #1
def preprocess_tweets(nouns_only=False, pr=None):
    """
    Preprocesses Tweets for use in the tf-idf approach.
    :param nouns_only: Optional. Default: False. Whether only nouns should be used.
    :param pr: Optional. Can be used to record progress for large datasets.
    :return: Nothing. The resulting PreprocessedText objects are saved to the database.
    """
    # values_list() performs the join itself, so no select_related() is needed
    # (a non-relational path like "user__name" would raise FieldError there).
    users = set(Tweet.objects.filter(text__isnull=False)
                .values_list("user__name", flat=True))
    objs_update = []
    nlp = get_spacy_model(lang="en")
    for i, user in enumerate(users):
        if pr is not None:
            pr.set_progress(i + 1, len(users), description=f"Processing Tweets of user {user}.")
        else:
            print(f"--> Processing Tweets of user {user} ({i + 1} of {len(users)}) <--")
        tweets = Tweet.objects.filter(text__isnull=False, user__name=user)
        texts = [x.text for x in tweets if x.text]
        if not texts:
            continue
        texts = preprocess(texts, nouns_only=nouns_only, nlp=nlp)
        m, _created = PreprocessedText.objects.get_or_create(name=user, nouns_only=nouns_only)
        m.texts = texts
        objs_update.append(m)
    with transaction.atomic():
        for obj in objs_update:
            obj.save()
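A minimal usage sketch for the progress hook: ConsoleProgress below is a hypothetical stand-in for any object exposing a set_progress(current, total, description=...) method, such as the ProgressRecorder from celery-progress inside a Celery task.

# Usage sketch. ConsoleProgress is hypothetical; any recorder with a
# set_progress(current, total, description=...) method works here.
class ConsoleProgress:
    def set_progress(self, current, total, description=""):
        print(f"[{current}/{total}] {description}")

preprocess_tweets(nouns_only=True, pr=ConsoleProgress())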
Example #2
def assign_kws_to_tweets_external(clusters: list[ClusterResult],
                                  name: str,
                                  lang="en",
                                  pr=None):
    """
    Assigns keywords (KWs) to Tweets, using the database.
    :param clusters: ClusterResult objects representing the topic clusters.
    :param name: The name of the output.
    :param lang: The language of the texts.
    :param pr: Can be used to record progress for long running tasks.
    :return: Nothing; the results are stored in the database as a KlangInput instance.
    """
    nlp = get_spacy_model(lang=lang)
    relations = KlangRelationsFix()
    for i, cluster_obj in enumerate(clusters):
        if pr is not None:
            pr.set_progress(i + 1,
                            len(clusters),
                            description="Assigning keywords to Tweets.")
        texts = cluster_obj.texts
        kws = cluster_obj.keywords
        tweets = texts.get_tweets()
        tfidf = cluster_obj.vectorizer
        m = cluster_obj.clusterer
        for tweet in tweets:
            text = preprocess([tweet.text], nlp=nlp)
            text = tfidf.transform(text)
            cluster_id = m.predict(text)[0]
            try:
                cluster_elems = kws[cluster_id].split(",")
            except IndexError:
                continue
            for elem in cluster_elems:
                # Encode the timestamp as a fractional year:
                # January -> year + 0/12, December -> year + 11/12.
                d = tweet.created_at.year + (tweet.created_at.month - 1) / 12
                relations.add_unquantified("hasKeyword", tweet.pk, elem, d)
    relations.relations = default_to_regular(relations.relations)
    # TODO: clear out old clusters m2m?
    try:
        m = KlangInput.objects.get(name=name)
    except ObjectDoesNotExist:
        m = KlangInput(name=name)
    m.model = relations
    m.save()
    m.clusters.clear()
    m.clusters.add(*clusters)
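A usage sketch, assuming ClusterResult rows already exist; the filter and the name "thesis_run" are illustrative, not taken from the code above.

clusters = list(ClusterResult.objects.filter(nouns_only=True))  # illustrative filter
assign_kws_to_tweets_external(clusters, name="thesis_run")      # made-up name
model = KlangInput.objects.get(name="thesis_run").model         # the stored relations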
Example #3
def preprocess(texts, nlp=None):
    """
    Nouns-only variant of the tf-idf preprocessing (cf. Example #5).
    :param texts: A list of texts.
    :param nlp: Optional. Default: None. The spaCy model used to tokenize the texts.
    :return: The preprocessed texts.
    """
    texts = [x for x in texts if x]
    if nlp is None:
        nlp = get_spacy_model(lang="en")
    t = []
    for i, doc in enumerate(nlp.pipe(texts, batch_size=50)):
        if i > 0 and i % 100 == 0:
            print(f"--> Preprocessing text {i + 1} out of {len(texts)} <--")
        d = []
        for sentence in doc.sents:
            for token in sentence:
                term = process_term(token, nouns_only=True)
                # Skip empty results and placeholder terms of the form "#...#";
                # indexing (term[0]) would raise IndexError on an empty string.
                if not term or (term.startswith("#") and term.endswith("#")):
                    continue
                d.append(term)
        t.append(" ".join(d))
    return t
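A minimal call sketch; the input strings are invented, and which tokens are kept depends on process_term.

nlp = get_spacy_model(lang="en")
docs = preprocess(["Berlin startups raised funding today",
                   "see https://example.com #news"], nlp=nlp)
# Each element of docs is one space-joined string of the kept tokens.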
Example #4
def preprocess_tweets_ex(user, lang="en", nlp=None):
    """
    Preprocess external Tweets.
    :param user: The screen name of the user.
    :param lang: Optional. Default: "en". The language of the Tweets.
    :param nlp: Optional. Default: None. The spaCy model to use.
    :return: A PreprocessedText representing the texts of the user.
    """
    # Materialize the queryset once; each generator expression below would
    # otherwise trigger a separate database query.
    tweets = list(TweetEx.objects.filter(user__screen_name=user))
    start = min(x.created_at for x in tweets)
    end = max(x.created_at for x in tweets)
    if nlp is None:
        nlp = get_spacy_model(lang=lang)
    texts = [x.text for x in tweets if x.text]
    texts = preprocess(texts, nlp=nlp)
    m, _created = PreprocessedText.objects.get_or_create(name=user,
                                                         nouns_only=True,
                                                         external=True,
                                                         start=start,
                                                         end=end)
    m.texts = texts
    m.save()
    return m
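A usage sketch that reuses one spaCy model across several external accounts instead of reloading it per call; the screen names are hypothetical.

nlp = get_spacy_model(lang="en")
for screen_name in ["nasa", "bbcworld"]:  # hypothetical screen names
    preprocess_tweets_ex(screen_name, lang="en", nlp=nlp)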
Example #5
def preprocess(texts, nouns_only=False, nlp=None):
    """
    Preprocesses texts used in the tf-idf approach.
    Removes hashtags, URLs, numbers, ...
    :param texts: A list of texts.
    :param nouns_only: Optional. Default: False. Whether only nouns should be kept.
    :param nlp: The spaCy object used to split the texts into tokens.
    :return: The texts, but preprocessed.
    """
    texts = [x for x in texts if x]
    if nlp is None:
        nlp = get_spacy_model(lang="en")
    t = []
    for i, doc in enumerate(nlp.pipe(texts, batch_size=50)):
        if i % 100 == 0:
            print(f"--> Preprocessing text {i + 1} out of {len(texts)} <--")
        d = []
        for sentence in doc.sents:
            for token in sentence:
                term = process_term(token, nouns_only=nouns_only)
                # Skip empty results and placeholder terms of the form "#...#";
                # indexing (term[0]) would raise IndexError on an empty string.
                if not term or (term.startswith("#") and term.endswith("#")):
                    continue
                d.append(term)
        t.append(" ".join(d))
    return t
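A sketch of how the preprocessed texts feed the clustering step. The exact configuration behind ClusterResult.vectorizer and ClusterResult.clusterer is not shown in these examples; TfidfVectorizer and MiniBatchKMeans from scikit-learn are assumptions (the mbatch_* filenames in Example #6 hint at the latter).

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

raw_texts = ["Tweets about machine learning models",
             "more Tweets about data and models"]   # invented input
texts = preprocess(raw_texts, nouns_only=True)
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(texts)
clusterer = MiniBatchKMeans(n_clusters=2)           # cluster count is a guess
labels = clusterer.fit_predict(matrix)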
Example #6
def assign_kws_to_tweets(nouns_only):
    """
    Runs the tf-idf approach for all Twitter users.
    Creates relations that can be used as input for the Klang algorithm.
    :param nouns_only: Whether only nouns should be used.
    :return: Nothing. Writes a .pickle file to disk containing the relations
        that serve as input for the Klang algorithm.
    """
    # values_list() performs the join itself, so no select_related() is needed.
    users = sorted(
        set(
            Tweet.objects.filter(text__isnull=False)
            .values_list("user__name", flat=True)))
    nlp = get_spacy_model(lang="en")
    relations = KlangRelationsFix()
    for i, user in enumerate(users):
        print(f"{i+1} out of {len(users)}")
        tweets = Tweet.objects.filter(text__isnull=False, user__name=user)
        try:
            r = ClusterResult.objects.get(name=user, nouns_only=nouns_only)
        except ObjectDoesNotExist:
            continue
        clusters = r.keywords
        tfidf = r.vectorizer
        tfidf.tokenizer = tokenize_only
        m = r.clusterer
        for tweet in tweets:
            text = preprocess([tweet.text], nlp=nlp)
            text = tfidf.transform(text)
            cluster_id = m.predict(text)[0]
            try:
                cluster_elems = clusters[cluster_id].split(",")
            except IndexError:
                continue
            for elem in cluster_elems:
                # Encode the timestamp as a fractional year:
                # January -> year + 0/12, December -> year + 11/12.
                d = tweet.created_at.year + (tweet.created_at.month - 1) / 12
                relations.add_unquantified("hasKeyword", tweet.pk, elem, d)
    relations.relations = default_to_regular(relations.relations)
    data_dir = os.path.join(DATA_DIR, "KLANG")
    out_path = os.path.join(
        data_dir, f"titaness_cluster_all_{nouns_only}__THESIS.p")
    with open(out_path, "wb") as f:
        dill.dump(relations.relations, f)
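A sketch of reading the dumped relations back for the Klang algorithm; dill is used symmetrically to the dump above, and the structure is the plain nested dict produced by default_to_regular.

nouns_only = True  # must match the value the relations were written with
path = os.path.join(DATA_DIR, "KLANG",
                    f"titaness_cluster_all_{nouns_only}__THESIS.p")
with open(path, "rb") as f:
    relations = dill.load(f)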