Пример #1
0
def generate_long_term_model(user_id):
    """Rebuild a user's long-term interests from unprocessed short-term ones.

    Short-term interests of the user that are not yet flagged
    ``used_in_calc`` are merged with the current long-term interests via
    ``update_interest_models``. The user's existing ``LongTermInterest`` rows
    are then deleted and recreated from the merged result, and the short-term
    rows are flagged as used. Every recreated row is linked to the user's
    tweets and papers whose text contains the keyword (case-insensitive).

    Args:
        user_id: Primary key of the user whose model is rebuilt.

    Returns:
        None. Nothing is modified when there are no unprocessed short-term
        interests.
    """
    print("updating long term model for {}".format(user_id))
    short_term_model = ShortTermInterest.objects.filter(user_id=user_id,
                                                        used_in_calc=False)
    # select_related avoids one extra query per row for ``item.keyword``.
    # It is applied to a copy so ``short_term_model`` stays a plain queryset
    # for the ``update()`` call below.
    short_term_data = {
        item.keyword.name: item.weight
        for item in short_term_model.select_related("keyword")
    }
    if not short_term_data:
        # Nothing new to merge: return before querying the long-term table.
        return

    long_term_data = {
        item.keyword.name: item.weight
        for item in LongTermInterest.objects.filter(
            user_id=user_id).select_related("keyword")
    }
    new_data = update_interest_models(short_term_data, long_term_data)

    # NOTE(review): the delete/update/create sequence below is not wrapped in
    # a transaction inside this function; if a later step raises, the old
    # long-term rows are already gone. Confirm whether the caller provides
    # atomicity.
    LongTermInterest.objects.filter(user_id=user_id).delete()
    short_term_model.update(used_in_calc=True)

    for keyword, weight in new_data.items():
        print(keyword, weight)
        lowered_keyword = keyword.lower()
        keyword_instance, created = Keyword.objects.get_or_create(
            name=lowered_keyword)
        if created:
            print("getting wiki categories")
            # The category lookup receives the keyword as produced by the
            # merge step (not lower-cased), exactly as before.
            for category in wikicategory(keyword):
                category_instance, _ = Category.objects.get_or_create(
                    name=category)
                keyword_instance.categories.add(category_instance)
            keyword_instance.save()
        else:
            print("Keyword found in db")
        print("keyword obtained")

        long_term_model = LongTermInterest.objects.create(
            user_id=user_id,
            keyword=keyword_instance,
            weight=weight,
        )
        tweet_list = list(
            Tweet.objects.filter(user_id=user_id,
                                 full_text__icontains=lowered_keyword))
        paper_list = list(
            Paper.objects.filter(
                Q(user_id=user_id)
                & (Q(abstract__icontains=lowered_keyword)
                   | Q(title__icontains=lowered_keyword))))

        if tweet_list:
            long_term_model.tweets.add(*tweet_list)
        if paper_list:
            long_term_model.papers.add(*paper_list)

        # The source reflects which kinds of documents mention the keyword;
        # it is left untouched when neither kind does.
        if tweet_list and paper_list:
            long_term_model.source = (
                f"{ShortTermInterest.SCHOLAR} & {ShortTermInterest.TWITTER}")
        elif paper_list:
            long_term_model.source = ShortTermInterest.SCHOLAR
        elif tweet_list:
            long_term_model.source = ShortTermInterest.TWITTER
        long_term_model.save()
Пример #2
0
 def post(self, request, *args, **kwargs):
     """Return the ``wikicategory`` result for every submitted interest.

     The request body is validated with ``self.serializer_class``; invalid
     input raises through ``is_valid(raise_exception=True)``. The response
     maps each entry of the validated ``"interests"`` value to whatever
     ``wikicategory`` returns for it.
     """
     serializer = self.serializer_class(data=request.data)
     serializer.is_valid(raise_exception=True)
     interests = serializer.validated_data["interests"]
     return Response(
         {interest: wikicategory(interest) for interest in interests})
Пример #3
0
def generate_short_term_model(user_id, source):
    """Extract short-term interests for a user from unprocessed documents.

    For ``ShortTermInterest.TWITTER`` the user's tweets not yet flagged
    ``used_in_calc`` are grouped by the month and year of ``created_at``; for
    ``ShortTermInterest.SCHOLAR`` the user's unprocessed papers are grouped by
    ``paper.year`` (stored with ``model_month=1``). Keywords are extracted
    from the concatenated text of each group and saved as
    ``ShortTermInterest`` rows linked to the documents that contain them.
    All candidate documents are flagged as used afterwards, including those
    of groups for which no keywords could be extracted.

    Args:
        user_id: Primary key of the user being processed.
        source: ``ShortTermInterest.TWITTER`` or ``ShortTermInterest.SCHOLAR``;
            any other value results in no work being done.

    Returns:
        None.
    """
    # A set makes the per-keyword blacklist check O(1).
    blacklisted_keywords = set(
        BlacklistedKeyword.objects.filter(user_id=user_id).values_list(
            "keyword__name", flat=True))

    if source == ShortTermInterest.TWITTER:
        tweet_candidates = Tweet.objects.filter(user_id=user_id,
                                                used_in_calc=False)

        def attach_tweets(s_interest, keyword):
            # Link every candidate tweet mentioning the keyword in one call.
            s_interest.tweets.add(
                *tweet_candidates.filter(full_text__icontains=keyword))

        # Text pieces are collected per (month, year) and joined once, which
        # yields the same text as repeated concatenation without the
        # quadratic copying.
        month_wise_parts = {}
        for tweet in tweet_candidates:
            period = (tweet.created_at.month, tweet.created_at.year)
            month_wise_parts.setdefault(period,
                                        []).append(f" {tweet.full_text}")

        for (month, year), parts in month_wise_parts.items():
            _store_short_term_interests(user_id, source, "".join(parts),
                                        "Yake", month, year,
                                        blacklisted_keywords, attach_tweets)
        tweet_candidates.update(used_in_calc=True)

    if source == ShortTermInterest.SCHOLAR:
        paper_candidates = Paper.objects.filter(user_id=user_id,
                                                used_in_calc=False)

        def attach_papers(s_interest, keyword):
            # Link every candidate paper mentioning the keyword in its title
            # or abstract in one call.
            s_interest.papers.add(*paper_candidates.filter(
                Q(title__icontains=keyword)
                | Q(abstract__icontains=keyword)))

        year_wise_parts = {}
        for paper in paper_candidates:
            year_wise_parts.setdefault(
                paper.year, []).append(f" {paper.title} {paper.abstract}")

        for year, parts in year_wise_parts.items():
            # Papers only carry a year, so the month is fixed to 1.
            _store_short_term_interests(user_id, source, "".join(parts),
                                        "SingleRank", 1, year,
                                        blacklisted_keywords, attach_papers)
        paper_candidates.update(used_in_calc=True)


def _extract_keyword_weights(text, model):
    """Return ``(redirect_mapping, normalized_weights)`` or None if empty."""
    try:
        keywords = getKeyword(text, model=model, num=20)
    except Exception as error:
        # Extraction is best-effort: a failing period is skipped (the
        # extractor is known to fail on degenerate input such as
        # "Mean of empty slice"). Catching Exception instead of using a bare
        # except lets KeyboardInterrupt/SystemExit propagate.
        print(f"keyword extraction failed: {error}")
        return None
    print(f"got keywords {keywords}")
    if not keywords:
        print("No keywords found")
        return None
    wiki_keyword_redirect_mapping, keyword_weight_mapping = wikifilter(
        keywords)
    print(keyword_weight_mapping)
    if not keyword_weight_mapping:
        print("No keywords found in weight mapping")
        return None
    return wiki_keyword_redirect_mapping, normalize(keyword_weight_mapping)


def _get_keyword_instance(keyword, original_keyword_name):
    """Fetch or create the Keyword row and record the original spelling."""
    keyword_instance, created = Keyword.objects.get_or_create(name=keyword)
    if created:
        print("getting wiki categories")
        for category in wikicategory(keyword):
            category_instance, _ = Category.objects.get_or_create(
                name=category)
            keyword_instance.categories.add(category_instance)
        keyword_instance.save()

    # ``original_keywords`` is stored as a JSON-encoded list; an unset or
    # malformed value is treated as an empty list.
    try:
        original_keywords = json.loads(keyword_instance.original_keywords)
    except (TypeError, ValueError):
        original_keywords = []
    if not isinstance(original_keywords, list):
        original_keywords = []
    original_keywords.append(original_keyword_name.lower())
    keyword_instance.original_keywords = json.dumps(
        list(set(original_keywords)))
    keyword_instance.save()
    return keyword_instance


def _store_short_term_interests(user_id, source, text, model, month, year,
                                blacklisted_keywords, attach_documents):
    """Save the interests extracted from one period's text and link documents."""
    extracted = _extract_keyword_weights(text, model)
    if extracted is None:
        return
    wiki_keyword_redirect_mapping, keywords = extracted

    for keyword, weight in keywords.items():
        # The redirect mapping is keyed by the keyword as returned by the
        # filter step, so look it up before lower-casing.
        original_keyword_name = wiki_keyword_redirect_mapping.get(
            keyword, keyword)
        keyword = keyword.lower()
        if keyword in blacklisted_keywords:
            print("Skipping {} as its blacklisted".format(keyword))
            continue

        keyword_instance = _get_keyword_instance(keyword,
                                                 original_keyword_name)
        s_interest, _ = ShortTermInterest.objects.update_or_create(
            user_id=user_id,
            keyword=keyword_instance,
            model_month=month,
            model_year=year,
            defaults={
                "source": source,
                "weight": weight
            },
        )
        attach_documents(s_interest, keyword)