예제 #1
0
 def test_simple(self):
     """Refs that are all within the distance threshold form a single cluster."""
     refs = [Ref(t) for t in ('Genesis 1:1', 'Genesis 1:2', 'Genesis 1:4')]
     placeholder_data = [None for _ in refs]
     clusters = RecommendationEngine.cluster_close_refs(
         refs, placeholder_data, dist_threshold=2)
     assert len(clusters) == 1
예제 #2
0
 def test_two_clusters(self):
     """A gap wider than the distance threshold splits the refs into two clusters."""
     tref_list = ('Genesis 1:1', 'Genesis 1:2', 'Genesis 1:5', 'Genesis 1:7')
     refs = list(map(Ref, tref_list))
     clusters = RecommendationEngine.cluster_close_refs(
         refs, [None] * len(refs), dist_threshold=2)
     assert len(clusters) == 2
     assert clusters[0][0]['ref'].normal() == 'Genesis 1:1'
     assert clusters[1][0]['ref'].normal() == 'Genesis 1:5'
예제 #3
0
def generate_all_topic_links_from_sheets(topic=None):
    """
    Processes all public source sheets to create topic links.

    Aggregates, per topic slug, the refs users included and the topics that
    co-occur on the same sheets, then builds "related topic" links (kept when
    >= 2 distinct users tagged the pair) and ref->topic "about" links (kept
    when >= 3 distinct users added the ref, clustered by ref proximity).
    When ``topic`` is falsy, previously generated links are deleted and the
    new ones are written to ``db.topic_links``.

    :param topic: optional topic slug; when given, aggregation is restricted
        to that topic and nothing is written to the database.
    """
    from sefaria.system.database import db
    from sefaria.recommendation_engine import RecommendationEngine
    from tqdm import tqdm
    from statistics import mean, stdev

    all_topics = {}
    # ignore sheets that are copies or were assignments
    query = {
        "status": "public",
        "viaOwner": {
            "$exists": 0
        },
        "assignment_id": {
            "$exists": 0
        }
    }
    if topic:
        query['topics.slug'] = topic
    projection = {"topics": 1, "includedRefs": 1, "owner": 1}
    sheet_list = db.sheets.find(query, projection)
    for sheet in tqdm(sheet_list, desc="aggregating sheet topics"):
        sheet_topics = sheet.get("topics", [])
        for topic_dict in sheet_topics:
            slug = topic_dict['slug']
            if slug not in all_topics:
                all_topics[slug] = {
                    "topic": slug,
                    # normalized ref -> set of owners who included it
                    "sources_dict": defaultdict(set),
                    # related slug -> set of owners who tagged both topics
                    "related_topics_dict": defaultdict(set)
                }
            for tref in sheet.get("includedRefs", []):
                try:
                    oref = Ref(tref)
                    # expand ranged refs into their individual segment refs
                    for sub_oref in oref.range_list():
                        all_topics[slug]["sources_dict"][
                            sub_oref.normal()].add(sheet['owner'])
                except Exception:
                    # best-effort: skip refs that fail to parse. Narrowed from
                    # a bare `except:` so SystemExit / KeyboardInterrupt still
                    # propagate.
                    continue
            for related_topic_dict in sheet_topics:
                if slug != related_topic_dict['slug']:
                    all_topics[slug]["related_topics_dict"][
                        related_topic_dict['slug']].add(sheet['owner'])

    already_created_related_links = {}
    related_links = []
    source_links = []
    for slug, blob in tqdm(all_topics.items(),
                           desc="creating sheet topic links"):
        if topic is not None and slug != topic:
            continue

        # filter related topics with less than 2 users who voted for it
        related_topics = [
            related_topic
            for related_topic in blob['related_topics_dict'].items()
            if len(related_topic[1]) >= 2
        ]
        for related_topic, user_votes in related_topics:
            if related_topic == slug:
                continue
            # dedupe symmetric pairs: only one link per unordered (a, b)
            key = (related_topic,
                   slug) if related_topic > slug else (slug, related_topic)
            if already_created_related_links.get(key, False):
                continue
            already_created_related_links[key] = True
            related_links += [{
                'a': related_topic,
                'b': slug,
                'user_votes': len(user_votes)
            }]
        # filter sources with less than 3 users who added it
        sources = [
            source for source in blob['sources_dict'].items()
            if len(source[1]) >= 3
        ]

        # transform data to more convenient format
        temp_sources = []
        for source in sources:
            temp_sources += [(Ref(source[0]), len(source[1]))]
        sources = temp_sources

        # cluster refs that are close to each other and break up clusters where counts differ by more than 2 standard deviations
        STD_DEV_CUTOFF = 2
        temp_sources = []
        if len(sources) == 0:
            continue
        refs, counts = zip(*sources)
        clustered = RecommendationEngine.cluster_close_refs(refs, counts, 2)
        for cluster in clustered:
            counts = [(x['ref'], x['data']) for x in cluster]
            curr_range_start = 0
            for icount, (_, temp_count) in enumerate(counts):
                temp_counts = [x[1] for x in counts[curr_range_start:icount]]
                if len(temp_counts) < 2:
                    # variance requires two data points
                    continue
                count_xbar = mean(temp_counts)
                # floor the std dev so a run of identical counts can still be split
                count_std = max(1 / STD_DEV_CUTOFF,
                                stdev(temp_counts, count_xbar))
                if temp_count > (STD_DEV_CUTOFF * count_std +
                                 count_xbar) or temp_count < (
                                     count_xbar - STD_DEV_CUTOFF * count_std):
                    # outlier count: close off the current range before this ref
                    temp_range = counts[curr_range_start][0].to(counts[icount -
                                                                       1][0])
                    temp_sources += [
                        (temp_range.normal(),
                         [r.normal()
                          for r in temp_range.range_list()], count_xbar)
                    ]
                    curr_range_start = icount
            # flush the final (or only) range of the cluster
            temp_counts = [x[1] for x in counts[curr_range_start:]]
            count_xbar = mean(temp_counts)
            temp_range = counts[curr_range_start][0].to(counts[-1][0])
            temp_sources += [(temp_range.normal(),
                              [r.normal()
                               for r in temp_range.range_list()], count_xbar)]
        sources = temp_sources

        # create links
        if not topic:
            for source in sources:
                source_links += [{
                    "class": "refTopic",
                    "toTopic": slug,
                    "ref": source[0],
                    "expandedRefs": source[1],
                    "linkType": "about",
                    "is_sheet": False,
                    "dataSource": "sefaria-users",
                    "generatedBy": "sheet-topic-aggregator",
                    "order": {
                        "user_votes": source[2]
                    }
                }]

    if not topic:
        final_related_links = calculate_tfidf_related_sheet_links(
            related_links)
        sheet_links = generate_sheet_topic_links()
        # now that we've gathered all the new links, delete old ones and insert new ones
        RefTopicLinkSet({"generatedBy": "sheet-topic-aggregator"}).delete()
        IntraTopicLinkSet({"generatedBy": "sheet-topic-aggregator"}).delete()
        db.topic_links.insert_many(sheet_links + source_links +
                                   final_related_links,
                                   ordered=False)
예제 #4
0
def generate_all_topic_links_from_sheets(topic=None):
    """
    Processes all public source sheets to create topic links.

    Aggregates per-ref topic tagging across public sheets, scores each
    (ref, topic) pair with a tf-idf style weight, clusters nearby refs, and
    builds link dicts. When ``topic`` is falsy the dicts are converted to
    RefTopicLink / IntraTopicLink objects and returned as
    ``(source_links, related_links, sheet_links)``; otherwise the final
    branch is skipped and the function implicitly returns None.
    """
    from sefaria.recommendation_engine import RecommendationEngine
    from statistics import mean, stdev
    import math

    OWNER_THRESH = 3  # min distinct users per (ref, topic) pair
    TFIDF_CUTOFF = 0.15  # min tf-idf score per (ref, topic) pair
    STD_DEV_CUTOFF = 2  # split clusters at this many standard deviations

    # slug -> related slug -> set of owners who tagged both on one sheet
    all_related_topics = defaultdict(lambda: defaultdict(set))
    # tref -> slug -> owner -> weight (max of 1/#topics across that owner's sheets)
    all_related_refs = defaultdict(
        lambda: defaultdict(lambda: defaultdict(float)))
    # slug -> tref -> number of sheet taggings (drives the idf term below)
    topic_ref_counts = defaultdict(lambda: defaultdict(int))
    # ignore sheets that are copies or were assignments
    query = {
        "status": "public",
        "viaOwner": {
            "$exists": 0
        },
        "assignment_id": {
            "$exists": 0
        }
    }
    if topic:
        query['topics.slug'] = topic
    projection = {"topics": 1, "expandedRefs": 1, "owner": 1}
    sheet_list = db.sheets.find(query, projection)
    for sheet in tqdm(sheet_list, desc="aggregating sheet topics"):
        sheet_topics = sheet.get("topics", [])
        for topic_dict in sheet_topics:
            slug = topic_dict['slug']
            for tref in sheet.get("expandedRefs", []):
                # each tag is weighted 1/#topics on the sheet; keep the
                # highest weight seen so far for this owner
                value = all_related_refs[tref][slug].get(sheet['owner'], 0)
                all_related_refs[tref][slug][sheet['owner']] = max(
                    1 / len(sheet_topics), value)
                topic_ref_counts[slug][tref] += 1
            for related_topic_dict in sheet_topics:
                if slug != related_topic_dict['slug']:
                    all_related_topics[slug][related_topic_dict['slug']].add(
                        sheet['owner'])

    already_created_related_links = {}
    related_links = []
    source_links = []
    for slug, related_topics_to_slug in tqdm(
            all_related_topics.items(),
            desc="creating sheet related topic links"):
        if topic is not None and slug != topic:
            continue

        # filter related topics with less than 2 users who voted for it
        related_topics = [
            related_topic for related_topic in related_topics_to_slug.items()
            if len(related_topic[1]) >= 2
        ]
        for related_topic, user_votes in related_topics:
            if related_topic == slug:
                continue
            # dedupe symmetric pairs: only one link per unordered (a, b)
            key = (related_topic,
                   slug) if related_topic > slug else (slug, related_topic)
            if already_created_related_links.get(key, False):
                continue
            already_created_related_links[key] = True
            related_links += [{
                'a': related_topic,
                'b': slug,
                'user_votes': len(user_votes)
            }]
    # inverse document frequency: topics tagged on fewer refs score higher
    topic_idf_dict = {
        slug: math.log2(len(all_related_refs) / len(ref_dict))
        for slug, ref_dict in topic_ref_counts.items()
    }
    raw_topic_ref_links = defaultdict(list)
    for tref, related_topics_to_tref in tqdm(
            all_related_refs.items(), desc="creating sheet related ref links"):
        # filter sources with less than 3 users who added it and tfidf of at least 0.15
        numerator_list = []
        owner_counts = []
        for slug, owner_map in related_topics_to_tref.items():
            numerator = sum(owner_map.values())
            owner_counts += [len(owner_map)]
            numerator_list += [numerator]
        denominator = sum(numerator_list)
        # tf-idf style score: this topic's share of the ref's total tag
        # weight, scaled by the topic's idf
        topic_scores = [
            (slug, (numerator / denominator) * topic_idf_dict[slug], owners)
            for slug, numerator, owners in zip(related_topics_to_tref.keys(),
                                               numerator_list, owner_counts)
        ]
        # transform data to more convenient format
        oref = Ref(tref)
        for slug, _, owners in filter(
                lambda x: x[1] >= TFIDF_CUTOFF and x[2] >= OWNER_THRESH,
                topic_scores):
            raw_topic_ref_links[slug] += [(oref, owners)]

    for slug, sources in tqdm(raw_topic_ref_links.items()):
        # cluster refs that are close to each other and break up clusters where counts differ by more than 2 standard deviations
        temp_sources = []
        if len(sources) == 0:
            continue
        refs, counts = zip(*sources)
        clustered = RecommendationEngine.cluster_close_refs(refs, counts, 2)
        for cluster in clustered:
            counts = [(x['ref'], x['data']) for x in cluster]
            curr_range_start = 0
            for icount, (_, temp_count) in enumerate(counts):
                temp_counts = [x[1] for x in counts[curr_range_start:icount]]
                if len(temp_counts) < 2:
                    # variance requires two data points
                    continue
                count_xbar = mean(temp_counts)
                # floor the std dev so runs of identical counts can still split
                count_std = max(1 / STD_DEV_CUTOFF,
                                stdev(temp_counts, count_xbar))
                if temp_count > (STD_DEV_CUTOFF * count_std +
                                 count_xbar) or temp_count < (
                                     count_xbar - STD_DEV_CUTOFF * count_std):
                    # outlier count: close off the current range before this ref
                    temp_range = counts[curr_range_start][0].to(counts[icount -
                                                                       1][0])
                    temp_sources += [
                        (temp_range.normal(),
                         [r.normal()
                          for r in temp_range.range_list()], count_xbar)
                    ]
                    curr_range_start = icount
            # flush the final (or only) range of the cluster
            temp_counts = [x[1] for x in counts[curr_range_start:]]
            count_xbar = mean(temp_counts)
            temp_range = counts[curr_range_start][0].to(counts[-1][0])
            temp_sources += [(temp_range.normal(),
                              [r.normal()
                               for r in temp_range.range_list()], count_xbar)]
        sources = temp_sources

        # create links
        if not topic:
            for source in sources:
                source_links += [{
                    "class": "refTopic",
                    "toTopic": slug,
                    "ref": source[0],
                    "expandedRefs": source[1],
                    "linkType": "about",
                    "is_sheet": False,
                    "dataSource": "sefaria-users",
                    "generatedBy": "sheet-topic-aggregator",
                    "order": {
                        "user_votes": source[2]
                    }
                }]

    if not topic:
        related_links = calculate_tfidf_related_sheet_links(related_links)
        sheet_links = generate_sheet_topic_links()

        # convert to objects
        source_links = [RefTopicLink(l) for l in source_links]
        related_links = [IntraTopicLink(l) for l in related_links]
        sheet_links = [RefTopicLink(l) for l in sheet_links]
        return source_links, related_links, sheet_links