# Example 1 (score: 0)
    def name_geo_similarity(cls,
                            tfidf1,
                            tfidf2,
                            geo_tfidf1,
                            geo_tfidf2,
                            geo_model_proportion=DEFAULT_GEO_MODEL_PROPORTION):
        """Blend global and geo-specific soft-TFIDF name similarities.

        Both vector pairs are normalized before comparison. The result is a
        convex combination: geo_model_proportion weights the geo model's
        similarity, and the remainder weights the global model's.
        """
        normalize = TFIDF.normalized_tfidf_vector

        global_sim = soft_tfidf_similarity(normalize(tfidf1),
                                           normalize(tfidf2))
        geo_sim = soft_tfidf_similarity(normalize(geo_tfidf1),
                                        normalize(geo_tfidf2))

        geo_weight = geo_model_proportion
        return geo_weight * geo_sim + (1.0 - geo_weight) * global_sim
# Example 2 (score: 0)
    def doc_scores(cls, doc_words, doc_frequency, total_docs, min_count=1):
        """Compute per-document TF-IDF scores for position-tagged words.

        Args:
            doc_words: RDD of (word, (doc_id, pos)) pairs.
            doc_frequency: RDD of (word, doc_frequency) pairs.
            total_docs: total number of documents in the corpus.
            min_count: words appearing in fewer than min_count documents
                are dropped before scoring.

        Returns:
            RDD of (doc_id, [(word, tfidf_score), ...]) with each document's
            words ordered by their original position. Term frequency is fixed
            at 1.0, so scores reflect document frequency / IDF only.
        """
        if min_count > 1:
            doc_frequency = cls.filter_min_doc_frequency(doc_frequency,
                                                         min_count=min_count)

        # Preserve the input's partitioning on the output RDD.
        num_partitions = doc_words.getNumPartitions()

        # join yields (word, ((doc_id, pos), doc_frequency)); re-key by
        # doc_id. Indexed access replaces tuple-parameter unpacking, which
        # was removed from lambdas in Python 3 (PEP 3113).
        doc_ids_word_stats = doc_words.join(doc_frequency).map(
            lambda kv: (kv[1][0][0], (kv[0], kv[1][0][1], kv[1][1])))

        # Sort each document's words by position (tuple element 1) so the
        # output list follows document order.
        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, df, total_docs))
                                                                for word, pos, df in sorted(vals, key=operator.itemgetter(1))])

        return docs_tfidf.coalesce(num_partitions)
# Example 3 (score: 0)
    def docs_tfidf(cls,
                   doc_word_counts,
                   doc_frequency,
                   total_docs,
                   min_count=1):
        """Build a TF-IDF vector (word -> score dict) for each document.

        Args:
            doc_word_counts: RDD of (word, (doc_id, term_frequency)) pairs.
            doc_frequency: RDD of (word, doc_frequency) pairs.
            total_docs: total number of documents in the corpus.
            min_count: words appearing in fewer than min_count documents
                are dropped before scoring.

        Returns:
            RDD of (doc_id, {word: tfidf_score}).
        """
        if min_count > 1:
            doc_frequency = cls.filter_min_doc_frequency(doc_frequency,
                                                         min_count=min_count)

        # Preserve the input's partitioning on the output RDD.
        num_partitions = doc_word_counts.getNumPartitions()

        # join yields (word, ((doc_id, term_frequency), doc_frequency));
        # re-key by doc_id. Indexed access replaces tuple-parameter
        # unpacking, which was removed from lambdas in Python 3 (PEP 3113).
        doc_ids_word_stats = doc_word_counts.join(doc_frequency).map(
            lambda kv: (kv[1][0][0], (kv[0], kv[1][0][1], kv[1][1])))

        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: {word: TFIDF.tfidf_score(tf, df, total_docs)
                                                                for word, tf, df in vals})
        return docs_tfidf.coalesce(num_partitions)
# Example 4 (score: 0)
    def doc_scores(cls,
                   doc_words,
                   geo_doc_frequency,
                   total_docs_by_geo,
                   min_count=1):
        """Compute per-document TF-IDF scores using geo-scoped frequencies.

        Args:
            doc_words: RDD of ((geo, word), (doc_id, pos)) pairs.
            geo_doc_frequency: RDD of ((geo, word), count) pairs.
            total_docs_by_geo: RDD of (geo, num_docs) pairs.
            min_count: (geo, word) keys seen in fewer than min_count
                documents are dropped before scoring.

        Returns:
            RDD of (doc_id, [(word, tfidf_score), ...]) with each document's
            words ordered by their original position. Term frequency is fixed
            at 1.0; the IDF denominator is the document total for the word's
            geo rather than a global corpus total.

        NOTE: the original lambdas used tuple-parameter unpacking, which was
        removed from Python 3 (PEP 3113); indexed access below is behavior-
        identical on both Python 2 and 3.
        """
        if min_count > 1:
            geo_doc_frequency = cls.filter_min_doc_frequency(
                geo_doc_frequency, min_count=min_count)

        # Preserve the input's partitioning on the output RDD.
        num_partitions = doc_words.getNumPartitions()

        # Attach each geo's document total to its (geo, word) frequency:
        # ((geo, word), count) -> (geo, (word, count)) -> join with
        # (geo, num_docs) -> ((geo, word), (count, num_docs)).
        geo_doc_frequency_totals = geo_doc_frequency.map(lambda kv: (kv[0][0], (kv[0][1], kv[1]))) \
                                                    .join(total_docs_by_geo) \
                                                    .map(lambda kv: ((kv[0], kv[1][0][0]), (kv[1][0][1], kv[1][1])))

        # join yields ((geo, word), ((doc_id, pos), (doc_frequency, num_docs)));
        # re-key by doc_id, dropping the geo from the value.
        doc_ids_word_stats = doc_words.join(geo_doc_frequency_totals) \
                                      .map(lambda kv: (kv[1][0][0], (kv[0][1], kv[1][0][1], kv[1][1][0], kv[1][1][1])))

        # Sort each document's words by position (tuple element 1) so the
        # output list follows document order.
        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, df, num_docs))
                                                                for word, pos, df, num_docs in sorted(vals, key=operator.itemgetter(1))])

        return docs_tfidf.coalesce(num_partitions)
# Example 5 (score: 0)
    def docs_tfidf(cls,
                   doc_word_counts,
                   geo_doc_frequency,
                   total_docs_by_geo,
                   min_count=1):
        """Build per-document TF-IDF vectors using geo-scoped frequencies.

        Args:
            doc_word_counts: RDD of ((geo, word), (doc_id, term_frequency)).
            geo_doc_frequency: RDD of ((geo, word), count) pairs.
            total_docs_by_geo: RDD of (geo, num_docs) pairs.
            min_count: (geo, word) keys seen in fewer than min_count
                documents are dropped before scoring.

        Returns:
            RDD of (doc_id, {word: tfidf_score}), where the IDF denominator
            is the document total for the word's geo rather than a global
            corpus total.

        NOTE: the original lambdas used tuple-parameter unpacking, which was
        removed from Python 3 (PEP 3113); indexed access below is behavior-
        identical on both Python 2 and 3.
        """
        if min_count > 1:
            geo_doc_frequency = cls.filter_min_doc_frequency(
                geo_doc_frequency, min_count=min_count)

        # Preserve the input's partitioning on the output RDD.
        num_partitions = doc_word_counts.getNumPartitions()

        # Attach each geo's document total to its (geo, word) frequency:
        # ((geo, word), count) -> (geo, (word, count)) -> join with
        # (geo, num_docs) -> ((geo, word), (count, num_docs)).
        geo_doc_frequency_totals = geo_doc_frequency.map(lambda kv: (kv[0][0], (kv[0][1], kv[1]))) \
                                                    .join(total_docs_by_geo) \
                                                    .map(lambda kv: ((kv[0], kv[1][0][0]), (kv[1][0][1], kv[1][1])))

        # join yields ((geo, word), ((doc_id, term_frequency),
        # (doc_frequency, num_docs))); re-key by doc_id, keeping the geo in
        # the value tuple as the original did.
        doc_ids_word_stats = doc_word_counts.join(geo_doc_frequency_totals) \
                                            .map(lambda kv: (kv[1][0][0], (kv[0][0], kv[0][1], kv[1][0][1], kv[1][1][0], kv[1][1][1])))

        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: {word: TFIDF.tfidf_score(tf, df, num_docs)
                                                                for geo, word, tf, df, num_docs in vals})
        return docs_tfidf.coalesce(num_partitions)
# Example 6 (score: 0)
 def name_similarity(cls, tfidf1, tfidf2):
     """Soft-TFIDF similarity between two names' normalized TFIDF vectors."""
     left = TFIDF.normalized_tfidf_vector(tfidf1)
     right = TFIDF.normalized_tfidf_vector(tfidf2)
     return soft_tfidf_similarity(left, right)