def name_geo_similarity(cls, tfidf1, tfidf2, geo_tfidf1, geo_tfidf2,
                        geo_model_proportion=DEFAULT_GEO_MODEL_PROPORTION):
    """Blend plain-name and geo-specific soft-TF-IDF similarity.

    Both similarity terms are computed by ``soft_tfidf_similarity`` over
    L2-normalized TF-IDF vectors, then combined as a convex mix:
    ``geo_model_proportion`` weights the geo model, the remainder weights
    the plain model.

    :param tfidf1: TF-IDF weights for the first name (passed to
        ``TFIDF.normalized_tfidf_vector`` — presumably a word->score
        mapping; confirm against that helper).
    :param tfidf2: TF-IDF weights for the second name.
    :param geo_tfidf1: geo-conditioned TF-IDF weights for the first name.
    :param geo_tfidf2: geo-conditioned TF-IDF weights for the second name.
    :param geo_model_proportion: mixing weight in [0, 1] given to the
        geo-conditioned similarity.
    :return: weighted similarity score.
    """
    name_sim = soft_tfidf_similarity(
        TFIDF.normalized_tfidf_vector(tfidf1),
        TFIDF.normalized_tfidf_vector(tfidf2))
    geo_sim = soft_tfidf_similarity(
        TFIDF.normalized_tfidf_vector(geo_tfidf1),
        TFIDF.normalized_tfidf_vector(geo_tfidf2))
    # Convex combination of the two models.
    return (geo_model_proportion * geo_sim) + ((1.0 - geo_model_proportion) * name_sim)
def doc_scores(cls, doc_words, doc_frequency, total_docs, min_count=1):
    """Score each document's words by inverse document frequency.

    Joins per-document word occurrences with corpus document frequencies
    and produces, per document, a position-ordered list of
    ``(word, tfidf_score)`` pairs with term frequency fixed at 1.0.

    :param doc_words: RDD of ``(word, (doc_id, pos))``.
    :param doc_frequency: RDD of ``(word, doc_frequency)``.
    :param total_docs: total number of documents in the corpus.
    :param min_count: drop words seen in fewer than this many documents.
    :return: RDD of ``(doc_id, [(word, score), ...])`` sorted by word
        position, coalesced back to the input's partition count.
    """
    if min_count > 1:
        doc_frequency = cls.filter_min_doc_frequency(doc_frequency, min_count=min_count)

    # groupByKey/join can inflate partitions; remember the original count
    # so the result can be coalesced back down.
    num_partitions = doc_words.getNumPartitions()

    def _by_doc_id(kv):
        # (word, ((doc_id, pos), df)) -> (doc_id, (word, pos, df))
        # NOTE: named function instead of a tuple-unpacking lambda, which
        # is Python 2-only syntax (removed by PEP 3113).
        word, ((doc_id, pos), df) = kv
        return doc_id, (word, pos, df)

    doc_ids_word_stats = doc_words.join(doc_frequency).map(_by_doc_id)

    # Sort each doc's words by their original position (index 1),
    # then score with a fixed term frequency of 1.0.
    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, df, total_docs))
                                 for word, pos, df in sorted(vals, key=operator.itemgetter(1))])
    return docs_tfidf.coalesce(num_partitions)
def docs_tfidf(cls, doc_word_counts, doc_frequency, total_docs, min_count=1):
    """Build a per-document TF-IDF dictionary.

    Joins per-document term frequencies with corpus document frequencies
    and computes ``TFIDF.tfidf_score`` for every word in each document.

    :param doc_word_counts: RDD of ``(word, (doc_id, term_frequency))``.
    :param doc_frequency: RDD of ``(word, doc_frequency)``.
    :param total_docs: total number of documents in the corpus.
    :param min_count: drop words seen in fewer than this many documents.
    :return: RDD of ``(doc_id, {word: tfidf_score})``, coalesced back to
        the input's partition count.
    """
    if min_count > 1:
        doc_frequency = cls.filter_min_doc_frequency(doc_frequency, min_count=min_count)

    # Preserve the input's partitioning on the way out.
    num_partitions = doc_word_counts.getNumPartitions()

    def _by_doc_id(kv):
        # (word, ((doc_id, tf), df)) -> (doc_id, (word, tf, df))
        # NOTE: named function instead of a tuple-unpacking lambda, which
        # is Python 2-only syntax (removed by PEP 3113).
        word, ((doc_id, term_frequency), df) = kv
        return doc_id, (word, term_frequency, df)

    doc_ids_word_stats = doc_word_counts.join(doc_frequency).map(_by_doc_id)

    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: {word: TFIDF.tfidf_score(tf, df, total_docs)
                                 for word, tf, df in vals})
    return docs_tfidf.coalesce(num_partitions)
def doc_scores(cls, doc_words, geo_doc_frequency, total_docs_by_geo, min_count=1):
    """Score each document's words using geo-conditioned document frequencies.

    Like the non-geo ``doc_scores`` but document frequency and corpus size
    are both keyed by geo, so each word is scored against its geo's own
    document count. Produces a position-ordered ``(word, score)`` list per
    document with term frequency fixed at 1.0.

    :param doc_words: RDD of ``((geo, word), (doc_id, pos))``.
    :param geo_doc_frequency: RDD of ``((geo, word), count)``.
    :param total_docs_by_geo: RDD of ``(geo, num_docs)``.
    :param min_count: drop (geo, word) pairs seen fewer than this many times.
    :return: RDD of ``(doc_id, [(word, score), ...])`` sorted by word
        position, coalesced back to the input's partition count.
    """
    if min_count > 1:
        geo_doc_frequency = cls.filter_min_doc_frequency(
            geo_doc_frequency, min_count=min_count)

    num_partitions = doc_words.getNumPartitions()

    # The three helpers below replace Python 2-only tuple-unpacking
    # lambdas (removed by PEP 3113) with equivalent named functions.

    def _key_by_geo(kv):
        # ((geo, word), count) -> (geo, (word, count)) so we can join on geo.
        (geo, word), count = kv
        return geo, (word, count)

    def _rekey_by_geo_word(kv):
        # (geo, ((word, count), num_docs)) -> ((geo, word), (count, num_docs))
        geo, ((word, count), num_docs) = kv
        return (geo, word), (count, num_docs)

    def _by_doc_id(kv):
        # ((geo, word), ((doc_id, pos), (df, num_docs)))
        #   -> (doc_id, (word, pos, df, num_docs))
        (geo, word), ((doc_id, pos), (df, num_docs)) = kv
        return doc_id, (word, pos, df, num_docs)

    # Attach each geo's total document count to its per-word frequencies.
    geo_doc_frequency_totals = geo_doc_frequency.map(_key_by_geo) \
        .join(total_docs_by_geo) \
        .map(_rekey_by_geo_word)

    doc_ids_word_stats = doc_words.join(geo_doc_frequency_totals) \
        .map(_by_doc_id)

    # Sort each doc's words by position (index 1); score with tf = 1.0
    # against the geo-local corpus size.
    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, df, num_docs))
                                 for word, pos, df, num_docs in sorted(vals, key=operator.itemgetter(1))])
    return docs_tfidf.coalesce(num_partitions)
def docs_tfidf(cls, doc_word_counts, geo_doc_frequency, total_docs_by_geo, min_count=1):
    """Build per-document TF-IDF dictionaries with geo-conditioned IDF.

    Like the non-geo ``docs_tfidf`` but document frequency and corpus size
    are keyed by geo, so each word's score uses its geo's own document
    count.

    :param doc_word_counts: RDD of ``((geo, word), (doc_id, term_frequency))``.
    :param geo_doc_frequency: RDD of ``((geo, word), count)``.
    :param total_docs_by_geo: RDD of ``(geo, num_docs)``.
    :param min_count: drop (geo, word) pairs seen fewer than this many times.
    :return: RDD of ``(doc_id, {word: tfidf_score})``, coalesced back to
        the input's partition count.
    """
    if min_count > 1:
        geo_doc_frequency = cls.filter_min_doc_frequency(
            geo_doc_frequency, min_count=min_count)

    num_partitions = doc_word_counts.getNumPartitions()

    # The three helpers below replace Python 2-only tuple-unpacking
    # lambdas (removed by PEP 3113) with equivalent named functions.

    def _key_by_geo(kv):
        # ((geo, word), count) -> (geo, (word, count)) so we can join on geo.
        (geo, word), count = kv
        return geo, (word, count)

    def _rekey_by_geo_word(kv):
        # (geo, ((word, count), num_docs)) -> ((geo, word), (count, num_docs))
        geo, ((word, count), num_docs) = kv
        return (geo, word), (count, num_docs)

    def _by_doc_id(kv):
        # ((geo, word), ((doc_id, tf), (df, num_docs)))
        #   -> (doc_id, (geo, word, tf, df, num_docs))
        (geo, word), ((doc_id, term_frequency), (df, num_docs)) = kv
        return doc_id, (geo, word, term_frequency, df, num_docs)

    # Attach each geo's total document count to its per-word frequencies.
    geo_doc_frequency_totals = geo_doc_frequency.map(_key_by_geo) \
        .join(total_docs_by_geo) \
        .map(_rekey_by_geo_word)

    doc_ids_word_stats = doc_word_counts.join(geo_doc_frequency_totals) \
        .map(_by_doc_id)

    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: {word: TFIDF.tfidf_score(tf, df, num_docs)
                                 for geo, word, tf, df, num_docs in vals})
    return docs_tfidf.coalesce(num_partitions)
def name_similarity(cls, tfidf1, tfidf2):
    """Soft TF-IDF similarity between two names.

    Normalizes both TF-IDF inputs with ``TFIDF.normalized_tfidf_vector``
    and compares them with ``soft_tfidf_similarity``.

    :param tfidf1: TF-IDF weights for the first name.
    :param tfidf2: TF-IDF weights for the second name.
    :return: similarity score from ``soft_tfidf_similarity``.
    """
    vec1 = TFIDF.normalized_tfidf_vector(tfidf1)
    vec2 = TFIDF.normalized_tfidf_vector(tfidf2)
    return soft_tfidf_similarity(vec1, vec2)