Example #1
File: tfidf.py  Project: dilshank/lieu
    @classmethod
    def geo_aliases(cls, total_docs_by_geo, min_doc_count=1000):
        # Requires module-level imports: operator, geohash (python-geohash).
        # Python 2 code: the tuple-parameter lambdas below are not valid
        # in Python 3 (see Example #5 for a ported version).
        # Split geohashes into those with enough documents to keep and
        # those to alias away.
        keep_geos = total_docs_by_geo.filter(
            lambda (geo, count): count >= min_doc_count)
        alias_geos = total_docs_by_geo.subtract(keep_geos)
        # Pair each sparse geohash with its kept neighbors, then pick the
        # neighbor with the highest document count as the alias target.
        return alias_geos.keys() \
                         .flatMap(lambda key: [(neighbor, key) for neighbor in geohash.neighbors(key)]) \
                         .join(keep_geos) \
                         .map(lambda (neighbor, (key, count)): (key, (neighbor, count))) \
                         .groupByKey() \
                         .map(lambda (key, values): (key, sorted(values, key=operator.itemgetter(1), reverse=True)[0][0]))
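
A minimal usage sketch (not from the project page): it assumes a live SparkContext sc, the python-geohash package, and an enclosing class here called GeoTFIDF; the sample data is invented for illustration.

# (geohash, document count) pairs: one dense cell, one sparse cell.
total_docs_by_geo = sc.parallelize([
    ('9q8yy', 12000),  # kept: meets min_doc_count
    ('9q8yz', 120),    # sparse: aliased to a kept neighbor
])
aliases = GeoTFIDF.geo_aliases(total_docs_by_geo, min_doc_count=1000)
# Each pair maps a sparse geohash to its most populous kept neighbor,
# e.g. ('9q8yz', '9q8yy') if the two cells are adjacent.
print(aliases.collect())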
Example #2
    @classmethod
    def doc_scores(cls, doc_words, doc_frequency, total_docs, min_count=1):
        # Optionally drop words that appear in fewer than min_count docs.
        if min_count > 1:
            doc_frequency = cls.filter_min_doc_frequency(doc_frequency,
                                                         min_count=min_count)

        num_partitions = doc_words.getNumPartitions()

        # Attach each word's document frequency to every (doc_id, pos)
        # occurrence, then re-key by document.
        doc_ids_word_stats = doc_words.join(doc_frequency) \
                                      .map(lambda (word, ((doc_id, pos), doc_frequency)): (doc_id, (word, pos, doc_frequency)))
        # Per document: sort words by position and score each with tf=1.0.
        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, doc_frequency, total_docs)) for word, pos, doc_frequency in sorted(vals, key=operator.itemgetter(1))])

        return docs_tfidf.coalesce(num_partitions)
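
The actual weighting above is delegated to TFIDF.tfidf_score, which this page does not show. A minimal sketch, assuming the conventional tf x idf definition (the project's real implementation may smooth or normalize differently):

import math

def tfidf_score(term_frequency, doc_frequency, total_docs):
    # tf x idf: the snippet passes tf=1.0 per occurrence, so the score
    # is driven entirely by how rare the word is across the corpus.
    return term_frequency * math.log(float(total_docs) / doc_frequency)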
Example #3
    @classmethod
    def doc_scores(cls,
                   doc_words,
                   geo_doc_frequency,
                   total_docs_by_geo,
                   min_count=1):
        # Optionally drop (geo, word) pairs seen in fewer than min_count docs.
        if min_count > 1:
            geo_doc_frequency = cls.filter_min_doc_frequency(
                geo_doc_frequency, min_count=min_count)

        num_partitions = doc_words.getNumPartitions()

        # Attach each geo's total document count to its per-word frequencies,
        # re-keyed by (geo, word) for the join with doc_words.
        geo_doc_frequency_totals = geo_doc_frequency.map(lambda ((geo, word), count): (geo, (word, count))) \
                                                    .join(total_docs_by_geo) \
                                                    .map(lambda (geo, ((word, count), num_docs)): ((geo, word), (count, num_docs)))
        doc_ids_word_stats = doc_words.join(geo_doc_frequency_totals) \
                                      .map(lambda ((geo, word), ((doc_id, pos), (doc_frequency, num_docs))): (doc_id, (word, pos, doc_frequency, num_docs)))

        # Per document: sort words by position and score each against its
        # own geo's corpus size rather than a global total.
        docs_tfidf = doc_ids_word_stats.groupByKey() \
                                       .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, doc_frequency, num_docs)) for word, pos, doc_frequency, num_docs in sorted(vals, key=operator.itemgetter(1))])

        return docs_tfidf.coalesce(num_partitions)
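
Compared with Example #2, everything here is keyed by (geo, word) composite keys. A sketch of the expected input shapes, inferred from the lambdas above, with invented sample data and an assumed SparkContext sc:

# doc_words:          ((geo, word), (doc_id, pos))
# geo_doc_frequency:  ((geo, word), count)
# total_docs_by_geo:  (geo, num_docs)
doc_words = sc.parallelize([
    (('9q8yy', 'market'), ('doc1', 0)),
    (('9q8yy', 'street'), ('doc1', 1)),
])
geo_doc_frequency = sc.parallelize([
    (('9q8yy', 'market'), 40),
    (('9q8yy', 'street'), 900),
])
total_docs_by_geo = sc.parallelize([('9q8yy', 12000)])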
Example #4
    @classmethod
    def doc_scores(cls, doc_words, word_info_gain):
        num_partitions = doc_words.getNumPartitions()

        # Attach each word's precomputed information-gain score to every
        # (doc_id, pos) occurrence, then re-key by document.
        doc_word_stats = doc_words.join(word_info_gain) \
                                  .map(lambda (word, ((doc_id, pos), info_gain)): (doc_id, (word, pos, info_gain)))

        # Per document: sort words by position, keeping (word, score) pairs.
        docs_info_gain = doc_word_stats.groupByKey() \
                                       .mapValues(lambda vals: [(word, val) for word, pos, val in sorted(vals, key=operator.itemgetter(1))])

        return docs_info_gain.coalesce(num_partitions)
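
Here word_info_gain is an RDD of (word, score) pairs computed elsewhere; the join simply attaches the precomputed score to each occurrence. The per-document stage reduces to this pure-Python equivalent (function name and dict form are assumptions for illustration):

import operator

def doc_info_gain(occurrences, word_info_gain):
    # occurrences: [(word, pos), ...]; word_info_gain: {word: score}.
    # Mirrors the groupByKey/mapValues step: sort by position, emit
    # (word, score) pairs in document order.
    vals = [(word, pos, word_info_gain[word]) for word, pos in occurrences]
    return [(word, score) for word, pos, score in sorted(vals, key=operator.itemgetter(1))]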
Example #5
    @classmethod
    def geo_aliases(cls, total_docs_by_geo, min_doc_count=1000):
        # Python 3 port of Example #1: tuple-parameter lambdas are gone,
        # so pair elements are indexed explicitly.
        keep_geos = total_docs_by_geo.filter(
            lambda geo_count: geo_count[1] >= min_doc_count)
        alias_geos = total_docs_by_geo.subtract(keep_geos)
        # Note: keys() already returns an RDD; wrapping it in list() would
        # break the chained flatMap.
        return alias_geos.keys() \
                         .flatMap(lambda key: [(neighbor, key) for neighbor in geohash.neighbors(key)]) \
                         .join(keep_geos) \
                         .map(lambda neighbor_key_count: (neighbor_key_count[1][0], (neighbor_key_count[0], neighbor_key_count[1][1]))) \
                         .groupByKey() \
                         .map(lambda key_values: (key_values[0], sorted(key_values[1], key=operator.itemgetter(1), reverse=True)[0][0]))
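
For intuition, the same alias logic without Spark, as a local sketch over plain dicts (geohash.neighbors comes from the python-geohash package; the function name is invented):

import operator
import geohash

def geo_aliases_local(total_docs_by_geo, min_doc_count=1000):
    # total_docs_by_geo: {geohash: doc_count}
    keep = {g: c for g, c in total_docs_by_geo.items() if c >= min_doc_count}
    aliases = {}
    for g in total_docs_by_geo:
        if g in keep:
            continue
        # Kept neighbors of the sparse cell, if any.
        kept_neighbors = [(n, keep[n]) for n in geohash.neighbors(g) if n in keep]
        if kept_neighbors:
            # Alias to the neighbor with the most documents.
            aliases[g] = max(kept_neighbors, key=operator.itemgetter(1))[0]
    return aliases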