Example #1
def map_contents(input=sys.stdin, output=sys.stdout, stop_words=None):
    for line in input:
        docname = os.environ['mapreduce_map_input_file']
        if stop_words is None:
            contents = mru.clean_text(line)
        else:
            contents = mru.clean_text(line, stop_words)
        key = {'filename': docname}
        value = {'words': list(contents)}
        # we emit as if we were a reducer since the contents don't get put
        # through a reducer
        mru.reducer_emit(key, value, output)
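
This mapper learns which document it is processing from the mapreduce_map_input_file environment variable, which Hadoop Streaming exposes to each map task, so it can be exercised locally by setting that variable by hand and substituting in-memory streams for stdin and stdout. A minimal harness, assuming mru is the project's map_reduce_utils module, that mru.clean_text returns an iterable of tokens, and that mru.reducer_emit writes one serialized key/value line to the output stream (none of which is shown above):

import io
import os

# normally set by Hadoop Streaming for each map task; set by hand here
os.environ['mapreduce_map_input_file'] = 'corpus/doc1.txt'
fake_in = io.StringIO('The quick brown fox jumps over the lazy dog\n')
fake_out = io.StringIO()
map_contents(input=fake_in, output=fake_out)
print(fake_out.getvalue())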
Example #2
def map_claims(input=sys.stdin, output=sys.stdout,
               kv_delim=INPUT_KV_DELIM, stop_words_file=None, stem=True):
    for line in input:
        key, value = line.strip().split(kv_delim)
        patent_id = key.strip()
        if stop_words_file is not None:
            # note: the stop words file is re-read for every input line
            with open(stop_words_file) as f:
                stop_words = json.load(f)
            contents = mru.clean_text(value, stop_words, stem)
        else:
            contents = mru.clean_text(value, stem=stem)
        key = {'filename': patent_id}
        contents = {'words': list(contents)}
        mru.reducer_emit(key, contents, output)
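
Each input line here is expected to be a patent id and its claim text joined by kv_delim. A hedged sketch of driving the function directly, passing the delimiter explicitly because the module-level INPUT_KV_DELIM constant is not shown in this snippet (the patent id and claim text are made-up placeholders):

import io

# hypothetical invocation; assumes mru.clean_text accepts the stem keyword as above
claims = io.StringIO('US1234567\tA method comprising a widget coupled to a sprocket\n')
out = io.StringIO()
map_claims(input=claims, output=out, kv_delim='\t', stem=False)
print(out.getvalue())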
Example #3
def map_contents(input=sys.stdin, output=sys.stdout):
    """
    (file_contents) --> (file_name) (file_contents)

    for each line from stdin consisting of a document in the corpus, emits
    a key-value pair to stdout with a key of the corresponding filename
    and a value of the file contents cleaned with
    map_reduce_utils.clean_text
    """
    template = '{}\t{}'
    for line in input:
        docname = os.environ['mapreduce_map_input_file']
        contents = clean_text(line)
        result = template.format(docname, ' '.join(map(str, contents)))
        print(result, file=output)
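
Unlike the earlier examples, this variant does not delegate to mru.reducer_emit: it formats the pair itself, writing the filename and the space-joined cleaned tokens as a single tab-separated line, which matches Hadoop Streaming's default key/value separator.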
Example #4
def compare_file(db, filename, k):
    """
    finds the k most similar documents to filename in the database referenced
    by the sqlite3 cursor db and prints them along with the cosine similarity
    metric between filename and each of the k documents
    """
    with open(filename, 'r') as f:
        contents = f.read()
    contents = mru.clean_text(contents)
    counts = {}
    for word in contents:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    docs_containing = {w: num_docs_containing(db, w) for w in set(contents)}

    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {
        word: tfidf(count, doc_size, corp_size, docs_containing[word])
        for word, count in counts.items()
    }

    # now, calculate the similarity metric with each document in the database
    similarities = {}
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    for doc in map(lambda x: x[0], documents):
        similarity = 0
        for word in set(contents):
            other_doc_tfidf = get_tfidf(db, doc, word)
            this_doc_tfidf = tfidfs[word]
            similarity += this_doc_tfidf * other_doc_tfidf
        similarities[doc] = similarity
    top_k = collections.Counter(similarities).most_common(k)
    print('\n'.join(map(lambda x: ':\t'.join([repr(i) for i in x]), top_k)))
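
The tfidf, get_tfidf, num_docs_containing, and get_corpus_size helpers are defined elsewhere in the project. A minimal sketch of what tfidf presumably computes, assuming the textbook term-frequency times inverse-document-frequency definition (the real helper may use smoothing or a different log base):

import math

# hypothetical stand-in for the tfidf() helper used above
def tfidf(count, doc_size, corpus_size, docs_containing):
    term_frequency = count / float(doc_size)
    inverse_document_frequency = math.log(corpus_size / float(docs_containing))
    return term_frequency * inverse_document_frequency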