def map_contents(input=sys.stdin, output=sys.stdout, stop_words=None):
    """
    (file_contents) --> (file_name) (words)

    For each line read from input (one document per line), emits a
    key-value pair to output whose key is the document's filename and
    whose value is the list of cleaned words in the document.

    input: iterable of document lines (defaults to stdin)
    output: writable stream for the emitted pairs (defaults to stdout)
    stop_words: optional collection of words to remove during cleaning;
        forwarded to mru.clean_text when provided
    """
    for line in input:
        # Hadoop streaming exposes the current input file through this
        # environment variable.
        docname = os.environ['mapreduce_map_input_file']
        if stop_words is None:
            contents = mru.clean_text(line)
        else:
            contents = mru.clean_text(line, stop_words)
        key = {'filename': docname}
        # list(...) materializes the cleaned tokens directly; the
        # pass-through comprehension [word for word in contents] added
        # nothing.
        value = {'words': list(contents)}
        # we emit as if we were a reducer since the contents don't get put
        # through a reducer
        mru.reducer_emit(key, value, output)
def map_claims(input=sys.stdin, output=sys.stdout, kv_delim=INPUT_KV_DELIM,
               stop_words_file=None, stem=True):
    """
    (patent_id kv_delim claim_text) --> (patent_id) (words)

    For each input line of the form "<patent_id><kv_delim><claim_text>",
    emits a key-value pair to output whose key is the patent id and whose
    value is the list of cleaned (and optionally stemmed) words from the
    claim text.

    input: iterable of delimited lines (defaults to stdin)
    output: writable stream for the emitted pairs (defaults to stdout)
    kv_delim: delimiter separating the patent id from the claim text
    stop_words_file: optional path to a JSON file holding stop words to
        strip during cleaning
    stem: whether mru.clean_text should stem the words
    """
    # Load the stop words once, up front. The original re-opened and
    # re-parsed the JSON file on every input line and never closed the
    # file handle.
    if stop_words_file is not None:
        with open(stop_words_file) as f:
            stop_words = json.loads(f.read())
    for line in input:
        key, value = line.strip().split(kv_delim)
        patent_id = key.strip()
        if stop_words_file is not None:
            contents = mru.clean_text(value, stop_words, stem)
        else:
            contents = mru.clean_text(value, stem=stem)
        key = {'filename': patent_id}
        contents = {'words': list(contents)}
        mru.reducer_emit(key, contents, output)
def map_contents(input=sys.stdin, output=sys.stdout):
    """
    (file_contents) --> (file_name) (file_contents)

    for each line from stdin consisting of a document in the corpus,
    emits a key-value pair to stdout with a key of the corresponding
    filename and a value of the file contents cleaned with
    map_reduce_utils.clean_text
    """
    for document in input:
        # Hadoop streaming supplies the current input file's name here.
        filename = os.environ['mapreduce_map_input_file']
        cleaned = clean_text(document)
        joined_words = ' '.join(str(token) for token in cleaned)
        print('{}\t{}'.format(filename, joined_words), file=output)
def compare_file(db, filename, k):
    """
    finds the k most similar documents to filename in the database
    referenced by the sqlite3 cursor db and prints them along with the
    cosine similarity metric between filename and each of the k documents
    """
    with open(filename, 'r') as f:
        contents = f.read()
    contents = mru.clean_text(contents)
    # Counter replaces the hand-rolled dict-counting loop.
    counts = collections.Counter(contents)
    # Compute the unique-word set once; the original rebuilt set(contents)
    # in every place that needed it.
    unique_words = set(contents)
    docs_containing = {w: num_docs_containing(db, w) for w in unique_words}
    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {
        word: tfidf(count, doc_size, corp_size, docs_containing[word])
        for word, count in counts.items()
    }
    # now, calculate the similarity metric with each document in the database
    similarities = {}
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    for doc in map(lambda x: x[0], documents):
        similarity = 0
        for word in unique_words:
            other_doc_tfidf = get_tfidf(db, doc, word)
            this_doc_tfidf = tfidfs[word]
            similarity += this_doc_tfidf * other_doc_tfidf
        similarities[doc] = similarity
    top_k = collections.Counter(similarities).most_common(k)
    # Parenthesized single-argument print prints the same string under
    # Python 2 (grouped expression) and Python 3 (function call); the
    # bare print statement was Python-2 only.
    print('\n'.join(map(lambda x: ':\t'.join([repr(i) for i in x]), top_k)))
def compare_file(db, filename, k):
    """
    finds the k most similar documents to filename in the database
    referenced by the sqlite3 cursor db and prints them along with the
    cosine similarity metric between filename and each of the k documents
    """
    with open(filename, 'r') as f:
        contents = f.read()
    contents = mru.clean_text(contents)
    # Counter replaces the manual word-frequency loop.
    counts = collections.Counter(contents)
    # Build the unique-word set a single time instead of recomputing
    # set(contents) wherever it is needed.
    unique_words = set(contents)
    docs_containing = {w: num_docs_containing(db, w) for w in unique_words}
    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {word: tfidf(count, doc_size, corp_size, docs_containing[word])
              for word, count in counts.items()}
    # now, calculate the similarity metric with each document in the database
    similarities = {}
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    for doc in map(lambda x: x[0], documents):
        similarity = 0
        for word in unique_words:
            other_doc_tfidf = get_tfidf(db, doc, word)
            this_doc_tfidf = tfidfs[word]
            similarity += this_doc_tfidf * other_doc_tfidf
        similarities[doc] = similarity
    top_k = collections.Counter(similarities).most_common(k)
    # Wrapping the single print argument in parentheses yields identical
    # output on Python 2 and Python 3; the bare statement form was
    # Python-2 only.
    print('\n'.join(map(lambda x: ':\t'.join([repr(i) for i in x]), top_k)))