Example #1
# Preprocess script - build a single text file with cleaned, normalised documents
#  - tokenised, stemmed, one document per line.
# Track fileids to retrieve document text later

import codecs

from nltk.corpus import reuters

# Tokenizer and Paths are project-specific helpers (the custom
#  tokeniser/stemmer and the output-file locations) defined elsewhere

docs = 0
bad = 0

tokenizer = Tokenizer()

with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:

        for f in reuters.fileids():
            try:
                # Read inside the try block so badly-encoded documents are
                #  counted and skipped rather than crashing the run
                contents = reuters.open(f).read()
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    print "Normalised %d documents" % (docs)

                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")

            except UnicodeDecodeError:
                bad += 1

print "Normalised %d documents" % (docs)
print "Skipped %d bad documents" % (bad)
print "Finished building " + Paths.texts_clean
Example #2
from nltk.corpus import reuters

# Custom tokeniser/normaliser (Tokenizer and Paths are the same
#  project-specific helpers used in the preprocessing script)
tokenizer = Tokenizer()

# Reload the corpus fileids saved by the preprocessing script so the
#  matching document text can be retrieved for each result
with open(Paths.text_index) as f:
	fileids = [line.rstrip() for line in f]
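
The loop below uses dictionary, model and index objects that are never loaded in this snippet. A minimal sketch of loading them, assuming they were saved with gensim's save() as in the build step sketched after Example #1, using the same hypothetical Paths attributes:

# Load the gensim objects built by the earlier step (sketch; the Paths
#  attributes are placeholders)
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity

dictionary = corpora.Dictionary.load(Paths.dictionary)
model = models.LsiModel.load(Paths.lsi_model)
index = MatrixSimilarity.load(Paths.lsi_index)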


print "Enter a query document:"
s = raw_input('> ')

while s and s != 'exit':

	# Convert input document into LSI vector space
	tokens = tokenizer.tokenize(s)
	bow_vector = dictionary.doc2bow(tokens)
	lsi_vector = model[bow_vector]

	# Compute similarity of input vector to all document vectors
	similarities = index[lsi_vector]
	similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

	# Get contents of most similar documents
	(file_no, score) = similarities[0]
	fileid = fileids[file_no]
	contents = reuters.open(fileid).read()

	# Re-convert most similar document to LSI space
	#  to examine similarity
	match_tokens = tokenizer.tokenize(contents.strip())