예제 #1
0
 def buscaCorreo2(x):
     """Find the email most similar to the one selected in the GUI.

     Builds a TF-IDF model from every ".txt" file in the "Correos" folder,
     skipping the first four header lines of each file, then looks up the
     document whose numeric id is stored in the Tk variable `var` and shows
     its nearest neighbour in a message box.

     x: event argument from the Tk callback; unused.
     """
     documents = []
     documap = {}
     for archivo in os.listdir("Correos"):
         if not archivo.endswith(".txt"):
             continue
         # Context manager closes the file even if reading raises.
         with open(os.path.join("Correos", archivo), "r") as f:
             # Skip the four header lines; the mail body starts after them.
             for _ in range(4):
                 f.readline()
             mailbody = f.read()
         docu = Document(mailbody, name=archivo)
         documents.append(docu)
         # Key each document by its numeric filename ("12.txt" -> 12).
         documap[int(archivo[0:-4])] = docu
     model = Model(documents=documents, weight=TFIDF)
     docu = documap[int(var.get())]
     # neighbors() returns (similarity, document) tuples, best match first.
     tupla = model.neighbors(docu, top=1)[0]
     tkMessageBox.showinfo("Tk", "El documento que mas se parece es el " + tupla[1].name[0:-4] + ", con un " + str(tupla[0]) + " de similitud")
예제 #2
0
# NOTE(review): this snippet uses Python 2 `print` statement syntax; a
# Python 3 version of the same example appears further down in this file.
# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
# `m` is presumably a pattern.vector Model built earlier in the original
# example — it is not defined in this excerpt; verify against the full source.
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")
# Compare a few animal-description documents pairwise.
print "lion-tiger:", m.similarity(d1, d2)
print "lion-dolphin:", m.similarity(d1, d3)
print "dolphin-shark:", m.similarity(d3, d4)
print "dolphin-parakeet:", m.similarity(d3, d5)
print

# neighbors() lists the documents most similar to d2.
print "Related to tiger:"
print m.neighbors(d2, top=3)  # Top three most similar.
print

# search() ranks the model's documents against a free-text query.
print "Related to a search query ('water'):"
print m.search("water", top=10)

# In summary:

# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).

# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
예제 #3
0
# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")

# Print the pairwise document similarities, one labelled pair per line.
pairs = (
    ("lion-tiger:", d1, d2),
    ("lion-dolphin:", d1, d3),
    ("dolphin-shark:", d3, d4),
    ("dolphin-parakeet:", d3, d5),
)
for label, left, right in pairs:
    print(label, m.similarity(left, right))
print()

print("Related to tiger:")
print(m.neighbors(d2, top=3))  # The three most similar documents.
print()

print("Related to a search query ('water'):")
print(m.search("water", top=10))

# In summary:

# A Document:
# - wraps a string of text,
# - counts the words it contains,
# - builds a vector of words (features) with normalized counts (weights).

# A Model:
# - collects many such vectors into one matrix,
# - re-weights them with TF-IDF so words unique to a document stand out,
예제 #4
0
from pattern.vector import Document, Model, IG, TF, TFIDF, BINARY
import sys
import os

print "Reading sample code and instantiating documents..."
documents = []
exampleDir = "examples/"
for file in os.listdir(exampleDir):
    if os.path.isdir(exampleDir + file):
        for subfile in os.listdir(exampleDir + file):
            if (os.path.isfile(exampleDir + file + "/" + subfile)):
                with open (exampleDir + file + "/" + subfile, "r") as langDoc:
                    text = langDoc.read()
                    doc = Document(text, type=file)
                    documents.append(doc)

print "Creating statistical model..."
m = Model(documents=documents, weight=IG)

# Test with sample Java doc
print "Comparing test document..."
with open ("coffee.txt", "r") as myfile:
    testFile = myfile.read()
testDoc = Document(testFile, type='Java')
testSimilarities = m.neighbors(testDoc, top=10)
prediction = testSimilarities[0][1].type #neighbors() returns (similarity, document) list
confidence = testSimilarities[0][0]
print "LanguageLearn has predicted " + testSimilarities[0][1].type + " with a " + str(round(confidence * 100, 2)) + "% confidence"
예제 #5
0
def recommend_game(this_game):
    games = recommendable_games(this_game)

    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ": " + total_recommendable.__str__()

    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary,
                                top=None,
                                threshold=0,
                                stemmer=None,
                                exclude=[],
                                stopwords=False,
                                language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))

    # format: {"id":id, socre:"SUM(dist*pond)"}
    game_similarities = []
    summary_documents = []
    for game in games:
        score = 0
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]

        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])
        ).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])
        ).vector)

        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + (0.2 * publisher_similarity) + (
            0.20 * keywords_similarity)

        summary_documents.append(Document(game.summary,
                                          top=None,
                                          threshold=0,
                                          stemmer=None,
                                          exclude=[],
                                          stopwords=False,
                                          language='en',
                                          name=game.id))

        game_similarities.append({"id": game.id, "score": score})

    to_compare = Document(document_summary)

    model = Model(documents=summary_documents, weight=TFIDF)

    neighbours = model.neighbors(to_compare, top=total_recommendable)

    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]

    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]

    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], 25)
    else:
        random_selection = random.sample(recommended, 25)

    recommended_ids = [g['id'] for g in random_selection]

    return recommended_ids