Example #1
def calculate(self, minePackage, progress):
    webDocuments = []
    query = Document(minePackage['searchKey'])
    clouds = minePackage['clouds']
    count = UnPack()
    totalLinks = count.total(clouds)
    progress.set_totalIR(totalLinks)  # Total number of documents to retrieve
    progress.set_IRState('Ejecutando')  # Update the process state ('Ejecutando' = running)
    urlContent = UrlToPlainText()
    step = 0
    for cloud in clouds:
        if not progress.get_stop():
            for n in cloud.graph.nodes():
                if not progress.get_stop():
                    doc = cloud.graph.node[n]['methodData']
                    webDocuments.append(Document(doc.getData()))
                    step += 1
                    progress.set_IRProgress(step)  # Step-by-step progress of the retrieval
                else:
                    break
        else:
            break
    if not progress.get_stop():
        m = Model(documents=webDocuments, weight=TFIDF)
        for cloud in clouds:
            for n in cloud.graph.nodes():
                methodData = cloud.graph.node[n]['methodData']
                vector = Document(methodData.getData())
                cloud.graph.node[n]['weight_VSM'] = m.similarity(vector, query)
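The method above leans on the pattern.vector workflow used throughout these examples: wrap each retrieved text in a Document, build a TFIDF-weighted Model, and score documents against the search query with cosine similarity. A minimal, self-contained sketch of that workflow (the texts below are placeholders, not the crawled cloud data) might look like this:

from pattern.vector import Document, Model, TFIDF

# Placeholder texts standing in for the crawled node data.
query = Document('semantic web crawler')
pages = [Document('A focused crawler follows links about the semantic web.'),
         Document('Recipes for baking sourdough bread at home.')]

m = Model(documents=pages, weight=TFIDF)  # TF-IDF weighted vector space model
for page in pages:
    print(m.similarity(page, query))      # cosine similarity, higher = more relevant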
Example #2
def compare(origin_article_obj, tgt_article_objs):
	tgt_paragraph_docs = []
	tgt_grafs = []

	for obj in tgt_article_objs:
		if obj['paragraphs'] is not None:
			for graf in obj['paragraphs']:
				tgt_grafs.append({
					'text': graf,
					'url': obj['url'],
					'img': obj['img_src'],
					'title': obj['title']
				})
				tgt_paragraph_docs.append(Document(graf, description=obj['url']))

	origin_graf_doc = Document(' '.join(origin_article_obj['paragraphs']), description='origin')

	m = Model(documents=tgt_paragraph_docs+[origin_graf_doc], weight=TFIDF)

	tgts_by_dist = sorted(range(len(tgt_paragraph_docs)), key=lambda i: m.similarity(origin_graf_doc, tgt_paragraph_docs[i]))

	# Target paragraphs ordered from least to most similar to the origin article.
	furthest = [tgt_grafs[i] for i in tgts_by_dist]

	furthest_unique = []
	# Walk from most to least similar, keeping one entry per URL / paragraph text.
	for entry in furthest[::-1]:
		if not any(obj['url'] == entry['url'] or obj['text'] == entry['text'] for obj in furthest_unique):
			furthest_unique.append(entry)

	# Return at most ten entries.
	return furthest_unique[:10]
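A hypothetical call to compare(), with the input shape inferred from the keys the function reads (the dicts and values below are illustrative, not from the original project):

# Illustrative input shapes, inferred from the keys compare() reads above.
origin = {'paragraphs': ['First paragraph of the origin article.',
                         'Second paragraph of the origin article.']}
targets = [{'paragraphs': ['A candidate paragraph from another article.'],
            'url': 'https://example.com/story',
            'img_src': 'https://example.com/story.jpg',
            'title': 'Example story'}]

matches = compare(origin, targets)
# -> up to 10 dicts, each with 'text', 'url', 'img' and 'title' keys.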
Example #3
def calculate(self, minePackage):
    webDocuments = []
    query = Document(minePackage['searchKey'])
    clouds = minePackage['clouds']
    count = UnPack()
    totalLinks = count.total(clouds)
    urlContent = UrlToPlainText()
    step = 0
    for cloud in clouds:
        for n in cloud.graph.nodes():
            doc = cloud.graph.node[n]['methodData']
            webDocuments.append(Document(doc.getData()))
            step += 1
    m = Model(documents=webDocuments, weight=TFIDF)
    for cloud in clouds:
        for n in cloud.graph.nodes():
            methodData = cloud.graph.node[n]['methodData']
            vector = Document(methodData.getData())
            # Set the VSM similarity score on the cloud node.
            cloud.graph.node[n]['weight_VSM'] = m.similarity(vector, query)
Example #4
def main():

    ##############################################################################################
    print('QUESTION 1, Part I: Web Crawling: Extraction of Book Titles')
    print("-" * 70)
    print('\n')
    print(
        'Retrieving Book Titles from the first two pages of Amazon search results! \n'
    )
    print('Please wait a minute... \n')

    print("~" * 70)

    #open the base URL webpage
    level_1_url = "https://www.amazon.com/s?url=search-alias%3Daps&field-keywords=Martin+Heidegger"

    all_titles = get_titles(level_1_url)

    #print with text wrapping
    format = '%s'

    pieces = [format % (ttl) for ttl in all_titles]
    output = ' | '.join(pieces)
    ttls = fill(output)
    print('The scraped book titles are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(ttls.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 1, Part II: Pairwise Text Cosine Similarity Scores of Book Titles'
    )
    print("-" * 70)
    print('\n')

    doc_list = []
    for i in range(len(all_titles)):
        doc_list.append(
            Document(all_titles[i], type=" ".join(all_titles[i].split())))

    m = Model(documents=doc_list, weight=TFIDF)

    # Cosine similarity for every ordered pair of titles (i != j);
    # each entry is (score, title_i, title_j).
    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    # Every pair appears twice ((i, j) and (j, i)); converting to frozensets
    # collapses those duplicates. Entries that lose an element in the process
    # (e.g. two identical titles) fail the length check and are dropped.
    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    # frozensets are unordered, so rebuild each entry as (score, title, title):
    # sorting the stringified elements places the score (which begins with a
    # digit) ahead of titles that begin with letters, then the score is cast
    # back to float.
    resorted_cos_sim_ttl = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_ttl.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_ttl[i][0] = float(resorted_cos_sim_ttl[i][0])
        resorted_cos_sim_ttl[i] = tuple(resorted_cos_sim_ttl[i])

    print(
        'The number of calculated book title cosine similarity scores is: {} \n'
        .format(len(resorted_cos_sim_ttl)))

    print(
        'All non-zero book title cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_ttl):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most similar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most dissimilar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 1, Part III: Most Similar and Dissimilar Book Titles and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of book titles is: \n')
    print(max(resorted_cos_sim_ttl))
    print('\n')

    print('The most dissimilar pair of book titles is: \n')
    print(min(resorted_cos_sim_ttl))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[2]))
    print('\n')

    #############################################################################################
    print('QUESTION 2, Part I: Web Crawling: Extraction of Search Capsules')
    print("-" * 70)
    print('\n')

    orig_query = 'Ponderings XII–XV: Black Notebooks 1939–1941 (Studies in Continental Thought)'

    level_1_url = "https://www.google.com/search?q=" + orig_query.replace(
        ' ', '+')

    all_capsules = get_capsules(level_1_url)

    all_capsules_clean = []
    for cp in all_capsules:
        all_capsules_clean.append(
            unicodedata.normalize('NFKD', cp).encode('ascii',
                                                     'ignore').decode('utf-8'))

    #print with text wrapping
    format = '%s'

    pieces = [format % (cap) for cap in all_capsules_clean]
    output = ' | '.join(pieces)
    caps = fill(output)
    print('The scraped capsules are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(caps.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 2, Part II: Pairwise Text Cosine Similarity Scores of Search Capsules'
    )
    print("-" * 70)
    print('\n')

    query_list = []
    for i in range(len(all_capsules_clean)):
        query_list.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=query_list, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_caps = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_caps.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_caps[i][0] = float(resorted_cos_sim_caps[i][0])
        resorted_cos_sim_caps[i] = tuple(resorted_cos_sim_caps[i])

    print(
        'The number of calculated capsule cosine similarity scores is: {} \n'.
        format(len(resorted_cos_sim_caps)))

    print(
        'All non-zero capsule cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_caps):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most similar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most dissimilar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    print(
        'Finding the capsule with the highest cosine similarity to the original query... \n'
    )
    all_capsules_clean.append(orig_query)

    caps_and_query = []
    for i in range(len(all_capsules_clean)):
        caps_and_query.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=caps_and_query, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim_query = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_query = []
    for i in range(len(unique_cos_sim_query)):
        resorted_cos_sim_query.append(
            sorted(tuple(str(e) for e in unique_cos_sim_query[i])))
        resorted_cos_sim_query[i][0] = float(resorted_cos_sim_query[i][0])
        resorted_cos_sim_query[i] = tuple(resorted_cos_sim_query[i])

    result_list = []
    for tup in resorted_cos_sim_query:
        if orig_query in tup:
            result_list.append(tup)

    result_tup = max(result_list, key=lambda x: x[0])
    print(
        'The cosine similarity score of the capsule most similar to the original query is: \n'
    )
    print(result_tup)
    print('\n')

    print(
        'Finding search ranking of the capsule with the highest cosine similarity to the original query... \n'
    )

    match_list = []
    for item in all_capsules_clean:
        match_list.append(item.replace('\n', ''))

    print(
        'The search ranking of the capsule most similar to the original query is: \n'
    )
    print(match_list.index(result_tup[1]))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 2, Part III: Most Similar and Dissimilar Capsules and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of capsules is: \n')
    print(max(resorted_cos_sim_caps))
    print('\n')

    print('The most dissimilar pair of capsules is: \n')
    print(min(resorted_cos_sim_caps))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[2]))
    print('\n')

    ############################################################################################

    print('Summary Report: Document Similarity Semantic Analysis')
    print("-" * 70)
    ################
    report = "A crawler with changing user-agent headers was used to scrape book titles on Amazon from the first two pages of results returned when searching for the philosopher Martin Heidegger. Using TF-IDF values derived from a model incorporating the scraped results, all pairwise cosine similarity scores were calculated for the corpus documents, each of which consisted of the book title and any accompanying subtitle text. The scores were filtered for unique book title pairs and sorted by ascending cosine similarity score, so the top 5 and bottom 5 pairs could be printed in the terminal. As several pairings returned a cosine similarity score of 0, the most dissimilar pair among the lowest scores could not be decisively quantified. Interestingly, the elements of the most similar and most dissimilar pairs did not appear on the same page of search results. Another crawler was used to scrape capsules returned by a Google search for one of the book titles appearing in the Amazon results. Capsules from the first three pages of Google results were Unicode-normalized and decoded before they were incorporated into another model, from which TF-IDF values were derived. All pairwise cosine similarity scores were calculated for the new set of corpus documents, which consisted of all text appearing in each capsule. Scores were filtered for unique capsule pairs and sorted by ascending cosine similarity score; the top 5 and bottom 5 pairs were again printed in the terminal. To identify the capsule most similar to the original query, the latter was then included in the model, from which a new set of TF-IDF values and cosine similarity scores was generated. Interestingly, the most similar capsule ranked lower in the search results than expected, at the bottom of the second page. As expected, however, the capsules most similar to one another appeared on the same page of Google results."
    ##############
    format = '%s'
    pieces = [format % (word) for word in report]
    output = ''.join(pieces)
    write_up = fill(output)
    print(write_up)

    return None
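The summary above hinges on one recurring step: build a TFIDF model over the scraped strings and rank unique pairs by cosine similarity. A more compact sketch of that pairwise step (with illustrative titles; itertools.combinations yields each unordered pair exactly once, so the frozenset deduplication used in main() is not needed) could look like this:

from itertools import combinations
from pattern.vector import Document, Model, TFIDF

# Illustrative titles; in main() these come from the scraped Amazon results.
titles = ['Being and Time',
          'Introduction to Metaphysics',
          'Basic Writings of Martin Heidegger']
docs = [Document(t, type=t) for t in titles]
m = Model(documents=docs, weight=TFIDF)

pairs = sorted((m.similarity(a, b), a.type, b.type)
               for a, b in combinations(m.documents, 2))
for score, t1, t2 in pairs:  # ascending cosine similarity
    print(round(score, 3), t1, '|', t2)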
Example #6
# For example, say we have two vectors with features "x" and "y".
# We can calculate the distance between two points (x, y) in 2-D space:
# d = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
# This is the Euclidean distance in 2-D space.
# Similarly, we can calculate the distance in n-D space,
# in other words, for vectors with lots of features.
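
# A quick numeric check of that formula, generalized to n dimensions
# (illustrative values, not part of the original example):
from math import sqrt
p1 = (1.0, 2.0, 3.0)
p2 = (4.0, 6.0, 3.0)
print(sqrt(sum(pow(b - a, 2) for a, b in zip(p1, p2))))  # 5.0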

# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
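# (m is assumed to be a Model built earlier in the full example from documents
#  named "lion", "tiger", "dolphin", "shark" and "parakeet".)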
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")
print("lion-tiger:", m.similarity(d1, d2))
print("lion-dolphin:", m.similarity(d1, d3))
print("dolphin-shark:", m.similarity(d3, d4))
print("dolphin-parakeet:", m.similarity(d3, d5))
print()

print("Related to tiger:")
print(m.neighbors(d2, top=3))  # Top three most similar.
print()

print("Related to a search query ('water'):")
print(m.search("water", top=10))

# In summary:

# A Document:
Example #7
# document vector
from pattern.vector import Vector, distance, Document, Model, TFIDF

v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print(1 - distance(v1, v2))  # similarity = 1 - distance
# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document('A lion is a big yellow cat with manes.', type='lion')
d3 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
print(d1.vector)
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print(d1.vector)  # the vector is re-weighted once the document is part of the model
print(m.similarity(d1, d2))  # tiger vs. lion
print(m.similarity(d1, d3))  # tiger vs. elephant
# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')
m = Model([d1, d2, d3, d4])
m.reduce(2)
for d in m.documents:
    print()
    print(d.name)
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print(feature, w1 * w2)