예제 #1
0
def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    """Compute and store cosine similarities between the documents of a corpus.

    Every 'IN' document of ``corpus`` is rebuilt as one space-separated string
    of its 'IN' stemmed segments, wrapped in a ``pvDocument`` and collected in
    a ``pvCorpus``.  For each document the corpus neighbours are requested and
    every (document, neighbour) pair is saved as a ``Distance`` row, updating
    the row when it already exists.

    :param corpus: corpus model instance; ``id`` and ``name`` are read.
    :param routine: routine object handed to ``log_routine`` for progress.
    :param completion_start: forwarded unchanged to ``log_routine``.
    :param completion_score: forwarded unchanged to ``log_routine``.
    """
    print(
        """
	====================
	---- SIMILARITY ----
	====================
	"""
    )
    log_routine(
        routine,
        entry="similarity measurement started succesfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # Upper bound for the number of neighbours requested per document and
    # denominator of the progress ratio logged below.
    number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count()

    logger.info("[%s:%s] SIMILARITY on %s 'IN' documents", corpus.name, corpus.id, number_of_documents)

    # Fetch every (stemmed segment, document id) pair of the corpus.
    # The distance algorithm works on "stemmed" documents.
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """

		SELECT s.stemmed, d.id as document_id 
			FROM anta_document_segment ds
		JOIN anta_segment s ON ds.segment_id = s.id
		JOIN anta_document d ON ds.document_id = d.id
		WHERE d.corpus_id = %s AND d.status='IN' AND s.status='IN'

	""",
        [corpus.id],
    )  # we do not need ORDER BY d.id, ds.id
    rows = cursor.fetchall()

    # The return value of cursor.execute() is backend dependent, so the
    # number of pairs is taken from the rows actually fetched.
    logger.info("[%s:%s] %s document_segment found", corpus.name, corpus.id, len(rows))

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # Group the stemmed segments by the id of the document they belong to.
    # @todo: improve remapping
    stems_by_document = {}
    for stemmed, document_id in rows:
        stems_by_document.setdefault(document_id, []).append(stemmed)

    logger.info("[%s:%s]  creating documents for Pattern", corpus.name, corpus.id)

    # pattern_documents: database document id -> pvDocument
    # pattern_id_translation: pvDocument id -> database document id
    pattern_documents = {}
    pattern_id_translation = {}

    # Reformat each document into a PATTERN compatible document: join space
    # separated stemmed segment values.
    for document_id, stems in stems_by_document.items():
        pattern_document = pvDocument(" ".join(stems))
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_documents[document_id] = pattern_document
        pattern_id_translation[pattern_document.id] = document_id

    logger.info(
        "[%s:%s] %s documents created for Pattern",
        corpus.name,
        corpus.id,
        len(pattern_id_translation),
    )

    # Store the documents in a Pattern corpus.
    c = pvCorpus(list(pattern_documents.values()))

    # Compute and save similarities.
    for counter, pattern_document in enumerate(pattern_documents.values()):
        alpha_id = pattern_id_translation[pattern_document.id]

        # Computed once per document and reused for both the emptiness check
        # and the save loop.
        neighbors = c.neighbors(pattern_document, top=number_of_documents)
        if not neighbors:
            logger.warning("no neighbors for document: %s", alpha_id)
            continue

        logger.info(
            "[%s:%s] %s neighbors found for document: %s, completion: %s",
            corpus.name,
            corpus.id,
            len(neighbors),
            alpha_id,
            counter / float(number_of_documents),
        )

        for n in neighbors:
            # Each neighbour entry holds the similarity at index 0 and the
            # neighbouring pvDocument at index 1.
            cosine_similarity = n[0]
            omega_id = pattern_id_translation[n[1].id]

            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
            except Distance.DoesNotExist:
                # No distance stored yet between these two documents.
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)

            dist.cosine_similarity = cosine_similarity
            dist.save()
예제 #2
0
def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    """Compute and store cosine similarities between the documents of a corpus.

    Every document of ``corpus`` is rebuilt as one space-separated string of
    its stemmed segments, wrapped in a ``pvDocument`` and collected in a
    ``pvCorpus``.  For each document the corpus neighbours are requested and
    every (document, neighbour) pair is saved as a ``Distance`` row, updating
    the row when it already exists.  The transaction is committed after the
    start entry is logged and again after each document has been processed.

    :param corpus: corpus model instance; ``id`` and ``json()`` are used.
    :param routine: routine object handed to ``log_routine`` for progress.
    :param completion_start: forwarded unchanged to ``log_routine``.
    :param completion_score: forwarded unchanged to ``log_routine``.
    """
    print(
        """
	====================
	---- SIMILARITY ----
	====================
	"""
    )
    log_routine(
        routine,
        entry="similarity measurement started succesfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )
    transaction.commit()

    # Upper bound for the number of neighbours requested per document and
    # denominator of the progress ratio reported below.
    number_of_documents = Document.objects.filter(corpus=corpus).count()

    # Print some information.
    print("[info] corpus: %s" % (corpus.json(),))
    print("[info] document in corpus: %s" % (number_of_documents,))

    # Fetch every (stemmed segment, document id) pair of the corpus.
    # The distance algorithm works on "stemmed" documents.
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """

		SELECT s.stemmed, d.id as document_id 
			FROM anta_document_segment ds
		JOIN anta_segment s ON ds.segment_id = s.id
		JOIN anta_document d ON ds.document_id = d.id
		WHERE d.corpus_id = %s

	""",
        [corpus.id],
    )  # we do not need ORDER BY d.id, ds.id

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # Group the stemmed segments by the id of the document they belong to.
    # @todo: improve remapping
    stems_by_document = {}
    for stemmed, document_id in cursor.fetchall():
        stems_by_document.setdefault(document_id, []).append(stemmed)

    # pattern_documents: database document id -> pvDocument
    # pattern_id_translation: pvDocument id -> database document id
    pattern_documents = {}
    pattern_id_translation = {}

    # Reformat each document into a PATTERN compatible document: join space
    # separated stemmed segment values.
    for document_id, stems in stems_by_document.items():
        pattern_document = pvDocument(" ".join(stems))
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_documents[document_id] = pattern_document
        pattern_id_translation[pattern_document.id] = document_id

    # Store the documents in a Pattern corpus.
    c = pvCorpus(list(pattern_documents.values()))

    # Compute and save similarities.
    for counter, pattern_document in enumerate(pattern_documents.values()):
        print(counter)
        alpha_id = pattern_id_translation[pattern_document.id]

        for n in c.neighbors(pattern_document, top=number_of_documents):
            # Each neighbour entry holds the similarity at index 0 and the
            # neighbouring pvDocument at index 1.
            cosine_similarity = n[0]
            omega_id = pattern_id_translation[n[1].id]

            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
                print(
                    "[info] distantce exists ( %s - %s ), old value: %s, difference: %s"
                    % (
                        alpha_id,
                        omega_id,
                        dist.cosine_similarity,
                        (dist.cosine_similarity - cosine_similarity),
                    )
                )
            except Distance.DoesNotExist as e:
                # Only a missing row means "create it"; any other failure
                # (database error, duplicated rows) must surface instead of
                # silently producing a new Distance.
                print(e)
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)
                print("[info] create Distance object %s %s" % (dist.id, cosine_similarity))

            dist.cosine_similarity = cosine_similarity
            dist.save()

        # Progress only changes per document, so it is reported once here
        # rather than once per neighbour pair.
        log_routine(
            routine,
            entry="neighbors computation",
            completion=(counter + 1.0) / number_of_documents,
            completion_start=completion_start,
            completion_score=completion_score,
        )

        transaction.commit()