Example #1
def clean(corpus, routine):
    print """
	========================
	---- CLEAN SEGMENTS ----
	========================
	"""
    # change routine type
    routine.type = "CLEAN"
    routine.save()
    transaction.commit()

    number_of_links = Document_Segment.objects.filter(document__corpus=corpus).count()
    loops = int(math.ceil(number_of_links / 25.0))

    # print "corpus: %s" % corpus.json()

    # print "number_of_links: %s" % number_of_links
    # print "loops: %s" % loops

    try:
        # manually change
        for i in range(0, loops):
            for j in Document_Segment.objects.filter(document__corpus=corpus)[0:25]:
                j.segment.delete()
                j.delete()

            # log progress once per batch, not once per deleted segment
            log_routine(routine, completion=float(i) / loops)
            transaction.commit()
        log_routine(routine, completion=1.0)

    except Exception, e:
        close_routine(routine, error="Ho trovato Wally 149, Exception: %s" % e, status="ERR")
        transaction.commit()
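
# Note: these routines all assume log_routine and close_routine helpers that
# persist progress on the routine object. A minimal sketch of what they might
# look like, assuming a Routine model with completion, status, last_entry,
# last_error and end_date fields -- every name here is a guess, not the
# project's actual API:
from datetime import datetime

def log_routine(routine, entry="", completion=0.0):
    # record progress; completion is a fraction in [0, 1]
    routine.completion = completion
    if entry:
        routine.last_entry = entry
    routine.save()

def close_routine(routine, error="", status="OK"):
    # mark the routine finished, with status "OK" or "ERR"
    routine.status = status
    routine.last_error = error
    routine.end_date = datetime.now()
    routine.save()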
Example #2
def importcsv(routine, csvfile, column="stemmed"):
    print """
	=========================================
	---- IMPORT SEGMENTS FROM REFINE CSV ----
	=========================================
	"""

    log_routine(routine, entry="importcsv started", completion=0)
    transaction.commit()

    rows = list(csvfile)
    totalrows = len(rows)
    for i, row in enumerate(rows):
        # update stemmed_refined cell
        try:
            s = Segment.objects.get(id=row["segment_id"])
            buffer_stemmed = s.stemmed
            s.stemmed = row["concept"]
            s.stemmed_refined = buffer_stemmed
            s.save()

        except Exception, e:
            # print " segment id %s was not found!" % row['segment_id']
            close_routine(routine, error="Exception: %s %s" % (e, row), status="ERR")
            transaction.commit()
            return

        if i % 25 == 0:
            log_routine(routine, entry="importcsv at line: %s" % i, completion=i / float(totalrows))
            print i, i / float(totalrows)

            transaction.commit()
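
# Note: a hypothetical invocation of the routine above, assuming the CSV is an
# OpenRefine export with "segment_id" and "concept" columns (the file name and
# the routine object are placeholders, not part of the original code):
import csv

with open("refine_export.csv", "rb") as f:
    reader = csv.DictReader(f)  # yields one dict per row, keyed by header
    importcsv(routine, reader, column="stemmed")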
Example #3
def tf_tfidf( corpus, routine ):
	number_of_documents = Document.objects.filter(corpus=corpus ).count()



	if number_of_documents == 0:
		return close_routine( routine, error="tfidf routine: no document found", status="OK")

	logger.info("starting TF computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, number_of_documents))
	tf( corpus=corpus, routine=routine, completion_start=0.0, completion_score=0.4 )
	logger.info("TF computation done")
	if routine.status == "ERR":
		logger.error("TF computation failed on corpus '%s' [%s]" % (corpus.name, corpus.id))
		return
	logger.info("TF computation done on corpus '%s' [%s]" % (corpus.name, corpus.id ))
	logger.info("starting TFIDF computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, number_of_documents))
	
	tfidf( corpus=corpus, routine=routine, completion_start=0.4, completion_score=0.4 )

	if routine.status == "ERR":
		logger.error("TFIDF computation failed on corpus '%s' [%s]" % (corpus.name, corpus.id))
		return

	logger.info("TFIDF computation done on corpus '%s' [%s]" % (corpus.name, corpus.id ))
	logger.info("starting SIMILARITY computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, number_of_documents ))
	
	similarity( corpus=corpus, routine=routine, completion_start=0.8, completion_score=0.2  )

	logger.info("SIMILARITY computation done on corpus '%s' [%s]" % (corpus.name, corpus.id ))
	
	close_routine( routine, error="", status="OK" )
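
# Note: the three stages split the progress bar into spans that sum to 1.0
# (TF 0.0-0.4, TFIDF 0.4-0.8, SIMILARITY 0.8-1.0). Reading completion_score as
# the width of a stage's slice, stage-local progress maps onto the overall bar
# like this -- a minimal sketch of the arithmetic, not code from the project:
def stage_completion(completion_start, completion_score, fraction):
    # fraction is the stage-local progress in [0, 1]
    return completion_start + fraction * completion_score

print stage_completion(0.4, 0.4, 0.5)  # halfway through TFIDF -> 0.6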
Example #4
def tf( corpus, routine, completion_start=0.0, completion_score=1.0 ):
	print """
	======================
	---- TF, RELOADED ----
	======================
	"""
	# change routine type
	routine.type = "TF"
	routine.save()
	transaction.commit()

	# get percentage info:
	number_of_documents_segments = Document_Segment.objects.filter( document__corpus=corpus ).count()
	logger.info( "number_of_documents_segments: %s" % number_of_documents_segments )

	if number_of_documents_segments == 0:
		logger.error( "TF Not enought segments in your corpus. Try 'standard' routine first..." )
		close_routine( routine, error="Not enought segments in your corpus. Try 'standard' routine first...", status="ERR")
		transaction.commit()
		return

	current_segment = 0

	for d in Document.objects.filter(corpus=corpus):
		# print "document: %s" % d
		number_of_stems_per_document = Document_Segment.objects.filter( document=d, segment__status='IN' ).values('segment__stemmed').distinct().count()
		# print "number_of_stems_per_document: %s" % number_of_stems_per_document

		logger.info( "document '%s' [%s], number of 'pseudo-stems': %s" % (d.title, d.id, number_of_stems_per_document) )
		

		for ds in Document_Segment.objects.filter( document=d,  segment__status='IN'):
			# count aliases (segments sharing the same stemmed version)
			number_of_aliases = Document_Segment.objects.filter( document=d,  segment__status='IN', segment__stemmed=ds.segment.stemmed ).count()
			ds.tf = float(number_of_aliases) / number_of_stems_per_document
			ds.save()
			if current_segment % 25 == 0:
				logger.info( "document '%s' [%s], completion: %s" % (d.title, d.id, ( float(current_segment) / number_of_documents_segments ) ) )
				# completion_score is the width of this stage's slice of the progress bar
				log_routine( routine, completion = completion_start + ( float(current_segment) / number_of_documents_segments ) * completion_score )
				transaction.commit()
	

			current_segment = current_segment + 1
		logger.info( "document '%s' [%s] completed!" % (d.title, d.id ) )
		

	if completion_score == 1.0:
		logger.info( "closing routine, task completed" )
		close_routine( routine, error="", status="OK" )
	transaction.commit()
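
# Note: a toy check of the tf arithmetic above, outside the ORM. tf is the
# number of aliases (same-stem segments in the document) over the number of
# distinct stems in that document -- values here are illustrative:
stems = ["bank", "bank", "river", "water"]  # 4 segments, 3 distinct stems
distinct = len(set(stems))
tf_bank = stems.count("bank") / float(distinct)
print tf_bank  # 2 / 3.0 = 0.666...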
Example #5
def standard(corpus, routine):

    number_of_documents = Document.objects.filter(corpus=corpus).count()

    if number_of_documents == 0:
        return close_routine(routine, error="No document found", status="ERR")

    # 1. distiller.decant (tf computation)
    try:
        decant(corpus=corpus, routine=routine, settings=settings, ref_completion=0.5)
    except Exception, e:
        return close_routine(routine, error="Exception: %s" % e, status="ERR")
Example #6
def tf(corpus, routine, completion_start=0.0, completion_score=1.0):
    print """
	======================
	---- TF, RELOADED ----
	======================
	"""
    # change routine type
    routine.type = "TF"
    routine.save()
    transaction.commit()

    # get percentage info:
    number_of_documents_segments = Document_Segment.objects.filter(document__corpus=corpus).count()
    print "number_of_documents_segments: %s" % number_of_documents_segments

    if number_of_documents_segments == 0:
        close_routine(
            routine, error="Not enought segments in your corpus. Try 'standard' routine first...", status="ERR"
        )
        transaction.commit()
        return

    current_segment = 0

    for d in Document.objects.filter(corpus=corpus):
        print "document: %s" % d
        number_of_stems_per_document = (
            Document_Segment.objects.filter(document=d).values("segment__stemmed").distinct().count()
        )
        print "number_of_stems_per_document: %s" % number_of_stems_per_document

        for ds in Document_Segment.objects.filter(document=d):
            # count aliases (segments sharing the same stemmed version)
            number_of_aliases = Document_Segment.objects.filter(document=d, segment__stemmed=ds.segment.stemmed).count()
            ds.tf = float(number_of_aliases) / number_of_stems_per_document
            ds.save()
            if number_of_aliases > 1:  # print just some lines
                print ds.segment.content, ds.segment.stemmed, number_of_aliases, ds.tf
            if current_segment % 25 == 0:
                log_routine(
                    routine,
                    # completion_score is the width of this stage's slice of the progress bar
                    completion=completion_start
                    + (float(current_segment) / number_of_documents_segments) * completion_score,
                )
                transaction.commit()

            current_segment = current_segment + 1

    if completion_score == 1.0:
        close_routine(routine, error="", status="OK")
    transaction.commit()
Example #7
def standard( corpus, routine ):
	
	number_of_documents = Document.objects.filter(corpus=corpus ).count()
	logger.info("opening standard routine, corpus: '%s' [%s], %s documents" % ( corpus.name, corpus.id, number_of_documents) )

	if number_of_documents == 0:
		logger.error("routine closed, not enough documents")
		return close_routine( routine, error="standard routine: No document found", status="OK")

	# 1. distiller.decant (tf computation )
	try:
		decant( corpus=corpus, routine=routine, settings=settings, ref_completion=0.5 )
	except Exception, e:
		logger.exception("Exception: %s" % e)
		return close_routine( routine, error="Exception: %s" % e, status="ERR")
Example #8
def importcsv( routine, csvfile, corpus, column="stemmed" ):
	print """
	=========================================
	---- IMPORT SEGMENTS FROM REFINE CSV ----
	=========================================
	"""
	

	log_routine( routine, entry="importcsv started", completion=0 )
	transaction.commit()
	
	rows = list(csvfile)
	totalrows = len(rows)

	logger.info( "[corpus:%s] import csv, total rows: %s" % (corpus.name, totalrows) )


	for i, row in enumerate(rows):
		try:
			segment_id = row['segment_id']
			concept = row['concept']
			status = 'IN' if row['status'] == 'IN' else 'OUT'
		except KeyError, e:
			logger.error( "KeyError exception: %s" % e )
			close_routine( routine, error="Exception: %s %s" % (e,row), status="ERR" )
			
			transaction.rollback()
			return
			
		# update stemmed_refined cell
		try:
			s = Segment.objects.get(id=segment_id)
			# update all the similar segment with the brand new segment
			Segment.objects.filter( stemmed=s.stemmed, documents__corpus=corpus ).update( stemmed=concept, status=status )
			# buffer_stemmed = s.stemmed
			# s.stemmed = concept
			# s.status = status
			# s.stemmed_refined = buffer_stemmed
			# s.save()

		except Segment.DoesNotExist, e:
			logger.error( "[corpus:%s] import csv, row %s raised exception: %s" % (corpus.name, i,e) )
			# print " segment id %s was not found!" % row['segment_id']
			close_routine( routine, error="Exception: %s %s" % (e,row), status="ERR" )
			transaction.commit()
			return
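
# Note: this variant also expects a "status" column ('IN' keeps the segment,
# anything else is coerced to 'OUT'). A hypothetical OpenRefine export that
# would satisfy the row lookups above -- values are illustrative:
#
#   segment_id,concept,status
#   421,social network,IN
#   422,sociale netwerk,OUT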
Example #9
def tf_tfidf(corpus, routine):
    number_of_documents = Document.objects.filter(corpus=corpus).count()

    if number_of_documents == 0:
        return close_routine(routine, error="No document found", status="ERR")

    tf(corpus=corpus, routine=routine, completion_start=0.0, completion_score=0.4)

    if routine.status == "ERR":
        return

    tfidf(corpus=corpus, routine=routine, completion_start=0.4, completion_score=0.4)

    if routine.status == "ERR":
        return

    similarity(corpus=corpus, routine=routine, completion_start=0.8, completion_score=0.2)

    close_routine(routine, error="", status="OK")
Example #10
def tfidf(corpus, routine, completion_start=0.0, completion_score=1.0, column="stemmed"):
    print """
	===========================
	---- TFIDF COMPUTATION ----
	===========================
	"""
    # routine.type = "TFIDF"
    # routine.save()

    # 1. get number of document
    number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count()

    logger.info("[%s:%s] TFIDF on %s 'IN' documents" % (corpus.name, corpus.id, number_of_documents))

    # 2. get all languages in corpus
    number_of_languages = Document.objects.filter(corpus=corpus).values("language").distinct().count()

    # 3. get GLOBAL number of segments (aka stems, segments grouped by their stemmed version: they do not need to be in the same corpus!!!)
    number_of_stems = (
        Segment.objects.values("stemmed", "language").annotate(Count("stemmed"), Count("language")).count()
    )

    logger.info("[%s:%s] %s segment groups" % (corpus.name, corpus.id, number_of_stems))
    # SELECT COUNT(*), stemmed FROM anta_segment GROUP BY stemmed

    # out some information
    # print "[info] column:",column
    # print "[info] corpus:",corpus.json()
    # print "[info] document in corpus:",number_of_documents
    # print "[info] stems in corpus (grouped by stemmed, language):",number_of_stems

    cursor = connection.cursor()

    # global counter (all languages cycle)
    current_stem = 0

    # 4. for each language, perform a tfidf
    for i in (
        Document.objects.filter(corpus=corpus, status="IN")
        .values("language")
        .annotate(num_document=Count("language"))
        .distinct()
    ):
        # print "[info] language info: ",i
        logger.info("[%s:%s] language %s" % (corpus.name, corpus.id, i["language"]))

        language = i["language"]
        # count tfidf group
        # SELECT COUNT(*), stemmed FROM anta_segment WHERE language="EN" GROUP BY stemmed
        stem_count = Segment.objects.filter(language=language).values("stemmed").annotate(Count("stemmed")).count()

        # check length. If it's 0, exit with error....
        if stem_count == 0:
            logger.error(
                "[%s:%s] not enought segments in your corpus with language %s" % (corpus.name, corpus.id, i["language"])
            )
            close_routine(
                routine, error="Not enought segments in your corpus. Try standard routine first...", status="ERR"
            )
            transaction.commit()
            return

        # 5. for each segment group in this language...
        number_of_groups = cursor.execute(
            """
			SELECT
				COUNT( DISTINCT ds.document_id ) as distribution, 
				s.language,
				s.stemmed 
			FROM `anta_document_segment` ds
			JOIN anta_segment s ON ds.segment_id = s.id
			JOIN anta_document d ON d.id = ds.document_id
			WHERE d.corpus_id = %s AND s.language = %s AND s.status = 'IN'
			GROUP BY s.stemmed ORDER BY distribution DESC, stemmed ASC""",
            [corpus.id, language],
        )

        logger.info("[%s:%s] distribution query executed for language '%s'" % (corpus.name, corpus.id, language))

        language_cursor = 0.0

        for row in dictfetchall(cursor):
            # increment global and per-language counters (stats)
            current_stem = current_stem + 1
            language_cursor += 1

            # store tfidf inside each segment-document relationships
            try:
                dss = Document_Segment.objects.filter(segment__stemmed=row["stemmed"], segment__language=language)

                df = float(row["distribution"]) / number_of_documents
                # print float(current_stem) / number_of_stems * 100.0, row[ column ], row['distribution'], df
            except Exception, e:
                logger.exception("[%s:%s] uhandled exception ..." % (corpus.name, corpus.id))

                # print e
                close_routine(
                    routine, error="Found Wally at line 281 of metrics.py, Exception: %s" % e, status="ERR"
                )
                transaction.commit()
                return

            for ds in dss:
                # df = distribution / N, so log(1 / df) = log(N / distribution), the usual idf
                ds.tfidf = ds.tf * math.log(1 / df)
                ds.save()

            if current_stem % 75 == 0:
                try:
                    # completion_score is the width of this stage's slice of the progress bar
                    completion = completion_start + (float(current_stem) / number_of_stems) * completion_score
                    logger.info(
                        "[%s:%s] language completion: %s"
                        % (corpus.name, corpus.id, (float(current_stem) / number_of_stems))
                    )

                    log_routine(routine, completion=completion)

                    # save percentage and commit transaction
                    transaction.commit()
                except Exception, e:
                    logger.exception("[%s:%s] unhandled exception ..." % (corpus.name, corpus.id))
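
# Note: dictfetchall is not defined in these snippets; the routine above
# appears to rely on the usual helper from the Django documentation, which
# looks like this:
def dictfetchall(cursor):
    "Return all rows from a cursor as a list of dicts keyed by column name."
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]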
Example #11
def standard(corpus, routine):
    number_of_documents = Document.objects.filter(corpus=corpus).count()
    logger.info(
        "opening standard routine, corpus: '%s' [%s], %s documents" % (corpus.name, corpus.id, number_of_documents)
    )

    if number_of_documents == 0:
        logger.error("routine closed, not enough documents")
        return close_routine(routine, error="standard routine: No document found", status="OK")

    # 1. distiller.decant (tf computation)
    try:
        decant(corpus=corpus, routine=routine, settings=settings, ref_completion=0.5)
    except Exception, e:
        logger.exception("Exception: %s" % e)
        return close_routine(routine, error="Exception: %s" % e, status="ERR")

    # 2. tf / tfidf computation
    tf_tfidf(corpus=corpus, routine=routine)
    close_routine(routine, error="", status="OK")


@transaction.commit_manually
def entities_alchemy(corpus, routine):
    print """
	=================================
	---- ENTITIES VIA ALCHEMYAPI ----
	=================================
	"""
    from services import alchemy

    number_of_documents = Document.objects.filter(corpus=corpus).count()
    # print "[info] number_of_documents:",number_of_documents
    # print "[info] alchmy apikey:", settings.ALCHEMY_API_KEY