Exemplo n.º 1
0
def main( argv):
	"""Parse the command line and dispatch to the requested corpus function.

	Options:
		-c / --corpus       corpus name (required)
		-f / --function     function to run; defaults to "decant"
		-k / --freebasekey  freebase api key, used by the "freebase" function

	argv is accepted for interface compatibility only: parse_args() is called
	without arguments, so optparse reads sys.argv[1:].

	Returns whatever the selected function (or error()) returns.
	"""
	usage = "usage: %prog -c corpus_name [-f]"
	parser = OptionParser( usage=usage )
	parser.add_option(
		"-c", "--corpus", dest="corpus",
		help="an anta corpus_name")

	parser.add_option(
		"-f", "--function", dest="function", default="decant",
		help="function to be performed. Default to 'decant', to computate tf")

	parser.add_option(
		"-k", "--freebasekey", dest="freebasekey", default="",
		help="freebase api key (use with function 'freebase'")

	( options, argv ) = parser.parse_args()

	# Stop here when no corpus was given, so that no function is ever
	# dispatched with corpus=None.
	if options.corpus is None:
		return error( message="Use -c to specify the corpus", parser=parser )

	# String comparison must use ==; `is` tests object identity and only
	# works by accident when the interpreter happens to intern both strings.
	if options.function == "decant":
		# decant() receives the corpus name, not the whole options object
		return decant(options.corpus, parser )
	elif options.function == "duplicates":
		return duplicates(options, parser )
	elif options.function == "freebase":
		return freebase( options, parser )

	# Any other function name falls back to basic(). The original code had an
	# error() call after this point that could never be reached.
	return basic(options, parser)
Exemplo n.º 2
0
def segments( options, parser ):
	print """
	=========================
	---- IMPORT SEGMENTS ----
	=========================	
	"""
	try:
		corpus = Corpus.objects.get( name=options.corpus )
	except:
		return error( message="corpus was not found! use sync.py script to load corpora", parser=parser )
	print "corpus:",corpus
	
	if not os.path.exists( options.csv ):
		error( message="csv file was not found.", parser=parser )
	f = open( options.csv, 'rb' )
	c = csv.DictReader( f, delimiter=options.delimiter )
	
	print "filename:",options.csv
	for row in c:
		print row
		# update stemmed_refined cell
		try:
			s = Segment.objects.get(id=row['segment_id'])
			print s.id, s.content
		except:
			print	" segemnt id %s was not found!" % row['segment_id']
			continue
		
		buffer_stemmed = s.stemmed
		s.stemmed = row['stemmed']
		s.stemmed_refined = buffer_stemmed
		s.save()
Exemplo n.º 3
0
def export(corpus, language, parser, column="stemmed"):
    print """
	===========================================
	---- EXPORT SEGMENTS FOR REFINE SAMPLE ----
	===========================================
	"""
    try:
        c = Corpus.objects.get(name=corpus)
    except Exception, e:
        # print "Exception: %s" % e
        return error(message="corpus '%s' was not found!" % corpus, parser=parser)
Exemplo n.º 4
0
def main( argv):
	"""Command line entry point for the csv segment import.

	Requires -c (corpus name) and -f (csv path); -d sets the csv delimiter
	and defaults to a tab. parse_args() is called without arguments, so the
	argv parameter itself is not consulted.

	Returns the result of error() when a required option is missing,
	otherwise the result of segments().
	"""
	parser = OptionParser( usage="usage: %prog -f csv -c corpus_name [-d delimiter]" )
	parser.add_option( "-c", "--corpus", dest="corpus", help="anta corpus_name" )
	parser.add_option( "-f", "--csv", dest="csv", help="csv file to be parsed" )
	parser.add_option( "-d", "--delimiter", dest="delimiter", default="\t", help="csv separator" )

	( options, argv ) = parser.parse_args()

	# required options, checked in this order; the first missing one wins
	required = (
		( options.corpus, "Use -c to specify the corpus" ),
		( options.csv, "Use -f to specify the csv file path" ),
	)
	for value, complaint in required:
		if value is None:
			return error( message=complaint, parser=parser )

	return segments(options, parser)
Exemplo n.º 5
0
def main(argv):
    """Parse the routine command line and load the requested Routine.

    Registers the -r, -c, -l, -f, -d, -o and -x options, reports a missing
    -f or -r through error(), then fetches the Routine whose primary key is
    options.routine; a failing lookup is reported through error() as well.
    parse_args() is called without arguments, so argv itself is not consulted.
    """
    usage = "usage: %prog -f function [ standard|importcsv [-o column] [-x csvfile] ]"
    parser = OptionParser(usage=usage)

    # (short flag, long flag, keyword settings), in registration order
    option_specs = (
        ("-r", "--routine", dict(dest="routine", help="anta routine id")),
        ("-c", "--corpus", dict(dest="corpus", help="anta Corpus.id")),
        ("-l", "--language", dict(dest="language", default="en", help="language")),
        ("-f", "--function", dict(dest="func", help="function")),
        ("-d", "--delimiter", dict(dest="delimiter", default="\t", help="csv cell delimiter")),
        ("-o", "--tfidfcolumn", dict(dest="tfidfcolumn", default="stemmed", help="function")),
        ("-x", "--csv", dict(dest="filename", default="", help="csv file absolute path")),
    )
    for short_flag, long_flag, settings in option_specs:
        parser.add_option(short_flag, long_flag, **settings)

    (options, argv) = parser.parse_args()

    # NOTE(review): the results of error() are not returned here, so this
    # relies on error() terminating the program - confirm against error().
    if options.func is None:
        error(message="Use -f to specify the desired function.", parser=parser)

    if options.routine is None:
        error(message="Use -r to specify the routine", parser=parser)

    try:
        routine = Routine.objects.get(pk=options.routine)
    except Exception as e:
        error(message="Ho trovato Wally 572 Exception: %s" % e, parser=parser)
Exemplo n.º 6
0
def duplicates( options, parser ):
	# get corpus, else exit
	try:
		corpus = Corpus.objects.get( name=options.corpus )
	except:
		return error( message="corpus was not found! use sync.py script to load corpora", parser=parser )
	
	#  number of segments inside the corpus
	document_segments = Document_Segment.objects.filter(document__corpus = corpus)
	num_of_segments = document_segments.count()
	# print similarity("ciao", "caio", metric=DICE)
	
	c = 0
	for i in range(0, num_of_segments ):
		for j in range (i+1, num_of_segments ):
			#print i,j, document_segments[i].segment.stemmed, document_segments[j].segment.stemmed
			a = document_segments[i].segment.stemmed
			b = document_segments[j].segment.stemmed
			
			if a == b:
				# equal strings? not now, please
				c+=1
				continue
				
			dl = len(a) - len(b)
			ml = max( len(a), len(b) )
			
			if abs(dl) > ml/10.0:
				continue
			
			
			
			# test levensh
			ratio = 1-levenshtein(a,b)/float(ml)
			
			if ratio < .75:
				continue
			# print similarity(a, b, metric=DICE)
			print
			print ratio
			print " ", document_segments[i].segment.stemmed, document_segments[j].segment.stemmed
			print " ", document_segments[i].segment.content, document_segments[j].segment.content
			print
			
			c+=1
		# inner cycle
		
		#break
	print "found", c, "duplicates"
Exemplo n.º 7
0
        # content = f.read()
        # print "delimiter: options.delimiter", content
        # csv.reader(f, dialect, **kwds)
        # csvfile = unicodecsv.reader( content, encoding='utf-8')

        # except Exception, e:
        # 	error_message = "Ho trovato Wally 559, Exception: %s" % e
        #
        #     ==============================
        #     ---- output error message ----
        #     ==============================
        #
    if error_message is not None:
        # print error_message
        close_routine(routine, error=error_message, status="ERR")
        error(message=error_message, parser=parser)

        #
        #     =================================
        #     ---- execute valid functions ----
        #     =================================
        #
    if options.func == "standard":
        return standard(routine=routine, corpus=corpus)  # pattern tf + stems tfidf

    elif options.func == "similarity":
        return similarity(routine=routine, corpus=corpus)  # computate distances and store them inside a dedicated table

    elif options.func == "tf_tfidf":
        return tf_tfidf(routine=routine, corpus=corpus)  # delete segments
Exemplo n.º 8
0
def freebase( options, parser ):
	# needs to have an api key provided
	# options['api_key']
	# @todo handle transaction 
	try:
		corpus = Corpus.objects.get( name=options.corpus )
	except:
		return error( message="corpus was not found! use sync.py script to load corpora", parser=parser )
	
	# for each segments in given documents
	# 
	segments = Segment.objects.filter( documents__corpus = corpus).annotate(num_concepts=Count('concepts'))
	num_of_segments = segments.count()
	
	
	print "[info] freebase analysis"
	print "[info] freebase api key:", options.freebasekey
	print "[info] num of segments:", num_of_segments
	
	
	for s in segments:
		
		
		# quick and dirty test
		
		
		
		#if s.num_concepts > 2:
			
		#	print s.concepts
		#try:
		print "[info] search:",s.content, s.language, s.num_concepts
		results =  fsearch({'query':s.content, 'key':options.freebasekey, 'lang':s.language.lower()}, stop_universes=[
			"/book/written_work", 
			"book/book/",
			"/film/film", "/music/track",
			"/book/book_edition",
			"/tv/tv_series_episode",
			"/fictional_universe/work_of_fiction",
			"/music/composition",
			"/music/release",
			"/tv/tv_program"
		])
		for r in results:
			
			# get existing relata
			try:
				relatum = Relatum.objects.get( slug=r['notable']['id'], language=s.language )
			except:
				relatum = Relatum( 
					content		= r['name'],
					language	= s.language,
					slug		= r['notable']['id'],
					name		= r['notable']['name']
				)
				relatum.save()
			
			# skip if relationship between relatum and segment exists
			relation = Segment_Semantic_Relation( segment=s, relatum=relatum )
			try:
				relation.save()
			except:
				continue
Exemplo n.º 9
0
def executere(options, parser ):
	"""Look up the corpus named options.corpus.

	Returns the result of error() when the lookup raises.
	"""
	try:
		corpus = Corpus.objects.get( name=options.corpus )
	except:
		# NOTE(review): the bare except reports every failure, not only a
		# missing row, as "corpus was not found"
		return error( message="corpus was not found! use sync.py script to load corpora", parser=parser )