def main(argv):
    usage = "usage: %prog -c corpus_name [-f]"
    parser = OptionParser(usage=usage)
    parser.add_option("-c", "--corpus", dest="corpus",
        help="an anta corpus_name")
    parser.add_option("-f", "--function", dest="function", default="decant",
        help="function to be performed. Defaults to 'decant'")
    parser.add_option("-k", "--freebasekey", dest="freebasekey", default="",
        help="freebase api key (use with function 'freebase')")

    (options, argv) = parser.parse_args()

    if options.corpus is None:
        return error(message="Use -c to specify the corpus", parser=parser)

    # dispatch the requested function; 'basic' handles everything
    # that has no dedicated branch
    if options.function == "decant":
        return decant(options.corpus, parser)
    elif options.function == "duplicates":
        return duplicates(options, parser)
    elif options.function == "freebase":
        return freebase(options, parser)
    else:
        return basic(options, parser)
def segments(options, parser):
    print """
    =========================
    ---- IMPORT SEGMENTS ----
    =========================
    """
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)
    print "corpus:", corpus

    if not os.path.exists(options.csv):
        return error(message="csv file was not found.", parser=parser)

    f = open(options.csv, 'rb')
    c = csv.DictReader(f, delimiter=options.delimiter)
    print "filename:", options.csv

    for row in c:
        print row
        # update the stemmed_refined cell: the previous stem is preserved
        # there, while the refined value from the csv replaces `stemmed`
        try:
            s = Segment.objects.get(id=row['segment_id'])
            print s.id, s.content
        except Segment.DoesNotExist:
            print "segment id %s was not found!" % row['segment_id']
            continue
        buffer_stemmed = s.stemmed
        s.stemmed = row['stemmed']
        s.stemmed_refined = buffer_stemmed
        s.save()
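# A note on the expected input: the importer above reads a delimited file
# (tab-separated by default, see the -d option of main) whose header must
# contain at least the `segment_id` and `stemmed` columns. The values below
# are illustrative only:
#
#   segment_id    stemmed
#   42            refined stem of segment 42
#   43            refined stem of segment 43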
def export(corpus, language, parser, column="stemmed"):
    print """
    ===========================================
    ---- EXPORT SEGMENTS FOR REFINE SAMPLE ----
    ===========================================
    """
    try:
        c = Corpus.objects.get(name=corpus)
    except Exception, e:
        # print "Exception: %s" % e
        return error(message="corpus '%s' was not found!" % corpus, parser=parser)
def main(argv):
    parser = OptionParser(usage="usage: %prog -f csv -c corpus_name [-d delimiter]")
    parser.add_option("-c", "--corpus", dest="corpus",
        help="anta corpus_name")
    parser.add_option("-f", "--csv", dest="csv",
        help="csv file to be parsed")
    parser.add_option("-d", "--delimiter", dest="delimiter", default="\t",
        help="csv separator")

    (options, argv) = parser.parse_args()

    if options.corpus is None:
        return error(message="Use -c to specify the corpus", parser=parser)
    if options.csv is None:
        return error(message="Use -f to specify the csv file path", parser=parser)
    return segments(options, parser)
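# Example invocation (the script name, corpus name and file path below are
# placeholders, not values taken from the repository):
#
#   python thisscript.py -c mycorpus -f /path/to/refine-export.tsv -d "\t"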
def main(argv):
    usage = "usage: %prog -f function [ standard|importcsv [-o column] [-x csvfile] ]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--routine", dest="routine",
        help="anta routine id")
    parser.add_option("-c", "--corpus", dest="corpus",
        help="anta Corpus.id")
    parser.add_option("-l", "--language", dest="language", default="en",
        help="language")
    parser.add_option("-f", "--function", dest="func",
        help="function to be performed")
    parser.add_option("-d", "--delimiter", dest="delimiter", default="\t",
        help="csv cell delimiter")
    parser.add_option("-o", "--tfidfcolumn", dest="tfidfcolumn", default="stemmed",
        help="segment column used for tf-idf computation")
    parser.add_option("-x", "--csv", dest="filename", default="",
        help="csv file absolute path")

    (options, argv) = parser.parse_args()

    if options.func is None:
        return error(message="Use -f to specify the desired function.", parser=parser)
    if options.routine is None:
        return error(message="Use -r to specify the routine", parser=parser)

    try:
        routine = Routine.objects.get(pk=options.routine)
    except Exception, e:
        return error(message="routine '%s' was not found (Wally 572), Exception: %s" % (options.routine, e), parser=parser)
def duplicates(options, parser):
    # get corpus, else exit
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)

    # all the segments inside the corpus
    document_segments = Document_Segment.objects.filter(document__corpus=corpus)
    num_of_segments = document_segments.count()

    c = 0
    # compare each unordered pair of segments exactly once
    for i in range(0, num_of_segments):
        for j in range(i + 1, num_of_segments):
            a = document_segments[i].segment.stemmed
            b = document_segments[j].segment.stemmed
            if a == b:
                # identical stems are trivially duplicates: count and skip
                c += 1
                continue
            # cheap pre-filter: strings whose lengths differ by more than
            # 10% of the longer one cannot reach the similarity threshold
            dl = len(a) - len(b)
            ml = max(len(a), len(b))
            if abs(dl) > ml / 10.0:
                continue
            # normalised Levenshtein similarity, in [0, 1]
            ratio = 1 - levenshtein(a, b) / float(ml)
            if ratio < .75:
                continue
            print
            print ratio
            print " ", document_segments[i].segment.stemmed, document_segments[j].segment.stemmed
            print " ", document_segments[i].segment.content, document_segments[j].segment.content
            print
            c += 1
    print "found", c, "duplicates"
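# `levenshtein` is imported from elsewhere in the project; for reference,
# a minimal sketch of the classic dynamic-programming edit distance it is
# assumed to compute (illustration only, not the project's actual helper):
def _levenshtein_sketch(a, b):
    # prev[j] holds the distance between the current prefix of a and b[:j]
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[len(b)]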
    # content = f.read()
    # csvfile = unicodecsv.reader(content, encoding='utf-8')
    # except Exception, e:
    #     error_message = "Found Wally 559, Exception: %s" % e

    # ==============================
    # ---- output error message ----
    # ==============================
    if error_message is not None:
        close_routine(routine, error=error_message, status="ERR")
        return error(message=error_message, parser=parser)

    # =================================
    # ---- execute valid functions ----
    # =================================
    if options.func == "standard":
        # pattern tf + stems tfidf
        return standard(routine=routine, corpus=corpus)
    elif options.func == "similarity":
        # compute distances and store them inside a dedicated table
        return similarity(routine=routine, corpus=corpus)
    elif options.func == "tf_tfidf":
        return tf_tfidf(routine=routine, corpus=corpus)
def freebase(options, parser):
    # needs an api key, provided via the -k/--freebasekey option
    # @todo handle transaction
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)

    # every segment in the corpus documents, annotated with its number of concepts
    segments = Segment.objects.filter(documents__corpus=corpus).annotate(num_concepts=Count('concepts'))
    num_of_segments = segments.count()

    print "[info] freebase analysis"
    print "[info] freebase api key:", options.freebasekey
    print "[info] num of segments:", num_of_segments

    for s in segments:
        print "[info] search:", s.content, s.language, s.num_concepts
        # search freebase, skipping results that belong to fictional or
        # editorial "universes" (books, films, tv programs...)
        results = fsearch({'query': s.content, 'key': options.freebasekey, 'lang': s.language.lower()},
            stop_universes=[
                "/book/written_work",
                "/book/book",
                "/film/film",
                "/music/track",
                "/book/book_edition",
                "/tv/tv_series_episode",
                "/fictional_universe/work_of_fiction",
                "/music/composition",
                "/music/release",
                "/tv/tv_program"
            ])
        for r in results:
            # get the existing relatum, or create it on the fly
            try:
                relatum = Relatum.objects.get(slug=r['notable']['id'], language=s.language)
            except Relatum.DoesNotExist:
                relatum = Relatum(
                    content=r['name'],
                    language=s.language,
                    slug=r['notable']['id'],
                    name=r['notable']['name']
                )
                relatum.save()
            # skip if the relationship between relatum and segment already exists
            relation = Segment_Semantic_Relation(segment=s, relatum=relatum)
            try:
                relation.save()
            except:
                continue
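# `fsearch` is a project helper whose implementation is not shown here. A
# plausible sketch of how its `stop_universes` filter could work, assuming
# each raw freebase result exposes the `notable.id` type path used above
# (hypothetical illustration, not the project's actual code):
def _filter_stop_universes(results, stop_universes):
    kept = []
    for r in results:
        notable = r.get('notable', {})
        # drop results whose notable type falls inside a stopped universe
        if notable.get('id') in stop_universes:
            continue
        kept.append(r)
    return kept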
def executere(options, parser):
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)