def __init__(self, database_path, input_sentence, language_path, thesaurus_path, json_output_path):
    database = Database()
    database.load(database_path)
    # database.print_me()
    config = LangConfig()
    config.load(language_path)
    parser = Parser(database, config)
    if thesaurus_path is not None:
        thesaurus = Thesaurus()
        thesaurus.load(thesaurus_path)
        parser.set_thesaurus(thesaurus)
    queries = parser.parse_sentence(input_sentence)
    if json_output_path is not None:
        self.remove_json(json_output_path)
        for query in queries:
            query.print_json(json_output_path)
    for query in queries:
        print(query)

def __init__(self, database_path, language_path, input_sentence, json_output_path, thesaurus_path, stopwords_path):
    database = Database()
    stopwordsFilter = None
    if thesaurus_path is not None:
        thesaurus = Thesaurus()
        thesaurus.load(thesaurus_path)
        database.set_thesaurus(thesaurus)
    if stopwords_path is not None:
        stopwordsFilter = StopwordFilter()
        stopwordsFilter.load(stopwords_path)
    database.load(database_path)
    # database.print_me()
    config = LangConfig()
    config.load(language_path)
    parser = Parser(database, config)
    queries = parser.parse_sentence(input_sentence, stopwordsFilter)
    if json_output_path is not None:
        self.remove_json(json_output_path)
        for query in queries:
            query.print_json(json_output_path)
    for query in queries:
        print(query)
    self.query = queries  # keep the full list of parsed queries

def __init__(self, database_path, input_sentence, language_path, thesaurus_path, json_output_path, working_database):
    db_dictionary = {}  # defined up front so the except block can always reference it
    try:
        database = Database()
        database.load(database_path)
        database.print_me()
        config = LangConfig()
        config.load(language_path)
        parser = Parser(database, config)
        if thesaurus_path is not None:
            thesaurus = Thesaurus()
            thesaurus.load(thesaurus_path)
            parser.set_thesaurus(thesaurus)
        queries, db_dictionary = parser.parse_sentence(input_sentence)
        if json_output_path is not None:
            self.remove_json(json_output_path)
            for query in queries:
                query.print_json(json_output_path)
        open('reply.txt', 'w').close()  # truncate any previous reply
        with open('reply.txt', 'a') as wf:
            for query in queries:
                for name in fetch(query, working_database):
                    wf.write(str(name[0]) + '\n')
                wf.write('\nUse keywords: ' + str(db_dictionary))
                wf.write(str(query))
    except Exception as exc:
        with open('reply.txt', 'a') as wf:
            wf.write(str(exc) + '\n')
            wf.write('Use keywords: ' + str(db_dictionary))

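# fetch() is used above but not defined in this file. A minimal sketch of what
# it might look like, assuming working_database is the path to a SQLite file and
# that str(query) yields executable SQL -- both assumptions, not confirmed here:
import sqlite3

def fetch(query, working_database):
    """Run the rendered SQL against the working database and return all rows."""
    connection = sqlite3.connect(working_database)
    try:
        cursor = connection.cursor()
        cursor.execute(str(query))
        return cursor.fetchall()  # each row is a tuple, hence name[0] above
    finally:
        connection.close()
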
def __init__(self, database_path, input_sentence, language_path, thesaurus_path, json_output_path):
    database = Database()
    database.load(database_path)
    # database.print_me()
    config = LangConfig()
    config.load(language_path)
    parser = Parser(database, config)
    if thesaurus_path is not None:
        thesaurus = Thesaurus()
        thesaurus.load(thesaurus_path)
        parser.set_thesaurus(thesaurus)
    queries = parser.parse_sentence(input_sentence)
    if json_output_path is not None:
        self.remove_json(json_output_path)
        for query in queries:
            query.print_json(json_output_path)
    if len(queries) > 1:
        if settings.DEBUG:
            print('--------- queries is more than one')
        self.query = None
        raise Exception('More than one query')
    else:
        self.query = queries[0]
    if settings.DEBUG:
        for query in queries:
            print(query)

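# Usage sketch for the constructor above (and its sibling variants in this
# section). The class name Ln2sqlEngine and the example paths are hypothetical;
# only the parameter names come from the signature itself:
engine = Ln2sqlEngine(
    database_path='./database/store.sql',           # hypothetical path
    input_sentence='How many employees are there?',
    language_path='./lang_store/english.csv',       # hypothetical path
    thesaurus_path=None,                            # skip thesaurus expansion
    json_output_path=None,                          # skip the JSON dump
)
print(engine.query)  # the single parsed query kept by __init__
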
def main(type_atc, argv):
    date_start = datetime.datetime.now()
    date_start = date_start.strftime("%Y-%m-%d %H:%M:%S")

    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = parameters.getMinWordSize()
    max_qty_terms = int(parameters.getMaxQtyTerms())
    output_folder = parameters.getOutputFolder()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    stoplist_file = parameters.getStoplistFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder, language,
                      stoplist_file, min_word_size, max_qty_terms, None,
                      output_folder, None, temp_folder, seeds_file, sim_measure)

    if contexts:
        logfile.writeLogfile('- Building syntactic relations from ' + temp_folder)
        contexts = Contexts(temp_folder)
        del contexts
    else:
        logfile.writeLogfile('- Building syntactic relations from ' + input_folder)
        ling_corpus = StanfordSyntacticContexts(input_folder, temp_folder,
                                                stoplist_file, min_word_size,
                                                record_intermediate)
        del ling_corpus

    logfile.writeLogfile('- Merging terms to ' + temp_folder + 'Relations2ndOrder.txt')
    command = ('cat ' + temp_folder + 'AN_Relations.txt '
               + temp_folder + 'SV_Relations.txt '
               + temp_folder + 'VO_Relations.txt'
               + ' > ' + temp_folder + 'Relations2ndOrder.txt')
    os.system(command)

    logfile.writeLogfile('- Calculating similarity using ' + sim_measure)
    measures = Measures(temp_folder + 'Relations2ndOrder.txt', seeds_file)
    dic_topn = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
    del measures

    logfile.writeLogfile('- Building thesaurus in ' + output_folder + 'T_'
                         + type_atc + '_' + sim_measure + '.xml')
    thesaurus = Thesaurus(output_folder + 'T_' + type_atc + '_' + sim_measure + '.xml',
                          max_qty_terms)
    thesaurus.write(dic_topn)
    del thesaurus

    date_end = datetime.datetime.now()
    date_end = date_end.strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus successfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile

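# Hedged usage sketch for the syntactic pipeline above. Command-line invocation,
# the 'Syntactic' tag for type_atc, and the argv layout are assumptions; the
# Parameters class (not shown here) defines the real option format:
if __name__ == '__main__':
    import sys
    main('Syntactic', sys.argv[1:])
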
def __init__(self, database_path, input_sentence, language_path, thesaurus_path, json_output_path):
    # The thesaurus and stopword paths are currently hard-coded; the
    # thesaurus_path check is left commented out below.
    # if thesaurus_path is not None:
    thesaurus = Thesaurus()
    thesaurus.load('./thesaurus/test.dat')
    database = Database(thesaurus)
    database.load(database_path)
    # database.print_me()
    config = LangConfig()
    config.load(language_path)
    self.stopwordsFilter = StopwordFilter()
    self.stopwordsFilter.load('./stopwords/english.txt')
    input_sentence = self.stopwordsFilter.filter(input_sentence)
    parser = Parser(database, config)
    parser.set_thesaurus(thesaurus)
    # main flow
    queries = parser.parse_sentence(input_sentence)
    # if json_output_path is not None:
    #     self.remove_json(json_output_path)
    #     for query in queries:
    #         query.print_json(json_output_path)
    if len(queries) > 1:
        if settings.DEBUG:
            print('--------- queries is more than one')
        self.query = None
        raise Exception('More than one query')
    else:
        self.query = queries[0]
    if settings.DEBUG:
        for query in queries:
            print(query)

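# StopwordFilter is used above but not defined in this file. A minimal sketch,
# assuming the stopword file holds one word per line (an assumption about the
# format of ./stopwords/english.txt):
class StopwordFilter:
    def __init__(self):
        self.stopwords = set()

    def load(self, path):
        with open(path) as f:
            self.stopwords = {line.strip().lower() for line in f if line.strip()}

    def filter(self, sentence):
        # Drop stopwords while preserving the original word order.
        return ' '.join(word for word in sentence.split()
                        if word.lower() not in self.stopwords)
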
def main(type_atc, argv):
    date_start = datetime.datetime.now()
    date_start = date_start.strftime("%Y-%m-%d %H:%M:%S")

    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = int(parameters.getMinWordSize())
    max_qty_terms = int(parameters.getMaxQtyTerms())
    mi_precision = parameters.getMIPrecision()
    output_folder = parameters.getOutputFolder()
    window_size = parameters.getWindowSize()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder, language,
                      None, min_word_size, max_qty_terms, mi_precision,
                      output_folder, window_size, temp_folder, seeds_file,
                      sim_measure)

    stat_corpus = StatisticalCorpus(input_folder, temp_folder, min_word_size,
                                    window_size)

    if not contexts:
        logfile.writeLogfile('- Building statistical corpus at ' + temp_folder)
        param_nsp = ''  # default; only Portuguese needs a custom tokenizer
        if language == 'pt':
            stat_corpus.buildCorpus_pt()
            param_nsp = '--token ../misc/tokens_nsp.pl'
        elif language == 'en':
            stat_corpus.buildCorpus_en()

        # Use count.pl from the Ngram Statistics Package (NSP) to extract
        # bigrams within the given window.
        logfile.writeLogfile('- Getting bigrams to W' + window_size + '_Statistical_corpus.txt')
        command = ('count.pl --ngram 2 ' + param_nsp + ' --window ' + window_size
                   + ' ' + temp_folder + 'W' + window_size + '_Statistical_corpus.txt '
                   + temp_folder + 'Statistical_corpus.txt')
        os.system(command)

        logfile.writeLogfile('- Using ' + sim_measure + ' as similarity measure')
        if sim_measure == 'mutual_information':
            mi = MutualInformation(temp_folder,
                                   'W' + window_size + '_Statistical_corpus.txt',
                                   seeds_file, mi_precision)
            dic_terms = mi.getDicMI()
            del mi
        else:
            stat_corpus.buildSTRelations('W' + window_size + '_Statistical_corpus.txt',
                                         seeds_file)
            measures = Measures(temp_folder + 'W' + window_size + '_Relations.txt',
                                seeds_file)
            dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
            del measures
    else:
        measures = Measures(temp_folder + 'W' + window_size + '_Relations.txt',
                            seeds_file)
        dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
        del measures
    del stat_corpus

    logfile.writeLogfile('- Building thesaurus in ' + output_folder + 'T' + window_size
                         + '_' + type_atc + '_' + sim_measure + '.xml')
    thesaurus = Thesaurus(output_folder + 'T' + window_size + '_' + type_atc + '_'
                          + sim_measure + '.xml', max_qty_terms)
    thesaurus.write(dic_terms)
    del thesaurus

    date_end = datetime.datetime.now()
    date_end = date_end.strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus successfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile

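# The count.pl command above is assembled by string concatenation and run via
# os.system(). A sketch of a safer equivalent using subprocess from the standard
# library; the count.pl flags are exactly those in the original call:
import subprocess

def run_count_pl(temp_folder, window_size, param_nsp):
    """Extract bigrams with NSP's count.pl into W<window>_Statistical_corpus.txt."""
    args = ['count.pl', '--ngram', '2']
    if param_nsp:
        args += param_nsp.split()  # e.g. '--token ../misc/tokens_nsp.pl'
    args += ['--window', window_size,
             temp_folder + 'W' + window_size + '_Statistical_corpus.txt',
             temp_folder + 'Statistical_corpus.txt']
    subprocess.run(args, check=True)  # raise CalledProcessError if count.pl fails
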