Example #1
    def __init__(self, database_path, input_sentence, language_path,
                 thesaurus_path, json_output_path):
        # Load the database schema and the language configuration.
        database = Database()
        database.load(database_path)
        # database.print_me()

        config = LangConfig()
        config.load(language_path)

        parser = Parser(database, config)

        # The thesaurus is optional; attach it to the parser when given.
        if thesaurus_path is not None:
            thesaurus = Thesaurus()
            thesaurus.load(thesaurus_path)
            parser.set_thesaurus(thesaurus)

        queries = parser.parse_sentence(input_sentence)

        # Optionally dump each parsed query as JSON, then print them all.
        if json_output_path is not None:
            self.remove_json(json_output_path)
            for query in queries:
                query.print_json(json_output_path)

        for query in queries:
            print(query)
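Every example below also calls a `remove_json` helper that none of the snippets define. A minimal sketch, assuming it only deletes the previous output file so that `print_json` starts from scratch (the body is an assumption, not the original helper):

import os

def remove_json(self, json_output_path):
    # Assumed behaviour: drop the stale JSON output file, if present.
    if os.path.exists(json_output_path):
        os.remove(json_output_path)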
Example #2
    def __init__(self, database_path, language_path, input_sentence,
                 json_output_path, thesaurus_path, stopwords_path):
        database = Database()
        stopwordsFilter = None

        # Optional thesaurus: here it is attached to the database,
        # not to the parser as in Example #1.
        if thesaurus_path is not None:
            thesaurus = Thesaurus()
            thesaurus.load(thesaurus_path)
            database.set_thesaurus(thesaurus)

        # Optional stopword filter, handed to the parser below.
        if stopwords_path is not None:
            stopwordsFilter = StopwordFilter()
            stopwordsFilter.load(stopwords_path)

        database.load(database_path)
        # database.print_me()

        config = LangConfig()
        config.load(language_path)

        parser = Parser(database, config)

        queries = parser.parse_sentence(input_sentence, stopwordsFilter)

        if json_output_path is not None:
            self.remove_json(json_output_path)
            for query in queries:
                query.print_json(json_output_path)

        for query in queries:
            print(query)
        self.query = queries  # note: stores the whole list, not a single query
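None of the snippets show the class these constructors belong to. A hypothetical call site for Example #2; the class name `NlQueryEngine` and every path below are assumptions, not taken from the original code:

# All names and paths here are illustrative.
engine = NlQueryEngine('data/schema.sql', 'lang/english.csv',
                       'how many employees work in sales',
                       'output/queries.json', 'thesaurus/english.dat',
                       'stopwords/english.txt')
print(engine.query)  # Example #2 stores the whole list of parsed queries here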
Example #3
    def __init__(self, database_path, input_sentence, language_path,
                 thesaurus_path, json_output_path, working_database):
        db_dictionary = {}  # defined up front so the except block can always reference it
        try:
            database = Database()
            database.load(database_path)
            database.print_me()
            config = LangConfig()
            config.load(language_path)
            parser = Parser(database, config)
            if thesaurus_path is not None:
                thesaurus = Thesaurus()
                thesaurus.load(thesaurus_path)
                parser.set_thesaurus(thesaurus)
            queries, db_dictionary = parser.parse_sentence(input_sentence)

            if json_output_path is not None:
                self.remove_json(json_output_path)
                for query in queries:
                    query.print_json(json_output_path)

            # Truncate reply.txt, then write the fetched rows for each query.
            with open('reply.txt', 'w') as wf:
                for query in queries:
                    for name in fetch(query, working_database):
                        wf.write(str(name[0]) + '\n')
                    wf.write('\nUse keywords: ' + str(db_dictionary))
                    wf.write(str(query))

        except Exception as exc:
            with open('reply.txt', 'a') as wf:
                wf.write(str(exc) + '\n')
                wf.write('Use keywords: ' + str(db_dictionary))
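Example #3 relies on a `fetch(query, working_database)` function that is not shown. A hypothetical stand-in, assuming `working_database` is a SQLite file and each query renders to SQL via `str()`:

import sqlite3

def fetch(query, working_database):
    # Assumed behaviour: execute the generated SQL and return rows as tuples,
    # so that name[0] in the caller picks the first column of each row.
    conn = sqlite3.connect(working_database)
    try:
        return conn.execute(str(query)).fetchall()
    finally:
        conn.close()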
Example #4
    def __init__(self, database_path, input_sentence, language_path,
                 thesaurus_path, json_output_path):
        database = Database()
        database.load(database_path)
        #database.print_me()

        config = LangConfig()
        config.load(language_path)

        parser = Parser(database, config)

        if thesaurus_path is not None:
            thesaurus = Thesaurus()
            thesaurus.load(thesaurus_path)
            parser.set_thesaurus(thesaurus)

        queries = parser.parse_sentence(input_sentence)

        if json_output_path is not None:
            self.remove_json(json_output_path)
            for query in queries:
                query.print_json(json_output_path)

        # Expect exactly one query; anything else is treated as an error.
        if len(queries) > 1:
            if settings.DEBUG:
                print('--------- queries is more than one')
            self.query = None

            raise Exception('More than one query')
        else:
            self.query = queries[0]

        if settings.DEBUG:
            for query in queries:
                print(query)
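One gap in this tail: if the parser returns no queries at all, `queries[0]` raises a bare `IndexError`. A defensive variant (an illustration, not part of the original example):

if len(queries) > 1:
    self.query = None
    raise Exception('More than one query')
elif not queries:
    self.query = None
    raise Exception('No query produced')  # clearer than a bare IndexError
else:
    self.query = queries[0]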
Example #5
def main(type_atc, argv):
	date_start = datetime.datetime.now()
	date_start = date_start.strftime("%Y-%m-%d %H:%M:%S")
	
	parameters = Parameters(type_atc, argv)
	contexts = parameters.getContexts()
	input_folder = parameters.getInputFolder()
	language = parameters.getLanguage()
	min_word_size = parameters.getMinWordSize()
	max_qty_terms = int(parameters.getMaxQtyTerms())
	output_folder = parameters.getOutputFolder()
	temp_folder = parameters.getTempFolder()
	record_log = parameters.getRecordLog()
	record_intermediate = parameters.getRecordIntermediate()
	seeds_file = parameters.getSeedsFile()
	stoplist_file = parameters.getStoplistFile()
	sim_measure = parameters.getSimilarityMeasure()
	del parameters

	logfile = LogFile(record_log, str(date_start), None, input_folder, language, stoplist_file, min_word_size, max_qty_terms, None, output_folder, None, temp_folder, seeds_file, sim_measure)

	if contexts:
		logfile.writeLogfile('- Building syntactic relations from '+temp_folder)
		contexts = Contexts(temp_folder)
		del contexts
	else:
		logfile.writeLogfile('- Building syntactic relations from '+input_folder)
		ling_corpus = StanfordSyntacticContexts(input_folder, temp_folder, stoplist_file, min_word_size, record_intermediate)
		del ling_corpus

	logfile.writeLogfile('- Merging terms to '+temp_folder+'Relations2ndOrder.txt')

	command = 'cat '+temp_folder+'AN_Relations.txt '+temp_folder+'SV_Relations.txt '+temp_folder+'VO_Relations.txt > '+temp_folder+'Relations2ndOrder.txt'
	os.system(command)

	logfile.writeLogfile('- Calculating similarity using '+sim_measure)
	measures = Measures(temp_folder+'Relations2ndOrder.txt', seeds_file)
	dic_topn = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
	del measures

	logfile.writeLogfile('- Building thesaurus in '+output_folder+'T_'+type_atc+'_'+sim_measure+'.xml')

	thesaurus = Thesaurus(output_folder+'T_'+type_atc+'_'+sim_measure+'.xml',max_qty_terms)
	thesaurus.write(dic_topn)
	del thesaurus

	date_end = datetime.datetime.now()
	date_end = date_end.strftime("%Y-%m-%d %H:%M:%S")
	logfile.writeLogfile('- Thesaurus successfully built!\nEnding process at: '+str(date_end)+'.\n')
	del logfile
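The `cat` shell-out above only works where a Unix shell and `cat` are available. A portable sketch of the same merge in pure Python, reusing `temp_folder` from `main` (illustrative, not part of the original code):

# Concatenate the three relation files into Relations2ndOrder.txt.
relation_files = ['AN_Relations.txt', 'SV_Relations.txt', 'VO_Relations.txt']
with open(temp_folder + 'Relations2ndOrder.txt', 'w') as merged:
    for name in relation_files:
        with open(temp_folder + name) as part:
            merged.write(part.read())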
Example #6
    def __init__(self, database_path, input_sentence, language_path,
                 thesaurus_path, json_output_path):
        # The thesaurus and stopword paths are hardcoded here; the
        # thesaurus_path argument is ignored.
        thesaurus = Thesaurus()
        thesaurus.load("./thesaurus/test.dat")

        database = Database(thesaurus)
        database.load(database_path)
        # database.print_me()

        config = LangConfig()
        config.load(language_path)

        # Filter stopwords out of the sentence before parsing.
        self.stopwordsFilter = StopwordFilter()
        self.stopwordsFilter.load("./stopwords/english.txt")
        input_sentence = self.stopwordsFilter.filter(input_sentence)

        parser = Parser(database, config)
        parser.set_thesaurus(thesaurus)
        # main flow
        queries = parser.parse_sentence(input_sentence)

        # if json_output_path is not None:
        #     self.remove_json(json_output_path)
        #     for query in queries:
        #         query.print_json(json_output_path)

        if len(queries) > 1:
            if settings.DEBUG:
                print('--------- queries is more than one')
            self.query = None

            raise Exception('More than one query')
        else:
            self.query = queries[0]

        if settings.DEBUG:
            for query in queries:
                print(query)
Example #7
def main(type_atc, argv):
    date_start = datetime.datetime.now()
    date_start = date_start.strftime("%Y-%m-%d %H:%M:%S")

    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = int(parameters.getMinWordSize())
    max_qty_terms = int(parameters.getMaxQtyTerms())
    mi_precision = parameters.getMIPrecision()
    output_folder = parameters.getOutputFolder()
    window_size = parameters.getWindowSize()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder,
                      language, None, min_word_size, max_qty_terms,
                      mi_precision, output_folder, window_size, temp_folder,
                      seeds_file, sim_measure)
    stat_corpus = StatisticalCorpus(input_folder, temp_folder, min_word_size,
                                    window_size)

    if not contexts:
        logfile.writeLogfile('- Building statistical corpus at ' + temp_folder)

        if language == 'pt':
            stat_corpus.buildCorpus_pt()
            param_nsp = '--token ../misc/tokens_nsp.pl'
        elif language == 'en':
            stat_corpus.buildCorpus_en()
            param_nsp = ''
        else:
            param_nsp = ''  # fallback so an unsupported language does not raise NameError below

        # Use count.pl from the Ngram Statistics Package (NSP) to extract
        # bigrams within a window.

        logfile.writeLogfile('- Getting bigrams to W' + window_size +
                             '_Statistical_corpus.txt')

        command = 'count.pl --ngram 2 ' + param_nsp + ' --window ' + window_size + ' ' + temp_folder + 'W' + window_size + '_Statistical_corpus.txt ' + temp_folder + 'Statistical_corpus.txt'
        os.system(command)

        logfile.writeLogfile('- Using ' + sim_measure +
                             ' as similarity measure')

        if sim_measure == 'mutual_information':
            mi = MutualInformation(
                temp_folder, 'W' + window_size + '_Statistical_corpus.txt',
                seeds_file, mi_precision)
            dic_terms = mi.getDicMI()
            del mi
        else:
            stat_corpus.buildSTRelations(
                'W' + window_size + '_Statistical_corpus.txt', seeds_file)
            measures = Measures(
                temp_folder + 'W' + window_size + '_Relations.txt', seeds_file)
            dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
            del measures

    else:
        measures = Measures(temp_folder + 'W' + window_size + '_Relations.txt',
                            seeds_file)
        dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
        del measures

    del stat_corpus

    logfile.writeLogfile('- Building thesaurus in ' + output_folder + 'T' +
                         window_size + '_' + type_atc + '_' + sim_measure +
                         '.xml')

    thesaurus = Thesaurus(
        output_folder + 'T' + window_size + '_' + type_atc + '_' +
        sim_measure + '.xml', max_qty_terms)
    thesaurus.write(dic_terms)
    del thesaurus

    date_end = datetime.datetime.now()
    date_end = date_end.strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile(
        '- Thesaurus successfully built!\nEnding process at: ' + str(date_end) +
        '.\n')
    del logfile
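The `count.pl` call above is built as a single shell string for `os.system`. A sketch of the same invocation through `subprocess`, reusing the variables from `main` (assumes `count.pl` is on `PATH`; illustrative only):

import subprocess

# Pass arguments as a list to avoid shell quoting problems.
args = ['count.pl', '--ngram', '2']
if param_nsp:
    args += param_nsp.split()  # ['--token', '../misc/tokens_nsp.pl'] for 'pt'
args += ['--window', window_size,
         temp_folder + 'W' + window_size + '_Statistical_corpus.txt',
         temp_folder + 'Statistical_corpus.txt']
subprocess.run(args, check=True)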
Example #8
def main(type_atc, argv):
	date_start = datetime.datetime.now()
	date_start = date_start.strftime("%Y-%m-%d %H:%M:%S")

	parameters = Parameters(type_atc, argv)
	contexts = parameters.getContexts()
	input_folder = parameters.getInputFolder()
	language = parameters.getLanguage()
	min_word_size = int(parameters.getMinWordSize())
	max_qty_terms = int(parameters.getMaxQtyTerms())
	mi_precision = parameters.getMIPrecision()
	output_folder = parameters.getOutputFolder()
	window_size = parameters.getWindowSize()
	temp_folder = parameters.getTempFolder()
	record_log = parameters.getRecordLog()
	record_intermediate = parameters.getRecordIntermediate()
	seeds_file = parameters.getSeedsFile()
	sim_measure = parameters.getSimilarityMeasure()
	del parameters
 
	logfile = LogFile(record_log, str(date_start), None, input_folder, language, None, min_word_size, max_qty_terms, mi_precision, output_folder, window_size, temp_folder, seeds_file, sim_measure)
	stat_corpus = StatisticalCorpus(input_folder, temp_folder, min_word_size, window_size)

	if not contexts:
		logfile.writeLogfile('- Building statistical corpus at '+temp_folder)
	
		if language == 'pt':
			stat_corpus.buildCorpus_pt()
			param_nsp = '--token ../misc/tokens_nsp.pl'
		elif language == 'en':
			stat_corpus.buildCorpus_en()
			param_nsp = ''
		else:
			param_nsp = ''  # fallback so an unsupported language does not raise NameError below

		# Use count.pl from the Ngram Statistics Package (NSP) to extract
		# bigrams within a window.

		logfile.writeLogfile('- Getting bigrams to W'+window_size+'_Statistical_corpus.txt')

		command = 'count.pl --ngram 2 '+param_nsp+' --window '+window_size+' '+temp_folder+'W'+window_size+'_Statistical_corpus.txt '+temp_folder+'Statistical_corpus.txt'
		os.system(command)

		logfile.writeLogfile('- Using '+sim_measure+' as similarity measure')

		if sim_measure == 'mutual_information':
			mi = MutualInformation(temp_folder, 'W'+window_size+'_Statistical_corpus.txt', seeds_file, mi_precision)
			dic_terms = mi.getDicMI()
			del mi
		else:
			stat_corpus.buildSTRelations('W'+window_size+'_Statistical_corpus.txt', seeds_file)
			measures = Measures(temp_folder+'W'+window_size+'_Relations.txt', seeds_file)
			dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
			del measures

	else:
		measures = Measures(temp_folder+'W'+window_size+'_Relations.txt', seeds_file)
		dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
		del measures

	del stat_corpus

	logfile.writeLogfile('- Building thesaurus in '+output_folder+'T'+window_size+'_'+type_atc+'_'+sim_measure+'.xml')

	thesaurus = Thesaurus(output_folder+'T'+window_size+'_'+type_atc+'_'+sim_measure+'.xml',max_qty_terms)
	thesaurus.write(dic_terms)
	del thesaurus

	date_end = datetime.datetime.now()
	date_end = date_end.strftime("%Y-%m-%d %H:%M:%S")
	logfile.writeLogfile('- Thesaurus successfully built!\nEnding process at: '+str(date_end)+'.\n')
	del logfile