示例#1
0
def download_corpus(data_csv):
    """Build a key-word list from an Aristo data file and fetch a corpus for it.

    The raw text of column "A", all questions, and columns "B", "C" and "D"
    (in that order) is joined with single spaces, key words are extracted
    from the result with ``get_key_words``, and they are handed to
    ``KnowledgeCreator.download_wikipedia_articles`` together with a
    timestamped output path under ``../../../corpus2`` relative to this file.

    :param data_csv: path passed straight to ``AristoData``.
    """
    aristo_data = AristoData(data_csv)

    # Collect the text pieces one at a time; the order matters for the
    # joined string: A, questions, B, C, D.
    column_a = aristo_data.get_column_as_raw("A", join_rows_by=",")
    questions = aristo_data.get_all_questions_as_raw()
    other_columns = [
        aristo_data.get_column_as_raw(column, join_rows_by=",")
        for column in ("B", "C", "D")
    ]
    data = " ".join([column_a, questions] + other_columns)

    kc = KnowledgeCreator()
    key_words = get_key_words(data)

    # The file name carries the current local time so repeated runs do not
    # write to the same path.
    file_name = "../../../corpus2/mediafile_{}.xml".format(time.strftime('%Y%m%d_%H%M%S'))
    corpus_file = os.path.join(os.path.dirname(__file__), file_name)
    kc.download_wikipedia_articles(key_words, corpus_file)
def run_test_data(data_csv):
    """Run the Solr/Wikipedia pipeline over a data file and print its score.

    A fresh, timestamped directory ``../../../outputdata/test_<timestamp>``
    (relative to this file) is created first; the logger returned by
    ``setup_log`` and the pipeline's ``write_to_disk`` output both receive
    that directory.

    :param data_csv: path passed straight to ``AristoData``.
    """
    run_name = "../../../outputdata/test_{}".format(time.strftime('%Y%m%d_%H%M%S'))
    out_dir = os.path.join(os.path.dirname(__file__), run_name)
    # Create the directory before setup_log is handed its path.
    os.makedirs(out_dir)
    run_logger = setup_log(out_dir)

    dataset = AristoData(data_csv)
    dataset.print_summary()

    solver = SolrWikipediaAllAnswerThenQuestionPipeline(data=dataset, logger=run_logger)
    solver.run_pipeline()
    solver.write_to_disk(out_dir)

    print(solver.score())
class IntegrationTestAristoData(unittest.TestCase):
    """Integration tests for ``AristoData`` against the training-set TSV.

    Every test loads ``../../../inputdata/training_set.tsv`` (relative to this
    file) in ``setUp``, so the file must be present for the suite to run.
    """

    def setUp(self):
        """Load the training set into ``self._aristo_data`` before each test."""
        data_file_path = os.path.join(os.path.dirname(__file__), "../../../inputdata/training_set.tsv")
        # Echo the resolved path so a missing-file failure is easy to diagnose.
        print(os.path.abspath(data_file_path))
        self._aristo_data = AristoData(data_file_path)

    def test_should_print_summary(self):
        """Smoke test: ``print_summary`` runs without raising."""
        self._aristo_data.print_summary()

    def test_should_get_x(self):
        """``x`` exposes exactly five columns."""
        self.assertEqual(len(self._aristo_data.x.columns), 5,
                         "The expected number of columns does not match the actual")

    def test_should_get_y_columns(self):
        """``y`` exposes exactly one column."""
        self.assertEqual(len(self._aristo_data.y.columns), 1,
                         "The expected number of columns does not match the actual")

    def test_should_get_all_questions_as_raw(self):
        """``get_all_questions_as_raw`` returns a string."""
        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(type(...) is str), which only says "False is not true".
        self.assertIsInstance(self._aristo_data.get_all_questions_as_raw(), str)

    def test_should_get_all_questions_as_list(self):
        """``get_all_questions_answers_as_list`` returns 2500 * 5 entries."""
        # NOTE(review): 2500 * 5 presumably means 2500 rows times five text
        # fields per row; confirm against the training-set file.
        self.assertEqual(len(self._aristo_data.get_all_questions_answers_as_list()), 2500 * 5)
示例#4
0
def run_train_data(train_data_csv):
    """Run the similarity pipeline on slices of one file and save the output.

    Two ``AristoData`` views are built from the same file, one with
    ``range(0, 2000)`` for training and one with ``range(100, 110)`` for
    testing. After the pipeline has run, its results are written to a new
    directory ``../../../outputdata/train_<timestamp>`` relative to this file.

    :param train_data_csv: path passed straight to ``AristoData``.
    """
    # NOTE(review): range(100, 110) lies inside range(0, 2000); if the second
    # argument selects rows, the test rows are also training rows. Confirm
    # that this overlap is intended.
    train_rows = range(0, 2000)
    test_rows = range(100, 110)
    aristo_train_data = AristoData(train_data_csv, train_rows)
    aristo_test_data = AristoData(train_data_csv, test_rows)

    aristo_test_data.print_summary()
    aristo_train_data.print_summary()

    similarity = SimilarityPipeline(train_data=aristo_train_data, test_data=aristo_test_data)
    similarity.run_pipeline()

    # The timestamp is taken after the run, so the directory name reflects
    # when the pipeline finished rather than when it started.
    run_name = "../../../outputdata/train_{}".format(time.strftime('%Y%m%d_%H%M%S'))
    out_dir = os.path.join(os.path.dirname(__file__), run_name)
    os.makedirs(out_dir)
    similarity.write_to_disk(out_dir)
 def setUp(self):
     """Load the training-set TSV into ``self._aristo_data`` before each test."""
     module_dir = os.path.dirname(__file__)
     training_set_path = os.path.join(module_dir, "../../../inputdata/training_set.tsv")
     # Show the resolved absolute path of the file being loaded.
     print(os.path.abspath(training_set_path))
     self._aristo_data = AristoData(training_set_path)