def test_export_to_csv_from_reader_001(self):
    self.blogger_corpus()
    self.prj_folder()
    self.blogger_lists()
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_hightrepetativ_set),
                    "txt", send_end_file_marker=False,
                    regex_template="blogger", mode=self.mode)
    exporter = Exporter(reader.getlazy(), mode=self.mode)
    exporter.tocsv(self.tempdir_project_folder, "blogger_corpus",
                   self.fieldnames, rows_limit_in_file=1)
    # With rows_limit_in_file=1 every row lands in its own file, so the
    # number of exported CSV files must equal the number of rows read.
    csv_files_number = len([item
                            for item in os.listdir(self.tempdir_project_folder)
                            if ".csv" in item])
    assert len(list(reader.getlazy())) == csv_files_number
def test_exporter_initialisation_with_reader_obj_001(self):
    self.blogger_corpus()
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_hightrepetativ_set),
                    "txt", regex_template="blogger", mode=self.mode)
    exporter = Exporter(reader.getlazy(), mode=self.mode)
    exporter.should.be.a(Exporter)
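# A minimal end-to-end sketch of the Reader -> Exporter pipeline that the two
# tests above exercise. Only calls already used in this module are assumed
# (Reader, Reader.getlazy, Exporter, Exporter.tocsv); the helper name and its
# arguments are illustrative placeholders, not real fixtures.
def _example_txt_to_csv_pipeline(src_dir, out_dir, fieldnames, mode):
    reader = Reader(src_dir, "txt", regex_template="blogger", mode=mode)
    exporter = Exporter(reader.getlazy(), mode=mode)
    # rows_limit_in_file=1 writes one row per CSV file, so the number of
    # created files equals the number of rows the reader yields.
    return exporter.tocsv(out_dir, "blogger_corpus", fieldnames,
                          rows_limit_in_file=1)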
def test_lazyreader_from_twitter_json_for_given_colnames_512(self):
    self.twitter_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_twitter_corp, self.json_twitter_set),
                    "json", formatter_name="TwitterStreamAPI",
                    mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy(colnames=["text"]):
        if data == end_file_marker:
            continue
        if data:
            assert isinstance(data, dict)
            assert len(data) == 1
            assert 'text' in data
def test_lazyreader_from_json_for_given_colnames_510(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.json_blogger_small_fake_set),
                    "json", mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy(colnames=["text", 'star_constellation', 'gender']):
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == 3
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'gender' in data
def test_getlazy_many_streams_from_txt_with_given_number_of_streams_without_adjust_for_current_cpu_521(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_small_fake_set),
                    "txt", regex_template="blogger", mode=self.mode,
                    end_file_marker=end_file_marker,
                    send_end_file_marker=True)
    number_of_found_files = reader._get_number_of_left_over_files()

    # Check for stream_number=3: exactly three generators are returned and,
    # summed over all of them, one end-of-file marker per found file is sent.
    len(reader.getlazy(stream_number=3, adjust_to_cpu=False)).should.be.equal(3)
    len([rowdict
         for gen in reader.getlazy(stream_number=3, adjust_to_cpu=False)
         for rowdict in gen
         if rowdict == end_file_marker]).should.be.equal(number_of_found_files)

    # Check for stream_number=2
    len(reader.getlazy(stream_number=2, adjust_to_cpu=False)).should.be.equal(2)
    len([rowdict
         for gen in reader.getlazy(stream_number=2, adjust_to_cpu=False)
         for rowdict in gen
         if rowdict == end_file_marker]).should.be.equal(number_of_found_files)

    # Every yielded row has to match the content of its source file.
    i = 0
    for gen, fname in zip(reader.getlazy(stream_number=3, adjust_to_cpu=False,
                                         min_files_pro_stream=1),
                          reversed(reader.files_to_read_orig)):
        for row_dict in gen:
            if row_dict == end_file_marker:
                i += 1
                continue
            t = codecs.open(fname, "r", encoding="utf-8").read()
            assert row_dict["text"] == t
            assert isinstance(row_dict, dict)
            assert len(row_dict) == 6
            assert 'text' in row_dict
            assert 'star_constellation' in row_dict
            assert 'working_area' in row_dict
            assert 'age' in row_dict
            assert 'id' in row_dict
            assert 'gender' in row_dict
    assert number_of_found_files == i
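# Helper sketch: flatten the generators returned by getlazy() and count the
# end-of-file markers. This mirrors the list comprehensions in the test above;
# the helper name is hypothetical and not part of the module under test.
# Usage: _example_count_end_file_markers(
#            reader.getlazy(stream_number=3, adjust_to_cpu=False), -1)
def _example_count_end_file_markers(streams, end_file_marker):
    count = 0
    for gen in streams:
        for row in gen:
            if row == end_file_marker:
                count += 1
    return count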
def test_lazyreader_from_xml_for_given_colnames_507(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.xml_blogger_small_fake_set),
                    "xml", mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy(colnames=["text", 'star_constellation', 'gender']):
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == 3
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'gender' in data
def test_reader_initialisation_000(self):
    self.blogger_corpus()
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_small_fake_set),
                    "txt", regex_template="blogger", mode=self.mode)
    reader.should.be.a(Reader)
def test_export_to_sqlite_from_reader_007(self):
    self.blogger_corpus()
    self.prj_folder()
    self.blogger_lists()
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_hightrepetativ_set),
                    "txt", send_end_file_marker=False,
                    regex_template="blogger", mode=self.mode)
    exporter = Exporter(reader.getlazy(), mode=self.mode)
    dbname = "blogger_corpus"
    exporter.tosqlite(self.tempdir_project_folder, dbname, self.fieldnames)
    # Every exported .db file must carry the database name.
    for item in os.listdir(self.tempdir_project_folder):
        if ".db" in item:
            assert dbname in item
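# Hedged sketch: assuming tosqlite() writes a standard SQLite database, the
# produced file could also be inspected with the stdlib sqlite3 module. The
# helper name and the table-listing query are illustrative and not part of
# the tested Exporter API.
def _example_list_sqlite_tables(path_to_db):
    import sqlite3
    conn = sqlite3.connect(path_to_db)
    try:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'")
        return [row[0] for row in cursor.fetchall()]
    finally:
        conn.close()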
def test_lazyreader_from_json_with_utf8_509(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.json_blogger_hightrepetativ_set),
                    "json", mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy():
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == 6
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'working_area' in data
        assert 'age' in data
        assert 'id' in data
        assert 'gender' in data
def test_lazyreader_from_twitter_json_with_utf8_511(self):
    self.twitter_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_twitter_corp, self.json_twitter_set),
                    "json", formatter_name="TwitterStreamAPI",
                    mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy():
        if data == end_file_marker:
            continue
        if data:
            assert isinstance(data, dict)
            assert 'text' in data
            assert 'u_lang' in data
            assert 'id' in data
            assert 'u_id' in data
def test_lazyreader_from_txt_500(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_small_fake_set),
                    "txt", regex_template="blogger", mode=self.mode,
                    end_file_marker=end_file_marker)
    for data in reader.getlazy():
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == 6
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'working_area' in data
        assert 'age' in data
        assert 'id' in data
        assert 'gender' in data
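# Convenience sketch for the pattern used by the lazyreader tests: iterate
# over a lazy reader and yield only real row dicts, silently dropping the
# end-of-file markers. The helper name is hypothetical.
def _example_iter_rows(reader, end_file_marker=-1, **getlazy_kwargs):
    for data in reader.getlazy(**getlazy_kwargs):
        if data == end_file_marker:
            continue
        if data:
            yield data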
def test_lazyreader_from_csv_with_utf8_503(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.csv_blogger_hightrepetativ_set),
                    "csv", mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy():
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == len(self.configer.docs_row_values(
            token=True, unicode_str=True)["blogger"][0])
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'working_area' in data
        assert 'age' in data
        assert 'id' in data
        assert 'gender' in data
def test_lazyreader_from_sifter_twitter_csv_with_utf8_513(self):
    self.twitter_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_twitter_corp,
                                 "CSV/zas-rep-tool/sifter"),
                    "csv", formatter_name="sifter", mode=self.mode,
                    end_file_marker=end_file_marker)
    for data in reader.getlazy(csvdelimiter=";"):
        if data == end_file_marker:
            continue
        if data:
            assert isinstance(data, dict)
            assert 'text' in data
            assert 'u_lang' in data
            assert 'id' in data
            assert 'u_id' in data
def test_lazyreader_from_xml_with_ascii_505(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.xml_blogger_small_fake_set),
                    "xml", mode=self.mode, end_file_marker=end_file_marker)
    for data in reader.getlazy():
        if data == end_file_marker:
            continue
        assert isinstance(data, dict)
        assert len(data) == 6
        assert 'text' in data
        assert 'star_constellation' in data
        assert 'working_area' in data
        assert 'age' in data
        assert 'id' in data
        assert 'gender' in data
def create_testsets_in_diff_file_formats(self, rewrite=False,
                                         abs_path_to_storage_place=False,
                                         silent_ignore=True):
    if not rewrite:
        rewrite = self._rewrite
    if not abs_path_to_storage_place:
        abs_path_to_storage_place = self._path_to_zas_rep_tools
    created_sets = []
    if not abs_path_to_storage_place:
        sys.exit()
    try:
        # Create the test sets for the Blogger corpus in every supported
        # file format, using the txt test set as the source.
        for file_format, test_sets in self._types_folder_names_of_testsets.iteritems():
            for name_of_test_set, folder_for_test_set in test_sets.iteritems():
                if file_format == "txt":
                    continue
                abs_path_to_current_test_case = os.path.join(
                    abs_path_to_storage_place,
                    self._path_to_testsets["blogger"],
                    folder_for_test_set)
                if rewrite and os.path.isdir(abs_path_to_current_test_case):
                    shutil.rmtree(abs_path_to_current_test_case)
                if not os.path.isdir(abs_path_to_current_test_case):
                    os.makedirs(abs_path_to_current_test_case)
                path_to_txt_corpus = os.path.join(
                    self.path_to_zas_rep_tools,
                    self._path_to_testsets["blogger"],
                    self._types_folder_names_of_testsets["txt"][name_of_test_set])
                reader = Reader(path_to_txt_corpus, "txt",
                                regex_template="blogger",
                                logger_level=self._logger_level,
                                logger_traceback=self._logger_traceback,
                                logger_folder_to_save=self._logger_folder_to_save,
                                logger_usage=self._logger_usage,
                                logger_save_logs=self._logger_save_logs,
                                mode=self._mode,
                                error_tracking=self._error_tracking,
                                ext_tb=self._ext_tb)
                exporter = Exporter(reader.getlazy(), rewrite=rewrite,
                                    silent_ignore=silent_ignore,
                                    logger_level=self._logger_level,
                                    logger_traceback=self._logger_traceback,
                                    logger_folder_to_save=self._logger_folder_to_save,
                                    logger_usage=self._logger_usage,
                                    logger_save_logs=self._logger_save_logs,
                                    mode=self._mode,
                                    error_tracking=self._error_tracking,
                                    ext_tb=self._ext_tb)
                # The "small" sets get 5 rows per file, all others 2.
                rows_limit_in_file = 5 if name_of_test_set == "small" else 2
                if file_format == "csv":
                    flag = exporter.tocsv(abs_path_to_current_test_case,
                                          "blogger_corpus",
                                          self._columns_in_doc_table["blogger"],
                                          rows_limit_in_file=rows_limit_in_file)
                elif file_format == "xml":
                    flag = exporter.toxml(abs_path_to_current_test_case,
                                          "blogger_corpus",
                                          rows_limit_in_file=rows_limit_in_file)
                elif file_format == "json":
                    flag = exporter.tojson(abs_path_to_current_test_case,
                                           "blogger_corpus",
                                           rows_limit_in_file=rows_limit_in_file)
                elif file_format == "sqlite":
                    flag = exporter.tosqlite(abs_path_to_current_test_case,
                                             "blogger_corpus",
                                             self._columns_in_doc_table["blogger"])
                else:
                    continue
                if not flag:
                    yield False
                else:
                    created_sets.append(file_format)
                    yield True
        # Additionally pack every created test set into a zip archive.
        for created_set in set(created_sets):
            path_to_set = os.path.join(abs_path_to_storage_place,
                                       self._path_to_testsets["blogger"],
                                       created_set)
            make_zipfile(os.path.join(os.path.split(path_to_set)[0],
                                      created_set + ".zip"),
                         path_to_set)
        self.logger.info("TestSets (diff file formats) were initialized.")
    except Exception as e:
        if self._ext_tb:
            print_exc_plus()
        self.logger.error(
            "SubsetsCreaterError: Threw the following exception: '{}'.".format(e),
            exc_info=self._logger_traceback)
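# make_zipfile is imported from elsewhere in the project. As a hedged sketch,
# a helper with that contract typically packs a whole directory tree into one
# archive with the stdlib zipfile module; this is an assumption about its
# behavior, not the project's actual implementation.
def _example_make_zipfile(output_path, source_dir):
    import os
    import zipfile
    with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(source_dir):
            for fname in files:
                full = os.path.join(root, fname)
                # Store paths relative to the parent of source_dir so the
                # archive keeps the top-level folder name.
                zf.write(full, os.path.relpath(full, os.path.dirname(source_dir)))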
def test_create_all_test_cases_for_diff_fileformats_502(self):
    self.prj_folder()
    configer = TestsConfiger(mode=self.mode)
    abs_path_to_storage_place = self.tempdir_project_folder
    returned_flags = set(configer.create_testsets_in_diff_file_formats(
        rewrite=False,
        abs_path_to_storage_place=abs_path_to_storage_place))
    if len(returned_flags) <= 1 or True not in returned_flags:
        return False
    for file_format, test_sets in configer.types_folder_names_of_testsets.iteritems():
        for name_of_test_set, folder_for_test_set in test_sets.iteritems():
            if file_format == "txt":
                continue
            if file_format == "sqlite":
                continue
            abs_path_to_current_test_case = os.path.join(
                abs_path_to_storage_place,
                configer._path_to_testsets["blogger"],
                folder_for_test_set)
            if not os.path.isdir(abs_path_to_current_test_case):
                os.makedirs(abs_path_to_current_test_case)
            path_to_txt_corpus = os.path.join(
                configer.path_to_zas_rep_tools,
                configer._path_to_testsets["blogger"],
                configer._types_folder_names_of_testsets["txt"][name_of_test_set])
            reader_txt = Reader(path_to_txt_corpus, "txt",
                                regex_template="blogger",
                                send_end_file_marker=False, mode=self.mode)
            reader_current_set = Reader(abs_path_to_current_test_case,
                                        file_format,
                                        send_end_file_marker=False,
                                        mode=self.mode)
            # Collect the column values from the txt reference corpus and
            # from the freshly created set; numeric strings are normalized
            # to int so that values from different formats compare equal.
            data_from_txt = defaultdict(list)
            data_from_current_set = defaultdict(list)
            for item in reader_txt.getlazy():
                for k, v in item.iteritems():
                    if unicode(v).isnumeric():
                        v = int(v)
                    data_from_txt[k].append(v)
            for item in reader_current_set.getlazy():
                for k, v in item.iteritems():
                    if unicode(v).isnumeric():
                        v = int(v)
                    data_from_current_set[k].append(v)
            # Both corpora have to contain exactly the same values in every
            # column (row order may differ, hence the sorting).
            for col in self.configer.columns_in_doc_table["blogger"]:
                if col != "rowid":
                    for txt_item, current_set_item in zip(
                            sorted(data_from_txt[col]),
                            sorted(data_from_current_set[col])):
                        assert txt_item == current_set_item
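# Refactoring sketch for the comparison above: collect every column of a lazy
# reader into lists, converting purely numeric strings to int so that values
# read from different file formats compare equal. The helper name is
# hypothetical; the logic is exactly the loop used in the test.
def _example_normalized_column_values(reader):
    data = defaultdict(list)
    for item in reader.getlazy():
        for k, v in item.iteritems():
            if unicode(v).isnumeric():
                v = int(v)
            data[k].append(v)
    return data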
def test_getlazy_many_streams_from_txt_without_given_number_of_streams_adjusted_for_current_cpu_520(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                 self.txt_blogger_small_fake_set),
                    "txt", regex_template="blogger", mode=self.mode,
                    end_file_marker=end_file_marker,
                    send_end_file_marker=True)
    number_of_found_files = reader._get_number_of_left_over_files()

    # The number of returned streams has to match the CPU-adjusted reference
    # computation for every min_files_pro_stream setting.
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=3)).should.be.equal(
        get_number_of_streams_adjust_cpu(3, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=5)).should.be.equal(
        get_number_of_streams_adjust_cpu(5, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=2)).should.be.equal(
        get_number_of_streams_adjust_cpu(2, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=1)).should.be.equal(
        get_number_of_streams_adjust_cpu(1, number_of_found_files, 4))

    # Regardless of how many streams are requested, every file has to be
    # read exactly once (one end-of-file marker per file).
    i = 0
    for gen in reader.getlazy(stream_number=1000, adjust_to_cpu=True,
                              min_files_pro_stream=1):
        for row_dict in gen:
            if row_dict == end_file_marker:
                i += 1
                continue
            assert isinstance(row_dict, dict)
            assert len(row_dict) == 6
            assert 'text' in row_dict
            assert 'star_constellation' in row_dict
            assert 'working_area' in row_dict
            assert 'age' in row_dict
            assert 'id' in row_dict
            assert 'gender' in row_dict
    assert number_of_found_files == i
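# One plausible reading of the contract checked above: the effective stream
# count is capped by the CPU count and by how many streams can each receive
# at least min_files_pro_stream files. This is an assumption for illustration
# only, NOT the actual implementation of get_number_of_streams_adjust_cpu.
def _example_adjusted_stream_number(min_files_pro_stream, files_number,
                                    stream_number):
    import multiprocessing
    streams_with_enough_files = max(1, files_number // min_files_pro_stream)
    return min(stream_number, streams_with_enough_files,
               multiprocessing.cpu_count())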
def test_getlazy_many_streams_from_json_also_getted_from_zips_519(self):
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(self.tempdir_blogger_corp, "json", mode=self.mode,
                    read_from_zip=True, end_file_marker=end_file_marker,
                    send_end_file_marker=True)

    # Test 1: check that the number of fetched files is correct.
    number_of_found_files = reader._get_number_of_left_over_files()
    assert number_of_found_files >= 3
    # For this check the main folder of the test cases has to be available
    # in zipped form as well, so that both sources contain the same number
    # of files.
    assert reader.files_number_in_zips == len(reader.files_to_read_orig)
    number_of_fetched_files = len([
        row
        for gen in reader.getlazy(stream_number=4, adjust_to_cpu=True,
                                  min_files_pro_stream=5)
        for row in gen
        if row == end_file_marker
    ])
    assert number_of_found_files == number_of_fetched_files

    # Test 2: check that the right number of streams is returned.
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=5)).should.be.equal(
        get_number_of_streams_adjust_cpu(5, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=3)).should.be.equal(
        get_number_of_streams_adjust_cpu(3, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=2)).should.be.equal(
        get_number_of_streams_adjust_cpu(2, number_of_found_files, 4))
    len(reader.getlazy(stream_number=4, adjust_to_cpu=True,
                       min_files_pro_stream=1)).should.be.equal(
        get_number_of_streams_adjust_cpu(1, number_of_found_files, 4))

    # Test 3: every file is read exactly once, also when the streams are
    # fed from zip archives.
    i = 0
    for gen in reader.getlazy(stream_number=1000, adjust_to_cpu=True,
                              min_files_pro_stream=1):
        for row_dict in gen:
            if row_dict == end_file_marker:
                i += 1
                continue
            assert isinstance(row_dict, dict)
            assert len(row_dict) == 6
            assert 'text' in row_dict
            assert 'star_constellation' in row_dict
            assert 'working_area' in row_dict
            assert 'age' in row_dict
            assert 'id' in row_dict
            assert 'gender' in row_dict
    assert number_of_found_files == i