def load_go_data(self, data_type='train', num_samples=1000):
    """Download, sample and encode games, then return the consolidated result.

    Args:
        data_type: Label passed to the sampler; it is also appended to each
            archive's base name to form the per-archive data file name.
        num_samples: Amount requested from ``Sampler.draw_data``.

    Returns:
        Whatever ``self.consolidate_games(data_type, data)`` returns.
    """
    KGSIndex(data_directory=self.data_dir).download_files()

    drawn = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Bucket the drawn game indices under the archive each one belongs to.
    archives = set()
    games_per_archive = {}
    for archive, game_idx in drawn:
        archives.add(archive)
        games_per_archive.setdefault(archive, []).append(game_idx)

    # Only archives whose data file is not on disk yet get processed.
    for archive in archives:
        target = archive.replace('.tar.gz', '') + data_type
        if os.path.isfile(self.data_dir + '/' + target):
            continue
        self.process_zip(archive, target, games_per_archive[archive])

    return self.consolidate_games(data_type, drawn)
def load_go_data(self, data_type='train', num_samples=1000):
    """Load sampled KGS games as consolidated features and labels.

    Args:
        data_type: Data split name handed to the sampler and used as the
            suffix of every per-archive data file name.
        num_samples: Number of games the sampler is asked to select.

    Returns:
        The value produced by ``self.consolidate_games``.
    """
    # Download all games from KGS into the local data directory.
    kgs_index = KGSIndex(data_directory=self.data_dir)
    kgs_index.download_files()

    # The sampler selects the requested number of games for this data type.
    selection = Sampler(data_dir=self.data_dir).draw_data(data_type,
                                                          num_samples)

    # Collect the archive names and group the game indices by archive.
    seen_archives = set()
    by_archive = {}
    for zip_file, game_no in selection:
        seen_archives.add(zip_file)
        bucket = by_archive.get(zip_file)
        if bucket is None:
            bucket = by_archive[zip_file] = []
        bucket.append(game_no)

    # Process each archive individually unless its data file already exists.
    for zip_file in seen_archives:
        out_name = zip_file.replace('.tar.gz', '') + data_type
        already_done = os.path.isfile(self.data_dir + '/' + out_name)
        if not already_done:
            self.process_zip(zip_file, out_name, by_archive[zip_file])

    # Aggregate the per-archive results and hand them back.
    return self.consolidate_games(data_type, selection)
def load_go_data(self, data_type='train', num_samples=1000, download=False):
    """Load game-record data.

    Parameters
    ----------
    data_type : str
        'train' or 'test'.
    num_samples : int
        Number of games to load.
    download : bool
        Whether to download the data from the web first.

    Returns
    -------
    features_and_labels
        [0]: list of features
        [1]: list of labels
    """
    # Download the game records only when asked to.
    if download:
        KGSIndex(data_directory=self.data_dir).download_files()

    # The sampler yields (archive file name, game index) pairs covering the
    # requested number of games.
    # NOTE(review): the original author states that Sampler separates test
    # from train by whether a record predates 2014 - confirm in Sampler.
    pairs = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Build the set of archive names and the "archive -> game indices" map.
    archive_names = set()
    indices_for = {}
    for archive, game_index in pairs:
        archive_names.add(archive)
        indices_for.setdefault(archive, []).append(game_index)

    # Process every needed tar.gz whose data file does not exist yet.
    for archive in archive_names:
        data_file = archive.replace('.tar.gz', '') + data_type
        if os.path.isfile(self.data_dir + '/' + data_file):
            continue
        self.process_zip(archive, data_file, indices_for[archive])

    # With the required data known, fetch the features and labels.
    return self.consolidate_games(data_type, pairs)
def load_go_data(self, data_type='train', num_samples=1000,
                 use_generator=False):
    """Sample games, process them via ``map_to_workers`` and return the data.

    Args:
        data_type: Label forwarded to the sampler, ``map_to_workers`` and
            ``consolidate_games``.
        num_samples: Amount requested from the sampler; one tenth of it
            (truncated to an int) is passed as ``num_test_games``.
        use_generator: When truthy, return a ``DataGenerator`` rather than
            the consolidated data.

    Returns:
        A ``DataGenerator`` built from the data directory and the sampled
        data, or the result of ``self.consolidate_games``.
    """
    KGSIndex(data_directory=self.data_dir).download_files()

    test_game_count = int(num_samples / 10)
    sampler = Sampler(data_dir=self.data_dir, num_test_games=test_game_count)
    drawn = sampler.draw_data(data_type, num_samples)

    # Processing is delegated to map_to_workers before anything is returned.
    self.map_to_workers(data_type, drawn)

    if not use_generator:
        return self.consolidate_games(data_type, drawn)
    return DataGenerator(self.data_dir, drawn)
def load_go_data(self, data_type, num_samples, use_generator=False):
    """Sample games, process them via ``map_to_workers`` and return the data.

    Args:
        data_type: Label forwarded to the sampler, ``map_to_workers`` and
            ``consolidate_games``.
        num_samples: Amount requested from ``Sampler.draw_data``.
        use_generator: When truthy, return a ``DataGenerator`` rather than
            the consolidated data.

    Returns:
        A ``DataGenerator`` or the result of ``self.consolidate_games``.
    """
    KGSIndex(data_directory=self.data_dir).download_files()

    drawn = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)
    self.map_to_workers(data_type, drawn)

    if not use_generator:
        return self.consolidate_games(data_type, drawn)

    # NOTE(review): this fallback is a machine-specific absolute path and it
    # is applied only after self.data_dir has already been handed to KGSIndex
    # and Sampler above - confirm whether a None data_dir can reach here.
    if self.data_dir is None:
        self.data_dir = 'D:\\CODE\\Python\\Go\\code\\dlgo\\data\\tarfiles'
    return DataGenerator(self.data_dir, drawn)
def load_go_data(self, data_type='train', num_samples=1000,
                 use_generator=True, download=False):
    """Obtain features and labels while reading the files in parallel.

    Parameters
    ----------
    data_type : str
        'train' or 'test'.
    num_samples : int
        Number of game records to fetch.
    use_generator : bool
        Fetch mini-batches through a yielding generator.
    download : bool
        Download the files first.

    Returns
    -------
    One of the following:

    generator : generator
        A generator that serves mini-batches.
    features_and_labels : tuple
        Features and labels fetched all at once.
    """
    if download:
        KGSIndex(data_directory=self.data_dir).download_files()

    drawn = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)
    self.map_to_workers(data_type, drawn)

    if not use_generator:
        return self.consolidate_games(data_type, drawn)

    # Per the original author: the generator does not keep all data in
    # memory, so the fetched data is not saved on this path.
    return DataGenerator(self.data_dir, drawn)
def load_go_data(self, data_type='train', num_samples=1000):
    """Download, sample and process the archives needed for ``data_type``.

    Args:
        data_type: Label passed to the sampler; it is also appended to each
            archive's base name to form the per-archive data file name.
        num_samples: Amount requested from ``Sampler.draw_data``.

    Returns:
        None. This variant only makes sure each needed archive has been
        processed; it does not consolidate or return the data.
    """
    KGSIndex(data_directory=self.data_dir).download_files()

    data = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Group the drawn game indices by the archive they come from.
    zip_names = set()
    indices_by_zip_name = {}
    for filename, game_index in data:
        zip_names.add(filename)
        indices_by_zip_name.setdefault(filename, []).append(game_index)

    for zip_name in zip_names:
        data_file_name = zip_name.replace('.tar.gz', '') + data_type
        # Join with a path separator: plain concatenation glued the file
        # name onto the directory name, so an existing data file was only
        # found when data_dir happened to end with a separator.
        if not os.path.isfile(os.path.join(self.data_dir, data_file_name)):
            self.process_zip(zip_name, data_file_name,
                             indices_by_zip_name[zip_name])
def load_go_data(self, data_type='train', num_samples=1000,
                 use_generator=False):
    """Sample games, process them via ``map_to_workers`` and return the data.

    Args:
        data_type: Label forwarded to the sampler, ``map_to_workers`` and
            ``consolidate_games``.
        num_samples: Amount requested from ``Sampler.draw_data``.
        use_generator: When truthy, return a ``DataGenerator`` rather than
            the consolidated data.

    Returns:
        A ``DataGenerator`` or the result of ``self.consolidate_games``.
    """
    KGSIndex(data_directory=self.data_dir).download_files()

    drawn = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Map the workload onto the CPUs.
    self.map_to_workers(data_type, drawn)

    if not use_generator:
        # Return the consolidated data, as before.
        return self.consolidate_games(data_type, drawn)

    # Otherwise return a Go data generator.
    return DataGenerator(self.data_dir, drawn)
def draw_training_games(self):
    """Append every indexed ``(filename, game_number)`` pair to ``self.train_games``.

    Pairs already present in ``self.train_games`` are not appended again.
    No year cap is applied in this variant. The final size of the list is
    printed.
    """
    index = KGSIndex(data_directory=self.data_dir)

    # Track membership in a set so each duplicate check is O(1) rather than
    # a scan of the ever-growing list.
    # Assumes existing entries are hashable (filename, index) tuples like
    # the ones appended below - TODO confirm.
    seen = set(self.train_games)

    for fileinfo in index.file_info:
        filename = fileinfo['filename']
        for i in range(fileinfo['num_games']):
            sample = (filename, i)
            if sample in seen:
                continue
            seen.add(sample)
            self.train_games.append(sample)

    print('total num training samples: ' + str(len(self.train_games)))
def load_go_data(self, data_type='train', num_samples=1000,
                 use_generator=False):
    """Sample games, process them via ``map_to_workers`` and return the data.

    Args:
        data_type: Label forwarded to the sampler, ``map_to_workers`` and
            ``consolidate_games``.
        num_samples: Amount requested from ``Sampler.draw_data``.
        use_generator: When truthy, return a ``DataGenerator`` rather than
            the consolidated features and labels.

    Returns:
        A ``DataGenerator`` or the result of ``self.consolidate_games``.
    """
    # Create the KGS index and download all games into the data directory.
    # Per the original author, data that is already present is not
    # downloaded a second time.
    kgs_index = KGSIndex(data_directory=self.data_dir)
    kgs_index.download_files()

    # The sampler picks the requested number of games for this data type.
    drawn = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Map the workload to CPUs.
    self.map_to_workers(data_type, drawn)

    if not use_generator:
        # Return the features and labels.
        return self.consolidate_games(data_type, drawn)

    # Return a Go data generator.
    return DataGenerator(self.data_dir, drawn)
def load_go_data(self, data_type='train', num_samples=1000):
    """Download, sample and encode games, then return the consolidated result.

    Args:
        data_type: Label passed to the sampler; it is also appended to each
            archive's base name to form the per-archive data file name.
        num_samples: Amount requested from ``Sampler.draw_data``.

    Returns:
        The features and labels returned by ``self.consolidate_games``.
    """
    # Download all games from KGS into the local data directory. Per the
    # original author, data that is already available is not fetched again.
    KGSIndex(data_directory=self.data_dir).download_files()

    # The sampler selects the requested number of games for the chosen
    # data type.
    chosen = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    names = set()
    grouped = {}
    for name, sgf_index in chosen:
        # Gather every archive file name that appears in the data.
        names.add(name)
        # Group all SGF file indices by archive file name.
        grouped.setdefault(name, []).append(sgf_index)

    for name in names:
        encoded_name = name.replace('.tar.gz', '') + data_type
        if os.path.isfile(self.data_dir + '/' + encoded_name):
            continue
        # Each archive is processed on its own.
        self.process_zip(name, encoded_name, grouped[name])

    # Features and labels from every archive are combined and returned.
    return self.consolidate_games(data_type, chosen)
def load_go_data(self, data_type='train', num_samples=1000):
    """Load features and labels for a sample of KGS games.

    Args:
        data_type: Either 'train' or 'test'.
        num_samples: Number of games to load data from.

    Returns:
        The aggregated result of ``self.consolidate_games``.
    """
    # Download all games from KGS to the local data directory; per the
    # original note, data that is already available is not downloaded again.
    KGSIndex(data_directory=self.data_dir).download_files()

    # The Sampler instance selects the requested number of games for the
    # given data type.
    picked = Sampler(data_dir=self.data_dir).draw_data(data_type, num_samples)

    # Collect the archive names and group the SGF indices by archive name.
    archive_set = set()
    sgf_indices = {}
    for archive, sgf_idx in picked:
        archive_set.add(archive)
        if archive in sgf_indices:
            sgf_indices[archive].append(sgf_idx)
        else:
            sgf_indices[archive] = [sgf_idx]

    # Archives are processed one by one, skipping any whose data file is
    # already on disk.
    for archive in archive_set:
        stem = archive.replace('.tar.gz', '')
        data_file = stem + data_type
        if os.path.isfile(self.data_dir + '/' + data_file):
            continue
        self.process_zip(archive, data_file, sgf_indices[archive])

    # Features and labels from each archive are aggregated and returned.
    return self.consolidate_games(data_type, picked)
def draw_training_games(self):
    """Extend ``self.train_games`` with every non-test game up to ``cap_year``.

    Archives whose year (parsed from the file name) exceeds
    ``self.cap_year`` are skipped so the training data stays stable; games
    listed in ``self.test_games`` are left out. The resulting list size is
    printed.
    """
    kgs_index = KGSIndex(data_directory=self.data_dir)
    for entry in kgs_index.file_info:
        archive = entry['filename']
        # The year sits between the first '-' and the next '_' of the name.
        archive_year = int(archive.split('-')[1].split('_')[0])
        if archive_year > self.cap_year:
            continue
        candidates = ((archive, n) for n in range(entry['num_games']))
        self.train_games.extend(
            game for game in candidates if game not in self.test_games
        )
    print('total num training games: ' + str(len(self.train_games)))
def draw_training_samples(self, num_sample_games):
    """Draw ``num_sample_games`` distinct games at random from the index.

    Args:
        num_sample_games: Number of distinct ``(filename, game_number)``
            pairs to draw. No year cap is applied in this variant.

    Returns:
        A list of the drawn ``(filename, game_number)`` tuples.

    Raises:
        ValueError: If more games are requested than the index lists.
            Previously this case looped forever (or surfaced as an
            IndexError from ``random.choice`` when the index was empty).
    """
    index = KGSIndex(data_directory=self.data_dir)
    available_games = []
    for fileinfo in index.file_info:
        filename = fileinfo['filename']
        for i in range(fileinfo['num_games']):
            available_games.append((filename, i))

    # The rejection loop below can only finish if enough distinct games
    # exist, so verify that before entering it.
    distinct_available = len(set(available_games))
    if num_sample_games > distinct_available:
        raise ValueError(
            'Requested ' + str(num_sample_games) + ' samples but only '
            + str(distinct_available) + ' games are available'
        )

    sample_set = set()
    while len(sample_set) < num_sample_games:
        # set.add ignores repeats, so no membership test is needed.
        sample_set.add(random.choice(available_games))

    print('Drawn ' + str(num_sample_games) + ' samples')
    return list(sample_set)
def draw_all_training(self):
    """Return every ``(filename, game_number)`` pair listed in the index.

    No year cap is applied in this variant. The number of indexed games
    and the number of returned samples are both printed.

    Returns:
        A list of the distinct ``(filename, game_number)`` tuples.
    """
    kgs_index = KGSIndex(data_directory=self.data_dir)

    every_game = []
    for entry in kgs_index.file_info:
        archive = entry['filename']
        game_count = entry['num_games']
        every_game.extend((archive, n) for n in range(game_count))
    print('>>> Total number of games used: ' + str(len(every_game)))

    # Collapsing into a set removes any repeated pair.
    unique_games = set(every_game)
    print('Drawn all samples, ie ' + str(len(unique_games)) + ' samples:')
    return list(unique_games)
def draw_training_samples(self, num_sample_games):
    """Draw training games, not overlapping with any of the test games.

    Args:
        num_sample_games: Number of distinct ``(filename, game_number)``
            pairs to draw from archives no later than ``self.cap_year``.

    Returns:
        A list of the drawn ``(filename, game_number)`` tuples, none of
        which appears in ``self.test_games``.

    Raises:
        ValueError: If fewer eligible (non-test) games exist than were
            requested. Previously this case looped forever (or surfaced
            as an IndexError from ``random.choice`` on an empty pool).
    """
    index = KGSIndex(data_directory=self.data_dir)
    available_games = []
    for fileinfo in index.file_info:
        filename = fileinfo['filename']
        # The year sits between the first '-' and the next '_' of the name.
        year = int(filename.split('-')[1].split('_')[0])
        if year > self.cap_year:
            continue
        for i in range(fileinfo['num_games']):
            available_games.append((filename, i))
    print('total num games: ' + str(len(available_games)))

    # Hoisted into a set so each rejection test below is O(1).
    # Assumes test games are hashable (filename, index) tuples like the
    # ones built above - TODO confirm.
    excluded = set(self.test_games)

    # The rejection loop can only finish if enough eligible games exist.
    eligible = len(set(available_games) - excluded)
    if num_sample_games > eligible:
        raise ValueError(
            'Requested ' + str(num_sample_games) + ' samples but only '
            + str(eligible) + ' non-test games are available'
        )

    sample_set = set()
    while len(sample_set) < num_sample_games:
        sample = random.choice(available_games)
        if sample not in excluded:
            sample_set.add(sample)

    print('Drawn ' + str(num_sample_games) + ' samples:')
    return list(sample_set)
def draw_samples(self, num_sample_games):
    """Draw num_sample_games many training games from index.

    Args:
        num_sample_games: Number of distinct ``(filename, game_number)``
            pairs to draw from archives no later than ``self.cap_year``.

    Returns:
        A list of the drawn ``(filename, game_number)`` tuples.

    Raises:
        ValueError: If more games are requested than are available.
            Previously this case looped forever (or surfaced as an
            IndexError from ``random.choice`` on an empty pool).
    """
    index = KGSIndex(data_directory=self.data_dir)
    available_games = []
    for fileinfo in index.file_info:
        filename = fileinfo['filename']
        # The year sits between the first '-' and the next '_' of the name.
        year = int(filename.split('-')[1].split('_')[0])
        if year > self.cap_year:
            continue
        for i in range(fileinfo['num_games']):
            available_games.append((filename, i))
    print('>>> Total number of games used: ' + str(len(available_games)))

    # The rejection loop below can only finish if enough distinct games
    # exist, so verify that before entering it.
    distinct_available = len(set(available_games))
    if num_sample_games > distinct_available:
        raise ValueError(
            'Requested ' + str(num_sample_games) + ' samples but only '
            + str(distinct_available) + ' games are available'
        )

    sample_set = set()
    while len(sample_set) < num_sample_games:
        # set.add ignores repeats, so no membership test is needed.
        sample_set.add(random.choice(available_games))

    print('Drawn ' + str(num_sample_games) + ' samples:')
    return list(sample_set)
def draw_all_training(self):
    """Draw all available training games.

    Archives later than ``self.cap_year`` and index entries without a
    'num_games' key are skipped; games found in ``self.test_games`` are
    excluded from the result. Both counts are printed.

    Returns:
        A list of the distinct ``(filename, game_number)`` tuples.
    """
    kgs_index = KGSIndex(data_directory=self.data_dir)

    candidates = []
    for entry in kgs_index.file_info:
        archive = entry['filename']
        # The year sits between the first '-' and the next '_' of the name.
        archive_year = int(archive.split('-')[1].split('_')[0])
        if archive_year > self.cap_year:
            continue
        if 'num_games' not in entry:
            continue
        candidates.extend((archive, n) for n in range(entry['num_games']))
    print('total num games: ' + str(len(candidates)))

    training = {game for game in candidates if game not in self.test_games}
    print('Drawn all samples, ie ' + str(len(training)) + ' samples:')
    return list(training)
# Script: instantiate KGSIndex with its default arguments and invoke its
# download step.
from dlgo.data.index_processor import KGSIndex

index = KGSIndex()
index.download_files()