def __init__(self, path, dataset, *args, **kwargs):
  self.dataset = dataset
  self.vocab = Vocab(*args, **kwargs)

  if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.count_file(os.path.join(path, "valid.txt"))
    self.vocab.count_file(os.path.join(path, "test.txt"))
  elif self.dataset == "wt103":
    self.vocab.count_file(os.path.join(path, "train.txt"))
  elif self.dataset == "lm1b":
    train_path_pattern = os.path.join(
        path, "1-billion-word-language-modeling-benchmark-r13output",
        "training-monolingual.tokenized.shuffled", "news.en-*")
    train_paths = glob(train_path_pattern)
    # the vocab will load from file when build_vocab() is called
    # for train_path in sorted(train_paths):
    #   self.vocab.count_file(train_path, verbose=True)

  self.vocab.build_vocab()

  if self.dataset in ["ptb", "wt2", "wt103"]:
    self.train = self.vocab.encode_file(
        os.path.join(path, "train.txt"), ordered=True)
    self.valid = self.vocab.encode_file(
        os.path.join(path, "valid.txt"), ordered=True)
    self.test = self.vocab.encode_file(
        os.path.join(path, "test.txt"), ordered=True)
  elif self.dataset in ["enwik8", "text8"]:
    self.train = self.vocab.encode_file(
        os.path.join(path, "train.txt"), ordered=True, add_eos=False)
    self.valid = self.vocab.encode_file(
        os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
    self.test = self.vocab.encode_file(
        os.path.join(path, "test.txt"), ordered=True, add_eos=False)
  elif self.dataset == "lm1b":
    self.train = train_paths
    valid_path = os.path.join(path, "valid.txt")
    test_path = valid_path
    self.valid = self.vocab.encode_file(
        valid_path, ordered=True, add_double_eos=True)
    self.test = self.vocab.encode_file(
        test_path, ordered=True, add_double_eos=True)

  if self.dataset == "wt103":
    self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
  elif self.dataset == "lm1b":
    self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
  else:
    self.cutoffs = []
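
# Reference sketch (not part of the original file): the encode_file()
# options the constructor above applies per dataset. enwik8/text8 are
# character-level corpora, so no <eos> token is appended; for lm1b only
# valid/test are pre-encoded here (train is kept as a list of shard paths)
# and sentences are wrapped with boundary tokens via add_double_eos.
ENCODE_FLAGS = {
    "ptb": dict(ordered=True),
    "wt2": dict(ordered=True),
    "wt103": dict(ordered=True),
    "enwik8": dict(ordered=True, add_eos=False),
    "text8": dict(ordered=True, add_eos=False),
    "lm1b": dict(ordered=True, add_double_eos=True),
}
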
def reconcile_dupes(unused_argv):
  """Adds found bond topology ids to the output from topology_from_geom."""
  del unused_argv

  # For each input file, a dataframe.
  df_list: List[pd.DataFrame] = []
  for filepath in gfile.glob(FLAGS.input):
    logging.info("Opening %s", filepath)
    with gfile.GFile(filepath, "r") as f:
      df_list.append(
          pd.read_csv(
              f,
              names=[
                  "Smiles", "StartSmi", "id", "Fate", "NBts", "RingAtoms",
                  "IsStart"
              ]))

  data = pd.concat(df_list)
  del df_list
  logging.info("%s", data.shape)

  # Convert conformer_ids to bond_topology_id by dividing by 1000.
  # Expect many dupes to be overwritten here.
  smiles_to_id = {k: v for k, v in zip(data["StartSmi"], data["id"] // 1000)}

  # We only care about the cases where there is a BondTopology mismatch.
  # Note: a boolean column needs element-wise negation (~), not `not`.
  interesting = data.loc[~data["IsStart"]]
  logging.info("%s", interesting.shape)

  # Convert the two smiles columns to molecules.
  mstart = [Chem.MolFromSmiles(x) for x in interesting["StartSmi"]]
  mfound = [Chem.MolFromSmiles(x) for x in interesting["Smiles"]]

  # Ring score for each of the molecules.
  rstart = [ring_atoms(m) for m in mstart]
  rfound = [ring_atoms(m) for m in mfound]

  same_ring_membership = 0
  different_ring_membership = 0
  no_smiles = 0
  print("FoundSmiles,StartSmi,StartId,FoundId,FoundScore,StartScore")
  for i, scores in enumerate(zip(rfound, rstart)):
    found_score = scores[0]
    start_score = scores[1]
    if found_score == start_score:
      same_ring_membership += 1
      continue

    different_ring_membership += 1
    found_smiles = interesting.iloc[i, 0]
    other_bt = smiles_to_id.get(found_smiles, "*")
    if other_bt == "*":
      message = f"smiles {found_smiles}, not known"
      logging.info(message)
      no_smiles += 1
    print(f"{interesting.iloc[i, 0]},{interesting.iloc[i, 1]},"
          f"{interesting.iloc[i, 2]},{other_bt},{found_score},{start_score}")

  logging.info("%d molecules different smiles but same ring membership",
               same_ring_membership)
  logging.info("%d of %d items have different ring membership",
               different_ring_membership, interesting.shape[0])
  logging.info("%d items had unrecognised smiles", no_smiles)
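
# The function above relies on module-level names defined elsewhere in the
# original script (assumed here, not shown): List from typing, pandas as pd,
# logging and FLAGS from absl, Chem from rdkit, gfile from tensorflow.io,
# and a ring_atoms() helper; it is typically launched via app.run(reconcile_dupes).
# Minimal self-contained sketch of why the IsStart filter uses ~ rather than
# `not`: pandas boolean Series require element-wise negation.
import pandas as pd

mask = pd.Series([True, False, True], name="IsStart")
print((~mask).tolist())  # [False, True, False]
# `not mask` would raise ValueError: the truth value of a Series is ambiguous.
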
def __init__(self, path, dataset, *args, **kwargs):
  self.dataset = dataset

  if self.dataset == "generic_dataset":
    # Encoding options for a user-supplied dataset; <eos> handling is
    # opt-in through the keyword arguments.
    encode_kwargs = dict(
        add_eos=kwargs.pop('add_eos', False),
        add_double_eos=kwargs.pop('add_double_eos', False),
        ordered=True,
        verbose=True,
    )
    # A relative vocab_file is resolved against the dataset directory.
    if kwargs.get('vocab_file') is not None:
      kwargs['vocab_file'] = os.path.join(path, kwargs['vocab_file'])
    print(self.dataset, 'vocab params', kwargs)

  self.vocab = Vocab(*args, **kwargs)

  if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.count_file(os.path.join(path, "valid.txt"))
    self.vocab.count_file(os.path.join(path, "test.txt"))
  elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
    # Without a prebuilt vocab file, build the vocabulary from all splits.
    self.vocab.count_file(os.path.join(path, "train.txt"))
    self.vocab.count_file(os.path.join(path, "valid.txt"))
    self.vocab.count_file(os.path.join(path, "test.txt"))
  elif self.dataset == "wt103":
    self.vocab.count_file(os.path.join(path, "train.txt"))
  elif self.dataset == "lm1b":
    train_path_pattern = os.path.join(
        path, "1-billion-word-language-modeling-benchmark-r13output",
        "training-monolingual.tokenized.shuffled", "news.en-*")
    train_paths = glob(train_path_pattern)
    # the vocab will load from file when build_vocab() is called
    # for train_path in sorted(train_paths):
    #   self.vocab.count_file(train_path, verbose=True)

  self.vocab.build_vocab()

  if self.dataset in ["ptb", "wt2", "wt103"]:
    self.train = self.vocab.encode_file(
        os.path.join(path, "train.txt"), ordered=True)
    self.valid = self.vocab.encode_file(
        os.path.join(path, "valid.txt"), ordered=True)
    self.test = self.vocab.encode_file(
        os.path.join(path, "test.txt"), ordered=True)
  elif self.dataset == "generic_dataset":
    self.train = self.vocab.encode_file(
        os.path.join(path, "train.txt"), **encode_kwargs)
    self.valid = self.vocab.encode_file(
        os.path.join(path, "valid.txt"), **encode_kwargs)
    self.test = self.vocab.encode_file(
        os.path.join(path, "test.txt"), **encode_kwargs)
  elif self.dataset in ["enwik8", "text8"]:
    self.train = self.vocab.encode_file(
        os.path.join(path, "train.txt"), ordered=True, add_eos=False)
    self.valid = self.vocab.encode_file(
        os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
    self.test = self.vocab.encode_file(
        os.path.join(path, "test.txt"), ordered=True, add_eos=False)
  elif self.dataset == "lm1b":
    self.train = train_paths
    valid_path = os.path.join(path, "valid.txt")
    test_path = valid_path
    self.valid = self.vocab.encode_file(
        valid_path, ordered=True, add_double_eos=True)
    self.test = self.vocab.encode_file(
        test_path, ordered=True, add_double_eos=True)

  # Adaptive-softmax cutoffs: fixed for wt103/lm1b, read from cutoffs.json
  # for a generic dataset, empty otherwise.
  if self.dataset == "wt103":
    self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
  elif self.dataset == "generic_dataset":
    with open(os.path.join(path, "cutoffs.json")) as f:
      self.cutoffs = json.load(f)
  elif self.dataset == "lm1b":
    self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
  else:
    self.cutoffs = []
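
# Sketch (an assumption, not taken from the original code) of the
# cutoffs.json contract used by the generic_dataset branch above: a plain
# JSON list of adaptive-softmax vocabulary boundaries that json.load()
# returns verbatim (the branch also needs a module-level `import json`).
# The boundary values below are illustrative only.
import json
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
  with open(os.path.join(tmp, "cutoffs.json"), "w") as f:
    json.dump([0, 20000, 40000, 200000], f)
  with open(os.path.join(tmp, "cutoffs.json")) as f:
    cutoffs = json.load(f)
  print(cutoffs)  # [0, 20000, 40000, 200000]
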