def calculate_solvent_options(self, ref_solv=None, verbose=1):
    """
    Derive solvation quantities for each vacuum/solvated pair in ``self.grouped``.

    Within every group the first member with a falsy ``solvate`` and the first
    member with a truthy ``solvate`` are paired.  The results are stored in the
    solvated member's ``dct`` under the keys ``energy_au``, ``energy_kj``,
    ``g_solv_dir``, ``direct_vs_indirect`` and ``G_solv_``.

    :param ref_solv: optional mapping with ``'G_solv_'`` and ``'dG'`` entries;
        when truthy they are added to this pair's values and stored as
        ``self_ref_indirect`` and ``self_ref_direct``
    :param verbose: when truthy, print the files of each pair, or of the whole
        group when no pair could be formed
    """
    printdarkcyan("\n ***********************")

    for members in self.grouped:
        try:
            # Indexing an empty list raises IndexError -> handled below.
            gas_phase = [m for m in members if not m.solvate][0]
            in_solvent = [m for m in members if m.solvate][0]
            gas, solv = gas_phase.dct, in_solvent.dct

            solv['energy_au'] = solv['dft'] - gas['dft']
            solv['energy_kj'] = solv['energy_au'] * HARTREE_TO_KJ_MOL
            solv['g_solv_dir'] = solv['dG'] - gas['dG']
            solv['direct_vs_indirect'] = solv['g_solv_dir'] - solv['energy_kj']
            solv['G_solv_'] = solv['energy_kj'] + gas['dG']

            if ref_solv:
                solv['self_ref_indirect'] = solv['G_solv_'] + ref_solv['G_solv_']
                solv['self_ref_direct'] = solv['dG'] + ref_solv['dG']

            if verbose:
                for member in (gas_phase, in_solvent):
                    member.print_files()
                printdarkcyan(" ***********************")
        except (ValueError, IndexError):
            if not verbose:
                continue
            printred("Failed to match to vacuum and solvent.")
            for member in members:
                member.print_files()
            print("")
            printdarkcyan(" ***********************")
def get_pdf_files(PMClist_file, pdf_folder):
    """
    Download PDF files for all PMCs in the file PMClist_file

    A failure for one PMC (HTTP error status, timeout, connection problem) is
    reported with ``printred`` and the remaining PMCs are still processed.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
                         (one identifier per line, blank lines are ignored)
    :param pdf_folder: Path to folder where the pdf files will be saved;
                       created, including parents, if it does not exist
    """
    url = 'https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMCXXXXX&blobtype=pdf'
    request_timeout = 60  # seconds; without it a stalled server blocks forever

    # exist_ok avoids the check-then-create race; parents allows nested targets.
    pdf_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin if line.strip()]

    for el in pmcs:
        path2file = pdf_folder.joinpath(el + '.pdf')
        try:
            response = requests.get(url=url.replace('PMCXXXXX', el),
                                    timeout=request_timeout)
        except requests.RequestException:
            # Network-level failure: report it and keep going with the batch.
            printred('Could not retrieve ' + el)
            continue
        if response.ok:
            with path2file.open('wb') as f:
                f.write(response.content)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
def get_xml_files(PMClist_file, xml_folder):
    """
    Download fulltext XML files for all PMCs in the file PMClist_file

    A failure for one PMC (HTTP error status, timeout, connection problem) is
    reported with ``printred`` and the remaining PMCs are still processed.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
                         (one identifier per line, blank lines are ignored)
    :param xml_folder: Path to folder where the fulltext XML files will be
                       saved; created, including parents, if it does not exist
    """
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/PMCXXXXX/fullTextXML'
    request_timeout = 60  # seconds; without it a stalled server blocks forever

    # exist_ok avoids the check-then-create race; parents allows nested targets.
    xml_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin if line.strip()]

    for el in pmcs:
        path2file = xml_folder.joinpath(el + '.xml')
        try:
            response = requests.get(url=url.replace('PMCXXXXX', el),
                                    timeout=request_timeout)
        except requests.RequestException:
            # Network-level failure: report it and keep going with the batch.
            printred('Could not retrieve ' + el)
            continue
        if response.ok:
            # Explicit encoding: the platform default may be unable to encode
            # characters present in the article text.
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
def get_annotations(PMClist_file, annotations_folder):
    """
    Download available EuropePMC annotations for all PMCs in the file PMClist_file

    A failure for one PMC (HTTP error status, timeout, connection problem) is
    reported with ``printred`` and the remaining PMCs are still processed.

    :param PMClist_file: Path to text file with full list of PMCs for dataset
                         (one identifier per line, blank lines are ignored)
    :param annotations_folder: Path to folder where the annotation files will
                               be saved; created, including parents, if it
                               does not exist
    """
    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=PMC%3AXXXXX&format=JSON'
    request_timeout = 60  # seconds; without it a stalled server blocks forever

    # exist_ok avoids the check-then-create race; parents allows nested targets.
    annotations_folder.mkdir(parents=True, exist_ok=True)

    with PMClist_file.open() as fin:
        pmcs = [line.strip() for line in fin if line.strip()]

    for el in pmcs:
        path2file = annotations_folder.joinpath(el + '.json')
        # The URL template already carries the "PMC" prefix, so only the part
        # after it is substituted.  Identifiers listed without the prefix are
        # used as they are instead of raising IndexError.
        pieces = el.split('PMC')
        article_id = pieces[1] if len(pieces) > 1 else el
        try:
            response = requests.get(url=url.replace('XXXXX', article_id),
                                    timeout=request_timeout)
        except requests.RequestException:
            # Network-level failure: report it and keep going with the batch.
            printred('Could not retrieve ' + el)
            continue
        if response.ok:
            # Explicit encoding: the platform default may be unable to encode
            # characters present in the annotations.
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
def get_dft_details(self, verbose=0, scale=False, **kwargs):
    """
    Parse the "Thermochemistry" section of ``self.contents`` and populate the
    thermochemical attributes of the instance.

    Sets ``thermochemistry``, ``temperature``, ``mass_total``,
    ``mass_total_kg`` and ``mass_atoms``, copies the selected scaling factors
    into the instance ``__dict__`` and then runs ``check_frequencies``,
    ``get_entropy``, ``get_zero_point``, ``get_hlc`` and
    ``get_thermal_correction`` in that order.

    The whole procedure is best-effort: the first ordinary exception stops the
    remaining steps and the method returns ``None``, keeping whatever
    attributes were already set.  ``KeyboardInterrupt`` and ``SystemExit`` are
    not intercepted.

    :param verbose: values above 2 print progress, the scaling factors and the
        parsed temperature and mass; also forwarded to the follow-up steps
    :param scale: when truthy, use ``scaling_factors[self.method][self.basis]``
        instead of ``scaling_factors["DEFAULT"]`` (falling back to the default
        with a warning if that entry is missing)
    :param kwargs: accepted and ignored
    """
    try:
        # Of the entries returned for the pattern, skip the first and use the last.
        thermochem = section_by_pattern(self.contents, pattern="Thermochemistry")[1:][-1]
        self.thermochemistry = thermochem
        if verbose > 2:
            print("Thermochemistry found.")

        factors = scaling_factors["DEFAULT"]
        if scale:
            try:
                factors = scaling_factors[self.method][self.basis]
            except KeyError:
                printred(f"{self.method} {self.basis} scaling factors not specified yet.")
        # Expose every scaling factor as an instance attribute.
        self.__dict__.update(factors)
        if verbose > 2:
            print("\n".join(f"{k:>20}: {v}" for k, v in factors.items()))

        temperature_line = self.search("Temperature", lines=thermochem)
        self.temperature = float(temperature_line.split()[1])

        mass_line = self.search("Molecular mass", lines=thermochem)
        self.mass_total = float(mass_line.split(":")[1].strip().split()[0])
        self.mass_total_kg = self.mass_total * AMU_TO_KG

        # The last whitespace-separated token of each "Atom " line is its mass.
        self.mass_atoms = [float(line.strip().split()[-1])
                           for line in thermochem if "Atom " in line]

        if verbose > 2:
            print(f" Temperature: {self.temperature} K\n"
                  f" Molecular mass: {self.mass_total_kg} kg\n")

        self.check_frequencies(verbose=verbose)
        self.get_entropy(verbose=verbose)
        self.get_zero_point(verbose=verbose)
        self.get_hlc(verbose=verbose)
        self.get_thermal_correction(verbose=verbose)
    except Exception as err:
        # Deliberately best-effort, but no longer swallowing Ctrl-C/SystemExit,
        # and the reason is visible when the caller asked for detailed output.
        if verbose > 2:
            printred(f"Thermochemistry not processed: {err!r}")
def check_frequencies(self, verbose=3, **kwargs):
    """
    Load the frequencies and reject the structure unless all of them are positive.

    Calls ``self.get_frequencies()`` and then inspects ``self.frequencies``.

    :param verbose: any truthy value lists the negative frequencies on
        failure; above 1 the success message is printed; above 5 all
        frequencies are printed
    :param kwargs: accepted and ignored
    :raises ValueError: if not every entry of ``self.frequencies`` is > 0
    """
    self.get_frequencies()

    all_positive = all(self.frequencies > 0)
    if not all_positive:
        printred(f" {self.base_name} has imaginary frequencies.")
        if verbose:
            negatives = [x for x in self.frequencies if x < 0]
            printred(negatives)
        raise ValueError

    if verbose > 1:
        print(f" {self.base_name} has no imaginary frequencies.")
    if verbose > 5:
        print(f" Frequencies:{self.frequencies}")
def similar_orientation_to(self, other, rtol=1e-5, atol=1e-4, debug=False, n_letters=50, verbose=2, rmsd_threshold=1e-2):
    """
    Lesson here: don't trust standards. Even Peter Gill's.
    Checks input orientation first, then standard.

    Return True when ``self.coordinates`` is close to ``other.coordinates``,
    or when ``self.std_coords`` is close to ``other.std_coords`` after some
    combination of per-column sign flips and column reordering.

    :param other: object exposing ``coordinates``, ``std_coords`` and
        ``filename`` (plus ``print_filename``/``print_summary`` for the most
        verbose output)
    :param rtol: relative tolerance forwarded to ``np.allclose``
    :param atol: absolute tolerance forwarded to ``np.allclose``
    :param debug: enables part of the diagnostic output (together with ``verbose``)
    :param n_letters: number of trailing filename characters shown in diagnostics
    :param verbose: diagnostic level; output starts above 3
    :param rmsd_threshold: with ``debug``, the closest non-matching standard
        orientation is printed when its RMSD is below this value (or when
        ``verbose`` > 4)
    :return: True if either comparison succeeds, otherwise False
    """
    # NOTE: this changes numpy's print options for the whole process, not just this call.
    np.set_printoptions(precision=5, linewidth=120, sign=' ', suppress=True, floatmode="fixed")

    def _similar(a, b):
        # Element-wise closeness using the tolerances given to the outer call.
        return np.allclose(a, b, rtol=rtol, atol=atol)

    # 1) Input orientation: succeed immediately if the raw coordinates agree.
    if _similar(self.coordinates, other.coordinates):
        if debug and verbose > 3:
            printgreen("Found similar coordinates.")
        return True

    # Diagnostics for the failed input-orientation comparison.
    if debug and verbose > 4:
        printred(f"\nA: {self.filename[-n_letters:]} B: {other.filename[-n_letters:]}")
    if verbose > 3:
        printyellow(f"{'Input orientation':^100}")
        self._diff_coordinates(self.coordinates, other.coordinates)
    if verbose > 4:
        self.print_filename()
        self.print_summary()
        other.print_filename()
        other.print_summary()

    # 2) Standard orientation: try all 8 sign combinations x 6 column orders (48 variants).
    sign = list(itertools.product([1, -1], repeat=3))
    arrangements = list(itertools.permutations(range(3)))
    per = list(itertools.product(sign, arrangements))
    # assumes std_coords is an (n, 3) numpy array — TODO confirm
    permutations = [(self.std_coords*sign_)[:,arr_] for sign_, arr_ in per]
    any_close = any(_similar(x, other.std_coords) for x in permutations)

    if not any_close and debug and verbose > 3:
        # _diff_coordinates is unpacked as a pair whose second item is used as the RMSD.
        _, rmsds = list(zip(*[self._diff_coordinates(x, other.std_coords, _print=False) for x in permutations]))
        min_rmsd_idx = np.argsort(rmsds)[0]
        min_rmsd = rmsds[min_rmsd_idx]
        # Show the best candidate when it is a near miss, or always at the highest verbosity.
        if min_rmsd < rmsd_threshold or verbose > 4:
            printyellow(f"{'Standard orientation':^100}")
            self._diff_coordinates(permutations[min_rmsd_idx], other.std_coords)
            printyellow(" -------- \n")
    return any_close
def try_print_files(self, outfiles, verbose=2):
    """
    Check that the ``dft``, ``mp2s``, ``mp2l`` and ``cc`` outputs are present
    and list their file names.

    An output counts as missing when ``getattr(self, name).filename`` cannot be
    read.

    :param outfiles: collection of input files; only its length is reported
        when something is missing
    :param verbose: above 3 the listing is printed even when nothing is missing
    :raises ValueError: if at least one of the four outputs is missing (the
        listing and the number of input files are printed first)
    """
    missing = 0
    rows = []
    for k in ["dft", "mp2s", "mp2l", "cc"]:
        try:
            filename = getattr(self, k).filename
        except Exception:
            # Absent attribute, None placeholder or a failing accessor all mean
            # "missing"; Ctrl-C is no longer swallowed here.
            filename = stylered("Missing")
            missing += 1
        rows.append(f" {k.upper():>5} : {filename}")
    listing = "\n".join(rows)

    if missing:
        printred(" Missing files!")
        print(listing)
        print(f"{len(outfiles)} input files.")
        raise ValueError(f"{missing} of {len(rows)} expected output files are missing")
    elif verbose > 3:
        print(listing)
def get_rj_queued(show=True):
    """
    Run the queue listing command and return its non-empty output lines.

    The command is ``"{SSH} {QUEUED}"`` filled in from the mapping returned by
    ``get_rj_env()`` and is executed through the shell.

    :param show: when truthy, print the lines, or a notice when there are none
    :return: list of stripped, non-empty output lines; empty if the command
        exits with a non-zero status or prints nothing
    """
    dct = get_rj_env()
    try:
        # NOTE(review): shell=True with a formatted string; presumably SSH and
        # QUEUED are trusted configuration values — confirm.
        raw = subprocess.check_output("{SSH} {QUEUED}".format(**dct), shell=True)
    except subprocess.CalledProcessError:
        raw = b""

    # Decode the bytes instead of slicing their repr: the repr turns non-ASCII
    # bytes, tabs and quotes into escape sequences, and cleaning it up with
    # strip('\\r') also removed real leading/trailing "r" and "\" characters.
    text = raw.decode("utf-8", errors="replace")
    # str.strip() also removes the "\r" of Windows-style line endings.
    jobs = [line.strip() for line in text.split("\n") if line.strip()]

    if show:
        if jobs:
            print("\n".join(jobs))
        else:
            printred("No jobs available.")
    return jobs
def get_annotationsMED(PMIDs, annotations_folder):
    """
    Download available EuropePMC annotations for all files in the list PMIDs

    A failure for one PMID (HTTP error status, timeout, connection problem) is
    reported with ``printred`` and the remaining PMIDs are still processed.

    :param PMIDs: list of Pubmed identifiers (strings, or values whose ``str()``
                  is the identifier)
    :param annotations_folder: Path to folder where the annotation files will
                               be saved; created, including parents, if it
                               does not exist
    """
    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=MED%3AXXXXX&format=JSON'
    request_timeout = 60  # seconds; without it a stalled server blocks forever

    # exist_ok avoids the check-then-create race; parents allows nested targets.
    annotations_folder.mkdir(parents=True, exist_ok=True)

    for el in PMIDs:
        el = str(el)  # integer identifiers would break the concatenations below
        path2file = annotations_folder.joinpath('PMID' + el + '.json')
        try:
            response = requests.get(url=url.replace('XXXXX', el),
                                    timeout=request_timeout)
        except requests.RequestException:
            # Network-level failure: report it and keep going with the batch.
            printred('Could not retrieve ' + el)
            continue
        if response.ok:
            # Explicit encoding: the platform default may be unable to encode
            # characters present in the annotations.
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)
def compare(self, other, n_char=20, ncol=10, verbose=0):
    """
    Compare this object with ``other`` attribute by attribute.

    The attributes ``solvate``, ``solvent``, ``charge``, ``multiplicity``,
    ``method``, ``basis``, ``n_atoms`` and ``energy`` are compared with ``==``.
    When the atom counts agree the element lists are compared as well, and a
    mismatch there makes the result False.  The coordinate comparison is only
    reported; it does not change the returned value.

    :param other: object providing the same attributes as ``self``
    :param n_char: width used to centre the two file names in the header
    :param ncol: column width for the printed attribute values
    :param verbose: when truthy, print a row per compared attribute
    :return: True if every compared attribute (and the element lists, when
        checked) is equal, otherwise False
    """
    def _compare(a, b, name):
        # Format spec chosen from the type name of ``a``; anything else is treated as str.
        fmt = dict(
            float = f">{ncol}.5f",
            int = f">{ncol}d",
            str = f">{ncol}",
        )
        # Booleans and None do not accept the numeric specs, so show them as text.
        if any(type(x) == bool or x == None for x in (a, b)):
            a, b = str(a), str(b)
        # NOTE(review): the spec derived from ``a`` is also applied to ``b``;
        # an int ``a`` with a float ``b`` would make the "d" spec fail — confirm
        # the attributes always share a type.
        f_a = fmt.get(type(a).__name__, fmt['str'])
        # Index 0 ("red") when the values differ, index 1 (None) when equal.
        diff = ["red", None ][a == b]
        if verbose:
            print(style(f"{name.capitalize():>15}: {a:{f_a}} {b:{f_a}}", diff))
        return a==b

    if verbose:
        printyellow(f"{self.filename:^{n_char}} {other.filename:^{n_char}}")
    same = True
    for attr in ["solvate", "solvent", "charge", "multiplicity", "method", "basis", "n_atoms", "energy"]:
        s_a = getattr(self, attr)
        o_a = getattr(other, attr)
        # Short-circuits: after the first mismatch _compare is no longer
        # called, so later attributes are neither compared nor printed.
        same = same and _compare(s_a, o_a, attr)
    if self.n_atoms == other.n_atoms:
        if self.elements == other.elements:
            if verbose:
                print("""\
 Elements match up.""")
                if self.similar_orientation_to(other, verbose=0):
                    print("""\
 Coordinates are similar.""")
                else:
                    # Unpacked as (diff, rmsd); both are only reported.
                    diff, rmsd = self._diff_coordinates(self.coordinates, other.coordinates, _print=False)
                    printred(f"""\
 Coordinates are not similar. RMSD: {rmsd:6.4f} Diff: {diff:6.4f}""")
        else:
            printred("Elements don't match.")
            # List the (self, other) element pairs that differ position by position.
            printred([(x, y) for x, y in zip(self.elements, other.elements) if x != y])
            same = False
    return same
def get_available_files(self, files, verbose=1, **kwargs):
    """
    Parse ``files`` and collect the distinct results in ``self.output_files``.

    ``self.output_files`` is reset to an empty list first.  Each file is read
    with ``read_output``; a parsed result is appended unless an already
    collected entry reports equality through its ``compare`` method.  Files
    that cannot be parsed are skipped.

    :param files: iterable of files handed to ``read_output`` one by one
    :param verbose: forwarded to ``read_output``; above 1 duplicates are
        reported, above 5 parse failures are reported
    :param kwargs: forwarded to ``read_output``
    """
    self.output_files = []
    for file in files:
        try:
            parsed = read_output(file, verbose=verbose, **kwargs)
            if parsed:
                matched = [x for x in self.output_files if x.compare(parsed)]
                if not matched:
                    self.output_files.append(parsed)
                elif verbose > 1:
                    printred(f"Not added: {file} is identical to {matched[0]}")
            elif verbose > 5:
                printred(f"Failed to parse: {file}")
        except Exception:
            # Skip unreadable files, but let Ctrl-C / SystemExit stop the batch
            # instead of merely skipping the file being processed.
            if verbose > 5:
                printred(f"Failed to parse: {file}")
corpus_file = corpus_dir.joinpath('EuropePMCAnnotations.txt') corpus_mallet = corpus_dir.joinpath('EuropePMCAnnotations.mallet') id_lema = [[el[0], [tk for tk in el[1] if tk in kept_words]] for el in id_lema] id_lema = [[el[0], stwEQ.cleanstr(' '.join(el[1])).split()] for el in id_lema] id_lema = [el for el in id_lema if len(el[1])>=min_lemas or el[0] in S2_base] #Check if all papers in base dataset are in the extended corpus #since they could have been removed if the lemmatizer did not provide a valid #output for them df = pd.read_csv(csv_file) S2_base = set(df['S2ID'].values.tolist()) S2_in = [el for el in id_lema if el[0] in S2_base] if len(S2_in)==len(S2_base): printgr('Todos los papers del dataset base se han incorporado al dataset extendido de Abstracts lematizados') else: printred('Se han perdido papers del dataset base ' + str(len(S2_in)) + ' / ' + str(len(S2_base))) print('Generating corpus, #papers:', len(id_lema)) with corpus_file.open('w', encoding='utf-8') as fout: [fout.write(el[0] + ' 0 ' + ' '.join(el[1]) + '\n') for el in id_lema] token_regexp=cf.get('CorpusGeneration','token_regexp') cmd = str(mallet_path) + \ ' import-file --preserve-case --keep-sequence ' + \ '--remove-stopwords --token-regex "' + token_regexp + '" ' + \ '--input %s --output %s' cmd = cmd % (corpus_file, corpus_mallet) try: print(f'-- -- Running command {cmd}')