示例#1
0
    def calculate_solvent_options(self, ref_solv=None, verbose=1):
        """Derive solvation quantities for each vacuum/solvated pair in ``self.grouped``.

        Within every group the first entry with a falsy ``solvate`` is paired
        with the first entry with a truthy ``solvate``.  The results are
        written into the solvated entry's ``dct``:

        * ``energy_au``: solvated ``dft`` minus vacuum ``dft``
        * ``energy_kj``: ``energy_au`` multiplied by ``HARTREE_TO_KJ_MOL``
        * ``g_solv_dir``: solvated ``dG`` minus vacuum ``dG``
        * ``direct_vs_indirect``: ``g_solv_dir`` minus ``energy_kj``
        * ``G_solv_``: ``energy_kj`` plus vacuum ``dG``

        :param ref_solv: optional mapping providing ``G_solv_`` and ``dG``;
            when truthy, ``self_ref_indirect`` and ``self_ref_direct`` are
            stored as the sums with those reference values.
        :param verbose: when truthy, print the files of each matched pair, or
            a failure report for groups that could not be paired.
        """
        rule = "             ***********************"
        printdarkcyan("\n" + rule)

        for members in self.grouped:
            try:
                # Indexing an empty list raises IndexError, which routes the
                # group to the "failed to match" report below.
                gas_phase = [m for m in members if not m.solvate][0]
                in_solvent = [m for m in members if m.solvate][0]

                in_solvent.dct['energy_au'] = (
                    in_solvent.dct['dft'] - gas_phase.dct['dft']
                )
                in_solvent.dct['energy_kj'] = (
                    in_solvent.dct['energy_au'] * HARTREE_TO_KJ_MOL
                )
                in_solvent.dct['g_solv_dir'] = (
                    in_solvent.dct['dG'] - gas_phase.dct['dG']
                )
                in_solvent.dct['direct_vs_indirect'] = (
                    in_solvent.dct['g_solv_dir'] - in_solvent.dct['energy_kj']
                )
                in_solvent.dct['G_solv_'] = (
                    in_solvent.dct['energy_kj'] + gas_phase.dct['dG']
                )

                if ref_solv:
                    in_solvent.dct['self_ref_indirect'] = (
                        in_solvent.dct['G_solv_'] + ref_solv['G_solv_']
                    )
                    in_solvent.dct['self_ref_direct'] = (
                        in_solvent.dct['dG'] + ref_solv['dG']
                    )

                if verbose:
                    for entry in (gas_phase, in_solvent):
                        entry.print_files()
                    printdarkcyan(rule)

            except (ValueError, IndexError):
                if not verbose:
                    continue
                printred("Failed to match to vacuum and solvent.")
                for entry in members:
                    entry.print_files()
                    print("")
                printdarkcyan(rule)
示例#2
0
def get_pdf_files(PMClist_file, pdf_folder):
    """
    Fetch the PDF rendition of every PMC listed in PMClist_file from EuropePMC
    and store each one as <PMC>.pdf inside pdf_folder
    :param PMClist_file: Path to text file with one PMC identifier per line
    :param pdf_folder: Path to folder where the pdf files will be saved
                       (created when it does not exist yet)
    """

    url_template = 'https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMCXXXXX&blobtype=pdf'

    if not pdf_folder.exists():
        pdf_folder.mkdir()

    # Strip every line and drop the ones that end up empty
    with PMClist_file.open() as fin:
        stripped_lines = list(map(str.strip, fin.readlines()))
    pmcs = [pmc for pmc in stripped_lines if pmc]

    for pmc in pmcs:
        target = pdf_folder.joinpath(pmc + '.pdf')
        response = requests.get(url=url_template.replace('PMCXXXXX', pmc))

        if not response.ok:
            printred('Could not retrieve ' + pmc)
            continue

        # PDF payload is binary, so write the raw bytes
        with target.open('wb') as fout:
            fout.write(response.content)
        printgr('Correctly processed ' + pmc)
示例#3
0
def get_xml_files(PMClist_file, xml_folder):
    """
    Download fulltext XML files for all PMCs in the file PMClist_file
    :param PMClist_file: Path to text file with full list of PMCs for dataset
                         (one identifier per line; blank lines are ignored)
    :param xml_folder: Path to folder where the fulltext XML files will be saved
                       (created when it does not exist yet)

    Each document is saved as <PMC>.xml, encoded as UTF-8.
    """

    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/PMCXXXXX/fullTextXML'

    if not xml_folder.exists():
        xml_folder.mkdir()

    with PMClist_file.open() as fin:
        pmcs = [el.strip() for el in fin.readlines()]
    pmcs = [el for el in pmcs if len(el)]

    for el in pmcs:
        path2file = xml_folder.joinpath(el + '.xml')
        response = requests.get(url=url.replace('PMCXXXXX', el))

        if response.ok:
            # Write with an explicit encoding: the platform default (e.g.
            # cp1252) cannot represent every character found in article text
            # and would make the write fail or depend on the machine locale.
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
示例#4
0
def get_annotations(PMClist_file, annotations_folder):
    """
    Download available EuropePMC annotations for all PMCs in the file PMClist_file
    :param PMClist_file: Path to text file with full list of PMCs for dataset
                         (one identifier per line; blank lines are ignored)
    :param annotations_folder: Path to folder where the annotation files will be saved
                               (created when it does not exist yet)

    Each response is saved as <PMC>.json, encoded as UTF-8.
    """

    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=PMC%3AXXXXX&format=JSON'

    if not annotations_folder.exists():
        annotations_folder.mkdir()

    with PMClist_file.open() as fin:
        pmcs = [el.strip() for el in fin.readlines()]
    pmcs = [el for el in pmcs if len(el)]

    for el in pmcs:
        path2file = annotations_folder.joinpath(el + '.json')
        # The API expects the identifier without its 'PMC' prefix. Taking the
        # last piece of the split also works for an entry that lacks the
        # prefix, instead of raising IndexError and aborting the whole batch.
        article_number = el.split('PMC')[-1]
        response = requests.get(url=url.replace('XXXXX', article_number))

        if response.ok:
            # Explicit encoding so the result does not depend on the locale
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
示例#5
0
    def get_dft_details(self, verbose=0, scale=False, **kwargs):
        """Extract thermochemistry data from ``self.contents`` (best effort).

        Sets ``thermochemistry``, ``temperature``, ``mass_total``,
        ``mass_total_kg`` and ``mass_atoms`` on the instance, copies the
        selected scaling factors onto the instance as attributes, and then
        runs the frequency check and the entropy, zero-point, HLC and
        thermal-correction steps.

        Any ordinary failure along the way (for example a missing
        thermochemistry section, or the ``ValueError`` raised by
        ``check_frequencies``) stops the extraction without raising; the
        attributes assigned up to that point are kept.

        :param verbose: values above 2 print progress details and the reason
            for an aborted extraction.
        :param scale: when truthy, look up scaling factors for
            ``self.method`` / ``self.basis`` instead of the ``"DEFAULT"`` set
            (falling back to the default set if none are registered).
        :param kwargs: accepted for call compatibility; not used here.
        """
        try:
            # Last section matched by the pattern, ignoring the first match
            thermochem = section_by_pattern(self.contents, pattern="Thermochemistry")[1:][-1]
            self.thermochemistry = thermochem
            if verbose > 2:
                print("Thermochemistry found.")

            factors = scaling_factors["DEFAULT"]
            if scale:
                try:
                    factors = scaling_factors[self.method][self.basis]
                except KeyError:
                    printred(f"{self.method} {self.basis} scaling factors not specified yet.")
            # Expose every scaling factor as an instance attribute
            self.__dict__.update(factors)
            if verbose > 2:
                print("\n".join([f"{k:>20}: {v}" for k, v in factors.items()]))

            self.temperature = float(self.search("Temperature", lines=thermochem).split()[1])
            self.mass_total = float(self.search("Molecular mass", lines=thermochem).split(":")[1].strip().split()[0])
            self.mass_total_kg = self.mass_total*AMU_TO_KG
            atom_lines = [x for x in thermochem if "Atom " in x]
            # The mass is the last whitespace-separated token of each atom line
            self.mass_atoms = [float(x.strip().split()[-1]) for x in atom_lines]
            if verbose > 2:
                print(f"""\
               Temperature: {self.temperature} K
            Molecular mass: {self.mass_total_kg} kg \
                    """)

            self.check_frequencies(verbose=verbose)
            self.get_entropy(verbose=verbose)
            self.get_zero_point(verbose=verbose)
            self.get_hlc(verbose=verbose)
            self.get_thermal_correction(verbose=verbose)
        except Exception as err:
            # Deliberately best-effort, but only for ordinary errors:
            # KeyboardInterrupt / SystemExit must still propagate.
            if verbose > 2:
                printred(f"Thermochemistry details not extracted: {err!r}")
示例#6
0
 def check_frequencies(self, verbose=3, **kwargs):
     """Load the frequencies and reject structures with non-positive ones.

     Calls ``self.get_frequencies()`` and then inspects ``self.frequencies``.

     :param verbose: above 1 confirms a clean result, any truthy value lists
         the negative frequencies on failure, above 5 prints all frequencies.
     :param kwargs: accepted for call compatibility; not used here.
     :raises ValueError: if any frequency is not strictly positive.
     """
     self.get_frequencies()

     if not all(self.frequencies > 0):
         printred(f"   {self.base_name} has imaginary frequencies.")
         if verbose:
             negatives = [freq for freq in self.frequencies if freq < 0]
             printred(negatives)
         raise ValueError

     if verbose > 1:
         print(f"   {self.base_name} has no imaginary frequencies.")
     if verbose > 5:
         print(f"    Frequencies:{self.frequencies}")
示例#7
0
    def similar_orientation_to(self, other, rtol=1e-5, atol=1e-4, debug=False,
                               n_letters=50, verbose=2, rmsd_threshold=1e-2):
        """Tell whether ``other`` has the same geometry up to axis flips/swaps.

        The input-orientation coordinates are compared first.  If they differ,
        every combination of axis sign flips and axis permutations of
        ``self.std_coords`` is compared against ``other.std_coords``.

        :param rtol: relative tolerance passed to ``np.allclose``.
        :param atol: absolute tolerance passed to ``np.allclose``.
        :param debug: enables the diagnostic printing (together with verbose).
        :param n_letters: number of trailing filename characters shown in
            the diagnostics.
        :param verbose: diagnostics are printed above 3, more above 4.
        :param rmsd_threshold: near misses below this RMSD are reported when
            diagnostics are on.
        :return: True if any comparison is within tolerance, else False.

        Note: this changes NumPy's global print options on every call.
        """
        np.set_printoptions(precision=5, linewidth=120, sign=' ', suppress=True, floatmode="fixed")

        report = debug and verbose > 3
        report_more = debug and verbose > 4

        if np.allclose(self.coordinates, other.coordinates, rtol=rtol, atol=atol):
            if report:
                printgreen("Found similar coordinates.")
            return True

        if report_more:
            # verbose > 4 here, so both the coordinate diff and the summaries
            # are always shown together.
            printred(f"\nA: {self.filename[-n_letters:]}       B: {other.filename[-n_letters:]}")
            printyellow(f"{'Input orientation':^100}")
            self._diff_coordinates(self.coordinates, other.coordinates)
            for molecule in (self, other):
                molecule.print_filename()
                molecule.print_summary()

        # All 8 sign flips x 6 axis orders of the standard orientation,
        # with the sign flip as the outer (slowest varying) choice.
        axis_orders = list(itertools.permutations(range(3)))
        candidates = []
        for flip in itertools.product([1, -1], repeat=3):
            flipped = self.std_coords * flip
            for order in axis_orders:
                candidates.append(flipped[:, order])

        any_close = any(
            np.allclose(candidate, other.std_coords, rtol=rtol, atol=atol)
            for candidate in candidates
        )

        if report and not any_close:
            rmsds = [
                self._diff_coordinates(candidate, other.std_coords, _print=False)[1]
                for candidate in candidates
            ]
            best = np.argsort(rmsds)[0]

            if rmsds[best] < rmsd_threshold or verbose > 4:
                printyellow(f"{'Standard orientation':^100}")
                self._diff_coordinates(candidates[best], other.std_coords)
                printyellow("        --------     \n")

        return any_close
示例#8
0
 def try_print_files(self, outfiles, verbose=2):
     """List the filenames of the four expected calculations, or fail.

     Looks up ``self.dft``, ``self.mp2s``, ``self.mp2l`` and ``self.cc`` and
     reads the ``filename`` of each.  A calculation whose filename cannot be
     read is shown as "Missing".

     :param outfiles: collection of input files; only its length is reported
         when something is missing.
     :param verbose: above 3, the listing is printed even when complete.
     :raises ValueError: if at least one of the four filenames is unavailable.
     """
     missing = 0
     rows = []
     for k in ["dft", "mp2s", "mp2l", "cc"]:
         try:
             filename = getattr(self, k).filename
         except Exception:
             # Absent attribute, None placeholder, failing property, ...
             # (KeyboardInterrupt / SystemExit are intentionally not caught)
             filename = stylered("Missing")
             missing += 1
         rows.append(f"             {k.upper():>5} : {filename}")
     listing = "\n".join(rows)

     if missing:
         printred("\n             Missing files!")
         print(listing)
         print(f"{len(outfiles)} input files.")
         raise ValueError(f"{missing} of the 4 expected output files are missing")
     elif verbose > 3:
         print(listing)
示例#9
0
def get_rj_queued(show=True):
    """Return the non-empty output lines of the remote queue listing.

    Runs, through the shell, the command formed by the ``SSH`` and ``QUEUED``
    entries of ``get_rj_env()``.  A non-zero exit status is treated as an
    empty listing.

    :param show: when truthy, print the lines (or a notice if there are none).
    :return: list of stripped, non-empty output lines.
    """
    dct = get_rj_env()
    try:
        raw = subprocess.check_output("{SSH} {QUEUED}".format(**dct),
                                      shell=True)
    except subprocess.CalledProcessError:
        raw = b""

    # Decode the bytes instead of slicing their repr: the repr keeps tabs and
    # non-ASCII bytes as escape sequences, and cleaning it up with
    # strip('\\r') removed real leading/trailing "r" and backslash characters
    # from job lines.
    text = raw.decode("utf-8", errors="replace")
    stripped = (line.strip() for line in text.splitlines())
    jobs = [line for line in stripped if line]

    if show:
        if jobs:
            print("\n".join(jobs))
        else:
            printred("No jobs available.")
    return jobs
示例#10
0
def get_annotationsMED(PMIDs, annotations_folder):
    """
    Download available EuropePMC annotations for all files in the list PMIDs
    :param PMIDs: list of Pubmed identifiers (strings or integers)
    :param annotations_folder: Path to folder where the annotation files will be saved
                               (created when it does not exist yet)

    Each response is saved as PMID<identifier>.json, encoded as UTF-8.
    """

    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=MED%3AXXXXX&format=JSON'

    if not annotations_folder.exists():
        annotations_folder.mkdir()

    for el in PMIDs:
        # Identifiers frequently arrive as integers (e.g. read from a
        # DataFrame); normalise so the string operations below cannot fail.
        pmid = str(el)
        path2file = annotations_folder.joinpath('PMID' + pmid + '.json')
        response = requests.get(url=url.replace('XXXXX', pmid))

        if response.ok:
            # Explicit encoding so the result does not depend on the locale
            with path2file.open('w', encoding='utf-8') as f:
                f.write(response.text)
            printgr('Correctly processed ' + pmid)
        else:
            printred('Could not retrieve ' + pmid)

    return
示例#11
0
    def compare(self, other, n_char=20, ncol=10, verbose=0):
        """Check whether ``other`` describes the same calculation as ``self``.

        The attributes solvate, solvent, charge, multiplicity, method, basis,
        n_atoms and energy are compared in that order; once one differs, the
        remaining ones are no longer compared or printed.  When the atom
        counts agree, the element lists must agree as well.  Coordinates are
        only inspected for the verbose report and never affect the result.

        :param other: object exposing the same attributes as ``self``.
        :param n_char: width used to centre each filename in the header.
        :param ncol: column width of the printed values.
        :param verbose: when truthy, print the side-by-side comparison.
        :return: True if all compared attributes (and elements) match.
        """
        fmt = {
            "float": f">{ncol}.5f",
            "int": f">{ncol}d",
            "str": f">{ncol}",
        }

        def _compare(a, b, name):
            # Booleans and None are compared (and shown) by their text form
            if any(isinstance(x, bool) or x is None for x in (a, b)):
                a, b = str(a), str(b)

            equal = a == b
            if verbose:
                # Each value gets the spec for its own type: reusing a's spec
                # for b fails when, e.g., a is an int and b a float.
                f_a = fmt.get(type(a).__name__, fmt["str"])
                f_b = fmt.get(type(b).__name__, fmt["str"])
                colour = None if equal else "red"
                print(style(f"{name.capitalize():>15}: {a:{f_a}}  {b:{f_b}}", colour))
            return equal

        if verbose:
            printyellow(f"{self.filename:^{n_char}} {other.filename:^{n_char}}")

        same = True
        for attr in ["solvate", "solvent", "charge", "multiplicity", "method", "basis", "n_atoms", "energy"]:
            s_a = getattr(self, attr)
            o_a = getattr(other, attr)

            same = same and _compare(s_a, o_a, attr)

        if self.n_atoms == other.n_atoms:
            if self.elements == other.elements:
                if verbose:
                    print("""\
                    Elements match up.""")

                    if self.similar_orientation_to(other, verbose=0):
                        print("""\
                    Coordinates are similar.""")
                    else:
                        diff, rmsd = self._diff_coordinates(self.coordinates, other.coordinates, _print=False)
                        printred(f"""\
                    Coordinates are not similar.
                    RMSD: {rmsd:6.4f}    Diff: {diff:6.4f}""")

            else:
                printred("Elements don't match.")
                printred([(x, y) for x, y in zip(self.elements, other.elements) if x != y])
                same = False
        return same
示例#12
0
 def get_available_files(self, files, verbose=1, **kwargs):
     """Parse ``files`` and keep one parsed result per distinct calculation.

     ``self.output_files`` is reset and then filled with every successfully
     parsed file that does not compare equal (via ``compare``) to one that
     was already kept.  Files that fail to parse are skipped.

     :param files: iterable of files handed to ``read_output``.
     :param verbose: above 1 reports skipped duplicates, above 5 reports
         files that could not be parsed; also forwarded to ``read_output``.
     :param kwargs: forwarded to ``read_output``.
     """
     self.output_files = []
     for path in files:
         try:
             parsed = read_output(path, verbose=verbose, **kwargs)
             if not parsed:
                 if verbose > 5:
                     printred(f"Failed to parse: {path}")
                 continue

             duplicates = [kept for kept in self.output_files if kept.compare(parsed)]
             if not duplicates:
                 self.output_files.append(parsed)
             elif verbose > 1:
                 printred(f"Not added: {path} is identical to {duplicates[0]}")
         except Exception:
             # Best-effort: one unreadable file must not stop the scan, but
             # KeyboardInterrupt / SystemExit are allowed to propagate.
             if verbose > 5:
                 printred(f"Failed to parse: {path}")
示例#13
0
    corpus_file = corpus_dir.joinpath('EuropePMCAnnotations.txt')
    corpus_mallet = corpus_dir.joinpath('EuropePMCAnnotations.mallet')
    id_lema = [[el[0], [tk for tk in el[1] if tk in kept_words]] for el in id_lema]
    id_lema = [[el[0], stwEQ.cleanstr(' '.join(el[1])).split()] for el in id_lema]
    id_lema = [el for el in id_lema if len(el[1])>=min_lemas or el[0] in S2_base]
    #Check if all papers in base dataset are in the extended corpus
    #since they could have been removed if the lemmatizer did not provide a valid
    #output for them
    df = pd.read_csv(csv_file)
    S2_base = set(df['S2ID'].values.tolist())
    S2_in = [el for el in id_lema if el[0] in S2_base]
    if len(S2_in)==len(S2_base):
        printgr('Todos los papers del dataset base se han incorporado al dataset extendido de Abstracts lematizados')
    else:
        printred('Se han perdido papers del dataset base ' + str(len(S2_in)) + ' / ' + str(len(S2_base)))

    print('Generating corpus, #papers:', len(id_lema))

    with corpus_file.open('w', encoding='utf-8') as fout:
        [fout.write(el[0] + ' 0 ' + ' '.join(el[1]) + '\n') for el in id_lema]

    token_regexp=cf.get('CorpusGeneration','token_regexp')
    cmd = str(mallet_path) + \
              ' import-file --preserve-case --keep-sequence ' + \
              '--remove-stopwords --token-regex "' + token_regexp + '" ' + \
              '--input %s --output %s'
    cmd = cmd % (corpus_file, corpus_mallet)

    try:
        print(f'-- -- Running command {cmd}')