Example #1
def read_wmt_bib() -> List[Paper]:
    result = []
    with open("downloads/2020.wmt-1.0.bib") as f:
        bib = database.parse_file(f)

        for i, entry in enumerate(bib.entries.values()):
            if entry.type == "book":
                continue

            title = LatexNodes2Text().latex_to_text(entry.fields["title"])
            url = entry.fields["url"]
            abstract = LatexNodes2Text().latex_to_text(
                entry.fields["abstract"])
            author = "|".join([
                " ".join(reversed(str(e).split(", ")))
                for e in entry.persons["author"]
            ])

            uid = url.replace("https://www.aclweb.org/anthology/", "")
            url = "https://www.statmt.org/wmt20/pdf/" + uid + ".pdf"

            paper = Paper(
                uid=f"WS-2.{uid}",
                ws_id="WS-2",
                title=title,
                authors=author,
                abstract=abstract,
                track="WS-2",
                kind="workshop",
                link=url,
            )
            result.append(paper)
    return result
Example #2
    def test_repl_doc_title(self):

        # test that \title/\author/\date work and produce something reasonable
        # (exact output might change in the future)

        self.assertEqualUpToWhitespace(
            LatexNodes2Text().latex_to_text(r"""
\title{The Title}
\author{The Author(s)}
\date{July 4, 2020}
\maketitle
"""), r"""
The Title
    The Author(s)
    July 4, 2020
=================
""")
        # missing all \title, \author, \date
        today = '{dt:%B} {dt.day}, {dt.year}'.format(
            dt=datetime.datetime.now())
        eqhrule = '=' * max(4 + len(r'[NO \author GIVEN]'), 4 + len(today))
        self.assertEqualUpToWhitespace(
            LatexNodes2Text().latex_to_text(r"""
\maketitle
"""), r"""
[NO \title GIVEN]
    [NO \author GIVEN]
    %(today)s
%(eqhrule)s
""" % {
                'today': today,
                'eqhrule': eqhrule
            })
Example #3
    def test_input(self):
        latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{test_input_1.tex}

MORENKFDNSN'''
        correct_text = r'''ABCDEF fdksanfkld safnkd anfklsa

hi there! This is an equation:

    x + y i = 0

where i is the imaginary unit.

MORENKFDNSN'''

        testdir = os.path.realpath(os.path.abspath(os.path.dirname(__file__)))

        l2t = LatexNodes2Text()
        l2t.set_tex_input_directory(testdir)

        output = l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0])

        self.assertEqualUpToWhitespace(output, correct_text)

        latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{test_input_1}

MORENKFDNSN'''

        self.assertEqualUpToWhitespace(
            l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
            correct_text)

        latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{../test_input_1}

MORENKFDNSN'''

        correct_text_unsafe = correct_text  # as before
        correct_text_safe = r'''ABCDEF fdksanfkld safnkd anfklsa

MORENKFDNSN'''

        # make sure that the \input{} directive failed to include the file.
        l2t = LatexNodes2Text()
        l2t.set_tex_input_directory(os.path.join(testdir, 'dummy'))
        self.assertEqualUpToWhitespace(
            l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
            correct_text_safe)
        # but without the strict_input flag, it can access it.
        l2t.set_tex_input_directory(os.path.join(testdir, 'dummy'),
                                    strict_input=False)
        self.assertEqualUpToWhitespace(
            l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
            correct_text_unsafe)
Example #4
def parse_steps(steps_page_source):
    """
    Go through all the children of the steps and extract the text
    """
    main_list = ["Steps"]
    # print(f"Page: *{steps_page_source}", sep='\n')
    # print("Page: ")
    # print(*steps_page_source, sep='\n')
    for child in steps_page_source:
        if main_list[-1] == "Plotting:":
            main_list = main_list[:-1]
            break
        try:
            if 'solution_step_result' in child['class']:
                text = child.find('span', class_='selectable').text
                req_text = LatexNodes2Text().latex_to_text(text)
                main_list.append(req_text)
                continue
        except Exception as e:
            print(e)

        try:
            if 'mathquill-embedded-latex' in child['class']:
                text = child.find('span', class_='selectable').text
                req_text = LatexNodes2Text().latex_to_text(text)
                main_list.append(req_text)
                continue
        except:
            pass

        try:
            # The class attribute is a list of CSS classes, so test membership.
            if 'solution_step_list_item' in child['class']:
                text = child.find('span', class_='selectable').text
                req_text = LatexNodes2Text().latex_to_text(text)
                main_list.append(req_text)
                continue
        except Exception:
            pass

        try:
            if 'solution_step_explanation' in child['class']:
                text = child.find('span', class_='selectable').text
                req_text = LatexNodes2Text().latex_to_text(text)
                main_list.append(req_text)
                continue
        except Exception:
            pass

    return main_list
Example #5
    def test_accents(self):
        self.assertEqual(
            LatexNodes2Text().nodelist_to_text(LatexWalker(r"Fran\c cais").get_latex_nodes()[0]),
            '''Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais'''
        )
        self.assertEqual(
            LatexNodes2Text().nodelist_to_text(LatexWalker(r"Fr\'en{\'{e}}tique").get_latex_nodes()[0]),
            '''Fr\N{LATIN SMALL LETTER E WITH ACUTE}n\N{LATIN SMALL LETTER E WITH ACUTE}tique'''
        )
Example #6
    def parseJsonFile(self):
        singleArticle = dict()
         
        for dic in self.data:
        
            author = self.getAuthor(dic)
            authorCount = self.getAuthorCount(dic)
            journal = self.getJournal(dic)
            title = self.getTitle(dic)
            year = self.getYear(dic)
            doi = self.getDoi(dic)
            collaboration = self.getCollaboration(dic)
            pages = self.getPages(dic)
            volume = self.getVolume(dic)
            eprint = self.getEprint(dic)
            abstract = self.getAbstract(dic)       
            id = self.getID(dic)     
            citationCount = self.getCitationCount(dic)


            if author is not None:
                singleArticle['FirstAuthor'] = author
            else:
                singleArticle['FirstAuthor'] = None
            if authorCount is not None:
                singleArticle['AuthorCount'] = authorCount
            if journal is not None:
                singleArticle['Journal'] = journal
            if title is not None:
                singleArticle['Title'] = LatexNodes2Text().latex_to_text(title)
            if year is not None:
                singleArticle['Year'] = int(year)
            else:
                singleArticle['Year'] = 0
            if doi is not None:
                singleArticle['Doi'] = doi
            if collaboration is not None:
                singleArticle['Collaboration'] = collaboration
            if pages is not None:
                singleArticle['Pages'] = pages
            if volume is not None:
                singleArticle['Volume'] = volume
            if eprint is not None:
                singleArticle['Eprint'] = eprint
            if abstract is not None:
                singleArticle['Summary'] = LatexNodes2Text().latex_to_text(abstract)
            if id is not None:
                singleArticle['Source'] = f'https://inspirehep.net/literature/{id}'
            if citationCount is not None:
                singleArticle['CitationCount'] = citationCount

            singleArticle['Bibtex'] = self.convertToBibtex(singleArticle)
            singleArticle['DB'] = "https://inspirehep.net/"
            self.ListOfArticles.append(singleArticle.copy())
            
            singleArticle.clear()
Example #7
    def test_math_alphabets(self):
        def gen_latex(macroname):
            return r"""
%s{-ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz-}
""".strip() % ('\\' + macroname)

        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathbf')),
            '-𝐀𝐁𝐂𝐃𝐄𝐅𝐆𝐇𝐈𝐉𝐊𝐋𝐌𝐍𝐎𝐏𝐐𝐑𝐒𝐓𝐔𝐕𝐖𝐗𝐘𝐙 𝐚𝐛𝐜𝐝𝐞𝐟𝐠𝐡𝐢𝐣𝐤𝐥𝐦𝐧𝐨𝐩𝐪𝐫𝐬𝐭𝐮𝐯𝐰𝐱𝐲𝐳-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathit')),
            '-𝐴𝐵𝐶𝐷𝐸𝐹𝐺𝐻𝐼𝐽𝐾𝐿𝑀𝑁𝑂𝑃𝑄𝑅𝑆𝑇𝑈𝑉𝑊𝑋𝑌𝑍 𝑎𝑏𝑐𝑑𝑒𝑓𝑔ℎ𝑖𝑗𝑘𝑙𝑚𝑛𝑜𝑝𝑞𝑟𝑠𝑡𝑢𝑣𝑤𝑥𝑦𝑧-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathsf')),
            '-𝖠𝖡𝖢𝖣𝖤𝖥𝖦𝖧𝖨𝖩𝖪𝖫𝖬𝖭𝖮𝖯𝖰𝖱𝖲𝖳𝖴𝖵𝖶𝖷𝖸𝖹 𝖺𝖻𝖼𝖽𝖾𝖿𝗀𝗁𝗂𝗃𝗄𝗅𝗆𝗇𝗈𝗉𝗊𝗋𝗌𝗍𝗎𝗏𝗐𝗑𝗒𝗓-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathbb')),
            '-𝔸𝔹ℂ𝔻𝔼𝔽𝔾ℍ𝕀𝕁𝕂𝕃𝕄ℕ𝕆ℙℚℝ𝕊𝕋𝕌𝕍𝕎𝕏𝕐ℤ 𝕒𝕓𝕔𝕕𝕖𝕗𝕘𝕙𝕚𝕛𝕜𝕝𝕞𝕟𝕠𝕡𝕢𝕣𝕤𝕥𝕦𝕧𝕨𝕩𝕪𝕫-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathtt')),
            '-𝙰𝙱𝙲𝙳𝙴𝙵𝙶𝙷𝙸𝙹𝙺𝙻𝙼𝙽𝙾𝙿𝚀𝚁𝚂𝚃𝚄𝚅𝚆𝚇𝚈𝚉 𝚊𝚋𝚌𝚍𝚎𝚏𝚐𝚑𝚒𝚓𝚔𝚕𝚖𝚗𝚘𝚙𝚚𝚛𝚜𝚝𝚞𝚟𝚠𝚡𝚢𝚣-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathcal')),
            '-𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵 𝒶𝒷𝒸𝒹ℯ𝒻ℊ𝒽𝒾𝒿𝓀𝓁𝓂𝓃ℴ𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathscr')),
            '-𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵 𝒶𝒷𝒸𝒹ℯ𝒻ℊ𝒽𝒾𝒿𝓀𝓁𝓂𝓃ℴ𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏-')
        self.assertEqual(
            LatexNodes2Text().latex_to_text(gen_latex('mathfrak')),
            '-𝔄𝔅ℭ𝔇𝔈𝔉𝔊ℌℑ𝔍𝔎𝔏𝔐𝔑𝔒𝔓𝔔ℜ𝔖𝔗𝔘𝔙𝔚𝔛𝔜ℨ 𝔞𝔟𝔠𝔡𝔢𝔣𝔤𝔥𝔦𝔧𝔨𝔩𝔪𝔫𝔬𝔭𝔮𝔯𝔰𝔱𝔲𝔳𝔴𝔵𝔶𝔷-')
Example #8
def tex_to_plain(tex):
    '''
    Try hard converting tex to unicode plain text.
    '''

    for reg, cate in (
        (r'_\{([^}]*?)\}', subscripts),
        (r'[\^]\{([^}]*?)\}', superscripts),
        (r'_(.)', subscripts),
        (r'[\^](.)', superscripts),
    ):
        pieces = []
        while True:
            match = re.search(reg, tex, flags=re.DOTALL | re.UNICODE)
            if match:
                chars = match.groups()[0]
                if all_in(chars, cate):
                    chars = [cate[x] for x in chars]
                else:
                    chars = tex[match.start():match.end()]
                pieces.append(tex[:match.start()])
                pieces.append(''.join(chars))
                tex = tex[match.end():]
            else:
                pieces.append(tex)
                break

        tex = ''.join(pieces)

    return LatexNodes2Text().latex_to_text(tex)
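
A minimal usage sketch, assuming subscripts / superscripts are character-to-Unicode lookup tables and all_in checks that every character has a mapping (these helpers are not shown above, so the definitions here are placeholders):

import re
from pylatexenc.latex2text import LatexNodes2Text

# Placeholder lookup tables and helper; the real ones are not part of the snippet.
subscripts = {'1': '₁', 'n': 'ₙ'}
superscripts = {'2': '²', '3': '³'}

def all_in(chars, table):
    # True if every character has a replacement in the table.
    return all(c in table for c in chars)

print(tex_to_plain(r"x_1^2 + y_{n}^{3} = \alpha"))
# Expected output, roughly: x₁² + yₙ³ = α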
Example #9
def window_results(results_list, keywords):
    root_tk = tk.Tk()
    scrollbar = tk.Scrollbar()
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    window = tk.Text(root_tk,
                     font=24,
                     width=100,
                     height=40,
                     spacing2=4,
                     padx=10,
                     pady=10,
                     wrap=tk.WORD)

    i = 1
    for result in results_list:
        window.insert(tk.END, str(i) + ".  " + result[0] + "\n")
        i += 1
    window.insert(tk.END, "\n")
    i = 1
    for result in results_list:
        for keyword, data in zip(keywords, result):
            if keyword == keywords[0]:
                window.insert(tk.END, str(i) + ".  " + keyword + "\n")
            else:
                window.insert(tk.END, "--> " + keyword + "\n")
            window.insert(tk.END, LatexNodes2Text().latex_to_text(data) + "\n")
        window.insert(tk.END, "================================= \n")
        i += 1

    window.pack(side=tk.LEFT, fill=tk.BOTH)
    scrollbar.config(command=window.yview)
    tk.mainloop()
Example #10
    def do_test(tex, uni, strict_latex_spaces=None, keep_comments=None, **kwargs):
        self.assertEqual(
            LatexNodes2Text(strict_latex_spaces=strict_latex_spaces, keep_comments=keep_comments,
                            keep_inline_math=False, **kwargs)
            .latex_to_text(tex, keep_inline_math=True, **kwargs),
            uni
        )
Example #11
    def test_accents(self):
        self.assertEqual(
            LatexNodes2Text().nodelist_to_text(
                LatexWalker(r"Fran\c cais").get_latex_nodes()[0]),
            '''Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais''')
        self.assertEqual(
            LatexNodes2Text().nodelist_to_text(
                LatexWalker(r"Fr\'en{\'{e}}tique").get_latex_nodes()[0]),
            '''Fr\N{LATIN SMALL LETTER E WITH ACUTE}n\N{LATIN SMALL LETTER E WITH ACUTE}tique'''
        )
        self.assertEqual(
            LatexNodes2Text(math_mode='with-delimiters').nodelist_to_text(
                LatexWalker(r"$1 \not= 2$").get_latex_nodes()[0]),
            '''$1 {} 2$'''.format(
                unicodedata.normalize('NFC',
                                      "=\N{COMBINING LONG SOLIDUS OVERLAY}")))
Example #12
    def test_empty_pars(self):

        self.assertEqual(
            LatexNodes2Text(fill_text=10,
                            strict_latex_spaces=True).latex_to_text(r"""
A car once was very fast.

Another car came by.  And then some space:



Note the few space tokens in the otherwise empty line above.
"""), r"""A car once
was very
fast.

Another
car came
by.  And
then some
space:

Note the
few space
tokens in
the
otherwise
empty line
above. """)
Example #13
    def _get(self, key, bibentry, compile_latex):
        if key in bibentry:
            value = bibentry[key]
            if compile_latex:
                return LatexNodes2Text().latex_to_text(value)
            return value
        else:
            raise KeyError(key)
Example #14
    def test(self):
        latex = r"""\textbf{Hi there!} Here is \emph{an equation}
                \begin{equation}
                \zeta = x + i y
                \end{equation}
                where $i$ is the imaginary unit.
                """
        return LatexNodes2Text().latex_to_text(latex)
Example #15
    def do_test(tex, uni, math_mode=None):
        kwargs = {}
        if math_mode is not None:
            kwargs['math_mode'] = math_mode
        self.assertEqual(LatexNodes2Text(strict_latex_spaces=True,
                                         **kwargs).latex_to_text(tex),
                         uni,
                         msg="For TeX=r'{}'".format(tex))
Example #16
    def test_keep_braced_groups(self):
        self.assertEqual(
            LatexNodes2Text(keep_braced_groups=True)
            .nodelist_to_text(LatexWalker(r"\textit{Voil\`a du texte}. Il est \'{e}crit {en fran{\c{c}}ais}")
                              .get_latex_nodes()[0]),
            '''Voil\N{LATIN SMALL LETTER A WITH GRAVE} du texte. Il est \N{LATIN SMALL LETTER E WITH ACUTE}crit {en fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais}'''
        )

        self.assertEqual(
            LatexNodes2Text(keep_braced_groups=True, keep_braced_groups_minlen=4)
            .nodelist_to_text(LatexWalker(r"A{XYZ}{ABCD}").get_latex_nodes()[0]),
            '''AXYZ{ABCD}'''
        )
        self.assertEqual(
            LatexNodes2Text(keep_braced_groups=True, keep_braced_groups_minlen=0)
            .nodelist_to_text(LatexWalker(r"{A}{XYZ}{ABCD}").get_latex_nodes()[0]),
            '''{A}{XYZ}{ABCD}'''
        )
Example #17
    def test_text_filling_InitEndPar(self):

        self.assertEqual(
            LatexNodes2Text(fill_text=True,
                            strict_latex_spaces=True).latex_to_text(r"""

  Hello \emph{world}.  % comment
more text.

"""), "\n\nHello world. more text.\n\n")

        self.assertEqual(
            LatexNodes2Text(fill_text=True,
                            strict_latex_spaces=True).latex_to_text(r"""
  Hello \emph{world}.  % comment
more text.

"""), "Hello world. more text.\n\n")
Example #18
def get_title_info(entry):
    """Try to guess title information from a publication."""
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        if '/Title' in pdf.documentInfo:
            return pdf.documentInfo['/Title']
        else:
            return None
    elif type(entry) is pybtex.database.Entry:
        if 'title' in entry.fields:
            return LatexNodes2Text().latex_to_text(entry.fields['title'])
        elif 'booktitle' in entry.fields:
            return LatexNodes2Text().latex_to_text(entry.fields['booktitle'])
        else:
            return None
    else:
        raise NotImplementedError("Can only handle pdf or bib objects (was %s)"
                                  % type(entry))
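
A hedged usage sketch for the bibliography branch, assuming only that pybtex is installed (the PDF branch additionally needs a PyPDF2 version that still provides PdfFileReader and documentInfo):

import pybtex.database

bib = pybtex.database.parse_string(
    r"@article{demo, title = {Th\'eorie des groupes}, author = {Someone}, year = {2020}}",
    "bibtex")
print(get_title_info(bib.entries["demo"]))
# Expected output, roughly: Théorie des groupes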
Example #19
    def _preprocess(self):
        # Defining the "None" value for the "NaN" values.
        self._dataframe.replace({np.nan: None}, inplace=True)

        # Removing unnecessary columns.
        columns_drop = ["methods", "conclusions", "results", "copyrights", "xml", "isbn",
                        "language", "publication_type", "sections", "publisher", "publisher_location"]
        self._dataframe.drop(axis=1, columns=columns_drop, inplace=True)

        # Getting the PubMed ID for each paper.
        self._dataframe.pubmed_id = self._dataframe.pubmed_id.apply(lambda x: x.split()[0].strip())

        # Normalizing the features "abstract" and "title".
        self._dataframe.abstract = self._dataframe.abstract.apply(
            lambda x: LatexNodes2Text().latex_to_text(
                re.sub(r"\s+", " ", re.sub("%", "\\%", x))) if x and len(x) > 0 else None)
        self._dataframe.title = self._dataframe.title.apply(
            lambda x: x.replace("\n", " ") if x and len(x) > 0 else None)

        # Setting the feature "keywords" as a tuple of keywords and
        # normalizing the keywords for each paper.
        self._dataframe.keywords.loc[self._dataframe.keywords.notnull()] = [
            tuple([ProcessPubmed.__clean_text(keyword) for keyword in eval(keywords)]) \
                if eval(keywords) else None
            for keywords in self._dataframe.keywords[self._dataframe.keywords.notnull()]]

        # Correcting the feature "authors".
        for idx, authors in enumerate(self._dataframe.authors):
            if not eval(authors):
                self._dataframe.authors[idx] = None
            else:
                list_authors = []
                for author in eval(authors):
                    auth = {}
                    if author["firstname"] and author["lastname"]:
                        auth["name"] = ProcessPubmed.__clean_text(
                            "{} {}".format(author["firstname"], author["lastname"]))
                    elif author["firstname"] and not author["lastname"]:
                        auth["name"] = ProcessPubmed.__clean_text(author["firstname"])
                    elif not author["firstname"] and author["lastname"]:
                        auth["name"] = ProcessPubmed.__clean_text(author["lastname"])

                    if "affiliation" in author:
                        auth["affiliation"] = ProcessPubmed.__clean_text(author["affiliation"])
                    else:
                        auth["affiliation"] = None
                    
                    if "name" in auth:
                        list_authors.append(auth)
                if list_authors:
                    self._dataframe.authors[idx] = tuple(list_authors)
                else:
                    self._dataframe.authors[idx] = None

        # Renaming the features "authors", "keywords" and "journal".
        self._dataframe.rename(columns={"authors": "author_affil", "keywords": "auth_keywords",
            "journal": "vehicle_name"}, inplace=True)
Example #20
def parse_LaTEX(t):                 # translates f from LaTeX into 'normal' text
    t = LatexNodes2Text().latex_to_text(t)
    t = t.replace("∫", "")      # adjusts a few more characters
    t = t.replace("=", "")
    t = t.replace("ds", "")
    t = t.replace("dx", "")
    t = regex.sub(r"f\([a-z]\)", "", t)     # RegEx
    t = regex.sub(r"d[a-z]", "", t)
    print("t:", t)
    return t
Example #21
def make_plain(text):
    """
    Detexify and asciify text

    :param str text: Text to make plain
    :returns: Text with all LaTeX sequences rendered to text and unicode\
        characters replaced
    :rtype: str
    """
    return unidecode(LatexNodes2Text().latex_to_text(text))
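
For illustration, a short usage sketch (assuming unidecode and pylatexenc are installed, as the surrounding module's imports would require):

print(make_plain(r"Fran\c cais, d\'ej\`a vu"))
# Expected output, roughly: Francais, deja vu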
Example #22
    def test_repl_matrix_environment(self):

        for env, arg in (('array', '{lll}'), ('pmatrix', ''), ('bmatrix', ''),
                         ('smallmatrix', '')):
            self.assertEqualUpToWhitespace(
                LatexNodes2Text().latex_to_text(
                    r"\begin{%(env)s}%(arg)s1 &   2 & abcdef\\ 3 & 4\end{%(env)s}"
                    % {
                        'env': env,
                        'arg': arg
                    }), "[      1      2 abcdef;      3      4 ]")
Example #23
    def test_repl_eqn(self):

        for env in ('equation', 'equation*', 'eqnarray', 'eqnarray*', 'align',
                    'align*', 'multline', 'multline*', 'gather', 'gather*',
                    'dmath', 'dmath*'):

            self.assertEqualUpToWhitespace(
                LatexNodes2Text(
                    strict_latex_spaces='except-in-equations').latex_to_text(
                        r"\begin{%(env)s} e \approx 2.718 \end{%(env)s}" %
                        {'env': env}), u"e ≈ 2.718")
Example #24
def format_title(title):
    """format the publication title"""
    logger.info(f"... formatting title \"{title}\"")
    title = title.replace("\\sqrt s", "\\sqrt{s}")
    title = title.replace(" sqrts ", " \\sqrt{s} ")
    title = title.replace(" \\bar{", "\\bar{")
    title = title.replace("\\smash[b]", "")
    title = title.replace("\\smash [b]", "")
    title = title.replace("\\mbox{", "{")
    title = title.replace("{\\rm ", "{")
    title = title.replace("{\\rm\\scriptscriptstyle ", "{")
    title = title.replace("\\kern -0.1em ", "")
    title = title.replace("$~\\mathrm{", "~$\\mathrm{")
    if re.search(r"rightarrow\S", title):
        title = title.replace("rightarrow", "rightarrow ")
    # fix overline without space
    overline = re.search(r"overline\s([a-zA-Z])", title)
    if overline:
        title = title.replace(f"overline {overline.group(1)}",
                              "overline{%s}" % overline.group(1))
    title = title.replace(" \\overline{", "\\overline{")
    # fix "{\mathrm XXX}" to "\mathrm{XXX}"
    mathrm = re.search(r"{\\mathrm (.*)}", title)
    if mathrm:
        title = title.replace(f"\\mathrm {mathrm.group(1)}",
                              "\\mathrm{%s}" % mathrm.group(1))
    # overline{D} gives problems when in mathrm
    title = title.replace("\\overline{D", "\\bar{D")
    try:
        text_title = LatexNodes2Text().latex_to_text(title)
    except LatexWalkerError as identifier:
        logger.error(f"LatexWalkerError in {identifier}")
        text_title = title
    logger.debug(f"... text title {text_title}")
    # Convert some of remaining text to unicode
    text_title = convert_to_unicode(text_title)
    # insert spaces before and after the following characters
    char_with_spaces = ["=", "→"]
    for my_char in char_with_spaces:
        pat = re.compile(r"\s?%s\s?" % my_char)
        text_title = re.sub(pat, " %s " % my_char, text_title)
    # insert space before eV/keV/MeV/GeV/TeV in case of wrong formatting
    text_title = re.sub(r"(\d)([kMGT]?eV)", r"\1 \2", text_title)
    # reduce all spaces to a maximum of one
    text_title = re.sub(r"\s+", " ", text_title)
    # reduce all underscores to a maximum of one
    text_title = re.sub(r"_+", "_", text_title)
    # reduce all hyphens to a maximum of one
    text_title = re.sub(r"-+", "-", text_title)
    # remove space before comma
    text_title = text_title.replace(" ,", ",")
    # merge s_NN
    text_title = text_title.replace("s_ NN", "s_NN").strip()
    return text_title
Example #25
def read_findings_bib():
    with open("downloads/2020.findings-EMNLP.0.bib") as f:
        bib = database.parse_file(f)

        uids = []
        titles = []
        abstracts = []
        authors = []
        urls = []

        for i, entry in enumerate(bib.entries.values()):
            if entry.type == "book":
                continue

            title = LatexNodes2Text().latex_to_text(entry.fields["title"])
            url = entry.fields["url"]
            abstract = LatexNodes2Text().latex_to_text(entry.fields["abstract"])
            author = "|".join(
                [
                    " ".join(reversed(str(e).split(", ")))
                    for e in entry.persons["author"]
                ]
            )

            uids.append(f"findings.{i}")
            titles.append(title)
            abstracts.append(abstract)
            authors.append(author)
            urls.append(url)

        data = {
            "UID": uids,
            "title": titles,
            "abstract": abstracts,
            "authors": authors,
            "pdf_url": urls,
        }

        df = pd.DataFrame(data)

        df.to_csv("yamls/findings_papers.csv", index=False)
Example #26
def decode(entry: Entry) -> Entry:
    """Decode a dictionary with LaTeX strings into a dictionary with unicode strings."""
    translator = LatexNodes2Text()
    # Perform a deepcopy, otherwise the input entry will get altered
    out = deepcopy(entry)
    assert out.fields is not None
    for key, value in out.fields.items():
        if key == "url":
            # The url can contain special LaTeX characters (like %) and that's fine
            continue
        out.fields[key] = translator.latex_to_text(value)
    return out
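
A small usage sketch, assuming Entry here is pybtex.database.Entry (the import is not shown in the snippet, so this is a guess):

from pybtex.database import Entry

raw = Entry("article", fields={
    "title": r"The {G}\"odel incompleteness theorems",
    "url": "https://example.org/paper%20one.pdf",
})
clean = decode(raw)
print(clean.fields["title"])  # roughly: The Gödel incompleteness theorems
print(clean.fields["url"])    # unchanged; the '%' is left alone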
Example #27
def fix_utf8_field(entry, field, args):
    if field not in entry:
        return entry

    value = entry[field]
    if field == args.utf8:
        value = LatexNodes2Text().latex_to_text(value)
    elif field == args.latex:
        value = unicode_to_latex(value)
    entry[field] = value

    return entry
Example #28
    def print(self, line, params):
        def to_str(name, kind):
            obj = params[name]
            if kind == 'degree':
                return degree_to_string(obj)
            return str(obj)

        for match in re.finditer('%{(?P<type>[^:}]*):(?P<name>[^}]*)}', line):
            line = line.replace(
                match.group(0), to_str(match.group('name'),
                                       match.group('type')))
        return LatexNodes2Text().latex_to_text(line)
Example #29
def decode_latex(latex_text):
    """Decode latex text.

    Args:
        latex_text (str): a latex text.

    Returns:
        str: the latex text decoded.
    """
    if not isinstance(latex_text, text_type):
        latex_text = text_type(latex_text, 'utf8')

    return LatexNodes2Text().latex_to_text(latex_text)
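
A quick usage sketch (text_type is presumably six.text_type, i.e. str on Python 3; its import is not shown):

print(decode_latex(r'Schr\"odinger, $\alpha$-decay, \pm 0.5'))
# Expected output, roughly: Schrödinger, α-decay, ± 0.5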
Example #30
    def __convert(self, lecture):
        path = self.prefix + lecture.path
        if not os.path.exists(path):
            print("File not found: {}".format(path))
            return
        try:
            with open(path, 'r') as f:
                lecture.content = LatexNodes2Text().latex_to_text(f.read())
            print(lecture.url)
        except Exception as e:
            print("Skipping due to {}".format(e))

        return lecture