@classmethod
def setUpClass(cls):
    cls.ns = {'ltx': 'http://dlmf.nist.gov/LaTeXML'}
    cls.xml1 = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')

    cls.xml2 = px.DefinitionsXML(
        'tests/latexmled_files/enumerate_forms.xml')
    cls.def_text = cls.xml2.get_def_text()

    cls.xml_lst1 = cls.xml1.exml.findall('.//ltx:p', namespaces=cls.ns)
    cls.html1 = px.DefinitionsXML('tests/latexmled_files/1501.06563.html')
    cls.html2 = px.DefinitionsXML(
        'tests/latexmled_files/1501.06563_shortened.html')
    cls.html_lst1 = cls.html1.exml.findall('.//p', namespaces=cls.ns)

def test_exact_tokenize1(self):
    dtest = px.DefinitionsXML('tests/latexmled_files/math.0402243.xml')
    str1 = '''une orbifolde pure est un espace analytique complexe
        normal _inline_math_  n’ayant que des singularités\nquotient.'''
    str2 = dtest.get_def_text()[0].lower()

    self.assertEqual(nltk.word_tokenize(str1), nltk.word_tokenize(str2))

def test_DefinitionXML_sampling(self):
    dd = px.DefinitionsXML(
        'tests/latexmled_files/minimal_example_with_defs.xml')
    sample_dict = dd.get_def_sample_text_with(sample_size=4)
    self.assertEqual(len(sample_dict['real']), 2)
    self.assertEqual(len(sample_dict['nondef']), 1)
    self.assertIn('This is an example document.', sample_dict['nondef'][0])
Example #4
def parse_clf_chunk(file_obj, clf, bio, vzer, tokr):
    '''
    Runs the classifier and chunker on the file_obj
    file_obj: file object

    clf, bio, vzer, tokr: pickled classifiers and tokenizer
    '''
    px = parsing_xml.DefinitionsXML(file_obj)
    ddum = Definiendum(px, clf, bio, vzer, tokr)
    return ddum.root
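
A minimal usage sketch for parse_clf_chunk. The pickle file names below are hypothetical (only chunker.pickle appears elsewhere on this page), and the models are assumed to unpickle into the clf, bio, vzer, tokr objects the function expects:

import pickle

from lxml import etree


def load_pickle(path):
    # Small helper: unpickle one object from path.
    with open(path, 'rb') as pf:
        return pickle.load(pf)


clf = load_pickle('PickleJar/classifier.pickle')   # hypothetical path
bio = load_pickle('PickleJar/chunker.pickle')      # hypothetical path
vzer = load_pickle('PickleJar/vectorizer.pickle')  # hypothetical path
tokr = load_pickle('PickleJar/tokenizer.pickle')   # hypothetical path

with open('tests/latexmled_files/1501.06563.xml') as xml_f:
    branch = parse_clf_chunk(xml_f, clf, bio, vzer, tokr)
print(etree.tostring(branch, pretty_print=True).decode('utf8'))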

def test_exact_tokenize3(self):
    dtest = px.DefinitionsXML('tests/latexmled_files/math.0407523.xml')
    list1 = [
        'a', 'coherent', 'system', '_inline_math_', 'is', 'injective',
        'if', 'the', 'evaluation', 'morphism', '_inline_math_', 'is',
        'injective', 'as', 'a', 'morphism', 'of', 'sheaves', '.',
        'moreover', '_inline_math_', 'is', 'torsion-free', 'if', 'it',
        'is', 'injective', 'and', 'the', 'quotient', 'sheaf',
        '_inline_math_', 'is', 'torsion-free', '.'
    ]
    list2 = dtest.get_def_text()[3].lower()
    self.assertEqual(list1, nltk.word_tokenize(list2))
Example #6
def parse_clf_chunk(file_obj, clf, bio, vzer, tokr, max_stale_tries=15):
    '''
    Runs the classifier and chunker on the file_obj
    file_obj: file object

    clf, bio, vzer, tokr: pickled classifiers and tokenizer

    max_stale_tries: maximum number of retries on OSError (stale NFS file handle)
    '''
    retried = 0
    while retried < max_stale_tries:
        retried += 1
        try:
            DD = px.DefinitionsXML(file_obj)
            ddum = Definiendum(DD, clf, bio, vzer, tokr)
            break
        except OSError as ee:
            wait_delay = randint(5, 15)
            logging.warning(f"{ee} waiting {wait_delay}s, retry: {retried}")
            time.sleep(wait_delay)
    else:
        # All retries failed, so ddum was never bound; fail loudly instead
        # of hitting an UnboundLocalError on the return below.
        raise OSError('parse_clf_chunk: %d stale-file retries failed'
                      % max_stale_tries)
    return ddum.root
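
A hedged call sketch for the retrying variant, reusing the peep.tar_iter helper the way Example #13 below does; the tar path is a stand-in, and clf, bio, vzer, tokr are assumed to be unpickled already:

# Iterate the .xml members of one tar file and parse each with a small
# retry budget for stale NFS file handles.
for fname, T in peep.tar_iter('9201_001.tar.gz', '.xml'):
    branch = parse_clf_chunk(T, clf, bio, vzer, tokr, max_stale_tries=5)
    print(fname, len(branch))  # number of child elements found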

def test_contain_words1(self):
    dtest = px.DefinitionsXML('tests/latexmled_files/math.0412433.xml')
    test_set = set(nltk.word_tokenize(dtest.get_def_text()[0].lower()))
    ss = {
        '.',
        ';',
        ',',
        'kirwan',
        'let',
        'we',
        'be',
        'codimension',
        'components',
        'divisorial',
        'having',
        'if',
        'in',
        'irreducible',
        'is',
        'locus',
        'mild',
        'of',
        'one',
        'other',
        'part',
        'resolution',
        'say',
        'shall',
        'that',
        'the',
        'union',
        'unstable',
        'words',
        '_inline_math_',
    }
    self.assertSetEqual(ss, test_set)
Example #8
    print(
        '  Querying                                                                     ',
        end='\r')
    qq = query()

    with open(args.file_names[0], 'a') as real_f, open(args.file_names[1],
                                                       'a') as nondefs_f:
        for l in qq:
            nonlocal_path = art_dict.get(l[0])
            if nonlocal_path:
                prepath = re.sub('^/mnt/', '', nonlocal_path)
                print(
                    'file: %s                                                            '
                    % prepath,
                    end='\r')
                local_path = os.path.join(loc_path, prepath)
                try:
                    xml = px.DefinitionsXML(local_path)
                    tdict = xml.get_def_sample_text_with()
                    for s in tdict['real']:
                        real_f.write(s + '\n')
                    for s in tdict['nondef']:
                        nondefs_f.write(s + '\n')
                except ValueError:
                    print('error parsing file %s' % local_path)
            else:
                print(
                    'Did not find: %s                                                   '
                    % l[0],
                    end='\r')
Example #9
    root = etree.Element("definition")
    root.attrib['index'] = repr(ind)
    statement = etree.SubElement(root, 'stmnt')
    statement.text = px.recutext_xml(defi)
    for d in get_definiendum(defi, ns):
        dfndum = etree.SubElement(root, 'dfndum')
        dfndum.text = d
    return root
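
For orientation, a sketch of the element shape the fragment above returns; the tag names come from the code, while the text values are borrowed from other examples on this page:

from lxml import etree

# Illustrative only: hand-build the same <definition> branch and print it.
demo = etree.Element('definition')
demo.attrib['index'] = repr(0)
etree.SubElement(demo, 'stmnt').text = 'a coherent system _inline_math_ is injective if ...'
etree.SubElement(demo, 'dfndum').text = 'injective'
print(etree.tostring(demo, pretty_print=True).decode('utf8'))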


# +
root = etree.Element('root')

for filenm in glob.glob('data/stacks-clean/perfect.tex.xml'):
    try:
        px_file = px.DefinitionsXML(filenm)
        branch = px_file.create_xml_branch()
        root.append(branch)
    except ValueError:
        print('%s is empty!' % filenm)

#print(etree.tostring(root, pretty_print=True).decode('utf8'))
# -

with open('data/short_starts_withp_graph.xml', 'w+') as stack_file:
    stack_file.write(etree.tostring(root, pretty_print=True).decode('utf8'))

lazrd = px.DefinitionsXML('tests/latexmled_files/1501.06563.html')
#print(etree.tostring(lazrd.create_xml_branch(),pretty_print=True).decode('utf8'))
#print(lazrd.get_def_sample_text_with(30)['real'][2])
d1 = lazrd.find_definitions()[2]
            for k,s in enumerate(sm):
                print('{:15} {:>10}  {:>10}'.format(s[0], y_true_tmp[k], predicted[k]))
    return y_true, y_pred

# Prepare and print the standard classification metrics
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))
# -

# An example of a user fed definition
chunked = chunker.parse(pos_tag(word_tokenize(Def[0])))
D = list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0]
' '.join([d[0] for d in D])
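
The same extraction run end to end on a made-up sentence (illustrative only; chunker is the model trained above):

from nltk import pos_tag, word_tokenize
import nltk

sentence = 'a coherent system is injective if the evaluation morphism is injective .'
trees = [t for t in chunker.parse(pos_tag(word_tokenize(sentence)))
         if isinstance(t, nltk.tree.Tree)]
if trees:
    print(' '.join(tok for tok, _ in trees[0]))  # candidate definiendum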

art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] 
p_vec = count_vect.transform(p_lst)
preds = clf.predict(p_vec)

for k, p in enumerate(p_lst):
    print(k, preds[k], p[:100])
    print('------')

chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63]))))
for tok in chunk:
    print('{:15} {:>10} '.format(tok[0], tok[2]))

with open('../PickleJar/chunker.pickle', 'wb') as chunker_f:
    pickle.dump(chunker, chunker_f)
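
A quick round-trip sketch: reload the pickled chunker and check that it still parses (the sample sentence is made up):

# Reload from the same path used by pickle.dump above.
with open('../PickleJar/chunker.pickle', 'rb') as chunker_f:
    chunker2 = pickle.load(chunker_f)

print(chunker2.parse(pos_tag(word_tokenize('a mild resolution is a resolution .'))))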

def test_contain_words2(self):
    dtest = px.DefinitionsXML('tests/latexmled_files/math.0402243.xml')
    test_set = set(nltk.word_tokenize(dtest.get_def_text()[0].lower()))
    ss = {'quotient', 'singularités', 'orbifolde'}
    self.assertTrue(ss.issubset(test_set))

def test_exact_tokenize2(self):
    dtest = px.DefinitionsXML('tests/latexmled_files/math.0412433.xml')
    str1 = '''let _inline_math_ \n             be\nthe divisorial part of the unstable locus of _inline_math_ \n            ; in other words,\n\n         _inline_math_     is the union of the irreducible components\nof \n            _inline_math_  having codimension one in _inline_math_ \n            . we shall say\nthat the kirwan resolution _inline_math_ \n             is mild if _inline_math_ \n       .'''

    str2 = dtest.get_def_text()[0].lower()
    self.assertEqual(nltk.word_tokenize(str1), nltk.word_tokenize(str2))
Example #13
        print('I am rpi%s and dealing with dir %s \n' % (rank, d))
        out_path = os.path.join('/tmp/', d)
        try:
            os.mkdir(out_path)
        except FileExistsError as ee:
            print(ee, ' continuing; using this directory')
            
        #print(tar_lst)
        #root = etree.Element('root', name=d)
        for tarpath in tar_lst:   # tarpath: 9201_001.tar.gz
            #print(os.path.join(mnt_path, d, T))
            tfile_elm = etree.Element('tarfile', name=tarpath)
            for fname, T in peep.tar_iter(os.path.join(mnt_path, d, tarpath), '.xml'):
                print(fname)
                try:
                    DD = px.DefinitionsXML(T)
                    def_dict = DD.get_def_sample_text_with()
                except ValueError as ee:
                    print("\n Probably empty article: %s \n"%fname, ee) 
                    def_dict = {'real': [], 'nondef': []}
                art_elm = etree.SubElement(tfile_elm, 'article', name=fname)
                for defin in def_dict['real']:
                    defi_elm = etree.SubElement(art_elm, 'definition')
                    defi_elm.text = defin
                for defin in def_dict['nondef']:
                    defi_elm = etree.SubElement(art_elm, 'nondef')
                    defi_elm.text = defin

            #print(etree.tostring(tfile_elm, pretty_print=True).decode('utf-8'))
            gz_filename = os.path.basename(tarpath).split('.')[0] + '.xml.gz' 
            #logging.debug('The name of the gz_filename is: %s'%gz_filename)
Example #14
    if args.query:
        art_dict = create_dict()
        qq = query()
        change_path = lambda p: re.sub(r'^/mnt/', '/home/luis/media_home/', p)
        file_lst = [
            change_path(art_dict[s[0]]) for s in qq if s[0] in art_dict
        ]
    else:
        file_lst = args.file_names

    for k, xml_path in enumerate(file_lst):
        havent_done = root.find('.//article[@name = "%s"]' % xml_path) is None
        if havent_done:
            print('Processing file: %s' % os.path.basename(xml_path), end='\r')
            try:
                px = parsing_xml.DefinitionsXML(xml_path)
                ddum = Definiendum(px, clf, bio, vzer, tokr)
                root.append(ddum.root)
                if k % 25 == 0 and args.output:
                    with open(args.output, 'w') as out_f:
                        out_f.write(
                            etree.tostring(root,
                                           pretty_print=True).decode('utf8'))
            except (TypeError, etree.ParseError):
                print('file %s could not be parsed by parsing_xml' %
                      os.path.basename(xml_path))
            except ValueError as e:
                print('Found a problem in file %s' %
                      os.path.basename(xml_path))
                print(e)
        else: