Пример #1
0
def read_file_behrooz(filename,
                      file_id,
                      semantic,
                      missing_tags=None,
                      problem_files=None):
    """
    Load a document and hand its content to the Behrooz XML parser.

    The parser is always invoked with the literal content id ``1`` and a
    window of ``3``; ``file_id`` is accepted but never forwarded.

    :type filename: string
    :param filename: file to be parsed
    :param file_id: not used by this function
    :param semantic: forwarded to the parser as ``operator``
    :param missing_tags: forwarded to the parser unchanged
    :param problem_files: forwarded to the parser unchanged

    :return: the value produced by ``MathExtractor.Behrooz_parse_from_xml``
        (described by the original author as a list of SymbolTree)
    """
    # The detected extension is returned alongside the content but is not
    # consulted here.
    _ext, content = MathDocument.read_doc_file(filename)
    parser_options = {
        "window": 3,
        "operator": semantic,
        "missing_tags": missing_tags,
        "problem_files": problem_files,
    }
    return MathExtractor.Behrooz_parse_from_xml(content, 1, **parser_options)
Пример #2
0
 def get_collection(self, ):
     """
     Build a mapping from formula identifiers to their tuple lists.

     Walks every sub-directory of ``self.collection_file_path``, reads each
     file in that sub-directory's ``Articles`` folder, parses its formulas
     and stores ``get_pairs(window=2, eob=True)`` for each of them.

     Keys have the form ``"<ascii file name>:<formula key>"``, where the file
     name has its extension removed and is NFKD-normalised with all
     non-ASCII characters dropped.

     Files that fail to read or parse are skipped; their name is printed.

     :return: dict mapping ``"<file name>:<key>"`` to the formula's tuples
     """
     dictionary_formula_tuples = {}
     root = self.collection_file_path
     for directory in os.listdir(root):
         # Plain files sitting next to the collection folders are ignored.
         if not os.path.isdir(os.path.join(root, directory)):
             continue
         articles_dir = os.path.join(root, directory, "Articles")
         for filename in os.listdir(articles_dir):
             file_path = os.path.join(articles_dir, filename)
             file_name = os.path.splitext(filename)[0]
             try:
                 (ext, content) = MathDocument.read_doc_file(file_path)
                 formulas = MathExtractor.parse_from_xml(
                     content,
                     1,
                     operator=(not self.read_slt),
                     missing_tags=None,
                     problem_files=None)
                 # Decode the ASCII bytes directly instead of slicing the
                 # repr of a bytes object, which would escape backslashes
                 # and control characters in the key.
                 file_name = unicodedata.normalize('NFKD', file_name).encode(
                     'ascii', 'ignore').decode('ascii')
                 for key in formulas:
                     tuples = formulas[key].get_pairs(window=2, eob=True)
                     dictionary_formula_tuples[file_name + ":" + str(key)] = tuples
             except Exception:
                 # Best effort: one bad document must not abort the whole
                 # collection, but Ctrl-C / SystemExit still propagate.
                 print(file_name)
     return dictionary_formula_tuples
Пример #3
0
def read_file(filename,
              file_id,
              semantic,
              missing_tags=None,
              problem_files=None):
    """
    Read a file and parse the formulas it contains.

    ``.tex`` input is parsed with ``MathExtractor.parse_from_tex`` when
    ``semantic`` is false. ``.xhtml``/``.mathml``/``.mml``/``.html`` input is
    parsed with ``MathExtractor.parse_from_xml``. Anything else is reported
    on stdout and, when ``problem_files`` is supplied, recorded there under
    ``"invalid_filetype"`` (``.tex`` in semantic mode) or
    ``"unknown_filetype"``.

    :type filename: string
    :param filename: file to be parsed
    :param file_id: identifier forwarded to the parser
    :param semantic: forwarded to the XML parser as ``operator``; when true,
        ``.tex`` files are rejected
    :param missing_tags: forwarded to the XML parser unchanged
    :param problem_files: optional dict mapping a problem category to the
        set of offending file names; may be ``None`` to disable recording

    :return: a pair ``(trees, n_err)``; ``([], 0)`` for rejected files
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex' and not semantic:
        t = MathExtractor.parse_from_tex(content, file_id)
        return [t], 0

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t, n_err = MathExtractor.parse_from_xml(content,
                                                file_id,
                                                operator=semantic,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        # NOTE(review): every parsed item is echoed to stdout; kept because
        # existing callers may rely on this output.
        for item in t:
            print(item)
        return t, n_err

    # A '.tex' file can only reach this point when ``semantic`` is truthy.
    if ext == '.tex':
        category = "invalid_filetype"
        message = 'invalid file format %s for %s in operator tree mode'
    else:
        category = "unknown_filetype"
        message = 'Unknown filetype %s for %s'

    # ``problem_files`` defaults to None; only record when a mapping was
    # actually supplied instead of failing on the membership test.
    if problem_files is not None:
        offenders = problem_files.get(category, set())
        offenders.add(filename)
        problem_files[category] = offenders

    print(message % (ext, filename))
    return [], 0
Пример #4
0
def latex_to_polish_list(expression):
    """
    Yield aliased symbols for the pairs of a parsed LaTeX expression.

    The expression is parsed with ``MathExtractor.parse_from_tex_opt`` and
    its ``get_pairs(window=1, eob=True)`` entries are split on tabs.

    * An entry whose first field starts with ``"V"`` yields
      ``alias(<text after '!'>, is_var=True)``.
    * For any other entry, the first and last fields concatenated are
      treated as an identifier (presumed by the original author to be the
      operator's unique id in the operator tree). The first time an
      identifier is seen, ``alias(<text after '!'>)`` is yielded and the
      identifier is remembered; the next time, it is forgotten again and
      nothing is yielded.

    :param expression: LaTeX source handed to the parser
    :return: generator of whatever ``alias`` returns
    """
    pending_ops = set()
    tree = MathExtractor.parse_from_tex_opt(expression)
    for pair in tree.get_pairs(window=1, eob=True):
        fields = pair.split("\t")
        head = fields[0]

        if head.startswith("V"):
            yield alias(head.split("!")[1], is_var=True)
            continue

        op_id = head + fields[-1]
        if op_id in pending_ops:
            # Second sighting: drop the id so a later occurrence is emitted.
            pending_ops.remove(op_id)
            continue

        yield alias(head.split("!")[1])
        pending_ops.add(op_id)
Пример #5
0
 def get_query(self,):
     """
     Load the tuples of queries ``1`` to ``20``.

     For each ``j`` in ``1..20`` the file
     ``<self.queries_directory_path>/<j>.html`` is read and parsed, and
     ``get_pairs(window=2, eob=True)`` of its formulas is stored under ``j``.
     When a file holds several formulas each one overwrites the previous
     entry, so only the last formula's tuples are kept.

     Queries that fail to read or parse are skipped; their number is
     printed.

     :return: dict mapping the query number to its tuples
     """
     dictionary_query_tuples = {}
     for j in range(1, 21):
         temp_address = self.queries_directory_path + '/' + str(j) + '.html'
         try:
             (ext, content) = MathDocument.read_doc_file(temp_address)
             formulas = MathExtractor.parse_from_xml(
                 content,
                 1,
                 operator=(not self.read_slt),
                 missing_tags=None,
                 problem_files=None)
             for key in formulas:
                 dictionary_query_tuples[j] = formulas[key].get_pairs(
                     window=2, eob=True)
         except Exception:
             # Best effort: a broken query file is reported and skipped,
             # while KeyboardInterrupt / SystemExit still propagate.
             print(j)
     return dictionary_query_tuples
Пример #6
0
def ConvertWikipediaToSLTTuplesNewVersion(filePathForresults,
                                          filename,
                                          dirId,
                                          lst,
                                          missing_tags=None,
                                          problem_files=None):
    """
    Write the tuples of every formula in ``filename`` to its own text file.

    The document is parsed with ``operator=False`` and each formula's
    ``get_pairs(window=1, eob=True)`` is written, one tuple per line, to
    ``<filePathForresults>/<dirId>/<file name without extension>:<key>.txt``.

    Any error is reported by printing ``filename``; nothing is raised to
    the caller for ordinary exceptions.

    :param filePathForresults: root directory for the output files
    :param filename: path of the document to convert
    :param dirId: name of the sub-directory (under the root) to write into
    :param lst: not used by this function
    :param missing_tags: forwarded to the parser unchanged
    :param problem_files: forwarded to the parser unchanged
    :return: None
    """
    try:
        file_name = os.path.splitext(filename.split('/')[-1])[0]
        (ext, content) = MathDocument.read_doc_file(filename)
        formulas = MathExtractor.parse_from_xml(content,
                                                1,
                                                operator=False,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)

        for key in formulas:
            tuples = formulas[key].get_pairs(window=1, eob=True)
            if not tuples:
                # NOTE(review): this stops at the first formula without
                # tuples and skips all remaining formulas - confirm that
                # ``continue`` was not intended.
                return
            out_path = (filePathForresults + "/" + str(dirId) + "/" +
                        file_name + ":" + str(key) + ".txt")
            # The context manager closes the file even if a write fails.
            with open(out_path, "w") as f:
                f.writelines(t + "\n" for t in tuples)

    except Exception:
        # Best effort per document; KeyboardInterrupt / SystemExit are no
        # longer swallowed.
        print(filename)
Пример #7
0
def behrooz_queryPreparation(filename,
                             resultFile,
                             file_id,
                             missing_tags=None,
                             problem_files=None):
    """
    Write the tuples of the formulas found in ``filename`` to ``resultFile``.

    The document is parsed with ``operator=False`` and each formula's
    ``get_pairs(window=1, eob=True)`` is written one tuple per line.
    ``resultFile`` is reopened in truncating mode for every formula, so after
    the call it holds the tuples of the last formula processed.

    :param filename: path of the query document
    :param resultFile: path of the output file
    :param file_id: not used by this function
    :param missing_tags: forwarded to the parser unchanged
    :param problem_files: forwarded to the parser unchanged
    :return: None
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    formulas = MathExtractor.parse_from_xml(content,
                                            1,
                                            operator=False,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)
    for key in formulas:
        tuples = formulas[key].get_pairs(window=1, eob=True)
        if not tuples:
            # A formula without tuples ends processing; the output file is
            # left as written by earlier formulas (or untouched).
            return
        # The context manager closes the file even if a write fails.
        with open(resultFile, "w") as f:
            f.writelines(t + "\n" for t in tuples)