예제 #1
0
def process_query_batch(args):
    """
    Run a batch of queries against the math index.

    For every query the math expressions are extracted from the query XML,
    any ``<keyword>`` terms are collected, and both are handed to
    ``math_index.search``.

    :type args: tuple
    :param args: ``(query_list, topk, math_index)``; ``query_list`` is a sized
                 collection of ``(query_num, query_string)`` pairs

    :rtype: tuple
    :return: ``(fileid, stats)`` -- the id of the current process (used as
             the handle passed to the index) and the statistics collected
             while processing the batch
    """
    stats = Stats()
    fileid = os.getpid()

    query_list, topk, math_index = args
    math_index.openDB(fileid, topk)

    # compiled once: the pattern does not change between queries
    keyword_pattern = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    try:
        stats.num_documents = len(query_list)

        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)

            # also need to handle keyword queries if present
            terms = keyword_pattern.findall(query_string)
            stats.num_keywords += len(terms)

            math_index.search(fileid, query_num, trees, terms, topk)
    finally:
        # always release the DB opened above, even if a query fails
        math_index.closeDB(fileid)

    return (fileid, stats)
예제 #2
0
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read a file and parse the math expressions it contains.

    The parser is chosen from the extension reported by
    ``MathDocument.read_doc_file``: ``.tex`` content goes to
    ``MathExtractor.parse_from_tex``; ``.xhtml``, ``.mathml``, ``.mml`` and
    ``.html`` content goes to ``MathExtractor.parse_from_xml``. Any other
    extension is reported on stdout and yields an empty list.

    :type filename: string
    :param filename: file to be parsed

    :param file_id: identifier forwarded to the parser

    :param missing_tags: optional collector forwarded to the XML parser

    :type problem_files: dict or None
    :param problem_files: optional collector forwarded to the XML parser;
                          when given, files with an unknown extension are
                          added to the set under ``"unknown_filetype"``

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    if ext == '.tex':
        # the TeX parser returns a single result; wrap it in a list
        return [MathExtractor.parse_from_tex(content, file_id)]
    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # problem_files is optional (defaults to None), so only record the file
    # when the caller actually supplied a collector
    if problem_files is not None:
        problem_files.setdefault("unknown_filetype", set()).add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
예제 #3
0
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read a file and parse the math expressions it contains.

    The parser is chosen from the extension reported by
    ``MathDocument.read_doc_file``: ``.tex`` content goes to
    ``MathExtractor.parse_from_tex``; ``.xhtml``, ``.mathml``, ``.mml`` and
    ``.html`` content goes to ``MathExtractor.parse_from_xml``. Any other
    extension is reported on stdout and yields an empty list.

    :type filename: string
    :param filename: file to be parsed

    :param file_id: identifier forwarded to the parser

    :param missing_tags: optional collector forwarded to the XML parser

    :type problem_files: dict or None
    :param problem_files: optional collector forwarded to the XML parser;
                          when given, files with an unknown extension are
                          added to the set under ``"unknown_filetype"``

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    if ext == '.tex':
        # the TeX parser returns a single result; wrap it in a list
        return [MathExtractor.parse_from_tex(content, file_id)]
    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content,
                                            file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # problem_files is optional (defaults to None), so only record the file
    # when the caller actually supplied a collector
    if problem_files is not None:
        problem_files.setdefault("unknown_filetype", set()).add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
예제 #4
0
def process_query_batch(args):
    """
    Run a batch of queries against the math index.

    For every query the math expressions are extracted from the query XML
    and handed to ``math_index.search``.

    :type args: tuple
    :param args: ``(system, db, run_tag, query_list, topk, math_index,
                 strategy)``; ``query_list`` is a sized collection of
                 ``(query_num, query_string)`` pairs. ``system``, ``db``,
                 ``run_tag`` and ``strategy`` are accepted but not used here.

    :rtype: tuple
    :return: ``(fileid, stats)`` -- the id of the current process (used as
             the handle passed to the index) and the statistics collected
             while processing the batch
    """
    stats = Stats()
    fileid = os.getpid()

    # the full 7-tuple is still unpacked so a wrongly sized ``args`` fails
    # early; underscore-prefixed names are intentionally unused
    (_system, _db, _run_tag, query_list, topk, math_index,
     _strategy) = args
    math_index.openDB(fileid, topk)

    try:
        stats.num_documents = len(query_list)

        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)
            math_index.search(fileid, query_num, trees)

            # TODO: also need to handle keyword queries if present
    finally:
        # always release the DB opened above, even if a query fails
        math_index.closeDB(fileid)

    return (fileid, stats)