def math_indexer_task(pargs) -> (str, list):
    """
    Parse every document of one chunk and add its expression trees to the index.

    Document ids are assigned consecutively, starting at
    ``chunkid * chunk_size``, in the order the chunk's mapping lists the files.
    A file that raises while being read is reported on stderr and recorded in
    the statistics; trees already collected from that file are kept.

    :param pargs: three-item sequence ``(math_index, cntl, chunkid)`` --
        the index object receiving the trees, the control object handed to
        ``MathDocument``, and the number of the chunk to process
    :return: ``(fileid, combined_stats)`` -- whatever ``math_index.add``
        returned, and the ``Stats`` object filled in while processing
    """
    math_index, cntl, chunkid = pargs

    stats = Stats()
    document_source = MathDocument(cntl)
    chunk_size, mappings = document_source.read_mapping_file(chunkid)
    stats.num_documents += len(mappings)

    first_doc_id = chunkid * chunk_size
    trees = []  # kept in arrival order, no de-duplication
    for doc_id, filename in enumerate(mappings, start=first_doc_id):
        try:
            parsed = read_file(
                filename,
                doc_id,
                missing_tags=stats.missing_tags,
                problem_files=stats.problem_files,
            )
            # Collect one tree at a time so that a failure part-way through a
            # file still keeps the trees that were read before it.
            for tree in parsed:
                stats.num_expressions += 1
                # Pairs are not stored here -- per the original note they are
                # created later in the C++ module.
                trees.append(tree)
        except Exception as err:
            reason = str(err)
            message = "Failed to process document " + filename + ": " + reason
            print(message, file=sys.stderr)
            # Group the ids of failing documents under the error text.
            failures = stats.problem_files
            failures[reason] = failures.get(reason, set())
            failures[reason].add(doc_id)

    fileid = math_index.add(trees)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, stats
def math_indexer_task(pargs) -> (str, list):
    """
    Index the expressions of one chunk of the collection.

    Reads the chunk's mapping, parses each listed file into symbol trees,
    updates the running statistics and finally passes all collected trees to
    ``math_index.add``. Document ids run consecutively from
    ``chunkid * chunk_size``. Files that raise during parsing are reported on
    stderr and noted in ``problem_files``; processing continues with the next
    file.

    :param pargs: ``(math_index, cntl, chunkid)`` packed into one argument --
        the target index, the control object given to ``MathDocument``, and
        the chunk number
    :return: ``(fileid, combined_stats)`` -- the value returned by
        ``math_index.add`` and the ``Stats`` object gathered for this chunk
    """
    math_index, cntl, chunkid = pargs

    chunk_stats = Stats()
    reader = MathDocument(cntl)
    chunk_size, mappings = reader.read_mapping_file(chunkid)
    chunk_stats.num_documents += len(mappings)

    id_offset = chunkid * chunk_size
    collected = []  # trees are appended exactly as they are produced
    for doc_id, filename in enumerate(mappings, start=id_offset):
        try:
            tree_stream = read_file(
                filename,
                doc_id,
                missing_tags=chunk_stats.missing_tags,
                problem_files=chunk_stats.problem_files,
            )
            # Consume lazily: counters and the tree list are updated per tree,
            # so work done before an error in this file is not lost.
            for expr_tree in tree_stream:
                chunk_stats.num_expressions += 1
                chunk_stats.global_expressions += len(expr_tree.position)
                # No pairs are generated here -- the original note says they
                # are created in the C++ module.
                collected.append(expr_tree)
        except Exception as err:
            reason = str(err)
            warning = "Failed to process document " + filename + ": " + reason
            print(warning, file=sys.stderr)
            # Bucket failing document ids by their error message.
            by_reason = chunk_stats.problem_files
            by_reason[reason] = by_reason.get(reason, set())
            by_reason[reason].add(doc_id)

    fileid = math_index.add(collected)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, chunk_stats