def run(api_host, revisions, labels_f, trusted_groups, trusted_edits,
        revert_radius, revert_window, exclude_reverted, exclude_reverting,
        threads, verbose):

    # Construct our API session
    session = mwapi.Session(
        api_host,
        user_agent="wikimedia scoring platform/editquality -- autolabel")

    autolabel = autolabeler(session, trusted_groups, trusted_edits,
                            revert_radius, revert_window,
                            exclude_reverted, exclude_reverting)

    # Duplicate the revision iterator so we can count the total for tqdm
    revisions, revisions2 = tee(revisions)
    number_of_revisions = sum(1 for line in revisions2)

    # Send revision IDs to the API in chunks of 50
    rev_id_chunks = chunk(revisions, 50)
    tq = tqdm(para.map(autolabel, rev_id_chunks, mappers=threads),
              file=sys.stderr, total=number_of_revisions)

    verbose_result = ''
    for revision in tq:
        if verbose:
            if not revision['autolabel']['needs_review']:
                verbose_result += '.'
            else:
                verbose_result += \
                    (revision['autolabel']['review_reason'] or "?")[0]

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write(verbose_result + "\n")
        sys.stderr.flush()
def run(api_host, revisions, labels_f, trusted_groups, trusted_edits,
        revert_radius, revert_window, exclude_reverted, exclude_reverting,
        threads, verbose):

    # Construct our API session
    session = mwapi.Session(
        api_host, user_agent="wiki-ai/editquality -- autolabel script")

    autolabel = autolabeler(session, trusted_groups, trusted_edits,
                            revert_radius, revert_window,
                            exclude_reverted, exclude_reverting)

    rev_id_chunks = chunk(revisions, 50)

    for revision in para.map(autolabel, rev_id_chunks, mappers=threads):
        if verbose:
            if not revision['autolabel']['needs_review']:
                sys.stderr.write(".")
            else:
                sys.stderr.write(
                    (revision['autolabel']['review_reason'] or "?")[0])
            sys.stderr.flush()

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write("\n")
        sys.stderr.flush()
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir,
                               'stub-history-all-revisions', 'oidb')
    os.makedirs(working_dir, exist_ok=True)

    # Alternative SQLite/SQLAlchemy output path (disabled):
    # output_filepath = os.path.join(working_dir, 'oidb.sqlite')
    # engine = get_engine(output_filepath)
    # create_tables(engine)
    # metadata = MetaData(bind=engine)
    # metadata.reflect()
    # page_metadata = metadata.tables['page_metadata']
    # revision = metadata.tables['revision']
    # conn = engine.connect()

    start = datetime.now()
    processed_count = 0
    page_processed_count = 0
    curr_batch = []
    FORCE_COMMIT_SIZE = 100000
    with open(os.path.join(working_dir, 'revs_unsorted.tsv'), 'w') as outfile, \
            open(os.path.join(working_dir, 'page.ndjson'), 'w') as page_outfile:
        # Process each stub-history file in its own mapper
        for result in para.map(process_stub_history_filepath, paths,
                               mappers=len(paths)):
            if 'wiki_namespace' in result:
                # this is a page result
                page_outfile.write(json.dumps(result) + "\n")
                page_processed_count += 1
                if page_processed_count % 100000 == 0:
                    print(f"Processed {page_processed_count} pages in {datetime.now() - start}")
            else:
                # this is a revision result; write one tab-separated row
                # outfile.write(json.dumps(result) + "\n")
                outfile.write(
                    "{rev_timestamp}\t{page_id}\t{rev_id}\t{prev_rev_id}\t{is_minor}\t"
                    "{user_text}\t{user_id}\t{seconds_to_prev}\t{curr_bytes}\t{delta_bytes}\t"
                    "{has_edit_summary}\t{is_reverted}\t{is_revert}\t{is_reverted_to_by_other}\t"
                    "{is_self_reverted}\t{is_self_revert}\t{revert_target_id}\t"
                    "{revert_set_size}\t{revert_id}\t{seconds_to_revert}\n"
                    .format(**result))
                # curr_batch.append(result)
                # if len(curr_batch) >= FORCE_COMMIT_SIZE:
                #     conn.execute(revision.insert(), curr_batch)
                #     curr_batch = []
                processed_count += 1
                if processed_count % 1000000 == 0:
                    print(f"Processed {processed_count} revisions in {datetime.now() - start}")
    # if len(curr_batch) > 0:
    #     conn.execute(revision.insert(), curr_batch)
    print(f"Finished processing {processed_count} revisions (and {page_processed_count} pages) in {datetime.now() - start}")
def run(paths, rate):
    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_path(path):
        f = mwcli.files.reader(path)
        return sample_tokens((json.loads(line) for line in f), rate)

    for values in para.map(process_path, paths):
        writer.write(values)
def process_all(paths):
    start = datetime.now()
    with open(os.path.join(working_dir, 'rev_ids.csv'), 'w') as outfile:
        for result in para.map(process_stub_history_filepath, paths,
                               mappers=len(paths)):
            page_id, rev_id, rev_timestamp, rev_user_text, rev_user_id, \
                is_revert_target, is_reverted, is_reverting = result
            outfile.write(
                f"{page_id},{rev_id},{rev_timestamp},{rev_user_text},{rev_user_id},{is_revert_target},{is_reverted},{is_reverting}\n")
    print(f"{datetime.now() - start}")
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir,
                               'stub-history-all-revisions', 'oidb')
    os.makedirs(working_dir, exist_ok=True)

    start = datetime.now()
    processed_count = 0
    curr_batch = []
    with open(os.path.join(working_dir, 'pre2018_edit_counts_ungrouped.tsv'),
              'w') as outfile:
        for result in para.map(process_stub_history_filepath, paths,
                               mappers=len(paths)):
            user_id, count = result
            outfile.write(str(user_id) + "\t" + str(count) + "\n")
            processed_count += 1
            if processed_count % 100000 == 0:
                print(f"Processed {processed_count} users in {datetime.now() - start}")
    print(f"Finished processing {processed_count} users in {datetime.now() - start}")
def map(process, paths, threads=None): u""" Implements a distributed stategy for processing XML files. This function constructs a set of py:mod:`multiprocessing` threads (spread over multiple cores) and uses an internal queue to aggregate outputs. To use this function, implement a `process()` function that takes two arguments -- a :class:`mwxml.Dump` and the path the dump was loaded from. Anything that this function ``yield``s will be `yielded` in turn from the :func:`mwxml.map` function. :Parameters: paths : `iterable` ( `str` | `file` ) a list of paths to dump files to process process : `func` A function that takes a :class:`~mwxml.iteration.dump.Dump` and the path the dump was loaded from and yields threads : int the number of individual processing threads to spool up :Example: >>> import mwxml >>> files = ["examples/dump.xml", "examples/dump2.xml"] >>> >>> def page_info(dump, path): ... for page in dump: ... yield page.id, page.namespace, page.title ... >>> for id, namespace, title in mwxml.map(page_info, files): ... print(id, namespace, title) ... """ paths = [mwtypes.files.normalize_path(path) for path in paths] def process_path(path): dump = Dump.from_file(mwtypes.files.reader(path)) for x in process(dump, path): yield x for x in para.map(process_path, paths, mappers=threads): yield x
def run(self, paths, threads, kwargs, output_dir, compression, verbose):

    def process_path(path):
        f = files.reader(path)
        input = self.file_reader(f)
        docs = self.a2b(input, verbose=verbose, **kwargs)

        if output_dir is None:
            # No output directory -- send converted docs back to the parent
            # process to be written to stdout.
            yield from docs
        else:
            # Write converted docs to a parallel path within output_dir.
            new_path = files.output_dir_path(path, output_dir, compression)
            writer = files.writer(new_path)
            for doc in docs:
                json.dump(doc, writer)
                writer.write("\n")

    for doc in para.map(process_path, paths, mappers=threads):
        json.dump(doc, sys.stdout)
        sys.stdout.write("\n")
def map(process, paths, threads=None):
    """
    Implements a distributed strategy for processing XML files.  This
    function constructs a set of py:mod:`multiprocessing` threads (spread
    over multiple cores) and uses an internal queue to aggregate outputs.

    To use this function, implement a `process()` function that takes two
    arguments -- a :class:`mwxml.Dump` and the path the dump was loaded
    from.  Anything that this function ``yield``s will be `yielded` in turn
    from the :func:`mwxml.map` function.

    :Parameters:
        paths : `iterable` ( `str` | `file` )
            a list of paths to dump files to process
        process : `func`
            A function that takes a :class:`~mwxml.iteration.dump.Dump` and
            the path the dump was loaded from and yields output values
        threads : int
            the number of individual processing threads to spool up

    :Example:
        >>> import mwxml
        >>> files = ["examples/dump.xml", "examples/dump2.xml"]
        >>>
        >>> def page_info(dump, path):
        ...     for page in dump:
        ...         yield page.id, page.namespace, page.title
        ...
        >>> for id, namespace, title in mwxml.map(page_info, files):
        ...     print(id, namespace, title)
        ...
    """
    paths = [mwtypes.files.normalize_path(path) for path in paths]

    def process_path(path):
        dump = Dump.from_file(mwtypes.files.reader(path))
        yield from process(dump, path)

    yield from para.map(process_path, paths, mappers=threads)
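The docstring above describes the pattern shared by most of these snippets: a per-path worker generator is handed to para.map, which fans the paths out across mappers and aggregates whatever each worker yields. A minimal standalone sketch of that pattern, assuming only the calling convention used throughout these examples (generator function, iterable of inputs, mappers count); the count_lines worker and the a.txt/b.txt paths are hypothetical, not taken from any of the projects above:

import para


def count_lines(path):
    # Hypothetical worker: runs inside one of the mappers and may yield any
    # number of values per input path; para.map() merges the yields from
    # all mappers into the single iterable consumed below.
    with open(path) as f:
        yield path, sum(1 for _ in f)


paths = ["a.txt", "b.txt"]  # hypothetical input files

for path, line_count in para.map(count_lines, paths, mappers=2):
    print(path, line_count)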
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
    os.makedirs(working_dir, exist_ok=True)

    start = datetime.now()
    with open(os.path.join(working_dir, 'rev_ids.csv'), 'w') as outfile:
        processed_count = 0
        for result in para.map(process_stub_history_filepath, paths,
                               mappers=len(paths)):
            page_id, page_namespace, is_page_redirect, rev_id, rev_timestamp, \
                rev_user_text, rev_user_id, is_revert_target, is_reverted, \
                is_reverting = result
            outfile.write(
                f"{page_id},{page_namespace},{is_page_redirect},{rev_id},{rev_timestamp},{rev_user_text},{rev_user_id},{is_revert_target},{is_reverted},{is_reverting}\n")
            processed_count += 1
            if processed_count % 1000000 == 0:
                print(f"Processed {processed_count} revisions in {datetime.now() - start}")
    print(f"Finished processing {processed_count} revisions in {datetime.now() - start}")
def run(
    api_host,
    revisions,
    labels_f,
    trusted_groups,
    trusted_edits,
    revert_radius,
    revert_window,
    exclude_reverted,
    exclude_reverting,
    threads,
    verbose,
):
    # Construct our API session
    session = mwapi.Session(api_host, user_agent="editquality -- prelabeling script.")

    autolabel = autolabeler(
        session, trusted_groups, trusted_edits, revert_radius,
        revert_window, exclude_reverted, exclude_reverting
    )
    rev_id_chunks = chunk(revisions, 50)

    for revision in para.map(autolabel, rev_id_chunks, mappers=threads):
        if verbose:
            if not revision["autolabel"]["needs_review"]:
                sys.stderr.write(".")
            else:
                sys.stderr.write((revision["autolabel"]["review_reason"] or "?")[0])
            sys.stderr.flush()

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write("\n")
        sys.stderr.flush()
Or, to parallelize using ``MPI`` (assuming you have ``mpi4py`` installed
and are running on a cluster), run

>>> mpi example.py

to send a PBS job to the queue. By default, the output will be printed to
a log file in the same directory.

'''
from __future__ import division, print_function, absolute_import, unicode_literals
import para
import numpy as np


def quadratic(x, a, b, c):
    '''
    The function we're parallelizing

    '''
    print("[BEGIN JOB %d]" % x)
    for i in range(10**8):
        j = i
    print("[END JOB %d]" % x)
    return a * x ** 2 + b * x + c


for res in para.map(quadratic, np.arange(50), args=(1, 1, 1), kwargs={}):
    print(res)
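Note that this last example appears to use a different para package than the snippets above: here para.map forwards extra args/kwargs to the target function and relies on MPI/PBS for parallelism, whereas the earlier calls pass only an iterable of per-file inputs plus a mappers thread count and rely on multiprocessing.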