def main():
    args = docopt.docopt(__doc__)
    session = api.Session(args['--api'])
    title_parser = title.Parser.from_api(session)
    threads = int(args['--threads'] or cpu_count())

    def process_dump(dump, path):
        for page in dump:
            for revision in page:
                try:
                    links = set(extract_and_parse_inlinks(revision.text,
                                                          title_parser))
                    for ns, title in links:
                        yield page.id, ns, title
                except Exception as e:
                    sys.stderr.write(traceback.format_exc())

    print("from_id\tto_ns\tto_title")
    link_infos = xml_dump.map(args['<dump_path>'], process_dump,
                              threads=threads)
    for from_id, to_ns, to_title in link_infos:
        print('{0}\t{1}\t{2}'.format(from_id, to_ns, encode(to_title)))
def extract_pages(output_path):
    articles_dump_fn = os.path.expanduser(
        public_domain_rank.config['data']['articles'])
    print("number of pageids of interest: {}".format(len(_PAGEIDS_OF_INTEREST)))
    counter = 0
    pageid_title_timestamp_length = []
    for i, page_info_tuple in enumerate(
            xml_dump.map([articles_dump_fn], page_info)):
        # an empty tuple indicates a page that is not of interest
        if page_info_tuple == tuple():
            continue
        pageid, namespace, title, timestamp, text = page_info_tuple
        article_length = len(text)
        pageid_title_timestamp_length.append(
            (pageid, title, timestamp, article_length))
        text_fn = os.path.join(output_path, '{}.txt'.format(pageid))
        with open(text_fn, 'w') as f:
            f.write(text)
        counter += 1
        if counter % 1e4 == 0:
            print("extracted {} pages".format(counter))
    pageid_title_output_fn = os.path.join(
        output_path, 'pageid-title-timestamp-length.csv')
    with open(pageid_title_output_fn, 'w') as f:
        # write header
        f.write('pageid\ttitle\trevision_timestamp\tarticle_length\n')
        for pageid, title, timestamp, article_length in pageid_title_timestamp_length:
            f.write('{}\t{}\t{}\t{}\n'.format(
                pageid, title, timestamp, article_length))
    print("finished extracting. extracted {} pages".format(counter))
def extract(dump_files, extractors=ALL_EXTRACTORS):
    """
    Extracts cites from a set of `dump_files`.

    :Parameters:
        dump_files : str | `file`
            A set of MediaWiki XML dump files
            (expects: pages-meta-history)
        extractors : `list`(`extractor`)
            A list of extractors to apply to the text

    :Returns:
        `iterable` -- a generator of extracted cites
    """
    # Dump processor function
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue
            else:
                for cite in extract_cite_history(page, extractors):
                    yield cite

    # Map call
    return xml_dump.map(dump_files, process_dump)
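A minimal sketch of how this generator might be driven from a script; the dump path is a placeholder and printing each cite directly is illustrative only, not part of the original project.

# Hypothetical driver for extract(); "dump.xml" is a placeholder path and the
# plain print() of each extracted cite is illustrative only.
if __name__ == "__main__":
    for cite in extract(["dump.xml"]):
        print(cite)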
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue

            last_references = set()
            for revision in page:
                references = set(extract(revision.text or ""))
                references_added = references - last_references
                references_removed = last_references - references

                if len(references_added) > 0 or len(references_removed) > 0:
                    if revision.contributor:
                        user_id = revision.contributor.id
                        user_text = revision.contributor.user_text
                    else:
                        user_id = 0
                        user_text = None

                    yield (revision.id, revision.timestamp, user_id, user_text,
                           page.id, page.title,
                           list(references_added), list(references_removed))

                last_references = references

    print("\t".join(["rev_id", "rev_timestamp", "user_id", "user_text",
                     "page_id", "page_title",
                     "references_added", "references_removed"]))
    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
def create_mappings(input_dump_files):
    print('extracting contributors from {}'.format(input_dump_files))
    # TODO: stream the results directly instead of collecting them first
    contributors = xml_dump.map(input_dump_files, process_dump=process_dump,
                                threads=4)
    contributors = [revision_contributor_name
                    for revision_contributor_name in contributors]
    print('found {} contributions'.format(len(contributors)))
    contributors = list(set(contributors))  # remove duplicates
    print('found {} contributors'.format(len(contributors)))
    contributors.sort()
    return contributors
def run(dump_files, threads, verbose):
    if len(dump_files) == 0:
        revision_docs = dump2json(xml_dump.Iterator.from_file(sys.stdin),
                                  verbose=verbose)
    else:
        revision_docs = xml_dump.map(dump_files,
                                     lambda d, p: dump2json(d, verbose=verbose),
                                     threads=threads)

    for revision_doc in revision_docs:
        json.dump(revision_doc, sys.stdout)
        sys.stdout.write("\n")
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue

            for revision in page:
                for reference in set(extract(revision.text or "")):
                    yield page.id, page.title, revision.id, reference

    print("\t".join(["page_id", "page_title", "rev_id", "reference"]))
    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue

            for revision in page:
                for reference in set(extract(revision.text or "")):
                    yield (page.id, page.title, revision.id,
                           revision.timestamp, reference)

    print("\t".join(["page_id", "page_title", "rev_id", "rev_timestamp",
                     "reference"]))
    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
def run(dump_files, diff_engine, threads, drop_text, verbose):
    if len(dump_files) == 0:
        revision_docs = dump2diffs(xml_dump.Iterator.from_file(sys.stdin),
                                   diff_engine, verbose=verbose)
    else:
        dump_processor = lambda d, p: dump2diffs(d, diff_engine,
                                                 verbose=verbose)
        revision_docs = xml_dump.map(dump_files, dump_processor,
                                     threads=threads)

    for revision_doc in revision_docs:
        if drop_text:
            del revision_doc['text']

        json.dump(revision_doc, sys.stdout)
        sys.stdout.write("\n")
def run(page_ids, namespace_titles, dump_paths):

    def process_dump(dump, path):
        for page in dump:
            page_title = title.normalize(page.title)  # Converts " " to "_"

            # Try to match the current page to our mappings
            page_info = None
            source = None
            if page.id in page_ids:
                page_info = page_ids[page.id]
                source = "id match"
            elif (page.namespace, page_title) in namespace_titles:
                page_info = namespace_titles[(page.namespace, page_title)]
                source = "namespace/title match"
            elif page.namespace == 1 and (0, page_title) in namespace_titles:
                page_info = namespace_titles[(0, page_title)]
                source = "talk page"

            if page_info is not None:
                changes = templates.detect_changes(
                    Revision(r.id, r.timestamp, r.text or "") for r in page)
                for current, new in changes:
                    yield page_info, current, new, source

    writer = tsv.Writer(sys.stdout, headers=HEADERS)
    for page_info, old, new, source in xml_dump.map(dump_paths, process_dump):
        if new is not None:
            writer.write([
                page_info.id,
                page_info.namespace,
                page_info.title,
                new.revision.id,
                new.revision.timestamp,
                new.status,
                source,
            ])
def run(self):

    def _process_dump(dump, path):
        try:
            for page in dump:
                logger.debug("Constructing new processor for {0}:{1}"
                             .format(page.namespace, page.title))

                processor_status = self.store.processor_status.get(
                    page.id, type=self.engine.Processor.Status)
                if processor_status is None:
                    processor_status = self.engine.Processor.Status(page.id)

                processor = self.engine.processor(processor_status)

                for rev in page:
                    if rev.id <= processor_status.last_rev_id:
                        logger.debug(
                            "Skipping revision (already processed) " +
                            "{0}:{1}".format(rev.id, rev.timestamp))
                        continue

                    try:
                        user = User(rev.contributor.id,
                                    rev.contributor.user_text)
                        delta = processor.process(rev.id, rev.timestamp,
                                                  rev.text)
                        revision = Revision(rev.id, rev.timestamp, page.id,
                                            user, delta)
                        yield (revision, None)
                    except RevisionOrderError as e:
                        logger.error(traceback.format_exc())
                        logger.info("Skipping revision (out of order) " +
                                    "{0}:{1}".format(rev.id, rev.timestamp))

                logger.debug("Finished processing page {0}:{1}"
                             .format(page.namespace, page.title))
                yield (processor.status, page.title)

            logger.debug("Finished processing dump at {0}".format(path))
            yield (path, None)
        except Exception as e:
            logger.error(traceback.format_exc())
            raise

    engine_status = self.store.engine_status.get(type=self.engine.Status)
    if engine_status is None:
        logger.info("Starting {0} from scratch.".format(self.engine.info()))
        engine_status = self.engine.Status(self.engine.info())

    max_rev_id = 0
    max_timestamp = Timestamp(0)

    if len(self.paths) == 1:
        dump = Iterator.from_file(open_file(self.paths[0]))
        rev_proc_or_paths = _process_dump(dump, self.paths[0])
    else:
        rev_proc_or_paths = map(self.paths, _process_dump, **self.map_kwargs)

    try:
        for rev_proc_or_path, meta in rev_proc_or_paths:
            if isinstance(rev_proc_or_path, Revision):
                revision = rev_proc_or_path

                self.store.revisions.store(revision)
                self.status.stats['revisions_processed'] += 1

                max_rev_id = max(revision.rev_id, max_rev_id)
                max_timestamp = max(revision.timestamp, max_timestamp)
            elif isinstance(rev_proc_or_path, ProcessorStatus):
                processor_status = rev_proc_or_path
                page_title = meta

                logger.debug("Completed processing page " +
                             "{0}. {1}".format(page_title,
                                               processor_status.stats))
                self.store.processor_status.store(processor_status)
            elif isinstance(rev_proc_or_path, str):
                path = rev_proc_or_path
                logger.info("Completed processing dump {0}".format(path))
            else:
                raise RuntimeError("Did not expect a " +
                                   "{0}".format(type(rev_proc_or_path)))

        self.status.update(max_rev_id, max_timestamp)
        self.store.engine_status.store(engine_status)
    except Exception as e:
        logger.error(traceback.format_exc())
        raise
# Fragment from page_info(dump, path): the body of its `for page in dump`
# loop, where `lines` holds the lines of the page's text (defined earlier
# in the loop, not shown).
        line_starts = defaultdict(int)

        if lines[0].startswith('#REDIRECT'):
            line_starts['total'] = 9999
            # yield page.title, line_starts
            continue

        for line in lines:
            for char in CHARS:
                if line.startswith(char):
                    line_starts[char] += 1

        line_starts['total'] = len(lines)
        yield page.title, line_starts


outfile = open('linestarts.txt', 'w')
outfile.write('\t'.join(['page_title', '\t'.join(CHARS), 'total', '\n']))

for page_title, line_starts in xml_dump.map(files, page_info):
    print("\t".join([page_title, str(line_starts)]))
    outfile.write(page_title + '\t')
    for char in CHARS + ['total']:
        outfile.write(str(line_starts[char]) + '\t')
    outfile.write('\n')

outfile.close()

etime = datetime.datetime.now()
print(etime)
print('took ', (etime - stime))
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import xml_dump

files = ["examples/dump.xml", "examples/dump2.xml"]


def page_info(dump, path):
    for page in dump:
        yield page.id, page.namespace, page.title


for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
    print("\t".join([str(page_id), str(page_namespace), page_title]))
""" Processes two dump files. """ from mw import xml_dump files = ["examples/dump.xml", "examples/dump2.xml"] def page_info(dump, path): for page in dump: yield page.id, page.namespace, page.title for page_id, page_namespace, page_title in xml_dump.map(files, page_info): print("\t".join([str(page_id), str(page_namespace), page_title]))
# Fragment from page_info(dump, path): the body of its `for page in dump`
# loop; cite_dois and journal_dois are presumably initialised earlier in
# the loop (not shown).
        revisions = list(page)
        if len(revisions) != 1:
            raise ValueError
        else:
            latest_revision = revisions[0]
            wikicode = mwparserfromhell.parse(latest_revision.text)
            for template in wikicode.filter_templates():
                if template.name.lower().replace('_', ' ') == 'cite doi':
                    for param in template.params:
                        cite_dois.append(param)
            yield (page.title, page.id,
                   {'cite_dois': cite_dois, 'journal_dois': journal_dois})


outfile = open('doi_list.txt', 'w')

for page_title, page_id, doi_dict in xml_dump.map(files, page_info):
    print(' pageid', page_id, ' page title ', page_title, ' doi_dict', doi_dict)
    # if int(page_id) > 10000:
    #     break
    if doi_dict['cite_dois']:
        for doi in doi_dict['cite_dois']:
            outfile.write(str(page_title) + '\t' + str(doi) + '\n')
    if doi_dict['journal_dois']:
        for doi in doi_dict['journal_dois']:
            outfile.write('cite journal ---- ' + str(doi) + '\n')

outfile.close()

etime = datetime.datetime.now()
print(etime)
print('took ', (etime - stime))