def main():
    args = docopt.docopt(__doc__)

    session = api.Session(args['--api'])
    title_parser = title.Parser.from_api(session)
    threads = int(args['--threads'] or cpu_count())

    def process_dump(dump, path):
        # Yield one (from_id, to_ns, to_title) row for every link found in
        # every revision of every page in the dump.
        for page in dump:
            for revision in page:
                try:
                    links = set(extract_and_parse_inlinks(revision.text,
                                                          title_parser))
                    for ns, title in links:
                        yield page.id, ns, title

                except Exception:
                    sys.stderr.write(traceback.format_exc())

    print("from_id\tto_ns\tto_title")

    link_infos = xml_dump.map(args['<dump_path>'],
                              process_dump,
                              threads=threads)

    for from_id, to_ns, to_title in link_infos:
        print('{0}\t{1}\t{2}'.format(from_id, to_ns, encode(to_title)))
Example #2
def extract_pages(output_path):
    articles_dump_fn = os.path.expanduser(public_domain_rank.config['data']['articles'])
    print("number of pageids of interest: {}".format(len(_PAGEIDS_OF_INTEREST)))
    counter = 0
    pageid_title_timestamp_length = []
    for page_info_tuple in xml_dump.map([articles_dump_fn], page_info):
        # an empty tuple indicates a page that is not of interest
        if not page_info_tuple:
            continue
        pageid, namespace, title, timestamp, text = page_info_tuple
        article_length = len(text)
        pageid_title_timestamp_length.append((pageid, title, timestamp, article_length))
        text_fn = os.path.join(output_path, '{}.txt'.format(pageid))
        with open(text_fn, 'w') as f:
            f.write(text)
        counter += 1
        if counter % 10000 == 0:
            print("extracted {} pages".format(counter))
    pageid_title_output_fn = os.path.join(output_path, 'pageid-title-timestamp-length.csv')
    with open(pageid_title_output_fn, 'w') as f:
        # write header
        f.write('pageid\ttitle\trevision_timestamp\tarticle_length\n')
        for pageid, title, timestamp, article_length in pageid_title_timestamp_length:
            f.write('{}\t{}\t{}\t{}\n'.format(pageid, title, timestamp, article_length))
    print("finished extracting. extracted {} pages".format(counter))
def extract(dump_files, extractors=ALL_EXTRACTORS):
    """
    Extracts cites from a set of `dump_files`.
    
    :Parameters:
        dump_files : str | `file`
            A set of MediaWiki XML dump files
            (expects: pages-meta-history)
        extractors : `list`(`extractor`)
            A list of extractors to apply to the text
    
    :Returns:
        `iterable` -- a generator of extracted cites
    
    """
    # Dump processor function
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue

            for cite in extract_cite_history(page, extractors):
                yield cite
        
    # Map call
    return xml_dump.map(dump_files, process_dump)
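
A minimal usage sketch of extract(), assuming ALL_EXTRACTORS is importable alongside the snippet above; the dump path below is only a hypothetical placeholder:

if __name__ == "__main__":
    # Hypothetical path: any pages-meta-history dump accepted by
    # xml_dump.map (plain or compressed XML) should work here.
    dump_paths = ["dumps/enwiki-pages-meta-history.xml.bz2"]
    for cite in extract(dump_paths, extractors=ALL_EXTRACTORS):
        print(cite)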
Example #4
def run(dump_files):
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0: continue

            last_references = set()
            for revision in page:

                references = set(extract(revision.text or ""))

                # Set differences against the previous revision's references
                # show what this edit added and what it removed.
                references_added = references - last_references
                references_removed = last_references - references

                if len(references_added) > 0 or len(references_removed) > 0:
                    if revision.contributor:
                        user_id = revision.contributor.id
                        user_text = revision.contributor.user_text
                    else:
                        user_id = 0
                        user_text = None

                    yield (revision.id, revision.timestamp, user_id, user_text,
                           page.id, page.title, list(references_added),
                           list(references_removed))

                last_references = references

    print("\t".join([
        "rev_id", "rev_timestamp", "user_id", "user_text", "page_id",
        "page_title", "references_added", "references_removed"
    ]))

    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
def create_mappings(input_dump_files):
    print('extracting contributors from {}'.format(input_dump_files))   # TODO: ensure output is flushed immediately
    contributors = xml_dump.map(input_dump_files, process_dump=process_dump, threads=4)
    contributors = list(contributors)
    print('found {} contributions'.format(len(contributors)))
    contributors = list(set(contributors))  # remove duplicates
    print('found {} contributors'.format(len(contributors)))
    contributors.sort()
    return contributors
Example #6
def run(dump_files, threads, verbose):
    
    if len(dump_files) == 0:
        revision_docs = dump2json(xml_dump.Iterator.from_file(sys.stdin),
                                  verbose=verbose)
        
    else:
        revision_docs = xml_dump.map(dump_files,
                                     lambda d, p: dump2json(d, verbose=verbose),
                                     threads=threads)
    
        
    for revision_doc in revision_docs:
        json.dump(revision_doc, sys.stdout)
        sys.stdout.write("\n")
Example #7
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0: continue

            for revision in page:
                for reference in set(extract(revision.text or "")):

                    yield page.id, page.title, revision.id, reference


    print("\t".join(["page_id", "page_title", "rev_id", "reference"]))

    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
Example #8
def run(dump_files):
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0: continue

            for revision in page:
                for reference in set(extract(revision.text or "")):

                    yield (page.id, page.title, revision.id,
                           revision.timestamp, reference)

    print("\t".join(
        ["page_id", "page_title", "rev_id", "rev_timestamp", "reference"]))

    for vals in xml_dump.map(dump_files, process_dump):
        print("\t".join(tsv_encode(val) for val in vals))
Example #9
def run(dump_files, diff_engine, threads, drop_text, verbose):

    if len(dump_files) == 0:
        revision_docs = dump2diffs(xml_dump.Iterator.from_file(sys.stdin),
                                   diff_engine, verbose=verbose)

    else:
        dump_processor = lambda d, p: dump2diffs(d, diff_engine,
                                                 verbose=verbose)
        revision_docs = xml_dump.map(dump_files, dump_processor,
                                     threads=threads)


    for revision_doc in revision_docs:
        if drop_text:
            del revision_doc['text']

        json.dump(revision_doc, sys.stdout)
        sys.stdout.write("\n")
Example #10
def run(dump_files, diff_engine, threads, drop_text, verbose):

    if len(dump_files) == 0:
        revision_docs = dump2diffs(xml_dump.Iterator.from_file(sys.stdin),
                                   diff_engine,
                                   verbose=verbose)

    else:
        dump_processor = lambda d, p: dump2diffs(
            d, diff_engine, verbose=verbose)
        revision_docs = xml_dump.map(dump_files,
                                     dump_processor,
                                     threads=threads)

    for revision_doc in revision_docs:
        if drop_text:
            del revision_doc['text']

        json.dump(revision_doc, sys.stdout)
        sys.stdout.write("\n")
Example #11
def run(page_ids, namespace_titles, dump_paths):
    def process_dump(dump, path):

        for page in dump:
            page_title = title.normalize(page.title)  # Converts " " to "_"

            # Try to match the current page to our mappings
            page_info = None
            source = None
            if page.id in page_ids:
                page_info = page_ids[page.id]
                source = "id match"
            elif (page.namespace, page_title) in namespace_titles:
                page_info = namespace_titles[(page.namespace, page_title)]
                source = "namespace/title match"
            elif page.namespace == 1 and (0, page_title) in namespace_titles:
                page_info = namespace_titles[(0, page_title)]
                source = "talk page"

            if page_info is not None:
                changes = templates.detect_changes(Revision(r.id, r.timestamp, r.text or "") for r in page)

                for current, new in changes:
                    yield page_info, current, new, source

    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for page_info, old, new, source in xml_dump.map(dump_paths, process_dump):

        if new is not None:
            writer.write(
                [
                    page_info.id,
                    page_info.namespace,
                    page_info.title,
                    new.revision.id,
                    new.revision.timestamp,
                    new.status,
                    source,
                ]
            )
Example #12
 def run(self):
     
     def _process_dump(dump, path):
         try:
             for page in dump:
                 logger.debug("Constructing new processor for {0}:{1}"\
                              .format(page.namespace, page.title))
                 
                 processor_status = self.store.processor_status.get(page.id,
                                           type=self.engine.Processor.Status)
                 
                 if processor_status is None:
                     processor_status = self.engine.Processor.Status(page.id)
                 
                 processor = self.engine.processor(processor_status)
                 
                 for rev in page:
                     if rev.id <= processor_status.last_rev_id:
                         
                         logger.debug(
                                 "Skipping revision (already processed) " +\
                                 "{0}:{1}".format(rev.id, rev.timestamp))
                         continue
                     try:
                         user = User(rev.contributor.id,
                                     rev.contributor.user_text)
                         delta = processor.process(rev.id, rev.timestamp,
                                                   rev.text)
                         revision = Revision(rev.id, rev.timestamp, page.id,
                                             user, delta)
                         yield (revision, None)
                     except RevisionOrderError as e:
                         logger.error(traceback.format_exc())
                         logger.info("Skipping revision (out of order) " + \
                                     "{0}:{1}".format(rev.id, rev.timestamp))
                 
                 logger.debug("Finished processing page {0}:{1}"\
                              .format(page.namespace, page.title))
                 
                 yield (processor.status, page.title)
             
             logger.debug("Finished processing dump at {0}".format(path))
             yield (path, None)
         
         
         except Exception as e:
             logger.error(traceback.format_exc())
             raise
     
     engine_status = self.store.engine_status.get(type=self.engine.Status)
     if engine_status is None:
         logger.info("Starting {0} from scratch.".format(self.engine.info()))
         engine_status = self.engine.Status(self.engine.info())
     
     max_rev_id = 0
     max_timestamp = Timestamp(0)
     
     if len(self.paths) == 1:
         dump = Iterator.from_file(open_file(self.paths[0]))
         rev_proc_or_paths = _process_dump(dump, self.paths[0])
     else:
         rev_proc_or_paths = map(self.paths, _process_dump,
                                 **self.map_kwargs)
     
     try:
         for rev_proc_or_path, meta in rev_proc_or_paths:
             
             if isinstance(rev_proc_or_path, Revision):
                 revision = rev_proc_or_path
                 
                 self.store.revisions.store(revision)
                 self.status.stats['revisions_processed'] += 1
                 
                 max_rev_id = max(revision.rev_id, max_rev_id)
                 max_timestamp = max(revision.timestamp, max_timestamp)
                 
             elif isinstance(rev_proc_or_path, ProcessorStatus):
                 processor_status = rev_proc_or_path
                 page_title = meta
                     
                 logger.debug("Completed processing page " + \
                              "{0}. {1}".format(
                                      page_title,
                                      processor_status.stats))
                 
                 self.store.processor_status.store(processor_status)
                 
                 
             elif isinstance(rev_proc_or_path, str):
                 path = rev_proc_or_path
                 
                 logger.info("Completed processing dump {0}".format(path))
                 
             else:
                 raise RuntimeError(
                         "Did not expect a " + \
                         "{0}".format(type(rev_proc_or_path)))
             
             
         
         self.status.update(max_rev_id, max_timestamp)
         
         self.store.engine_status.store(engine_status)
     
     except Exception as e:
         logger.error(traceback.format_exc())
         raise
Example #13
            line_starts = defaultdict(int)
            if lines[0].startswith('#REDIRECT'):
                line_starts['total'] = 9999
                #yield page.title, line_starts
                continue

            for line in lines:
                for char in CHARS:
                    if line.startswith(char):
                        line_starts[char] += 1
            
            line_starts['total'] = len(lines)
        yield page.title, line_starts

outfile = open('linestarts.txt', 'w')

outfile.write('\t'.join(['page_title', '\t'.join(CHARS), 'total', '\n']))

for page_title, line_starts in xml_dump.map(files, page_info):
    print("\t".join([page_title, str(line_starts)]))
    outfile.write(page_title+'\t')
    for char in CHARS+['total']:
        outfile.write(str(line_starts[char]) + '\t')
    outfile.write('\n')


outfile.close()
etime = datetime.datetime.now()
print(etime)
print('took ', (etime - stime))
Example #14
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import xml_dump

files = ["examples/dump.xml", "examples/dump2.xml"]

def page_info(dump, path):
	for page in dump:
		
		yield page.id, page.namespace, page.title
		

for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
	print("\t".join([str(page_id), str(page_namespace), page_title]))

Example #15
"""
Processes two dump files.
"""
from mw import xml_dump

files = ["examples/dump.xml", "examples/dump2.xml"]


def page_info(dump, path):
    for page in dump:
        yield page.id, page.namespace, page.title


for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
    print("\t".join([str(page_id), str(page_namespace), page_title]))
Example #16
                revisions = list(page)
                if len(revisions) != 1:
                    raise ValueError
                else:
                    latest_revision = revisions[0]
                    wikicode = mwparserfromhell.parse(latest_revision.text)
                    for template in wikicode.filter_templates():
                        if template.name.lower().replace('_',' ') == 'cite doi':
                            for param in template.params:
                                cite_dois.append(param)
            yield (page.title, page.id,
                   {'cite_dois': cite_dois, 'journal_dois': journal_dois})
         

outfile = open('doi_list.txt', 'w')

for page_title, page_id, doi_dict in xml_dump.map(files, page_info):
    print(' pageid', page_id, ' page title ', page_title, ' doi_dict', doi_dict)
    #if int(page_id) > 10000:
    #    break
    if doi_dict['cite_dois']:
        for doi in doi_dict['cite_dois']:
            outfile.write(str(page_title) + '\t' + str(doi) + '\n')
    if doi_dict['journal_dois']:
        for doi in doi_dict['journal_dois']:
            outfile.write('cite journal ---- ' + str(doi) + '\n')

outfile.close()

etime = datetime.datetime.now()
print(etime)
print('took ', (etime - stime))
Example #17
                    latest_revision = revisions[0]
                    wikicode = mwparserfromhell.parse(latest_revision.text)
                    for template in wikicode.filter_templates():
                        if template.name.lower().replace('_',
                                                         ' ') == 'cite doi':
                            for param in template.params:
                                cite_dois.append(param)
            yield (page.title, page.id, {
                'cite_dois': cite_dois,
                'journal_dois': journal_dois
            })


outfile = open('doi_list.txt', 'w')

for page_title, page_id, doi_dict in xml_dump.map(files, page_info):
    print(' pageid', page_id, ' page title ', page_title, ' doi_dict',
          doi_dict)
    #if int(page_id) > 10000:
    #    break
    if doi_dict['cite_dois']:
        for doi in doi_dict['cite_dois']:
            outfile.write(str(page_title) + '\t' + str(doi) + '\n')
    if doi_dict['journal_dois']:
        for doi in doi_dict['journal_dois']:
            outfile.write('cite journal ---- ' + str(doi) + '\n')

outfile.close()

etime = datetime.datetime.now()
print(etime)