def processTalkPages(file_name): """ DESCRIPTION :Parameters: NAME : TYPE DESCRIPTIOIN :Return: DESCRIPTION """ talk_pages = {} # Access the file dumpIterator = dump.Iterator(file_name) # Iterate over pages of the dump for page in dumpIterator.readPages(): # Iterate over revisions of the article. for revision in page.readRevisions(): contributor_name = revision.getContributor().getUsername() if (contributor_name in talk_pages.keys()): talk_pages[contributor_name].append(revision.getTimestamp()) else: talk_pages.update({contributor_name : [revision.getTimestamp()]}) return talk_pages
def processTalkPages(file_name):
    talk_pages = {}

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over pages of the dump.
    for page in dumpIterator.readPages():
        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            contributor_name = revision.getContributor().getUsername().encode(
                "utf-8")
            if (contributor_name in talk_pages.keys()):
                talk_pages[contributor_name].append(revision.getTimestamp())
            else:
                talk_pages.update(
                    {contributor_name: [revision.getTimestamp()]})

    return talk_pages
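# Hedged usage sketch (not part of the original code): build the contributor
# -> timestamps map for a talk-page dump and print per-contributor revision
# counts. The file name below is a placeholder; `dump` is the same iterator
# module used by the functions above.
if __name__ == '__main__':
    talk_pages = processTalkPages("talk-page-history.xml")
    for contributor in sorted(talk_pages.keys()):
        print contributor, len(talk_pages[contributor]), "revisions"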
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Containers of processed revisions and their order (used and returned below).
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False
            #print "processing rev", revision.getId()

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) /
                     float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print "SPAM", revision.getId()
                #print revision.getText()
                #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have a contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contributor_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr, relation)

                if (not vandalism):
                    #print "NOT SPAM", revision.getId()

                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))

                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    #print "---------------------------- FLAG 2"
                    #print "SPAM", revision.getId()
                    #print revision.getText()
                    #print

                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
def getTagDatesFromPage(file_name):
    # Compile regexps.
    reglist = list()
    reglist.append({
        "tagname": "maintained",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)maintained((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname": "good article",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)good article((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)featured article((\||=)(?:(?!\}\}).)*|)\}\}', re.IGNORECASE)})
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\|currentstatus=FA', re.IGNORECASE)})
    reglist.append({
        "tagname": "npov",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)(pov|npov)((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname": "disputed",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)disputed((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    re_user = re.compile('({{|\[\[)user.*?[:|](.*?)[}/\]|]', re.IGNORECASE)
    #"({{|\[\[)user[\s\S]*?[:|]([\s\S]*?)[}/\]|]"

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    listOfTagChanges = {}
    all_contributors = {
        "maintained": {},
        "good article": {},
        "featured article": {},
        "npov": {},
        "disputed": {}
    }

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        # Iterate over revisions of the article.
        i = 0
        prev_matched = list()
        for revision in page.readRevisions():
            revision.wikipedia_id = int(revision.getId())
            revision.timestamp = revision.getTimestamp()

            # Some revisions don't have a contributor.
            if (revision.getContributor() != None):
                revision.contributor_id = revision.getContributor().getId()
                revision.contributor_name = revision.getContributor().getUsername()
            else:
                revision.contributor_id = 'Not Available'
                revision.contributor_name = 'Not Available'

            text_curr = revision.getText()
            if (text_curr):
                text_curr = text_curr.encode('utf-8')
                text_curr = text_curr.lower()
            else:
                continue

            matched = list()
            aux = list()
            for regexp in reglist:
                m = regexp["regexp"].search(text_curr)
                if m:
                    mc = re_user.split(m.group(0))
                    i = 2
                    users = []
                    users.append(mc[2])
                    while (i + 3 < len(mc)):
                        #print m[i+3]
                        users.append(mc[i + 3])
                        i = i + 3
                    #print regexp["tagname"], contributor
                    #print regexp["tagname"], users
                    matched.append(regexp["tagname"])
                    aux.append((regexp["tagname"], users))
                    ##
                    #m_user = re_user.search(m.group(0))
                    #contributor = m_user.group(2)

            if "|currentstatus=FA" in text_curr:
                matched.append("featured article")
                aux.append(("featured article", revision.contributor_name))

            # Calculate additions.
            for (match, contributor) in aux:
                if not (match in prev_matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev": revision.wikipedia_id,
                        "type": "addition",
                        "tagname": match,
                        "wikiname": revision.contributor_name,
                        "timestamp": revision.timestamp,
                        "date": datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })
                    all_contributors[match].update({
                        revision.timestamp: {
                            "rev": revision.wikipedia_id,
                            "user": contributor,
                            "date": datetime.datetime.fromtimestamp(int(
                                revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                        }
                    })

            # Calculate removals.
            for match in prev_matched:
                if not (match in matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev": revision.wikipedia_id,
                        "type": "removal",
                        "tagname": match,
                        "wikiname": revision.contributor_name,
                        "timestamp": revision.timestamp,
                        "date": datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })

            prev_matched = matched

    return listOfTagChanges, all_contributors
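# Hedged usage sketch (not part of the original code): list tag additions and
# removals in chronological order. The dump file name is a placeholder; the
# keys used below ("date", "type", "tagname", "wikiname") are the ones built
# by getTagDatesFromPage() above.
if __name__ == '__main__':
    tag_changes, tag_contributors = getTagDatesFromPage("talk-page-history.xml")
    for timestamp in sorted(tag_changes.keys()):
        for change in tag_changes[timestamp]:
            print change["date"], change["type"], change["tagname"], change["wikiname"]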
if path.endswith('.bz2'):
    import bz2
    source = bz2.BZ2File(path)
elif path.endswith('.gz'):
    import gzip
    source = gzip.open(path)
elif path.endswith('.7z'):
    import subprocess
    source = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % path,
                              shell=True,
                              stdout=subprocess.PIPE,
                              bufsize=65535).stdout
else:
    # Assume it's an uncompressed XML file.
    source = open(path)

dumpIterator = dump.Iterator(source)
t1 = time.time()
cpages = 0
crevs = 0
totalrevs = 0
for page in dumpIterator.readPages():
    #page.getId(), page.getTitle(), page.readRevisions()
    #rev.getId(), rev.getParentId(), rev.getTimestamp(), rev.getContributor(), rev.getMinor(), rev.getComment(), rev.getText(), rev.getSha1()
    pagetitle = page.getTitle()
    if skip:
        if pagetitle == skip:
            skip = ''
        else:
            continue
    if ':' in pagetitle:
def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) /
                     float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    print "VANDALISM: CHANGE PERCENTAGE"
                    vandalism = True

            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print revision.getId()
                #print revision.getText()
                #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have a contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})

                    # Update the fake revision id.
                    i = i + 1

                    # Update the index of processed revisions.
                    revision_order.append((revision_curr.wikipedia_id, False))
                else:
                    #print "detected vandalism in here ...................................."
                    #print "---------------------------- FLAG 2"
                    #print revision.getId()
                    #print revision.getText()
                    #print

                    spam.append(revision.getSha1())
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
            else:
                #print revision.getText()
                #print

                spam.append(revision.getSha1())
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev

    return (revisions, revision_order)
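# Hedged usage sketch (not part of the original module): run analyseArticle on
# a single-article dump and summarise the result. The file name is a
# placeholder, and the module-level globals it relies on (spam, FLAG,
# PREVIOUS_LENGTH, CURR_LENGTH, CHANGE_PERCENTAGE) plus determineAuthorship()
# are assumed to be defined as in the surrounding code.
if __name__ == '__main__':
    revisions, revision_order = analyseArticle("article-history.xml")
    print len(revisions), "revisions kept,", len(revision_order), "revisions seen"
    for wikipedia_id, flagged_as_vandalism in revision_order:
        if flagged_as_vandalism:
            print "flagged as vandalism:", wikipedia_id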
def run(self):
    threading.Thread.run(self)
    print "Starting thread number", self.threadnumber, "for " + self.lang
    file = BZ2File(self.filepath)
    dumpiterator = dump.Iterator(file)
    templates = self.tempparams.keys()
    rows = []
    i = 0
    talkpages = 0
    totalrows = 0
    for page in dumpiterator.readPages():
        if page.getNamespace() == 1:
            talkpages += 1
            for revision in page.readRevisions():
                text = mwparserfromhell.parse(revision.getText())
                pagetemplates = text.filter_templates()
                for pagetemplate in pagetemplates:
                    parameters = None
                    paramvals = None
                    if pagetemplate.name.lower() in templates:
                        parameters = self.tempparams[pagetemplate.name.lower()].keys()
                        paramvals = self.tempparams[pagetemplate.name.lower()]
                    elif '*' in templates:
                        parameters = self.tempparams["*"].keys()
                        paramvals = self.tempparams["*"]
                    if parameters is not None and paramvals is not None:
                        for pagetemplateparameter in pagetemplate.params:
                            if pagetemplateparameter.name.lower() in parameters:
                                #print pagetemplates
                                #print pagetemplate.name.lower()
                                #print pagetemplate
                                #print pagetemplateparameter
                                #print parameters
                                #print paramvals
                                if pagetemplate.has(pagetemplateparameter.name, True):
                                    #print pagetemplate.get(pagetemplateparameter.name).value.lower()
                                    #print paramvals[pagetemplateparameter.name.lower()]
                                    value = pagetemplate.get(pagetemplateparameter.name).value.lower()
                                    if value in paramvals[pagetemplateparameter.name.lower()]:
                                        row = [
                                            self.lang,
                                            page.getTitle(),
                                            page.getId(),
                                            value
                                        ]
                                        rows.append(row)
                                        totalrows += 1
        i += 1
        if (i % 50000 == 0):
            print i, "pages (", talkpages, "talk pages) for " + self.lang + " with", totalrows, "added."
        if len(rows) > 1000:
            self.writeoutput(rows, lang)
            rows = []
    self.writeoutput(rows, lang)
def main():
    actualidad_r = re.compile(
        ur"(?im)(\{\{\s*(Actual|Actualidad|Actualidad[ _]deporte|Current|EA|Evento[ _]actual|Launching|Muerte[ _]reciente|Sencillo[ _]actual|Single[ _]actual|Telenovela[ _]en[ _]emisión|Teleserie[ _]en[ _]emisión)\s*[\|\}]([^\}]*?)\}\}?)"
    )
    filename = "eswiki-20140113-pages-meta-history4.xml.7z"
    fp = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % filename,
                          shell=True,
                          stdout=subprocess.PIPE,
                          bufsize=65535)
    dumpIterator = dump.Iterator(fp.stdout)
    hoy = datetime.datetime.today()
    actualidad = {}
    pagecount = 0

    # Write the CSV header.
    f = open('actualidad.csv', 'w')
    f.write(
        u'page_title|it_rev_id|it_rev_timestamp|it_rev_username|event_type|it_rev_comment|rt_rev_id|rt_rev_timestamp|rt_rev_username|rt_rev_comment|template_time\n'
    )
    f.close()

    for page in dumpIterator.readPages():
        if page.getNamespace() not in [0, 104]:  # main namespace and annexes
            continue
        pagecount += 1
        if pagecount % 1000 == 0:
            print pagecount
        if pagecount > 50000:
            fp.kill()
            break
        plantillapuesta = False
        for rev in page.readRevisions():
            revtext = rev.getText()
            if revtext:
                if re.search(actualidad_r, revtext):
                    if plantillapuesta:
                        pass  # the template is still in place
                    else:
                        # Someone has just added the template.
                        plantillapuesta = util.timestamp2WP(rev.getTimestamp())
                        if not actualidad.has_key(page.getTitle()):
                            actualidad[page.getTitle()] = []
                        tipo = re.findall(actualidad_r, revtext)[0][2] and \
                            re.findall(actualidad_r, revtext)[0][2] or 'Sin especificar'
                        actualidad[page.getTitle()] = [[
                            u'%s' % rev.getId(),
                            util.timestamp2WP(rev.getTimestamp()),
                            rev.getContributor().getUsername(),
                            tipo,
                            rev.getComment() and rev.getComment() or u"",
                            u"", u"", u"", u"", u""
                        ]]
                        #https://es.wikipedia.org/w/index.php?oldid=%s&diff=prev
                else:
                    if plantillapuesta:
                        # The template has just been removed.
                        actualidad[page.getTitle()][-1][-1] = u"%s" % calculartiempo(
                            plantillapuesta, util.timestamp2WP(rev.getTimestamp()))
                        plantillapuesta = False
                        actualidad[page.getTitle()][-1][-2] = rev.getComment() and rev.getComment() or u""
                        actualidad[page.getTitle()][-1][-3] = rev.getContributor().getUsername()
                        actualidad[page.getTitle()][-1][-4] = util.timestamp2WP(rev.getTimestamp())
                        actualidad[page.getTitle()][-1][-5] = u'%s' % rev.getId()
                        print page.getTitle(), actualidad[page.getTitle()][-1]

        if plantillapuesta:
            # The template is still in place as of today.
            actualidad[page.getTitle()][-1][-1] = u"%s" % calculartiempo(
                plantillapuesta, hoy.strftime('%Y%m%d000000'))
            print page.getTitle(), actualidad[page.getTitle()][-1]

    f = csv.writer(open('actualidad.csv', 'a'),
                   delimiter='|',
                   quotechar='"',
                   quoting=csv.QUOTE_MINIMAL)
    for page_title, props in actualidad.items():
        for props2 in props:
            f.writerow([page_title.encode('utf-8')] +
                       [i.encode('utf-8') for i in props2])
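# Hedged entry-point sketch (not part of the original script): run the whole
# extraction when the file is executed directly. Assumes the modules used
# above (re, subprocess, datetime, csv, dump, util) are imported elsewhere in
# the script.
if __name__ == '__main__':
    main()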