def processTalkPages(file_name): """ DESCRIPTION :Parameters: NAME : TYPE DESCRIPTIOIN :Return: DESCRIPTION """ talk_pages = {} # Access the file dumpIterator = dump.Iterator(file_name) # Iterate over pages of the dump for page in dumpIterator.readPages(): # Iterate over revisions of the article. for revision in page.readRevisions(): contributor_name = revision.getContributor().getUsername() if (contributor_name in talk_pages.keys()): talk_pages[contributor_name].append(revision.getTimestamp()) else: talk_pages.update({contributor_name : [revision.getTimestamp()]}) return talk_pages
def processTalkPages(file_name):
    talk_pages = {}

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over pages of the dump.
    for page in dumpIterator.readPages():
        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            contributor_name = revision.getContributor().getUsername().encode(
                "utf-8")
            if (contributor_name in talk_pages.keys()):
                talk_pages[contributor_name].append(revision.getTimestamp())
            else:
                talk_pages.update(
                    {contributor_name: [revision.getTimestamp()]})

    return talk_pages
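# Hedged usage sketch (not part of the original code): build the contributor
# -> timestamps map for a talk-page dump and print per-contributor revision
# counts. The file name below is a placeholder; `dump` is the same iterator
# module used by the functions above.
if __name__ == '__main__':
    talk_pages = processTalkPages("talk-page-history.xml")
    for contributor in sorted(talk_pages.keys()):
        print contributor, len(talk_pages[contributor]), "revisions"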
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Containers of processed revisions and their order (used and returned below).
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False
            #print "processing rev", revision.getId()

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) /
                     float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print "SPAM", revision.getId()
                #print revision.getText()
                #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have a contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contributor_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr, relation)

                if (not vandalism):
                    #print "NOT SPAM", revision.getId()

                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))

                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    #print "---------------------------- FLAG 2"
                    #print "SPAM", revision.getId()
                    #print revision.getText()
                    #print

                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
def getTagDatesFromPage(file_name):
    # Compile regexps.
    reglist = list()
    reglist.append({
        "tagname": "maintained",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)maintained((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname": "good article",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)good article((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)featured article((\||=)(?:(?!\}\}).)*|)\}\}', re.IGNORECASE)})
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\|currentstatus=FA', re.IGNORECASE)})
    reglist.append({
        "tagname": "npov",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)(pov|npov)((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname": "disputed",
        "regexp": re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)disputed((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    re_user = re.compile('({{|\[\[)user.*?[:|](.*?)[}/\]|]', re.IGNORECASE)
    #"({{|\[\[)user[\s\S]*?[:|]([\s\S]*?)[}/\]|]"

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    listOfTagChanges = {}
    all_contributors = {
        "maintained": {},
        "good article": {},
        "featured article": {},
        "npov": {},
        "disputed": {}
    }

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        # Iterate over revisions of the article.
        i = 0
        prev_matched = list()
        for revision in page.readRevisions():
            revision.wikipedia_id = int(revision.getId())
            revision.timestamp = revision.getTimestamp()

            # Some revisions don't have a contributor.
            if (revision.getContributor() != None):
                revision.contributor_id = revision.getContributor().getId()
                revision.contributor_name = revision.getContributor().getUsername()
            else:
                revision.contributor_id = 'Not Available'
                revision.contributor_name = 'Not Available'

            text_curr = revision.getText()
            if (text_curr):
                text_curr = text_curr.encode('utf-8')
                text_curr = text_curr.lower()
            else:
                continue

            matched = list()
            aux = list()
            for regexp in reglist:
                m = regexp["regexp"].search(text_curr)
                if m:
                    mc = re_user.split(m.group(0))
                    i = 2
                    users = []
                    users.append(mc[2])
                    while (i + 3 < len(mc)):
                        #print m[i+3]
                        users.append(mc[i + 3])
                        i = i + 3
                    #print regexp["tagname"], contributor
                    #print regexp["tagname"], users
                    matched.append(regexp["tagname"])
                    aux.append((regexp["tagname"], users))
                    ##
                    #m_user = re_user.search(m.group(0))
                    #contributor = m_user.group(2)

            if "|currentstatus=FA" in text_curr:
                matched.append("featured article")
                aux.append(("featured article", revision.contributor_name))

            # Calculate additions.
            for (match, contributor) in aux:
                if not (match in prev_matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev": revision.wikipedia_id,
                        "type": "addition",
                        "tagname": match,
                        "wikiname": revision.contributor_name,
                        "timestamp": revision.timestamp,
                        "date": datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })
                    all_contributors[match].update({
                        revision.timestamp: {
                            "rev": revision.wikipedia_id,
                            "user": contributor,
                            "date": datetime.datetime.fromtimestamp(int(
                                revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                        }
                    })

            # Calculate removals.
            for match in prev_matched:
                if not (match in matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev": revision.wikipedia_id,
                        "type": "removal",
                        "tagname": match,
                        "wikiname": revision.contributor_name,
                        "timestamp": revision.timestamp,
                        "date": datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })

            prev_matched = matched

    return listOfTagChanges, all_contributors
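# Hedged usage sketch (not part of the original code): list tag additions and
# removals in chronological order. The dump file name is a placeholder; the
# keys used below ("date", "type", "tagname", "wikiname") are the ones built
# by getTagDatesFromPage() above.
if __name__ == '__main__':
    tag_changes, tag_contributors = getTagDatesFromPage("talk-page-history.xml")
    for timestamp in sorted(tag_changes.keys()):
        for change in tag_changes[timestamp]:
            print change["date"], change["type"], change["tagname"], change["wikiname"]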
if path.endswith('.bz2'):
    import bz2
    source = bz2.BZ2File(path)
elif path.endswith('.gz'):
    import gzip
    source = gzip.open(path)
elif path.endswith('.7z'):
    import subprocess
    source = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % path,
                              shell=True,
                              stdout=subprocess.PIPE,
                              bufsize=65535).stdout
else:
    # Assume it's an uncompressed XML file.
    source = open(path)

dumpIterator = dump.Iterator(source)
t1 = time.time()
cpages = 0
crevs = 0
totalrevs = 0
for page in dumpIterator.readPages():
    #page.getId(), page.getTitle(), page.readRevisions()
    #rev.getId(), rev.getParentId(), rev.getTimestamp(), rev.getContributor(), rev.getMinor(), rev.getComment(), rev.getText(), rev.getSha1()
    pagetitle = page.getTitle()
    if skip:
        if pagetitle == skip:
            skip = ''
        else:
            continue
    if ':' in pagetitle:
def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) /
                     float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    print "VANDALISM: CHANGE PERCENTAGE"
                    vandalism = True

            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print revision.getId()
                #print revision.getText()
                #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have a contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})

                    # Update the fake revision id.
                    i = i + 1

                    # Update the index of processed revisions.
                    revision_order.append((revision_curr.wikipedia_id, False))
                else:
                    #print "detected vandalism in here ...................................."
                    #print "---------------------------- FLAG 2"
                    #print revision.getId()
                    #print revision.getText()
                    #print

                    spam.append(revision.getSha1())
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
            else:
                #print revision.getText()
                #print

                spam.append(revision.getSha1())
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev

    return (revisions, revision_order)
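# Hedged usage sketch (not part of the original module): run analyseArticle on
# a single-article dump and summarise the result. The file name is a
# placeholder, and the module-level globals it relies on (spam, FLAG,
# PREVIOUS_LENGTH, CURR_LENGTH, CHANGE_PERCENTAGE) plus determineAuthorship()
# are assumed to be defined as in the surrounding code.
if __name__ == '__main__':
    revisions, revision_order = analyseArticle("article-history.xml")
    print len(revisions), "revisions kept,", len(revision_order), "revisions seen"
    for wikipedia_id, flagged_as_vandalism in revision_order:
        if flagged_as_vandalism:
            print "flagged as vandalism:", wikipedia_id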
def run(self):
    threading.Thread.run(self)
    print "Starting thread number", self.threadnumber, "for " + self.lang
    file = BZ2File(self.filepath)
    dumpiterator = dump.Iterator(file)
    templates = self.tempparams.keys()
    rows = []
    i = 0
    talkpages = 0
    totalrows = 0
    for page in dumpiterator.readPages():
        if page.getNamespace() == 1:
            talkpages += 1
            for revision in page.readRevisions():
                text = mwparserfromhell.parse(revision.getText())
                pagetemplates = text.filter_templates()
                for pagetemplate in pagetemplates:
                    parameters = None
                    paramvals = None
                    if pagetemplate.name.lower() in templates:
                        parameters = self.tempparams[pagetemplate.name.lower()].keys()
                        paramvals = self.tempparams[pagetemplate.name.lower()]
                    elif '*' in templates:
                        parameters = self.tempparams["*"].keys()
                        paramvals = self.tempparams["*"]
                    if parameters is not None and paramvals is not None:
                        for pagetemplateparameter in pagetemplate.params:
                            if pagetemplateparameter.name.lower() in parameters:
                                #print pagetemplates
                                #print pagetemplate.name.lower()
                                #print pagetemplate
                                #print pagetemplateparameter
                                #print parameters
                                #print paramvals
                                if pagetemplate.has(pagetemplateparameter.name, True):
                                    #print pagetemplate.get(pagetemplateparameter.name).value.lower()
                                    #print paramvals[pagetemplateparameter.name.lower()]
                                    value = pagetemplate.get(pagetemplateparameter.name).value.lower()
                                    if value in paramvals[pagetemplateparameter.name.lower()]:
                                        row = [
                                            self.lang,
                                            page.getTitle(),
                                            page.getId(),
                                            value
                                        ]
                                        rows.append(row)
                                        totalrows += 1
        i += 1
        if (i % 50000 == 0):
            print i, "pages (", talkpages, "talk pages) for " + self.lang + " with", totalrows, "added."
        if len(rows) > 1000:
            self.writeoutput(rows, lang)
            rows = []
    self.writeoutput(rows, lang)
def main():
    actualidad_r = re.compile(
        ur"(?im)(\{\{\s*(Actual|Actualidad|Actualidad[ _]deporte|Current|EA|Evento[ _]actual|Launching|Muerte[ _]reciente|Sencillo[ _]actual|Single[ _]actual|Telenovela[ _]en[ _]emisión|Teleserie[ _]en[ _]emisión)\s*[\|\}]([^\}]*?)\}\}?)"
    )
    filename = "eswiki-20140113-pages-meta-history4.xml.7z"
    fp = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % filename,
                          shell=True,
                          stdout=subprocess.PIPE,
                          bufsize=65535)
    dumpIterator = dump.Iterator(fp.stdout)
    hoy = datetime.datetime.today()
    actualidad = {}
    pagecount = 0

    # Write the CSV header.
    f = open('actualidad.csv', 'w')
    f.write(
        u'page_title|it_rev_id|it_rev_timestamp|it_rev_username|event_type|it_rev_comment|rt_rev_id|rt_rev_timestamp|rt_rev_username|rt_rev_comment|template_time\n'
    )
    f.close()

    for page in dumpIterator.readPages():
        if page.getNamespace() not in [0, 104]:  # main namespace and annexes
            continue
        pagecount += 1
        if pagecount % 1000 == 0:
            print pagecount
        if pagecount > 50000:
            fp.kill()
            break
        plantillapuesta = False
        for rev in page.readRevisions():
            revtext = rev.getText()
            if revtext:
                if re.search(actualidad_r, revtext):
                    if plantillapuesta:
                        pass  # the template is still in place
                    else:
                        # Someone has just added the template.
                        plantillapuesta = util.timestamp2WP(rev.getTimestamp())
                        if not actualidad.has_key(page.getTitle()):
                            actualidad[page.getTitle()] = []
                        tipo = re.findall(actualidad_r, revtext)[0][2] and \
                            re.findall(actualidad_r, revtext)[0][2] or 'Sin especificar'
                        actualidad[page.getTitle()] = [[
                            u'%s' % rev.getId(),
                            util.timestamp2WP(rev.getTimestamp()),
                            rev.getContributor().getUsername(),
                            tipo,
                            rev.getComment() and rev.getComment() or u"",
                            u"", u"", u"", u"", u""
                        ]]
                        #https://es.wikipedia.org/w/index.php?oldid=%s&diff=prev
                else:
                    if plantillapuesta:
                        # The template has just been removed.
                        actualidad[page.getTitle()][-1][-1] = u"%s" % calculartiempo(
                            plantillapuesta, util.timestamp2WP(rev.getTimestamp()))
                        plantillapuesta = False
                        actualidad[page.getTitle()][-1][-2] = rev.getComment() and rev.getComment() or u""
                        actualidad[page.getTitle()][-1][-3] = rev.getContributor().getUsername()
                        actualidad[page.getTitle()][-1][-4] = util.timestamp2WP(rev.getTimestamp())
                        actualidad[page.getTitle()][-1][-5] = u'%s' % rev.getId()
                        print page.getTitle(), actualidad[page.getTitle()][-1]

        if plantillapuesta:
            # The template is still in place as of today.
            actualidad[page.getTitle()][-1][-1] = u"%s" % calculartiempo(
                plantillapuesta, hoy.strftime('%Y%m%d000000'))
            print page.getTitle(), actualidad[page.getTitle()][-1]

    f = csv.writer(open('actualidad.csv', 'a'),
                   delimiter='|',
                   quotechar='"',
                   quoting=csv.QUOTE_MINIMAL)
    for page_title, props in actualidad.items():
        for props2 in props:
            f.writerow([page_title.encode('utf-8')] +
                       [i.encode('utf-8') for i in props2])
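# Hedged entry-point sketch (not part of the original script): run the whole
# extraction when the file is executed directly. Assumes the modules used
# above (re, subprocess, datetime, csv, dump, util) are imported elsewhere in
# the script.
if __name__ == '__main__':
    main()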