def run(similar_titles, session):
    """Write one TSV row per similar-title record, with a lead-text excerpt.

    Records are grouped by ``input_title``.  For each group the latest
    revision content of the input title is requested through *session*,
    stripped of wiki markup and truncated to 200 characters (followed by
    ``"..."``).  That excerpt is repeated on every row of the group.

    :Parameters:
        similar_titles : iterable
            Records exposing ``input_title``, ``similar_title``, ``rank``
            and ``snippet`` attributes.
        session
            Object exposing ``revisions.query(titles=..., properties=...,
            limit=...)`` whose results are indexable by ``'*'``.

    Progress is reported on stderr: one "." per group and a final newline.
    """
    # Compiled once here instead of once per written row.
    fragment_re = re.compile(r"\#.*")

    grouped_similar_titles = aggregate(similar_titles,
                                       by=lambda r: r.input_title)
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for input_title, rows in grouped_similar_titles:
        try:
            revisions = session.revisions.query(titles={input_title},
                                                properties={'content'},
                                                limit=1)
            content = list(revisions)[0]['*']
        except Exception:
            # Best effort: any failure to fetch the content (including an
            # empty result) is logged and the excerpt is left empty.
            content = ""
            sys.stderr.write(traceback.format_exc())

        parsed = mwparserfromhell.parse(content)
        lead_bit = "".join(str(v) for v in parsed.strip_code())[:200] + "..."

        sys.stderr.write(".")
        sys.stderr.flush()

        for row in rows:
            writer.write([
                # Underscores become spaces, then everything from the first
                # "#" onwards is removed.
                fragment_re.sub("", row.input_title.replace("_", " ")),
                lead_bit,
                row.similar_title.replace("_", " "),
                row.rank,
                row.snippet
            ])

    sys.stderr.write("\n")
    sys.stderr.flush()
def main():
    """Append a per-group row index to every row of a TSV read from stdin.

    The input's first line is treated as the header.  Rows are grouped by
    their ``'event_experimentId'`` value, and each row is written to stdout
    with its zero-based position inside its group appended as a final
    column.  A "." is written to stderr for every 100th group, followed by
    a newline once all input has been processed.
    """
    reader = tsv.Reader(sys.stdin, headers=tsv.Reader.FIRST_LINE)
    writer = tsv.Writer(sys.stdout)

    experiments = aggregate(reader, by=lambda row: row['event_experimentId'])
    for group_number, (_, user_rows) in enumerate(experiments):
        # Progress marker on every 100th group.
        if group_number % 100 == 0:
            sys.stderr.write(".")

        for position, row in enumerate(user_rows):
            writer.write(row.values() + [position])

    sys.stderr.write("\n")
def run(similar_titles, n):
    """Write up to *n* randomly chosen records per input title as TSV.

    Records are grouped by their ``input_title`` attribute.  Each group is
    loaded into a list, shuffled in place with ``random.shuffle`` and its
    first *n* entries are written to stdout.

    :Parameters:
        similar_titles : iterable
            Records exposing ``input_title`` and a ``values()`` method.
        n : int
            Maximum number of records written per group.
    """
    grouped_similar_titles = aggregate(similar_titles,
                                       by=lambda r: r.input_title)
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for _, group in grouped_similar_titles:
        candidates = list(group)
        random.shuffle(candidates)
        for chosen in candidates[:n]:
            writer.write(chosen.values())
def run(revs, radius, cutoff):
    """Annotate revisions with "reverted" and "archived" columns as TSV.

    Revisions are grouped by their ``wiki`` attribute and one database
    connection is opened per wiki.  Every input row is written to stdout
    with two extra columns:

    * ``reverted`` -- True/False according to whether
      ``reverts.database.check_row`` returned a revert, or None when the
      lookup raised ``KeyError``.
    * ``archived`` -- True when the lookup raised ``KeyError``, else False.

    :Parameters:
        revs : iterable
            Rows exposing ``wiki`` and ``rev_id`` attributes, ``keys()``
            and conversion via ``dict(row)``.
        radius
            Passed through to ``reverts.database.check_row``.
        cutoff
            Added to the revision's timestamp to form the ``before`` limit
            passed to ``reverts.database.check_row``.

    Progress goes to stderr: "Conn(<wiki>): " per wiki, then one character
    per revision ("r" reverted, "." not reverted, "a" lookup failed) and a
    newline at the end of each wiki.
    """
    extra_columns = ["reverted", "archived"]
    writer = None

    for wiki, wiki_revs in aggregate(revs, by=lambda r: r.wiki):
        sys.stderr.write("Conn({0}): ".format(wiki))
        db = DB.from_params(
            host="analytics-store.eqiad.wmnet",
            user="******",
            read_default_file="~/.my.research.cnf",
            db=wiki
        )

        for rev in wiki_revs:
            columns = rev.keys() + extra_columns
            if writer is None:
                # Headers are taken from the first row that is seen.
                writer = tsv.Writer(sys.stdout, headers=columns)

            rev_doc = dict(rev)
            try:
                rev_row = db.revisions.get(int(rev.rev_id))
                revert = reverts.database.check_row(
                    db, rev_row,
                    radius=radius,
                    before=Timestamp(rev_row["rev_timestamp"]) + cutoff
                )
            except KeyError:
                # This branch reports "a" on stderr, so the row is flagged
                # as archived.  The original stored False here, which made
                # the "archived" column constant.
                # NOTE(review): confirm KeyError only means "revision not
                # in the live table" for db.revisions.get / check_row.
                rev_doc["archived"] = True
                rev_doc["reverted"] = None
                marker = "a"
            else:
                rev_doc["archived"] = False
                rev_doc["reverted"] = revert is not None
                marker = "r" if revert is not None else "."

            sys.stderr.write(marker)
            sys.stderr.flush()

            writer.write([rev_doc[k] for k in columns])

        sys.stderr.write("\n")
        sys.stderr.flush()
def run(wiki_editor_months, active_edits=5):
    """Write monthly active-editor statistics per wiki as TSV.

    Input records are grouped by ``wiki`` and, within each wiki, by
    ``month``.  An editor counts as active in a month when ``user_id`` is
    non-zero and ``revisions`` (None treated as 0) is at least
    *active_edits*.  Each active editor is placed in exactly one class of
    the current month:

    * new         -- registration compares greater than the month key and
                     ``attached_method`` is not ``'login'``
    * surviving   -- was in the previous month's "new" class
    * old         -- was active in the previous month
    * reactivated -- any other active editor

    One row is written per (wiki, month): wiki, month, total active, new,
    surviving, surviving / previous new, old, old / (previous total -
    previous new), reactivated, inactivated, inactivated / previous total,
    and the number of editors active for the first time.  A ratio is None
    when its denominator is not positive.

    :Parameters:
        wiki_editor_months : iterable
            Records exposing ``wiki``, ``month``, ``user_id``,
            ``user_registration``, ``attached_method`` and ``revisions``.
        active_edits : int
            Minimum revisions in a month for an editor to count as active.
    """
    def ratio(numerator, denominator):
        # None stands in for an undefined proportion.
        return numerator / denominator if denominator > 0 else None

    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for wiki, editor_months in aggregate(wiki_editor_months,
                                         by=lambda em: em.wiki):
        # mae[0] is the month being filled, mae[1] the month before it.
        mae = deque([MonthlyActiveEditors() for _ in range(3)], maxlen=3)
        previously_active = set()

        for month, editors in aggregate(editor_months,
                                        by=lambda em: em.month):
            sys.stderr.write("{0}, {1}\n".format(wiki, month))

            current, previous = mae[0], mae[1]
            first_actives = 0

            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0

                # Rows with user_id 0 and editors below the activity
                # threshold are not counted.
                if user_id == 0 or revisions < active_edits:
                    continue

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # NOTE(review): month[:4] + month[4:] reassembles `month`
                # unchanged; kept as written -- confirm whether a suffix
                # was meant to be inserted.
                if (user_registration is not None and
                        user_registration > (month[:4] + month[4:]) and
                        attached_method != 'login'):
                    current.new.add(user_id)
                elif user_id in previous.new:
                    current.surviving.add(user_id)
                elif user_id in previous:
                    current.old.add(user_id)
                else:
                    current.reactivated.add(user_id)

            inactivated = len(previous - current)
            previous_total = len(previous)
            previous_new = len(previous.new)

            writer.write([
                wiki,
                month,
                len(current),
                len(current.new),
                len(current.surviving),
                ratio(len(current.surviving), previous_new),
                len(current.old),
                ratio(len(current.old), previous_total - previous_new),
                len(current.reactivated),
                inactivated,
                ratio(inactivated, previous_total),
                first_actives
            ])

            # Start a fresh "current" month; the one just written becomes
            # mae[1] for the next iteration.
            mae.appendleft(MonthlyActiveEditors())
def run(wiki_editor_months, active_edits=5):
    """Emit per-wiki, per-month active editor counts and rates as TSV.

    Records are grouped first by ``wiki`` and then by ``month``.  Editors
    with a non-zero ``user_id`` and at least *active_edits* revisions in
    the month (a missing revision count is treated as 0) are classified as
    new, surviving, old or reactivated relative to the previous month of
    the same wiki, and one summary row per (wiki, month) is written to
    stdout.  The wiki and month being processed are logged to stderr.

    Row layout: wiki, month, active, new, surviving, surviving rate, old,
    old rate, reactivated, inactivated, inactivated rate, first-time
    actives.  A rate is None when its denominator is not positive.

    :Parameters:
        wiki_editor_months : iterable
            Records exposing ``wiki``, ``month``, ``user_id``,
            ``user_registration``, ``attached_method`` and ``revisions``.
        active_edits : int
            Activity threshold in revisions per month.
    """
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    by_wiki = aggregate(wiki_editor_months, by=lambda em: em.wiki)
    for wiki, editor_months in by_wiki:
        # Index 0 holds the month in progress, index 1 the preceding one.
        mae = deque([
            MonthlyActiveEditors(),
            MonthlyActiveEditors(),
            MonthlyActiveEditors()
        ], maxlen=3)
        previously_active = set()

        by_month = aggregate(editor_months, by=lambda em: em.month)
        for month, editors in by_month:
            sys.stderr.write("{0}, {1}\n".format(wiki, month))

            this_month = mae[0]
            last_month = mae[1]
            first_actives = 0

            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0

                if user_id == 0:
                    continue  # user_id 0 is never counted
                if revisions < active_edits:
                    continue  # below the activity threshold

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # NOTE(review): month[:4] + month[4:] is just `month`
                # rebuilt; left as written -- confirm the intended
                # comparison value.
                registered_this_month = (
                    user_registration is not None and
                    user_registration > (month[:4] + month[4:])
                )

                if registered_this_month and attached_method != 'login':
                    this_month.new.add(user_id)
                elif user_id in last_month.new:
                    this_month.surviving.add(user_id)
                elif user_id in last_month:
                    this_month.old.add(user_id)
                else:
                    this_month.reactivated.add(user_id)

            inactivated = len(last_month - this_month)
            last_new = len(last_month.new)
            last_total = len(last_month)
            last_established = last_total - last_new

            surviving_rate = None
            if last_new > 0:
                surviving_rate = len(this_month.surviving) / last_new

            old_rate = None
            if last_established > 0:
                old_rate = len(this_month.old) / last_established

            inactivated_rate = None
            if last_total > 0:
                inactivated_rate = inactivated / last_total

            writer.write([
                wiki,
                month,
                len(this_month),
                len(this_month.new),
                len(this_month.surviving),
                surviving_rate,
                len(this_month.old),
                old_rate,
                len(this_month.reactivated),
                inactivated,
                inactivated_rate,
                first_actives
            ])

            # Rotate: push an empty month to the front so the month just
            # written becomes the "previous" one.
            mae.appendleft(MonthlyActiveEditors())