def on_process(self, data_frames):
    data = []
    columns = ['page_id', 'user_name', 'contribution_similarity']
    esa = EsaProvider(self.data_dir)
    revs_df = data_frames[0]
    assert isinstance(revs_df, pd.DataFrame)
    with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                             port=config.get_int('MONGO', 'port')) as client:
        db = client.get_database(config.get('MONGO', 'database'))
        collection = db.get_collection('pages')
        persistence_db = client.get_database(
            config.get('MONGO', 'persistence_database'))
        persistence_collection = persistence_db.get_collection(
            config.get('MONGO', 'persistence_collection'))
        grouped = revs_df.groupby(by=['page_id', 'user_name'])
        for (page_id, user_name), group in grouped:
            if is_bot(user_name):
                continue
            # Collect the persistence record of every revision this user
            # made on the page.
            user_persists = {}
            for index, row in group.iterrows():
                rev_id = row['revision_id']
                rev_pers = self.get_persistence(
                    rev_id=rev_id, collection=persistence_collection)
                if isinstance(rev_pers, RevisionPersistence):
                    if user_name not in user_persists:
                        user_persists[user_name] = []
                    user_persists[user_name].append(rev_pers)
            if user_name not in user_persists:
                contribution_similarity = 0.0
            else:
                # Concatenate the tokens that persisted from the user's
                # revisions and compare them against the page summary.
                all_content_tokens = []
                for rev_pers in user_persists[user_name]:
                    if isinstance(rev_pers, RevisionPersistence):
                        all_content_tokens.extend(rev_pers.content_tokens)
                if len(all_content_tokens) == 0:
                    contribution_similarity = 0.0
                else:
                    page_json = collection.find_one(
                        filter={'_id': int(page_id)})
                    if page_json is None:
                        continue
                    summary = str(page_json['text'][:300])
                    contribution_similarity = esa.get_semantic_similarity(
                        summary, ' '.join(all_content_tokens))
            data.append([page_id, user_name, contribution_similarity])
    return pd.DataFrame(data=data, columns=columns)
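# `EsaProvider.get_semantic_similarity` is project-specific (Explicit
# Semantic Analysis). As a rough, hypothetical stand-in for readers without
# the ESA artifacts (not the implementation used above), a TF-IDF cosine
# similarity captures the same "how related are these two texts" idea:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tfidf_similarity(text_a, text_b):
    """Cosine similarity of TF-IDF vectors; in [0, 1] for tf-idf weights."""
    vectors = TfidfVectorizer().fit_transform([text_a, text_b])
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])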
def get_revisions(self, site, page_id, talk_page_id, bots):
    data = []
    limit = 50000
    rv_continue = None
    total_revisions = 0
    while True:
        json_data = self.__get_revisions(site, talk_page_id, rv_continue)
        pages = json_data['query']['pages']
        cont_dictionary = json_data.get('continue')
        if str(talk_page_id) in pages:
            page = pages[str(talk_page_id)]
            # Guard instead of `continue`: `continue` would skip the
            # pagination check below and re-request the same batch forever
            # whenever a page has no revisions.
            if 'revisions' in page:
                for rev in page['revisions']:
                    total_revisions += 1
                    # Skip revisions whose author is hidden or suppressed.
                    if 'suppressed' in rev or 'userhidden' in rev:
                        continue
                    user_id = parse_string(rev['userid'])
                    user_name = parse_string(rev['user'])
                    size = parse_int(rev['size'])
                    rev_id = parse_int(rev['revid'])
                    timestamp = None
                    if 'timestamp' in rev:
                        timestamp = parse_timestamp(rev['timestamp'])
                    if is_bot(user_name):
                        continue
                    data.append([
                        page_id, talk_page_id, rev_id, user_id, user_name,
                        size, timestamp
                    ])
        # Follow the API's continuation token until the page is exhausted
        # or the revision cap is reached.
        if cont_dictionary and total_revisions < limit:
            rv_continue = str(cont_dictionary['rvcontinue'])
        else:
            break
    return pd.DataFrame(data=data,
                        columns=[
                            'page_id', 'talk_page_id', 'revision_id',
                            'user_id', 'user_name', 'size', 'timestamp'
                        ])
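# The private `__get_revisions` helper is not shown. A minimal sketch of
# what it presumably does, written against the public MediaWiki action API
# with `requests` (the original goes through the `site` object; the
# endpoint URL and exact parameter choices here are assumptions):
import requests


def fetch_revisions(page_id, rv_continue=None,
                    endpoint='https://en.wikipedia.org/w/api.php'):
    """Fetch one batch of revisions for a page, honoring `rvcontinue`."""
    params = {
        'action': 'query',
        'prop': 'revisions',
        'pageids': page_id,
        'rvprop': 'ids|user|userid|size|timestamp',
        'rvlimit': 'max',
        'format': 'json',
    }
    if rv_continue is not None:
        params['rvcontinue'] = rv_continue
    return requests.get(endpoint, params=params).json()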
def on_process(self, data_frames):
    revs_df = data_frames[0]
    assert isinstance(revs_df, pd.DataFrame)
    top10_pages_df = data_frames[1]
    assert isinstance(top10_pages_df, pd.DataFrame)
    data = []
    columns = ['page_id', 'user_name', 'links_overlap']
    with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                             port=config.get_int('MONGO', 'port')) as client:
        db = client.get_database(config.get('MONGO', 'database'))
        collection = db.get_collection('page_links')
        grouped = revs_df.groupby(by=['page_id', 'user_name'])
        for (page_id, user_name), group in grouped:
            if is_bot(user_name):
                continue
            # find() always returns a cursor (possibly empty), so no None
            # check is needed.
            cursor = collection.find(filter={'pl_from': int(page_id)})
            links = [entry['pl_title'] for entry in cursor]
            # Compare this page's outgoing links against those of the
            # user's other top-10 pages.
            top10_page_ids = list(top10_pages_df[
                top10_pages_df['user_name'] == user_name]['page_id'])
            top10_page_ids = list(
                filter(lambda x: x != page_id, top10_page_ids))
            links_overlaps = []
            for k_page_id in top10_page_ids:
                cursor = collection.find(
                    filter={'pl_from': int(k_page_id)})
                k_links = [entry['pl_title'] for entry in cursor]
                # Jaccard index of the two link sets.
                if len(links) > 0 or len(k_links) > 0:
                    links_overlap = (
                        float(len(set(links).intersection(k_links))) /
                        len(set(links).union(k_links)))
                else:
                    links_overlap = 0.0
                links_overlaps.append(links_overlap)
            mean_links_overlap = np.mean(
                links_overlaps) if len(links_overlaps) > 0 else 0.0
            data.append([page_id, user_name, mean_links_overlap])
    return pd.DataFrame(data=data, columns=columns)
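# The overlap above is a plain Jaccard index over link sets. Factored out
# for clarity (a sketch, not part of the original module):
def jaccard(a, b):
    """Jaccard similarity of two iterables; 0.0 when both are empty."""
    a, b = set(a), set(b)
    union = a | b
    return len(a & b) / len(union) if union else 0.0

# jaccard(['Paris', 'Lyon'], ['Paris', 'Nice'])  ->  1/3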
def on_process(self, data_frames):
    # Normalize tenure by the time elapsed since Wikipedia's launch
    # (2001-01-15), so values fall in [0, 1].
    wiki_launch_date = pytz.utc.localize(
        datetime(year=2001, month=1, day=15))
    current_date = pytz.utc.localize(datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0))
    normalization_factor = (current_date -
                            wiki_launch_date).total_seconds()
    revs_df = data_frames[0]
    data = []
    columns = ['user_name', 'tenure']
    if isinstance(revs_df, pd.DataFrame):
        user_names = revs_df['user_name'].unique()
        with pymongo.MongoClient(
                host=config.get('MONGO', 'host'),
                port=config.get_int('MONGO', 'port')) as client:
            db = client.get_database(config.get('MONGO', 'database'))
            collection = db.get_collection(
                config.get('MONGO', 'collection'))
            for user_name in user_names:
                if is_bot(user_name):
                    continue
                first_edit, last_edit = self.aggregate(
                    collection=collection, user_name=user_name)
                if first_edit is None or last_edit is None:
                    continue
                tenure = (last_edit - first_edit
                          ).total_seconds() / normalization_factor
                data.append([user_name, tenure])
                logging.debug('Username: {}\tTenure: {}'.format(
                    user_name, tenure))
    tenure_df = pd.DataFrame(data=data, columns=columns)
    # Broadcast the per-user tenure back onto (page_id, user_name) pairs.
    data = []
    cols = ['page_id', 'user_name', 'tenure']
    df = revs_df.merge(tenure_df, how='left', on='user_name')[cols]
    for (page_id, user_name), group in df.groupby(by=['page_id',
                                                      'user_name']):
        data.append([page_id, user_name, group.iloc[0]['tenure']])
    return pd.DataFrame(data=data, columns=cols)
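# A self-contained illustration of the tenure formula above (the dates are
# made up): a user's active span divided by Wikipedia's total age.
from datetime import datetime
import pytz

launch = pytz.utc.localize(datetime(2001, 1, 15))
first = pytz.utc.localize(datetime(2006, 1, 1))
last = pytz.utc.localize(datetime(2016, 1, 1))
now = pytz.utc.localize(datetime(2021, 1, 1))
tenure = (last - first).total_seconds() / (now - launch).total_seconds()
# ~10 years of activity over ~20 years of Wikipedia -> tenure ~= 0.5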
def on_process(self, data_frames):
    host = config.get('MONGO', 'host')
    port = config.get_int('MONGO', 'port')
    database = config.get('MONGO', 'database')
    # Renamed from `collection` to avoid shadowing the collection object
    # fetched below.
    collection_name = config.get('MONGO', 'collection')
    revs_df = data_frames[0]
    data = []
    columns = ['user_name', 'total_edited_pages']
    if isinstance(revs_df, pd.DataFrame):
        user_names = revs_df['user_name'].unique()
        with pymongo.MongoClient(host=host, port=port) as client:
            db = client.get_database(database)
            collection = db.get_collection(collection_name)
            for user_name in user_names:
                if is_bot(user_name):
                    continue
                total_edited_pages = self.aggregate(collection=collection,
                                                    user_name=user_name)
                if total_edited_pages is None:
                    continue
                data.append([user_name, total_edited_pages])
                logging.debug(
                    'Username: {}\tTotal edited pages: {}'.format(
                        user_name, total_edited_pages))
    df = pd.DataFrame(data=data, columns=columns)
    # normalization_factor = df['total_edited_pages'].max()
    # df['total_edited_pages'] = df['total_edited_pages'].apply(
    #     lambda x: float(x) / normalization_factor)
    # Broadcast the per-user count back onto (page_id, user_name) pairs.
    data = []
    cols = ['page_id', 'user_name', 'total_edited_pages']
    df = revs_df.merge(df, how='left', on='user_name')[cols]
    for (page_id, user_name), group in df.groupby(by=['page_id',
                                                      'user_name']):
        data.append(
            [page_id, user_name, group.iloc[0]['total_edited_pages']])
    return pd.DataFrame(data=data, columns=cols)
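# `self.aggregate` is not shown. A hypothetical pipeline counting the
# distinct pages a user has edited; the field names `user_name` and
# `page_id` are assumptions about the revision collection's schema:
def count_edited_pages(collection, user_name):
    pipeline = [
        {'$match': {'user_name': user_name}},
        {'$group': {'_id': None, 'pages': {'$addToSet': '$page_id'}}},
        {'$project': {'total': {'$size': '$pages'}}},
    ]
    result = list(collection.aggregate(pipeline))
    return result[0]['total'] if result else None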
def on_process(self, data_frames):
    revs_df = data_frames[0]
    data = []
    columns = ['user_name', 'gender']
    if isinstance(revs_df, pd.DataFrame):
        user_names = list(revs_df['user_name'].unique())
        user_names = list(filter(lambda x: not is_bot(x), user_names))
        for user in user_names:
            resp_users = site.users(users=[str(user)], prop=['gender'])
            for resp_user in resp_users:
                # Encode gender as male -> 1.0, female -> 0.0,
                # missing/unknown -> -1.0.
                if 'missing' in resp_user or 'gender' not in resp_user:
                    gender = -1.
                    data.append([user, gender])
                else:
                    gender = 1. if resp_user['gender'] == 'male' else (
                        -1. if resp_user['gender'] is None else 0.)
                    data.append([user, gender])
                    logging.debug('Username: {}\tGender: {}'.format(
                        user, gender))
                # Only a single user was queried, so only the first
                # response entry matters.
                break
    gender_df = pd.DataFrame(data=data, columns=columns)
    data = []
    cols = ['page_id', 'user_name', 'gender']
    df = revs_df.merge(gender_df, how='left', on='user_name')[cols]
    for (page_id, user_name), group in df.groupby(by=['page_id',
                                                      'user_name']):
        data.append([page_id, user_name, group.iloc[0]['gender']])
    return pd.DataFrame(data=data, columns=cols)
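# `site.users(...)` matches mwclient's user-info call. The raw MediaWiki
# API equivalent (list=users, usprop=gender) looks like this; the endpoint
# URL is an assumption:
import requests


def fetch_gender(user_name, endpoint='https://en.wikipedia.org/w/api.php'):
    """Return 'male', 'female', 'unknown', or None for a missing user."""
    params = {'action': 'query', 'list': 'users', 'ususers': user_name,
              'usprop': 'gender', 'format': 'json'}
    users = requests.get(endpoint, params=params).json()['query']['users']
    return users[0].get('gender') if users else None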
def run(self):
    revs_df = pd.read_hdf(self.input()[0].path, mode='r')
    assert isinstance(revs_df, pd.DataFrame)
    user_names = list(revs_df['user_name'].unique())
    user_names = list(filter(lambda x: not is_bot(x), user_names))
    with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                             port=config.get_int('MONGO', 'port')) as client:
        db = client.get_database(config.get('MONGO', 'database'))
        collection = db.get_collection(config.get('MONGO', 'collection'))
        data = []
        columns = ['page_id', 'user_name']
        # Collect, for every non-bot user, the pages they edited.
        for user_name in user_names:
            page_ids = self.aggregate(collection, user_name)
            for page_id in page_ids:
                data.append([page_id, user_name])
        pages_df = pd.DataFrame(data=data, columns=columns)
        pages_df.to_hdf(os.path.join(self.data_dir, self.file_name),
                        key='df',
                        mode='w')
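# `self.aggregate` is not shown; given that downstream features consume a
# `top10_pages_df`, one plausible pipeline returns the pages a user edited
# most, e.g. the ten with the highest revision counts (schema assumed):
def top_pages(collection, user_name, k=10):
    pipeline = [
        {'$match': {'user_name': user_name}},
        {'$group': {'_id': '$page_id', 'edits': {'$sum': 1}}},
        {'$sort': {'edits': -1}},
        {'$limit': k},
    ]
    return [doc['_id'] for doc in collection.aggregate(pipeline)]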
def on_process(self, data_frames):
    revs_df = data_frames[0]
    data = []
    # One feature per namespace 0..15.
    ns_columns = ['ns{}_edit_dist'.format(ns) for ns in range(16)]
    columns = ['user_name'] + ns_columns
    if isinstance(revs_df, pd.DataFrame):
        user_names = revs_df['user_name'].unique()
        with pymongo.MongoClient(
                host=config.get('MONGO', 'host'),
                port=config.get_int('MONGO', 'port')) as client:
            db = client.get_database(config.get('MONGO', 'database'))
            collection = db.get_collection(
                config.get('MONGO', 'collection'))
            for user_name in user_names:
                if is_bot(user_name):
                    continue
                namespaces = self.aggregate(collection=collection,
                                            user_name=user_name)
                data.append([user_name] + namespaces)
                logging.debug('Username: {}\tNamespaces: {}'.format(
                    user_name, namespaces))
    ns_df = pd.DataFrame(data=data, columns=columns)
    # Broadcast per-user namespace features onto (page_id, user_name)
    # pairs.
    data = []
    cols = ['page_id', 'user_name'] + ns_columns
    df = revs_df.merge(ns_df, how='left', on='user_name')[cols]
    for (page_id, user_name), group in df.groupby(by=['page_id',
                                                      'user_name']):
        data.append([page_id, user_name] +
                    list(group.iloc[0][ns_columns]))
    return pd.DataFrame(data=data, columns=cols)
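# `self.aggregate` here must return one value per namespace 0..15. The
# column names suggest an edit distribution; a sketch of such a pipeline,
# with the field name `namespace` assumed:
def namespace_distribution(collection, user_name, n_namespaces=16):
    pipeline = [
        {'$match': {'user_name': user_name}},
        {'$group': {'_id': '$namespace', 'edits': {'$sum': 1}}},
    ]
    counts = {doc['_id']: doc['edits']
              for doc in collection.aggregate(pipeline)}
    total = sum(counts.values()) or 1
    return [counts.get(ns, 0) / total for ns in range(n_namespaces)]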
def on_process(self, data_frames):
    data = []
    columns = [
        'page_id', 'user_name', 'title_similarity', 'summary_similarity'
    ]
    esa = EsaProvider(self.data_dir)
    revs_df = data_frames[0]
    assert isinstance(revs_df, pd.DataFrame)
    top10_pages_df = data_frames[1]
    assert isinstance(top10_pages_df, pd.DataFrame)
    with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                             port=config.get_int('MONGO', 'port')) as client:
        db = client.get_database(config.get('MONGO', 'database'))
        collection = db.get_collection('pages')
        grouped = revs_df.groupby(by=['page_id', 'user_name'])
        for (page_id, user_name), group in grouped:
            if is_bot(user_name):
                continue
            # Compare this page against the user's other top-10 pages.
            top10_page_ids = list(top10_pages_df[
                top10_pages_df['user_name'] == user_name]['page_id'])
            top10_page_ids = list(
                filter(lambda x: x != page_id, top10_page_ids))
            page_json = collection.find_one(filter={'_id': int(page_id)})
            if page_json is None:
                continue
            title = str(page_json['title'])
            text = str(page_json['text'][:1000])
            title_similarity_scores = []
            summary_similarity_scores = []
            for k_page_id in top10_page_ids:
                k_page_json = collection.find_one(
                    filter={'_id': int(k_page_id)})
                if k_page_json is None:
                    continue
                if 'title' in k_page_json:
                    k_title = str(k_page_json['title'])
                    title_sim = esa.get_semantic_similarity(title, k_title)
                    title_similarity_scores.append(
                        0.0 if title_sim is None else title_sim)
                if 'text' in k_page_json:
                    k_text = str(k_page_json['text'][:1000])
                    text_sim = esa.get_semantic_similarity(text,
                                                           k_text,
                                                           long_text=True)
                    summary_similarity_scores.append(
                        0.0 if text_sim is None else text_sim)
            # Average the pairwise similarities; default to 0.0 when no
            # comparison pages were found.
            f_title_similarity = np.mean(title_similarity_scores) if len(
                title_similarity_scores) > 0 else 0.0
            f_summary_similarity = np.mean(
                summary_similarity_scores) if len(
                    summary_similarity_scores) > 0 else 0.0
            data.append([
                page_id, user_name, f_title_similarity,
                f_summary_similarity
            ])
    return pd.DataFrame(data=data, columns=columns)