Example #1
def analyze_all_user_pages_globally():
    collection = solr.all_user_pages_collection()
    new_docs = {}
    for doc in solr.get_all_docs_by_query(collection, '*:*', fields="id,doc_authority_f,contribs_f"):
        # doc ids are formatted wiki-id_user-id; the trailing segment is the user id
        user_id = doc['id'].split('_').pop()
        if user_id in new_docs:
            new_docs[user_id]['total_authority_f']['set'] += doc['doc_authority_f']
            new_docs[user_id]['total_contribs_f']['set'] += doc['contribs_f']
        else:
            new_docs[user_id] = {
                'id': user_id,
                'total_authority_f': {'set': doc['doc_authority_f']},
                'total_contribs_f': {'set': doc['contribs_f']}
            }

    authorities, contribs = [], []
    for doc in new_docs.values():
        # the running totals live under the atomic-update 'set' key
        authorities.append(doc['total_authority_f']['set'])
        contribs.append(doc['total_contribs_f']['set'])

    authority_scaler = MinMaxScaler(authorities)
    contribs_scaler = MinMaxScaler(contribs)
    for doc in new_docs.values():
        doc['scaled_authority_f'] = {'set': authority_scaler.scale(doc['total_authority_f']['set'])}
        doc['contribs_scaled_f'] = {'set': contribs_scaler.scale(doc['total_contribs_f']['set'])}
        doc['scaled_authority_contribs_f'] = {'set': doc['scaled_authority_f']['set']
                                                     * doc['contribs_scaled_f']['set']}

    collection.add(new_docs.values())
    collection.commit()
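
Every example on this page leans on MinMaxScaler, which is constructed from the full list of observed values and then maps individual values onto a fixed range. The helper itself isn't shown here; the following is a minimal sketch of the behavior the examples assume (the attribute names and defaults are guesses, not the project's actual implementation):

class MinMaxScaler(object):
    """Minimal sketch: linearly rescales each value into
    [enforced_min, enforced_max] (default [0, 1])."""

    def __init__(self, vals, enforced_min=0, enforced_max=1):
        self.min_val = min(vals)
        self.max_val = max(vals)
        self.enforced_min = enforced_min
        self.enforced_max = enforced_max

    def scale(self, val):
        # guard against a zero range when every input value is identical
        spread = (self.max_val - self.min_val) or 1
        return (((self.enforced_max - self.enforced_min) * (float(val) - self.min_val))
                / spread) + self.enforced_min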
Example #2
def author_centrality(titles_to_authors):
    """
    Computes a centrality score for each author

    :param titles_to_authors: a dict mapping title strings to their associated author dicts
    :type titles_to_authors: dict

    :return: a dict mapping author name to min-max-scaled centrality
    :rtype: dict
    """
    author_graph = digraph()
    author_graph.add_nodes(map(lambda x: u"title_%s" % x, titles_to_authors.keys()))
    author_graph.add_nodes(list(set([u'author_%s' % author[u'user']
                                     for authors in titles_to_authors.values()
                                     for author in authors])))

    for title in titles_to_authors:
        for author in titles_to_authors[title]:
            try:
                author_graph.add_edge((u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass  # duplicate edge; this author already touched this title

    # keep only author nodes and strip the 'author_' prefix from their keys
    centralities = dict(('_'.join(node.split('_')[1:]), score)
                        for node, score in pagerank(author_graph).items()
                        if node.startswith(u'author_'))

    centrality_scaler = MinMaxScaler(centralities.values())

    return dict([(cent_author, centrality_scaler.scale(cent_val))
                 for cent_author, cent_val in centralities.items()])
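
For reference, a hypothetical call showing the input shape author_centrality expects (titles mapped to lists of author dicts keyed by u'user') and the kind of output it returns:

titles_to_authors = {
    u'Main_Page': [{u'user': u'Alice'}, {u'user': u'Bob'}],
    u'Some_Article': [{u'user': u'Alice'}],
}
# author_centrality(titles_to_authors) returns min-max-scaled PageRank
# scores per author, e.g. {u'Alice': 1.0, u'Bob': 0.0} (values illustrative)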
Example #3
def analyze_wikis_globally():
    print "Analyzing Wikis..."
    wiki_collection = solr.existing_collection(solr.global_collection())

    def add_with_retry(docs):
        # Solr can time out under load; retry once after a short pause,
        # then drop the batch rather than crash the whole run
        try:
            wiki_collection.add(docs)
        except ReadTimeout:
            sleep(5)
            try:
                wiki_collection.add(docs)
            except ReadTimeout:
                pass

    wiki_docs = [doc for doc in solr.get_all_docs_by_query(wiki_collection, '*:*')]
    scaler = MinMaxScaler([doc['total_authority_f'] for doc in wiki_docs])
    new_docs = []
    for doc in wiki_docs:
        new_docs.append({'id': doc['id'],
                         'scaled_authority_f': {'set': scaler.scale(doc['total_authority_f'])}})
        if len(new_docs) > 10:
            add_with_retry(new_docs)
            new_docs = []
    if new_docs:
        add_with_retry(new_docs)
    wiki_collection.commit()
Example #4
def get_title_top_authors(wiki_id, api_url, all_titles, all_revisions):
    """
    Builds a dictionary mapping each title to its top authors
    :param wiki_id: the ID of the wiki
    :type wiki_id: int
    :param api_url: the API URL of the wiki
    :type api_url: str
    :param all_titles: a list of all title objects
    :type all_titles: list
    :param all_revisions: a dict keying titles to revisions
    :type all_revisions: dict

    :return: a dict keying title to top authors
    :rtype: dict
    """

    print "Initializing edit distance data"

    all_title_len = len(all_titles)
    group_map = []
    for i in range(0, all_title_len, 25):
        print "%d/%d" % (i, all_title_len)
        # slice size matches the stride so each title is primed exactly once
        group_map.append(group(prime_edit_distance.s(wiki_id, api_url, title_obj,
                                                     all_revisions[title_obj[u'title']])
                               for title_obj in all_titles[i:i+25])())

    print "Waiting on initialization to complete"
    readies = len(filter(lambda x: x.ready(), group_map))
    group_size = len(group_map)
    while not all(x.ready() for x in group_map):
        new_readies = len(filter(lambda x: x.ready(), group_map))
        if new_readies > readies:
            print "%d/%d" % (new_readies, group_size)
        readies = new_readies
        time.sleep(1)

    print "Getting contributing authors for titles"
    title_to_authors = group(get_contributing_authors.s(wiki_id, api_url, title_obj, all_revisions[title_obj[u'title']])
                             for title_obj in all_titles)().get()

    contribs_scaler = MinMaxScaler([author[u'contribs']
                                    for title, authors in title_to_authors
                                    for author in authors])

    print "Scaling top authors"
    scaled_title_top_authors = {}
    for title, authors in title_to_authors:
        new_authors = []
        for author in authors:
            author[u'contribs'] = contribs_scaler.scale(author[u'contribs'])
            new_authors.append(author)
        scaled_title_top_authors[title] = new_authors
    return scaled_title_top_authors
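
Examples #4, #5, and #8 all fan work out to Celery workers via group(...) and then poll for completion. The same pattern, condensed into a standalone sketch; some_task stands in for any registered Celery task and is not part of the code above:

import time
from celery import group

def run_and_wait(some_task, args_list):
    # dispatch one task per argument set and poll, printing coarse
    # progress whenever the completed count changes
    futures = group(some_task.s(args) for args in args_list)()
    done = futures.completed_count()
    while not futures.ready():
        new_done = futures.completed_count()
        if new_done > done:
            print "%d/%d" % (new_done, len(futures.results))
        done = new_done
        time.sleep(1)
    return futures.get()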
Example #5
def get_title_top_authors(wiki_id, api_url, all_titles, all_revisions):
    """
    Builds a dictionary mapping each title to its top authors
    :param wiki_id: the ID of the wiki
    :type wiki_id: int
    :param api_url: the API URL of the wiki
    :type api_url: str
    :param all_titles: a list of all title objects
    :type all_titles: list
    :param all_revisions: a dict keying titles to revisions
    :type all_revisions: dict

    :return: a dict keying title to top authors
    :rtype: dict
    """

    print "Getting contributing authors for titles"
    futures = group(get_contributing_authors.s(wiki_id, api_url, title_obj, all_revisions[title_obj[u'title']])
                    for title_obj in all_titles if title_obj[u'title'] in all_revisions)()
    future_len = len(futures)
    cc = futures.completed_count()
    while not futures.ready():
        new_cc = futures.completed_count()
        if new_cc > cc:
            print "%d/%d" % (new_cc, future_len)
        cc = new_cc
        time.sleep(1)
    title_to_authors = get_with_backoff(futures, [])
    if not title_to_authors:
        print "Failed to get title to authors. Connection failure?"
        return

    contribs_scaler = MinMaxScaler([author[u'contribs']
                                    for title, authors in title_to_authors
                                    for author in authors])

    print "Scaling top authors"
    scaled_title_top_authors = {}
    for title, authors in title_to_authors:
        new_authors = []
        for author in authors:
            author[u'contribs'] = contribs_scaler.scale(author[u'contribs'])
            new_authors.append(author)
        scaled_title_top_authors[title] = new_authors
    return scaled_title_top_authors
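
get_with_backoff, used here and in Example #8, is also external to these snippets. A plausible minimal version, assuming its job is to ride out transient broker or connection errors while collecting group results (the signature is inferred from the call sites):

import time

def get_with_backoff(futures, default, retries=5):
    # retry futures.get() with exponential backoff; fall back to the
    # caller-supplied default once the retries are exhausted
    delay = 1
    for _ in range(retries):
        try:
            return futures.get()
        except Exception:
            time.sleep(delay)
            delay *= 2
    return default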
Example #6
def analyze_pages_globally():
    print "Analyzing All Pages..."
    page_collection = solr.all_pages_collection()

    authorities = []
    for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'):
        authorities.append(page_doc['authority_f'])

    page_scaler = MinMaxScaler(authorities)
    docs = []
    counter = 0
    for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'):
        docs.append({'id': page_doc['id'],
                     'scaled_authority_f': {'set': page_scaler.scale(page_doc['authority_f'])}})
        counter += 1
        if counter % 500 == 0:
            page_collection.add(docs)
            docs = []
    if docs:
        page_collection.add(docs)  # flush the final partial batch
    page_collection.commit()
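
Note the {'set': value} wrappers used throughout these examples: this is Solr's atomic-update syntax, which overwrites only the named field on an existing document rather than reindexing the whole document. Continuing with page_collection from the example above (id and value illustrative):

# only scaled_authority_f changes on the document with id '831'
page_collection.add([{'id': '831', 'scaled_authority_f': {'set': 0.73}}])
page_collection.commit()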
Example #7
def analyze_users_globally():
    print "Analyzing Users..."
    user_collection = solr.existing_collection(solr.user_collection())
    wiki_user_collection = solr.wiki_user_collection()

    id_to_docs = dict()
    for user_doc in solr.get_all_docs_by_query(wiki_user_collection, '*:*'):
        # ids are formatted wiki-id_user-id; keep just the user id
        doc_id = user_doc['id'].split('_').pop()
        if doc_id not in id_to_docs:
            id_to_docs[doc_id] = dict(id=doc_id,
                                      attr_entities={'set': []},
                                      name_s={'set': user_doc['name_s']},
                                      name_txt_en={'set': user_doc['name_txt_en']},
                                      wikis_is={'set': []},
                                      attr_wikis={'set': []},
                                      authorities_fs={'set': []},
                                      total_authority_f={'set': 0},
                                      scaled_authority_f={'set': 0})

        try:
            id_to_docs[doc_id]['attr_entities']['set'].extend(user_doc['attr_entities'])
            id_to_docs[doc_id]['wikis_is']['set'].append(user_doc['wiki_id_i'])
            id_to_docs[doc_id]['attr_wikis']['set'].append(user_doc['wiki_name_txt'])
            id_to_docs[doc_id]['authorities_fs']['set'].append(user_doc['total_page_authority_f'])
        except KeyError:
            pass  # skip wiki-user docs missing any of the expected fields

    id_to_total_authorities = dict([(uid, sum(doc['authorities_fs']['set'])) for uid, doc in id_to_docs.items()])
    user_scaler = MinMaxScaler(id_to_total_authorities.values())
    for uid, total_authority in id_to_total_authorities.items():
        id_to_docs[uid]['total_authority_f']['set'] = total_authority
        id_to_docs[uid]['scaled_authority_f']['set'] = user_scaler.scale(total_authority)

    user_collection.add(id_to_docs.values())
    user_collection.commit()
Example #8
def ingest_data(wiki_id):
    """
    Create Solr documents for a given wiki ID

    :param wiki_id: the ID of the wiki
    :type wiki_id: int|str
    :return: None, or False when no page-to-entities data exists
    """
    # make sure the all-pages and all-user-pages collections exist
    solr.existing_collection(solr.all_pages_collection())
    solr.existing_collection(solr.all_user_pages_collection())

    resp = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details', params={u'ids': wiki_id})
    items = resp.json()['items']
    # the API keys items by the string form of the wiki id
    if str(wiki_id) not in items:
        print u"Wiki doesn't exist?"
        return

    api_data = items[str(wiki_id)]
    wiki_data = {
        'id': api_data['id'],
        'wam_f': {'set': api_data['wam_score']},
        'title_s': {'set': api_data['title']},
        'attr_title': {'set': api_data['title']},
        'attr_desc': {'set': api_data['desc']}
    }
    for key, value in api_data['stats'].items():
        wiki_data['%s_i' % key] = {'set': value}

    wiki_api_data = requests.get(u'%swikia.php' % (api_data[u'url']),
                                 params={u'method': u'getForWiki',
                                         u'service': u'CrossWikiCore',
                                         u'controller': u'WikiaSearchIndexerController'}).json()[u'contents']

    wiki_data[u'hub_s'] = wiki_api_data[u'hub_s']
    
    # mirror hub_s onto api_data so downstream tasks read it from one place
    api_data[u'hub_s'] = wiki_api_data[u'hub_s']

    collection = solr.existing_collection(solr.collection_for_wiki(wiki_id))

    use_caching(is_read_only=True, shouldnt_compute=True)

    wpe = WikiPageToEntitiesService().get_value(wiki_id)
    if not wpe:
        print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", wiki_id
        return False

    documents = []

    grouped_futures = []

    pages_to_authority = WikiAuthorityService().get_value(str(wiki_data['id']))
    for counter, (doc_id, entity_data) in enumerate(wpe.items()):
        documents.append({
            'id': doc_id,
            'attr_entities': {'set': list(set(entity_data.get(u'redirects', {}).values()
                                              + entity_data.get(u'titles', [])))},
            'type_s': {'set': 'Page'},
            'authority_f': {'set': pages_to_authority.get(doc_id, 0)},
            'hub_s': wiki_api_data['hub_s']
        })

        if counter != 0 and counter % 1500 == 0:
            grouped_futures.append(
                group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
            )

            documents = []

    grouped_futures.append(
        group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
    )

    # block on completion of all grouped futures
    completed = 0
    total = 0
    while len(filter(lambda x: not x.ready(), grouped_futures)) > 0:
        new_completed = 0
        new_total = 0
        for future in grouped_futures:
            new_completed += future.completed_count()
            new_total += len(future.results)
        if completed != new_completed or total != new_total:
            completed = new_completed
            total = new_total
            print "Grouped Tasks: (%d/%d)" % (completed, total)
        sleep(2)

    all_user_tuples = []
    for future in grouped_futures:
        result = get_with_backoff(future, [])
        for user_tuples in result:
            all_user_tuples.extend(user_tuples)

    all_user_tuples = list(set(all_user_tuples))
    if not all_user_tuples:
        print "Empty user tuples, bailing"
        return

    # assign the unique user ids to the first variable, and the unique usernames to the second
    all_user_ids, all_users = zip(*all_user_tuples)

    collection.commit()
    solr.all_pages_collection().commit()
    solr.all_user_pages_collection().commit()

    wiki_data['attr_entities'] = {'set': []}

    for count, entities in WikiEntitiesService().get_value(str(wiki_id)).items():
        for entity in entities:
            # weight each entity by its count; count arrives as a string, so cast it
            wiki_data['attr_entities']['set'].extend([entity] * int(count))

    wiki_data['user_ids_is'] = {'set': all_user_ids}
    wiki_data['attr_users'] = {'set': all_users}
    wiki_data['total_authority_f'] = {'set': sum(pages_to_authority.values())}
    wiki_data['authorities_fs'] = {'set': pages_to_authority.values()}

    wiki_collection = solr.existing_collection(solr.global_collection())
    wiki_collection.add([wiki_data])
    wiki_collection.commit()
    print "Committed wiki data"

    print "Retrieving user docs..."
    futures = group(build_wiki_user_doc.s(api_data, user_tuple) for user_tuple in all_user_tuples)()
    future_result_len = len(futures.results)
    while not futures.ready():
        print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)

    user_docs = get_with_backoff(futures, [])
    if not user_docs:
        print "User docs was empty. Possibly connection problems."
        return

    authority_scaler = MinMaxScaler([doc['total_page_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_page_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['scaled_contribs_f'] = {'set': scaled_contribs}
        doc['scaled_contribs_authority_f'] = {'set': scaled_authority * scaled_contribs}

    wiki_user_collection = solr.existing_collection(solr.wiki_user_collection())
    wiki_user_collection.add(user_docs)
    wiki_user_collection.commit()

    print "Analyzing topics"
    futures = group(get_wiki_topic_doc.s(wiki_data['id'], topic)
                    for topic in list(set(wiki_data['attr_entities']['set'])))()
    future_result_len = len(futures.results)
    counter = 0
    while not futures.ready():
        if counter % 5 == 0:
            print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
        counter += 1
    topic_docs = get_with_backoff(futures, [])
    if not topic_docs:
        print "No topics, probably a connection error"
        return

    collection.add(topic_docs)
    collection.commit()

    topic_collection = solr.existing_collection(solr.all_topics_collection())
    topic_collection.add(topic_docs)
    topic_collection.commit()