Пример #1
0
def aggregate_global_topic(topic):
    """
    Builds the global (cross-wiki) solr document for a topic

    Every document the all-topics collection returns for the query is folded
    into one summary: authorities are summed and averaged, and the user ids,
    user names and wiki ids found on those documents are collected.

    :param topic: the topic; used verbatim as the query and as the document id
    :type topic: str

    :return: an atomic-update document of type ``GlobalTopic``
    :rtype: dict
    """
    collection = solr.all_topics_collection()

    authorities = []
    user_ids = set()
    user_names = set()
    wiki_ids = []

    for doc in solr.get_all_docs_by_query(collection, topic):
        authorities.append(doc['total_authority_f'])
        # NOTE(review): the original only looked at 'user_id_is'; the plural
        # 'user_ids_is' is the key this function itself emits, so accept either.
        for key in ('user_ids_is', 'user_id_is'):
            if key in doc:
                user_ids.update(doc[key])
                break
        if 'user_names_ss' in doc:
            user_names.update(doc['user_names_ss'])
        if 'wiki_id_i' in doc:
            wiki_ids.append(doc['wiki_id_i'])

    total_authority = sum(authorities)

    # Mean over the matching documents. The previous code divided the total by
    # itself, which produced 1.0 for every topic with non-zero authority.
    avg_authority = 0
    if authorities:
        avg_authority = total_authority / float(len(authorities))

    return {
        'id': topic,
        'topic_s': {'set': topic},
        'wikis_is': {'set': wiki_ids},
        'user_ids_is': {'set': list(user_ids)},
        'user_names_ss': {'set': list(user_names)},
        'total_authority_f': {'set': total_authority},
        'avg_authority_f': {'set': avg_authority},
        'type_s': {'set': 'GlobalTopic'}
    }
Пример #2
0
def _add_docs_with_retry(collection, docs, delay=5):
    """
    Adds docs to a collection, retrying once after a pause on a read timeout

    :param collection: the collection to add to
    :param docs: the documents to add; an empty list is skipped
    :type docs: list
    :param delay: seconds to sleep before the single retry
    :type delay: int
    """
    if not docs:
        return
    try:
        collection.add(docs)
    except ReadTimeout:
        sleep(delay)
        try:
            collection.add(docs)
        except ReadTimeout:
            # Still best-effort, but no longer silent about the dropped batch
            print("Giving up on %d docs after a second read timeout" % len(docs))


def analyze_wikis_globally():
    """
    Writes a min-max scaled authority onto every wiki document

    Reads all documents from the global collection, scales each
    ``total_authority_f`` against the whole set, and writes the result back
    as ``scaled_authority_f`` in small batches before committing.
    """
    print("Analyzing Wikis...")
    wiki_collection = solr.existing_collection(solr.global_collection())

    wiki_docs = list(solr.get_all_docs_by_query(wiki_collection, '*:*'))
    scaler = MinMaxScaler([doc['total_authority_f'] for doc in wiki_docs])

    batch = []
    for doc in wiki_docs:
        batch.append({'id': doc['id'],
                      'scaled_authority_f': {'set': scaler.scale(doc['total_authority_f'])}})
        if len(batch) > 10:
            _add_docs_with_retry(wiki_collection, batch)
            batch = []
    # Flush whatever is left over; the helper skips an empty batch
    _add_docs_with_retry(wiki_collection, batch)
    wiki_collection.commit()
Пример #3
0
def analyze_all_user_pages_globally():
    """
    Aggregates per-user authority and contributions across all user pages

    Documents in the all-user-pages collection are grouped by the last
    ``_``-separated segment of their id. For each group the authority and
    contribution totals are summed, min-max scaled across all groups, and
    written back together with the product of the two scaled values.
    """
    collection = solr.all_user_pages_collection()
    new_docs = {}
    # contribs_f has to be requested as well: it is read below, and was
    # missing from the field list.
    for doc in solr.get_all_docs_by_query(collection, '*:*', fields="id,doc_authority_f,contribs_f"):
        user_id = doc['id'].split('_').pop()
        if user_id in new_docs:
            new_docs[user_id]['total_authority_f']['set'] += doc['doc_authority_f']
            new_docs[user_id]['total_contribs_f']['set'] += doc['contribs_f']
        else:
            new_docs[user_id] = {
                'id': user_id,
                'total_authority_f': {'set': doc['doc_authority_f']},
                'total_contribs_f': {'set': doc['contribs_f']}
            }

    user_docs = list(new_docs.values())

    # Scale the numeric totals, not the {'set': ...} wrappers around them
    authority_scaler = MinMaxScaler([doc['total_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['contribs_scaled_f'] = {'set': scaled_contribs}
        doc['scaled_authority_contribs_f'] = {'set': scaled_authority * scaled_contribs}

    collection.add(user_docs)
    collection.commit()
Пример #4
0
    def all_wikis(args):
        """
        Accesses all wikis from the global collection, highest scaled authority first

        :param args: not used by this method

        :return: whatever ``solr.get_all_docs_by_query`` produces for the match-all query
        """
        # Solr's sort parameter takes "<field> <direction>"; the direction was
        # missing. NOTE(review): assumes the helper passes sort through as-is.
        return solr.get_all_docs_by_query(solr.global_collection(), '*:*', sort='scaled_authority_f desc')
Пример #5
0
    def get_all_pages(self):
        """
        Fetches every page document for this wiki

        Queries the collection for ``self.wiki_id`` for ``type_s:Page`` with
        ``sort='authority_f'``.

        :return: whatever ``solr.get_all_docs_by_query`` produces for that query
        """
        wiki_collection = solr.collection_for_wiki(self.wiki_id)
        page_query = 'type_s:Page'
        return solr.get_all_docs_by_query(wiki_collection, page_query, sort='authority_f')
Пример #6
0
def analyze_pages_globally(batch_size=500):
    """
    Writes a min-max scaled authority onto every page document

    Makes two passes over the all-pages collection: the first collects every
    ``authority_f`` to fit the scaler, the second writes ``scaled_authority_f``
    back in batches before committing.

    :param batch_size: number of documents sent to solr per add request
    :type batch_size: int
    """
    print("Analyzing All Pages...")
    page_collection = solr.all_pages_collection()

    authorities = [page_doc['authority_f']
                   for page_doc in solr.get_all_docs_by_query(page_collection, '*:*')]
    page_scaler = MinMaxScaler(authorities)

    docs = []
    for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'):
        docs.append({'id': page_doc['id'],
                     'scaled_authority_f': {'set': page_scaler.scale(page_doc['authority_f'])}})
        # The old test `counter % 500` was true for every count that was NOT a
        # multiple of 500, so it flushed on almost every document.
        if len(docs) >= batch_size:
            page_collection.add(docs)
            docs = []
    # Flush the final partial batch, which was previously dropped
    if docs:
        page_collection.add(docs)
    page_collection.commit()
Пример #7
0
    def get_row(self):
        """
        Returns the ``attr_entities`` value of the first user document matching this user's name

        NOTE(review): despite the method name, only the ``attr_entities`` field
        is returned rather than the whole document -- confirm callers expect that.

        :return: the ``attr_entities`` value of the first match, or None when nothing matches
        """
        # Escape backslashes and double quotes so a name containing them
        # cannot terminate the quoted phrase early.
        escaped_name = self.user_name.replace('\\', '\\\\').replace('"', '\\"')
        for doc in solr.get_all_docs_by_query(solr.user_collection(), 'name_txt_en:"%s"' % escaped_name):
            return doc['attr_entities']
        return None
Пример #8
0
    def get_all_authors(self):
        """
        Fetches all wiki-user documents belonging to this wiki

        :return: whatever ``solr.get_all_docs_by_query`` produces for ``wiki_id_i:<self.wiki_id>``
        """
        wiki_users = solr.wiki_user_collection()
        author_query = 'wiki_id_i:%s' % self.wiki_id
        return solr.get_all_docs_by_query(wiki_users, author_query)
Пример #9
0
    def get_all_users(self):
        """
        Fetches all wiki-user documents belonging to this wiki, restricted to ``UserModel.fields``

        :return: whatever ``solr.get_all_docs_by_query`` produces for ``wiki_id_i:<self.wiki_id>``
        """
        wiki_users = solr.wiki_user_collection()
        user_query = 'wiki_id_i:%s' % self.wiki_id
        requested_fields = ','.join(UserModel.fields)
        return solr.get_all_docs_by_query(wiki_users, user_query, fields=requested_fields)
Пример #10
0
    def get_row(self):
        """
        Looks up this page in the all-pages collection

        The document id queried is ``<page_id>_<wiki_id>``.

        :return: the first matching document, or None when nothing matches
        """
        doc_key = '_'.join([str(self.page_id), str(self.wiki_id)])
        matches = solr.get_all_docs_by_query(solr.all_pages_collection(), 'id:' + doc_key)
        # Only the first hit is of interest
        return next(iter(matches), None)
Пример #11
0
    def get_row(self):
        """
        Looks up this wiki's document in the global collection by id

        :return: the first matching document, or None when nothing matches
        """
        wiki_query = 'id:%s' % str(self.wiki_id)
        matches = solr.get_all_docs_by_query(solr.global_collection(), wiki_query)
        # Only the first hit is of interest
        return next(iter(matches), None)
Пример #12
0
    def all_wikis():
        """
        Accesses all wikis from the global collection, sorted by ``scaled_authority_f`` descending

        Only the fields listed in ``WikiModel.fields`` are requested.

        :return: whatever ``solr.get_all_docs_by_query`` produces for the match-all query
        """
        global_coll = solr.global_collection()
        wiki_fields = ','.join(WikiModel.fields)
        return solr.get_all_docs_by_query(global_coll, '*:*', sort='scaled_authority_f desc', fields=wiki_fields)
Пример #13
0
    def get_row(self):
        """
        Looks up the document with id ``self.doc_id`` in the all-pages collection

        Only the fields listed in ``self.fields`` are requested.

        :return: the first matching document, or None when nothing matches
        """
        pages = solr.all_pages_collection()
        id_query = 'id:%s' % self.doc_id
        wanted_fields = ','.join(self.fields)
        matches = solr.get_all_docs_by_query(pages, id_query, fields=wanted_fields)
        # Only the first hit is of interest
        return next(iter(matches), None)
Пример #14
0
    def get_row(self):
        """
        Returns the first document the user collection yields for a match-all query

        The requested fields are ``self.fields`` plus ``attr_entities``.

        NOTE(review): the query is ``*:*`` and does not reference this instance,
        so the result is not tied to a particular user -- confirm this is intended.

        :return: the first document returned, or None when the collection yields nothing
        """
        users = solr.user_collection()
        wanted_fields = ','.join(self.fields+['attr_entities'])
        matches = solr.get_all_docs_by_query(users, "*:*", fields=wanted_fields)
        # Only the first hit is of interest
        return next(iter(matches), None)
Пример #15
0
 def run(self):
     """
     Drops all indices

     Each wiki listed in the global collection has its own collection dropped
     first, then the global collection, then the shared collections.
     """
     global_coll = solr.global_collection()
     wiki_docs = solr.get_all_docs_by_query(global_coll, '*:*', fields='id')
     for wiki_doc in wiki_docs:
         solr.collection_for_wiki(wiki_doc['id']).drop()
     global_coll.drop()

     # Shared collections, dropped in the same order as before
     shared_collection_getters = (
         solr.all_pages_collection,
         solr.all_topics_collection,
         solr.all_user_pages_collection,
         solr.wiki_user_collection,
         solr.user_collection,
     )
     for get_collection in shared_collection_getters:
         get_collection().drop()
Пример #16
0
def build_wiki_user_doc(wiki_data, user_tuple):
    """
    Retrieves data from wiki collection to generate a user document at the wiki level

    All ``PageUser`` documents for the user in the wiki's own collection are
    folded into one document: page ids, entities, per-page authorities and
    contributions, plus their totals.

    :param wiki_data: a dict representing the data we've retrieved from the Wikia API;
                      the keys ``id``, ``title`` and ``hub_s`` are read
    :type wiki_data: dict
    :param user_tuple: a tuple containing user id and user name
    :type user_tuple: tuple

    :return: the document we want to add to solr; we will commit in bulk instead of blasting the network
    :rtype: dict
    """

    user_id, user_name = user_tuple
    collection = solr.collection_for_wiki(str(wiki_data['id']))
    user_doc = {
        'id': '%d_%d' % (wiki_data['id'], user_id),
        'user_id_i': user_id,
        'wiki_id_i': wiki_data['id'],
        'wiki_name_txt': wiki_data['title'],
        'name_s': {'set': user_name},
        'type_s': {'set': 'WikiUser'},
        'name_txt_en': {'set': user_name},
        'hub_s': wiki_data['hub_s']
    }
    doc_ids = []
    entities = []
    authorities = []
    contribs = []
    for doc in solr.get_all_docs_by_query(collection, 'type_s:PageUser AND user_id_i:%d' % user_id):
        doc_ids.append(doc['doc_id_s'])
        if 'attr_entities' in doc:
            # extend() instead of map(append, ...): map used for side effects
            # silently does nothing wherever map is lazy.
            entities.extend(doc['attr_entities'])
        if 'user_page_authority_f' in doc:
            authorities.append(doc['user_page_authority_f'])
        if 'contribs_f' in doc:
            contribs.append(doc['contribs_f'])

    total_authorities = sum(authorities)
    total_contribs = sum(contribs)

    user_doc['doc_ids_ss'] = {'set': doc_ids}
    user_doc['attr_entities'] = {'set': entities}
    user_doc['total_page_authority_f'] = {'set': total_authorities}
    user_doc['total_contribs_f'] = {'set': total_contribs}
    user_doc['page_authority_fs'] = {'set': authorities}
    user_doc['contribs_fs'] = {'set': contribs}
    user_doc['total_contribs_authority_f'] = {'set': total_authorities * total_contribs}

    return user_doc
Пример #17
0
    def get_topics_for_wiki(self, wiki_id, limit=10, offset=0, **kwargs):
        """
        Gets the topics (``attr_entities``) for this user on this wiki

        :param wiki_id: the wiki id
        :type wiki_id: str|int
        :param limit: limit; not applied by this implementation
        :type limit: int
        :param offset: offset; not applied by this implementation
        :type offset: int

        :return: the ``attr_entities`` value of the first matching wiki-user
                 document, or None when nothing matches
        """
        # The composite "<wiki id>_<user id>" value is a document id, so it is
        # matched against `id`; `user_id_i` is an integer field and cannot hold it.
        # NOTE(review): assumes wiki-user ids follow that "<wiki>_<user>" layout.
        wiki_user_id = '%s_%s' % (wiki_id, self.user_id)
        for doc in solr.get_all_docs_by_query(solr.wiki_user_collection(),
                                              'id:%s' % wiki_user_id,
                                              fields=','.join(TopicModel.fields+['attr_entities'])):
            return doc['attr_entities']
        return None
Пример #18
0
    def get_topics_for_wiki(self, wiki_id, limit=10, offset=0, for_api=False):
        """
        Gets the topics (``attr_entities``) for this user on this wiki

        :param wiki_id: the wiki id
        :type wiki_id: str|int
        :param limit: limit; not applied by this implementation
        :type limit: int
        :param offset: offset; not applied by this implementation
        :type offset: int
        :param for_api: if it's for the api, we fix the naming; not applied by this implementation
        :type for_api: bool

        :return: the ``attr_entities`` value of the first matching wiki-user
                 document, or None when nothing matches
        """
        # Escape backslashes and double quotes so a name containing them
        # cannot terminate the quoted phrase early.
        escaped_name = self.user_name.replace('\\', '\\\\').replace('"', '\\"')
        # Restrict to the requested wiki; the name alone matched the user's
        # documents on every wiki and returned whichever came first.
        # NOTE(review): assumes wiki-user docs carry the wiki id in `wiki_id_i`.
        query = 'wiki_id_i:%s AND name_txt_en:"%s"' % (wiki_id, escaped_name)
        for doc in solr.get_all_docs_by_query(solr.wiki_user_collection(), query):
            return doc['attr_entities']
        return None
Пример #19
0
 def run(self):
     """
     Optimizes all indices, printing a label before each one

     The global collection goes first, then the collection of every wiki
     listed in it, then the shared collections.
     """
     global_coll = solr.global_collection()
     print('global')
     global_coll.optimize()

     for wiki_doc in solr.get_all_docs_by_query(global_coll, '*:*', fields='id'):
         wiki_id = wiki_doc['id']
         print(wiki_id)
         solr.collection_for_wiki(wiki_id).optimize()

     # (label, getter) pairs for the shared collections, in the original order
     shared_collections = (
         ('all pages', solr.all_pages_collection),
         ('all topics', solr.all_topics_collection),
         ('all user pages', solr.all_user_pages_collection),
         ('wiki user', solr.wiki_user_collection),
         ('user', solr.user_collection),
     )
     for label, get_collection in shared_collections:
         print(label)
         get_collection().optimize()
Пример #20
0
def get_wiki_topic_doc(wiki_id, topic):
    """
    Create a solr doc for a given topic based on all matching pages for a wiki

    :param wiki_id: the ID of the wiki
    :type wiki_id: str
    :param topic: the topic we're creating a document for
    :type topic: str

    :return: the solr document we want to add; the average authority is 0
             when no matching page carries an ``authority_f``
    :rtype: dict
    """
    collection = solr.collection_for_wiki(wiki_id)
    authorities = []
    user_ids = set()
    user_names = set()

    for doc in solr.get_all_docs_by_query(collection, 'type_s:Page AND attr_entities:"%s"' % topic):
        # The original tested for 'user_id_is' but then read 'user_ids_is', so
        # it either raised KeyError or collected nothing. Accept whichever key
        # the page document actually has.
        for key in ('user_ids_is', 'user_id_is'):
            if key in doc:
                user_ids.update(doc[key])
                break
        if 'attr_users' in doc:
            user_names.update(doc['attr_users'])
        if 'authority_f' in doc:
            authorities.append(doc['authority_f'])

    total_authority = sum(authorities)
    # Guard the mean against an empty result set (was a ZeroDivisionError)
    avg_authority = total_authority / float(len(authorities)) if authorities else 0
    return {
        'id': '%s_%s' % (wiki_id, topic),
        'wiki_id_i': wiki_id,
        'topic_s': topic,
        'topic_txt_en': topic,
        'type_s': {'set': 'Topic'},
        'user_ids_is': {'set': list(user_ids)},
        'user_names_ss': {'set': list(user_names)},
        'total_authority_f': {'set': total_authority},
        'avg_authority_f': {'set': avg_authority}
    }
Пример #21
0
def analyze_users_globally():
    """
    Builds one global document per user from the per-wiki user documents

    Wiki-user documents are grouped by the last ``_``-separated segment of
    their id. Entities, wiki ids, wiki names and per-wiki authorities are
    accumulated per user; the authorities are then summed and min-max scaled
    across all users before the documents are added and committed.
    """
    print("Analyzing Users...")
    user_collection = solr.existing_collection(solr.user_collection())
    wiki_user_collection = solr.wiki_user_collection()

    id_to_docs = dict()
    for user_doc in solr.get_all_docs_by_query(wiki_user_collection, '*:*'):
        # these are gonna be wiki-id_user-id
        doc_id = user_doc['id'].split('_').pop()
        if doc_id not in id_to_docs:
            id_to_docs[doc_id] = dict(id=doc_id,
                                      attr_entities={'set': []},
                                      name_s={'set': user_doc['name_s']},
                                      name_txt_en={'set': user_doc['name_txt_en']},
                                      wikis_is={'set': []},
                                      attr_wikis={'set': []},
                                      authorities_fs={'set': []},
                                      total_authority_f={'set': 0},
                                      scaled_authority_f={'set': 0})

        global_doc = id_to_docs[doc_id]
        try:
            # extend() instead of map(append, ...): map used for side effects
            # silently does nothing wherever map is lazy.
            global_doc['attr_entities']['set'].extend(user_doc['attr_entities'])
            global_doc['wikis_is']['set'].append(user_doc['wiki_id_i'])
            global_doc['attr_wikis']['set'].append(user_doc['wiki_name_txt'])
            global_doc['authorities_fs']['set'].append(user_doc['total_page_authority_f'])
        except KeyError:
            # Deliberately best-effort: the first missing field skips the
            # remaining appends for this wiki-user document.
            pass

    id_to_total_authorities = {uid: sum(doc['authorities_fs']['set'])
                               for uid, doc in id_to_docs.items()}
    user_scaler = MinMaxScaler(list(id_to_total_authorities.values()))
    for uid, total_authority in id_to_total_authorities.items():
        id_to_docs[uid]['total_authority_f']['set'] = total_authority
        id_to_docs[uid]['scaled_authority_f']['set'] = user_scaler.scale(total_authority)

    user_collection.add(list(id_to_docs.values()))
    user_collection.commit()