Пример #1
0
def get_data(wid):
    """Build the combined feature strings for every document of a wiki.

    For each document id returned by ``WikiToPageHeadsService`` the result
    holds the preprocessed entity titles, redirect keys, redirect values and
    de-duplicated heads, followed by the entries the S3 feature file
    ``feature-data/page-<wid>.json`` lists for that document. The S3 entries
    are appended as-is; they are not passed through ``preprocess``.

    Documents that have entities but no heads are not part of the result,
    because only the ids present in the heads mapping are iterated.

    :param wid: wiki id; handed to both services and interpolated into the
        S3 key name
    :return: ``(doc_id, features)`` pairs as produced by ``dict.items()``
    """
    log(wid)
    use_caching(shouldnt_compute=True)
    # TODO: should be CombinedEntitiesService
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        log(wid, "no heads")
    if doc_ids_to_entities == {}:
        log(wid, "no entities")

    # Assuming `bucket` is a boto S3 bucket (TODO confirm): get_key() returns
    # None for a missing object instead of raising, so chaining
    # .get_contents_as_string() directly would crash with AttributeError.
    # Treat a missing feature file like missing heads/entities: log it and
    # carry on with no extra features.
    s3_key = bucket.get_key('feature-data/page-%s.json' % wid)
    if s3_key is None:
        log(wid, "no feature data")
        from_s3 = {}
    else:
        from_s3 = json.loads(s3_key.get_contents_as_string())

    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {
            'titles': [],
            'redirects': {}
        })
        redirects = entity_response['redirects']
        # list() keeps the concatenation valid even where keys()/values()
        # return views rather than lists.
        raw_features = (entity_response['titles'] +
                        list(redirects.keys()) +
                        list(redirects.values()) +
                        list(set(doc_ids_to_heads.get(doc_id, []))))
        doc_ids_combined[doc_id] = (
            [preprocess(feature) for feature in raw_features] +
            from_s3.get(doc_id, []))
    return doc_ids_combined.items()
def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {"titles": [], "redirects": {}})
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response["titles"]
            + entity_response["redirects"].keys()
            + entity_response["redirects"].values()
            + list(set(doc_ids_to_heads.get(doc_id, []))),
        )
    return doc_ids_combined.items()
def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    #should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {
            'titles': [],
            'redirects': {}
        })
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response['titles'] + entity_response['redirects'].keys() +
            entity_response['redirects'].values() +
            list(set(doc_ids_to_heads.get(doc_id, []))))
    return doc_ids_combined.items()
Пример #4
0
def get_data(wid):
    """Return, per document of a wiki, its preprocessed entity strings plus S3 features.

    Each document id from ``WikiToPageHeadsService`` maps to the
    preprocessed titles, redirect keys, redirect values and unique heads of
    that document, with the list stored for it in the S3 object
    ``feature-data/page-<wid>.json`` appended unprocessed at the end.

    Only document ids present in the heads mapping are emitted.

    :param wid: wiki id used for both service lookups and the S3 key name
    :return: ``(doc_id, features)`` pairs as produced by ``dict.items()``
    """
    log(wid)
    use_caching(shouldnt_compute=True)
    # TODO: should be CombinedEntitiesService
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        log(wid, "no heads")
    if doc_ids_to_entities == {}:
        log(wid, "no entities")

    # Assuming `bucket` is a boto S3 bucket (TODO confirm): a missing object
    # makes get_key() return None, and calling .get_contents_as_string() on
    # that raises AttributeError. Fall back to "no extra features", in line
    # with how missing heads/entities are merely logged above.
    feature_key = bucket.get_key('feature-data/page-%s.json' % wid)
    if feature_key is None:
        log(wid, "no feature data")
        from_s3 = {}
    else:
        from_s3 = json.loads(feature_key.get_contents_as_string())

    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(
            doc_id, {'titles': [], 'redirects': {}})
        redirects = entity_response['redirects']
        # list() around keys()/values() keeps the "+" valid whether they
        # return lists or views.
        names = (entity_response['titles'] +
                 list(redirects.keys()) +
                 list(redirects.values()) +
                 list(set(doc_ids_to_heads.get(doc_id, []))))
        preprocessed = [preprocess(name) for name in names]
        doc_ids_combined[doc_id] = preprocessed + from_s3.get(doc_id, [])
    return doc_ids_combined.items()
def insert_contrib_data(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting page and author and contrib data for wiki", args.wid
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u'_')

            entity_data = wpe.get(doc_id, {})
            entity_list = filter(
                lambda x: x,
                map(
                    lambda x: x.strip(),
                    map(
                        my_escape,
                        list(
                            set(
                                entity_data.get(u'redirects', {}).values() +
                                entity_data.get(u'titles', []))))))

            cursor.execute(u"""
            SELECT topic_id FROM topics WHERE name IN ("%s")
            """ % (u'", "'.join(entity_list)))
            topic_ids = list(set([result[0] for result in cursor.fetchall()]))

            for topic_id in topic_ids:
                sql = u"""
                INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id) VALUES (%s, %s, %s)
                """ % (article_id, wiki_id, topic_id)
                cursor.execute(sql)
                db.commit()

            cursor = db.cursor()

            for contribs in PageAuthorityService().get_value(doc_id, []):
                cursor.execute(u"""
                INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s")
                """ % (contribs[u'userid'], my_escape(contribs[u'user'])))
                db.commit()

                cursor.execute(u"""
                INSERT INTO articles_users (article_id, wiki_id, user_id, contribs) VALUES (%s, %s, %d, %s)
                """ % (article_id, wiki_id, contribs[u'userid'],
                       contribs[u'contribs']))
                db.commit()

                local_authority = contribs[
                    u'contribs'] * authority_dict_fixed.get(doc_id, 0)
                for topic_id in topic_ids:
                    cursor.execute(u"""
                    INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s)
                    ON DUPLICATE KEY UPDATE local_authority = local_authority + %s
                    """ % (contribs[u'userid'], topic_id, local_authority,
                           local_authority))
                    db.commit()
        db.commit()
        print u"Done with", args.wid
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
Пример #6
0
def insert_contrib_data(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting page and author and contrib data for wiki", args.wid
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u"_")

            entity_data = wpe.get(doc_id, {})
            entity_list = filter(
                lambda x: x,
                map(
                    lambda x: x.strip(),
                    map(
                        my_escape,
                        list(set(entity_data.get(u"redirects", {}).values() + entity_data.get(u"titles", []))),
                    ),
                ),
            )

            cursor.execute(
                u"""
            SELECT topic_id FROM topics WHERE name IN ("%s")
            """
                % (u'", "'.join(entity_list))
            )
            topic_ids = list(set([result[0] for result in cursor.fetchall()]))

            for topic_id in topic_ids:
                sql = u"""
                INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id) VALUES (%s, %s, %s)
                """ % (
                    article_id,
                    wiki_id,
                    topic_id,
                )
                cursor.execute(sql)
                db.commit()

            cursor = db.cursor()

            for contribs in PageAuthorityService().get_value(doc_id, []):
                cursor.execute(
                    u"""
                INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s")
                """
                    % (contribs[u"userid"], my_escape(contribs[u"user"]))
                )
                db.commit()

                cursor.execute(
                    u"""
                INSERT INTO articles_users (article_id, wiki_id, user_id, contribs) VALUES (%s, %s, %d, %s)
                """
                    % (article_id, wiki_id, contribs[u"userid"], contribs[u"contribs"])
                )
                db.commit()

                local_authority = contribs[u"contribs"] * authority_dict_fixed.get(doc_id, 0)
                for topic_id in topic_ids:
                    cursor.execute(
                        u"""
                    INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s)
                    ON DUPLICATE KEY UPDATE local_authority = local_authority + %s
                    """
                        % (contribs[u"userid"], topic_id, local_authority, local_authority)
                    )
                    db.commit()
        db.commit()
        print u"Done with", args.wid
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False