Example #1
def get_data(wid):
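    # Merge each page's preprocessed entity titles, redirects, and heads with
    # the per-page feature data stored in S3, keyed by doc ID.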
    log(wid)
    use_caching(shouldnt_compute=True)
    #should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        log(wid, "no heads")
    if doc_ids_to_entities == {}:
        log(wid, "no entities")
    from_s3 = json.loads(
        bucket.get_key('feature-data/page-%s.json' %
                       wid).get_contents_as_string())
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {
            'titles': [],
            'redirects': {}
        })
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response['titles'] +
            entity_response['redirects'].keys() +
            entity_response['redirects'].values() +
            list(set(doc_ids_to_heads.get(doc_id, [])))
        ) + from_s3.get(doc_id, [])
    return doc_ids_combined.items()
def insert_entities(args):
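    # Insert the entity names found on each page of the wiki into the
    # `topics` table, batching the INSERT IGNORE statements 50 names at a time.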
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False

        print u"Priming entity data on", args.wid
        for page, entity_data in wpe.items():
            entity_list = map(
                my_escape,
                list(
                    set(
                        entity_data.get(u'redirects', {}).values() +
                        entity_data.get(u'titles'))))
            for i in range(0, len(entity_list), 50):
                cursor.execute(u"""
                INSERT IGNORE INTO topics (name) VALUES ("%s")
                """ % u'"), ("'.join(entity_list[i:i + 50]))
                db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_wiki_ids(args):
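    # Fetch the wiki's details from the Wikia API and insert a single row
    # (ID, WAM score, title, URL) into the `wikis` table.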
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        print u"Inserting wiki data for", args.wid

        response = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details',
                                params={u'ids': args.wid})

        items = response.json().get(u'items')
        if not items:
            return False

        wiki_data = items[args.wid]

        cursor.execute(u"""
        INSERT INTO wikis (wiki_id, wam_score, title, url) VALUES (%s, %s, "%s", "%s")
        """ % (args.wid, str(wiki_data[u'wam_score']),
               my_escape(wiki_data[u'title']), wiki_data[u'url']))
        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_pages(args):
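    # Insert one `articles` row per page, recording its local authority score.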
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False

        print u"Inserting authority data for pages on wiki", args.wid

        dbargs = []
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u'_')
            dbargs.append((doc_id, article_id, wiki_id,
                           str(authority_dict_fixed[doc_id])))

        cursor.execute(
            u"""
            INSERT INTO articles (doc_id, article_id, wiki_id, local_authority) VALUES %s
            """ %
            u", ".join([u"""("%s", %s, %s, %s)""" % arg for arg in dbargs]))

        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_wiki_ids(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        print u"Inserting wiki data for", args.wid

        response = requests.get(u"http://www.wikia.com/api/v1/Wikis/Details", params={u"ids": args.wid})

        items = response.json().get(u"items")
        if not items:
            return False

        wiki_data = items[args.wid]

        cursor.execute(
            u"""
        INSERT INTO wikis (wiki_id, wam_score, title, url) VALUES (%s, %s, "%s", "%s")
        """
            % (args.wid, str(wiki_data[u"wam_score"]), my_escape(wiki_data[u"title"]), wiki_data[u"url"])
        )
        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_pages(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False

        print u"Inserting authority data for pages on wiki", args.wid

        dbargs = []
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u"_")
            dbargs.append((doc_id, article_id, wiki_id, str(authority_dict_fixed[doc_id])))

        cursor.execute(
            u"""
            INSERT INTO articles (doc_id, article_id, wiki_id, local_authority) VALUES %s
            """
            % u", ".join([u"""("%s", %s, %s, %s)""" % arg for arg in dbargs])
        )

        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_entities(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)

        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False

        print u"Priming entity data on", args.wid
        for page, entity_data in wpe.items():
            entity_list = map(
                my_escape, list(set(entity_data.get(u"redirects", {}).values() + entity_data.get(u"titles")))
            )
            for i in range(0, len(entity_list), 50):
                cursor.execute(
                    u"""
                INSERT IGNORE INTO topics (name) VALUES ("%s")
                """
                    % u'"), ("'.join(entity_list[i : i + 50])
                )
                db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def get_data(wiki_id):
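    # Return the wiki ID along with its top 50 heads and all of its entities,
    # each sorted by count in descending order.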
    use_caching(per_service_cache={'TopEntitiesService.get': {'dont_compute': True},
                                   'HeadsCountService.get': {'dont_compute': True}})
    hcs = HeadsCountService().get_value(wiki_id)
    tes = TopEntitiesService().get_value(wiki_id)
    if type(hcs) == dict:
        hcs = hcs.items()
    if type(tes) == dict:
        tes = tes.items()
    return wiki_id, {'heads': sorted(hcs, key=lambda y: y[1], reverse=True)[:50],
                     'entities': sorted(tes, key=lambda y: y[1], reverse=True)}
def get_data_doc(doc_id):
    use_caching()
    # should be CombinedEntitiesService yo
    heads = HeadsService().get_value(doc_id, {})
    entities = EntitiesService().get_value(doc_id, {})
    doc_ids_combined = {}
    if heads == {}:
        print doc_id, "no heads"
    if entities == {}:
        print doc_id, "no entities"
    return entities["titles"].values() + entities["redirects"].keys() + entities["redirects"].values()
def get_data_doc(doc_id):
    use_caching()
    #should be CombinedEntitiesService yo
    heads = HeadsService().get_value(doc_id, {})
    entities = EntitiesService().get_value(doc_id, {})
    doc_ids_combined = {}
    if heads == {}:
        print doc_id, "no heads"
    if entities == {}:
        print doc_id, "no entities"
    return (entities['titles'].values() + entities['redirects'].keys() +
            entities['redirects'].values())
Example #11
def main():
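    # Configure write-only caching for each requested service, then run the
    # wiki-level services one at a time for the given wiki ID.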
    global wiki_id
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    args, _ = get_args()
    wiki_id = args.wiki_id
    services = args.services.split(',')

    caching_dict = dict([(service+'.get', {'write_only': True}) for service in
                         services])
    use_caching(per_service_cache=caching_dict)

    log('Calling wiki-level services on %s' % args.wiki_id)

    #pool = Pool(processes=8)
    #s = pool.map_async(get_service, services)
    #s.wait()
    for service in services:
        get_service(service)
Example #12
def main():
    global wiki_id
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    args, _ = get_args()
    wiki_id = args.wiki_id
    services = args.services.split(',')

    caching_dict = dict([(service + '.get', {
        'write_only': True
    }) for service in services])
    use_caching(per_service_cache=caching_dict)

    log('Calling wiki-level services on %s' % args.wiki_id)

    #pool = Pool(processes=8)
    #s = pool.map_async(get_service, services)
    #s.wait()
    for service in services:
        get_service(service)
def get_data(wiki_id):
    use_caching(
        per_service_cache={
            'TopEntitiesService.get': {
                'dont_compute': True
            },
            'HeadsCountService.get': {
                'dont_compute': True
            }
        })
    hcs = HeadsCountService().get_value(wiki_id)
    tes = TopEntitiesService().get_value(wiki_id)
    if type(hcs) == dict:
        hcs = hcs.items()
    if type(tes) == dict:
        tes = tes.items()
    return wiki_id, {
        'heads': sorted(hcs, key=lambda y: y[1], reverse=True)[:50],
        'entities': sorted(tes, key=lambda y: y[1], reverse=True)
    }
def get_data_wid(wid):
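    # Build a doc ID -> preprocessed terms mapping from each page's entity
    # titles, redirects, and heads.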
    print wid
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {"titles": [], "redirects": {}})
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response["titles"]
            + entity_response["redirects"].keys()
            + entity_response["redirects"].values()
            + list(set(doc_ids_to_heads.get(doc_id, []))),
        )
    return doc_ids_combined.items()
def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    #should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {
            'titles': [],
            'redirects': {}
        })
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response['titles'] + entity_response['redirects'].keys() +
            entity_response['redirects'].values() +
            list(set(doc_ids_to_heads.get(doc_id, []))))
    return doc_ids_combined.items()
def get_data(wid):
    log(wid)
    use_caching(shouldnt_compute=True)
    #should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        log(wid, "no heads")
    if doc_ids_to_entities == {}:
        log(wid, "no entities")
    from_s3 = json.loads(bucket.get_key(
        'feature-data/page-%s.json' % wid).get_contents_as_string())
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(
            doc_id, {'titles': [], 'redirects': {}})
        doc_ids_combined[doc_id] = map(
            preprocess,
            entity_response['titles'] +
            entity_response['redirects'].keys() +
            entity_response['redirects'].values() +
            list(set(doc_ids_to_heads.get(doc_id, [])))
        ) + from_s3.get(doc_id, [])
    return doc_ids_combined.items()
Example #17
from nlp_services.caching import use_caching
from nlp_services.syntax import WikiToPageHeadsService
from nlp_services.discourse.entities import WikiPageToEntitiesService
from pprint import pprint

use_caching(shouldnt_compute=True)


def heads(wid):
    #pprint(WikiToPageHeadsService().get_value(wid, {}))
    return WikiToPageHeadsService().get_value(wid, {})


def entities(wid):
    #pprint(WikiPageToEntitiesService().get_value(wid, {}))
    return WikiPageToEntitiesService().get_value(wid, {})
def insert_contrib_data(args):
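    # For every page with an authority score: link the page's entities to rows
    # in `topics`, record its contributing users, and accumulate each user's
    # local authority per topic.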
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting page and author and contrib data for wiki", args.wid
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u'_')

            entity_data = wpe.get(doc_id, {})
            entity_list = filter(
                lambda x: x,
                map(
                    lambda x: x.strip(),
                    map(
                        my_escape,
                        list(
                            set(
                                entity_data.get(u'redirects', {}).values() +
                                entity_data.get(u'titles', []))))))

            cursor.execute(u"""
            SELECT topic_id FROM topics WHERE name IN ("%s")
            """ % (u'", "'.join(entity_list)))
            topic_ids = list(set([result[0] for result in cursor.fetchall()]))

            for topic_id in topic_ids:
                sql = u"""
                INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id) VALUES (%s, %s, %s)
                """ % (article_id, wiki_id, topic_id)
                cursor.execute(sql)
                db.commit()

            cursor = db.cursor()

            for contribs in PageAuthorityService().get_value(doc_id, []):
                cursor.execute(u"""
                INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s")
                """ % (contribs[u'userid'], my_escape(contribs[u'user'])))
                db.commit()

                cursor.execute(u"""
                INSERT INTO articles_users (article_id, wiki_id, user_id, contribs) VALUES (%s, %s, %d, %s)
                """ % (article_id, wiki_id, contribs[u'userid'],
                       contribs[u'contribs']))
                db.commit()

                local_authority = contribs[u'contribs'] * authority_dict_fixed.get(doc_id, 0)
                for topic_id in topic_ids:
                    cursor.execute(u"""
                    INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s)
                    ON DUPLICATE KEY UPDATE local_authority = local_authority + %s
                    """ % (contribs[u'userid'], topic_id, local_authority,
                           local_authority))
                    db.commit()
        db.commit()
        print u"Done with", args.wid
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def insert_contrib_data(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting page and author and contrib data for wiki", args.wid
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u"_")

            entity_data = wpe.get(doc_id, {})
            entity_list = filter(
                lambda x: x,
                map(
                    lambda x: x.strip(),
                    map(
                        my_escape,
                        list(set(entity_data.get(u"redirects", {}).values() + entity_data.get(u"titles", []))),
                    ),
                ),
            )

            cursor.execute(
                u"""
            SELECT topic_id FROM topics WHERE name IN ("%s")
            """
                % (u'", "'.join(entity_list))
            )
            topic_ids = list(set([result[0] for result in cursor.fetchall()]))

            for topic_id in topic_ids:
                sql = u"""
                INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id) VALUES (%s, %s, %s)
                """ % (
                    article_id,
                    wiki_id,
                    topic_id,
                )
                cursor.execute(sql)
                db.commit()

            cursor = db.cursor()

            for contribs in PageAuthorityService().get_value(doc_id, []):
                cursor.execute(
                    u"""
                INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s")
                """
                    % (contribs[u"userid"], my_escape(contribs[u"user"]))
                )
                db.commit()

                cursor.execute(
                    u"""
                INSERT INTO articles_users (article_id, wiki_id, user_id, contribs) VALUES (%s, %s, %d, %s)
                """
                    % (article_id, wiki_id, contribs[u"userid"], contribs[u"contribs"])
                )
                db.commit()

                local_authority = contribs[u"contribs"] * authority_dict_fixed.get(doc_id, 0)
                for topic_id in topic_ids:
                    cursor.execute(
                        u"""
                    INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s)
                    ON DUPLICATE KEY UPDATE local_authority = local_authority + %s
                    """
                        % (contribs[u"userid"], topic_id, local_authority, local_authority)
                    )
                    db.commit()
        db.commit()
        print u"Done with", args.wid
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False
def main():
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    use_caching()
    args = get_args()
    get_model_from_args(args)
    log("Done")
Example #21
def main():
    args, _ = get_args()
    use_caching(per_service_cache=dict([(service + '.get', {
        'write_only': True
    }) for service in args.services.split(',')]))
    call_services(args)
parser = argparse.ArgumentParser(description="Generate a per-page topic model using latent dirichlet analysis.")
parser.add_argument(
    "--wiki_ids",
    dest="wiki_ids_file",
    nargs="?",
    type=argparse.FileType("r"),
    help="The source file of wiki IDs sorted by WAM",
)
parser.add_argument("--num_wikis", dest="num_wikis", type=int, action="store", help="The number of wikis to process")
parser.add_argument(
    "--num_topics", dest="num_topics", type=int, action="store", help="The number of topics for the model to use"
)
parser.add_argument("--model_dest", dest="model_dest", type=str, action="store", help="Where to save the model")
args = parser.parse_args()

use_caching()


def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {"titles": [], "redirects": {}})
Example #23
def main():
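    # Export page, author, and topic authority data for one wiki to an Excel
    # workbook, optionally uploading the result to S3.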
    use_caching()
    args = get_args()
    set_global_num_processes(args.num_processes)
    api_data = get_api_data(args.wiki_id)

    workbook = xlwt.Workbook()
    pages_sheet = workbook.add_sheet("Pages by Authority")
    pages_sheet.write(0, 0, "Page")
    pages_sheet.write(0, 1, "Authority")

    print "Getting Page Data..."
    page_authority = get_page_authority(api_data)

    print "Writing Page Data..."
    pages, authorities = zip(*page_authority)
    scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
    for i, page in enumerate(pages):
        if i > 65000:
            break
        pages_sheet.write(i+1, 0, page)
        pages_sheet.write(i+1, 1, scaler.scale(authorities[i]))

    print "Getting Author and Topic Data..."
    author_authority = get_author_authority(api_data)
    topic_authority = sorted(WikiTopicsToAuthorityService().get_value(args.wiki_id),
                             key=lambda y: y[1]['authority'], reverse=True)

    print "Writing Author Data..."
    authors_sheet = workbook.add_sheet("Authors by Authority")
    authors_sheet.write(0, 0, "Author")
    authors_sheet.write(0, 1, "Authority")

    authors_topics_sheet = workbook.add_sheet("Topics for Best Authors")
    authors_topics_sheet.write(0, 0, "Author")
    authors_topics_sheet.write(0, 1, "Topic")
    authors_topics_sheet.write(0, 2, "Rank")
    authors_topics_sheet.write(0, 3, "Score")

    # why is total_authority not there?
    all_total_authorities = [author.get('total_authority', 0) for author in author_authority]
    scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, author in enumerate(author_authority):
        authors_sheet.write(i+1, 0, author['name'])
        authors_sheet.write(i+1, 1, scaler.scale(author.get('total_authority', 0)))
        for rank, topic in enumerate(author['topics'][:10]):
            if pivot_counter > 65000:
                break
            authors_topics_sheet.write(pivot_counter, 0, author['name'])
            authors_topics_sheet.write(pivot_counter, 1, topic[0])
            authors_topics_sheet.write(pivot_counter, 2, rank+1)
            authors_topics_sheet.write(pivot_counter, 3, topic[1])
            pivot_counter += 1
        if i > 65000:
            break

    print "Writing Topic Data"
    topics_sheet = workbook.add_sheet("Topics by Authority")
    topics_sheet.write(0, 0, "Topic")
    topics_sheet.write(0, 1, "Authority")

    topics_authors_sheet = workbook.add_sheet("Authors for Best Topics")
    topics_authors_sheet.write(0, 0, "Topic")
    topics_authors_sheet.write(0, 1, "Author")
    topics_authors_sheet.write(0, 2, "Rank")
    topics_authors_sheet.write(0, 3, "Authority")

    scaler = MinMaxScaler([x[1].get('authority', 0) for x in topic_authority], enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, topic in enumerate(topic_authority):
        topics_sheet.write(i+1, 0, topic[0])
        topics_sheet.write(i+1, 1, scaler.scale(topic[1]['authority']))
        authors = topic[1]['authors']
        for rank, author in enumerate(authors[:10]):
            if pivot_counter > 65000:
                break
            topics_authors_sheet.write(pivot_counter, 0, topic[0])
            topics_authors_sheet.write(pivot_counter, 1, author['author'])
            topics_authors_sheet.write(pivot_counter, 2, rank+1)
            topics_authors_sheet.write(pivot_counter, 3, author['topic_authority'])
            pivot_counter += 1

        if i > 65000:
            break

    print "Saving to Excel"
    wiki_name = api_data['url'].replace('http://', '').replace('.wikia', '').replace('.com/', '')
    fname = "%s-%s-authority-data-%s.xls" % (args.wiki_id, wiki_name,
                                             datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M'))
    workbook.save(fname)

    if args.send_to_s3:
        bucket = connect_s3().get_bucket('nlp-data')
        k = bucket.new_key('authority/%s/%s' % (args.wiki_id, fname))
        k.set_contents_from_filename(fname)

    print fname
Example #24
    def get_workbook(self, num_processes=8):
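        # Build an xlwt workbook with sheets for pages, authors, and topics by
        # authority, capping each sheet near the 65,000-row .xls limit.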
        use_caching()
        set_global_num_processes(num_processes)

        workbook = xlwt.Workbook()
        pages_sheet = workbook.add_sheet(u"Pages by Authority")
        pages_sheet.write(0, 0, u"Page")
        pages_sheet.write(0, 1, u"Authority")

        page_authority = self.get_all_pages()

        pages, authorities = zip(*page_authority)
        scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
        for i, page in enumerate(pages):
            if i > 65000:
                break
            pages_sheet.write(i+1, 0, page)
            pages_sheet.write(i+1, 1, scaler.scale(authorities[i]))

        author_authority = self.get_all_authors().values()

        for counter, author in enumerate(author_authority):
            author[u'topics'] = [topic.topic for topic in
                                 UserModel(author, self.args).get_topics_for_wiki(self.wiki_id, limit=5)]
            if counter > 25:
                break

        topic_authority = self.get_topics(limit=None)
        for counter, topic in enumerate(topic_authority):
            topic[u'authors'] = TopicModel(topic[u'topic'], self.args).get_users(5, for_api=True)
            if counter > 25:
                break

        authors_sheet = workbook.add_sheet(u"Authors by Authority")
        authors_sheet.write(0, 0, u"Author")
        authors_sheet.write(0, 1, u"Authority")

        authors_topics_sheet = workbook.add_sheet(u"Topics for Best Authors")
        authors_topics_sheet.write(0, 0, u"Author")
        authors_topics_sheet.write(0, 1, u"Topic")
        authors_topics_sheet.write(0, 2, u"Rank")
        authors_topics_sheet.write(0, 3, u"Score")

        # why is total_authority not there?
        all_total_authorities = [author.get(u'total_authority', 0) for author in author_authority]
        scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
        pivot_counter = 1
        for i, author in enumerate(author_authority):
            print author
            authors_sheet.write(i+1, 0, author[u'name'])
            authors_sheet.write(i+1, 1, scaler.scale(author.get(u'total_authority', 0)))
            for rank, topic in enumerate(author.get(u'topics', [])[:10]):
                if pivot_counter > 65000:
                    break
                authors_topics_sheet.write(pivot_counter, 0, author[u'name'])
                authors_topics_sheet.write(pivot_counter, 1, topic[0])
                authors_topics_sheet.write(pivot_counter, 2, rank+1)
                authors_topics_sheet.write(pivot_counter, 3, topic[1])
                pivot_counter += 1
            if i > 65000:
                break

        topics_sheet = workbook.add_sheet(u"Topics by Authority")
        topics_sheet.write(0, 0, u"Topic")
        topics_sheet.write(0, 1, u"Authority")

        topics_authors_sheet = workbook.add_sheet(u"Authors for Best Topics")
        topics_authors_sheet.write(0, 0, u"Topic")
        topics_authors_sheet.write(0, 1, u"Author")
        topics_authors_sheet.write(0, 2, u"Rank")
        topics_authors_sheet.write(0, 3, u"Authority")

        scaler = MinMaxScaler([x[1].get(u'authority', 0) for x in topic_authority], enforced_min=0, enforced_max=100)
        pivot_counter = 1
        for i, topic in enumerate(topic_authority):
            topics_sheet.write(i+1, 0, topic[0])
            topics_sheet.write(i+1, 1, scaler.scale(topic[1][u'authority']))
            authors = topic[1][u'authors']
            for rank, author in enumerate(authors[:10]):
                if pivot_counter > 65000:
                    break
                topics_authors_sheet.write(pivot_counter, 0, topic[0])
                topics_authors_sheet.write(pivot_counter, 1, author[u'author'])
                topics_authors_sheet.write(pivot_counter, 2, rank+1)
                topics_authors_sheet.write(pivot_counter, 3, author[u'topic_authority'])
                pivot_counter += 1

            if i > 65000:
                break

        return workbook
Example #25
def main():
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    use_caching()
    args = get_args()
    get_model_from_args(args)
    log("Done")
from boto.s3.key import Key
from boto.exception import S3ResponseError
import traceback
import boto
import sys
import re
import json
import time
import random

BUCKET = boto.connect_s3().get_bucket('nlp-data')

service_file = sys.argv[2] if len(sys.argv) > 2 else 'services-config.json'
SERVICES = json.loads(open(service_file).read())['services']

use_caching(per_service_cache=dict([(service+'.get', {'write_only': True}) for service in SERVICES]))

def process_file(filename):
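    # Parse the wiki and article IDs out of the filename ("<wiki_id>/<page_id>")
    # and run every configured service against the resulting doc ID.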
    if filename.strip() == '':
        return  # newline at end of file
    global SERVICES
    match = re.search('([0-9]+)/([0-9]+)', filename)
    if match is None:
        print "No match for %s" % filename
        return

    doc_id = '%s_%s' % (match.group(1), match.group(2))
    for service in SERVICES:
        try:
            getattr(sys.modules[__name__], service)().get(doc_id)
        except KeyboardInterrupt:
from boto import connect_s3
from nlp_services.caching import use_caching
from nlp_services.syntax import WikiToPageHeadsService, HeadsCountService, TopHeadsService
from nlp_services.discourse.entities import WikiEntitiesService, WpWikiEntitiesService, CombinedWikiEntitiesService, TopEntitiesService, WpTopEntitiesService, CombinedTopEntitiesService, WikiPageEntitiesService, WpWikiPageEntitiesService, CombinedWikiPageEntitiesService, EntityDocumentCountsService, WpEntityDocumentCountsService, CombinedDocumentEntityCountsService, WikiPageToEntitiesService, WpPageToEntitiesService, CombinedPageToEntitiesService
from nlp_services.discourse import AllEntitiesSentimentAndCountsService
from nlp_services.discourse.sentiment import WikiEntitySentimentService, WpWikiEntitySentimentService
from title_confirmation.wikia import AllTitlesService, RedirectsService

sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

#todo argparse
service_file = 'services-config.json'
SERVICES = json.loads(open(service_file).read())['wiki-services']

caching_dict = dict([(service+'.get', {'write_only': True}) for service in SERVICES])
use_caching(per_service_cache=caching_dict)

wid = sys.argv[1]
try:
    for service in SERVICES:
        try:
            print service
            getattr(sys.modules[__name__], service)().get(wid)
            caching_dict[service+'.get'] = {'dont_compute': True}  # DRY fool!
            use_caching(per_service_cache=caching_dict)
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            print 'Could not call %s on %s!' % (service, wid)
            print traceback.format_exc()
except:
                    type=int,
                    action='store',
                    help="The number of wikis to process")
parser.add_argument('--num_topics',
                    dest='num_topics',
                    type=int,
                    action='store',
                    help="The number of topics for the model to use")
parser.add_argument('--model_dest',
                    dest='model_dest',
                    type=str,
                    action='store',
                    help="Where to save the model")
args = parser.parse_args()

use_caching()


def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    #should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {
Example #29
def ingest_data(wiki_id):
    """
    Create Solr documents for a given wiki ID

    :param wiki_id: the ID of the wiki (int or str)
    :type wiki_id: int
    :return:
    """
    # make sure the all-pages and all-user-pages collections exist
    solr.existing_collection(solr.all_pages_collection())
    solr.existing_collection(solr.all_user_pages_collection())

    resp = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details', params={u'ids': wiki_id})
    items = resp.json()['items']
    if wiki_id not in items:
        print u"Wiki doesn't exist?"
        return

    api_data = items[wiki_id]
    wiki_data = {
        'id': api_data['id'],
        'wam_f': {'set': api_data['wam_score']},
        'title_s': {'set': api_data['title']},
        'attr_title': {'set': api_data['title']},
        'attr_desc': {'set': api_data['desc']}
    }
    for key in api_data['stats'].keys():
        wiki_data['%s_i' % key] = {'set': api_data['stats'][key]}

    wiki_api_data = requests.get(u'%swikia.php' % (api_data[u'url']),
                                 params={u'method': u'getForWiki',
                                         u'service': u'CrossWikiCore',
                                         u'controller': u'WikiaSearchIndexerController'}).json()[u'contents']

    wiki_data[u'hub_s'] = wiki_api_data[u'hub_s']
    
    # easier
    api_data[u'hub_s'] = wiki_api_data[u'hub_s']

    collection = solr.existing_collection(solr.collection_for_wiki(wiki_id))

    use_caching(is_read_only=True, shouldnt_compute=True)

    wpe = WikiPageToEntitiesService().get_value(wiki_id)
    if not wpe:
        print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", wiki_id
        return False

    documents = []

    grouped_futures = []

    pages_to_authority = WikiAuthorityService().get_value(str(wiki_data['id']))
    for counter, (doc_id, entity_data) in enumerate(wpe.items()):
        documents.append({
            'id': doc_id,
            'attr_entities': {'set': list(set(entity_data.get(u'redirects', {}).values()
                                              + entity_data.get(u'titles')))},
            'type_s': {'set': 'Page'},
            'authority_f': {'set': pages_to_authority.get(doc_id, 0)},
            'hub_s': wiki_api_data['hub_s']
        })

        if counter != 0 and counter % 1500 == 0:
            grouped_futures.append(
                group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
            )

            documents = []

    grouped_futures.append(
        group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
    )

    # block on completion of all grouped futures
    completed = 0
    total = 0
    while len(filter(lambda x: not x.ready(), grouped_futures)) > 1:
        new_completed = 0
        new_total = 0
        for future in grouped_futures:
            new_completed += future.completed_count()
            new_total += len(future.results)
        if completed != new_completed or total != new_total:
            completed = new_completed
            total = new_total
            print "Grouped Tasks: (%d/%d)" % (completed, total)
        sleep(2)

    all_user_tuples = []
    for future in grouped_futures:
        result = get_with_backoff(future, [])
        map(all_user_tuples.extend, result)

    all_user_tuples = list(set(all_user_tuples))
    if not all_user_tuples:
        print "Empty user tuples, bailing"
        return

    # assign the unique user ids to the first variable, and the unique usernames to the second
    all_user_ids, all_users = zip(*all_user_tuples)

    collection.commit()
    solr.all_pages_collection().commit()
    solr.all_user_pages_collection().commit()

    wiki_data['attr_entities'] = {'set': []}

    for count, entities in WikiEntitiesService().get_value(str(wiki_id)).items():
        for entity in entities:
            map(wiki_data['attr_entities']['set'].append, [entity] * int(count))  # goddamnit count isn't int

    wiki_data['user_ids_is'] = {'set': all_user_ids}
    wiki_data['attr_users'] = {'set': all_users}
    wiki_data['total_authority_f'] = {'set': sum(pages_to_authority.values())}
    wiki_data['authorities_fs'] = {'set': pages_to_authority.values()}

    wiki_collection = solr.existing_collection(solr.global_collection())
    wiki_collection.add([wiki_data])
    wiki_collection.commit()
    print "Committed wiki data"

    print "Retrieving user docs..."
    futures = group(build_wiki_user_doc.s(api_data, user_tuple) for user_tuple in all_user_tuples)()
    future_result_len = len(futures.results)
    while not futures.ready():
        print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)

    user_docs = get_with_backoff(futures, [])
    if not user_docs:
        print "User docs was empty. Possibly connection problems."
        return

    authority_scaler = MinMaxScaler([doc['total_page_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_page_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['scaled_contribs_f'] = {'set': scaled_contribs}
        doc['scaled_contribs_authority_f'] = {'set': scaled_authority * scaled_contribs}

    wiki_user_collection = solr.existing_collection(solr.wiki_user_collection())
    wiki_user_collection.add(user_docs)
    wiki_user_collection.commit()

    print "Analyzing topics"
    futures = group(get_wiki_topic_doc.s(wiki_data['id'], topic)
                    for topic in list(set(wiki_data['attr_entities']['set'])))()
    future_result_len = len(futures.results)
    counter = 0
    while not futures.ready():
        if counter % 5 == 0:
            print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
        counter += 1
    topic_docs = get_with_backoff(futures, [])
    if not topic_docs:
        print "No topics, probably a connection error"
        return

    collection.add(topic_docs)
    collection.commit()

    topic_collection = solr.existing_collection(solr.all_topics_collection())
    topic_collection.add(topic_docs)
    topic_collection.commit()
Example #30
def main():
    args, _ = get_args()
    use_caching(
        per_service_cache=dict([(service + ".get", {"write_only": True}) for service in args.services.split(",")])
    )
    call_services(args)