def get_most_popular_articles(source, campaign=''):
    """
    Fetch the most viewed articles for ``source`` from the pageview API.

    The look-back window (in days), the date format and the query template
    are read from the ``popular_pageviews`` configuration section.
    ``campaign`` is accepted but not used by this function.

    Returns a list of ``{'title': ..., 'pageviews': ...}`` dicts. An empty
    list is returned when ``get`` raises ``ValueError`` or when the payload
    does not contain ``items[0]['articles']``.
    """
    section = 'popular_pageviews'
    days = configuration.get_config_int(section, 'days')
    date_format = configuration.get_config_value(section, 'date_format')
    template = configuration.get_config_value(section, 'query')

    # Query the day that lies ``days`` days before the current UTC time.
    target_day = datetime.datetime.utcnow() - datetime.timedelta(days=days)
    url = template.format(source=source,
                          date=target_day.strftime(date_format))

    try:
        data = get(url)
    except ValueError:
        log.info('pageview query failed')
        return []

    has_articles = ('items' in data and len(data['items']) >= 1
                    and 'articles' in data['items'][0])
    if not has_articles:
        log.info('pageview data is not in a known format')
        return []

    return [{'title': entry['article'], 'pageviews': entry['views']}
            for entry in data['items'][0]['articles']]
def home():
    """
    Render ``index.html`` pre-populated from the request's query string.

    The ``s``, ``t``, ``seed``, ``search``, ``rank_method`` and ``campaign``
    query arguments (each defaulting to an empty string) are URL-quoted and
    passed to the template, together with the language pairs, the
    language-to-domain mapping, the event logger endpoint and the default
    search setting.
    """
    names = ('s', 't', 'seed', 'search', 'rank_method', 'campaign')
    values = {name: request.args.get(name, '') for name in names}
    campaign_info_file = ''
    pairs = language_pairs.get_language_pairs()

    # WikiGapFinder specific settings. TODO: these should be in a config file.
    if values['campaign'] == 'WikiGapFinder':
        values['s'] = values['s'] or 'en'
        values['t'] = values['t'] or 'sv'
        campaign_info_file = 'gf-wikigapfinder-campaign-info.tag'

    context = {
        'language_pairs': json.dumps(pairs),
        'language_to_domain_mapping': json.dumps(
            language_pairs.get_language_to_domain_mapping()),
    }
    # Quote every user-supplied value before it reaches the template.
    for name in names:
        context[name] = urllib.parse.quote(values[name])
    context['campaign_info_file'] = campaign_info_file
    context['event_logger_url'] = configuration.get_config_value(
        'endpoints', 'event_logger')
    context['default_search'] = configuration.get_config_value(
        'gapfinder', 'default_search')

    return render_template('index.html', **context)
def initialize_logging():
    """
    Configure logging from the ``logging`` configuration section.

    The root logger is set up with the configured format at WARNING level;
    the logger named after the ``recommendation`` package is then set to the
    configured level.
    """
    message_format = configuration.get_config_value('logging', 'format')
    logging.basicConfig(format=message_format, level=logging.WARNING)

    package_logger = logging.getLogger(recommendation.__name__)
    level_name = configuration.get_config_value('logging', 'level')
    package_logger.setLevel(logging.getLevelName(level_name))
def initialize_embedding(optimize=True):
    """
    Build the module-level ``_embedding`` from configuration.

    Reads the embedding path, package and name (each falling back to an
    empty string), the optimized embedding path and the minimum similarity
    from the ``related_articles`` section, creates a ``WikiEmbedding`` and
    calls its ``initialize`` with those settings and ``optimize``.
    """
    global _embedding
    section = 'related_articles'
    path, package, name = (
        configuration.get_config_value(section, key, fallback='')
        for key in ('embedding_path', 'embedding_package', 'embedding_name'))
    optimized_path = configuration.get_config_value(
        section, 'optimized_embedding_path')
    threshold = configuration.get_config_float(section, 'minimum_similarity')

    # The global is assigned before initialize() runs, as in the original.
    _embedding = WikiEmbedding(threshold)
    _embedding.initialize(path, package, name, optimize, optimized_path)
示例#5
0
def test_correct_endpoints_are_used(client, query_url):
    """
    Requesting ``query_url`` must hit the language pairs, event logger and
    related articles endpoints, and nothing else (three calls in total).
    """
    client.get(query_url)
    requested = [call.request.url for call in responses.calls]
    endpoints = [
        configuration.get_config_value('endpoints', key)
        for key in ('language_pairs', 'event_logger', 'related_articles')
    ]
    for endpoint in endpoints:
        # The configured endpoint only has to be a substring of a called URL.
        assert any(endpoint in url for url in requested)
    assert len(responses.calls) == 3
def get_sections_in_article(title):
    """
    Return ``{title: [section headings]}`` for an English-language article.

    The URL is the ``restbase`` endpoint (formatted with ``source='en'``)
    followed by the configured ``sections_query`` path. Headings are the
    upper-cased ``line`` values of the returned ``sections``; entries
    without a ``line`` key are skipped. If ``fetcher.get`` raises
    ``ValueError`` the heading list is empty.
    """
    base = configuration.get_config_value('endpoints', 'restbase')
    path_template = configuration.get_config_value('sections_query', 'path')
    url = base.format(source='en') + path_template.format(title=title)

    try:
        result = fetcher.get(url)
    except ValueError:
        return {title: []}

    headings = []
    for section in result.get('sections', []):
        if 'line' in section:
            headings.append(section['line'].upper())
    return {title: headings}
def log_api_request(source,
                    target,
                    seed=None,
                    search=None,
                    user_agent=None,
                    **kwargs):
    """
    Send a ``TranslationRecommendationAPIRequests`` event to the event logger.

    The event carries the current Unix timestamp, the source and target
    languages and, when truthy, ``seed`` and ``search`` (sent as
    ``searchAlgorithm``). The JSON payload is URL-quoted and appended as the
    query string of the configured ``event_logger`` endpoint. ``user_agent``,
    when not None, is sent as the ``User-Agent`` header. Extra keyword
    arguments are accepted and ignored.

    Delivery is best-effort: a ``requests`` failure is logged and never
    propagated to the caller.
    """
    event = dict(timestamp=int(time.time()),
                 sourceLanguage=source,
                 targetLanguage=target)
    if seed:
        event['seed'] = seed
    if search:
        event['searchAlgorithm'] = search

    payload = dict(schema='TranslationRecommendationAPIRequests',
                   revision=16261139,
                   wiki='metawiki',
                   event=event)
    # Serialise once; the same text is used for the URL and the log line.
    serialized = json.dumps(payload)

    url = configuration.get_config_value('endpoints', 'event_logger')
    url += '?' + urllib.parse.quote_plus(serialized)

    log.info('Logging event: %s', serialized)

    headers = {}
    if user_agent is not None:
        headers['User-Agent'] = user_agent

    try:
        requests.get(url, headers=headers)
    except requests.exceptions.RequestException as e:
        # Event logging must not break the API request, but a failure
        # should at least be visible in the logs.
        log.warning('Unable to log event: %s', e)
def get_wikidata_sitelinks(source, target, titles):
    """
    Returns a dictionary mapping from titles to wikidata ids
    for the articles in source missing in target

    Titles in the result have spaces replaced by underscores. An empty
    dictionary is returned when ``post`` raises ``ValueError`` or the
    response has no ``entities`` key.
    """
    endpoint = configuration.get_config_value('endpoints', 'wikidata')
    params = configuration.get_config_dict('wikidata_params')
    params['sites'] = params['sites'].format(source=source)
    params['titles'] = '|'.join(titles)

    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Wikidata API response')
        return {}

    wiki_key = '{}wiki'.format
    source_wiki = wiki_key(source)
    target_wiki = wiki_key(target)

    if 'entities' not in data:
        log.info('None of the titles have a Wikidata Item')
        return {}

    missing = {}
    for wikidata_id, entity in data['entities'].items():
        sitelinks = entity.get('sitelinks', None)
        if not sitelinks:
            continue
        # Keep only items present in the source wiki and absent in the target.
        if source_wiki not in sitelinks or target_wiki in sitelinks:
            continue
        missing[sitelinks[source_wiki]['title'].replace(' ', '_')] = wikidata_id

    if not missing:
        log.info('None of the source articles missing in the target')

    return missing
示例#9
0
def get_wikidata_sitelinks(source, target, titles):
    """
    Returns a dictionary mapping from titles to wikidata ids
    for the articles in source missing in target

    The titles are joined with ``|`` and posted to the configured
    ``wikidata`` endpoint. Result keys use underscores instead of spaces.
    A failed request (``ValueError``) or a response without ``entities``
    yields an empty dictionary.
    """
    endpoint = configuration.get_config_value('endpoints', 'wikidata')
    params = configuration.get_config_dict('wikidata_params')
    params['sites'] = params['sites'].format(source=source)
    params['titles'] = '|'.join(titles)

    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Wikidata API response')
        return {}

    source_wiki = '{}wiki'.format(source)
    target_wiki = '{}wiki'.format(target)

    if 'entities' not in data:
        log.info('None of the titles have a Wikidata Item')
        return {}

    # Pair every Wikidata id with its (possibly missing) sitelinks mapping.
    id_links = ((wikidata_id, entity.get('sitelinks', None))
                for wikidata_id, entity in data['entities'].items())
    title_id_dict = {
        links[source_wiki]['title'].replace(' ', '_'): wikidata_id
        for wikidata_id, links in id_links
        if links and source_wiki in links and target_wiki not in links
    }

    if not title_id_dict:
        log.info('None of the source articles missing in the target')

    return title_id_dict
def setup_function(function):
    """
    Per-test setup: clear the cached language pairs and register a mocked
    200 response carrying ``LANGUAGE_PAIRS`` for the language pairs endpoint.
    """
    language_pairs._language_pairs = None
    endpoint = configuration.get_config_value('endpoints', 'language_pairs')
    responses.add(responses.GET, endpoint, json=LANGUAGE_PAIRS, status=200)
def get_related_articles(source, seed):
    """
    Query the ``related_articles`` endpoint for up to 500 articles related
    to ``seed`` in ``source``.

    Returns whatever ``get`` returns, or an empty list if it raises
    ``ValueError``.
    """
    endpoint = configuration.get_config_value('endpoints', 'related_articles')
    query = {'source': source, 'seed': seed, 'count': 500}
    try:
        return get(endpoint, query)
    except ValueError:
        return []
def get_pageview_query_url(source, title):
    """
    Build the pageview query URL for a single article.

    The start and end offsets (in days) and the query template come from the
    ``single_article_pageviews`` configuration section; the offsets are
    turned into timestamps with ``get_relative_timestamp``.
    """
    section = 'single_article_pageviews'
    start_days = configuration.get_config_int(section, 'start_days')
    end_days = configuration.get_config_int(section, 'end_days')
    template = configuration.get_config_value(section, 'query')
    return template.format(source=source,
                           title=title,
                           start=get_relative_timestamp(start_days),
                           end=get_relative_timestamp(end_days))
示例#13
0
def test_getter_queries_correct_url():
    """
    The getter must issue exactly one request, to the pageviews endpoint,
    using the URL produced by ``data_fetcher.get_pageview_query_url``.
    """
    add_response()
    run_getter()
    assert len(responses.calls) == 1

    requested_url = responses.calls[0].request.url
    pageviews_endpoint = configuration.get_config_value('endpoints',
                                                        'pageviews')
    assert pageviews_endpoint in requested_url
    assert requested_url == data_fetcher.get_pageview_query_url(SOURCE, TITLE)
def build_wiki_search(source, seed, count, morelike):
    """
    Build the endpoint and parameters for a Wikipedia search request.

    ``count`` becomes ``srlimit``; ``seed`` becomes ``srsearch``, prefixed
    with ``morelike:`` when ``morelike`` is truthy. Returns the
    ``(endpoint, params)`` tuple.
    """
    endpoint_template = configuration.get_config_value('endpoints',
                                                       'wikipedia')
    endpoint = endpoint_template.format(source=source)
    params = configuration.get_config_dict('wiki_search_params')
    params['srlimit'] = count
    params['srsearch'] = 'morelike:' + seed if morelike else seed
    return endpoint, params
示例#15
0
def set_related_articles_response():
    """
    Register a mocked 200 JSON response for the related articles endpoint.

    Any GET whose URL starts with the configured endpoint followed by at
    least one more character is answered with ``RELATED_ARTICLE_RESPONSE``.
    """
    related_articles_endpoint = configuration.get_config_value(
        'endpoints', 'related_articles')
    # Escape the endpoint so that URL characters such as '.' and '?' are
    # matched literally instead of being treated as regex metacharacters.
    url_pattern = re.compile(re.escape(related_articles_endpoint) + '.')
    responses.add(responses.GET,
                  url_pattern,
                  body=json.dumps(RELATED_ARTICLE_RESPONSE),
                  status=200,
                  content_type='application/json')
示例#16
0
def get_pageview_query_url(source, title):
    """
    Return the pageview API URL for ``title`` on ``source``.

    Reads ``start_days``, ``end_days`` and ``query`` from the
    ``single_article_pageviews`` configuration section and fills the query
    template with timestamps from ``get_relative_timestamp``.
    """
    def _days(option):
        return configuration.get_config_int('single_article_pageviews',
                                            option)

    start_days = _days('start_days')
    end_days = _days('end_days')
    template = configuration.get_config_value('single_article_pageviews',
                                              'query')
    window = dict(start=get_relative_timestamp(start_days),
                  end=get_relative_timestamp(end_days))
    return template.format(source=source, title=title, **window)
def get_entities(params):
    """
    Post ``params`` to the configured ``wikidata`` endpoint and return the
    ``entities`` mapping of the response.

    An empty dict is returned when ``fetcher.post`` raises ``ValueError``,
    when the response contains ``warnings``, or when it has no ``entities``.
    """
    endpoint = configuration.get_config_value('endpoints', 'wikidata')
    try:
        data = fetcher.post(endpoint, data=params)
        # A response that carries warnings is treated like a failed request.
        rejected = 'warnings' in data
    except ValueError:
        rejected = True

    if rejected:
        log.info('Bad Wikidata API response')
        return {}

    return data.get('entities', {})
def initialize_language_pairs():
    """
    Load the language pairs into the module-level ``_language_pairs`` cache.

    Does nothing when the cache is already populated. Otherwise the
    configured ``language_pairs`` endpoint is fetched and its JSON body is
    stored.

    Raises:
        ConnectionError: if the request fails or returns an error status;
            the underlying ``requests`` exception is attached as the cause.
    """
    global _language_pairs
    if _language_pairs is not None:
        return

    language_pairs_endpoint = configuration.get_config_value('endpoints', 'language_pairs')
    try:
        result = requests.get(language_pairs_endpoint)
        result.raise_for_status()
        pairs = result.json()
    except requests.exceptions.RequestException as e:
        # Chain explicitly so the original failure stays in the traceback
        # as the direct cause.
        raise ConnectionError('Unable to load data from {}. {}'.format(language_pairs_endpoint, e)) from e
    _language_pairs = pairs
def initialize_language_pairs():
    """
    Populate the module-level ``_language_pairs`` cache on first use.

    When the cache is still ``None``, the configured ``language_pairs``
    endpoint is requested and the decoded JSON body is cached. Later calls
    return immediately.

    Raises:
        ConnectionError: if the request fails or returns an error status;
            the original ``requests`` exception is set as the cause.
    """
    global _language_pairs
    if _language_pairs is not None:
        return

    language_pairs_endpoint = configuration.get_config_value(
        'endpoints', 'language_pairs')
    try:
        result = requests.get(language_pairs_endpoint)
        result.raise_for_status()
        pairs = result.json()
    except requests.exceptions.RequestException as e:
        # ``from e`` records the requests error as the explicit cause.
        raise ConnectionError('Unable to load data from {}. {}'.format(
            language_pairs_endpoint, e)) from e
    _language_pairs = pairs
    def query_pageviews(self, s):
        """
        Query pageview API and parse results

        The look-back window, date format and query template are read from
        the ``popular_pageviews`` configuration section.

        Returns a list of ``(article, views)`` tuples for source ``s``. The
        list is empty when the request raises ``ValueError``; if the payload
        is malformed, the tuples collected up to that point are returned.
        """
        days = configuration.get_config_int('popular_pageviews', 'days')
        date_format = configuration.get_config_value('popular_pageviews', 'date_format')
        query = configuration.get_config_value('popular_pageviews', 'query')
        date = (datetime.datetime.utcnow() - datetime.timedelta(days=days)).strftime(date_format)
        query = query.format(source=s, date=date)
        try:
            data = data_fetcher.get(query)
        except ValueError:
            return []

        article_pv_tuples = []

        try:
            for d in data['items'][0]['articles']:
                article_pv_tuples.append((d['article'], d['views']))
        except (KeyError, IndexError, TypeError):
            # Only the errors a malformed payload can produce are handled;
            # a bare ``except`` would also trap KeyboardInterrupt/SystemExit.
            log.info('Could not get most popular articles for %s from pageview API. Try using a seed article.', s)

        return article_pv_tuples
def home():
    """
    Render ``index.html`` with the ``s``, ``t`` and ``seed`` query
    arguments (``None`` when absent), the language pairs, the
    language-to-domain mapping and the event logger endpoint.
    """
    args = request.args
    s, t, seed = args.get('s'), args.get('t'), args.get('seed')
    pairs = language_pairs.get_language_pairs()

    context = {
        'language_pairs': json.dumps(pairs),
        'language_to_domain_mapping': json.dumps(language_pairs.get_language_to_domain_mapping()),
        's': s,
        't': t,
        'seed': seed,
        'event_logger_url': configuration.get_config_value('endpoints', 'event_logger'),
    }
    return render_template('index.html', **context)
示例#22
0
def home():
    """
    Serve the landing page.

    Passes the raw ``s``, ``t`` and ``seed`` query arguments (``None`` if
    missing) to ``index.html``, along with the JSON-encoded language pairs
    and language-to-domain mapping and the configured event logger URL.
    """
    selection = {key: request.args.get(key) for key in ('s', 't', 'seed')}
    pairs_json = json.dumps(language_pairs.get_language_pairs())
    mapping_json = json.dumps(
        language_pairs.get_language_to_domain_mapping())
    event_logger_url = configuration.get_config_value(
        'endpoints', 'event_logger')

    return render_template('index.html',
                           language_pairs=pairs_json,
                           language_to_domain_mapping=mapping_json,
                           event_logger_url=event_logger_url,
                           **selection)
def get_disambiguation_pages(source, titles):
    """
    Returns the subset of titles that are disambiguation pages

    A page counts as a disambiguation page when its ``pageprops`` contain
    ``disambiguation``. Returned titles are de-duplicated and use
    underscores instead of spaces. An empty list is returned when ``post``
    raises ``ValueError``.
    """
    endpoint_template = configuration.get_config_value('endpoints', 'wikipedia')
    endpoint = endpoint_template.format(source=source)
    params = configuration.get_config_dict('disambiguation_params')
    params['titles'] = '|'.join(titles)

    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    found = set()
    for page in data.get('query', {}).get('pages', {}).values():
        if 'disambiguation' in page.get('pageprops', {}):
            found.add(page['title'].replace(' ', '_'))
    return list(found)
def get_categories_for_article(title):
    """
    Return ``{title: [category titles]}`` for an English-language article.

    Category titles have spaces replaced by underscores. The list is empty
    when ``fetcher.get`` raises ``ValueError`` or when the response does not
    contain exactly one page.
    """
    endpoint_template = configuration.get_config_value('endpoints', 'wikipedia')
    params = dict(action='query',
                  prop='categories',
                  format='json',
                  titles=title)
    endpoint = endpoint_template.format(source='en')

    try:
        result = fetcher.get(endpoint, params=params)
    except ValueError:
        return {title: []}

    pages = list(result.get('query', {}).get('pages', {}).values())
    if len(pages) != 1:
        return {title: []}

    (page,) = pages
    names = []
    for category in page.get('categories', []):
        if 'title' in category:
            names.append(category['title'].replace(' ', '_'))
    return {title: names}
示例#25
0
def get_disambiguation_pages(source, titles):
    """
    Returns the subset of titles that are disambiguation pages

    Titles are joined with ``|`` and posted to the ``wikipedia`` endpoint
    for ``source``. Pages whose ``pageprops`` include ``disambiguation`` are
    returned once each, with spaces turned into underscores. A
    ``ValueError`` from ``post`` results in an empty list.
    """
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('disambiguation_params')
    params['titles'] = '|'.join(titles)

    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    def _is_disambiguation(page):
        return 'disambiguation' in page.get('pageprops', {})

    pages = data.get('query', {}).get('pages', {}).values()
    unique_titles = {
        page['title'].replace(' ', '_')
        for page in pages if _is_disambiguation(page)
    }
    return list(unique_titles)
def initialize_language_pairs():
    """
    Load and validate the language pairs into ``_language_pairs``.

    Nothing happens when the cache is already set. Otherwise the configured
    endpoint is fetched; the payload is accepted only if its keys are
    exactly ``source`` and ``target`` and every value is a list. Request
    failures and invalid payloads are logged as warnings and leave the
    cache untouched; no exception is propagated for them.
    """
    global _language_pairs
    if _language_pairs is not None:
        return

    language_pairs_endpoint = configuration.get_config_value(
        'endpoints', 'language_pairs')
    try:
        response = requests.get(language_pairs_endpoint)
        response.raise_for_status()
        pairs = response.json()
        # A non-mapping payload has no .keys() and raises AttributeError,
        # which is handled below together with ValueError.
        if set(pairs.keys()) != {'source', 'target'}:
            raise ValueError()
        if any(not isinstance(value, list) for value in pairs.values()):
            raise ValueError()
        _language_pairs = pairs
    except requests.exceptions.RequestException as e:
        log.warning('Unable to load data from {}. {}'.format(
            language_pairs_endpoint, e))
    except (AttributeError, ValueError):
        log.warning('language pairs were invalid')
def get_category_members(source, category):
    """
    Fetch the direct members of ``category`` on the ``source`` wiki.

    Returns a dict with two sets of titles, ``pages`` and ``subcats``,
    filled from the ``categorymembers`` of the response according to each
    member's ``type``.

    NOTE(review): when ``get`` raises ``ValueError`` an empty *list* is
    returned instead of a dict; confirm callers expect this.
    """
    log.debug(category)
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('category_search_params')
    params['cmtitle'] = category

    pages, subcats = set(), set()

    try:
        response = get(endpoint, params=params)
    except ValueError:
        return []

    for member in response.get('query', {}).get('categorymembers', []):
        kind = member.get('type', None)
        if kind == 'page':
            pages.add(member.get('title'))
        elif kind == 'subcat':
            subcats.add(member.get('title'))
    return dict(pages=pages, subcats=subcats)
def log_api_request(source, target, seed=None, search=None, **kwargs):
    """
    Send a ``TranslationRecommendationAPIRequests`` event to the event logger.

    The event holds the current Unix timestamp, the source and target
    languages and, when truthy, ``seed`` and ``search`` (sent as
    ``searchAlgorithm``). The JSON payload is URL-quoted and appended as the
    query string of the configured ``event_logger`` endpoint. Extra keyword
    arguments are accepted and ignored.

    Delivery is best-effort: a ``requests`` failure is logged and never
    propagated to the caller.
    """
    event = dict(timestamp=int(time.time()),
                 sourceLanguage=source,
                 targetLanguage=target)
    if seed:
        event['seed'] = seed
    if search:
        event['searchAlgorithm'] = search

    payload = dict(schema='TranslationRecommendationAPIRequests',
                   revision=15405506,
                   wiki='metawiki',
                   event=event)
    # Serialise once; the same text is used for the URL and the log line.
    serialized = json.dumps(payload)

    url = configuration.get_config_value('endpoints', 'event_logger')
    url += '?' + urllib.parse.quote_plus(serialized)

    log.info('Logging event: %s', serialized)

    try:
        requests.get(url)
    except requests.exceptions.RequestException as e:
        # Event logging must not break the API request, but a failure
        # should at least be visible in the logs.
        log.warning('Unable to log event: %s', e)
示例#29
0
def test_configuration():
    """The configured embedding package must be the ``recommendation`` package."""
    package_name = recommendation.__name__
    configured = configuration.get_config_value('related_articles', 'embedding_package')
    assert configured == package_name
def get_expected_endpoint(the_filter):
    """
    Return the endpoint a filter is expected to call.

    ``filter_by_missing`` maps to the ``wikidata`` endpoint and
    ``filter_by_disambiguation`` to the ``wikipedia`` endpoint formatted for
    ``SOURCE``. Any other filter yields ``None``.
    """
    if the_filter is filters.filter_by_missing:
        return configuration.get_config_value('endpoints', 'wikidata')
    if the_filter is filters.filter_by_disambiguation:
        template = configuration.get_config_value('endpoints', 'wikipedia')
        return template.format(source=SOURCE)
    return None
def test_getter_queries_correct_url():
    """
    Exactly one request is made, and its URL both contains the pageviews
    endpoint and equals the URL built by ``get_pageview_query_url``.
    """
    add_response()
    run_getter()
    assert len(responses.calls) == 1
    (call,) = responses.calls[:1]
    endpoint = configuration.get_config_value('endpoints', 'pageviews')
    assert endpoint in call.request.url
    expected_url = data_fetcher.get_pageview_query_url(SOURCE, TITLE)
    assert expected_url == responses.calls[0].request.url
def get_relative_timestamp(relative_days):
    """
    Format the UTC time ``relative_days`` days from now using the
    ``single_article_pageviews`` date format (negative values lie in the
    past).
    """
    date_format = configuration.get_config_value('single_article_pageviews', 'date_format')
    moment = datetime.datetime.utcnow() + datetime.timedelta(days=relative_days)
    return moment.strftime(date_format)
def resource(filename):
    """
    Serve ``filename`` from the configured ``gapfinder`` resource directory.

    The file name is passed positionally because newer Flask releases
    renamed the ``filename`` keyword of ``send_from_directory`` to ``path``;
    the positional form works with both.
    """
    resource_path = configuration.get_config_value('gapfinder',
                                                   'resource_path')
    return send_from_directory(resource_path, filename)
def test_correct_endpoint_is_requested():
    """
    Logging an API request must issue exactly one GET, to a URL containing
    the configured event logger endpoint.
    """
    catch_all = re.compile(".")
    responses.add(responses.GET, catch_all, body="", status=200)
    event_logger.log_api_request("a", "b")
    assert len(responses.calls) == 1
    endpoint = configuration.get_config_value("endpoints", "event_logger")
    assert endpoint in responses.calls[0].request.url
def get_expected_endpoint(the_filter):
    """
    Map a filter function to the endpoint it should request.

    Returns the ``wikidata`` endpoint for ``filter_by_missing``, the
    ``wikipedia`` endpoint (formatted with ``SOURCE``) for
    ``filter_by_disambiguation``, and ``None`` for anything else.
    """
    expected = None
    if the_filter is filters.filter_by_missing:
        expected = configuration.get_config_value('endpoints', 'wikidata')
    elif the_filter is filters.filter_by_disambiguation:
        expected = configuration.get_config_value(
            'endpoints', 'wikipedia').format(source=SOURCE)
    return expected
def setup_function(function):
    """
    Reset the cached language pairs and mock the language pairs endpoint
    with a 200 response containing ``LANGUAGE_PAIRS``.
    """
    language_pairs._language_pairs = None
    mocked_response = dict(json=LANGUAGE_PAIRS, status=200)
    responses.add(responses.GET,
                  configuration.get_config_value('endpoints', 'language_pairs'),
                  **mocked_response)
示例#37
0
def get_relative_timestamp(relative_days):
    """
    Return the current UTC time shifted by ``relative_days`` days, formatted
    with the ``date_format`` of the ``single_article_pageviews`` section.
    """
    date_format = configuration.get_config_value('single_article_pageviews',
                                                 'date_format')
    offset = datetime.timedelta(days=relative_days)
    shifted = datetime.datetime.utcnow() + offset
    return shifted.strftime(date_format)
示例#38
0
def test_language_pairs_when_fetch_is_invalid(json_value):
    """
    With an invalid payload from the language pairs endpoint, no pairs are
    cached and every language combination is reported as valid.
    """
    responses.reset()
    endpoint = configuration.get_config_value('endpoints', 'language_pairs')
    responses.add(responses.GET, endpoint, json=json_value, status=200)
    assert language_pairs.get_language_pairs() is None
    assert language_pairs.is_valid_language_pair('any', 'combination') is True
示例#39
0
def test_correct_endpoint_is_requested():
    """
    ``log_api_request`` must perform a single request whose URL contains
    the configured event logger endpoint.
    """
    responses.add(responses.GET, re.compile('.'), body='', status=200)
    event_logger.log_api_request('a', 'b')
    calls = responses.calls
    assert len(calls) == 1
    requested_url = calls[0].request.url
    assert configuration.get_config_value('endpoints', 'event_logger') in requested_url
示例#40
0
def initialize_logging():
    """
    Set up logging from the ``logging`` configuration section: the root
    logger gets the configured format at WARNING level, and the
    ``recommendation`` package logger gets the configured level.
    """
    root_settings = {
        'format': configuration.get_config_value('logging', 'format'),
        'level': logging.WARNING,
    }
    logging.basicConfig(**root_settings)

    package_logger = logging.getLogger(recommendation.__name__)
    configured_level = configuration.get_config_value('logging', 'level')
    package_logger.setLevel(logging.getLevelName(configured_level))