Exemplo n.º 1
0
from soweego.commons import constants, target_database, utils
from soweego.linker import train

# Module-level logger, named after this module via ``__name__``
LOGGER = logging.getLogger(__name__)


# Let the user pass extra kwargs to the classifier
# This is for development purposes only, and is not explicitly documented
@click.command(context_settings={
    'ignore_unknown_options': True,
    'allow_extra_args': True
})
@click.argument('classifier', type=click.Choice(constants.CLASSIFIERS))
@click.argument('catalog',
                type=click.Choice(target_database.supported_targets()))
@click.argument('entity',
                type=click.Choice(target_database.supported_entities()))
@click.option('-k',
              '--k-folds',
              default=5,
              help="Number of folds, default: 5.")
@click.option(
    '-s',
    '--single',
    is_flag=True,
    help='Compute a single evaluation over all k folds, instead of k '
    'evaluations.',
)
@click.option(
    '-n',
Exemplo n.º 2
0
import click
import requests
from pandas import read_csv
from sqlalchemy.exc import SQLAlchemyError
from tqdm import tqdm

from soweego.commons import keys, target_database
from soweego.commons.constants import SUPPORTED_ENTITIES
from soweego.commons.db_manager import DBManager
from soweego.importer.models import mix_n_match
from soweego.wikidata.vocabulary import HUMAN_QID

# Module-level logger, named after this module
LOGGER = logging.getLogger(__name__)

# Symmetric difference between the supported target catalogs and Twitter
SUPPORTED_TARGETS = set(
    target_database.supported_targets()
).symmetric_difference({keys.TWITTER})
# Column keys of the input CSV, in order (QID, target ID, confidence)
INPUT_CSV_HEADER = (keys.QID, keys.TID, keys.CONFIDENCE)
COMMIT_EVERY = 10_000  # DB entity batch size

# Mix'n'match database name and API endpoint
MNM_DB = 's51434__mixnmatch_p'
MNM_API_URL = 'https://tools.wmflabs.org/mix-n-match/api.php'
# Query parameters sent to the API; 'catalog' starts out empty
MNM_API_ACTIVATION_PARAMS = dict(
    query='update_overview',
    catalog=None,  # To be filled by activate_catalog
)

# strftime-style pattern, e.g., 20190528131053
TIMESTAMP_FORMAT = '%Y%m%d%H%M%S'
# Fixed field values (presumably for uploaded entries - confirm
# against the code that consumes them)
NOTE_FIELD = 'Uploaded by soweego'
SEARCH_WP_FIELD = 'en'
EXT_DESC_FIELD = 'soweego confidence score: {}'  # '{}' is filled later
USER_FIELD = 0  # stands for 'automatically matched'
Exemplo n.º 3
0
from soweego.importer.discogs_dump_extractor import DiscogsDumpExtractor
from soweego.importer.imdb_dump_extractor import IMDbDumpExtractor
from soweego.importer.musicbrainz_dump_extractor import MusicBrainzDumpExtractor

# Module-level logger, named after this module via ``__name__``
LOGGER = logging.getLogger(__name__)

# Maps each catalog key to the dump extractor imported for that catalog
DUMP_EXTRACTOR = {
    keys.DISCOGS: DiscogsDumpExtractor,
    keys.IMDB: IMDbDumpExtractor,
    keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
}
# File name template with ``catalog`` and ``entity`` placeholders;
# presumably names the CSV listing rotten URLs - confirm against usage
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.csv'


@click.command()
@click.argument('catalog', type=click.Choice(target_database.supported_targets()))
@click.option(
    '--url-check',
    is_flag=True,
    help=(
        'Check for rotten URLs while importing. Default: no. '
        'WARNING: this will dramatically increase the import time.'
    ),
)
@click.option(
    '--dir-io',
    type=click.Path(file_okay=False),
    default=constants.WORK_DIR,
    help=f'Input/output directory, default: {constants.WORK_DIR}.',
)
def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
Exemplo n.º 4
0
# (retrieved, TIMESTAMP) reference object
# NOTE: these statements run once, when the module is imported, so
# TODAY and TIMESTAMP keep the date of import for the whole process
TODAY = date.today()
# Day-precision Wikibase time value built from today's year/month/day
TIMESTAMP = pywikibot.WbTime(
    site=REPO,
    year=TODAY.year,
    month=TODAY.month,
    day=TODAY.day,
    precision='day',
)
# Reference claim on the 'retrieved' property from the vocabulary
RETRIEVED_REFERENCE = pywikibot.Claim(REPO,
                                      vocabulary.RETRIEVED,
                                      is_reference=True)
# The reference claim's value is the timestamp built above
RETRIEVED_REFERENCE.setTarget(TIMESTAMP)

# We also support Twitter.
# Use a union, not a symmetric difference (``^``): the latter would
# silently drop Twitter if ``supported_targets()`` ever included it
SUPPORTED_TARGETS = target_database.supported_targets() | {TWITTER}


@click.command()
@click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS))
@click.argument('entity',
                type=click.Choice(target_database.supported_entities()))
@click.argument('invalid_identifiers', type=click.File())
@click.option(
    '-s',
    '--sandbox',
    is_flag=True,
    help='Perform all edits on the Wikidata sandbox item Q4115189.',
)
def delete_cli(catalog, entity, invalid_identifiers, sandbox):
    """Delete invalid identifiers.