示例#1
0
def main(argv=None):
    """
    Command-line entry point.

    Parses the arguments described by the module docstring, configures
    logging, collects the requested dependents, resolves the input/output
    streams and the extractor count, then delegates to `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # WARNING by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # An imported path yields either one Dependent or an iterable of them.
    dependents = []
    for path in args['<dependent>']:
        imported = yamlconf.import_path(path)
        if isinstance(imported, Dependent):
            dependents.append(imported)
        else:
            dependents.extend(imported)

    input_path = args['--input']
    source = sys.stdin if input_path == "<stdin>" else open(input_path)
    observations = read_observations(source)

    output_path = args['--output']
    if output_path == "<stdout>":
        output = sys.stdout
    else:
        output = open(output_path, 'w')

    extractors_arg = args['--extractors']
    extractors = (cpu_count() if extractors_arg == "<cpu count>"
                  else int(extractors_arg))

    run(observations, dependents, output, extractors, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, sets up logging, resolves the observation
    source, the output stream and the worker count, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # WARNING by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    in_path = args['--input']
    obs = read_observations(
        sys.stdin if in_path == "<stdin>" else open(in_path))

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    workers_arg = args['--workers']
    if workers_arg == "<cpu-count>":
        workers = cpu_count()
    else:
        workers = int(workers_arg)

    run(obs, output, workers, args['--verbose'])
示例#3
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, resolves the input
    observations and the output stream, builds an `mwapi.Session` for the
    requested API host and hands everything to `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # The output file is a write target; without an explicit mode
        # open() returns a read-only handle and every write would fail.
        output = open(args['--output'], 'w')

    claims = args['--claim']

    session = mwapi.Session(args['--api-host'],
                            user_agent="ArticleQuality fetch_text utility.")

    verbose = args['--verbose']

    run(session, observations, claims, output, verbose)
示例#4
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, imports the feature
    set, optionally loads a model, resolves the input/output streams and
    calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    sys.path.insert(0, ".")  # Search local directory first
    features = yamlconf.import_module(args['<features>'])
    label_name = args['<label>']

    # The model is optional; None is passed through when it was not given.
    model_path = args['<model>']
    model = Model.load(open(model_path)) if model_path is not None else None

    additional_fields = args['<additional-field>']

    input_path = args['--input']
    source = sys.stdin if input_path == "<stdin>" else open(input_path)
    observations = read_observations(source)

    output_path = args['--output']
    if output_path == "<stdout>":
        output = sys.stdout
    else:
        output = open(output_path, 'w')

    run(observations, output, features, label_name, model, additional_fields,
        args['--verbose'])
示例#5
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging (raising the
    'requests' logger to WARNING), resolves the input/output streams and
    the thread count, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    logging.getLogger('requests').setLevel(logging.WARNING)

    host = args['--host']
    try_deleted_first = args['--deleted-1st']

    in_path = args['--input']
    obs = read_observations(
        sys.stdin if in_path == "<stdin>" else open(in_path))

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    threads_arg = args['--threads']
    if threads_arg == "<cpu-count>":
        threads = cpu_count()
    else:
        threads = int(threads_arg)

    run(host, obs, try_deleted_first, output, threads, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, gathers dependents
    from the given import paths, resolves the input/output streams and the
    extractor count, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # WARNING by default; --debug switches to DEBUG.
    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # A path may import as a single Dependent or as an iterable of them.
    dependents = []
    for import_path in args['<dependent>']:
        loaded = yamlconf.import_path(import_path)
        if not isinstance(loaded, Dependent):
            dependents.extend(loaded)
        else:
            dependents.append(loaded)

    if args['--input'] != "<stdin>":
        in_stream = open(args['--input'])
    else:
        in_stream = sys.stdin
    observations = read_observations(in_stream)

    if args['--output'] != "<stdout>":
        output = open(args['--output'], 'w')
    else:
        output = sys.stdout

    if args['--extractors'] != "<cpu count>":
        extractors = int(args['--extractors'])
    else:
        extractors = cpu_count()

    run(observations, dependents, output, extractors, args['--verbose'])
示例#7
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, reads labelings, groups them by page
    title (each group ordered by timestamp), resolves the thread count and
    output stream, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    dump_paths = args['<dump-file>']

    if args['--labelings'] == "<stdin>":
        labelings = read_observations(sys.stdin)
    else:
        path = os.path.expanduser(args['--labelings'])
        labelings = read_observations(open(path))

    # Bucket by title with a dict instead of `groupby` (presumably
    # itertools.groupby): that only merges *adjacent* items, so with input
    # not sorted by page title a title could produce several groups and the
    # dict would keep only the last one, silently dropping labelings.
    page_labelings = {}
    for labeling in labelings:
        page_labelings.setdefault(labeling['page_title'], []).append(labeling)
    for title_labelings in page_labelings.values():
        title_labelings.sort(key=lambda l: l['timestamp'])

    if args['--threads'] == "<cpu_count>":
        threads = cpu_count()
    else:
        threads = int(args['--threads'])

    if args['--output'] == "<stdout>":
        output = sys.stdout
    else:
        output = open(os.path.expanduser(args['--output']), "w")

    verbose = args['--verbose']

    run(dump_paths, page_labelings, output, threads, verbose=verbose)
示例#8
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, silences urllib3
    warnings, collects the revision IDs from the input observations and the
    remaining options, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    # Requests is loud.  Be quiet requests.
    requests.packages.urllib3.disable_warnings()

    ores_urls = args['<ores-url>']
    context = args['<context>']

    # Only the 'rev_id' of each observation is kept.
    if args['--input'] == "<stdin>":
        source = sys.stdin
    else:
        source = open(args['--input'])
    rev_ids = [ob['rev_id'] for ob in read_observations(source)]

    # A missing --model is normalised to an empty list.
    models = args['--model'] if args['--model'] is not None else []

    batch_size = int(args['--batch-size'])
    delay = float(args['--delay'])

    run(ores_urls, context, models, rev_ids, batch_size, delay,
        args['--verbose'])
示例#9
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging (raising the
    'requests' logger to WARNING), loads every model file given, resolves
    the input/output streams and the worker count, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    logging.getLogger('requests').setLevel(logging.WARNING)

    logger.info("Loading models...")
    # Each entry pairs the model file's base name with the loaded scorer.
    models = []
    for model_path in args['<model>']:
        model_name = os.path.basename(model_path)
        models.append((model_name, SentenceScorer.load(open(model_path))))

    in_path = args['--input']
    obs = read_observations(
        sys.stdin if in_path == "<stdin>" else open(in_path))

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    workers_arg = args['--workers']
    workers = (cpu_count() if workers_arg == "<cpu-count>"
               else int(workers_arg))

    run(models, obs, output, workers, args['--verbose'])
示例#10
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, reads labelings, groups them by page
    title (each group ordered by timestamp), resolves the thread count and
    output stream, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    dump_paths = args['<dump-file>']

    if args['--labelings'] == "<stdin>":
        labelings = read_observations(sys.stdin)
    else:
        path = os.path.expanduser(args['--labelings'])
        labelings = read_observations(open(path))

    # Collect labelings per title in a dict rather than via `groupby`
    # (presumably itertools.groupby), which only merges consecutive items:
    # if a title recurs non-adjacently in the input, the later group would
    # overwrite the earlier one and labelings would be lost.
    page_labelings = {}
    for labeling in labelings:
        page_labelings.setdefault(labeling['page_title'], []).append(labeling)
    for title_labelings in page_labelings.values():
        title_labelings.sort(key=lambda l: l['timestamp'])

    if args['--threads'] == "<cpu_count>":
        threads = cpu_count()
    else:
        threads = int(args['--threads'])

    if args['--output'] == "<stdout>":
        output = sys.stdout
    else:
        output = open(os.path.expanduser(args['--output']), "w")

    verbose = args['--verbose']

    run(dump_paths, page_labelings, output, threads, verbose=verbose)
示例#11
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, resolves the input
    observations and the output stream, builds an `mwapi.Session` for the
    requested API host and calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # Must be opened for writing: the default mode is read-only, which
        # would make every write to the output fail.
        output = open(args['--output'], 'w')

    claims = args['--claim']

    session = mwapi.Session(args['--api-host'],
                            user_agent="WikiClass fetch_text utility.")

    verbose = args['--verbose']

    run(session, observations, claims, output, verbose)
示例#12
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, resolves the labelings source and the
    output stream, builds an `mwapi.Session` for the requested API host and
    calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    labelings_path = args['--labelings']
    source = sys.stdin if labelings_path == '<stdin>' else open(labelings_path)
    labelings = read_observations(source)

    output_path = args['--output']
    if output_path == '<stdout>':
        output = sys.stdout
    else:
        output = open(output_path, 'w')

    session = mwapi.Session(args['--api-host'], user_agent=DRAFTTOPIC_UA)

    run(labelings, output, session, args['--verbose'])
示例#13
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, resolves the labelings source and the
    output stream, builds an `mwapi.Session` for the requested API host and
    calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    if args['--labelings'] == '<stdin>':
        labelings = read_observations(sys.stdin)
    else:
        labelings = read_observations(open(args['--labelings']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # Open for writing; the default read-only mode would reject writes
        # (and would require the output file to exist already).
        output = open(args['--output'], 'w')

    session = mwapi.Session(args['--api-host'],
                            user_agent="WikiClass fetch_text utility.")

    verbose = args['--verbose']

    run(labelings, output, session, verbose)
示例#14
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, resolves the input
    observations and the output stream, then calls `write_labels`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    label_field = args['<label>']

    input_path = args['--input']
    source = sys.stdin if input_path == '<stdin>' else open(input_path)
    observations = read_observations(source)

    output_path = args['--output']
    output = sys.stdout if output_path == '<stdout>' else open(output_path, 'w')

    write_labels(observations, label_field, output)
示例#15
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, resolves the labelings source and the
    output stream, builds an `mwapi.Session` for the requested API host and
    calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    if args['--labelings'] == '<stdin>':
        labelings = read_observations(sys.stdin)
    else:
        labelings = read_observations(open(args['--labelings']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # The output is a write target, so it needs mode 'w'; open()
        # defaults to read-only.
        output = open(args['--output'], 'w')

    session = mwapi.Session(args['--api-host'],
                            user_agent="ArticleQuality fetch_text utility.")

    verbose = args['--verbose']

    run(labelings, output, session, verbose)
示例#16
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, resolves the
    input/output streams, builds an `mwapi.Session`, loads and inverts the
    mid-level wikiprojects JSON file, then calls `run`. With --verbose the
    elapsed wall-clock time of `run` is logged.

    Args:
        argv: Argument list forwarded unchanged to docopt.

    Raises:
        SystemExit: with status 1 when the mid-level wikiprojects file
            cannot be opened or parsed.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        output = open(args['--output'], 'w')

    # A backslash continuation inside the literal used to embed a long run
    # of spaces in the user agent; keep it on one line instead.
    session = mwapi.Session(
        args['--api-host'],
        user_agent="WikiProjects fetch_wikiprojects utility.")

    try:
        with open(args['--mid_level_wp']) as fwp:
            mid_level_wp = json.load(fwp)
    except (OSError, TypeError, ValueError):
        # OSError: unreadable/missing file; TypeError: option not supplied
        # (None path); ValueError: invalid JSON or undecodable bytes.
        # No debugger breakpoint here: it would hang non-interactive runs.
        logger.exception(
            "Failed to load mid-level wikiprojects file, check and run again")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)
    mid_level_wp = invert_mid_level_projects(mid_level_wp)

    verbose = args['--verbose']

    start_time = datetime.now()
    run(session, observations, output, mid_level_wp, verbose)
    end_time = datetime.now()
    time_elapsed = end_time - start_time
    if verbose:
        logger.info('Time taken (hh:mm:ss.ms): {}'.format(time_elapsed))
示例#17
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, reads the language
    code and the integer -n limit, resolves the input/output streams and
    calls `balance_sample`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    lang_code = args['<lang>']
    max_n = int(args['-n'])

    input_path = args['--input']
    source = sys.stdin if input_path == '<stdin>' else open(input_path)
    observations = read_observations(source)

    output_path = args['--output']
    output = sys.stdout if output_path == '<stdout>' else open(output_path, 'w')

    balance_sample(observations, lang_code, max_n, output)
示例#18
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, resolves the
    input/output streams, passes the observations and label name to
    `process_labels` and writes the result as indented JSON.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        output = open(args['--output'], 'w')
    label_name = args['<label>']

    config = process_labels(observations, label_name)
    output.write(json.dumps(config, indent=4))

    # Only close a file this function opened. Closing sys.stdout would
    # break any later print/write by an in-process caller of main().
    if output is sys.stdout:
        output.flush()
    else:
        output.close()
示例#19
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, reads observations
    from standard input, loads the scorer model from the given file and
    calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    observations = read_observations(sys.stdin)

    # Close the model file as soon as loading finishes instead of leaking
    # the handle. Assumes ScorerModel.load reads the file eagerly --
    # TODO confirm.
    with open(args['<model-file>']) as model_file:
        model = ScorerModel.load(model_file)
    verbose = args['--verbose']

    run(observations, model, verbose)
示例#20
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging (raising the
    'urllib3.connectionpool' logger to WARNING), resolves the input/output
    streams, the thread count and an `mwapi.Session`, then calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)

    input_path = args['--input']
    source = sys.stdin if input_path == '<stdin>' else open(input_path)
    observations = read_observations(source)

    output_path = args['--output']
    output = sys.stdout if output_path == '<stdout>' else open(output_path, 'w')

    threads = int(args['--threads'])

    session = mwapi.Session(args['--api-host'], user_agent=DRAFTTOPIC_UA)

    run(observations, session, threads, output)
示例#21
0
def main(argv=None):
    """
    Command-line entry point.

    Parses arguments with docopt, configures logging, reads observations
    from standard input, imports the feature set and calls `run`.

    Args:
        argv: Argument list forwarded unchanged to docopt.
    """
    args = docopt.docopt(__doc__, argv=argv)

    # INFO by default; --debug switches to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    observations = read_observations(sys.stdin)

    sys.path.insert(0, ".")  # Search local directory first
    features = yamlconf.import_module(args['<features>'])

    run(observations, features, args['<label>'], args['--verbose'])
示例#22
0
# Extract feature values for every training revision and keep each result
# as a JSON-encoded observation line. Revisions whose extraction fails are
# reported and skipped.
trainingData = []
trainingInfo = []
for revTrainId in trainingRevId:
    revTrainId = int(revTrainId)
    try:
        #print("https://en.wikipedia.org/wiki/?diff={0}".format(revTrainId))
        trainingRevData = list(api_extractor.extract(revTrainId, features))
        trainingObserv = {"rev_id": revTrainId, "cache": trainingRevData}
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the script.
        print('Revision Data Not Found')
        continue
    trainingObserv = json.dumps(trainingObserv)
    trainingData.append(trainingObserv)

# Parse the JSON lines back into observation objects.
for trainings in read_observations(trainingData):
    trainingInfo.append(trainings)

# Same extraction for the test revisions.
testData = []
testInfo = []
for revTestId in testRevId:
    revTestId = int(revTestId)
    try:
        #print("https://en.wikipedia.org/wiki/?diff={0}".format(revTestId))
        testRevData = list(api_extractor.extract(revTestId, features))
        testObserv = {"rev_id": revTestId, "cache": testRevData}
    except Exception:
        # See above: do not swallow KeyboardInterrupt / SystemExit.
        print('Revision Data Not Found')
        continue
    testObserv = json.dumps(testObserv)
    testData.append(testObserv)
示例#23
0
        }
    except RuntimeError as e:
        sys.stderr.write(str(e))
    else:
        print(observation)
        training_features.append(observation)

print("Dump observations to file")
import bz2

from revscoring.utilities.util import dump_observation, read_observations

# The file name ends in ".bz2", so write real bz2-compressed text; plain
# open() produced an uncompressed file that bz2-aware tools cannot read.
with bz2.open("observations.json.bz2", "wt") as dumpfile:
    for observation in training_features:
        dump_observation(observation, dumpfile)

# Read the observations back through the same codec ("rt" = text mode;
# bz2.open defaults to binary).
with bz2.open("observations.json.bz2", "rt") as dumpfile:
    training_features = list(read_observations(dumpfile))

from revscoring.scoring.models import GradientBoosting

is_approved = GradientBoosting(features,
                               labels=[True, False],
                               version="Demo",
                               learning_rate=0.01,
                               max_features="log2",
                               n_estimators=700,
                               max_depth=5,
                               population_rates={
                                   False: 0.5,
                                   True: 0.5
                               },
                               scale=True,