Example #1
def coherence_metrics():
    metric_fcts = load_metric_fcts('coherence')
    if len(metric_fcts) == 0:
        nbprint('No metrics active.')
        return

    topiclist_infos = data.get_all_topiclist_infos()
    if len(topiclist_infos) == 0:
        nbprint('No topics found.')
        return

    # Group them into batches based on token_version and add num_tokens
    topiclist_info_batches = defaultdict(list)
    for info in topiclist_infos:
        for num_tokens in config.metrics['num_tokens']:
            extended_info = info.copy()
            extended_info['num_tokens'] = num_tokens
            if 'second_info' in info:
                token_version = info['second_info']['token_version']
            else:
                token_version = info['token_version']
            topiclist_info_batches[token_version].append(extended_info)

    for token_version, batch in topiclist_info_batches.items():
        nbprint('Batch {}'.format(token_version)).push()
        for metric_id, fct in metric_fcts.items():
            start = time.time()
            nbprint('Metric: {}'.format(
                config.metrics['coherence'][metric_id]['name'])).push()
            coherence_metric_batch(token_version, batch, metric_id, fct)
            end = time.time()
            nbprint('Runtime: {} minutes'.format((end - start) / 60)).pop()
        nbprint.pop()
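
Every example here leans on the same nbprint idiom: nbprint(text) prints at the current indentation level and returns an object whose push() and pop() raise and lower that level, while nbprint.push() and nbprint.pop() also work standalone. The class itself never appears in these snippets; the following is a minimal sketch of that assumed interface, inferred purely from the call sites, not the project's actual implementation.

class _NBPrint:
    # Minimal sketch of the assumed nbprint interface (not the original).
    # Inferred from usage: nbprint('text') prints indented and returns
    # something chainable; push()/pop() adjust the indentation level.

    def __init__(self):
        self.level = 0

    def __call__(self, text):
        print('    ' * self.level + str(text))
        return self

    def push(self):
        self.level += 1
        return self

    def pop(self):
        self.level = max(0, self.level - 1)
        return self

nbprint = _NBPrint()

Under this sketch, nbprint('Batch 0').push() prints the line and indents everything that follows until the matching nbprint.pop().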
Example #2
def run_importer(info=None):
    nbprint('Importer').push()

    if info is None:
        iterate(["data"], [import_data], depth=0)
    else:
        import_data(info)

    nbprint.pop()
Example #3
def run_tokenizer(info=None):
    nbprint('Tokenizer').push()

    if info is None:
        iterate(['token:BC', 'data'], tokenize)
    else:
        tokenize(info)

    nbprint.pop()
Example #4
def run_vocab(info=None):
    nbprint('Vocab').push()

    if info is None:
        iterate(["data", "token", "vocab"], [check_tokens, build_vocab])
    else:
        check_tokens(info)
        build_vocab(info)

    nbprint.pop()
Example #5
def run_distiller():
    global rejector
    rejector = Rejector(0.99)

    nbprint('Distiller').push()

    iterate(['distiller', 'distillerinputs'],
            add_second_info,
            print_iterates=False)

    nbprint.pop()
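
Rejector is not defined in any of these snippets. From Rejector(0.99) here and rejector.allow() in Example #11, a plausible reading is a random subsampler that lets only a small fraction of optional runs through. The sketch below assumes the constructor argument is a rejection probability; that interpretation is a guess, not the project's documented behavior.

import random

class Rejector:
    # Hypothetical sketch: reject a random fraction of calls.
    # Assumption: the constructor argument is the rejection probability,
    # so Rejector(0.99) lets roughly 1 in 100 calls through.

    def __init__(self, reject_probability):
        self.reject_probability = reject_probability

    def allow(self):
        # True with probability (1 - reject_probability)
        return random.random() >= self.reject_probability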
Example #6
    def add_data(self, filename):
        nbprint("Loading '{}'".format(filename)).push()
        folderpath = join(config.paths["rawdata"], "tweetsodp")
        jsonfilename = join(folderpath, filename + ".json")
        zipfilename = join(folderpath, filename + ".json.zip")

        self.load_id_to_classname(folderpath, filename)
        if isfile(jsonfilename):
            with open(jsonfilename, "r") as jsonfile:
                self.parse_files(jsonfile)
        else:
            with zipfile.ZipFile(zipfilename) as zipf:
                with zipf.open(filename + ".json") as jsonfile:
                    self.parse_files(jsonfile)
        nbprint.pop()
Example #7
    def run(self):
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)

            # Iterate all archives
            folder = join(config.paths["rawdata"], "tweetsla")
            archives = self.get_archives(folder)
            for idx, archive in enumerate(archives):
                nbprint('{}/{}: {}'.format(idx + 1, len(archives),
                                           archive)).push()
                self.archivepath = join(folder, archive)
                self.import_archive()
                nbprint.pop()

            # Print Meta Info
            self.docinfo.save_meta(self.info)
Example #8
def clustering_metrics():
    metric_fcts = load_metric_fcts('clustering')
    clustering_data = data.load_metric_data('clustering')

    # First evaluate everything by taking the column-wise maximum of H as the cluster index
    nbprint('H Matrix').push()
    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)

        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue

            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = load_class_array_from_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)

            # Save everything in between
            data.save_metric_data(clustering_data, 'clustering')

    # Then take cluster indices directly from the c vector
    nbprint.pop()
    nbprint('C Vector').push()
    c_vec_infos = data.get_all_c_vec_infos(labeled_only=True)
    for info in ProgressIterator(c_vec_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)

        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue

            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = data.load_c_vec(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)

            # Save everything in between
            data.save_metric_data(clustering_data, 'clustering')
    nbprint.pop()
Example #9
def run_models(info=None):
    nbprint('Models').push()

    if info is None:
        iterate(['models', 'modelinputs', 'num_topics'],
                [check_input_mat, run_model])
    else:
        info['model_info'] = config.models['list'][info['model_name']]
        info['model'] = get_model(info)
        if not info['model'].output_of(info):
            nbprint('Model is not compatible with inputs.')
        else:
            try:
                check_input_mat(info)
                run_model(info)
            except BreakIteration:
                pass
    nbprint.pop()
Example #10
def call_next(what, callbacks, print_string, new_data, info, depth,
              print_iterates):
    if print_string and print_iterates:
        nbprint(print_string)
    if print_iterates:
        nbprint.push()
    new_info = {**info, **new_data}
    if len(callbacks) < len(what):
        # More axes than callbacks left: descend without consuming a callback
        iterate(what[1:], callbacks, new_info, depth + 1, print_iterates)
    else:
        try:
            if callbacks[0]:
                callbacks[0](new_info)
            if len(what) > 1:
                iterate(what[1:], callbacks[1:], new_info, depth + 1,
                        print_iterates)
        except BreakIteration:
            if print_iterates:
                nbprint('skipping')
    if print_iterates:
        nbprint.pop()
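
call_next only makes sense together with iterate, which never appears in these examples. The sketch below shows one iterate consistent with the call sites above: each entry in what names a configuration axis, iterate expands the first axis into (print_string, new_data) candidates and hands each one to call_next, a bare function is accepted in place of a callback list (see Example #3), and BreakIteration lets a callback abandon the current branch. The expand_axis helper is hypothetical; the real project presumably derives the candidates from config.

class BreakIteration(Exception):
    # Raised by a callback to skip the rest of the current branch
    # (caught in call_next above and in Example #9).
    pass

def expand_axis(axis, info):
    # Hypothetical helper: yield (print_string, new_data) pairs for one
    # axis. The real candidates presumably come from config; this stub
    # only shows the expected shape.
    yield 'axis {}'.format(axis), {axis: 0}

def iterate(what, callbacks, info=None, depth=0, print_iterates=True):
    # Sketch of an iterate() compatible with call_next() above
    # (assumed, not the project's original).
    if info is None:
        info = {}
    if callable(callbacks):
        # A single callable is allowed in place of a list
        callbacks = [callbacks]
    for print_string, new_data in expand_axis(what[0], info):
        call_next(what, callbacks, print_string, new_data, info, depth,
                  print_iterates)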
Example #11
def run_distiller_on(first_info, second_info):
    global rejector
    must_execute = False
    info = first_info.copy()
    if second_info is None:
        must_execute = True
    else:
        must_execute = (first_info.get('token_version',
                                       None) == second_info['token_version']
                        and first_info.get('vocab_version', None)
                        == second_info['vocab_version']
                        and first_info.get('vector_version', None)
                        == second_info['vector_version'])
        info['second_info'] = second_info
    if must_execute or rejector.allow():
        nbprint('({}), ({})'.format(info_summary_str(first_info),
                                    info_summary_str(second_info))).push()
        if config.skip_existing and data.topiclist_exists(info):
            nbprint('Skipping Distiller (file(s) exists)')
        else:
            info['distiller'].run_distiller(info)
            info['distiller'].save()
            nbprint('Distiller: success')
        nbprint.pop()
Example #12
def run_topic_metrics():
    nbprint('Topic Metrics').push()

    nbprint('Coherence').push()
    coherence_metrics()
    nbprint.pop()

    nbprint('Similarity').push()
    similarity_metrics()
    nbprint.pop()

    nbprint.pop()
Example #13
def run_model_metrics():
    nbprint('Model Metrics').push()

    nbprint('Clustering').push()
    clustering_metrics()
    nbprint.pop()

    nbprint('Classification').push()
    classification_metrics()
    nbprint.pop()

    nbprint.pop()
Example #14
def run_vectorizer(info=None):
    nbprint('Vectorizer').push()
    global runvars

    if info is None:
        if config.vectorizer['run_B']:
            nbprint('BoW').push()
            runvars = {}
            iterate(['data', 'token:BC', 'vocab', 'vector:B'],
                    [count_mat, bow])
            nbprint.pop()

        if config.vectorizer['run_C']:
            nbprint('cBoW').push()
            runvars = {}
            iterate(['data', 'token:C', 'vocab', 'vector:C'],
                    [count_mat, cbow])
            nbprint.pop()

        if config.vectorizer['run_P']:
            nbprint('Phrase').push()
            runvars = {}
            iterate(['data', 'vector:P'], [phrase])
            nbprint.pop()
    else:
        runvars = {}
        vector_bcp, vector_id = config.split(info['vector_version'])
        if vector_bcp == 'B' or vector_bcp == 'C':
            count_mat(info)
            if vector_bcp == 'B':
                bow(info)
            else:
                cbow(info)
        else:
            phrase(info)

    runvars = None
    nbprint.pop()
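
The only non-obvious call here is config.split: Examples #3 and #14 suggest that vector and token versions carry a one-letter type prefix ('B', 'C', or 'P') followed by an id, e.g. 'B0'. The exact format is never shown; under that assumption, a minimal sketch:

def split(version):
    # Hypothetical sketch: split 'B0' into ('B', '0').
    # Assumes a one-letter type prefix followed by the version id.
    return version[0], version[1:]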