def test_open_file_in_folder(tmpdir):
    """Check that text written via ``open_file_in_folder`` can be read back.

    The target path sits in a ``foo`` subdirectory of ``tmpdir``; the test
    writes ``'baz'`` through the helper and then reads the same path with the
    built-in ``open`` to compare the contents.
    """
    target = os.path.join(str(tmpdir), 'foo', 'bar')

    with open_file_in_folder(target, 'w') as writer:
        writer.write('baz')

    with open(target) as reader:
        contents = reader.read()

    assert 'baz' == contents
예제 #2
0
def test_open_file_in_folder(tmpdir):
    """Round-trip a short string through ``open_file_in_folder``.

    Writes ``'baz'`` to ``<tmpdir>/foo/bar`` using the helper under test and
    asserts that reading that path back yields the same text.
    """
    path_parts = (str(tmpdir), 'foo', 'bar')
    payload = 'baz'

    with open_file_in_folder(os.path.join(*path_parts), 'w') as out:
        out.write(payload)

    with open(os.path.join(*path_parts)) as saved:
        found = saved.read()

    assert payload == found
예제 #3
0
def save_publications():
    """Dump every publication to disk as JSON lines.

    The destination is taken from the ``DISAMBIGUATION_PUBLICATIONS_PATH``
    application setting. Each item produced by ``get_all_publications`` is
    serialized with ``json.dumps`` and written on its own line; the resulting
    file is meant to feed ``BEARD`` during training and prediction.
    """
    output_path = current_app.config['DISAMBIGUATION_PUBLICATIONS_PATH']

    with open_file_in_folder(output_path, 'w') as out:
        for record in get_all_publications():
            line = json.dumps(record) + '\n'
            out.write(line)
예제 #4
0
def save_curated_signatures_and_input_clusters():
    """Save curated signatures and input clusters to disk.

    Writes two JSON-lines files, whose locations come from the
    ``DISAMBIGUATION_CURATED_SIGNATURES_PATH`` and
    ``DISAMBIGUATION_INPUT_CLUSTERS_PATH`` application settings.

    * The curated-signatures file receives one line for every signature
      returned by ``get_all_curated_signatures`` that has a truthy
      ``author_id``; these are the signatures used as ground truth by
      ``BEARD``. Signatures without an ``author_id`` are not written there.
    * The input-clusters file receives one line per cluster: first one
      cluster per distinct ``author_id`` (grouping all of that author's
      signature UUIDs), then one singleton cluster, with ``author_id`` set
      to ``None``, for each signature that had no author.

    Cluster ids are consecutive integers starting at 0 and shared across
    both kinds of cluster.
    """
    signatures_with_author = defaultdict(list)
    signatures_without_author = []

    with open_file_in_folder(
            current_app.config['DISAMBIGUATION_CURATED_SIGNATURES_PATH'],
            'w') as fd:
        for signature in get_all_curated_signatures():
            if signature.get('author_id'):
                signatures_with_author[signature['author_id']].append(
                    signature['signature_uuid'])
                fd.write(json.dumps(signature) + '\n')
            else:
                signatures_without_author.append(signature['signature_uuid'])

    with open_file_in_folder(
            current_app.config['DISAMBIGUATION_INPUT_CLUSTERS_PATH'],
            'w') as fd:
        # A single running counter is used for both loops so that numbering
        # continues correctly even when there are no clusters with an author
        # (relying on the first loop's variable would raise in that case).
        next_cluster_id = 0

        for author_id, signature_uuids in six.iteritems(
                signatures_with_author):
            fd.write(
                json.dumps({
                    'author_id': author_id,
                    'cluster_id': next_cluster_id,
                    'signature_uuids': signature_uuids,
                }) + '\n')
            next_cluster_id += 1

        for signature_uuid in signatures_without_author:
            fd.write(
                json.dumps({
                    'author_id': None,
                    'cluster_id': next_cluster_id,
                    'signature_uuids': [signature_uuid],
                }) + '\n')
            next_cluster_id += 1
예제 #5
0
def save_sampled_pairs():
    """Write sampled signature pairs to disk as JSON lines.

    The output location is read from ``DISAMBIGUATION_SAMPLED_PAIRS_PATH``.
    Pairs are obtained from ``sample_signature_pairs``, which is given the
    curated-signatures path, the input-clusters path and the requested
    sample size from the application configuration. Each pair is serialized
    on its own line; the file is meant to be used by ``BEARD`` during
    training.
    """
    config = current_app.config
    output_path = config['DISAMBIGUATION_SAMPLED_PAIRS_PATH']

    with open_file_in_folder(output_path, 'w') as out:
        # The remaining settings are looked up only after the output file
        # has been opened, in the same order as they are passed on.
        sampled = sample_signature_pairs(
            config['DISAMBIGUATION_CURATED_SIGNATURES_PATH'],
            config['DISAMBIGUATION_INPUT_CLUSTERS_PATH'],
            config['DISAMBIGUATION_SAMPLED_PAIRS_SIZE'],
        )
        for sampled_pair in sampled:
            out.write(json.dumps(sampled_pair) + '\n')
예제 #6
0
 def save_model(self, output_filename):
     """Pickle ``self.estimator`` to ``output_filename``.

     The estimator is serialized with ``pickle.HIGHEST_PROTOCOL``.

     Args:
         output_filename: path of the file to write; it is opened through
             ``open_file_in_folder``.
     """
     # Pickle output is bytes, so the file must be opened in binary mode:
     # text mode fails on Python 3 and can corrupt the stream on platforms
     # that translate newlines.
     # NOTE(review): assumes open_file_in_folder forwards the mode to
     # ``open`` unchanged - confirm against the helper.
     with open_file_in_folder(output_filename, 'wb') as fd:
         pickle.dump(self.estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)
예제 #7
0
 def save_model(self, output_filename):
     """Serialize ``self.estimator`` with pickle into ``output_filename``.

     Uses ``pickle.HIGHEST_PROTOCOL`` for the dump.

     Args:
         output_filename: destination path, opened via
             ``open_file_in_folder``.
     """
     # Binary mode is required because pickle writes bytes; a text-mode
     # handle raises TypeError on Python 3 and may mangle the data where
     # newline translation applies.
     # NOTE(review): presumes open_file_in_folder passes the mode string
     # straight through to ``open`` - verify in the helper.
     with open_file_in_folder(output_filename, 'wb') as fd:
         pickle.dump(self.estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)