Example #1
def split_file(input_filepath, output_filepaths, round_robin=False):
    if not round_robin:
        raise NotImplementedError(
            'Splitting files is only implemented as round robin.')
    with open_files(output_filepaths, 'w') as files:
        # We write each line to a different file in a round robin fashion
        for i, line in enumerate(yield_lines(input_filepath)):
            files[i % len(output_filepaths)].write(line + '\n')
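
The yield_lines helper used throughout these examples is not shown on this page; judging from the call sites (plain and gzipped files, an optional n_lines cap), a minimal sketch could look like the following. The signature and behaviour are assumptions, not the library's actual definition. (open_files, also unshown, presumably opens several files at once as a single context manager.)

import gzip

def yield_lines(filepath, gzipped=False, n_lines=None):
    # Assumed behaviour: yield stripped lines, optionally from a gzip archive,
    # stopping after n_lines when a limit is given.
    open_function = gzip.open if gzipped else open
    with open_function(filepath, 'rt') as f:
        for i, line in enumerate(f):
            if n_lines is not None and i >= n_lines:
                break
            yield line.rstrip('\n')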
Example #2
def mix_files(input_filepaths, props, output_filepath):
    np.random.seed(0)
    generators = [yield_lines(filepath) for filepath in input_filepaths]
    has_looped = [False] * len(input_filepaths)
    # Stop when all lines have been seen at least once
    with open(output_filepath, 'w') as f:
        while True:
            idx = np.random.choice(range(len(input_filepaths)), p=props)
            try:
                line = next(generators[idx])
            except StopIteration:
                has_looped[idx] = True
                # Start reading the file all over again
                generators[idx] = yield_lines(input_filepaths[idx])
                line = next(generators[idx])
            if all(has_looped):
                break
            f.write(f'{line}\n')
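
A hypothetical call, mixing two corpora so that roughly 80% of the output lines are drawn from the first file; props is forwarded to np.random.choice as p, so the proportions must sum to 1. The file names are made up for illustration.

mix_files(['corpus_a.txt', 'corpus_b.txt'], props=[0.8, 0.2],
          output_filepath='mixed.txt')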
Example #3
def fairseq_parse_all_hypotheses(out_filepath):
    hypotheses_dict = defaultdict(list)
    for line in yield_lines(out_filepath):
        match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
        if match:
            sample_id, hypothesis = match.groups()
            hypotheses_dict[int(sample_id)].append(hypothesis)
    # Sort in original order
    return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]
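
fairseq-generate writes hypotheses as lines of the form H-<sample_id><TAB><score><TAB><text>, which is what the regular expression above captures; grouping by sample id and iterating over the ids in order restores the original sample order. A hypothetical use, keeping the top hypothesis per sample (the path is made up):

all_hypotheses = fairseq_parse_all_hypotheses('generate-test.txt')
top_hypotheses = [hypotheses[0] for hypotheses in all_hypotheses]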
Example #4
def get_word2rank(vocab_size=10**5, language='en'):
    word2rank = {}
    line_generator = yield_lines(get_fasttext_embeddings_path(language))
    next(line_generator)  # Skip the first line (header)
    for i, line in enumerate(line_generator):
        if (i + 1) > vocab_size:
            break
        word = line.split(' ')[0]
        word2rank[word] = i
    return word2rank
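
Pre-trained fastText .vec files start with a header line (vocabulary size and dimension), hence the skipped first line, and list words in decreasing frequency, so the line index doubles as a frequency rank. A hedged usage sketch, treating unseen words as having the worst possible rank:

word2rank = get_word2rank(vocab_size=10**5, language='en')
rank = word2rank.get('the', len(word2rank))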
Example #5
def create_smaller_dataset(dataset, n_lines):
    new_dataset = f'{dataset}-lines{n_lines}'
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase, language in product(['train'], LANGUAGES):
            with open(new_filepaths_dict[(phase, language)],
                      'w') as output_file:
                for line in yield_lines(filepaths_dict[(phase, language)],
                                        n_lines=n_lines):
                    output_file.write(line + '\n')
        for phase, language in product(['valid', 'test'], LANGUAGES):
            shutil.copy(filepaths_dict[(phase, language)],
                        new_filepaths_dict[(phase, language)])
    return new_dataset
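
A hypothetical call (the dataset name is made up): only the train split is truncated to n_lines, while the valid and test splits are copied unchanged.

small_dataset = create_smaller_dataset('my_dataset', n_lines=10000)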
Example #6
def yield_json_documents_from_compressed(compressed_path):
    for document in yield_lines(compressed_path, gzipped=True):
        yield json.loads(document)
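
A hypothetical use, streaming documents from a gzipped JSON-lines file one at a time instead of loading the whole archive into memory (the path and field name are made up):

for document in yield_json_documents_from_compressed('documents.jsonl.gz'):
    print(document.get('text', ''))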
Example #7
def yield_pairs_from_file(filepath):
    for line in yield_lines(filepath):
        complex_sentence, simple_sentence = line.split('\t')
        yield (complex_sentence, simple_sentence)
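
This expects a tab-separated file with exactly two columns per line (a complex sentence and its simplification). A hypothetical use (the path is made up):

complex_sentences, simple_sentences = zip(*yield_pairs_from_file('pairs.tsv'))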
Example #8
import faiss

from muss.mining.preprocessing import create_base_index, get_index_name, get_sentences_paths
from muss.utils.helpers import yield_lines
from muss.laser import get_laser_embeddings
from muss.resources.paths import get_dataset_dir

# Create index
language = 'en'
n_train_sentences = 1000000
train_sentences = []
for sentences_path in get_sentences_paths(language='en'):
    for sentence in yield_lines(sentences_path):
        train_sentences.append(sentence)
        if len(train_sentences) == n_train_sentences:
            break
    if len(train_sentences) == n_train_sentences:
        break

get_embeddings = lambda sentences: get_laser_embeddings(
    sentences, max_tokens=3000, language=language)  # noqa: E731
output_dir = get_dataset_dir('uts') / f'base_indexes/laser_{language}'
output_dir.mkdir(parents=True, exist_ok=True)
create_base_index(train_sentences, get_index_name(), get_embeddings,
                  faiss.METRIC_INNER_PRODUCT, output_dir)
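
Once the base index has been written by create_base_index, it can presumably be loaded back and queried with LASER embeddings of new sentences using standard faiss calls; the file name passed to faiss.read_index below is an assumption, since create_base_index controls where and under what name the index is saved.

import numpy as np

# Assumption: the trained index was saved under output_dir with the name
# returned by get_index_name().
index = faiss.read_index(str(output_dir / get_index_name()))
query_embeddings = np.ascontiguousarray(
    get_embeddings(['This is a query sentence.']), dtype=np.float32)
faiss.normalize_L2(query_embeddings)  # inner product on unit vectors = cosine similarity
distances, neighbor_ids = index.search(query_embeddings, 10)  # 10 nearest neighbours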