# These function excerpts also rely on helpers from the muss package
# (open_files, get_fasttext_embeddings_path, create_directory_or_skip,
# get_filepaths_dict, get_dataset_dir, LANGUAGES) in addition to the imports below.
import json
import re
import shutil
from collections import defaultdict
from itertools import product

import numpy as np

from muss.utils.helpers import yield_lines


def split_file(input_filepath, output_filepaths, round_robin=False):
    if not round_robin:
        raise NotImplementedError('Splitting files is only implemented as round robin.')
    with open_files(output_filepaths, 'w') as files:
        # We write each line to a different file in a round robin fashion
        for i, line in enumerate(yield_lines(input_filepath)):
            files[i % len(output_filepaths)].write(line + '\n')
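# Illustrative usage sketch (not part of the original codebase; the temporary
# files and the `_demo_split_file` name are hypothetical): split a small corpus
# into three shards, so that lines 0, 3, 6 land in the first shard, lines 1, 4
# in the second, and lines 2, 5 in the third.
def _demo_split_file():
    import tempfile
    from pathlib import Path

    tmp_dir = Path(tempfile.mkdtemp())
    input_path = tmp_dir / 'corpus.txt'
    input_path.write_text('\n'.join(f'sentence {i}' for i in range(7)) + '\n')
    output_paths = [tmp_dir / f'shard{i}.txt' for i in range(3)]
    split_file(input_path, output_paths, round_robin=True)
    for output_path in output_paths:
        print(output_path.name, output_path.read_text().splitlines())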
def mix_files(input_filepaths, props, output_filepath):
    np.random.seed(0)
    generators = [yield_lines(filepath) for filepath in input_filepaths]
    has_looped = [False] * len(input_filepaths)  # Stop when all lines have been seen at least once
    with open(output_filepath, 'w') as f:
        while True:
            idx = np.random.choice(range(len(input_filepaths)), p=props)
            try:
                line = next(generators[idx])
            except StopIteration:
                has_looped[idx] = True
                # Start reading the file all over again
                generators[idx] = yield_lines(input_filepaths[idx])
                line = next(generators[idx])
            if all(has_looped):
                break
            f.write(f'{line}\n')
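# Illustrative usage sketch (paths, sizes and proportions are hypothetical, not
# part of the original codebase): `props` is forwarded to np.random.choice as a
# probability vector, so it needs one entry per input file and must sum to 1.
# Sampling stops only once every input file has been read through at least
# once, with exhausted files restarted from the beginning.
def _demo_mix_files():
    import tempfile
    from pathlib import Path

    tmp_dir = Path(tempfile.mkdtemp())
    first_path = tmp_dir / 'first_corpus.txt'
    second_path = tmp_dir / 'second_corpus.txt'
    first_path.write_text('\n'.join(f'first {i}' for i in range(100)) + '\n')
    second_path.write_text('\n'.join(f'second {i}' for i in range(50)) + '\n')
    mixed_path = tmp_dir / 'mixed.txt'
    # Roughly 70% of the written lines come from the first file.
    mix_files([first_path, second_path], props=[0.7, 0.3], output_filepath=mixed_path)
    print(mixed_path.read_text().splitlines()[:5])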
def fairseq_parse_all_hypotheses(out_filepath):
    hypotheses_dict = defaultdict(list)
    for line in yield_lines(out_filepath):
        # Hypothesis lines in fairseq-generate output have the form 'H-{sample_id}\t{score}\t{hypothesis}'
        match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
        if match:
            sample_id, hypothesis = match.groups()
            hypotheses_dict[int(sample_id)].append(hypothesis)
    # Sort in original order
    return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]
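# Illustrative usage sketch (the sentences and scores below are made up):
# fairseq-generate writes one 'H-{sample_id}\t{score}\t{hypothesis}' line per
# hypothesis, so with beam search and --nbest > 1 several hypotheses share a
# sample id and end up grouped in the same list.
def _demo_fairseq_parse_all_hypotheses():
    import tempfile
    from pathlib import Path

    out_path = Path(tempfile.mkdtemp()) / 'generate.out'
    out_path.write_text(
        'S-0\tThe source sentence .\n'
        'H-0\t-0.12\tThe first hypothesis .\n'
        'H-0\t-0.34\tThe second hypothesis .\n'
        'H-1\t-0.56\tAnother hypothesis .\n'
    )
    # Prints [['The first hypothesis .', 'The second hypothesis .'], ['Another hypothesis .']]
    print(fairseq_parse_all_hypotheses(out_path))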
def get_word2rank(vocab_size=10**5, language='en'):
    # fastText .vec files list words in decreasing order of frequency, so the
    # line index can be used directly as the word's frequency rank.
    word2rank = {}
    line_generator = yield_lines(get_fasttext_embeddings_path(language))
    next(line_generator)  # Skip the first line (header)
    for i, line in enumerate(line_generator):
        if (i + 1) > vocab_size:
            break
        word = line.split(' ')[0]
        word2rank[word] = i
    return word2rank
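# Hypothetical follow-up helper (the name `get_log_rank` and the default rank
# are illustrative assumptions, not from the repository): frequency ranks are a
# cheap proxy for lexical complexity, with rarer words getting a larger
# log-rank and out-of-vocabulary words treated as maximally rare.
def get_log_rank(word, word2rank, default_rank=10**5):
    import math

    return math.log(1 + word2rank.get(word, default_rank))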
def create_smaller_dataset(dataset, n_lines):
    new_dataset = f'{dataset}-lines{n_lines}'
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase, language in product(['train'], LANGUAGES):
            with open(new_filepaths_dict[(phase, language)], 'w') as output_file:
                for line in yield_lines(filepaths_dict[(phase, language)], n_lines=n_lines):
                    output_file.write(line + '\n')
        for phase, language in product(['valid', 'test'], LANGUAGES):
            shutil.copy(filepaths_dict[(phase, language)], new_filepaths_dict[(phase, language)])
    return new_dataset
def yield_json_documents_from_compressed(compressed_path):
    for document in yield_lines(compressed_path, gzipped=True):
        yield json.loads(document)
def yield_pairs_from_file(filepath):
    for line in yield_lines(filepath):
        complex_sentence, simple_sentence = line.split('\t')
        yield (complex_sentence, simple_sentence)
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import faiss

from muss.mining.preprocessing import create_base_index, get_index_name, get_sentences_paths
from muss.utils.helpers import yield_lines
from muss.laser import get_laser_embeddings
from muss.resources.paths import get_dataset_dir

# Create index
language = 'en'
n_train_sentences = 1000000
train_sentences = []
for sentences_path in get_sentences_paths(language='en'):
    for sentence in yield_lines(sentences_path):
        train_sentences.append(sentence)
        if len(train_sentences) == n_train_sentences:
            break
    if len(train_sentences) == n_train_sentences:
        break

get_embeddings = lambda sentences: get_laser_embeddings(sentences, max_tokens=3000, language=language)  # noqa: E731
output_dir = get_dataset_dir('uts') / f'base_indexes/laser_{language}'
output_dir.mkdir(exist_ok=True)
create_base_index(train_sentences, get_index_name(), get_embeddings, faiss.METRIC_INNER_PRODUCT, output_dir)