def prepare_worker_(document_path):
    """Turn the documents of one file into windowed training instances.

    The documents in ``document_path`` are read through
    ``trec_utils.TRECTextReader``. Each document that has an entry in
    ``prepare_worker.document_assocs`` is converted into a list of
    ``(doc_id, instance, label)`` tuples, and the tuple
    ``(doc_id, instances_and_labels, label)`` is put on
    ``prepare_worker.result_queue``.

    Every setting (``encoding``, ``args``, ``words``, ``document_assocs``
    and ``result_queue``) is read from attributes of the ``prepare_worker``
    function object; presumably these are attached before this function
    runs -- verify against the caller.

    Args:
        document_path: path of the document file; handed to the reader as a
            single-element list.

    Returns:
        The number of documents for which a result was put on the queue.
    """
    trec_reader = trec_utils.TRECTextReader(
        [document_path], encoding=prepare_worker.encoding)

    processed_count = 0

    documents = trec_reader.iter_documents(
        replace_digits=True, strip_html=True)

    for doc_id, doc_text in documents:
        if doc_id not in prepare_worker.document_assocs:
            logging.debug('Document "%s" does not exist in associations.',
                          doc_id)

            continue

        def _report_empty_document(num_yielded_windows, remaining_tokens):
            # Only documents that produced no window at all are reported.
            if num_yielded_windows:
                return

            logging.error('Document "%s" (%s) yielded zero instances; '
                          'remaining tokens: %s.',
                          doc_id, doc_text, remaining_tokens)

        if prepare_worker.args.no_padding:
            padding_token = None
        else:
            padding_token = '</s>'

        # Character-level stages first, then tokenization. The empty
        # eos_chars list means end-of-sentence is ignored.
        char_stream = io_utils.filter_non_alphanumeric_stream(iter(doc_text))
        char_stream = io_utils.filter_non_latin_stream(char_stream)
        char_stream = io_utils.lowercased_stream(char_stream)

        token_stream = io_utils.token_stream(char_stream, eos_chars=[])
        token_stream = io_utils.replace_numeric_tokens_stream(token_stream)

        window_stream = io_utils.windowed_translated_token_stream(
            token_stream,
            prepare_worker.args.window_size,
            prepare_worker.words,
            stride=prepare_worker.args.stride,
            padding_token=padding_token,
            callback=_report_empty_document)

        # The entities associated with the document determine its label.
        entity_ids = list(prepare_worker.document_assocs[doc_id])

        label = _candidate_centric_label(entity_ids)

        # Rescale the label in place so that its values sum to one.
        # NOTE(review): a non-empty label whose values sum to zero would
        # divide by zero here -- confirm that cannot happen upstream.
        normalizer = float(sum(label.values()))

        for key in label:
            label[key] /= normalizer

        # Consuming the stream here is what materialises the windows; every
        # instance of a document shares the same label object.
        instances_and_labels = [
            (doc_id, instance, label) for instance in window_stream]

        prepare_worker.result_queue.put(
            (doc_id, instances_and_labels, label))

        processed_count += 1

    return processed_count
def parse_query(unsplitted_terms):
    """Split a raw query string into a list of tokens.

    The string is stripped, every match of ``remove_parentheses_re`` is
    replaced by its first capture group, and ``/`` and ``-`` are turned
    into spaces. The characters are then run through the ``io_utils``
    non-alphanumeric filter, non-Latin filter and lowercasing streams
    before being tokenized by ``io_utils.token_stream``.

    Args:
        unsplitted_terms: the query as a single ``str``.

    Returns:
        A list holding everything yielded by ``io_utils.token_stream``.
    """
    assert isinstance(unsplitted_terms, str)

    query_text = remove_parentheses_re.sub(r'\1', unsplitted_terms.strip())

    # Slashes and hyphens become spaces.
    query_text = query_text.replace('/', ' ').replace('-', ' ')

    char_stream = io_utils.filter_non_alphanumeric_stream(iter(query_text))
    char_stream = io_utils.filter_non_latin_stream(char_stream)
    char_stream = io_utils.lowercased_stream(char_stream)

    # No end-of-sentence characters are passed to the tokenizer.
    tokens = io_utils.token_stream(char_stream, eos_chars=[])

    return list(tokens)