class Prep(object):
    """Builds learning-to-rank training data from annotated citations.

    Each citation's text is used as a query, and every sentence stored in
    the index for the cited reference article is labelled relevant (1) or
    not (0) depending on whether it overlaps an annotated reference offset.
    """

    def __init__(self, index='biosum'):
        """Create the index interface used to fetch reference sentences.

        Args:
            index(str): Name of the index passed to ``ESInterface``.
        """
        self.es_int = ESInterface(index_name=index)

    def prep(self, docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        """Build the training set for every topic returned by ``get_data``.

        Args:
            docs_path(str): Documents directory handed to ``get_data``.
            json_data_path(str): Annotation JSON file handed to ``get_data``.

        Returns:
            dict: topic id -> list with one entry per citation, each entry
                being the list of (query, sentence, label) tuples produced
                by ``_prep_data``.
        """
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            train_set[tid] = []
            # Keys of data[tid] are citation numbers.
            for cit in data[tid]:
                offsets = []
                # Reset per citation so a citation without annotations can
                # never silently reuse the previous citation's text.
                query = None
                ref_art = ''
                for ann in data[tid][cit].values():
                    offsets.extend(ann['ref_offset'])
                    # The last annotator's values win, as before.
                    query = ann['cit_text']
                    ref_art = ann['ref_art']
                if query is None:
                    # No annotations: there is nothing to build a query from.
                    continue
                # Union of all annotators' reference offsets.
                offsets = union(offsets)
                # Drops the last four characters of the article name
                # (presumably a file extension such as '.txt' -- confirm).
                doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
                d = self._prep_data(clean(query), doc_type, offsets)
                train_set[tid].append(d)
        return train_set

    def _prep_data(self, query, doc_type, relevant_offsets, save_path=False):
        '''
        Prepares the training data for learning to rank.

        Fetches every hit of ``doc_type`` from the index and labels each
        sentence by whether its offset overlaps a relevant offset.

        Args:
            query(str): The query that is used to retrieve relevant offsets
            doc_type(str): Name of the type on the elasticsearch index
                e.g. 'd1409_train_sherr'
            relevant_offsets(list): list of (start, end) offsets that are
                relevant
            save_path: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            list of tuples: training data
                ('query', 'some text', 1 if relevant 0 otherwise)
        '''
        hits = self.es_int.find_all(doc_type=doc_type)
        x_train = []
        y_train = []
        queries = []
        for hit in hits:
            source = hit['_source']
            # NOTE(review): eval() executes whatever string is stored in the
            # index. If the offset is always a plain literal such as
            # "(12, 80)", ast.literal_eval would be a safer parser -- confirm
            # the stored format before switching.
            offset = eval(source['offset'])
            relevant = any(self.get_overlap(offset, off) > 0
                           for off in relevant_offsets)
            x_train.append(source['sentence'])
            y_train.append(1 if relevant else 0)
            queries.append(query)
        # Materialise the result: on Python 3 a bare zip() is a one-shot
        # iterator, which does not match the documented list of tuples.
        return list(zip(queries, x_train, y_train))

    def get_overlap(self, a, b):
        """Return the length of the overlap between two (start, end) spans.

        Returns 0 when the spans are disjoint or merely touch.
        """
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))
class Prep(object):
    """Prepares (query, sentence, label) training triples for ranking.

    The query is a citation's text; the candidate sentences are the hits
    stored in the index for the cited reference article; the label says
    whether a sentence overlaps one of the annotated reference offsets.
    """

    def __init__(self, index='biosum'):
        """Set up the ``ESInterface`` bound to the index named ``index``."""
        self.es_int = ESInterface(index_name=index)

    def prep(self, docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        """Assemble training data for all topics loaded by ``get_data``.

        Args:
            docs_path(str): Forwarded to ``get_data`` as the documents path.
            json_data_path(str): Forwarded to ``get_data`` as the JSON path.

        Returns:
            dict: maps each topic id to a list holding, per citation, the
                list of tuples returned by ``_prep_data``.
        """
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            per_topic = []
            train_set[tid] = per_topic
            citations = data[tid]
            # Iterating the topic yields citation numbers.
            for cit in citations:
                annotations = list(citations[cit].values())
                if not annotations:
                    # Without any annotation there is no citation text; the
                    # old code would reuse the previous citation's query
                    # here (or fail on the very first one), so skip instead.
                    continue
                offsets = []
                for ann in annotations:
                    for off in ann['ref_offset']:
                        offsets.append(off)
                # Query and article name come from the last annotation,
                # matching the original assignment order.
                last = annotations[-1]
                query = last['cit_text']
                ref_art = last['ref_art']
                # Union of all annotators' reference offsets.
                offsets = union(offsets)
                # The final four characters of the article name are cut off
                # (looks like an extension is being removed -- TODO confirm).
                doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
                per_topic.append(
                    self._prep_data(clean(query), doc_type, offsets))
        return train_set

    def _is_relevant(self, offset, relevant_offsets):
        """Return True if ``offset`` overlaps any span in the given list."""
        for candidate in relevant_offsets:
            if self.get_overlap(offset, candidate) > 0:
                return True
        return False

    def _prep_data(self, query, doc_type, relevant_offsets, save_path=False):
        '''
        Prepares the training data for learning to rank.

        Retrieves all hits for ``doc_type`` through the index interface and
        pairs each hit's sentence with the query and a relevance label.

        Args:
            query(str): The query that is used to retrieve relevant offsets
            doc_type(str): Name of the type on the elasticsearch index
                e.g. 'd1409_train_sherr'
            relevant_offsets(list): list of offsets that are relevant
            save_path: Not used by the current implementation; retained
                for backward compatibility of the signature.

        Returns:
            list of tuples: a list of training data
                ('query', 'some text', 1 if relevant 0 otherwise)
        '''
        rows = []
        for hit in self.es_int.find_all(doc_type=doc_type):
            fields = hit['_source']
            # NOTE(review): eval() runs the stored string as Python code.
            # Flagged rather than replaced because the stored offset format
            # is not visible here; consider ast.literal_eval once confirmed.
            span = eval(fields['offset'])
            label = 1 if self._is_relevant(span, relevant_offsets) else 0
            rows.append((query, fields['sentence'], label))
        # A concrete list (not a lazy zip object) so the result can be
        # indexed, measured and iterated more than once on Python 3 too.
        return rows

    def get_overlap(self, a, b):
        """Length of the intersection of spans ``a`` and ``b``.

        Both arguments are indexable as (start, end). Disjoint or merely
        adjacent spans give 0.
        """
        latest_start = max(a[0], b[0])
        earliest_end = min(a[1], b[1])
        return max(0, earliest_end - latest_start)