class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'thresh': {'type': int, 'default': False},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': True,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'cache-path': {'default': 'cache'},
                   'idf_index': {'default': 'pubmed'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(self.opts.cache_path,
                                     'idfidx' + self.opts.idf_index +
                                     'wp_doc_freq.json')
        # Load the cached document-frequency table if it exists.
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path, 'rb', 'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and
                        t not in authors and
                        not self.all_digits(t)):
                    if t not in doc_freq:
                        count = es_int2.count(t)
                        if count > 0:
                            doc_freq[t] = log(count_docs / float(count + 1))
                            terms.append(t)
                    else:
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            # Use the explicit threshold when given, otherwise fall back
            # to the average IDF of the candidate terms.
            thresh = self.opts.thresh if self.opts.thresh else avg_idf
            q = ' '.join([t for t in terms if doc_freq[t] > thresh])
            if q == '':
                # No term cleared the threshold: keep the highest-IDF term.
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                          source_fields=['offset', 'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        # Persist the (possibly extended) document-frequency cache.
        with codecs.open(doc_freq_path, 'wb', 'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
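# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the methods in this module): the class
# above keeps only citance terms whose IDF against a background index exceeds
# a threshold, falling back to the average IDF and, if nothing survives, to
# the single highest-IDF term.  The helper below mirrors that filtering step
# in isolation; `doc_counts` and `total_docs` stand in for the Elasticsearch
# counts and are assumptions made for the example, not the project's API.
# ---------------------------------------------------------------------------
from math import log


def filter_terms_by_idf(terms, doc_counts, total_docs, thresh=None):
    """Keep the terms whose IDF exceeds `thresh` (default: average IDF)."""
    idf = dict((t, log(total_docs / float(doc_counts.get(t, 0) + 1)))
               for t in terms)
    if thresh is None:
        thresh = sum(idf.values()) / float(len(idf)) if idf else 0.0
    kept = [t for t in terms if idf[t] > thresh]
    if not kept and terms:
        # Nothing cleared the threshold: keep the most discriminative term.
        kept = [max(terms, key=lambda t: idf[t])]
    return ' '.join(kept)

# Example: 'binding' is frequent in the background index and is dropped,
# 'apoptosis' is rarer and survives the cut.
# filter_terms_by_idf(['binding', 'apoptosis'],
#                     {'binding': 90000, 'apoptosis': 1200},
#                     100000)  -> 'apoptosis'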
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False, 'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # e.g. "(Chen et al., 2000)"
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # e.g. "Chen et al., 2000" without parentheses
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()

    def run(self, test_data):
        out_results = []
        not_found = 0
        total = 0
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        processed = set()
        for ann in test_data:
            if (ann['topic_id'] + '_' + str(ann['citance_number'])) not in processed:
                doc_type = '_'.join((ann['topic_id'].lower(),
                                     ann['reference_article'][:-4].lower()))
                doc_type = doc_type.replace(',', '').replace("'", '"')
                doc = self.doc_mod.get_doc(
                    ann['topic_id'].lower(), ann['citing_article'])
                cit_text = ann['citation_text']
                cit_text_doc = doc[
                    ann['citation_offset'][0]:ann['citation_offset'][1]]
                cit_marker = ann['citation_marker']
                cit_marker_doc = doc[
                    ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
                # Marker offsets relative to the start of the citance.
                cit_mrk_offset_sent = [
                    ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                    ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
                cleaned = self.reg_apa.sub('', cit_text_doc)
                cleaned = self.reg_ieee.sub('', cleaned)
                cleaned = self.reg_paranthesis.sub('', cleaned)
                cleaned = self.reg_apa_rare.sub('', cleaned)
                cleaned = re.sub('\s+', ' ', cleaned).strip()
                cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
                chunks = set()
                # Get noun phrases; format:
                # [[[term1, term2], [term3]], [term4, term5]]
                nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                # nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                # for e in nps:
                #     noun_phrases = [(sub_e[0].replace('"', ''), idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                noun_phrases = [e for e in
                                list(itertools.chain.from_iterable(nps))
                                if e not in self.stopwords]
                # tokens = self.tokenizer.tokenize(cit_text)
                # tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                # cleaned = ''
                #
                # m = list(self.reg_apa.finditer(cit_text_doc))
                # m1 = list(self.reg_ieee.finditer(cit_text_doc))
                # m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
                # # (start, end, group)
                # if len(m) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m]
                # elif len(m1) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m1]
                # elif len(m2) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m2]
                # else:
                #     m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                #     if len(m3) > 0:
                #         markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                #     else:
                #         not_found += 1
                # nearest = ''
                # distance = 100000
                # if len(markers) > 1:
                #     # find nearest word to the citation marker
                #     for idx, f in enumerate(tokens_offsets):
                #         # check to see if in valid span (not citation markers)
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
                #                 not invalid:
                #             distance = cit_mrk_offset_sent[0] - f[1]
                #             if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                #                 nearest = tokens[idx]
                #
                #     # find longest noun phrase containing the nearest
                #     longest = 0
                #     res = None
                #     for np in nps[0]:
                #         if nearest in np and len(np) > longest:
                #             longest = len(np)
                #             res = np
                #     if res is not None:
                #         res = ' '.join([el for el in res])
                #     else:
                #         res = nearest
                # else:
                #     # if there is only one citation marker, just consider the
                #     # whole citation text as the query
                #     q_tokens = []
                #     for idx, f in enumerate(tokens_offsets):
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
                #                 not invalid:
                #             q_tokens.append(tokens[idx])
                #     res = ' '.join([f for f in q_tokens])
                q = noun_phrases
                q = ' '.join(q).encode('ascii', 'ignore')
                # outfile.write('query: "%s" \nparsed: "%s"\n\n' % (q, str(nps)))
                tokens = self.es_int.tokenize(q, "sentence")
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not self.all_digits(t))])
                if self.opts.analyzer:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type,
                        params={'analyzer': self.opts.analyzer})
                else:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type)
                for e in r:
                    fld = e.pop('fields')
                    e['offset'] = [eval(fld['offset'][0])]
                    # Expand each hit by 100 characters on either side.
                    beg = e['offset'][0][0] - \
                        100 if e['offset'][0][0] else e['offset'][0][0]
                    end = e['offset'][0][1] + 100
                    e['offset'] = [(beg, end)]
                    e['sentence'] = fld['sentence'][0]
                    e['query'] = q
                if self.opts.combine:
                    if len(r) == 0:
                        r = [{'_type': doc_type,
                              '_index': self.opts.index_name,
                              '_score': 0,
                              'sentence': '',
                              'offset': [(0, 1)],
                              'query': q,
                              '_id': -11}]
                    r = [{'_type': r[0]['_type'],
                          '_index': r[0]['_index'],
                          'query': q,
                          'topic': ann['topic_id'].lower(),
                          'citance_number': ann['citance_number'],
                          'citation_text': ann['citation_text'],
                          'citing_article': ann['citing_article'],
                          '_score': sum([e['_score'] for e in r]),
                          'offset': [e['offset'][0] for e in r],
                          'sentence': [e['sentence'] for e in r],
                          '_id': '-000001'}]
                out_results.append(r)
                # Mark this citance as handled so repeated annotations of the
                # same citance are skipped (the original never filled the set).
                processed.add(ann['topic_id'] + '_' + str(ann['citance_number']))
        return out_results
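# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption-labelled example, not the project's
# code): every method in this module first strips inline citation markers
# from the citance before building a query.  The patterns below are
# deliberately simpler than the APA/IEEE regexes compiled in __init__ and
# only show the idea.
# ---------------------------------------------------------------------------
import re

_REG_APA_DEMO = re.compile(r"\(\s?[A-Z][A-Za-z\-]+( et al\.)?,?\s\d{4}[^)]*\)")
_REG_IEEE_DEMO = re.compile(r"\[\s?[\d,\- ]+\]")


def strip_citation_markers(citance):
    """Remove inline citation markers so they do not pollute the query."""
    cleaned = _REG_APA_DEMO.sub('', citance)
    cleaned = _REG_IEEE_DEMO.sub('', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return re.sub(r'(,\s)+', ', ', cleaned).strip(', ')

# strip_citation_markers("TIMP-3 induces apoptosis in cell lines [12, 15].")
#   -> 'TIMP-3 induces apoptosis in cell lines .'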
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        # Fall back to the bundled stopword list when no path is given.
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with open(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        # with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #     json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            # tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            # Quote hyphenated tokens so they are searched as phrases.
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not self.all_digits(t))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                # Build a query of quoted n-gram phrases (including the final
                # n-gram; the original loop stopped one short).
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ' '.join(tokens[i:i + self.opts.ngram])
                    new_query += '"' + tmp + '" '
                q = new_query.strip()
            # q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        # with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
        #     json.dump(out_results, mf, indent=2)
        # sys.exit()
        return out_results
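# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the original helper): with the
# `ngram` option the class above turns the filtered token stream into a query
# of quoted n-gram phrases.  A minimal, standalone version of that
# transformation:
# ---------------------------------------------------------------------------
def ngram_phrase_query(tokens, n):
    """Build a query of quoted n-gram phrases from a token list."""
    phrases = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return ' '.join('"%s"' % p for p in phrases)

# ngram_phrase_query(['protein', 'kinase', 'c'], 2)
#   -> '"protein kinase" "kinase c"'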
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False, 'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

    def run(self, test_data):
        out_results = []
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = re.sub(r'( ,)+', ',', q)
            q = q.encode('ascii', 'ignore')
            nlp_extractor = Extract_NLP_Tags()
            nps = nlp_extractor.extract_NP(q, mode='flattened')
            # outfile.write('query: "%s" \nparsed: "%s"\n\n' % (q, str(nps)))
            q1 = ''
            queryterms = set()
            # Boost noun phrases of up to three tokens by phrase-weight.
            for e in nps:
                for e1 in e:
                    if len(e1) < 4:
                        all_stop = False
                        if self.opts.remove_stopwords_phrase:
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1
                                if sub_e.replace('"', '') not in self.stopwords)
                        else:
                            count = 0
                            for sub_e in e1:
                                if sub_e.replace('"', '') in self.stopwords:
                                    count += 1
                            if count == len(e1):
                                all_stop = True
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1)
                        if tmp not in queryterms and not all_stop:
                            q1 += '"' + tmp + '"^' + \
                                str(self.opts.phrase_weight) + ' '
                            queryterms.add(tmp)
            if self.opts.expand_window:
                # Add (separately weighted) noun phrases from the paragraph
                # surrounding the citance.
                window = self.doc_mod.get_para(
                    ann['topic_id'].lower(),
                    ann['citing_article'][:-4].lower(),
                    (ann['citation_offset'][0], ann['citation_offset'][1]))
                sorrounding_text = deepcopy(window['sentence'])
                st = self.regex_citation('', sorrounding_text)
                st = re.sub(r'( ,)+', ',', st)
                st = st.encode('ascii', 'ignore')
                other_nouns = nlp_extractor.extract_NP(st, mode='flattened')
                for e in other_nouns:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords_phrase:
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1
                                    if sub_e.replace('"', '') not in self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"', '') in self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1)
                            if tmp not in queryterms and not all_stop:
                                q1 += '"' + tmp + '"^' + \
                                    str(self.opts.surrounding_words_weight) + ' '
                                queryterms.add(tmp)
            if self.opts.query_terms:
                # Append the individual citance terms with qterm-weight
                # (the original referenced a misspelled "qtrem_weight" option).
                q = ' '.join([t + '^' + str(self.opts.qterm_weight)
                              for t in self.es_int.tokenize(q)
                              if (t not in self.stopwords and
                                  t not in authors and
                                  not self.all_digits(t))])
                q1 = q1 + ' ' + q
            if self.opts.verbose:
                print "query: %s" % q
                print "q1 : %s" % q1
                print '_____'
            # q2 = self.es_int.tokenize(q1, 'sentence')
            # q2 = ' '.join([t for t in self.es_int.tokenize(q1)
            #                if (t not in self.stopwords and
            #                    t not in authors and
            #                    not(self.all_digits(t)))])
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop)
            if self.opts.sentence:
                # Pull in neighbouring sentences of each hit by querying
                # for adjacent _id values.
                for idx, e in enumerate(deepcopy(r)):
                    if '_id' in e:
                        query = ' OR '.join(
                            ['_id:%s' % (str(int(e['_id']) + j).zfill(5))
                             for j in range(-1 * self.opts.sentence,
                                            self.opts.sentence + 1)
                             if j != 0 and int(e['_id']) + j > 0])
                        sour = self.es_int.simple_search(
                            query, doc_type=e['_type'],
                            maxsize=2 * self.opts.sentence,
                            source_fields=['offset', 'sentence'])
                        # aft = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        # bef = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        if len(sour) > 0:
                            for s in sour:
                                r.insert(idx + 1, s)
            for e in r:
                fld = e.pop('fields')
                # Expand each hit by expand-results characters on both sides,
                # clamping the start offset at zero.
                if eval(fld['offset'][0])[0] < self.opts.expand_results:
                    beg = 0
                else:
                    beg = eval(fld['offset'][0])[0] - self.opts.expand_results
                endd = eval(fld['offset'][0])[1] + self.opts.expand_results
                e['offset'] = [(beg, endd)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q1
            # Keep at most maxsize results after sentence expansion.
            r1 = deepcopy(r)
            r = []
            for idx, e in enumerate(r1):
                if idx < self.opts.maxsize:
                    r.append(e)
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q1,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q1,
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
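# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the project's API): the class above
# builds a Lucene query_string query in which noun phrases are boosted with
# `phrase-weight`, surrounding-paragraph phrases with
# `surrounding-words-weight`, and plain citance terms with `qterm-weight`.
# The helper below shows the boosting syntax on its own.
# ---------------------------------------------------------------------------
def build_weighted_query(noun_phrases, terms,
                         phrase_weight=2.0, term_weight=1.0):
    """Quote and boost phrases, then append boosted single terms."""
    parts = ['"%s"^%s' % (p, phrase_weight) for p in noun_phrases]
    parts += ['%s^%s' % (t, term_weight) for t in terms]
    return ' '.join(parts)

# build_weighted_query(['tumor suppressor'], ['apoptosis', 'timp'])
#   -> '"tumor suppressor"^2.0 apoptosis^1.0 timp^1.0'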
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host="127.0.0.1",
                                  port=3309,
                                  user="******",
                                  passwd="lollipop11",
                                  db="umls")
        self.cur = self.db.cursor()
        self.ttys = ['SY']
        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ),
                     "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'),
                     "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # e.g. "(Chen et al., 2000)"
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # e.g. "Chen et al., 2000" without parentheses
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        # if len(args) > 3:
        #     self.ttys = []
        #     for tty in args[3:]:
        #         if tty in ttygroups:
        #             self.ttys.extend(ttygroups[tty])
        #         else:
        #             self.ttys.append(tty)

    def expand_concept(self, cdata):
        """Return SNOMED CT synonyms for a MetaMap concept unless its
        semantic type is too generic to be useful."""
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                okay = False
        if okay:
            return self.concept_synonyms(cdata['ConceptId'])

    def concept_synonyms(self, cui):
        """Look up English SNOMED CT synonyms of a CUI in MRCONSO,
        caching the result."""
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
            # query = 'select * from (select distinct STR from MRCONSO a,' +\
            #     '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
            #     ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            # print query
            self.cur.execute(query)
            # self.cur.execute("select STR from MRCONSO where " +
            #                  "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
            #                  termtypes + " and SAB != 'CHV'")
            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def umls_expand(self, cui):
        """Like concept_synonyms, but matches MRCONSO.STR by substring
        instead of by CUI."""
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
            query = "select STR from MRCONSO where " +\
                "STR LIKE '%%%s%%' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            # print query
            self.cur.execute(query)
            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')
            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[
                ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
            # Marker offsets relative to the start of the citance.
            cit_mrk_offset_sent = [
                ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            ''' -------------- IMMEDIATE NP BEFORE MARKER ---------- '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []
            # The `> 10000` guard effectively disables the nearest-noun-phrase
            # branch; the UMLS concept-expansion branch below is the one used.
            if len(markers) > 10000:
                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while ('NP' in toks[2]) and (i < len(t)):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [' '.join([s1[0] for s1 in t1])
                                    for t1 in concepts]
                    # nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                    # nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                    # nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
                    # for e in nps:
                    #     noun_phrases = [(sub_e[0].replace('"', ''), idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(
                                        re.findall(r"^[^A-Za-z]+$",
                                                   tokens[idx - 1])) == 0:
                                    nearest = tokens[idx - 1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                    if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                            tokens[nearest_idx] not in stops100:
                        nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                    q = sanitize(q)
                    # find longest noun phrase containing the nearest
                    # res = None
                    # for np in nps[0]:
                    #     if nearest in np and len(np) > longest and len(np) < 5:
                    #         longest = len(np)
                    #         res = np
                    # if res is not None:
                    #     res = ' '.join([el for el in res])
                    # else:
                    #     res = nearest
            else:
                # Expand the cleaned citance with UMLS synonyms of the
                # MetaMap concepts found in it.
                try:
                    qtxt = unicodedata.normalize('NFKD', cleaned).encode(
                        'ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                qconcepts = mmrun(cleaned)
                for cdata in qconcepts['concepts']:
                    newterms = self.expand_concept(cdata)
                    if newterms is not None:
                        qterms.extend(newterms)
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                # tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not self.all_digits(t))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                # Build a query of quoted n-gram phrases (including the final
                # n-gram; the original loop stopped one short).
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ' '.join(tokens[i:i + self.opts.ngram])
                    new_query += '"' + tmp + '" '
                q = new_query.strip()
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
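# ---------------------------------------------------------------------------
# Illustrative sketch (a hypothetical helper mirroring the heuristic in the
# run() above): pick the word token immediately to the left of a citation
# marker, skipping tokens that fall inside the marker itself or contain no
# letters.  The NLTK RegexpTokenizer pattern matches the one used above.
# ---------------------------------------------------------------------------
import re

from nltk.tokenize import RegexpTokenizer


def nearest_word_before_marker(sentence, marker_span):
    """Return the word token closest to the left of `marker_span`."""
    tokenizer = RegexpTokenizer(r"[^\w\-\']+", gaps=True)
    tokens = tokenizer.tokenize(sentence)
    spans = tokenizer.span_tokenize(sentence)
    best, best_dist = '', None
    for tok, (start, end) in zip(tokens, spans):
        dist = marker_span[0] - end
        if dist >= 0 and (best_dist is None or dist < best_dist) \
                and re.search(r'[A-Za-z]', tok):
            best, best_dist = tok, dist
    return best

# sent = "TIMP-3 induces apoptosis in melanoma cells [7]."
# nearest_word_before_marker(sent, (sent.index('['), sent.index(']') + 1))
#   -> 'cells'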