def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    # Bound .sub / .search methods: strip citation markers, detect numeric tokens.
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with open(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
    self.ann_client = AnnotationsClient()
    # APA-style citations, e.g. "(Chen et al., 2000)" or "Chen et al. (2000)"
    self.reg_apa = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
        r"\w+\set al\. \(\d{2,4}\)")
    # Less common variants, e.g. "Chen et al. 2000" or "Chen and Smith, 2000"
    self.reg_apa_rare = re.compile(
        r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
    self.reg_apa2 = re.compile(
        r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
    # IEEE-style numeric citations, e.g. "[3]" or "[3, 5]"
    self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
    # Parenthesized numeric citations, e.g. "(1, 2 and 3)"
    self.reg_paranthesis = re.compile(
        r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
    self.nlp_extractor = Extract_NLP_Tags()
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
    self.lmtzr = WordNetLemmatizer()
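# A minimal standalone sketch of what the citation-stripping pattern above
# does. Only the regex is taken from the constructor; the sample sentence
# and expected output are illustrative.
import re

strip_citations = re.compile(
    r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
    r"(\[(\d+([,–]\s?)?)+\])|"
    r"\[[\d,-]+\]").sub

sample = "Prior work (Smith et al., 2010) reported similar findings [3, 5]."
# Substituting the empty string removes both the APA-style and the numeric
# markers, leaving roughly: "Prior work  reported similar findings ."
print(strip_citations("", sample))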
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.analyzer = self.es_int.get_index_analyzer()
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with open(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    # MySQL connection used for terminology lookups.
    self.db = MySQLdb.connect(host=constants.mysql_server,
                              port=constants.mysql_port,
                              user=constants.mysql_user,
                              passwd=constants.mysql_pass,
                              db=constants.mysql_db)
    self.cur = self.db.cursor()
    self.ttys = ['SY']
    # Term type (TTY) codes grouped by category.
    ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                 "chemicals": ('CCN', 'CSN'),
                 "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                 "diseases": ('DI', ),
                 "findings": ('FI', ),
                 "hierarchy": ('HS', 'HT', 'HX'),
                 "related": ('RT', ),
                 "preferred": ('PTN', 'PT')}
    self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
    # self.ann_client = AnnotationsClient()
    # APA-style citations, e.g. "(Chen et al., 2000)" or "Chen et al. (2000)"
    self.reg_apa = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
        r"\w+\set al\. \(\d{2,4}\)")
    # Less common variants, e.g. "Chen et al. 2000" or "Chen and Smith, 2000"
    self.reg_apa_rare = re.compile(
        r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
    self.reg_apa2 = re.compile(
        r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
    self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
    self.reg_paranthesis = re.compile(
        r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
    self.nlp_extractor = Extract_NLP_Tags()
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
    self.lmtzr = WordNetLemmatizer()
    self.stemmer = stem.porter.PorterStemmer()
def __init__(self, index_name=constants.bm25_index):
    '''
    Default constructor

    Args:
        index_name(str): The elasticsearch index name that will be
            used to retrieve documents and idfs
    '''
    self.es_int = ESInterface(index_name=index_name)
    print self.es_int.get_avg_size('sentence')
    self.avg_doc_length = -1
def dump_stats(dump_stats_data, dump_path, index_name):
    # Write one CSV block per annotation: identifiers, citation text, gold
    # offsets, AP/nDCG scores, then each retrieved sentence with its rank,
    # offset, and relevance flag.
    es_int = ESInterface(index_name=index_name)
    csv_line = []
    for ann, res in dump_stats_data:
        csv_line.extend([[ann['topic_id'],
                          ann['citing_article'][:-4].lower(),
                          ann['reference_article'][:-4].lower(),
                          ann['discourse_facet']],
                         [''],
                         [ann['citation_text'].encode('ascii', 'ignore')],
                         ['']])
        offsets = chain(*[[s[0], s[1], ''] for s in
                          sorted(ann['reference_offset'].keys(),
                                 key=lambda t: t[0])])
        csv_line.extend([list(offsets), ['']])
        csv_line.append(['prec:'])
        csv_line.extend([list(t) for t in calculate_ap([res], [ann]).items()])
        csv_line.append(['ndcg:'])
        csv_line.extend([list(t) for t in calculate_ndcg([res], [ann]).items()])
        csv_line.append([''])
        for i, r in enumerate(res, start=1):
            rel = str(calculate_ndcg([[r]], [ann])['all'] > 0).upper()
            # temp until Arman fixes bug
            txt = es_int.get_page_by_res(r)['sentence'].encode('ascii', 'ignore')
            offset = str(es_int.get_page_by_res(r)['offset']).strip('()').split(', ')
            csv_line.extend([[txt],
                             ['rank', i, '', 'offset', offset[0], offset[1],
                              '', 'rel?', rel]])
            # commented until bugs fixed
            # txt = []
            # for offset in r['offset']:
            #     txt.append(ann_cl.get_doc('_'.join(r['_type'].split('_')[:2]),
            #                               r['_type'].split('_')[2], offset))
            # txt = ' ... '.join(txt)
            # csv_line.extend([[txt], ['rank', i, '', 'offset',
            #                          r['offset'][0][0], r['offset'][0][1],
            #                          '', 'rel?', rel]])
            # csv_line.append([''])
        csv_line.extend([[''], ['']])
    with open(dump_path, 'wb') as csv_file:
        wr = csv.writer(csv_file)
        wr.writerows(csv_line)
def __init__(self, args, opts): super(Method, self).__init__(args, opts) self.es_int = ESInterface(host=self.opts.server, port=self.opts.port, index_name=self.opts.index_name) self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+" r"(\s?([;,]|and)\s)?)+\))|" r"(\[(\d+([,–]\s?)?)+\])|" r"\[[\d,-]+\]").sub self.all_digits = re.compile(r"^\d+$").search if self.opts.remove_stopwords: with file(self.opts.stopwords_path) as f: self.stopwords = frozenset([l.strip().lower() for l in f]) else: self.stopwords = frozenset([])
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with open(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
    self.ann_client = AnnotationsClient()
def __init__(self, filter_list=None, base_cache=None, cachedir='cache',
             eshost=None, esport=None, esindex=None,
             sim_func=CosineSimilarity(), stopwords=None, weighted=False,
             query_terms_only=False):
    if not eshost:
        eshost = 'localhost'
    if not esport:
        esport = 9200
    if not esindex:
        esindex = 'pubmed'
    self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
    self.cachedir = cachedir
    self.sim_func = sim_func
    self.timer = Timer(prefix='[timer]')
    self.weighted = weighted
    self.query_terms_only = query_terms_only
    self.base_cache = base_cache
    if not stopwords:
        stopwords = set()
    self._stopwords = stopwords
    if filter_list:
        filter_list = set([e for e in filter_list if e not in stopwords])
    self._filter_list = filter_list
    # Calculate a fingerprint to use as the cache comment.
    finger_text = ' '.join([w for w in set.union((self._filter_list or set()),
                                                 self._stopwords)])
    finger_md5 = md5()
    finger_md5.update(finger_text.encode('utf-8'))
    self.finger_filter = finger_md5.hexdigest()
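# A minimal sketch of the cache-fingerprint idea used above: hash the union
# of the filter list and the stopwords so that any change to either set
# yields a different cache identifier. The function name and sample words
# are illustrative, not part of the original module.
from hashlib import md5

def cache_fingerprint(filter_list, stopwords):
    words = set(filter_list or []) | set(stopwords or [])
    # Plain set iteration order is arbitrary; sorting makes the fingerprint
    # reproducible across runs.
    text = ' '.join(sorted(words))
    return md5(text.encode('utf-8')).hexdigest()

print(cache_fingerprint(['protein', 'binding'], ['the', 'of']))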
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    # Fall back to the bundled stopword list when no path is supplied.
    if self.opts.stopwords_path:
        stop_path = self.opts.stopwords_path
    else:
        stop_path = STOPWORDS_PATH
    if self.opts.remove_stopwords:
        with open(stop_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
def __init__(self, documents, eshost='localhost', esport=9200,
             esindex='pubmed21', cachedir='cache'):
    self.cachedir = cachedir
    self.questions = documents
    self.categories = None
    self.added = dict([(qid, []) for qid in self.questions.keys()])
    self.removed = dict([(qid, []) for qid in self.questions.keys()])
    self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
    self.tokenquestions = self.tokenize_questions(self.questions.items())
    self.tokquestions = dict([(k, " ".join(v))
                              for k, v in self.tokenquestions.iteritems()])
    self.run()
def __init__(self, index='biosum'):
    self.es_int = ESInterface(index_name=index)
def __init__(self, cache_index='cache'):
    self.es_int = ESInterface(index_name=cache_index)