Example #1
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
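        # numeric bracket citations such as [3], [3, 4] or [5–7]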
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
Example #2
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
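        # UMLS term-type (TTY) codes grouped by kind; self.ttys controls which
        # synonym rows are later pulled from MRCONSO.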
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
Example #3
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
Example #4
class Method(MethodInterface):
    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {
        'maxsize': {
            'type': int,
            'default': 100
        },
        'stopwords-path': {
            'default': STOPWORDS_PATH
        },
        'remove-stopwords': {
            'default': False,
            'action': 'store_true'
        },
        'combine': {
            'default': False,
            'action': 'store_true'
        },
        'analyzer': {
            'default': False,
            'type': str
        },
        'ngram': {
            'default': False,
            'type': int
        },
        'concept_boost': {
            'default': 3,
            'type': int
        },
        'np_boost': {
            'default': 3,
            'type': int
        },
        'sent_boost': {
            'default': 1,
            'type': int
        },
        'stem_boost': {
            'default': 1,
            'type': int
        },
        'runmode': {
            'default': 'train'
        }
    }
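    # Hyphenated option keys are read back as underscore attributes
    # (e.g. 'stopwords-path' -> self.opts.stopwords_path), presumably via the
    # MethodInterface argument parser.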

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()

#         if len(args) > 3:
#             self.ttys = []
#
#             for tty in args[3:]:
#                 if tty in ttygroups:
#                     self.ttys.extend(ttygroups[tty])
#                 else:
#                     self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
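        # overly generic UMLS semantic types (Functional, Qualitative and
        # Quantitative Concept, Intellectual Product) are never expanded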
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
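        # Return cached synonyms if available; otherwise query UMLS MRCONSO for
        # English, preferred SNOMED CT strings with the selected term types.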
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x
                                          for x in self.ttys]) + ")")
            #         query = 'select * from (select distinct STR from MRCONSO a,'+\
            #                 '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
            #                 ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd  ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            #             print query
            self.cur.execute(query)

            #         self.cur.execute("select STR from MRCONSO where " +
            #                          "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
            #                          termtypes + " and SAB != 'CHV'")

            syns = set(
                filter(lambda y: y.replace(" ", "").isalpha(),
                       [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')

            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[ann['citation_marker_offset'][0]:
                                 ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [
                ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                ann['citation_marker_offset'][1] - ann['citation_offset'][0]
            ]
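            # citation-marker offsets re-based to the start of the citation sentence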
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
            '''
            -------------- IMMEDIATE NP BEFORE MARKER ----------
            '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []

            if len(markers) > 10000:
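                # NOTE: with a threshold of 10000 this branch is effectively
                # disabled, so the else branch below builds the query.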

                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while (('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [
                        ' '.join([s1[0] for s1 in t1]) for t1 in concepts
                    ]

                    #                 nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                    #                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                    #             nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
                    #                 for e in nps:
                    #                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$",
                                              tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(
                                        re.findall(r"^[^A-Za-z]+$",
                                                   tokens[idx - 1])) == 0:
                                    nearest = tokens[idx -
                                                     1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                        if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                                tokens[nearest_idx] not in stops100:
                            nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                q = sanitize(q)
# find longest noun phrase containing the nearest
#                 res = None
#                 for np in nps[0]:
#                    if nearest in np and len(np) > longest and len(np) < 5:
#                        longest = len(np)
#                        res = np
#                 if res is not None:
#                     res = ' '.join([el for el in res])
#                 else:
#                     res = nearest
            else:
                try:
                    qtxt = unicodedata.normalize('NFKD', cleaned).encode(
                        'ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                #             tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([
                    t for t in tokens
                    if (t not in self.stopwords and not (self.all_digits(t)))
                ])
                if self.opts.concept_boost > 0:
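                    # mmrun is assumed to be a MetaMap-style concept tagger that
                    # returns {'concepts': [...]} for the cleaned citation text.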

                    qconcepts = mmrun(cleaned)
                    qcids = []
                    for cdata in qconcepts['concepts']:
                        newterms = self.expand_concept(cdata)
                        if newterms is not None:
                            qcids.append(newterms)
                else:
                    qcids = []
                if self.opts.np_boost > 0:
                    nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                    noun_phs = set()
                    for e in nps:
                        for e1 in e:
                            if len(e1) < 4:
                                all_stop = False
                                if self.opts.remove_stopwords:
                                    tmp = ' '.join(
                                        sub_e.replace('"', '')
                                        for sub_e in e1 if sub_e.replace(
                                            '"', '') not in self.stopwords)
                                else:
                                    count = 0
                                    for sub_e in e1:
                                        if sub_e.replace('"',
                                                         '') in self.stopwords:
                                            count += 1
                                    if count == len(e1):
                                        all_stop = True
                                    tmp = ' '.join(
                                        sub_e.replace('"', '') for sub_e in e1)
                                if '"' + tmp.replace(
                                        '"', ''
                                ) + '"' not in noun_phs and not all_stop:
                                    noun_phs.add('"' + tmp.replace('"', '') +
                                                 '"')
                else:
                    noun_phs = []

            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q,
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                #                 r = self.es_int.multi_field_search(sentence=q,
                #                                                    concepts=' '.join(
                #                                                        [w for w in qcids]),
                #                                                    noun_phrases=' '.join(
                #                                                        [e for e in noun_phs]),
                #                                                    maxsize=self.opts.maxsize,
                #                                                    source_fields=[
                #                                                        'offset', 'sentence', 'mm-concepts', 'noun_phrases'],
                #                                                    doc_type=doc_type,
                #                                                    field_boost=[self.opts.sent_boost,
                #                                                                 self.opts.concept_boost,
                # self.opts.np_boost])
                fields = [
                    'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed'
                ]
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    Okay = True
                    if self.opts.remove_stopwords:
                        if w in self.stopwords:
                            Okay = False
                    if '-' in w:
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if Okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [
                    q, ' '.join([w for w in qcids]),
                    (' '.join([e for e in noun_phs])).replace('"', ''),
                    ' '.join([w for w in tokens1])
                ]
                field_boosts = [
                    self.opts.sent_boost, self.opts.concept_boost,
                    self.opts.np_boost, self.opts.stem_boost
                ]
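                # one query string per index field, each weighted by the
                # corresponding *_boost option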
                r = self.es_int.multi_field_search(
                    field_vals=field_vals,
                    fields=fields,
                    source_fields=['offset', 'sentence'],
                    maxsize=self.opts.maxsize,
                    field_boost=field_boosts,
                    doc_type=doc_type)
#             r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence'])
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
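                # offsets are apparently stored as "(start, end)" strings; eval
                # turns them back into tuples (ast.literal_eval would be safer)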
                #                 beg = e['offset'][0][0] - \
                #                     100 if e['offset'][0][0] else e['offset'][0][0]
                #                 end = e['offset'][0][1] + 100
                #                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'score': 0,
                        'sentence': [''],
                        'offset': [(0, 1)],
                        'query': q,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q,
                    'topic': ann['topic_id'].lower(),
                    'citance_number': ann['citance_number'],
                    'citation_text': ann['citation_text'],
                    'citing_article': ann['citing_article'],
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]

            out_results.append(r)
        return out_results
Example #5
reserved_chars = [
    '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~',
    '*', '?', ':', '\\', '/'
]

CMD = 'curl -XPOST \'http://localhost:9200/index_name/type/idx\' -d '

ref_topics = [
    "d1408_train_lewis", "d1419_train_yeoh", "d1415_train_blasco",
    "d1401_train_voorhoeve", "d1417_train_bos", "d1409_train_sherr",
    "d1414_train_vandelft", "d1412_train_cho", "d1403_train_serrano",
    "d1404_train_agamibernards", "d1413_train_figueroa",
    "d1402_train_westbrook", "d1411_train_fazi", "d1416_train_zhaowang",
    "d1407_train_toji", "d1406_train_hanahan", "d1405_train_campbell",
    "d1418_train_ying", "d1420_train_kumar", "d1410_train_wangtang", "doc"
]

nlp_extractor = Extract_NLP_Tags()


def assign_str(text, idx, new_characters):
    '''
    Assigns the new_character to position idx of the str text
    '''
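    # e.g. assign_str('abcdef', 2, 'XY') -> 'abXYef'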
    return text[:idx] + new_characters + text[idx + len(new_characters):]


def filter(data):
    '''
    Filters the irrelevant part of the data

    Args: data(str)
Example #6
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False,
                                   'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()

    def run(self, test_data):
        out_results = []
        not_found = 0
        total = 0
#         outfile = codecs.open('tmp/nlp.txt' , 'wb' , 'UTF-8')
        processed = set()
        for ann in test_data:
            if (ann['topic_id'] + '_' + str(ann['citance_number'])) not in processed:
                doc_type = '_'.join((ann['topic_id'].lower(),
                                     ann['reference_article'][:-4].lower()))
                doc_type = doc_type.replace(',', '').replace("'", '"')
                doc = self.doc_mod.get_doc(
                    ann['topic_id'].lower(), ann['citing_article'])
                cit_text = ann['citation_text']
                cit_text_doc = doc[
                    ann['citation_offset'][0]:ann['citation_offset'][1]]
                cit_marker = ann['citation_marker']
                cit_marker_doc = doc[
                    ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
                cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                                       ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
                cleaned = self.reg_apa.sub('', cit_text_doc)
                cleaned = self.reg_ieee.sub('', cleaned)
                cleaned = self.reg_paranthesis.sub('', cleaned)
                cleaned = self.reg_apa_rare.sub('', cleaned)
                cleaned = re.sub('\s+', ' ', cleaned).strip()
                cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
                chunks = set()
                # get noun phrases, format [[[term1, term2],[term3]][term4,
                # term5]]
                nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
#                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
#                 for e in nps:
#                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                noun_phrases = [e for e in list(itertools.chain.from_iterable(nps))
                                if e not in self.stopwords]
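                # flatten the nested NP chunks and drop stopwords; the surviving
                # phrases are joined into the query string below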
#                 tokens = self.tokenizer.tokenize(cit_text)
#                 tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
#                 cleaned = ''
#
#                 m = list(self.reg_apa.finditer(cit_text_doc))
#                 m1 = list(self.reg_ieee.finditer(cit_text_doc))
#                 m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
#                 # (start, end, group)
#                 if len(m) > 0:
#                     markers = [(e.start(), e.end(), e.group(0)) for e in m]
#                 elif len(m1) > 0:
#                     markers = [(e.start(), e.end(), e.group(0))
#                                for e in m1]
#                 elif len(m2) > 0:
#                     markers = [(e.start(), e.end(), e.group(0))
#                                for e in m2]
#                 else:
#                     m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
#                     if len(m3) > 0:
#                         markers = [(e.start(), e.end(), e.group(0))
#                                    for e in m3]
#                     else:
#                         not_found += 1
#                 nearest = ''
#                 distance = 100000
#                 if len(markers) > 1:
#                     # find nearest word to the citation marker
#                     for idx, f in enumerate(tokens_offsets):
#                         # check to see if in valid span (not citation markers)
#                         invalid = False
#                         for e in markers:
#                             if f[0] >= e[0] and f[1] <= e[1]:
#                                 invalid = True
#                         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
#                                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
#                                 not invalid:
#                             distance = cit_mrk_offset_sent[0] - f[1]
#                             if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
#                                 nearest = tokens[idx]
#
#                         # find longest noun phrase containing the nearest
#                         longest = 0
#                         res = None
#                         for np in nps[0]:
#                             if nearest in np and len(np) > longest:
#                                 longest = len(np)
#                                 res = np
#                         if res is not None:
#                             res = ' '.join([el for el in res])
#                         else:
#                             res = nearest
#                 else:
#                     # if there is only one citation marker, just consider the
#                     # whole citation text as the query
#                     q_tokens = []
#                     for idx, f in enumerate(tokens_offsets):
#                         invalid = False
#                         for e in markers:
#                             if f[0] >= e[0] and f[1] <= e[1]:
#                                 invalid = True
#                         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
#                                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
#                                 not invalid:
#                             q_tokens.append(tokens[idx])
#                     res = ' '.join([f for f in q_tokens])
                q = noun_phrases
                q = ' '.join(q).encode('ascii', 'ignore')
    #             outfile.write('query: "%s" \nparsed: "%s"\n\n' %(q,str(nps)) )
                tokens = self.es_int.tokenize(q, "sentence")
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not(self.all_digits(t)))])
                if self.opts.analyzer:
                    r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                                  source_fields=[
                                                      'offset', 'sentence'],
                                                  # field='sentence',
                                                  doc_type=doc_type,
                                                  params={'analyzer': self.opts.analyzer})
                else:
                    r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                                  source_fields=[
                                                      'offset', 'sentence'],
                                                  # field='sentence',
                                                  doc_type=doc_type)
                for e in r:
                    fld = e.pop('fields')
                    e['offset'] = [eval(fld['offset'][0])]
                    beg = e['offset'][0][0] - \
                        100 if e['offset'][0][0] else e['offset'][0][0]
                    end = e['offset'][0][1] + 100
                    e['offset'] = [(beg, end)]
                    e['sentence'] = fld['sentence'][0]
                    e['query'] = q
                if self.opts.combine:
                    if len(r) == 0:
                        r = [{'_type': doc_type,
                              '_index': self.opts.index_name,
                              '_score': 0,
                              'sentence': '',
                              'offset': [(0, 1)],
                              'query':q, '_id':-11}]
                    r = [{'_type': r[0]['_type'],
                          '_index': r[0]['_index'],
                          'query': q,
                          'topic': ann['topic_id'].lower(),
                          'citance_number': ann['citance_number'],
                          'citation_text': ann['citation_text'],
                          'citing_article': ann['citing_article'],
                          '_score': sum([e['_score'] for e in r]),
                          'offset': [e['offset'][0] for e in r],
                          'sentence': [e['sentence'] for e in r],
                          '_id': '-000001'}]
                out_results.append(r)
        return out_results
Example #7
import sys
import codecs
import elasticsearch
from util.extract_nlptags import Extract_NLP_Tags
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from copy import deepcopy

reserved_chars = [
    '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~',
    '*', '?', ':', '\\', '/'
]

verbs = {'VB', 'VBD', 'VBG', 'VBN', 'VBZ', 'VBP'}

CMD = 'curl -XPOST \'http://localhost:9200/index_name/type/idx\' -d '
pos_tagger = Extract_NLP_Tags()


def filter_text(sentences, offsets, data):
    forbidden = []
    new_sentences = []
    new_offsets = []
    beg = 0
    end = sys.maxint
    abst = data.lower().find('\nabstract')
    if abst > -1 and (abst < data.lower().find('\n\n')
                      or abst < data.lower().find('\r\n\r\n')):
        beg = abst
    elif abst == -1:
        abst = data.lower().find('\nsummary')
        if abst > -1 and (abst < data.lower().find('\n\n')
Example #8
    def run(self, test_data):
        out_results = []
        #         outfile = codecs.open('tmp/nlp.txt' , 'wb' , 'UTF-8')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = re.sub(r'( ,)+', ',', q)
            q = q.encode('ascii', 'ignore')
            nlp_extractor = Extract_NLP_Tags()
            nps = nlp_extractor.extract_NP(q, mode='flattened')
            #             outfile.write('query: "%s" \nparsed: "%s"\n\n' %(q,str(nps)) )
            q1 = ''
            queryterms = set()
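            # quote each short noun phrase (< 4 tokens) and boost it by the
            # phrase-weight option while building q1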
            for e in nps:
                for e1 in e:
                    if len(e1) < 4:
                        all_stop = False
                        if self.opts.remove_stopwords_phrase:
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1 if
                                sub_e.replace('"', '') not in self.stopwords)
                        else:
                            count = 0
                            for sub_e in e1:
                                if sub_e.replace('"', '') in self.stopwords:
                                    count += 1
                            if count == len(e1):
                                all_stop = True
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1)
                        if tmp not in queryterms and not all_stop:
                            q1 += '"' + tmp + '"^' + \
                                str(self.opts.phrase_weight) + ' '
                            queryterms.add(tmp)
            if self.opts.expand_window:
                window = self.doc_mod.get_para(
                    ann['topic_id'].lower(),
                    ann['citing_article'][:-4].lower(),
                    (ann['citation_offset'][0], ann['citation_offset'][1]))
                sorrounding_text = deepcopy(window['sentence'])
                st = self.regex_citation('', sorrounding_text)
                st = re.sub(r'( ,)+', ',', st)
                st = st.encode('ascii', 'ignore')
                other_nouns = nlp_extractor.extract_NP(st, mode='flattened')
                for e in other_nouns:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords_phrase:
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1
                                    if sub_e.replace('"', '') not in
                                    self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"',
                                                     '') in self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1)
                            if tmp not in queryterms and not all_stop:
                                q1 += '"' + tmp + '"^' + \
                                    str(self.opts.surrounding_words_weight) + \
                                    ' '
                                queryterms.add(tmp)
            if self.opts.query_terms:
                q = ' '.join([
                    t + '^' + str(self.opts.qterm_weight)
                    for t in self.es_int.tokenize(q)
                    if (t not in self.stopwords and t not in authors
                        and not (self.all_digits(t)))
                ])
                q1 = q1 + ' ' + q
            if self.opts.verbose:
                print "query:   %s" % q
                print "q1   :       %s" % q1
                print '_____'
#             q2 = self.es_int.tokenize(q1, 'sentence')
#             q2 = ' '.join([t for t in self.es_int.tokenize(q1)
#                           if (t not in self.stopwords and
#                               t not in authors and
#                               not(self.all_digits(t)))])
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q1.strip(),
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q1.strip(),
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop)
            if self.opts.sentence:
                for idx, e in enumerate(deepcopy(r)):
                    if '_id' in e:
                        query = ' OR '.join([
                            '_id:%s' % (str(int(e['_id']) + j).zfill(5))
                            for j in range(-1 * self.opts.sentence,
                                           self.opts.sentence + 1)
                            if j != 0 and int(e['_id']) + j > 0
                        ])
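                        # fetch up to self.opts.sentence neighbouring sentences on
                        # each side by querying adjacent zero-padded _id values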
                        sour = self.es_int.simple_search(
                            query,
                            doc_type=e['_type'],
                            maxsize=2 * self.opts.sentence,
                            source_fields=['offset', 'sentence'])
                        #                         aft = self.es_int.get_page(
                        #                             str(int(e['_id']) + 1).zfill(5), e['_type'])
                        #                         bef = self.es_int.get_page(
                        #                             str(int(e['_id']) + 1).zfill(5), e['_type'])
                        if len(sour) > 0:
                            for s in sour:
                                r.insert(idx + 1, s)

            for e in r:
                fld = e.pop('fields')
                if eval(fld['offset'][0])[0] < self.opts.expand_results:
                    beg = 0
                else:
                    beg = eval(fld['offset'][0])[0] - self.opts.expand_results
                endd = eval(fld['offset'][0])[1] + self.opts.expand_results
                e['offset'] = [(beg, endd)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q1

            r1 = deepcopy(r)
            r = []
            for idx, e in enumerate(r1):
                if idx < self.opts.maxsize:
                    r.append(e)

            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'sentence': '',
                        'offset': [(0, 1)],
                        'query': q1,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q1,
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]
            out_results.append(r)
        return out_results
Example #9
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'concept_boost': {'default': 3, 'type': int},
                   'np_boost': {'default': 3, 'type': int},
                   'sent_boost': {'default': 1, 'type': int},
                   'stem_boost': {'default': 1, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()

#         if len(args) > 3:
#             self.ttys = []
#
#             for tty in args[3:]:
#                 if tty in ttygroups:
#                     self.ttys.extend(ttygroups[tty])
#                 else:
#                     self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
    #         query = 'select * from (select distinct STR from MRCONSO a,'+\
    #                 '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
    #                 ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd  ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
#             print query
            self.cur.execute(query)

#         self.cur.execute("select STR from MRCONSO where " +
#                          "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
#                          termtypes + " and SAB != 'CHV'")

            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')

            doc = self.doc_mod.get_doc(
                ann['topic_id'].lower(), ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[
                ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                                   ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            '''
            -------------- IMMEDIATE NP BEFORE MARKER ----------
            '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0))
                               for e in m3]
                else:
                    markers = []

            if len(markers) > 10000:

                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while(('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [
                        ' '.join([s1[0] for s1 in t1]) for t1 in concepts]

    #                 nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
    #                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
        #             nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
    #                 for e in nps:
    #                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(re.findall(r"^[^A-Za-z]+$", tokens[idx - 1])) == 0:
                                    nearest = tokens[
                                        idx - 1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                        if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                                tokens[nearest_idx] not in stops100:
                            # extend a single-word match to the bigram ending
                            # at the nearest token
                            nearest = tokens[nearest_idx - 1] + \
                                ' ' + tokens[nearest_idx]
                    # use the longest noun phrase containing the nearest word
                    # as a phrase query; fall back to the whole cleaned citance
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(n.split()) > largest):
                            q = '"%s"' % n
                            largest = len(n.split())
                    if q == '':
                        q = cleaned
                q = sanitize(q)
# find longest noun phrase containing the nearest
#                 res = None
#                 for np in nps[0]:
#                    if nearest in np and len(np) > longest and len(np) < 5:
#                        longest = len(np)
#                        res = np
#                 if res is not None:
#                     res = ' '.join([el for el in res])
#                 else:
#                     res = nearest
            else:
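                # default path: build the query from the cleaned citance text,
                # dropping stopwords and purely numeric tokens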
                try:
                    qtxt = unicodedata.normalize('NFKD',
                                                 cleaned).encode('ascii', 'ignore')
                except (TypeError, UnicodeError):
                    # cleaned is already a byte string or cannot be normalised
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
    #             tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not(self.all_digits(t)))])
                if self.opts.concept_boost > 0:

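                    # annotate the cleaned citance with biomedical concepts
                    # (mmrun is assumed to wrap a MetaMap run) and expand each
                    # concept with its synonyms for the 'mm-concepts' field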
                    qconcepts = mmrun(cleaned)
                    qcids = []
                    for cdata in qconcepts['concepts']:
                        newterms = self.expand_concept(cdata)
                        if newterms is not None:
                            qcids.append(newterms)
                else:
                    qcids = []
                if self.opts.np_boost > 0:
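                    # extract short noun phrases (fewer than four tokens) from
                    # the citance; each surviving phrase is quoted and used in
                    # the noun-phrase query field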
                    nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                    noun_phs = set()
                    for e in nps:
                        for e1 in e:
                            if len(e1) < 4:
                                all_stop = False
                                if self.opts.remove_stopwords:
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1
                                                   if sub_e.replace('"', '') not in self.stopwords)
                                    # every token was a stopword: skip the phrase
                                    if not tmp:
                                        all_stop = True
                                else:
                                    count = 0
                                    for sub_e in e1:
                                        if sub_e.replace('"', '') in self.stopwords:
                                            count += 1
                                    if count == len(e1):
                                        all_stop = True
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1)
                                if '"' + tmp.replace('"', '') + '"' not in noun_phs and not all_stop:
                                    noun_phs.add(
                                        '"' + tmp.replace('"', '') + '"')
                else:
                    noun_phs = set()

            if self.opts.analyzer:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type,
                                              params={'analyzer': self.opts.analyzer})
            else:
                #                 r = self.es_int.multi_field_search(sentence=q,
                #                                                    concepts=' '.join(
                #                                                        [w for w in qcids]),
                #                                                    noun_phrases=' '.join(
                #                                                        [e for e in noun_phs]),
                #                                                    maxsize=self.opts.maxsize,
                #                                                    source_fields=[
                #                                                        'offset', 'sentence', 'mm-concepts', 'noun_phrases'],
                #                                                    doc_type=doc_type,
                #                                                    field_boost=[self.opts.sent_boost,
                #                                                                 self.opts.concept_boost,
                # self.opts.np_boost])
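                # four parallel query fields with independent boosts: raw
                # sentence terms, expanded concept ids, noun phrases, and
                # stemmed tokens of the cleaned citance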
                fields = [
                    'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed']
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    okay = not (self.opts.remove_stopwords and
                                w in self.stopwords)
                    if '-' in w:
                        # also index the de-hyphenated, stemmed form
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [q, ' '.join([w for w in qcids]),
                              (' '.join([e for e in noun_phs])).replace(
                                  '"', ''),
                              ' '.join([w for w in tokens1])]
                field_boosts = [
                    self.opts.sent_boost, self.opts.concept_boost, self.opts.np_boost, self.opts.stem_boost]
                r = self.es_int.multi_field_search(field_vals=field_vals,
                                                   fields=fields,
                                                   source_fields=[
                                                       'offset', 'sentence'],
                                                   maxsize=self.opts.maxsize,
                                                   field_boost=field_boosts,
                                                   doc_type=doc_type)
#             r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence'])
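            # unpack the ES hits: the 'offset' source field holds a stringified
            # (start, end) tuple, hence the eval below; the query string is
            # attached to each hit for later inspection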
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
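            # optionally merge all hits for this citance into a single result
            # whose score is the sum of the individual hit scores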
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'score': 0,
                          'sentence': [''],
                          'offset': [(0, 1)],
                          'query': q, '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results