class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'thresh': {'type': int, 'default': False},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': True,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'cache-path': {'default': 'cache'},
                   'idf_index': {'default': 'pubmed'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(self.opts.cache_path,
                                     'idfidx' + self.opts.idf_index +
                                     'wp_doc_freq.json')
        # Load the cached document-frequency table if it exists.
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path, 'rb', 'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and
                        t not in authors and
                        not self.all_digits(t)):
                    if t not in doc_freq:
                        count = es_int2.count(t)
                        if count > 0:
                            doc_freq[t] = log(count_docs / float(count + 1))
                            terms.append(t)
                    else:
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            # Use the explicit threshold when given, otherwise fall back
            # to the average IDF of the candidate terms.
            thresh = self.opts.thresh if self.opts.thresh else avg_idf
            q = ' '.join([t for t in terms if doc_freq[t] > thresh])
            if q == '':
                # No term cleared the threshold: keep the highest-IDF term.
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                          source_fields=['offset', 'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        # Persist the (possibly extended) document-frequency cache.
        with codecs.open(doc_freq_path, 'wb', 'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
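# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the methods in this module): the class
# above keeps only citance terms whose IDF against a background index exceeds
# a threshold, falling back to the average IDF and, if nothing survives, to
# the single highest-IDF term.  The helper below mirrors that filtering step
# in isolation; `doc_counts` and `total_docs` stand in for the Elasticsearch
# counts and are assumptions made for the example, not the project's API.
# ---------------------------------------------------------------------------
from math import log


def filter_terms_by_idf(terms, doc_counts, total_docs, thresh=None):
    """Keep the terms whose IDF exceeds `thresh` (default: average IDF)."""
    idf = dict((t, log(total_docs / float(doc_counts.get(t, 0) + 1)))
               for t in terms)
    if thresh is None:
        thresh = sum(idf.values()) / float(len(idf)) if idf else 0.0
    kept = [t for t in terms if idf[t] > thresh]
    if not kept and terms:
        # Nothing cleared the threshold: keep the most discriminative term.
        kept = [max(terms, key=lambda t: idf[t])]
    return ' '.join(kept)

# Example: 'binding' is frequent in the background index and is dropped,
# 'apoptosis' is rarer and survives the cut.
# filter_terms_by_idf(['binding', 'apoptosis'],
#                     {'binding': 90000, 'apoptosis': 1200},
#                     100000)  -> 'apoptosis'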
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False, 'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # e.g. "(Chen et al., 2000)"
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # e.g. "Chen et al., 2000" without parentheses
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()

    def run(self, test_data):
        out_results = []
        not_found = 0
        total = 0
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        processed = set()
        for ann in test_data:
            if (ann['topic_id'] + '_' + str(ann['citance_number'])) not in processed:
                doc_type = '_'.join((ann['topic_id'].lower(),
                                     ann['reference_article'][:-4].lower()))
                doc_type = doc_type.replace(',', '').replace("'", '"')
                doc = self.doc_mod.get_doc(
                    ann['topic_id'].lower(), ann['citing_article'])
                cit_text = ann['citation_text']
                cit_text_doc = doc[
                    ann['citation_offset'][0]:ann['citation_offset'][1]]
                cit_marker = ann['citation_marker']
                cit_marker_doc = doc[
                    ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
                # Marker offsets relative to the start of the citance.
                cit_mrk_offset_sent = [
                    ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                    ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
                cleaned = self.reg_apa.sub('', cit_text_doc)
                cleaned = self.reg_ieee.sub('', cleaned)
                cleaned = self.reg_paranthesis.sub('', cleaned)
                cleaned = self.reg_apa_rare.sub('', cleaned)
                cleaned = re.sub('\s+', ' ', cleaned).strip()
                cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
                chunks = set()
                # Get noun phrases; format:
                # [[[term1, term2], [term3]], [term4, term5]]
                nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                # nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                # for e in nps:
                #     noun_phrases = [(sub_e[0].replace('"', ''), idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                noun_phrases = [e for e in
                                list(itertools.chain.from_iterable(nps))
                                if e not in self.stopwords]
                # tokens = self.tokenizer.tokenize(cit_text)
                # tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                # cleaned = ''
                #
                # m = list(self.reg_apa.finditer(cit_text_doc))
                # m1 = list(self.reg_ieee.finditer(cit_text_doc))
                # m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
                # # (start, end, group)
                # if len(m) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m]
                # elif len(m1) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m1]
                # elif len(m2) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m2]
                # else:
                #     m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                #     if len(m3) > 0:
                #         markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                #     else:
                #         not_found += 1
                # nearest = ''
                # distance = 100000
                # if len(markers) > 1:
                #     # find nearest word to the citation marker
                #     for idx, f in enumerate(tokens_offsets):
                #         # check to see if in valid span (not citation markers)
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
                #                 not invalid:
                #             distance = cit_mrk_offset_sent[0] - f[1]
                #             if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                #                 nearest = tokens[idx]
                #
                #     # find longest noun phrase containing the nearest
                #     longest = 0
                #     res = None
                #     for np in nps[0]:
                #         if nearest in np and len(np) > longest:
                #             longest = len(np)
                #             res = np
                #     if res is not None:
                #         res = ' '.join([el for el in res])
                #     else:
                #         res = nearest
                # else:
                #     # if there is only one citation marker, just consider the
                #     # whole citation text as the query
                #     q_tokens = []
                #     for idx, f in enumerate(tokens_offsets):
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
                #                 not invalid:
                #             q_tokens.append(tokens[idx])
                #     res = ' '.join([f for f in q_tokens])
                q = noun_phrases
                q = ' '.join(q).encode('ascii', 'ignore')
                # outfile.write('query: "%s" \nparsed: "%s"\n\n' % (q, str(nps)))
                tokens = self.es_int.tokenize(q, "sentence")
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not self.all_digits(t))])
                if self.opts.analyzer:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type,
                        params={'analyzer': self.opts.analyzer})
                else:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type)
                for e in r:
                    fld = e.pop('fields')
                    e['offset'] = [eval(fld['offset'][0])]
                    # Expand each hit by 100 characters on either side.
                    beg = e['offset'][0][0] - \
                        100 if e['offset'][0][0] else e['offset'][0][0]
                    end = e['offset'][0][1] + 100
                    e['offset'] = [(beg, end)]
                    e['sentence'] = fld['sentence'][0]
                    e['query'] = q
                if self.opts.combine:
                    if len(r) == 0:
                        r = [{'_type': doc_type,
                              '_index': self.opts.index_name,
                              '_score': 0,
                              'sentence': '',
                              'offset': [(0, 1)],
                              'query': q,
                              '_id': -11}]
                    r = [{'_type': r[0]['_type'],
                          '_index': r[0]['_index'],
                          'query': q,
                          'topic': ann['topic_id'].lower(),
                          'citance_number': ann['citance_number'],
                          'citation_text': ann['citation_text'],
                          'citing_article': ann['citing_article'],
                          '_score': sum([e['_score'] for e in r]),
                          'offset': [e['offset'][0] for e in r],
                          'sentence': [e['sentence'] for e in r],
                          '_id': '-000001'}]
                out_results.append(r)
                # Mark this citance as handled so repeated annotations of the
                # same citance are skipped (the original never filled the set).
                processed.add(ann['topic_id'] + '_' + str(ann['citance_number']))
        return out_results
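# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption-labelled example, not the project's
# code): every method in this module first strips inline citation markers
# from the citance before building a query.  The patterns below are
# deliberately simpler than the APA/IEEE regexes compiled in __init__ and
# only show the idea.
# ---------------------------------------------------------------------------
import re

_REG_APA_DEMO = re.compile(r"\(\s?[A-Z][A-Za-z\-]+( et al\.)?,?\s\d{4}[^)]*\)")
_REG_IEEE_DEMO = re.compile(r"\[\s?[\d,\- ]+\]")


def strip_citation_markers(citance):
    """Remove inline citation markers so they do not pollute the query."""
    cleaned = _REG_APA_DEMO.sub('', citance)
    cleaned = _REG_IEEE_DEMO.sub('', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return re.sub(r'(,\s)+', ', ', cleaned).strip(', ')

# strip_citation_markers("TIMP-3 induces apoptosis in cell lines [12, 15].")
#   -> 'TIMP-3 induces apoptosis in cell lines .'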
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        # Fall back to the bundled stopword list when no path is given.
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with open(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        # with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #     json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            # tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            # Quote hyphenated tokens so they are searched as phrases.
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not self.all_digits(t))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                # Build a query of quoted n-gram phrases (including the final
                # n-gram; the original loop stopped one short).
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ' '.join(tokens[i:i + self.opts.ngram])
                    new_query += '"' + tmp + '" '
                q = new_query.strip()
            # q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        # with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
        #     json.dump(out_results, mf, indent=2)
        # sys.exit()
        return out_results
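# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the original helper): with the
# `ngram` option the class above turns the filtered token stream into a query
# of quoted n-gram phrases.  A minimal, standalone version of that
# transformation:
# ---------------------------------------------------------------------------
def ngram_phrase_query(tokens, n):
    """Build a query of quoted n-gram phrases from a token list."""
    phrases = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return ' '.join('"%s"' % p for p in phrases)

# ngram_phrase_query(['protein', 'kinase', 'c'], 2)
#   -> '"protein kinase" "kinase c"'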
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False, 'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

    def run(self, test_data):
        out_results = []
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # Preprocess (remove citation markers) and tokenize the
            # citation text before submitting it to Elasticsearch.
            q = self.regex_citation('', ann['citation_text'])
            q = re.sub(r'( ,)+', ',', q)
            q = q.encode('ascii', 'ignore')
            nlp_extractor = Extract_NLP_Tags()
            nps = nlp_extractor.extract_NP(q, mode='flattened')
            # outfile.write('query: "%s" \nparsed: "%s"\n\n' % (q, str(nps)))
            q1 = ''
            queryterms = set()
            # Boost noun phrases of up to three tokens by phrase-weight.
            for e in nps:
                for e1 in e:
                    if len(e1) < 4:
                        all_stop = False
                        if self.opts.remove_stopwords_phrase:
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1
                                if sub_e.replace('"', '') not in self.stopwords)
                        else:
                            count = 0
                            for sub_e in e1:
                                if sub_e.replace('"', '') in self.stopwords:
                                    count += 1
                            if count == len(e1):
                                all_stop = True
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1)
                        if tmp not in queryterms and not all_stop:
                            q1 += '"' + tmp + '"^' + \
                                str(self.opts.phrase_weight) + ' '
                            queryterms.add(tmp)
            if self.opts.expand_window:
                # Add (separately weighted) noun phrases from the paragraph
                # surrounding the citance.
                window = self.doc_mod.get_para(
                    ann['topic_id'].lower(),
                    ann['citing_article'][:-4].lower(),
                    (ann['citation_offset'][0], ann['citation_offset'][1]))
                sorrounding_text = deepcopy(window['sentence'])
                st = self.regex_citation('', sorrounding_text)
                st = re.sub(r'( ,)+', ',', st)
                st = st.encode('ascii', 'ignore')
                other_nouns = nlp_extractor.extract_NP(st, mode='flattened')
                for e in other_nouns:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords_phrase:
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1
                                    if sub_e.replace('"', '') not in self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"', '') in self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1)
                            if tmp not in queryterms and not all_stop:
                                q1 += '"' + tmp + '"^' + \
                                    str(self.opts.surrounding_words_weight) + ' '
                                queryterms.add(tmp)
            if self.opts.query_terms:
                # Append the individual citance terms with qterm-weight
                # (the original referenced a misspelled "qtrem_weight" option).
                q = ' '.join([t + '^' + str(self.opts.qterm_weight)
                              for t in self.es_int.tokenize(q)
                              if (t not in self.stopwords and
                                  t not in authors and
                                  not self.all_digits(t))])
                q1 = q1 + ' ' + q
            if self.opts.verbose:
                print "query: %s" % q
                print "q1 : %s" % q1
                print '_____'
            # q2 = self.es_int.tokenize(q1, 'sentence')
            # q2 = ' '.join([t for t in self.es_int.tokenize(q1)
            #                if (t not in self.stopwords and
            #                    t not in authors and
            #                    not(self.all_digits(t)))])
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop)
            if self.opts.sentence:
                # Pull in neighbouring sentences of each hit by querying
                # for adjacent _id values.
                for idx, e in enumerate(deepcopy(r)):
                    if '_id' in e:
                        query = ' OR '.join(
                            ['_id:%s' % (str(int(e['_id']) + j).zfill(5))
                             for j in range(-1 * self.opts.sentence,
                                            self.opts.sentence + 1)
                             if j != 0 and int(e['_id']) + j > 0])
                        sour = self.es_int.simple_search(
                            query, doc_type=e['_type'],
                            maxsize=2 * self.opts.sentence,
                            source_fields=['offset', 'sentence'])
                        # aft = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        # bef = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        if len(sour) > 0:
                            for s in sour:
                                r.insert(idx + 1, s)
            for e in r:
                fld = e.pop('fields')
                # Expand each hit by expand-results characters on both sides,
                # clamping the start offset at zero.
                if eval(fld['offset'][0])[0] < self.opts.expand_results:
                    beg = 0
                else:
                    beg = eval(fld['offset'][0])[0] - self.opts.expand_results
                endd = eval(fld['offset'][0])[1] + self.opts.expand_results
                e['offset'] = [(beg, endd)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q1
            # Keep at most maxsize results after sentence expansion.
            r1 = deepcopy(r)
            r = []
            for idx, e in enumerate(r1):
                if idx < self.opts.maxsize:
                    r.append(e)
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q1,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q1,
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
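# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the project's API): the class above
# builds a Lucene query_string query in which noun phrases are boosted with
# `phrase-weight`, surrounding-paragraph phrases with
# `surrounding-words-weight`, and plain citance terms with `qterm-weight`.
# The helper below shows the boosting syntax on its own.
# ---------------------------------------------------------------------------
def build_weighted_query(noun_phrases, terms,
                         phrase_weight=2.0, term_weight=1.0):
    """Quote and boost phrases, then append boosted single terms."""
    parts = ['"%s"^%s' % (p, phrase_weight) for p in noun_phrases]
    parts += ['%s^%s' % (t, term_weight) for t in terms]
    return ' '.join(parts)

# build_weighted_query(['tumor suppressor'], ['apoptosis', 'timp'])
#   -> '"tumor suppressor"^2.0 apoptosis^1.0 timp^1.0'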
class Method(MethodInterface):

    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host="127.0.0.1",
                                  port=3309,
                                  user="******",
                                  passwd="lollipop11",
                                  db="umls")
        self.cur = self.db.cursor()
        self.ttys = ['SY']
        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ),
                     "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'),
                     "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # e.g. "(Chen et al., 2000)"
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # e.g. "Chen et al., 2000" without parentheses
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        # if len(args) > 3:
        #     self.ttys = []
        #     for tty in args[3:]:
        #         if tty in ttygroups:
        #             self.ttys.extend(ttygroups[tty])
        #         else:
        #             self.ttys.append(tty)

    def expand_concept(self, cdata):
        """Return SNOMED CT synonyms for a MetaMap concept unless its
        semantic type is too generic to be useful."""
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                okay = False
        if okay:
            return self.concept_synonyms(cdata['ConceptId'])

    def concept_synonyms(self, cui):
        """Look up English SNOMED CT synonyms of a CUI in MRCONSO,
        caching the result."""
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
            # query = 'select * from (select distinct STR from MRCONSO a,' +\
            #     '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
            #     ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            # print query
            self.cur.execute(query)
            # self.cur.execute("select STR from MRCONSO where " +
            #                  "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
            #                  termtypes + " and SAB != 'CHV'")
            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def umls_expand(self, cui):
        """Like concept_synonyms, but matches MRCONSO.STR by substring
        instead of by CUI."""
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
            query = "select STR from MRCONSO where " +\
                "STR LIKE '%%%s%%' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            # print query
            self.cur.execute(query)
            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')
            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[
                ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
            # Marker offsets relative to the start of the citance.
            cit_mrk_offset_sent = [
                ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            ''' -------------- IMMEDIATE NP BEFORE MARKER ---------- '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []
            # The `> 10000` guard effectively disables the nearest-noun-phrase
            # branch; the UMLS concept-expansion branch below is the one used.
            if len(markers) > 10000:
                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while ('NP' in toks[2]) and (i < len(t)):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [' '.join([s1[0] for s1 in t1])
                                    for t1 in concepts]
                    # nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                    # nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                    # nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
                    # for e in nps:
                    #     noun_phrases = [(sub_e[0].replace('"', ''), idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(
                                        re.findall(r"^[^A-Za-z]+$",
                                                   tokens[idx - 1])) == 0:
                                    nearest = tokens[idx - 1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                    if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                            tokens[nearest_idx] not in stops100:
                        nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                    q = sanitize(q)
                    # find longest noun phrase containing the nearest
                    # res = None
                    # for np in nps[0]:
                    #     if nearest in np and len(np) > longest and len(np) < 5:
                    #         longest = len(np)
                    #         res = np
                    # if res is not None:
                    #     res = ' '.join([el for el in res])
                    # else:
                    #     res = nearest
            else:
                # Expand the cleaned citance with UMLS synonyms of the
                # MetaMap concepts found in it.
                try:
                    qtxt = unicodedata.normalize('NFKD', cleaned).encode(
                        'ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                qconcepts = mmrun(cleaned)
                for cdata in qconcepts['concepts']:
                    newterms = self.expand_concept(cdata)
                    if newterms is not None:
                        qterms.extend(newterms)
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                # tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not self.all_digits(t))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                # Build a query of quoted n-gram phrases (including the final
                # n-gram; the original loop stopped one short).
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ' '.join(tokens[i:i + self.opts.ngram])
                    new_query += '"' + tmp + '" '
                q = new_query.strip()
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
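# ---------------------------------------------------------------------------
# Illustrative sketch (a hypothetical helper mirroring the heuristic in the
# run() above): pick the word token immediately to the left of a citation
# marker, skipping tokens that fall inside the marker itself or contain no
# letters.  The NLTK RegexpTokenizer pattern matches the one used above.
# ---------------------------------------------------------------------------
import re

from nltk.tokenize import RegexpTokenizer


def nearest_word_before_marker(sentence, marker_span):
    """Return the word token closest to the left of `marker_span`."""
    tokenizer = RegexpTokenizer(r"[^\w\-\']+", gaps=True)
    tokens = tokenizer.tokenize(sentence)
    spans = tokenizer.span_tokenize(sentence)
    best, best_dist = '', None
    for tok, (start, end) in zip(tokens, spans):
        dist = marker_span[0] - end
        if dist >= 0 and (best_dist is None or dist < best_dist) \
                and re.search(r'[A-Za-z]', tok):
            best, best_dist = tok, dist
    return best

# sent = "TIMP-3 induces apoptosis in melanoma cells [7]."
# nearest_word_before_marker(sent, (sent.index('['), sent.index(']') + 1))
#   -> 'cells'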