Exemplo n.º 1
0
    def fulltext(self):
        """Collect fulltext-pipeline error counts from Elasticsearch.

        Looks up the log entry announcing that the pipeline started loading
        records, parses its timestamp and stores it in
        ``self.values['ft_start']``.  If that start is less than 15 hours
        old the pipeline is assumed to still be running and only a notice
        is printed.  Otherwise, for every query string in
        ``conf['FULLTEXT_ERRORS']``:

        * all matching log entries since the start time are scanned,
        * the identifier extracted from each entry is written, one per
          line, to ``data/ft/<err_str>/<date>_<err_str>.txt``,
        * the number of distinct identifiers is stored in
          ``self.values['<err_str>_total']``.

        The sum of the per-error counts is stored in
        ``self.values['total_fulltext_errors']``.

        Log messages from which no identifier can be extracted are skipped
        instead of aborting the whole run.

        NOTE(review): the start entry is simply the first hit returned by
        the search (no explicit sort), and indexing ``hits[0]`` will fail if
        the search returns nothing -- confirm this is acceptable.
        NOTE(review): ``datetime.now()`` is naive local time while the log
        timestamp presumably is UTC -- confirm the 15-hour check is not
        skewed by the timezone offset.
        NOTE(review): the output directory is assumed to exist already.
        """
        es = elasticsearch2.Elasticsearch(conf['ELASTICSEARCH_URL'])
        pipeline = 'backoffice-fulltext_pipeline'

        # Locate the log line emitted when the pipeline starts loading records.
        start_hit = Search(using=es, index='_all') \
            .query('match', **{'@message': 'Loading records from: /proj/ads/abstracts/config/links/fulltext/all.links'}) \
            .filter('match', **{'_type': pipeline}) \
            .execute() \
            .hits[0]

        # Drop the fractional seconds (and anything after) before parsing.
        start = datetime.strptime(start_hit.timestamp.split('.')[0],
                                  '%Y-%m-%dT%H:%M:%S')
        self.values['ft_start'] = start

        # fulltext pipeline runs for ~15 hours without forcing extraction
        if (datetime.now() - start) < timedelta(hours=15):
            print("fulltext pipeline is most likely not done processing.")
            return

        # Compiled once: both patterns are applied to every scanned hit.
        quoted_re = re.compile(r"'(.*?)'")
        bibcode_re = re.compile(r"u'bibcode': u'(.*?)'")

        total_num_errors = 0

        for err in conf['FULLTEXT_ERRORS']:
            s = Search(using=es, index='_all') \
                .filter('range', **{'@timestamp': {'gte': start, 'lt': 'now'}}) \
                .query('query_string', query=err) \
                .filter('match', **{'_type': pipeline})

            # Build a filesystem-friendly label from the first double-quoted
            # phrase of the query (each entry is assumed to contain one).
            err_str = "_".join(err.split('"')[1].split()).replace(
                '-', '_').replace(']', '').replace('[', '')

            filename = str(start).split()[0] + "_" + err_str + ".txt"
            out_path = "data/ft/" + err_str + '/' + filename

            bibs = set()
            with open(out_path, "w") as f:
                for hit in s.scan():
                    message = hit.message
                    if "Retrying" in message:
                        continue

                    quoted = quoted_re.search(message)
                    if quoted is None:
                        # No quoted token at all: nothing to extract.
                        continue

                    bib = quoted.group(1)
                    if bib in ('bibcode', 'UPDATE'):
                        # The first quoted token is a dict key, not the
                        # identifier; pull the actual bibcode value instead.
                        bibcode = bibcode_re.search(message)
                        if bibcode is None:
                            continue
                        bib = bibcode.group(1)

                    # Every occurrence is written; only the count is deduplicated.
                    f.write(bib + '\n')
                    bibs.add(bib)

            count = len(bibs)
            self.values[err_str + "_total"] = count
            total_num_errors += count

        self.values['total_fulltext_errors'] = total_num_errors
Exemplo n.º 2
0
    def post(self, request):
        """Search the ``walkingtrails-index`` Elasticsearch index.

        The request body is parsed as JSON and must contain a non-empty
        ``search`` value.  Two query forms are supported:

        * No ``:`` in the search text -- a multi-match query over a fixed
          set of fields; the response is the list of matching documents.
        * ``:``-separated text -- the text is split on ``:`` and each
          segment on spaces.  The last word of a segment is passed through
          ``transkey`` and used as the field for the next segment's value.
          A value containing ``'`` is stripped of quotes and run as an
          exact ``term`` query on ``point_name``; any other value is run as
          a multi-match on the current field.  The response is the set of
          documents common to every per-segment result (order unspecified).

        Returns:
            ``Response`` with the matching documents, or a 400 response when
            the ``search`` value is missing/empty or the query cannot be run.

        NOTE(review): for a multi-word final segment the last word is
        excluded from the value (it is treated as the next key) -- confirm
        this is intended.
        NOTE(review): the final intersection hashes each document's items,
        so documents holding list/dict values would raise ``TypeError`` --
        verify against the index mapping.
        """
        es = Elasticsearch([{'host': ELASTICSEARCH_HOST, 'port': '9200'}])
        req = JSONParser().parse(request)

        # A missing key or a non-object body is answered with 400 below
        # instead of raising KeyError/TypeError.
        data = req.get('search') if isinstance(req, dict) else None
        if not data:
            return Response(status=status.HTTP_400_BAD_REQUEST,
                            data={'message': 'search word param is missing'})

        index = 'walkingtrails-index'
        max_hits = 10000

        # Preprocess the search text: split on ':'.
        search_words = data.split(":")

        # Full search across all searchable fields.
        if len(search_words) <= 1:
            m = MultiMatch(query=search_words[0],
                           fields=[
                               "category",
                               "region",
                               "distance",
                               "transportation",
                               "_explain",
                               "point_name",
                           ])
            res = Search(using=es, index=index).query(m)[:max_hits].execute()
            return Response([hit.to_dict() for hit in res])

        # Per-field search: one result list per "key:value" pair.
        data_list = []
        key = None
        for segment in search_words:
            key = transkey(key)
            words = segment.split(' ')
            if key is None:
                # No usable field name yet: this segment only supplies one.
                key = words[0]
                continue
            try:
                # Everything except the trailing word (the next key) is the value.
                value = " ".join(words[:-1]) if len(words) > 2 else words[0]
                if "'" in value:
                    # Quoted value: exact match on the point name.
                    value = value.replace("'", "")
                    s = Search(using=es, index=index).query(
                        'term', point_name=value)[:max_hits]
                else:
                    m = MultiMatch(query=value, fields=[key])
                    s = Search(using=es, index=index).query(m)[:max_hits]
                res = s.execute()

                data_list.append([hit.to_dict() for hit in res])
                key = words[-1]
            except Exception:
                return Response(status=status.HTTP_400_BAD_REQUEST,
                                data={'message': 'wrong query'})

        if not data_list:
            # No segment produced a query, so there is nothing to intersect.
            return Response(status=status.HTTP_400_BAD_REQUEST,
                            data={'message': 'wrong query'})

        # Keep only the documents present in every per-field result.
        common = {tuple(sorted(d.items())) for d in data_list[0]}
        for hits in data_list[1:]:
            common &= {tuple(sorted(d.items())) for d in hits}
        result = [dict(items) for items in common]

        return Response(result)