Example #1
from flask import Flask, jsonify, make_response, request, Response, render_template, url_for
from flask_compress import Compress
from whoosh import index
from whoosh.index import create_in
from whoosh.fields import Schema, STORED, TEXT
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer, NgramFilter
from whoosh.qparser import QueryParser, MultifieldParser

#security patterns references
sec_references = []
capec_url = 'https://capec.mitre.org/data/definitions/{:}.html'

#whoosh things
my_analyzer = StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=10)

sec_schema = Schema(title=TEXT(stored=True, analyzer=my_analyzer),\
    overview=TEXT(stored=True, analyzer=my_analyzer),\
    problem=TEXT(analyzer=my_analyzer),\
    id_repo=TEXT(stored=True))

att_schema = Schema(title=TEXT(stored=True, analyzer=my_analyzer),\
    summary=TEXT(stored=True, analyzer=my_analyzer),\
    attreq=TEXT(analyzer=my_analyzer),\
    solmit=TEXT(analyzer=my_analyzer),\
    secreq=TEXT(analyzer=my_analyzer),\
    secpri=TEXT(analyzer=my_analyzer),\
    id=TEXT(stored=True))

ix_attack = index.open_dir("index", indexname="ix_attack")
ix_security = index.open_dir("index", indexname="ix_security")
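A minimal usage sketch (not part of the original snippet): querying the attack index with the MultifieldParser imported above. The function name, field choice, and limit are illustrative.

def search_attacks(text, limit=10):
    # Query the attack-pattern index over the stored title and summary fields.
    parser = MultifieldParser(["title", "summary"], schema=ix_attack.schema)
    with ix_attack.searcher() as searcher:
        hits = searcher.search(parser.parse(text), limit=limit)
        return [(hit["id"], hit["title"]) for hit in hits]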
Example #2
class InfoSchema(SchemaClass):
    objectID = ID(stored=True, unique=True, sortable=True)
    args = KEYWORD(commas=True, stored=True, scorable=True)
    base_class = TEXT(stored=True)
    docstring = TEXT(stored=True)
    definition = TEXT(stored=True)
    file = ID(stored=True)
    fullname = TEXT(field_boost=2.0, stored=True, sortable=True, phrase=False)
    module = TEXT(stored=True)
    name = TEXT(stored=True)
    package = TEXT(stored=True)
    signature = TEXT(stored=True)
    source = TEXT(stored=True)
    string_form = TEXT(stored=True)
    tags = KEYWORD(stored=True)
    type_name = TEXT(stored=True)
    date_created = DATETIME
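A declarative SchemaClass like this can be handed straight to index.create_in; a minimal sketch follows, with an illustrative directory name and document values.

import os
from whoosh import index

if not os.path.exists("info_index"):
    os.mkdir("info_index")
ix = index.create_in("info_index", InfoSchema)
writer = ix.writer()
writer.add_document(objectID=u"1", name=u"Schema", module=u"whoosh.fields",
                    docstring=u"Represents the fields in an index.")
writer.commit()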
Example #3
import shutil
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
import os
from functools import reduce
import helpers
import json
from whoosh import index

schema = Schema(post_id=ID(stored=True),
                title=TEXT(stored=True),
                tokens=KEYWORD(stored=True, commas=True, scorable=True))

if __name__ == "__main__":
    if os.path.exists("indexdir"):
        shutil.rmtree("indexdir")

    os.mkdir("indexdir")

    ix = index.create_in("indexdir", schema)
    writer = ix.writer()

    with open('../dataset.json') as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            terms = set(post['BodyTokens'])
            terms = terms.union(set(helpers.preprocess_text(post['Title'])))

            child_body_terms = reduce(
                lambda child_one, child_two: child_one.union(child_two),
                map(lambda child: set(child['BodyTokens']),
                    post['Children'][1:]))
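            # Hypothetical continuation -- the original snippet stops above; presumably
            # the loop merges the child terms and writes a document matching the schema:
            terms = terms.union(child_body_terms)
            writer.add_document(post_id=post_id,
                                title=post['Title'],
                                tokens=",".join(sorted(terms)))
    writer.commit()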
Example #4
def postprocess(type):
    us_eastern_time = pytz.timezone('US/Eastern')
    datapath = './processes/daily/store/'
    cluster_index_datapath = './processes/daily/store/'
    result_file = datapath + 'result.json'
    with open(result_file) as data_file:
        result_data = json.load(data_file)

    all_results = []
    print(
        '\n\nCLUSTER_POSTPROC: Initiated, total {0} cluster(s) for postprocessing \n'
        .format(len(result_data)))

    for group in result_data:
        pick_list = []
        pick_timefiltered = []
        pick_toparticles = []
        peak_indexes = []
        peak_indexes_items = []
        peak_indexes_time = []
        peak_processing_articlesfromquery = []
        pick_theme = ' '.join(group['theme'])
        lda_theme = []
        ldathemes_combined = []

        for item in group['groups']:
            all_documents = []
            all_documents_title = []
            all_documents_text = []

            schema = Schema(title=TEXT(stored=True),
                            path=ID(stored=True),
                            content=TEXT(stored=True))

            os.makedirs(cluster_index_datapath + "indexes", exist_ok=True)
            ix = create_in(cluster_index_datapath + "indexes", schema)
            writer = ix.writer()

            for article in item['articles']:
                writer.add_document(title=article['title'],
                                    path=article['_id'],
                                    content=article['text'])
                all_documents.append(article)

            writer.commit()
            with ix.searcher() as searcher:
                query = whoosh.qparser.QueryParser(
                    "title", ix.schema,
                    group=whoosh.qparser.OrGroup).parse(pick_theme)
                results = searcher.search(query, limit=30)

                if len(results):
                    article_pick = results
                    for a in article_pick:
                        all_documents_title.append(a['title'])
                        all_documents_text.append(a['content'])

            pick_list.append(len(item['articles']))
            pick_timefiltered.append(item['time_filterby'])
            print(
                '  -* CLUSTER_POSTPROC: Total: {0}, Month: {1}, Query Result: {2}'
                .format(pick_list, pick_timefiltered,
                        len(all_documents_title)))

            querytext = ' '.join(group['theme'])
            article_pick = []
            toparticles_matched = []
            toparticles_matched_text = []
            with ix.searcher() as searcher:
                query = whoosh.qparser.QueryParser(
                    "title", ix.schema,
                    group=whoosh.qparser.OrGroup).parse(querytext)
                results = searcher.search(query)

                if len(results):
                    for a in results:
                        for article in item['articles']:
                            if a['path'] == article['_id']:
                                toparticles_matched_text.append(
                                    article['title'])
                                peak_processing_articlesfromquery.append(
                                    article)

                    article_pick = results[0:5]
                    for a in article_pick:
                        for article in item['articles']:
                            if a['path'] == article['_id']:
                                if 'text' in article:
                                    del article['text']
                                toparticles_matched.append(article)
                                # toparticles_matched_text.append(article['title'])

            pick_toparticles.append(toparticles_matched)

            print(
                '  -- CLUSTER_POSTPROC: Extracting features from the dataset /lda'
            )
            from nltk.corpus import stopwords
            from nltk.stem.wordnet import WordNetLemmatizer
            import string
            stop = set(stopwords.words('english'))
            exclude = set(string.punctuation)
            lemma = WordNetLemmatizer()

            def clean(doc):
                stop_free = " ".join(
                    [i for i in doc.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                return normalized

            def clean_text(raw_text):
                letters_only = re.sub('[^a-zA-Z]', ' ', str(raw_text))
                words = letters_only.lower().split()
                cachedStopWords = set(stopwords.words("english"))
                cachedStopWords.update([
                    'periscope', 'pbs', 'newshour', 'npr', 'watch',
                    'bloomberg', 'says', 'abc', 'news'
                ])
                useful_words = [x for x in words if x not in cachedStopWords]

                useful_words_string = ' '.join(useful_words)
                return useful_words_string

            doc_clean = [
                clean_text(doc).split() for doc in toparticles_matched_text
            ]

            import gensim
            from gensim import corpora
            dictionary = corpora.Dictionary(doc_clean)
            doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

            Lda = gensim.models.ldamodel.LdaModel
            ldamodel = Lda(doc_term_matrix,
                           num_topics=3,
                           id2word=dictionary,
                           passes=50)
            for idx, topic in ldamodel.show_topics(formatted=False,
                                                   num_words=3,
                                                   num_topics=3):
                lda_theme.append([w[0] for w in topic])

            for x in lda_theme:
                ldathemes_combined.extend(x)
            ldathemes_combined = np.unique(ldathemes_combined).tolist()

        print(
            '  -- CLUSTER_POSTPROC: Detect peaks with minimum height and distance filters.'
        )
        article_list_peak_pre = []
        for item in peak_processing_articlesfromquery:  # item['articles']:
            timeformat = '%Y-%m-%d-%H'  # mode <= 48
            if 'text' in item:
                del item['text']

            item['time_filter'] = dt.fromtimestamp(
                item['ts'], us_eastern_time).strftime(timeformat)
            article_list_peak_pre.append(item)

        article_list_peak_list = []
        article_list_peak_items = []
        article_list_peak_timekey = []
        sorted_articles = sorted(article_list_peak_pre,
                                 key=itemgetter('time_filter'))
        for key, gp in itertools.groupby(sorted_articles,
                                         key=lambda x: x['time_filter']):
            group_articles = {}
            group_articles['time_filterby'] = key
            group_articles['articles'] = list(gp)
            article_list_peak_list.append(len(group_articles['articles']))
            article_list_peak_items.append(group_articles)
            article_list_peak_timekey.append(key)
            print(
                '  -- CLUSTER_POSTPROC: peak time_filterby {0}, {1} article(s)'
                .format(key, len(group_articles['articles'])))

        print('  -- CLUSTER_POSTPROC: peak time_filterby total: {0}'.format(
            article_list_peak_list))
        peak_indexes = detect_peaks.detect_peaks(article_list_peak_list,
                                                 mph=7,
                                                 mpd=2).tolist()

        for item in peak_indexes:
            peak_indexes_items.append(article_list_peak_items[int(item)])
            peak_indexes_time.append(article_list_peak_timekey[int(item)])

        print(
            '  -- CLUSTER_POSTPROC: peak time_filterby detection result: {0} \n'
            .format(peak_indexes_time))

        print('  -- CLUSTER_POSTPROC: Final packaging')
        result_pick_data = {}
        result_pick_data['clusterid'] = str(uuid.uuid4())
        result_pick_data['topics_tfidf'] = group['theme']
        # result_pick_data['topics_lda'] = ldathemes_combined #group['theme']
        result_pick_data['topics_lda'] = lda_theme
        result_pick_data['namedentities'] = group['namedentity']
        result_pick_data['counts_total'] = pick_list
        result_pick_data['counts_highestrank'] = sum(
            len(x) for x in pick_toparticles)
        result_pick_data['months'] = pick_timefiltered
        result_pick_data['item_highestrank'] = pick_toparticles
        result_pick_data['item_total'] = all_documents
        result_pick_data['peaks'] = peak_indexes_time
        result_pick_data['peaks_item'] = peak_indexes_items

        then = dt.now(pytz.utc)
        timeest = str(then.astimezone(pytz.timezone('US/Eastern')))
        result_pick_data['timestamp'] = timeest
        all_results.append(result_pick_data)
        print(
            '  -* CLUSTER_POSTPROC: Postprocessing finished for the cluster: TFIDF {0} / LDA {1} \n\n'
            .format(group['theme'], lda_theme))

        time_file = dt.now(pytz.timezone('US/Eastern'))
        time_file_string = time_file.strftime("%Y%m%d-%H%M")

    os.makedirs('../data_publish_ready/' + type + '/', exist_ok=True)
    final_datapath_today = '../data_publish_ready/' + type + '/' + type + '_data.json'
    with open(final_datapath_today, 'w') as f:
        json.dump(all_results, f, indent=4, sort_keys=True)

    os.makedirs('../data_publish_ready/' + type + '/', exist_ok=True)
    final_datapath_today = '../data_publish_ready/' + type + '/' + type + '_' + time_file_string + '_data.json'
    with open(final_datapath_today, 'w') as f:
        json.dump(all_results, f, indent=4, sort_keys=True)

    print(
        'CLUSTER_POSTPROC: Saved data into json file in ../data_publish_ready/.'
    )
Example #5
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = []  #complete path
    c = []  #preview of the paste content
    paste_date = []
    paste_size = []
    paste_tags = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    # Search filename
    for path in r_serv_pasteName.smembers(q[0]):
        r.append(path)
        paste = Paste.Paste(path)
        content = paste.get_p_content()
        content_range = max_preview_char if len(
            content) > max_preview_char else len(content) - 1
        c.append(content[0:content_range])
        curr_date = str(paste._get_p_date())
        curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
        paste_date.append(curr_date)
        paste_size.append(paste._get_p_size())

    # Search full line
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1])
            path = x.items()[0][1]
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(
                content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[
                4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]

                l_tags.append((tag, complete_tag))

            paste_tags.append(l_tags)
        results = searcher.search(query)
        num_res = len(results)

    index_min = 1
    index_max = len(get_index_list())
    return render_template("search.html",
                           r=r,
                           c=c,
                           query=request.form['query'],
                           paste_date=paste_date,
                           paste_size=paste_size,
                           char_to_display=max_preview_modal,
                           num_res=num_res,
                           index_min=index_min,
                           index_max=index_max,
                           bootstrap_label=bootstrap_label,
                           paste_tags=paste_tags,
                           index_list=get_index_list(selected_index))
Example #6
def get_schema():
    return Schema(title=TEXT(stored=True),
                  category=TEXT(stored=True),
                  image=TEXT(stored=True),
                  date=DATETIME(stored=True),
                  description=TEXT(stored=True))
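A brief sketch (not from the original) of how a schema factory like this is typically consumed; the index directory and field values are illustrative.

import os
from datetime import datetime
from whoosh import index

if not os.path.exists("news_index"):
    os.mkdir("news_index")
ix = index.create_in("news_index", get_schema())
writer = ix.writer()
writer.add_document(title=u"Hello", category=u"general", image=u"hello.png",
                    date=datetime.now(), description=u"First indexed document")
writer.commit()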
Example #7
import lxml.html
import couchdb
import requests  #TODO: replace with httplib2
import ngrams

from ngrams import segment
from datetime import datetime
from urllib.parse import urlparse, urljoin
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

from whoosh import index
from whoosh.fields import Schema, TEXT, ID, NUMERIC, KEYWORD, NGRAMWORDS
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField

schema = Schema(title=TEXT(stored=True),
                url=ID(stored=True, unique=True),
                desc=ID(stored=True),
                description=TEXT(stored=True),
                rank=NUMERIC(stored=True, numtype=float),
                raw=TEXT,
                content=TEXT,
                keywords=KEYWORD,
                internal_links=TEXT,
                external_links=TEXT,
                ngramwords=NGRAMWORDS)

_ix = None


class XTRExcetion:
Example #8
    def get_schema(self):
        return Schema(title=TEXT(stored=True),
                      name=ID(stored=True),
                      content=TEXT(stored=True))
Example #9
def media_rebuild():
    print datetime.datetime.now()
    print 'media_rebuild'
    media_db = mysql_new.BaseDB(config.MYSQL_DEFINE_MEDIA)
    schema = Schema(movieid=ID(stored=True, unique=True),
                    title=TEXT(stored=True,
                               analyzer=analyzer_zhongwen,
                               field_boost=2.0),
                    pinyin_title=TEXT(stored=True,
                                      analyzer=analyzer_pinyin,
                                      field_boost=2.0),
                    director=KEYWORD(stored=True),
                    year=NUMERIC(stored=True, sortable=True),
                    score=NUMERIC(stored=True, sortable=True),
                    area=KEYWORD(stored=True),
                    description=TEXT(stored=True, field_boost=1.5),
                    pinyin_description=TEXT(stored=True, field_boost=1.0),
                    actor=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_actor=TEXT(stored=True, field_boost=1.0),
                    genres=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_genres=TEXT(stored=True, field_boost=1.0),
                    type=NUMERIC(stored=True),
                    source=NUMERIC(stored=True))
    SQL = '''SELECT `movieid`, `title`, `type`, `actor`, `genres`, `director`, `douban_score`, `introduction` as description, `year` FROM `media_info` WHERE `status`=1 AND type in ('movie', 'tv', 'teleplay', 'anime')
          '''
    res = media_db.query(SQL, ())
    if not res:
        return
    for info in res:
        if info.get('type') == 'movie':
            info['type'] = 1
        elif info.get('type') == 'teleplay':
            info['type'] = 2
        elif info.get('type') == 'tv':
            info['type'] = 3
        elif info.get('type') == 'anime':
            info['type'] = 4
        else:
            continue
    index_path = os.path.join(config.index_root_dir, 'media')
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    #ix = create_in(index_path, schema=schema)
    storage = FileStorage(index_path)
    ix = storage.open_index()
    writer = ix.writer()
    for info in res:
        pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8')))
        pinyin_description = ' '.join(
            lazy_pinyin(info.get('description').decode('utf8')))
        pinyin_actor = ''.join(info.get('actor', '').strip().split('/'))
        pinyin_actor = ' '.join(lazy_pinyin(pinyin_actor.decode('utf8')))
        pinyin_genres = ''.join(info.get('genres', '').strip().split('/'))
        pinyin_genres = ' '.join(lazy_pinyin(pinyin_genres.decode('utf8')))
        actor = ';'.join(info.get('actor', '').strip().split('/'))
        area = ';'.join(info.get('area', '').strip().split('/'))
        director = ';'.join(info.get('director', '').strip().split('/'))
        genres = ';'.join(info.get('genres', '').strip().split('/'))

        writer.add_document(movieid=info.get('movieid').decode('utf8'),
                            title=info.get('title').decode('utf8'),
                            pinyin_title=pinyin_title,
                            type=info.get('type'),
                            actor=actor.decode('utf8'),
                            pinyin_actor=pinyin_actor,
                            genres=genres.decode('utf8'),
                            pinyin_genres=pinyin_genres,
                            director=director.decode('utf8'),
                            score=info.get('douban_score'),
                            description=info.get('description').decode('utf8'),
                            pinyin_description=pinyin_description,
                            area=area.decode('utf8'),
                            year=info.get('year'))
    writer.commit(mergetype=writing.CLEAR)
Example #10
# Assumed imports for this snippet (not shown in the original); u() is shimmed for Python 3.
import os
import whoosh as wp
import whoosh.analysis
from whoosh import index
from whoosh.index import open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

u = str  # stand-in for the u() helper used below

my_analyzer = wp.analysis.RegexTokenizer() | wp.analysis.LowercaseFilter() | wp.analysis.StopFilter() | wp.analysis.StemFilter()

[token.text for token in my_analyzer(u("This is a dose-escalation study of the CDK4/6 inhibitor liposarcomas"))]

[token.text for token in my_analyzer(u('35 Years'))]

from nltk.stem import WordNetLemmatizer

new_analyzer = wp.analysis.RegexTokenizer() | wp.analysis.LowercaseFilter() | wp.analysis.StopFilter()| wp.analysis.StemFilter()

if not os.path.exists("./index"):
    os.mkdir("./index")
# Create the schema: name is the trial URL (ID), content is the detailed and brief
# summaries concatenated into one string, plus gender, minimum age and maximum age.
# If the minimum age is N/A it is set to 0; if the maximum age is N/A it is set to 1.
schema = Schema(name=ID(stored=True, analyzer=wp.analysis.RegexTokenizer()),
                Title=TEXT(stored=True, analyzer=new_analyzer, field_boost=1.5),
                content=TEXT(stored=True, analyzer=new_analyzer))
ix = index.create_in("./index", schema)
writer = ix.writer()

writer.add_document(name='https://clinicaltrials.gov/show/NCT02429089',
                    content="This is a dose-escalation study of the CDK4/6 inhibitor ribociclib in combination with\n      standard-dose doxorubicin.\n\n      PRIMARY OBJECTIVES:\n\n      I. To determine the recommended phase 2 dose (RP2D) of ribociclib in combination with\n      doxorubicin in subjects with advanced soft tissue sarcomas.\n\n      SECONDARY OBJECTIVES:\n\n      I. To assess preliminary anti-tumor activity of ribociclib in combination with doxorubicin in\n      subjects with advanced soft tissue sarcomas (Progression-Free Survival and Overall Response\n      Rate).\n\n      II. To characterize the safety and tolerability of ribociclib in combination with\n      doxorubicin.\n\n      A mandatory biopsy will be obtained after 7 days of ribociclib treatment.\n\n      TREATMENT: Patients receive ribociclib orally (PO) daily on days 1-7, and doxorubicin\n      intravenously (IV) on day 10. Treatment repeats every 21 days for up to 6 courses in the\n      absence of disease progression or unacceptable toxicity. After 6 courses, patients may\n      receive maintenance treatment with ribociclib PO daily on days 1-21. Courses repeat every 28\n      days in the absence of disease progression or unacceptable toxicity.\n\n      STARTING DOSE COHORT: Ribociclib 400 mg with doxorubicin 75 mg/m2.\n\n      FOLLOW-UP: After completion of study treatment, patients are followed up at 30 days and then\n      every 12 weeks for 12 months.\n\n      RATIONALE: Over-expression of CDK4 or loss of the CDK4 inhibitor p16 are common in sarcomas\n      and result in a selective growth advantage by bypassing normal cell cycle checkpoints. Intact\n      pRb is required for CDK4/6 inhibition to be effective, therefore all eligible subjects must\n      have documented pRb expression by IHC on archival tissue. Synergy between CDK4 inhibition and\n      chemotherapy has been documented in preclinical models when given sequentially, suggestion a\n      role for cell cycle synchronization.",
                    Title="dose-escalation study of the CDK4/6 and LIPOSARCOMA")
writer.commit()

ix = open_dir("./index/")
query_b = QueryParser('content', ix.schema).parse('sarcoma cdk4')
with ix.searcher() as srch:
    res_b = srch.search(query_b, limit=10)
    for i in res_b:
        print(i['content'])
Example #11
        searcher = self._index.searcher()
        results = searcher.search(query_object)
        return results


if __name__ == '__main__':
    import shutil


    index_path = 'index-test'
    try:
        shutil.rmtree(index_path)
    except OSError:
        pass

    schema = Schema(filename=TEXT(stored=True), id=ID(stored=True), content=TEXT)
    my_index = Index(index_path, schema)

    doc_1 = {u'filename': u'a.txt', u'id': u'1', u'content': u'first document'}
    doc_2 = {u'filename': u'b.txt', u'id': u'2', u'content': u'2nd document'}
    doc_3 = {u'filename': u'c.txt', u'id': u'3', u'content': u'3rd document'}
    documents = [doc_1, doc_2, doc_3]
    my_index.add_documents(documents)

    result = my_index.search(u'first', u'content')
    # assert len(result) == 1
    # assert result[0][u'id'] == u'1'

    doc_4 = {u'filename': u'a.txt', u'id': u'4', u'content': u'not first'}
    my_index.add_document(**doc_4)
Example #12
import pandas as pd
from whoosh.fields import Schema, TEXT
from whoosh import index
import os, os.path
from whoosh import qparser
#The whoosh.scoring module contains implementations of various scoring algorithms; the default is BM25F.
#from whoosh import scoring
#Load the data
qadata=pd.read_csv("D:/NLP project/information retrieval/qa_Electronics.csv")
#update the null values answer field with default value
qadata["answer"].fillna("Please Provide more information", inplace = True)
#Schema is created to index on question and answer fields
schema = Schema(question=TEXT(stored=True, field_boost=2.0),
                answer=TEXT(stored=True, field_boost=2.0))
#Function that adds one row of the dataframe to the index
def add_stories(i, dataframe, writer):
    writer.update_document(question=str(dataframe.loc[i, "question"]),
                           answer=str(dataframe.loc[i, "answer"]))
# create and populate index
def populate_index(dirname, dataframe, schema):
    # Checks for existing index path and creates one if not present
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    print("Creating the Index")
    ix = index.create_in(dirname, schema)
    with ix.writer() as writer:
        # Imports stories from pandas df
        print("Populating the Index")
        for i in dataframe.index:
            add_stories(i, dataframe, writer)
#Populate index of the csv file
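# The call implied by the comment above is not shown; it was presumably something
# like the following (the index directory name is a guess):
populate_index("qa_index", qadata, schema)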
Example #13
"""Module for searching the toolshed tools within all repositories"""
import logging
import os

import whoosh.index
from whoosh import scoring
from whoosh.fields import (Schema, STORED, TEXT)
from whoosh.qparser import MultifieldParser

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util import unicodify

log = logging.getLogger(__name__)

schema = Schema(name=TEXT(stored=True),
                description=TEXT(stored=True),
                owner=TEXT(stored=True),
                id=TEXT(stored=True),
                help=TEXT(stored=True),
                version=TEXT(stored=True),
                repo_name=TEXT(stored=True),
                repo_owner_username=TEXT(stored=True),
                repo_id=STORED)


class ToolSearch:
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term
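        Hypothetical sketch only -- the original method body is truncated here.
        """
        # Illustrative query using the module-level schema and the imports above;
        # the index location is an assumption, not taken from the source.
        parser = MultifieldParser(["name", "description", "help", "repo_name"],
                                  schema=schema)
        index_dir = "/path/to/toolshed/whoosh_index"  # assumed location
        ix = whoosh.index.open_dir(index_dir)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            hits = searcher.search_page(parser.parse(unicodify(search_term)),
                                        page, pagelen=page_size)
            return [hit.fields() for hit in hits]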
Example #14
from whoosh import index
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
import os.path
from whoosh.index import open_dir

englishPrefix = 'en:'
frenchPrefix = 'fr:'
spanishPrefix = 'es:'

path = u"shortenedFile.txt"
# On Windows an encoding has to be specified explicitly; on Linux it can be omitted.
file = open(path, "r", encoding="utf8")
schema = Schema(english=TEXT(stored=True),
                spanish=TEXT(stored=True),
                french=TEXT(stored=True))

if not os.path.exists("index"):
    os.mkdir("index")

ix = create_in("index", schema)
ix = open_dir("index")
writer = ix.writer()

for line in file:
    line = line.split(';')
    indexEs = [i for i, s in enumerate(line) if spanishPrefix in s]
    indexEn = [i for i, s in enumerate(line) if englishPrefix in s]
    indexFr = [i for i, s in enumerate(line) if frenchPrefix in s]
    if indexEn:
        english = line[indexEn[0]].split(':')
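        # Hypothetical continuation (the original snippet stops here); the exact
        # handling of the source file format is a guess:
        english_text = english[1].strip() if len(english) > 1 else ""
        spanish_text = line[indexEs[0]].split(':', 1)[1].strip() if indexEs else ""
        french_text = line[indexFr[0]].split(':', 1)[1].strip() if indexFr else ""
        writer.add_document(english=english_text,
                            spanish=spanish_text,
                            french=french_text)

writer.commit()
file.close()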
Example #15
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(["text", "text_private"],
                                                 schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create if doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def _add_obj_strings(self, writer, obj_dict):
        """Add or update an object to the index."""
        try:
            writer.update_document(
                type=obj_dict["class_name"].lower(),
                handle=obj_dict["handle"],
                private=obj_dict["private"],
                text=obj_dict["string"],
                text_private=obj_dict["string_private"],
                change=obj_dict["change"],
            )
        except:
            current_app.logger.error("Failed adding object {}".format(
                obj_dict["handle"]))

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                self._add_obj_strings(writer, obj_dict)

    def _get_object_timestamps(self):
        """Get a dictionary with the timestamps of all objects in the index."""
        d = {}
        with self.index().searcher() as searcher:
            for fields in searcher.all_stored_fields():
                class_name = fields["type"]
                if class_name not in d:
                    d[class_name] = set()
                d[class_name].add((fields["handle"], fields["change"]))
        return d

    def _get_update_info(self, db_handle: DbReadBase):
        """Get a dictionary with info about changed objects in the db."""
        db_timestamps = get_object_timestamps(db_handle)
        ix_timestamps = self._get_object_timestamps()
        deleted = {}
        updated = {}
        new = {}
        for class_name in db_timestamps:
            db_handles = set(handle for handle, _ in db_timestamps[class_name])
            ix_handles = set(
                handle
                for handle, _ in ix_timestamps.get(class_name.lower(), set()))
            # new: not present in index
            new[class_name] = db_handles - ix_handles
            # deleted: not present in db
            deleted[class_name] = ix_handles - db_handles
            # changed: different (new or modified) in db
            changed_timestamps = db_timestamps[class_name] - ix_timestamps.get(
                class_name.lower(), set())
            changed_handles = set(handle for handle, _ in changed_timestamps)
            # updated: changed and present in the index
            updated[class_name] = changed_handles & ix_handles
        return {"deleted": deleted, "updated": updated, "new": new}

    def delete_object(self, writer, handle: str):
        """Delete an object from the index."""
        writer.delete_by_term("handle", handle)

    def add_or_update_object(self, writer, handle: str, db_handle: DbReadBase,
                             class_name: str):
        """Add an object to the index or update it if it exists."""
        obj_dict = obj_strings_from_handle(db_handle, class_name, handle)
        self._add_obj_strings(writer, obj_dict)

    def get_writer(self, overwrite: bool = False, use_async: bool = False):
        """Get a writer instance.

        If `use_async` is true, use an `AsyncWriter`.
        """
        idx = self.index(overwrite=overwrite)
        if use_async:
            return AsyncWriter(idx, delay=0.1)
        return idx.writer()

    def reindex_incremental(self, db_handle: DbReadBase):
        """Update the index incrementally."""
        update_info = self._get_update_info(db_handle)
        with self.index(overwrite=False).writer() as writer:
            # delete objects
            for class_name, handles in update_info["deleted"].items():
                for handle in handles:
                    self.delete_object(writer, handle)
            # add objects
            for class_name, handles in update_info["new"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)
            # update objects
            for class_name, handles in update_info["updated"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def _get_sorting(
        self,
        sort: Optional[List[str]] = None,
    ) -> Optional[List[FieldFacet]]:
        """Get the appropriate field facets for sorting."""
        if not sort:
            return None
        facets = []
        allowed_sorters = {"type", "change"}
        for sorter in sort:
            _field = sorter.lstrip("+-")
            if _field not in allowed_sorters:
                continue
            reverse = sorter.startswith("-")
            facets.append(FieldFacet(_field, reverse=reverse))
        return facets

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
        sort: Optional[List[str]] = None,
    ):
        """Search the index.

        If `include_private` is true, include also private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            sortedby = self._get_sorting(sort)
            results = searcher.search_page(parsed_query,
                                           page,
                                           pagesize,
                                           mask=mask,
                                           sortedby=sortedby)
            return results.total, [self.format_hit(hit) for hit in results]
Example #16
## Add fields programmatically by parsing the first line of the file

Searchable = ('Text', )

ix = index.create_in(SSAWG_index_dir, schema)
writer = ix.writer()
for field in fieldnames:
    if field in Searchable:
        print(field)
        writer.add_field(field,
                         NGRAMWORDS(minsize=NgramMin,
                                    maxsize=NgramMax,
                                    stored=True)
                         )  # May need to adjust size to allow for description
    else:
        writer.add_field(field, TEXT(stored=True, chars=True))
mtgCnt = 0
for Meeting in Meetings:  # Text is NGRAMMED, link is stored
    #print('-----------------')
    #print(str(Meeting.text))
    StrippedText = ''
    for item in Meeting.find_all('li'):  # for each list item...
        if item.text:
            CurStrip = item.text
        else:
            CurStrip = ''
        StrippedText += CurStrip.strip() + '\n'
    writer.add_document(MeetingLink=str(Meeting.a),
                        Text=StrippedText)  # assemble document
    #writer.add_document(MeetingLink=str(Meeting.a),Text=str(Meeting.text))  # assemble document
    mtgCnt += 1
Example #17
# - avoid removing "stop words" from text
#
IDANALYZER = IDTokenizer()

# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text
#
# This is useful to:
# - avoid removing "stop words" from text
# - search case-insensitively
#
PATHANALYZER = RegexTokenizer() | LowercaseFilter()

#INDEX SCHEMA DEFINITION
SCHEMA = Schema(
    fileid=ID(unique=True),
    owner=TEXT(analyzer=EMAILADDRANALYZER),
    # this field preserves case of repository name for exact matching
    repository_rawname=TEXT(analyzer=IDANALYZER),
    repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
    path=TEXT(stored=True, analyzer=PATHANALYZER),
    content=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True, analyzer=PATHANALYZER))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)
Example #18
def find_unused_templates():
    start = time.perf_counter()
    print('Finding all unused templates...')
    print('  Getting global templates...')
    global_templates_files, global_templates = find_global_templates()
    print('   Done.\n  Getting app templates...')
    app_templates_files, app_templates = find_app_templates()
    print('   Done.')
    templates = global_templates + app_templates
    template_files = global_templates_files + app_templates_files
    # templates.sort()
    template_files.sort()

    print('  Getting python files...')
    py_files, pys = find_py_files()
    print('   Done.')
    all_files = py_files + template_files

    tl_count = [0 for t in templates]
    unused_templates = []

    print('  Creating Index', end='')
    tmp_dir = TemporaryDirectory()

    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
    ix = create_in(tmp_dir.name, schema)
    writer = ix.writer()

    for filename in all_files:
        print('.', end='')  # , flush=True)
        with open(filename, 'r') as f:
            # print('WHOOSH', filename, filename, f)
            # content = '/n'.join(f.readlines())
            # if content:
            #     print('HAS CONTENT')
            #     print(content)
            u_filename = filename
            try:  # Python2
                u_filename = unicode(filename)
            except NameError:
                pass
            writer.add_document(title=u_filename, path=u_filename,
                                content=six.u('\n'.join(f.readlines())))
                                # content=content)
    print('')  # , flush=True)
    writer.commit()
    print('   Done.')

    print('  Searching through templates for references', end='')  # , flush=True)
    with ix.searcher() as searcher:
        for count, template in enumerate(templates):
            print('.', end="")  # , flush=True)
            query = QueryParser("content", ix.schema).parse(template)
            results = searcher.search(query)
            if len(results) < 1:
                unused_templates.append(template)
    print('')  # , flush=True)
    print('   Done.')

    if not unused_templates:
        print('No unused templates found.')
    else:
        print('\nUnused templates:')
        for template in unused_templates:
            print(template)
    end = time.perf_counter()
    print('Finished in ' + str(end - start) + ' seconds.')
    return unused_templates
Example #19
def define_schema():
    print("Schema Definition")
    return Schema(path=ID(stored=True),
                  title=TEXT(stored=True),
                  content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
Example #20
def populate_news(jugador, equipo):
    url_noticias = "https://www.google.es/search?q=" + quote_plus(
        jugador) + "+" + quote_plus(equipo) + "&tbm=nws&source=lnms"
    path = os.getcwd()
    dir_exe = os.path.join(path, "BiKlopp/resources/chromedriver.exe").replace(
        "\\", "/")

    driver = webdriver.Chrome(dir_exe, options=options)

    driver.get(url_noticias)
    html_page_noticias = BeautifulSoup(driver.page_source, "html5lib")

    # html_archivo = open(os.path.join(path, "BiKlopp/resources/noticias_google.html").replace("\\","/"), "r").read()
    # html_page_noticias = BeautifulSoup(html_archivo,"html5lib")
    noticias_html = html_page_noticias.find_all("div", {"class": "gG0TJc"})

    mini_noticias_html = html_page_noticias.find_all("div",
                                                     {"class": "YiHbdc"})

    schema_noticias = Schema(titulo=TEXT(stored=True),
                             link=TEXT(stored=True),
                             periodico=TEXT(stored=True),
                             desc=TEXT(stored=True),
                             fecha=DATETIME(stored=True, sortable=True))

    if not os.path.isdir("Index_news"):
        os.mkdir("Index_news")
    ix = create_in("Index_news", schema=schema_noticias)
    with ix.writer() as writer_noticias:
        i = 0
        for noticia_html in noticias_html:
            titulo = noticia_html.find("a", {"class": "l lLrAF"}).get_text()
            link = noticia_html.find("a", {"class": "l lLrAF"}).get("href")
            periodico = noticia_html.find("span", {
                "class": "xQ82C"
            }).get_text()
            fecha_html = noticia_html.find("span", {
                "class": "fwzPFf"
            }).get_text()
            fecha = dateparser.parse(fecha_html)
            desc = noticia_html.find("div", {"class": "st"}).get_text()

            writer_noticias.add_document(titulo=str(titulo),
                                         link=str(link),
                                         periodico=str(periodico),
                                         desc=str(desc),
                                         fecha=fecha)

            i = i + 1

        for mini_noticia_html in mini_noticias_html:
            titulo = mini_noticia_html.find("a", {
                "class": "RTNUJf"
            }).get_text()
            link = mini_noticia_html.find("a", {"class": "RTNUJf"}).get("href")
            periodico = mini_noticia_html.find("span", {
                "class": "xQ82C"
            }).get_text()
            fecha_html = mini_noticia_html.find("span", {
                "class": "fwzPFf"
            }).get_text()
            fecha = dateparser.parse(fecha_html)

            writer_noticias.add_document(titulo=str(titulo),
                                         link=str(link),
                                         periodico=str(periodico),
                                         desc=None,
                                         fecha=fecha)

            i = i + 1

    driver.quit()
Example #21
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #22
import os

from whoosh.fields import Schema, ID, KEYWORD, TEXT
from whoosh.index import create_in, open_dir, exists_in
from whoosh.query import Term
from whoosh.qparser import QueryParser
from pymongo import MongoClient
from bson.objectid import ObjectId

# Set up mongoClient and to DB compareDB and to collection phones
mongo = MongoClient('localhost', 27017)['compareDB']
products = mongo.phones

# Set up the index schema: we index the product name as text and the Mongo
# document id as an ID; both are stored so they can be returned from results.
schema = Schema(productname=TEXT(stored=True), nid=ID(stored=True))

# Create the index dir if it does not exist.
if not os.path.exists("index"):
    os.mkdir("index")

# Initialize index
ix = create_in("index", schema)
ix = open_dir("index")
# Fill index with posts from DB
writer = ix.writer()

for product in products.find():
    try:
        fullProductName = product["Brand"] + " " + product["Model Name"]
        writer.add_document(productname=fullProductName,
Example #23
def get_more_search_result():
    query = request.form['query']
    q = []
    q.append(query)
    page_offset = int(request.form['page_offset'])
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    path_array = []
    preview_array = []
    date_array = []
    size_array = []
    list_tags = []

    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, page_offset, num_elem_to_get)
        for x in results:
            path = x.items()[0][1]
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(
                content) > max_preview_char else len(content) - 1
            preview_array.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[
                4:6] + '/' + curr_date[6:]
            date_array.append(curr_date)
            size_array.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]

                l_tags.append(tag)
            list_tags.append(l_tags)

        to_return = {}
        to_return["path_array"] = path_array
        to_return["preview_array"] = preview_array
        to_return["date_array"] = date_array
        to_return["size_array"] = size_array
        to_return["list_tags"] = list_tags
        to_return["bootstrap_label"] = bootstrap_label
        if len(path_array) < num_elem_to_get:  #pagelength
            to_return["moreData"] = False
        else:
            to_return["moreData"] = True

    return jsonify(to_return)
Example #24
def get_schema_producto():
    return Schema(marca=TEXT(stored=True),
                  nombre=TEXT(stored=True),
                  descripcion=TEXT(stored=True),
                  url_imagen=TEXT(stored=True),
                  caracteristicas=KEYWORD(stored=True))
Example #25

import whoosh.fields
import whoosh.formats
from whoosh.fields import TEXT
from whoosh.formats import CharacterBoosts, Characters, Existence, Format, Frequency, PositionBoosts, Positions

F = whoosh.formats.Frequency

FORMATS = [
    CharacterBoosts, Characters, Existence, Format, Frequency, PositionBoosts,
    Positions
]

#RegexTokenizer, 	RegexAnalyzer,KeywordAnalyzer,CompositeAnalyzer,CommaSeparatedTokenizer, Tokenizer
#_terms, _segment, _gen, most_frequent_terms, most_distinctive_terms, lexicon,iter_field,frequency
#._terms._fieldmap

EXTEXT = TEXT(field_boost=2.0, stored=True, sortable=True)

#search
#searcher.lexicon('fullname')
"""analyzer = format = scorable = stored = unique = vector = None
    indexed = True
    multitoken_query = "default"
    sortable_typecode = None
    column_type = None
    
    searcher.all_stored_fields()
    
    
    """#unique

META = whoosh.fields.MetaSchema
Example #26
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from eight import *

from whoosh.fields import TEXT, ID, Schema

bw2_schema = Schema(
    name=TEXT(stored=True, sortable=True),
    comment=TEXT(stored=True),
    product=TEXT(stored=True, sortable=True),
    categories=TEXT(stored=True),
    location=TEXT(stored=True, sortable=True),
    database=TEXT(stored=True),
    code=ID(unique=True, stored=True),
)
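A minimal sketch (with illustrative paths and values) of building and querying an index with bw2_schema:

import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("bw2_index"):
    os.mkdir("bw2_index")
ix = index.create_in("bw2_index", bw2_schema)
with ix.writer() as writer:
    writer.add_document(name=u"steel production", product=u"steel",
                        location=u"GLO", database=u"example_db", code=u"abc123")
with ix.searcher() as searcher:
    results = searcher.search(QueryParser("name", ix.schema).parse(u"steel"))
    print([hit["name"] for hit in results])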
Example #27
    help="Number of last entries to index (Default: 5) - 0 to index all documents",
)
argParser.add_argument(
    "-n",
    action="store_true",
    default=False,
    help="lookup complete cpe (Common Platform Enumeration) name for vulnerable configuration to add in the index",
)
args = argParser.parse_args()

c = CveHandler(namelookup=args.n)

indexpath = Configuration.getIndexdir()

schema = Schema(
    title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT
)

if not os.path.exists(indexpath):
    os.mkdir(indexpath)

if not exists_in(indexpath):
    ix = create_in(indexpath, schema)
else:
    ix = open_dir(indexpath)


def dumpallcveid(entry=None):
    return getCVEIDs if not entry else getCVEIDs(int(entry))

Example #28
import json
import os.path

from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=ID(stored=True),
                ingredients=TEXT(analyzer=StemmingAnalyzer()))

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)

with open("full_format_recipes.json") as file:
    recipes = json.load(file)

writer = ix.writer()
num_recipes = len(recipes)
i = 0
for i, recipe in enumerate(recipes):
    if i % 1000 == 0:
        print("Building the index: " + str(round(i * 100.0 / num_recipes)) +
              "%\n")
    if recipe == {}:
        continue
    if "title" not in recipe or "ingredients" not in recipe:
        b = 5
    writer.add_document(title=str(i),
                        ingredients=" ".join(recipe["ingredients"]))
Example #29
    # Subscriber
    sub_config_section = 'PubSub_Global'
    sub_name = 'indexer'

    config_section = 'PubSub_Global'
    config_channel = 'channel'
    subscriber_name = 'indexer'

    h = Helper.Redis_Queues(config_section, config_channel, subscriber_name)

    # Indexer configuration - index dir and schema setup
    indexpath = h.config.get("Indexer", "path")
    indexertype = h.config.get("Indexer", "type")
    if indexertype == "whoosh":
        schema = Schema(title=TEXT(stored=True),
                        path=ID(stored=True, unique=True),
                        content=TEXT)
        if not os.path.exists(indexpath):
            os.mkdir(indexpath)
        if not exists_in(indexpath):
            ix = create_in(indexpath, schema)
        else:
            ix = open_dir(indexpath)

    # LOGGING #
    publisher.info("""ZMQ Indexer is Running""")

    while True:
        try:
            message = h.redis_rpop()
Example #30
from jieba.analyse.analyzer import ChineseAnalyzer
from whoosh.fields import Schema
from whoosh.fields import TEXT

analyzer = ChineseAnalyzer()

schema = Schema(
    question=TEXT(stored=True, analyzer=analyzer),
    answer=TEXT(stored=True),
)
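A short end-to-end sketch using this schema; the index directory and the question/answer pair are illustrative.

import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("qa_index"):
    os.mkdir("qa_index")
ix = index.create_in("qa_index", schema)
with ix.writer() as writer:
    writer.add_document(question=u"如何创建索引", answer=u"使用 whoosh 的 create_in 函数")
with ix.searcher() as searcher:
    parser = QueryParser("question", ix.schema)
    for hit in searcher.search(parser.parse(u"索引")):
        print(hit["question"], hit["answer"])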