def analyzeText(text):
    """
    Analyzes the text to facilitate word searching.

    Creates an analyzer for the Portuguese language, which applies a lowercase
    filter, a stop word filter and a stemming filter to the text it receives.

    Parameters
    ----------
    text : string
        Document in simple text format

    Returns
    -------
    string
        Document text after being processed

    """
    languageAnalyzer = LanguageAnalyzer("pt")
    langText = ""

    for token in languageAnalyzer(text):
        langText += "".join(token.text)
        langText += " "
    return langText
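
A minimal sketch of what the LanguageAnalyzer("pt") pipeline above produces; the sample sentence is illustrative. The analyzer lowercases the tokens, drops Portuguese stop words and stems the remainder.

from whoosh.analysis import LanguageAnalyzer

ana = LanguageAnalyzer("pt")
# Tokens come back lowercased, with stop words removed and the rest stemmed.
print([token.text for token in ana("Os meninos estavam correndo no parque")])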
Example #2
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            (SimpleAnalyzer(), True),
            (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
            (StandardAnalyzer(), False),
            (StemmingAnalyzer(), False),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append((LanguageAnalyzer(lang_code), False))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append((NgramAnalyzer(4), False))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer, combine in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())
                words.update(new_words)
                # Add combined strings to allow matching against multiple-word
                # entries, combining up to 5 consecutive words
                if combine:
                    words.update([
                        ' '.join(new_words[x:y]) for x in range(len(new_words))
                        for y in range(1, min(x + 6,
                                              len(new_words) + 1)) if x != y
                    ])

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if '' in words:
            words.remove('')

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Can not use __in as we want case insensitive lookup
            dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
                '|'.join([re_escape(word) for word in words])))

        return dictionary
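
A standalone sketch of the word-combination step above, cleaned up so that only non-empty slices of up to 5 consecutive tokens are produced; the token list is illustrative.

def combine_words(tokens, max_len=5):
    # Join every run of 1..max_len consecutive tokens into one phrase.
    return {
        " ".join(tokens[x:y])
        for x in range(len(tokens))
        for y in range(x + 1, min(x + max_len + 1, len(tokens) + 1))
    }

print(sorted(combine_words(["translate", "this", "string"])))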
Example #3
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join(re_escape(word) for word in islice(words, 1000))),
        )
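
A quick sketch of the regular expression this query builds for a small word set, using re.escape in place of Weblate's re_escape helper; the words are illustrative.

import re

words = ["plural", "context"]
pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
    '|'.join(re.escape(word) for word in words))
print(pattern)
# source__iregex performs a case-insensitive match, roughly equivalent to:
print(bool(re.search(pattern, "the Context of a sentence", re.IGNORECASE)))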
Example #4
    def get_words(self, unit):
        """
        Return a list of word pairs for a unit.
        """
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            StandardAnalyzer(),
            StemmingAnalyzer(),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append(LanguageAnalyzer(lang_code))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(
                        [token.text for token in analyzer(force_text(text))]
                    )
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())

        # Grab all words in the dictionary
        dictionary = self.filter(
            project=unit.translation.subproject.project,
            language=unit.translation.language
        )

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Can not use __in as we want case insensitive lookup
            query = Q()
            for word in words:
                query |= Q(source__iexact=word)

            # Filter dictionary
            dictionary = dictionary.filter(query)

        return dictionary
Example #5
def exec_comp():
    '''
    Calculates the MRR (Mean Reciprocal Rank) and saves a table with the MRR
    evaluation for every search engine configuration.
    '''
    #text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', "LanguageAnalyzer('en')"
    ]  # analyzer labels used for the plot and for the MRR table

    i = 0  #counter
    mrrs = []  #list where MRR values for each SE configuration will be stored

    #scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    #ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    #combinations for every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save the results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR for this configuration
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv",
                       index=False)  #store MRR table
Example #6
    def _get_schema(self, language):
        lang_analyzer = LanguageAnalyzer(language)
        return Schema(
            key=ID(stored=True, unique=True),
            assignee=ID(stored=True),
            reporter=ID(stored=True),
            status=ID(stored=True),
            summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
            description=TEXT(analyzer=lang_analyzer),
            comments_str=TEXT(analyzer=lang_analyzer),
            labels=KEYWORD(stored=True, lowercase=True),
            components=KEYWORD(stored=True, lowercase=True),
        )
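
A minimal sketch of a schema like the one above in use; the index directory name, the field subset and the document values are illustrative.

import os

from whoosh import index
from whoosh.analysis import LanguageAnalyzer
from whoosh.fields import ID, KEYWORD, TEXT, Schema

lang_analyzer = LanguageAnalyzer("en")
schema = Schema(
    key=ID(stored=True, unique=True),
    summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
    labels=KEYWORD(stored=True, lowercase=True),
)

os.makedirs("issue_index", exist_ok=True)
ix = index.create_in("issue_index", schema)
with ix.writer() as writer:
    writer.add_document(key=u"PROJ-1",
                        summary=u"Crashes when saving large attachments",
                        labels=u"crash storage")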
Example #7
    def __init__(self, index_path, language):
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os
        import sys

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        self.schema = Schema(path=ID(unique=True, stored=True),
                             body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                     schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
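
Hypothetical usage of the class above (the class name SearchIndex, the index path and the query string are assumptions); FuzzyTermPlugin enables the trailing "~" syntax for approximate matching.

idx = SearchIndex("/tmp/notes_index", "en")
query = idx.query_parser.parse(u"whoosh~2")   # match terms within 2 edits
with idx.search_index.searcher() as searcher:
    for hit in searcher.search(query, limit=10):
        print(hit["path"])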
Example #8
def index_all():
    db_util.init_db()

    stemmer = Stemmer.Stemmer('russian')
    whoosh_ru_stemmer = RussianStemmer()

    analyzer = LanguageAnalyzer('russian')

    schema = Schema(transcription_id=ID(stored=True),
                    transcript=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH):
        os.makedirs(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH)

    # recreate new index
    ix = create_in(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH, schema)

    writer = ix.writer()

    for item in db_util.get_all_items():
        writer.add_document(transcription_id=str(item.id).decode('utf-8'),
                            transcript=item.transcription)

    writer.commit()
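
A short sketch of querying the index created above; the query word is illustrative. Because LanguageAnalyzer('russian') stems both the indexed text and the query, inflected forms of the word still match.

from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    query = QueryParser("transcript", ix.schema).parse(u"привет")
    for hit in searcher.search(query, limit=10):
        print(u"{0}: {1}".format(hit["transcription_id"], hit["transcript"]))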
Example #9
    def get_terms(self, unit):
        """Return list of term pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - basic simple analyzer to split on non-word chars
        # - simple analyzer just splits words based on regexp to catch in word dashes
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer() | stopfilter,
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError):
                    report_error(cause="Term words parsing")
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no glossary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
            # Use a regex as that utilizes the pg_trgm index
            results = self.filter(
                source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                    "|".join(re_escape(word) for word in words)
                )
            )
        else:
            # MySQL
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                ), )

        return results.for_project(unit.translation.component.project).filter(
            language=unit.translation.language)
Example #10
from __future__ import unicode_literals

import copy

from django.conf import settings
from django.test import TestCase, override_settings
from django.utils import timezone
from wagtail.search.tests.test_backends import BackendTests
from wagtail.tests.search import models

from whoosh.analysis import LanguageAnalyzer

sv_search_setttings_language = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_language['default']['LANGUAGE'] = 'sv'

analyzer_swedish = LanguageAnalyzer('sv')
sv_search_setttings_analyzer = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_analyzer['default']['ANALYZER'] = analyzer_swedish


class TestWhooshSearchBackend(BackendTests, TestCase):
    backend_path = 'wagtail_whoosh.backend'

    def test_facet(self):
        pass

    def test_facet_tags(self):
        pass

    def test_facet_with_nonexistent_field(self):
        pass
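
A sketch of how the copied settings above are typically applied, using Django's standard override_settings decorator; the subclass name is illustrative.

@override_settings(WAGTAILSEARCH_BACKENDS=sv_search_setttings_analyzer)
class TestWhooshSearchBackendSwedishAnalyzer(TestWhooshSearchBackend):
    pass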
Example #11
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)

    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(
            search_text, all_full)

        # For each search, in the table of searched words: if the word already exists, its count is incremented by 1;
        # otherwise it is inserted with a count of 1.
        # If the search input is not a single word but a phrase, Stop and Stemming filters are applied,
        # and then the keywords (tokens) are extracted.
        """
        “Stop” words are words that are so common it’s often counter-productive to index them, such as “and”, 
        “or”, “if”, etc. The provided analysis.StopFilter lets you filter out stop words, and includes a default 
        list of common stop words.
        Stemming is a heuristic process of removing suffixes (and sometimes prefixes) from words to arrive (hopefully, 
        most of the time) at the base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(
                set([
                    token.text for token in a_filters(
                        analyzer(search_text, no_morph=True))
                ]))
        else:
            keywords = [search_text]

        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full

    # Update the most searched words
    # Update the total number of searches and the number of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs

    # Paginate the list of theses
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)
    the_data = {
        'tesis_list': render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        # serializers.serialize("json", [x for x in total_full]),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    # the_data = serializers.serialize("json", [x for x in total_full])
    return JsonResponse(the_data)
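
For reference, a standalone sketch of the stop-word removal and stemming described in the comments above, built from Whoosh filter primitives; the phrase and the language code are illustrative.

from whoosh.analysis import LowercaseFilter, RegexTokenizer, StemFilter, StopFilter

# Tokenize, lowercase, drop Spanish stop words, then stem what remains.
chain = (RegexTokenizer() | LowercaseFilter()
         | StopFilter(lang="es") | StemFilter(lang="es"))
print([t.text for t in chain(u"las tesis publicadas sobre ingeniería de software")])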
Example #12
import copy

from django.conf import settings
from django.test import TestCase, override_settings

from wagtail.search.index import AutocompleteField
from wagtail.search.tests.test_backends import BackendTests
from wagtail.tests.search import models

from whoosh.analysis import LanguageAnalyzer
from whoosh.analysis.ngrams import NgramFilter

sv_search_setttings_language = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_language["default"]["LANGUAGE"] = "sv"

analyzer_swedish = LanguageAnalyzer("sv")
sv_search_setttings_analyzer = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_analyzer["default"]["ANALYZER"] = analyzer_swedish

indexing_resources = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
indexing_resources["default"]["MEMORY"] = 2048
indexing_resources["default"]["PROCS"] = 2

ngram_length = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
ngram_length["default"]["NGRAM_LENGTH"] = (3, 9)


class TestWhooshSearchBackend(BackendTests, TestCase):
    backend_path = "wagtail_whoosh.backend"

    def test_facet(self):
        pass
Example #13
# customize highlight formatter
class HighlightFormatter(Formatter):
    def format_token(self, text, token, replace=False):
        # Use the get_text function to get the text corresponding to the
        # token
        tokentext = get_text(text, token, replace)

        # Return the text as you want it to appear in the highlighted
        # string
        return "<mark>%s<mark>" % tokentext


hf = HighlightFormatter()  # formatter for highlighting
wf = WholeFragmenter()  # fragmenter for splitting words
es_ana = LanguageAnalyzer("es")  # Whoosh analyzer for Spanish

# Load Whoosh index
index = open_dir("whoosh_index")

# Initialize Whoosh parser
parser = QueryParser("text", schema=index.schema)


@app.route("/")
def load_index():
    return render_template("index.html")


@app.route("/api/greguerias/all/", methods=['GET'])
def get_all_greguerias():
Example #14
def indexer(data):
    stopwords_pt = get_stop_words()
    # for p in stopwords_pt:
    #print u (p)
    ana = LanguageAnalyzer("pt")
    schema = Schema(link=TEXT(stored=True),
                    title=TEXT(stored=True, analyzer=ana),
                    summary=TEXT(stored=True, analyzer=ana),
                    content=TEXT(stored=True, analyzer=ana))

    if not os.path.exists("pulledfeeds"):
        os.mkdir("pulledfeeds")
        ix = create_in("pulledfeeds", schema)
    else:
        ix = index.open_dir("pulledfeeds")
    writer = ix.writer()

    for item in data:
        if item['content'] == '':
            cont = u" "
        else:
            cont = item['content']
        writer.add_document(link=item['link'],
                            title=item['title'],
                            summary=item['summary'],
                            content=cont)
    writer.commit()

    #for w in stopwords.words('portuguese'):
    #    print w

    for item in data:
        link = item['link']
        title = item['title']
        summary = item['summary']
        content = item['content']

        sentencesArray = [title, summary, content]

        person_list = []
        for sentences in sentencesArray:

            sentence_sem_stop = []

            tokens = nltk.tokenize.word_tokenize(sentences, 'portuguese')

            for w in tokens:
                p = w.lower()
                if p not in stopwords.words(
                        'portuguese') and p not in stopwords_pt:
                    sentence_sem_stop.append(w)

            pos = nltk.pos_tag(sentence_sem_stop)
            sentt = nltk.ne_chunk(pos, binary=False)

            # print sentt

            person = []
            name = ""
            for t in sentt:
                if hasattr(t, 'label') and t.label:
                    #print t.label()
                    if t.label() == 'PERSON' or t.label() == 'ORGANIZATION':
                        for leaf in t.leaves():
                            person.append(leaf[0])
                        #if len(person) > 1: #avoid grabbing lone surnames
                        for part in person:
                            name += part + ' '
                        if name[:-1] not in person_list:
                            person_list.append(name[:-1])
                        name = ''

                        person = []
        entidades = "| "
        for ent in person_list:
            entidades += ent + " | "
        if len(person_list) == 0:
            entidades = " none"

        save_ent.save_relations(person_list)

        entidades2 = "| "
        for ent in person_list:
            if save_ent.checkIfEntityWikiExists(ent):
                entidades2 += ent + " | "
        save_ent.save_entities(link, entidades)
        print "entidades ", entidades
        print "entidades2 ", entidades2

        print "---------"
Example #15
Time_GT = sw1_utils_query.GT_Q_read(directory_containing_Time_GT, True)

# reading and storing queries for both datasets
directory_containing_Cran_Q = '../Cranfield_DATASET/cran_Queries.tsv'
directory_containing_Time_Q = '../Time_DATASET/time_Queries.tsv'

Cran_Q = sw1_utils_query.GT_Q_read(directory_containing_Cran_Q, False)
Time_Q = sw1_utils_query.GT_Q_read(directory_containing_Time_Q, False)

###
### Define a Text-Analyzer
###
selected_analyzer = [
    SimpleAnalyzer(),
    StandardAnalyzer(),
    LanguageAnalyzer('en')
]
analyzer_names = ['Simple', 'Standard', 'Language']
###
### Create a Schema
###
datasets = ['Cranfield_DATASET', 'Time_DATASET']
datasets_len = [1400, 423]
dir_idx_list = []  # list to save directories for indexes

# For each dataset and each analyzer, create an empty index based on the schema
# in a separate directory, then fill it by parsing the dataset.
# Save each index directory into the list.
for idx in range(len(datasets)):
    if datasets[idx] == 'Cranfield_DATASET':
        for i in range(len(selected_analyzer)):
Example #16
        #
        num_added_records_so_far += 1
        if (num_added_records_so_far % 100 == 0):
            print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()  # it is necessary to store the index once filled
    in_file.close()  # it is necessary to close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''

analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(), SimpleAnalyzer(),
             FancyAnalyzer(), NgramAnalyzer(4), KeywordAnalyzer(), LanguageAnalyzer('en')] # all the analyzers that are used
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                 'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer',  'LanguageAnalyzer'] # analyzers names

csv_names = ['Cranfield', 'Time'] # file names



# start to iterate over all the .csv files (in particular the only two that there are, Cranfield.csv, and Time.csv)
for name in csv_names: 
    
    print(name, '\n\n')
    
    path = "C:./"+name+"_DATASET" # get the path where the .csv is stored
    for e,type_analyzer in enumerate(analyzers): # now the iteration is necessary to create the 8 different inverted indexes
        
Example #17
def get_schema(lang=languages[10]):
    """
    get_schema([lang="pt"])
    
    Obtém o esquema a ser usado para a criação do índice de documentos.
    Por padrão, o esquema é carregado com o analisador de textos para o idioma
    Português. Mas, pode ser carregado para qualquer um dos idiomas suportados
    pela biblioteca Whoosh. Atualmente Whoosh suporta os seguintes idiomas:
    
    .. code-block:: python

        >>> from whoosh.lang import languages
        >>> languages
        ('ar', 'da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'no',
        'pt', 'ro', 'ru', 'es', 'sv', 'tr')

    
    Os campos que compõem o índice de documentos não necessariamente precisam
    ser todos os campos que compõem o documento.
    
    Este esquema contém 4 campos - ``id``, ``pub_date``, ``title`` e
    ``body`` - dos 6 campos da classe :class:`apps.search.models.Article`,
    que define um documento neste projeto.
    
    O esquema do índice de documentos para a classe Article é o seguinte:
    
    .. code-block:: python
    
        Schema(
                id = ID(unique=True, stored=True),
                pub_date = DATETIME(stored=True),
                title = TEXT(stored=True,
                            analyzer=LanguageAnalyzer(lang)),
                body = TEXT(stored=True,
                               analyzer=LanguageAnalyzer(lang)),
               )
    
    Os campos ``title`` e ``body``, por serem de tipo ``TEXT``, podem receber
    processamento textual que varia de acordo com o idioma. O idioma padrão
    deste método é o Português. O parâmetro ``lang`` permite alterar o idioma
    para um dos idiomas listados em :mod:`whoosh.lang.languages`. 
    A escolha do idioma é importante para que a análise léxico-sintática sobre
    o texto seja feita corretamente.
    O analisador de textos :class:`LanguageAnalyzer` usa 3 filtros para
    o processamento textual nos campos ``title`` e ``body``:
    LowercaseFilter (converte para letras minúsculas),
    StopFilter (remove palavras irrelevantes) e
    StemFilter (converte para a raiz da palavra).
    
    Todos os campos do schema também são armazenados no índice de documentos.
    O parâmetro ``stored=True`` indica que os campos serão indexados e armazenados.
    O parâmetro ``unique`` informa que o campo é único. 
    
    :param lang: Idioma do Schema.
    :type lang: str
    
    :returns: Schema
    """
    return Schema(
        id=ID(unique=True, stored=True),
        pub_date=DATETIME(stored=True, sortable=True),
        url=TEXT(stored=True),
        source=TEXT(stored=True),
        title=TEXT(stored=True, sortable=True,
                   analyzer=LanguageAnalyzer(lang)),
        body=TEXT(stored=True, analyzer=LanguageAnalyzer(lang)),
        links=TEXT(stored=True),
    )
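
Hypothetical usage of get_schema(); the index directory and the document values are illustrative.

import os

from whoosh import index

schema = get_schema("pt")
os.makedirs("article_index", exist_ok=True)
ix = index.create_in("article_index", schema)
with ix.writer() as writer:
    writer.add_document(id=u"1",
                        title=u"Eleições municipais",
                        body=u"O texto completo do artigo sobre as eleições...")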
Example #18
def analyze(texto, lang=languages[10]):

    la = LanguageAnalyzer(lang)
    text_analyzed = ' '.join([token.text for token in la(texto)])

    return text_analyzed
Example #19
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
            results = self.filter(
                source__search=reduce(lambda x, y: x | y,
                                      (SearchQuery(word) for word in words))
            )
        else:
            # MySQL
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                ), )

        return results.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
        )
Example #20
def getSchema():
    return Schema(path=STORED, id=STORED, body=TEXT(analyzer=LanguageAnalyzer("en")))
Example #21
    "version": whoosh.fields.ID(stored=True),
    "url_endpoint": whoosh.fields.ID(stored=True),
    "url_args": whoosh.fields.ID(stored=True)
}

part_fields = {
    "category": whoosh.fields.ID(stored=True),
    "id": whoosh.fields.ID(field_boost=3.0, stored=True),
    "table_indices": whoosh.fields.TEXT(),
    "url_endpoint": whoosh.fields.ID(stored=True),
    "url_args": whoosh.fields.ID(stored=True)
}

for lang in languages:
    doc_fields["title_%s" % lang] = whoosh.fields.TEXT(
        stored=True, field_boost=2.0, analyzer=LanguageAnalyzer(lang))
    doc_fields["content_%s" %
               lang] = whoosh.fields.TEXT(analyzer=LanguageAnalyzer(lang))

    part_fields["title_%s" % lang] = whoosh.fields.TEXT(
        stored=True, field_boost=2.0, analyzer=LanguageAnalyzer(lang))
    part_fields["content_%s" %
                lang] = whoosh.fields.TEXT(analyzer=LanguageAnalyzer(lang))

doc_schema = whoosh.fields.Schema(**doc_fields)
part_schema = whoosh.fields.Schema(**part_fields)

doc_index = whoosh.index.create_in(whoosh_dir, doc_schema, indexname="docs")
part_index = whoosh.index.create_in(whoosh_dir, part_schema, indexname="parts")

doc_parsers = {}
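
A sketch of adding a document to the per-language index created above, assuming "en" is among the entries of the languages sequence used in the loop; the field values are illustrative.

writer = doc_index.writer()
writer.add_document(
    version=u"1.0",
    url_endpoint=u"docs.view",
    url_args=u"slug=intro",
    title_en=u"Getting started",
    content_en=u"How to install and configure the application.",
)
writer.commit()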