Example No. 1
    def get_words(self, unit):
        """
        Returns list of word pairs for an unit.
        """
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            StandardAnalyzer(),
            StemmingAnalyzer(),
        ]
        lang_code = unit.translation.language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append(LanguageAnalyzer(lang_code))
        # Add ngram analyzer for languages like Chinese or Japanese
        if unit.translation.language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update([
                        token.text for token in analyzer(force_unicode(text))
                    ])
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # (cannot use __in because we want a case-insensitive lookup)
            query = Q()
            for word in words:
                query |= Q(source__iexact=word)

            # Filter dictionary
            dictionary = dictionary.filter(query)

        return dictionary
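
The two analyzers above do different jobs: the standard analyzer only splits and lowercases, while the stemming analyzer also reduces words to their stems so that plurals and inflected forms match dictionary entries. A small standalone illustration (not part of the original project):

from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer

text = u"Translated documents"
print([t.text for t in StandardAnalyzer()(text)])   # e.g. ['translated', 'documents']
print([t.text for t in StemmingAnalyzer()(text)])   # e.g. ['translat', 'document']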
Example No. 2
def exec_comp():

    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', "LanguageAnalyzer('en')"
    ]  # labels used for the plot and the MRR table

    i = 0  # configuration counter
    mrrs = []  # MRR value for each search engine configuration

    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(gt_path, sep='\t')

    # pair every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):

            i += 1
            # execute the queries for this analyzer/scoring-function combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save this configuration's search results
            sr_1.to_csv(path_start + "/part_1/" + str(i) + "__.csv", index=False)
            # compute the MRR for this configuration
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))
    mrrs_saving = pd.DataFrame(mrrs)
    print(mrrs_saving)
    mrrs_saving.to_csv(path_start + "/part_1/" + mrrs_name, index=False)  # store the MRR table
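
exec_queries and mrr are helpers defined elsewhere in this project. Purely as a hedged sketch of what an MRR helper compatible with the call above might look like (the column names Query_ID, Relevant_Doc_id and Doc_ID are assumptions, not taken from the original code):

def mrr(ground_truth, search_results):
    # Mean Reciprocal Rank: for each query take 1 / rank of the first
    # relevant document returned, then average over all queries.
    reciprocal_ranks = []
    for query_id, group in ground_truth.groupby('Query_ID'):   # assumed column name
        relevant = set(group['Relevant_Doc_id'])                # assumed column name
        ranked = search_results[search_results['Query_ID'] == query_id]['Doc_ID'].tolist()
        rr = 0.0
        for rank, doc_id in enumerate(ranked, start=1):
            if doc_id in relevant:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0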
Example No. 3
    def create(self):
        analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
        schema = Schema(
            source=TEXT(stored=True, analyzer=analyzer),
            target=TEXT(stored=True, analyzer=analyzer),
            comment=TEXT(stored=True),
            context=TEXT(stored=True),
            softcatala=BOOLEAN(stored=True),
            project=TEXT(stored=True),
        )
        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)

        ix = create_in(self.dir_name, schema)
        self.writer = ix.writer()
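
A hedged usage sketch: create() stores an open writer on self.writer, so documents can be added and committed through it. The wrapper class name and the sample field values below are hypothetical, not taken from the original project:

indexer = TranslationMemoryIndex()   # hypothetical name for the class that owns create()
indexer.create()
indexer.writer.add_document(source=u"finestra", target=u"window",
                            comment=u"", context=u"", softcatala=True,
                            project=u"memories")
indexer.writer.commit()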
Example No. 4
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import StandardAnalyzer

        # TODO support some customizable mapping to homogenize some metadata fields
        # onto a given set of index keys
        self.schema = wf.Schema(
            id=wf.ID,
            path=wf.ID(stored=True),
            type=wf.ID(stored=True),
            parentds=wf.ID(stored=True),
            meta=wf.TEXT(
                stored=False,
                analyzer=StandardAnalyzer(minsize=2))
        )
Example No. 5
 def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize ):
     """
     Perform search on the in-memory index. Weight in the given boosts.
     """
     # Change field boosts for searcher
     searcher = self.index.searcher(
         weighting=BM25F(
             field_B={ 'name_B': float( tool_name_boost ),
                       'section_B': float( tool_section_boost ),
                       'description_B': float( tool_description_boost ),
                       'labels_B': float( tool_label_boost ),
                       'stub_B': float( tool_stub_boost ),
                       'help_B': float( tool_help_boost ) }
         )
     )
     # Set query to search name, description, section, help, and labels.
     parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
     # Hyphens are wildcards in Whoosh causing bad things
     if q.find( '-' ) != -1:
         q = (' ').join( [ token.text for token in self.rex( to_unicode( q ) ) ] )
     # Perform tool search with ngrams if set to true in the config file
     if ( tool_enable_ngram_search is True or tool_enable_ngram_search == "True" ):
         hits_with_score = {}
         token_analyzer = StandardAnalyzer() | analysis.NgramFilter( minsize=int( tool_ngram_minsize ), maxsize=int( tool_ngram_maxsize ) )
         ngrams = [ token.text for token in token_analyzer( q ) ]
         for query in ngrams:
             # Get the tool list with respective scores for each qgram
             curr_hits = searcher.search( parser.parse( '*' + query + '*' ), limit=float( tool_search_limit ) )
             for i, curr_hit in enumerate( curr_hits ):
                 is_present = False
                 for prev_hit in hits_with_score:
                     # Check if the tool appears again for the next qgram search
                     if curr_hit[ 'id' ] == prev_hit:
                         is_present = True
                         # Add the current score with the previous one if the
                         # tool appears again for the next qgram
                         hits_with_score[ prev_hit ] = curr_hits.score(i) + hits_with_score[ prev_hit ]
                 # Add the tool if not present to the collection with its score
                 if not is_present:
                     hits_with_score[ curr_hit[ 'id' ] ] = curr_hits.score(i)
         # Sort the results based on aggregated BM25 score in decreasing order of scores
         hits_with_score = sorted( hits_with_score.items(), key=lambda x: x[1], reverse=True )
         # Return the tool ids
         return [ item[0] for item in hits_with_score[ 0:int( tool_search_limit ) ] ]
     else:
         # Perform the search
         hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
         return [ hit[ 'id' ] for hit in hits ]
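
For reference, a standalone illustration (not code from the original project) of what the StandardAnalyzer | NgramFilter chain used above produces for a query term:

from whoosh import analysis

token_analyzer = analysis.StandardAnalyzer() | analysis.NgramFilter(minsize=3, maxsize=4)
print([t.text for t in token_analyzer(u"blast")])
# roughly: ['bla', 'blas', 'las', 'last', 'ast']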
Example No. 6
def tokenize(text, stemming=True, stoplist=None):
    # NOTE: stemming=True relies on a module-level Krovetz stemmer instance, e.g. kstemmer = Stemmer()
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))  # map punctuation to spaces
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=2, stemfn=kstemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=2)

    tokens = [token.text for token in analyzer(text)]
    tokens = [word for word in tokens if not contains_digits(word)]
    return tokens
Example No. 7
    def create(self, in_memory=False):
        tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*") # Includes l·l
        analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=tokenizer_pattern)
        schema = Schema(verb_form=TEXT(stored=True, sortable=True, analyzer=analyzer),
                        index_letter=TEXT(stored=True, analyzer=analyzer),
                        file_path=TEXT(stored=True, sortable=True))

        if os.path.exists(self.dir_name):
            shutil.rmtree(self.dir_name)

        os.makedirs(self.dir_name)

        ix = create_in(self.dir_name, schema)

        self.writer = ix.writer()
        return ix
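
As a quick, illustrative check of the custom expression (the import path of rcompile is not shown in the snippet, so a plain re.compile is used here), the pattern keeps the Catalan l·l digraph inside a single token:

import re
from whoosh.analysis import StandardAnalyzer

pattern = re.compile(r"(\w|·)+(\.?(\w|·)+)*", re.UNICODE)
analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=pattern)
print([t.text for t in analyzer(u"il·lusió de col·legi")])
# roughly: ['il·lusió', 'de', 'col·legi']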
Example No. 8
def create_schema_with_stopwords(directory):
    schema = Schema(FileName=TEXT(stored=True),
                    FilePath=TEXT(stored=True),
                    Title=TEXT(stored=True),
                    Content=TEXT(stored=True,
                                 analyzer=StandardAnalyzer(stoplist=None)),
                    Size=TEXT(stored=True),
                    LastModified=TEXT(stored=True),
                    LastAccessed=TEXT(stored=True),
                    CreationTime=TEXT(stored=True),
                    Mode=TEXT(stored=True),
                    text=TEXT(spelling=True))
    create_non_existent_directory(directory)
    ix = create_in(directory, schema)
    global writer
    writer = ix.writer()
    return
Example No. 9
    def __init__(self,
                 analyzer=None,
                 phrase=True,
                 vector=None,
                 stored=False,
                 field_boost=1.0,
                 multitoken_query="first"):
        """
        :param analyzer: The analysis.Analyzer to use to index the field
            contents. See the analysis module for more information. If you omit
            this argument, the field uses analysis.StandardAnalyzer.
        :param phrase: Whether to store positional information to allow phrase
            searching.
        :param vector: A :class:`whoosh.formats.Format` object to use to store
            term vectors, or ``True`` to store vectors using the same format as
            the inverted index, or ``None`` or ``False`` to not store vectors.
            By default, fields do not store term vectors.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        """

        ana = analyzer or StandardAnalyzer()

        if phrase:
            formatclass = Positions
        else:
            formatclass = Frequency

        self.format = formatclass(analyzer=ana, field_boost=field_boost)

        if vector:
            if type(vector) is type:
                vector = vector(ana)
            elif isinstance(vector, Format):
                pass
            else:
                vector = formatclass(ana)
        else:
            vector = None
        self.vector = vector

        self.multitoken_query = multitoken_query
        self.scorable = True
        self.stored = stored
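
For orientation, a short sketch against the public whoosh.fields API (illustrative only): the phrase flag selects the posting format, which in turn decides whether phrase queries are possible on the field.

from whoosh.fields import TEXT
from whoosh.formats import Positions, Frequency

print(isinstance(TEXT(phrase=True).format, Positions))    # positional postings: phrase search works
print(isinstance(TEXT(phrase=False).format, Frequency))   # smaller index: no phrase search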
Example No. 10
	def tokenize_corpus(self, corpus_folder, stopwords, out_path, remove_digits=False, minsize=3):
		"""tokenize corpus into list of lists and return list of doc ids"""
		corpus = self.read_corpus(corpus_folder)
		# set tokenizer
		tokenizer = StandardAnalyzer(stoplist=stopwords, minsize=minsize)
		# tokenize corpus as list of lists
		print('tokenizing corpus ...')
		docs = {}
		for docno, doc in corpus:
			# tokenize doc
			doc_tokens = self.tokenize_doc(doc, tokenizer, remove_digits)
			docs[docno] = doc_tokens
		print('corpus tokenized!')
		print("store processed data")
		with open(out_path + '/docs.json', 'w') as file_docs:
			json.dump(docs, file_docs)
		return docs	
Example No. 11
def index(root_folder, files, output, language):
    # schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True))
    my_analyzer = StandardAnalyzer(stoplist=getStopWords())
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, vector=True, analyzer=my_analyzer))
    ix = create_in(output, schema)
    total = len(files)
    i = 1
    for f in files:
        print i, "/", total, ":", f
        text = get_text_from_file(f)
        writer = ix.writer()
        writer.add_document(title=unicode(f.replace(root_folder, ""), "utf-8"), path=unicode(f.replace(root_folder, ""), "utf-8"),
                            content=unicode(text, "utf-8"))


        writer.commit()
        i = i + 1
Example No. 12
def main():
    option = True
    while option:
        print("""
            1. Create Index.
            2. Query Index.
            3. Exit
            """)
        option = input("Please select an option...!")
        if option == "1":
            file_content_doc1 = open("rural.txt").read()
            file_content_doc2 = open("science.txt").read()
            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task2"):
                os.mkdir("index_task2")

            schema = Schema(full_text=TEXT(phrase=True,
                                           stored=True,
                                           analyzer=StandardAnalyzer(
                                               stoplist=None)))
            ix = create_in("index_task2", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(full_text=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(full_text=sentence)
            writer.commit()
            print("\n\n Index created with various features as its fields")
        elif option == "2":
            ix = index.open_dir("index_task2")
            with ix.searcher() as searcher:

                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query = QueryParser("full_text", ix.schema, group=og).parse(q)
                results = searcher.search(query)

                for i, hit in enumerate(results):
                    print(results.score(i), hit["full_text"], sep=":")
                    print("\n")
        elif option == "3":
            # the menu offers an Exit choice, so handle it explicitly
            print("\n Goodbye")
            break
Example No. 13
    def create(self, in_memory=False):
        analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
        schema = Schema(source=TEXT(stored=True, analyzer=analyzer),
                        target=TEXT(stored=True, analyzer=analyzer),
                        comment=STORED,
                        context=STORED,
                        softcatala=BOOLEAN,
                        project=ID(stored=True))

        if in_memory:
            st = RamStorage()
            ix = st.create_index(schema)
        else:
            if not os.path.exists(self.dir_name):
                os.mkdir(self.dir_name)

            ix = create_in(self.dir_name, schema)

        self.writer = ix.writer()
        return ix
Example No. 14
def get_dictionary(request, unit_id):
    '''
    List words from the dictionary for the current translation.
    '''
    unit = get_object_or_404(Unit, pk=int(unit_id))
    unit.check_acl(request)
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = (StandardAnalyzer(), StemmingAnalyzer())

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            words = words.union([token.text for token in analyzer(text)])

    # Grab all words in the dictionary
    dictionary = Dictionary.objects.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language)

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query (cannot use __in because we want a case-insensitive lookup)
        query = Q()
        for word in words:
            query |= Q(source__iexact=word)

        # Filter dictionary
        dictionary = dictionary.filter(query)

    return render_to_response(
        'js/dictionary.html',
        RequestContext(request, {
            'dictionary': dictionary,
            'translation': unit.translation,
        }))
Example No. 15
def indexing(corpus, ram_limit=1024, d_test=True, stemmed=True):
    start_time = time.time()

    global stemming
    stemming = stemmed

    if stemming:
        analyzer = StemmingAnalyzer()
    else:
        analyzer = StandardAnalyzer()

    schema = Schema(doc_id=NUMERIC(stored=True),
                    date=TEXT(analyzer=SpaceSeparatedTokenizer()),
                    headline=TEXT(field_boost=1.5, analyzer=analyzer),
                    dateline=TEXT(analyzer=analyzer),
                    byline=TEXT(analyzer=analyzer),
                    content=TEXT(analyzer=analyzer))

    index_dir = os.path.join("indexes", "docs")

    # Clear existing indexes/docs folder and make new one
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.makedirs(index_dir)

    # Create index in indexes/docs folder
    ix = create_in(index_dir, schema)
    writer = ix.writer(limitmb=ram_limit)
    traverse_folders(writer, corpus, d_test=d_test)
    writer.commit()

    end_time = time.time()

    # Traverses all files in the indexes/docs folder to calculate disk space taken up by the index
    space = 0
    for subdir, dirs, files in os.walk(index_dir):
        space += sum(
            os.stat(os.path.join(subdir, file)).st_size for file in files)

    return ix, end_time - start_time, space
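
Once indexing() returns, the index can be queried through the standard Whoosh API; a hedged sketch (the corpus path, query text and field choice are arbitrary examples, not values from the original project):

from whoosh.qparser import MultifieldParser

ix, build_time, disk_space = indexing("corpus_folder")   # path is an assumption
with ix.searcher() as searcher:
    parser = MultifieldParser(["headline", "content"], schema=ix.schema)
    for hit in searcher.search(parser.parse(u"interest rates"), limit=10):
        print(hit["doc_id"])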
Example No. 16
 def _search_ngrams(self, cleaned_query: str,
                    tool_ngram_minsize: CanConvertToInt,
                    tool_ngram_maxsize: CanConvertToInt,
                    tool_search_limit: CanConvertToFloat) -> List[str]:
     """
     Break tokens into ngrams and search on those instead.
     This should make searching more resistant to typos and unfinished words.
     See docs at https://whoosh.readthedocs.io/en/latest/ngrams.html
     """
     hits_with_score: Dict[str, float] = {}
     token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
         minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
     ngrams = [token.text for token in token_analyzer(cleaned_query)]
     for query in ngrams:
         # Get the tool list with respective scores for each qgram
         curr_hits = self.searcher.search(self.parser.parse(f"*{query}*"),
                                          limit=float(tool_search_limit))
         for i, curr_hit in enumerate(curr_hits):
             is_present = False
             for prev_hit in hits_with_score:
                 # Check if the tool appears again for the next qgram search
                 if curr_hit['id'] == prev_hit:
                     is_present = True
                     # Add the current score with the previous one if the
                     # tool appears again for the next qgram
                     hits_with_score[prev_hit] = curr_hits.score(
                         i) + hits_with_score[prev_hit]
             # Add the tool if not present to the collection with its score
             if not is_present:
                 hits_with_score[curr_hit['id']] = curr_hits.score(i)
     # Sort the results based on aggregated BM25 score in decreasing order of scores
     hits_with_score_list: List[Tuple[str, float]] = sorted(
         hits_with_score.items(), key=lambda x: x[1], reverse=True)
     # Return the tool ids
     return [
         item[0] for item in hits_with_score_list[0:int(tool_search_limit)]
     ]
Example No. 17
 def __init__(self, analyzer = None, phrase = True, vector = None,
              stored = False, field_boost = 1.0):
     """
     :stored: Whether to store the value of this field with the document. Since
         this field type generally contains a lot of text, you should avoid storing it
         with the document unless you need to, for example to allow fast excerpts in the
         search results.
     :phrase: Whether to store positional information to allow phrase searching.
     :analyzer: The analysis.Analyzer to use to index the field contents. See the
         analysis module for more information. If you omit this argument, the field uses
         analysis.StandardAnalyzer.
     """
     
     ana = analyzer or StandardAnalyzer()
     
     if phrase:
         formatclass = Positions
     else:
         formatclass = Frequency
     self.format = formatclass(analyzer = ana, field_boost = field_boost)
     self.vector = vector
     
     self.scorable = True
     self.stored = stored
Example No. 18
def create_table(index_dir, *, overwrite=False):
    analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(label=TEXT(stored=True, analyzer=analyzer, lang='fr'),
                    rome=TEXT(stored=True, sortable=True),
                    source=KEYWORD(stored=True, sortable=True),
                    slug=STORED)

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    elif exists_in(index_dir):
        if not overwrite:
            logger.critical(
                'An index already exists in %s; overwrite flag not set; abandonning',
                index_dir)
            raise RuntimeError('Index already exists')
        logger.warning('Index already found, deleting %s to start anew',
                       index_dir)
        shutil.rmtree(index_dir, ignore_errors=True, onerror=None)

        os.mkdir(index_dir)

    logger.info('Whoosh index %s ready for use', index_dir)
    create_in(index_dir, schema)
    return index_dir
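
The StandardAnalyzer() | CharsetFilter(accent_map) chain folds accented characters at index time, which is what makes accent-insensitive matching possible on the label field. A small standalone illustration (not part of the original module):

from whoosh.analysis import StandardAnalyzer, CharsetFilter
from whoosh.support.charset import accent_map

analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
print([t.text for t in analyzer(u"Développeur réseaux")])
# roughly: ['developpeur', 'reseaux']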
Example No. 19
class MySchema(SchemaClass):
    '''
    Definition of Schema for indexing documents.
    '''
    url = ID(stored=True, unique=True)
    url_len = NUMERIC(stored=True, default=0)
    url_txt = TEXT(field_boost=1.9, stored=True)
    title_page = TEXT(analyzer=StandardAnalyzer(),
                      field_boost=2.2,
                      stored=True)
    content = TEXT(analyzer=StandardAnalyzer(),
                   stored=True,
                   phrase=True,
                   sortable=True)
    date_created = DATETIME(
        stored=True)  # maybe we prefer storing date as an ID
    date_updated = DATETIME(stored=True)
    h1 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.75, stored=True)
    h2 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.5, stored=True)
    h3 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.25, stored=True)
    h4 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.10, stored=True)
    rank = NUMERIC(stored=True, default=0)
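
A declarative SchemaClass like this can be handed to create_in directly; a minimal usage sketch (the directory name and field values are arbitrary examples):

import os
from whoosh.index import create_in

if not os.path.exists("crawl_index"):
    os.mkdir("crawl_index")
ix = create_in("crawl_index", MySchema)   # Whoosh instantiates the schema class itself
writer = ix.writer()
writer.add_document(url=u"https://example.org/", url_len=20,
                    title_page=u"Example", content=u"Example body text")
writer.commit()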
Example No. 20

#-----------------
# Index Schema
#-----------------


class _AccentFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = normalize_token(t.text)
            yield t


_stopwords = frozenset(('a', 'an'))
_analyzer = (StandardAnalyzer(stoplist=_stopwords) | _AccentFilter())
_schema = Schema(
    content=TEXT(stored=True, spelling=True, analyzer=_analyzer),
    data=STORED,  # tuple (label, path, prio, sortkey)
    itemtype=ID,
    asfilter=IDLIST)
_schema['content'].scorable = False

#-----------------
# Maker
#-----------------


class Maker(object):
    def __init__(self, index_dir):
        if os.path.exists(index_dir) and os.path.isfile(index_dir):
Example No. 21
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                      if unicodedata.category(unichr(i)).startswith('P'))

# Create function with which to strip punct from unicode
def remove_punctuation(unicode_text):
    return unicode_text.translate(tbl)

# Make the index folder
if not os.path.exists("index_for_sample_files"):
    os.mkdir("index_for_sample_files")

# Specify a list of paths that contain all of the texts we wish to index
text_dirs = ["sample_text_collection"]
      
# Identify the schema we'll use when creating index
schema = Schema(  filename=TEXT(stored=True), path=TEXT(stored=True), author=TEXT(stored=True), short_title=TEXT(stored=True), full_text=TEXT( stored=True,phrase=True,analyzer=StandardAnalyzer(stoplist=None) )   )

# Create the index using the schema defined above
ix = create_in("index_for_sample_files", schema)

writer = ix.writer()

for i in text_dirs:
    for j in glob.glob(i + "/*.txt"):       
        with codecs.open(j,"r","utf-8") as raw_file:
            
            cleaner_file = remove_punctuation( raw_file.read().replace("\r","").replace("\n"," ") )
                    
            # Grab filename, then use that to grab all metadata. NB: Unix users should change the following line to j.split("/")[-1]
            filename        = j.split("\\")[-1]
            print "indexing file: ", filename
Example No. 22
from whoosh.index import create_in, open_dir, EmptyIndexError
from whoosh.fields import *
from whoosh.qparser import QueryParser, OrGroup
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StandardAnalyzer
import inflect
from globals import *
ana = StandardAnalyzer(expression=re.compile(r"[\w\-+]+(\.?[\w\-]+)*",
                                             re.UNICODE),
                       stoplist=None,
                       minsize=0)

schema = Schema(description=TEXT(stored=True),
                path=ID(stored=True),
                lang=TEXT(analyzer=ana, multitoken_query="phrase"))
from curseshelpers import *


class Searcher:
    '''
  Our search engine class! I, for one, welcome our Googly overlords.
  '''
    def __init__(self, indexdir):
        # Open the index if it exists
        try:
            self.ix = open_dir(indexdir)
        except EmptyIndexError:
            self.ix = create_in(indexdir, schema)
        self.indexdir = indexdir

    def get_writer(self):
        self.writer = self.ix.writer()
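
The custom expression passed to the analyzer above keeps characters such as '+' and '-' inside tokens, which matters when programming-language names are indexed; an illustrative check (not from the original module):

print([t.text for t in ana(u"C++ and Objective-C libraries")])
# roughly: ['c++', 'and', 'objective-c', 'libraries']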
Example No. 23
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import sys, glob, os, unicodedata
from django.conf import settings
from bs4 import BeautifulSoup

stopset = set(stopwords.words('english'))

if not os.path.exists('whoosh'):
    os.mkdir('whoosh')

schema = Schema(title=TEXT(stored=True),
                path=TEXT(stored=True),
                body=TEXT(stored=True,
                          phrase=True,
                          analyzer=StandardAnalyzer(stoplist=None)))

ix = create_in('whoosh', schema)

writer = ix.writer()

for url in os.listdir('articles/static/'):
    for html_file in os.listdir('articles/static/' + url):
        filename = html_file
        path = 'articles/static/' + url + '/' + html_file
        with open(path, 'r') as content_file:
            soup = BeautifulSoup(content_file, "html.parser")

            title = ""
            if soup.title is not None:
                title = soup.title.string
Example No. 24
try:
    from whoosh.index import _CURRENT_TOC_VERSION as whoosh_ix_ver
except ImportError:
    from whoosh.filedb.fileindex import _INDEX_VERSION as whoosh_ix_ver

from stemming import stemArabic


def stemfn(word):
    return stemArabic(stem(word))


# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
analyzer = StandardAnalyzer(
    expression=
    ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*"
) | StemFilter(stemfn)

from whoosh.qparser import FieldAliasPlugin
from whooshSymbolicQParser import MultifieldSQParser


class ExcerptFormatter(object):
    def __init__(self, between="..."):
        self.between = between

    def _format_fragment(self, text, fragment):
        output = []
        index = fragment.startchar

        for t in fragment.matches:
Example No. 25
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.analysis import StandardAnalyzer
from whoosh import index
import os.path

# year, author, title and abstract are fields; each one maps to part of the bibliographic record being indexed
schema = Schema(year=NUMERIC(stored=True),
                author=TEXT(analyzer=StandardAnalyzer(stoplist=None),
                            stored=True),
                title=TEXT(analyzer=StandardAnalyzer(stoplist=None),
                           stored=True),
                abstract=TEXT(analyzer=StandardAnalyzer(stoplist=None),
                              stored=True),
                body=TEXT(analyzer=StandardAnalyzer(stoplist=None)),
                subject=KEYWORD(commas=True, scorable=True),
                keywords=KEYWORD(commas=True, scorable=True))

# create an index in a directory if there is none
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

# open an existing index object
ix = index.open_dir("indexdir")

# create a writer object to add documents to the index
writer = ix.writer()

# now we can add documents to the index
abstract1 = (
Example No. 26
def all_stop_words(lst):
    # return True only when every token produced from lst is a stop word
    analyzer = StandardAnalyzer()
    for t in analyzer(unicode(lst)):
        if not t.stopped:
            return False
    return True
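
With the analyzer's default settings stop words are removed before they reach the loop, so in practice the function answers "does anything other than stop words survive analysis?". A quick illustration (Python 2, matching the unicode() call above):

print(all_stop_words(['the', 'of', 'and']))   # True: every token is a stop word
print(all_stop_words(['the', 'search']))      # False: 'search' survives the analyzer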
Example No. 27
from solvertools.wordlist import WORDS
from solvertools.normalize import slugify, sanitize
from solvertools.util import data_path
from whoosh.index import open_dir
from whoosh.analysis import StandardAnalyzer
from whoosh import qparser
from operator import itemgetter
from collections import defaultdict
from .conceptnet_numberbatch import load_numberbatch, get_vector, similar_to_term
import re

INDEX = None
QUERY_PARSER = None
NUMBERBATCH = None
ANALYZER = StandardAnalyzer()


def simple_parser(fieldname, schema, group, **kwargs):
    """
    Returns a QueryParser configured to support only +, -, and phrase
    syntax.

    Modified from Whoosh's SimpleParser to accept a custom 'group'
    argument.
    """

    pins = [
        qparser.plugins.WhitespacePlugin, qparser.plugins.PlusMinusPlugin,
        qparser.plugins.PhrasePlugin
    ]
    orgroup = qparser.syntax.OrGroup
Example No. 28
from whoosh.analysis import Filter, LowercaseFilter, StandardAnalyzer, \
    NgramFilter
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.fields import NUMERIC, STORED, SchemaClass, TEXT


class LodashFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.replace('_', '')
            yield t


simple_ana = IDTokenizer() | LowercaseFilter() | LodashFilter()
custom_ana = StandardAnalyzer(stoplist=None) | LodashFilter()
# | NgramFilter(minsize=2, maxsize=5, at='start')
# The sort problems with NgramFilter: less relevant artefacts will be first


class IndexSchema(SchemaClass):
    filename = TEXT(stored=True, analyzer=simple_ana)
    symbol = TEXT(stored=True, analyzer=custom_ana)
    module = TEXT(stored=True, analyzer=simple_ana)
    location = STORED()
    kind = STORED()
    sort = NUMERIC(sortable=True)
Example No. 29
class WhooshConstants():
    index_dir = configuration.get('whoosh_index_dir')
    tokenized_analyzer = StandardAnalyzer(stoplist=None)
    normalized_analyzer = IDTokenizer() | SubstitutionFilter(
        r"[\s/,_'-]", "") | LowercaseFilter()
    stem_analyzer = StemmingAnalyzer(r"[\s/,_'-]", gaps=True, stoplist=None)
Example No. 30
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":

            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()

            print_index_details(ix)

            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3_min")

            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")