def ImprovedTokenizer():
    """
    Based on the whoosh RegexTokenizer. This is only a thin wrapper around
    the tokenizer's functionality.
    """
    chain = RegexTokenizer() | LowercaseFilter() | StopFilter(
        stoplist=STOP_WORDS, minsize=2)
    return chain
def text_treat(path):
    conto = {
        "titulo": "",
        "categoria": "",
        "texto": [],
        "ano": "",
        "full": [],
        "tokens": []
    }
    with open(path, 'r', encoding="ISO-8859-1") as arquivo:
        conto["full"] = arquivo.readlines()
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    # example header line: Poesia, Americanas, 1875
    p = re.compile(r'[/.,]')
    inf = p.split(conto["full"][0])
    conto["categoria"] = inf[0]
    conto["titulo"] = inf[1]
    conto["ano"] = inf[2].replace("\n", "")
    for i in range(len(conto["full"])):
        conto["texto"].append(conto["full"][i].replace('\n', ''))
        # remove stop words
        for token in analyzer(conto["texto"][i]):
            conto["tokens"].append(token.text)
    return conto
def NERAnalyzer(
        ne_types=QA_NE_TYPES,
        expression=default_pattern,
        stoplist=QA_STOPWORDS,
        minsize=2,
        maxsize=None,
        gaps=False,
):
    """Named Entity centric version of StandardAnalyzer

    :param ne_types: list/set of named entities to keep
    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable the
        stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :return: analyzer to be used with a whoosh_utils index
    """
    chain = NERTokenizer(ne_types, expression, gaps)
    chain |= LowercaseFilter()
    chain |= StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain
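# Hedged usage sketch (not from the original module): how the NERAnalyzer chain
# above is typically exercised on its own. The sample sentence is made up, and
# it assumes NERTokenizer and the QA_* defaults are available alongside it.
ner_analyzer = NERAnalyzer()
ner_tokens = [t.text for t in ner_analyzer("Barack Obama visited Berlin in 2008")]
print(ner_tokens)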
def get_schema():
    """
    Return a schema used for indexing documents.
    """
    analyzer = MyVietnameseTokenizer() | LowercaseFilter() | StopFilter(get_stopword_list())
    return Schema(title=TEXT(analyzer=analyzer, stored=True, field_boost=1.5),
                  path=ID(unique=True, stored=True),
                  time=STORED,
                  content=TEXT(analyzer=analyzer, stored=True))
def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
def GensimAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (GensimTokenizer() | LowercaseFilter()
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
            | StopFilter(stoplist=stoplist, minsize=minsize))
def get_search_terms(text):
    '''
    Splits a text into tokens, drops non-useful ones and returns a set of tokens.

    @type text: String
    @param text: A unicode string to split up in tokens
    @rtype: Set of strings
    @return: A set of useful unique tokens appearing in the text
    '''
    stoplist = ['and', 'is', 'it', 'an', 'as', 'at', 'have', 'in', 'yet', 'if',
                'from', 'for', 'when', 'by', 'to', 'you', 'be', 'we', 'that',
                'may', 'not', 'with', 'tbd', 'a', 'on', 'your', 'this', 'of',
                'us', 'will', 'can', 'the', 'or', 'are', 'up', 'down', 'ip']
    analyzer = SpaceSeparatedTokenizer() | StopFilter(stoplist=stoplist)
    tokens = set(x.text for x in analyzer(text))
    # TODO: When we go to whoosh 2.x we can drop the following and use a whoosh
    # SubstitutionFilter in the analyzer above
    tokens = set(re.sub(r'[()/]', '', x) for x in tokens)
    return tokens
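# Hedged sketch of the TODO above (an assumption, not the project's code): with
# whoosh 2.x, the manual re.sub post-processing could become a SubstitutionFilter
# inside the analyzer chain. The function name and abbreviated stoplist are
# illustrative only.
from whoosh.analysis import SpaceSeparatedTokenizer, StopFilter, SubstitutionFilter

def get_search_terms_v2(text):
    stoplist = ['and', 'is', 'it', 'an', 'as', 'at']  # abbreviated for the sketch
    analyzer = (SpaceSeparatedTokenizer()
                | SubstitutionFilter(r"[()/]", "")  # strips parentheses and slashes
                | StopFilter(stoplist=stoplist))
    return set(x.text for x in analyzer(text))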
def remove_stop_words(str):
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    r = []
    for token in analyzer(str):
        r.append(token.text)
    return r
def create_analyzer():
    conf = config.get_config()
    # Build the chain incrementally; the original nested if/else produced the
    # same ordering: tokenizer, stop words, accent folding, stemming, q-grams.
    analyzer = RegexTokenizer()
    if conf['STOPWORDS']:
        analyzer = analyzer | StopFilter()
    if conf['CHARACTERS_FOLDING']:
        analyzer = analyzer | CharsetFilter(accent_map)
    if conf['STEMMING']:
        analyzer = analyzer | StemFilter()
    if conf['QGRAMS']:
        analyzer = analyzer | NgramFilter(minsize=conf['QNUM_MIN'],
                                          maxsize=conf['QNUM_MAX'])
    log.print_debug(TAG, "Analyzer created")
    return analyzer
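# For context, a hypothetical shape of the configuration that create_analyzer()
# reads; the real values come from config.get_config() and are not shown here.
example_conf = {
    'STOPWORDS': True,           # add StopFilter()
    'CHARACTERS_FOLDING': True,  # add CharsetFilter(accent_map)
    'STEMMING': False,           # add StemFilter()
    'QGRAMS': True,              # add NgramFilter(...)
    'QNUM_MIN': 2,               # minimum n-gram size
    'QNUM_MAX': 4,               # maximum n-gram size
}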
def __init__(self, index_path, names=None):
    self._analyzer = (SpaceSeparatedTokenizer() | LowercaseFilter()
                      | StopFilter(minsize=1, stoplist=stoplist) | StemFilter())
    if index.exists_in(index_path):
        self._ix = index.open_dir(index_path)
    else:
        self.build_index(index_path, names)
    self._qp = QueryParser("title", self._ix.schema, plugins=[])
def schema(self):
    my_analyzer = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
    schema = Schema(
        h=TEXT(stored=True, analyzer=my_analyzer),
        gnx=ID(stored=True),
        b=TEXT(analyzer=my_analyzer),
        parent=ID(stored=True),
        doc=ID(stored=True),
    )
    return schema
def __init__(self, path, schema):
    self.analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
    self.schema = schema
    if not os.path.exists("index"):  # make an index folder if one does not exist
        os.mkdir("index")
        index.create_in("index", self.schema)
    self.ix = index.open_dir("index")
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP) | StopFilter(stoplist=STOP)
    schema = Schema(title=TEXT(analyzer=analyzer, stored=True, sortable=True),
                    content=TEXT(analyzer=analyzer, stored=True, sortable=True),
                    tags=KEYWORD(commas=True, stored=True),
                    author=TEXT(stored=True),
                    uid=ID(unique=True, stored=True),
                    lastedit_date=DATETIME(sortable=True, stored=True))
    return schema
def __determine_analyzer(self, mode):
    tokenizer = StanTokenizer()
    return {
        'normal': tokenizer | PunctuationFilter() | StanfordLemmatizerFilter()
        | LowercaseFilter(),
        'author topic modelling': tokenizer | PunctuationFilter()
        | StanfordLemmatizerFilter() | LowercaseFilter() | StopFilter(),
    }[mode]
def CleanupStandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                            minsize=2, maxsize=None, gaps=False):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
def build_schema(self, fields):
    schema = super(SpanishWhooshSearchBackend, self).build_schema(fields)
    stemmer_sp = SpanishStemmer()
    stemming_analyzer = StemmingAnalyzer(stemfn=stemmer_sp.stem)
    stop_filter = StopFilter(stoplist=STOPWORDS, minsize=2)
    for name, field in schema[1].items():
        if isinstance(field, TEXT):
            # field.analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
            field.analyzer = stemming_analyzer | stop_filter
    return schema
def __init__(self):
    chfilter = CharsetFilter(accent_map)
    stoplist = stoplists["en"].union(stoplists["fr"])
    analyzer = (RegexTokenizer() | LowercaseFilter()
                | StopFilter(stoplist=stoplist) | chfilter)

    # defines the schema
    # see http://pythonhosted.org/Whoosh/schema.html for reference
    keywordType = KEYWORD(lowercase=True, scorable=True)
    self.schema = Schema(content=TEXT(analyzer=analyzer),
                         docType=TEXT,
                         docId=ID(stored=True, unique=True),
                         tags=keywordType)

    # Adds dynamic fields so each document can index its fields in the
    # same Whoosh index
    self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
    self.schema.add('*_date', DATETIME, glob=True)
    self.schema.add('*_number', NUMERIC, glob=True)
    self.schema.add('*_boolean', BOOLEAN, glob=True)

    # Creates the index folder and Whoosh index files if they don't exist,
    # and loads the index in any case
    if not os.path.exists("indexes"):
        os.mkdir("indexes")
        self.index = index.create_in("indexes", self.schema)
    else:
        self.index = index.open_dir("indexes")

    # Creates the doctypes folder if it doesn't exist
    if not os.path.exists("doctypes"):
        os.mkdir("doctypes")

    # Creates the doctypes default schema file if it doesn't exist
    if not os.path.exists('doctypes/doctypes_schema.json'):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")

    '''
    Loads the doctypes schema if it's valid, otherwise recreates it.

    The doctypes schema is a dictionary of doctypes with their fields,
    created and updated when a document is indexed. That way, we can tell
    Whoosh which fields to search by default, because there is apparently
    no way to say "search in all fields".
    '''
    with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
        try:
            self.doctypesSchema = json.load(rawJSON)
        except ValueError:
            rawJSON.write("{}")
            self.doctypesSchema = {}
def LemmatizingAnalyzer(stoplist=STOP_WORDS, minsize=2, maxsize=None):
    """
    Analyzer that performs tokenization, lowercasing, stop-word removal
    and lemmatization.

    :param stoplist: list of stop words. It can be merged with another list.
    :param minsize: words shorter than this value are removed
    :param maxsize: words longer than this value are removed
    """
    ret = RegexTokenizer(expression=default_pattern, gaps=False)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | LemmatizerFilter()
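# Hedged usage sketch (illustrative only): LemmatizingAnalyzer lowercases,
# drops stop words, then lemmatizes via LemmatizerFilter, which is assumed to
# be defined elsewhere in this module. The sample sentence is made up.
lemma_analyzer = LemmatizingAnalyzer()
print([t.text for t in lemma_analyzer("The cats were running across the gardens")])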
def CleanupStemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                            minsize=2, maxsize=None, gaps=False, stemfn=stem,
                            ignore=None, cachesize=50000):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)
def queryIndex(query):
    tokenizer = RegexTokenizer()
    return_list = []
    # Removing stop words
    with open("../smartStopList.txt", "r") as fp:
        line = fp.readline()
        words = []
        while line:
            words.append(line.replace('\n', ''))
            line = fp.readline()
    stopper = StopFilter(stoplist=frozenset(words))
    tokens = stopper(tokenizer(query))
    for t in tokens:
        t.text = t.text.lower()  # converting to lower case
        s = stem(t.text)  # stemming
        if len(s) > 2:
            return_list.append(s)
    return return_list
def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1):
    return ChineseTokenizer() | StopFilter(stoplist=stoplist, minsize=minsize)
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)
    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(
            search_text, all_full)
        # For each search, if the word already exists in the searched-words
        # table its counter is incremented by 1; otherwise it is inserted with
        # a count of 1. If the search input is a phrase rather than a single
        # word, Stop and Stemming filters are applied and the keywords
        # (tokens) are extracted.
        """
        "Stop" words are words that are so common it's often counter-productive
        to index them, such as "and", "or", "if", etc. The provided
        analysis.StopFilter lets you filter out stop words, and includes a
        default list of common stop words.
        Stemming is a heuristic process of removing suffixes (and sometimes
        prefixes) from words to arrive (hopefully, most of the time) at the
        base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(
                set([
                    token.text for token in a_filters(
                        analyzer(search_text, no_morph=True))
                ]))
        else:
            keywords = [search_text]
        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full

    # Update the most-searched words, the total number of searches and the
    # number of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs

    # Pagination of the thesis list
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)
    the_data = {
        'tesis_list':
        render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    return JsonResponse(the_data)
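# Standalone, hypothetical illustration of the keyword-extraction step described
# in the docstring above: Spanish analysis with morphology disabled, then
# StopFilter and StemFilter applied to the token stream. The query string is
# made up.
from whoosh.analysis import LanguageAnalyzer, StopFilter, StemFilter

es_analyzer = LanguageAnalyzer("es")
es_filters = StopFilter() | StemFilter()
keywords = {t.text for t in es_filters(es_analyzer("modelos de redes neuronales",
                                                   no_morph=True))}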
def remove_stop_words(str):
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    r = []
    for token in analyzer(str):
        r.append(token.text)
    return r


#---------------------------------------------------------------------
# variables
arqs = []
categorias = ("contos", "critica", "cronica", "miscelanea", "poesia",
              "romance", "teatro", "traducao")
stop_words = get_stop_words('portuguese')
analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="portuguese")
index_path = "whoosh_index"

print(
    "------------------------------------------------------------------------------"
)
print(
    "----------------------------buscador da vava----------------------------------"
)
print(
    "------------------------------------------------------------------------------\n\n\n"
)

# Schema
# create it with the content only; the details are split out later
schema = Schema(caminho=TEXT, titulo=TEXT(stored=True), ano=TEXT,
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":
            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")
            my_analyzer = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | Lemmatizer())
            pos_tagger = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                          | PosTagger())
            wordnetsyn1 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets())
            wordnetsyn2 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets1())
            wordnetsyn3 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets2())
            wordnetsyn4 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets3())
            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))
            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()
            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()
            print_index_details(ix)
            print("\n\n Index created with various features as its fields")
        elif option == "2":
            ix = index.open_dir("index_task3")
            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")
        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
        imageURL text, price numeric, rating numeric, noOfReviews numeric,
        savings numeric, percentageSavings numeric, productDesc text,
        reviewPolarity numeric, countryOfOrigin text, overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()

# does stemming, removes accents so you can match words like cafe, facade etc.,
# and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)
class CustomFuzzyTerm(FuzzyTerm):
    """
    Custom FuzzyTerm query parser to set a custom maxdist
    """
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        # Note: the boost and maxdist arguments are ignored here; boost 1.0
        # and an edit distance of 2 are always passed to FuzzyTerm.
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)


logger = logging.getLogger("indexer" + __name__)

##==========================={Index-Schema}=====================================
chfilter = CharsetFilter(accent_map)
stoplist = stoplists["en"].union(stoplists["ru"])
analyzer = (RegexTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist) | chfilter)

# Define the schema
keywordType = KEYWORD(lowercase=True, scorable=True)


def add_fields(schema):
    """
    * -------------{Function}---------------
    * Add dynamic fields so each document can index its fields in
    * the same Whoosh index
    * -------------{returns}----------------
    * Whoosh Schema . . .
    * -------------{params}-----------------
    * : whoosh.fields.Schema
    """
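# Hedged usage sketch (not from the original file): the usual way to wire the
# CustomFuzzyTerm class defined above into query parsing is through
# QueryParser's termclass argument; the field name and the None schema below
# are placeholders.
from whoosh.qparser import QueryParser

fuzzy_parser = QueryParser("content", schema=None, termclass=CustomFuzzyTerm)
fuzzy_query = fuzzy_parser.parse(u"analizer")  # matches terms within edit distance 2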
#!/usr/bin/python
#
# Classes to aid parsing of jobs from the Hive recent jobs page
#
# Usage: cat saved.html | hive_job_parser.py
#
from bs4 import BeautifulSoup
import re, sys, datetime

from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, DATETIME
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

QUERY_ANALYZER = RegexTokenizer(r'\w+|\d+') | LowercaseFilter() | StopFilter()


class HiveJobListing(SchemaClass):
    '''Class to store the details associated with each Hive job'''
    job_url = ID(stored=True)
    title = TEXT(stored=True, analyzer=QUERY_ANALYZER)
    owner = KEYWORD(stored=True)
    completion_time = DATETIME(stored=True)
    query = TEXT(stored=True, analyzer=QUERY_ANALYZER)

    def __init__(self):
        self.job_url = None
        self.title = None
        self.owner = None
        self.completion_time = None
        self.query = None

    def __str__(self):
def query_thread(queue, database, g_minus_d, e1_type, e2_type, index):
    idx = open_dir(index)
    regex_tokenize = re.compile(r'\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    stopper = StopFilter()
    count = 0
    with idx.searcher() as searcher:
        while True:
            r = queue.get_nowait()
            count += 1
            if count % 25000 == 0:
                print multiprocessing.current_process(), count, queue.qsize()

            if len(database[(r.ent1, r.ent2)]) == 0:
                # if it is not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                terms = list()
                for token in stopper(
                        tokenizer(r.between.decode("utf8"), renumber=True)):
                    terms.append(query.Term("sentence", token.text))

                t1 = query.Term("sentence", entity1)
                t3 = query.Term("sentence", entity2)
                query_terms = list()
                query_terms.append(t1)
                for t in terms:
                    query_terms.append(t)
                query_terms.append(t3)
                q1 = spans.SpanNear2(query_terms, slop=2, ordered=True)
                q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True)
                entities_r = searcher.search(q1)
                entities = searcher.search(q2)

                try:
                    assert not len(entities_r) > len(entities)
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print query_terms
                    print [t1, t3]

                if len(entities) > 0:
                    pmi = float(len(entities_r)) / float(len(entities))
                    if pmi >= 0.5:
                        g_minus_d.append(r)

            if queue.empty() is True:
                break
#!/usr/bin/env python
import os

from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT
from whoosh.analysis import LowercaseFilter, RegexTokenizer, StopFilter

# Analyzers used by the schema
my_analizer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="es")

# Schema in which the title and ID are stored
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                num_noticia=TEXT(stored=True),
                doc=TEXT(stored=True),
                content=TEXT)  # (analyzer=my_analizer)

# Name of the directory where the index will be stored
idir = "index_dir"

# Create the directory where the index is stored
if not os.path.exists(idir):
    os.mkdir(idir)
ix = create_in(idir, schema)

# The writer will add the index entries
writer = ix.writer()

# Files to add
nomF = os.listdir("./enero")
for filename in nomF:
    f = open("./enero/" + filename, mode='r')
    f = str(f.read()).split("<DOC>")
import os
import sys

from docx import Document
import PyPDF2
from bs4 import BeautifulSoup
from whoosh import fields, index
from whoosh.analysis import SimpleAnalyzer, StopFilter

sys.path.append(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir,
                 os.pardir))
from searchEngine.seconfig import SearchEngineConfig
import time

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True),
                              path=fields.ID(stored=True, unique=True),
                              content=fields.TEXT(analyzer=SimpleAnalyzer()
                                                  | StopFilter(),
                                                  stored=False))
FILE_INDEXED_LIST = []


# Creates a list of all the files in the lookup directory
def list_all_files():
    file_name_list = []
    for path, subdirs, files in os.walk(
            SearchEngineConfig.DOCUMENT_LOOKUP_DIRECTORY):
        for name in files:
            extension = os.path.splitext(name)[1]
            if extension in SearchEngineConfig.SUPPORTED_EXTENSIONS:
                file_name_list.append(str(os.path.join(path, name)))
    return file_name_list