def createSearchableData():
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer() | CharsetFilter(charmap)
    schema = Schema(title=TEXT(stored=True, field_boost=3.0),
                    ID=ID(stored=True, unique=True),
                    url=TEXT(stored=True),
                    textdata=TEXT(stored=True, analyzer=custom_analyzers, field_boost=0.8))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    writer = ix.writer()

    path = os.path.relpath("/dump/dump_grande.xml", start="/")
    root = ET.parse(path)
    xml_data = {}
    for item in root.iter():
        if item.tag == 'root':
            continue  # skip the document root element
        elif item.tag == 'row' and len(xml_data) > 0:
            # a new <row> starts: index the fields collected for the previous row
            writer.add_document(title=xml_data['title'], ID=xml_data['id'],
                                url=xml_data['url'], textdata=xml_data['text'])
            xml_data = {}
        else:
            xml_data[item.tag] = item.text
    if xml_data:
        # index the fields collected for the last row
        writer.add_document(title=xml_data['title'], ID=xml_data['id'],
                            url=xml_data['url'], textdata=xml_data['text'])
    writer.commit()
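# Not part of the original snippet: a minimal sketch showing what the
# StemmingAnalyzer | CharsetFilter chain used above does to accented input.
# The sample string is made up; exact stems depend on Whoosh's English stemmer.
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import charset_table_to_dict, default_charset

_charmap = charset_table_to_dict(default_charset)
_analyzer = StemmingAnalyzer() | CharsetFilter(_charmap)
print([t.text for t in _analyzer("Città università")])
# accented characters are folded to plain ASCII, e.g. "citta", "universita"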
def make_search_service(search_text):
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer()
    index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
    myindex = open_dir(index_path)
    qp = MultifieldParser(["title", "textdata"], schema=myindex.schema,
                          group=AndGroup,
                          fieldboosts={'title': 3.0, 'textdata': 0.8})
    qstring = search_text
    q = qp.parse(qstring)
    results_list = []
    myWeighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5),
                                         textdata=scoring.Frequency(),
                                         title=scoring.BM25F(title_B=2.0))
    with myindex.searcher(weighting=myWeighting) as s:
        results = s.search(q, limit=30, terms=True)

        # "did you mean" / "showing results for" suggestions
        corrected = s.correct_query(q, qstring)
        did_you_mean = ''
        result_for = ''
        if corrected.query != q:
            if len(results) < 1:
                results = s.search(qp.parse(corrected.string), limit=30, terms=True)
                result_for = corrected.string
            else:
                did_you_mean = corrected.string

        # query expansion with the key terms of the top documents
        keywords = [keyword for keyword, score
                    in results.key_terms("textdata", docs=3, numterms=5)]
        if keywords:
            query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
            results_keyword = s.search(query_keyword, limit=30, terms=True)
            results.upgrade_and_extend(results_keyword)

        # sorting by score
        key_sort = lambda result: result.score
        results = sorted(results, key=key_sort, reverse=True)

        for ris in results:
            result = {}
            result['title'] = ris['title']
            result['url'] = ris['url']
            result['id'] = ris['ID']
            result['highlight'] = ris.highlights("textdata")
            results_list.append(result)

    # used to compute precision and recall
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
        'search_text': search_text,
        'results': results_list,
        'did_you_mean': did_you_mean,
        'result_for': result_for,
        'results_ids': id_results
    }
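# Hypothetical usage sketch (the calling code is not part of the original
# snippet); it only consumes the dict returned by make_search_service above.
response = make_search_service("example query")
for hit in response['results']:
    print(hit['title'], hit['url'])
if response['did_you_mean']:
    print("Did you mean:", response['did_you_mean'])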
def test_charset_pickeability():
    from pickle import dumps  # needed for the dumps() calls below
    from whoosh import analysis
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
def MemopolAnal():
    # fold accented characters using the table built from the default charset
    charmap = charset_table_to_dict(default_charset)
    return anal.RegexTokenizer(r"\w+") | anal.LowercaseFilter() | anal.CharsetFilter(charmap)
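# Hypothetical usage sketch (not from the original source): the analyzer is
# typically attached to a TEXT field in a Whoosh schema; the field name
# "content" is an assumption.
from whoosh.fields import Schema, TEXT
memopol_schema = Schema(content=TEXT(analyzer=MemopolAnal()))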
#!/usr/bin/env python
import urllib
from urlparse import urlparse
import os.path

from whoosh.support.charset import charset_table_to_dict, default_charset
from genshi.input import XMLParser, START, TEXT, END

__author__ = "mlecarme"

no_accent = charset_table_to_dict(default_charset)

"""
[TODO] indexing artworks
"""


class ItunesParser:
    """
    Event based iTunes XML parser
    """

    def __init__(self, path):
        self.stream = XMLParser(open(path, 'r'))

    def __iter__(self):
        ouvrant = None
        indentation = 0
        tracks = False
        valeur = False
        piste = {}
        albums = set()
        artists = set()
        for kind, data, pos in self.stream:
def indexer():
    charmap = charset_table_to_dict(default_charset)
    my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) | StopFilter()
    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=my_analyzer, spelling=True),
                    data=STORED,
                    tags=KEYWORD(stored=True),
                    extension=TEXT(stored=True))

    # one index directory per document category
    if not os.path.exists("everywhere"):
        os.mkdir("everywhere")
    if not os.path.exists("pdf"):
        os.mkdir("pdf")
    if not os.path.exists("doc"):
        os.mkdir("doc")
    if not os.path.exists("tar"):
        os.mkdir("tar")
    if not os.path.exists("jpg"):
        os.mkdir("jpg")
    if not os.path.exists("forms"):
        os.mkdir("forms")

    i_a = index.create_in("everywhere", schema)
    writer_a = i_a.writer()
    i_b = index.create_in("pdf", schema)
    writer_b = i_b.writer()
    i_c = index.create_in("doc", schema)
    writer_c = i_c.writer()
    i_d = index.create_in("tar", schema)
    writer_d = i_d.writer()
    i_e = index.create_in("jpg", schema)
    writer_e = i_e.writer()
    i_f = index.create_in("forms", schema)
    writer_f = i_f.writer()

    # skip certificate checks when fetching pages
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    specific = ['.jpg', '.exe', '.pdf', '.doc', '.zip', '.xls', 'pptx',
                'docx', 'r.gz', '.iso', 'jpeg', '.gif', '.png']
    ignore = ['calendar', 'events', 'mailto']

    with open('intranet/crawled.txt', "r") as fp:
        num = 0
        for line in fp:
            num = num + 1
            print("Extracting link " + str(num))
            line = line.replace('\n', '')
            #if line[-4: ] not in specific:
            if all(item not in line.lower() for item in ignore):
                try:
                    if all(item not in line.lower() for item in specific):
                        # HTML page: fetch it and extract title, body text and headings
                        print(line)
                        html = urlopen(line)
                        soup = BeautifulSoup(html, "html.parser")
                        for script in soup(["script", "style"]):
                            script.extract()
                        try:
                            heading = soup.title.string
                        except AttributeError:
                            heading = line  # fall back to the URL itself as the title
                        #print (str(heading))
                        try:
                            content = soup.body.get_text()
                        except AttributeError:
                            content = ""
                        tags = ""
                        try:
                            for h in soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']):
                                tags = tags + " " + h.string
                        except:
                            pass
                    else:
                        # binary file: index the URL itself instead of its contents
                        #pattern = re.compile('[\W_]+')
                        heading = line
                        #heading = pattern.sub(' ',heading)
                        #re.sub(r'[\W_]+','', heading)
                        #heading = heading.split()
                        content = line.split()
                        tags = ""

                    title = str(heading)
                    #print (title)
                    tags = str(tags)
                    content = str(content)
                    #print ("content")
                    url = str(line)
                    extension = str(line[-4:])

                    # every document goes into the "everywhere" index ...
                    writer_a.add_document(url=url, title=title, data=content,
                                          content=content, tags=tags,
                                          extension=extension)
                    # ... and into one category index chosen by its extension
                    if "pdf" in line.lower():
                        writer_b.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to pdf")
                    elif (".doc" in line.lower()) or (".ppt" in line.lower()) or \
                            (".xls" in line.lower()) or ("docx" in line.lower()):
                        writer_c.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to doc")
                    elif (".exe" in line.lower()) or (".iso" in line.lower()) or \
                            (".zip" in line.lower()) or ("r.gz" in line.lower()):
                        writer_d.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to tar")
                    elif (".jpeg" in line.lower()) or (".jpg" in line.lower()) or \
                            (".gif" in line.lower()) or (".png" in line.lower()):
                        writer_e.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to jpg")
                    elif "form" in line.lower():
                        writer_f.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to form")
                    else:
                        print("adding to everywhere")
                        #writer_a.add_document(url=url, title=title, data=content, content=content, tags=tags)
                    print("added To whoosh")
                except urllib.error.HTTPError:
                    print("HTTP Error")
                    # test = "True"
                except (ConnectionResetError, urllib.error.URLError):
                    print("Connection Reset Fail")
            else:
                print("ignored this url")

    writer_a.commit()
    writer_b.commit()
    writer_c.commit()
    writer_d.commit()
    writer_e.commit()
    writer_f.commit()
import os
import json
import string

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.writing import AsyncWriter
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import default_charset, charset_table_to_dict

charmap = charset_table_to_dict(default_charset)


def processText(text):
    """
    Processes the text before it is inserted into the index.
    Specifically, it strips punctuation characters and discards words that
    are only one letter long. It also removes stopwords, stems the words and
    normalizes accented letters and other characters to plain ASCII text.

    :rtype: list
    """
    #(, filterStopwords=False, stemming=False, normalizeAccents=False, minLength=1)
    # tokenization
    # tokens = nltk.wordpunct_tokenize(text)
    # tokenizer = RegexTokenizer()
    # if stemming:
    #     if filterStopwords:
    #         analyzer = StemmingAnalyzer()
    #     else:
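# The snippet above breaks off inside the commented-out body. Below is a
# minimal sketch of what processText could look like, assuming the Whoosh
# analyzer chain hinted at by the docstring and the commented code; the
# helper name and the fixed pipeline are assumptions, not the original code.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter, StemFilter


def processText_sketch(text, minLength=1):
    analyzer = (RegexTokenizer()
                | LowercaseFilter()
                | StopFilter()               # drop stopwords
                | StemFilter()               # stem the remaining words
                | CharsetFilter(charmap))    # fold accented letters to ASCII
    # keep only tokens longer than minLength (discards one-letter words)
    return [t.text for t in analyzer(text) if len(t.text) > minLength]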