Example No. 1
def createSearchableData(directory, load_path):
    '''
    Schema definition: question and response fields, both stored and indexed
    with a stemming, accent-folding and n-gram analyzer.
    '''
    # The StemmingAnalyzer call had to be changed inside the Whoosh package to
    # support the Portuguese language.
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(
        minsize=2, maxsize=4)
    schema = Schema(question=TEXT(analyzer=my_analyzer, stored=True),
                    response=TEXT(analyzer=my_analyzer, stored=True))
    # schema = Schema(question=TEXT(stored=True), response=TEXT(stored=True))
    schema.cachesize = -1

    if not os.path.exists(directory):
        # makedirs creates the directory along with any missing parent directories
        os.makedirs(directory)

    # Create the index and a writer to add documents according to the schema
    ix = create_in(directory, schema)
    writer = ix.writer(limitmb=1024)

    with open(load_path) as subtles_file:
        subtles_corpus = subtles_file.read().splitlines()

    for i in range(0, len(subtles_corpus), 2):
        writer.add_document(question=subtles_corpus[i],
                            response=subtles_corpus[i + 1])
    writer.commit()
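
A hedged alternative to patching the Whoosh sources for Portuguese: Whoosh's StemFilter accepts a lang argument (assuming the installed version ships a Portuguese Snowball stemmer), so an equivalent analyzer can be assembled without editing the package. This is a sketch, not the code used above:

from whoosh.analysis import (CharsetFilter, LowercaseFilter, NgramFilter,
                             RegexTokenizer, StemFilter)
from whoosh.support.charset import accent_map

# tokenize, lowercase, stem in Portuguese, fold accents, then emit 2-4 char n-grams
portuguese_analyzer = (RegexTokenizer()
                       | LowercaseFilter()
                       | StemFilter(lang="pt")
                       | CharsetFilter(accent_map)
                       | NgramFilter(minsize=2, maxsize=4))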
Example No. 2
def get_whoosh_index(force_create=False):
  from whoosh.index import create_in, exists_in, open_dir
  from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
  from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
  from whoosh.support.charset import accent_map

  analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
  ngramAnalyzer = NgramWordAnalyzer(minsize=2, maxsize=4)

  schema = Schema(
    title     = TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0), 
    abstract  = TEXT(analyzer=analyzer, stored=True, field_boost=2.0), 
    path      = ID(unique=True, stored=True), 
    authors   = TEXT(analyzer=analyzer, sortable=True, field_boost=1.5), 
    content   = TEXT(analyzer=analyzer, stored=True), 
    tags      = KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True), 
    status    = KEYWORD,
    classname = KEYWORD,
    typeahead = TEXT(spelling=True, stored=True, phrase=False)
  )
    
  if not os.path.exists(settings.WHOOSH_ROOT):
    os.mkdir(settings.WHOOSH_ROOT)
  
  if not exists_in(settings.WHOOSH_ROOT) or force_create:
    index = create_in(settings.WHOOSH_ROOT, schema)
  else:
    index = open_dir(settings.WHOOSH_ROOT)
  return index
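
A minimal usage sketch of the index returned above; the helper name and the document dict with its keys are illustrative, while update_document and the unique path field come from the schema defined in this example:

def add_or_update_document(doc):
  # `doc` is a hypothetical dict; update_document replaces any existing
  # entry that shares the same unique `path`
  ix = get_whoosh_index()
  writer = ix.writer()
  writer.update_document(
    title=doc['title'],
    abstract=doc.get('abstract', ''),
    path=doc['path'],
    content=doc.get('content', ''),
    typeahead=doc['title'],
  )
  writer.commit()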
Example No. 3
def createSearchableData():
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer() | CharsetFilter(charmap)

    schema = Schema(title=TEXT(stored=True, field_boost=3.0),
                    ID=ID(stored=True, unique=True),
                    url=TEXT(stored=True),
                    textdata=TEXT(stored=True, analyzer=custom_analyzers, field_boost=0.8))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = create_in("indexdir",schema)
    writer = ix.writer()

    path = os.path.relpath("/dump/dump_grande.xml", start="/")
    root = ET.parse(path)
    xml_data = {}
    for item in root.iter():
        if item.tag == 'root':
            continue
        elif item.tag == 'row' and len(xml_data) > 0:
            writer.add_document(title=xml_data['title'], ID=xml_data['id'], url=xml_data['url'], textdata=xml_data['text'])
            xml_data = {}
        else:
            xml_data[item.tag] = item.text

    writer.commit()
Example No. 4
    def __init__(self, **kwargs):
        super(WhooshEngine, self).__init__()

        analyzer = (StemmingAnalyzer()
                    | CharsetFilter(accent_map)
                    | NgramFilter(minsize=4, maxsize=10))
        self.schema = Schema(id=ID(stored=True),
                             title=TEXT(stored=True,
                                        field_boost=5.0,
                                        analyzer=analyzer),
                             firstname=TEXT(stored=True,
                                            field_boost=2.0,
                                            analyzer=analyzer),
                             lastname=TEXT(stored=True,
                                           field_boost=2.0,
                                           analyzer=analyzer),
                             type=ID(stored=True),
                             description=TEXT(stored=True, analyzer=analyzer),
                             creators=TEXT(stored=False, analyzer=analyzer),
                             tags=TEXT(stored=False, analyzer=analyzer),
                             business_unit=TEXT(stored=False,
                                                analyzer=analyzer),
                             position=TEXT(stored=False, analyzer=analyzer),
                             competencies=TEXT(stored=False,
                                               analyzer=analyzer),
                             text=TEXT(stored=True, analyzer=analyzer))

        self.dir = kwargs['dir']
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        try:
            self._index = open_dir(self.dir)
        except EmptyIndexError:
            self._index = create_in(self.dir, self.schema)
Example No. 5
    def build_schema(self, fields):
        schema = super(FoldingWhooshSearchBackend, self).build_schema(fields)

        for name, field in schema[1].items():
            if isinstance(field, (TEXT, NGRAM, NGRAMWORDS)):
                field.analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=2, maxsize=15)

        return schema
Example No. 6
def analyze(text):
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

    tokens = my_analyzer(text.strip())

    words = [token.text for token in tokens]

    return words
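
A small usage sketch (the input string is illustrative): the analyzer tokenizes, lowercases, drops English stopwords, applies English stemming and folds accented characters to ASCII:

if __name__ == "__main__":
    # accented words come back as plain-ASCII, stemmed tokens
    print(analyze("Les cafés de Paris"))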
Example No. 7
def normalize_name(name):
    stem = True
    letters = list(name)
    
    # if the name is spelled out as "w o r k o u t" / "w.o.r.k.o.u.t" / "w*o*r*k*o*u*t",
    # join the letters back together
    if len(letters) > 4:
        if len(set(letters[0::2])) == 1:
            name = "".join(letters[1::2])
        elif len(set(letters[1::2])) == 1:
            name = "".join(letters[0::2])

    # if there is an '&' not surrounded by spaces, skip the stemming step (e.g. 'r&b')
    if "&" in letters:
        position = letters.index("&")
        if 0 < position < len(letters) - 1:
            if letters[position - 1] != ' ' and letters[position + 1] != ' ':
                stem = False
    
      
    # if a 'k' is preceded by a '2', replace it with '0' (e.g. '2k17' -> '2017')
    if "k" in letters and '2' in letters:
        positions = [i for i, letter in enumerate(letters) if letter == 'k']
        for pos in positions:
            if 0 < pos < len(letters) - 1:
                if letters[pos - 1] == '2':
                    letters[pos] = '0'
                    name = "".join(letters)
           
    # proceed to stem
    if stem:
        my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
        tokens = my_analyzer(name)
        words = [token.text for token in tokens]

        # if the result is empty, leave the name alone; otherwise join the
        # stemmed words back into a single string
        if len(words) != 0:
            letters = list(" ".join(words))
    # otherwise apply a softer normalisation: lowercase and strip punctuation
    else:
        name = name.lower()
        name = re.sub(r"[.,'\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        letters = list(name)

    # if the last character is repeated at the end, keep only one occurrence
    last = letters[-1]
    if last in ascii_letters and len(letters) > 1:
        while letters[-2] == last:
            letters.pop(-2)
            if len(letters) == 1:
                break

    return ''.join(letters)
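
A few illustrative calls (hypothetical inputs) covering the cases handled above:

normalize_name("w o r k o u t")  # spelled-out letters are joined back together
normalize_name("r&b")            # '&' inside a word skips the stemming step
normalize_name("2k17")           # '2k' is rewritten to '20'
normalize_name("rockkk")         # trailing repeated letters collapse to one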
Example No. 8
def _get_schema():
    analyzer = StemmingAnalyzer() | CharsetFilter(
        accent_map
    )  # WARN: stemming is english specific; character folding is for western languages
    schema = Schema(
        code=ID(unique=True, stored=True),
        slug=ID(unique=False, stored=True),
        title=TEXT(analyzer=analyzer, stored=True),
        content=TEXT(analyzer=analyzer),
    )
    return schema
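
A minimal indexing sketch built on the schema above; the directory name and the document values are illustrative, not part of the original code:

import os

from whoosh import index


def build_index(index_dir="search_index"):
    os.makedirs(index_dir, exist_ok=True)
    ix = index.create_in(index_dir, _get_schema())
    writer = ix.writer()
    writer.add_document(code="doc-001", slug="getting-started",
                        title="Getting started",
                        content="How to install and configure the project.")
    writer.commit()
    return ix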
Example No. 9
def main(args):
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

    for filename in glob.glob(args.dir + "/*.txt"):
        with open(filename, 'r') as readfile:
            infile = readfile.readlines()
            label = ("__label__" + os.path.splitext(os.path.basename(readfile.name))[0] + " ") if args.add_label else '';
            for line in infile:
                tokens = my_analyzer(line.strip())
                words = [token.text for token in tokens]

                #print(line.strip())
                print(label + ' '.join(words))
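
A possible command-line wrapper for main(); the argument names are inferred from args.dir and args.add_label above, and the help texts are illustrative:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Tokenize every .txt file in a directory, one output line per input line")
    parser.add_argument("--dir", required=True,
                        help="directory containing the .txt files")
    parser.add_argument("--add-label", dest="add_label", action="store_true",
                        help="prefix each line with __label__<filename>")
    main(parser.parse_args())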
Example No. 10
    def __init__(self):

        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
                   StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each document can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Creates the index folder and the Whoosh index files if they don't
        # exist, and loads the index in either case
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}
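
A hedged sketch of how a document could be indexed from another method of the same class; the method name, the field values, and the dynamic field names (matched by the *_string/*_date/*_number/*_boolean globs above) are illustrative:

    def index_example_document(self):
        import datetime

        writer = self.index.writer()
        # docId is the unique key, so update_document also works as an upsert
        writer.update_document(docId="note-42",
                               docType="note",
                               content="Quarterly report draft",
                               tags="finance draft",
                               author_string="Jane Doe",
                               created_date=datetime.datetime.now(),
                               pages_number=12,
                               archived_boolean=False)
        writer.commit()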
Example No. 11
def processText(text):
    """ Questo metodo si occupa di processare il testo prima di inserirlo nell'index.
        Nello specifico, elimina i caratteri di punteggiatura e scarta le parole lunghe solo una lettera.
        Inoltre, elimina anche le stopwords, esegue lo stemming delle parole e normalizza
        le lettere accentate ed altre lettere in testo appartenente all'ASCII
        :rtype: list
    """

    #(, filterStopwords=False, stemming=False, normalizeAccents=False, minLength=1)

    # tokenization
    # tokens = nltk.wordpunct_tokenize(text)

    # tokenizer = RegexTokenizer()

    # if stemming:
    # 	if filterStopwords:
    # 		analyzer = StemmingAnalyzer()
    # 	else:
    # 		analyzer = StemmingAnalyzer(stoplist=None)
    # else:
    # 	if filterStopwords:
    # 		analyzer = StandardAnalyzer()
    # 	else:
    # 		analyzer = StandardAnalyzer(stoplist=None)
    # if normalizeAccents:

    analyzer = StemmingAnalyzer() | CharsetFilter(charmap)  # charmap: an accent-folding map (e.g. accent_map)

    # Remove stopwords and punctuation
    processedText = []
    for token in analyzer(text):
        tokenText = token.text.translate(
            str.maketrans('', '', string.punctuation))
        if len(tokenText) > 1:
            processedText.append(tokenText)
    return processedText
Example No. 12
def create_table(index_dir, *, overwrite=False):
    analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(label=TEXT(stored=True, analyzer=analyzer, lang='fr'),
                    rome=TEXT(stored=True, sortable=True),
                    source=KEYWORD(stored=True, sortable=True),
                    slug=STORED)

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    elif exists_in(index_dir):
        if not overwrite:
            logger.critical(
                'An index already exists in %s; overwrite flag not set; abandoning',
                index_dir)
            raise RuntimeError('Index already exists')
        logger.warning('Index already found, deleting %s to start anew',
                       index_dir)
        shutil.rmtree(index_dir, ignore_errors=True, onerror=None)

        os.mkdir(index_dir)

    logger.info('Whoosh index %s ready for use', index_dir)
    create_in(index_dir, schema)
    return index_dir
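
A follow-up sketch showing how the table created above might be filled; the rows iterable and its keys are hypothetical, while open_dir and the writer context manager are standard Whoosh calls:

from whoosh.index import open_dir


def fill_table(index_dir, rows):
    ix = open_dir(index_dir)
    with ix.writer() as writer:  # commits automatically when the block exits
        for row in rows:
            writer.add_document(label=row['label'], rome=row['rome'],
                                source=row['source'], slug=row['slug'])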
Example No. 13
    def get_whoosh_field_type(cls, field, sortable=False):
        '''
        Defines Whoosh field types used to define the schemas.
        See get_field_infos().
        '''

        # see http://pythonhosted.org/Whoosh/api/analysis.html#analyzers
        # see JIRA 165

        from whoosh.fields import TEXT, ID, NUMERIC, BOOLEAN
        # TODO: shall we use stop words? e.g. 'A and B' won't work?
        from whoosh.analysis import SimpleAnalyzer, StandardAnalyzer, StemmingAnalyzer, CharsetFilter, RegexTokenizer
        from whoosh.support.charset import accent_map
        # ID: as is
        # SimpleAnalyzer: break into lowercase terms, ignores punctuation
        # StandardAnalyzer: + stop words + minsize=2
        # StemmingAnalyzer: + stemming
        # minsize=1 because we want to search for 'Scribe 2'

        field_type = field['type']
        if field_type == 'id':
            # An ID (e.g. 708-AB)
            # EXACT search only
            analyzer = None
            if field.get('multivalued', False):
                analyzer = RegexTokenizer(r'\|', gaps=True)
            ret = ID(stored=True, sortable=sortable, analyzer=analyzer)
        elif field_type in ['int']:
            ret = NUMERIC(sortable=sortable)
        elif field_type in ['code']:
            # A code (e.g. K. 402, Royal 7.C.xii)
            # Accepts partial but exact search (e.g. royal)
            # See JIRA 358
            # | is NECESSARY for multivalued fields
            ret = TEXT(analyzer=SimpleAnalyzer(r'[/.\s()\u2013\u2014|-]',
                                               True),
                       stored=True,
                       sortable=sortable)
        elif field_type == 'title':
            # A title (e.g. British Library)
            # Accepts variants and partial search (e.g. 'libraries')
            ret = TEXT(analyzer=StemmingAnalyzer(minsize=1, stoplist=None)
                       | CharsetFilter(accent_map),
                       stored=True,
                       sortable=sortable)
        elif field_type == 'short_text':
            # A few words.
            ret = TEXT(analyzer=StemmingAnalyzer(minsize=2)
                       | CharsetFilter(accent_map),
                       stored=True,
                       sortable=sortable)
        elif field_type == 'xml':
            # plain text derived from XML document
            ret = TEXT(analyzer=StemmingAnalyzer(minsize=2)
                       | CharsetFilter(accent_map),
                       stored=True,
                       sortable=sortable)
        elif field_type == 'boolean':
            # 0|1
            ret = NUMERIC(stored=True, sortable=sortable)
        else:
            # A paragraph or more.
            ret = TEXT(analyzer=StemmingAnalyzer(minsize=2)
                       | CharsetFilter(accent_map),
                       stored=True,
                       sortable=sortable)

        return ret
Example No. 14
from whoosh.analysis import CharsetFilter, LowercaseFilter, NgramFilter, \
    PathTokenizer, RegexTokenizer
from whoosh.fields import DATETIME, ID, KEYWORD, NUMERIC, TEXT, FieldType, \
    SchemaClass
from whoosh.formats import Existence
from whoosh.support.charset import accent_map

from abilian.core.models.subjects import Group, User
from abilian.core.util import noproxy
from abilian.services.security.models import Anonymous, Role

#: A Whoosh analyzer that splits on word boundaries and folds accents and case.
accent_folder = (
    RegexTokenizer(r'\w+') |  # the default pattern doesn't split on '.'
    LowercaseFilter() | CharsetFilter(accent_map))

#: Analyzer for edge-ngrams, from 2 to 6 characters long
edge_ngram = accent_folder | NgramFilter(minsize=2, maxsize=6, at='start')
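
# A rough illustration (not from the original module): for the token "Université",
# accent_folder yields "universite", and edge_ngram should then emit the prefixes
# "un", "uni", "univ", "unive", "univer".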


def EdgeNgramField():
    return TEXT(stored=False, analyzer=edge_ngram)


class _DefaultSearchSchema(SchemaClass):
    """General search schema."""
    object_key = ID(stored=True, unique=True)
    id = NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=False)
    object_type = ID(stored=True, unique=False)
    creator = ID(stored=True)
Example No. 15
from whoosh.fields import Schema, STORED, TEXT, ID
import os.path
from whoosh.index import create_in
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map

# For example, to add an accent-folding filter to a stemming analyzer:
my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
schema = Schema(title=TEXT(analyzer=my_analyzer, spelling=True),
                titleStemmed=TEXT(analyzer=my_analyzer),
                content=TEXT(analyzer=my_analyzer, spelling=True),
                contentStemmed=TEXT(analyzer=my_analyzer),
                nid=ID(stored=True))

if not os.path.exists("index/index"):
    os.mkdir("index/index")
ix = create_in("index/index", schema)
Example No. 16
def create_index(index_dir):
    schema = Schema(book_abbr=STORED(),
                    book_name=STORED(),
                    book_tree=STORED(),
                    book_kindle=STORED(),
                    short=STORED(),
                    long=STORED(),
                    key_terms=STORED(),
                    key_terms_content=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, STOP_WORDS) | CharsetFilter(accent_map)),
                    book=ID(stored=True),
                    heading=TEXT(stored=True, analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                    session=TEXT(stored=True, analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                    date=DATETIME(stored=True, sortable=True),
                    exact=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    stemmed=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re) | CharsetFilter(accent_map)),
                    common=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    )

    ix = index.create_in(index_dir, schema)

    writer = ix.writer()
    for book in Books.indexed:
        with open("books/{}.txt".format(book['abbr']), encoding='utf-8') as f:
            text = pre_process_book(book, f.read())
        text = re.search(book['book_re'], text, flags=re.DOTALL).group(1)

        d = {
            'book_name': book['name'],
            'book_abbr': book['abbr'],
            'book_tree': book['tree'],
            'book_kindle': book['kindle'],
            'book': book['abbr'].lower(),
        }

        i = 0
        heading_tiers = [{'short': '', 'long': ''} for _ in range(3)]
        carry_over_heading = None
        headings = list(filter(None, book['headings_re'].split(text)[1:]))
        for (__heading, _content) in zip(headings[::2], headings[1::2]):
            content = __heading + _content
            if carry_over_heading:
                content = carry_over_heading + content
                carry_over_heading = None

            heading = clean_heading(__heading)
            if 'heading_replacements' in book:
                for (pattern, repl) in book['heading_replacements']:
                    heading = pattern.sub(repl, heading, 1)

            update_heading_tiers(book, heading_tiers, heading)

            has_content = re.search(r'[a-z]', _content)
            if not has_content:
                carry_over_heading = content
                continue

            add_document(writer, d, heading_tiers, content)
            i += 1
        print(i)

    writer.commit()
    return ix
Example No. 17
        if tier_re['begin'] and re.search(tier_re['begin'], short_heading, flags=re.IGNORECASE):
            short, long = heading.split('\n') if '\n' in heading else (heading, '')
            tiers[tier_idx] = {'short': title(short), 'long': title(long)}

        if tier_re['end'] and re.search(tier_re['end'], short_heading, flags=re.IGNORECASE):
            tiers[tier_idx] = {'short': '', 'long': ''}


# letters allowed, optionally interspersed with periods or asterisks, can't end with a number
# if it's only numbers then it's fine to end with a number
# term can't be adjacent to mid-line double asterisks
# (remember that our pre-processing fixed *hello ho**w are you* to *hello how are you* already, so legitimate ones are safe)
analyzer_re = re.compile(r'(?<![^\n]\*\*)\b(\w+([.*]?\w+)*(?<![0-9])|[0-9]+([.*]?[0-9]+)*)\b(?!\*\*[^\n])', re.UNICODE)
search_schema = Schema(book=ID(),
                       heading=TEXT(analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                       session=TEXT(analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                       date=DATETIME(),
                       exact=TEXT(analyzer=StandardAnalyzer(stoplist=None) | CharsetFilter(accent_map)),
                       stemmed=TEXT(analyzer=StemmingAnalyzer() | CharsetFilter(accent_map)),
                       common=TEXT(analyzer=StemmingAnalyzer(stoplist=None) | CharsetFilter(accent_map)),
                       )


def create_index(index_dir):
    schema = Schema(book_abbr=STORED(),
                    book_name=STORED(),
                    book_tree=STORED(),
                    book_kindle=STORED(),
                    short=STORED(),
                    long=STORED(),
Example No. 18
from whoosh.analysis import StandardAnalyzer, CharsetFilter
from whoosh.fields import *
from whoosh.support.charset import accent_map

INDEX_DIR = 'indexdir'
PAGE_IDX_NAME = 'page_idx'
MAIN_LANGS = ['cs', 'fi', 'sk']

# Tokenize and lowercase the input and remove accents
analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
# Whoosh index schema - store ID and original page title, analyze title using above analyzer
page_schema = Schema(id=NUMERIC(stored=True), title=TEXT(stored=True, analyzer=analyzer, spelling=True))

one_index_schema = Schema(original_title=TEXT(stored=True, analyzer=analyzer, spelling=True),
                          source_lang=KEYWORD,
                          target_lang=KEYWORD,
                          translated=TEXT(stored=True))
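
A minimal sketch of building the page index with page_schema; the pages iterable is hypothetical, while create_in's indexname argument and the writer context manager are standard Whoosh API:

import os

from whoosh import index


def create_page_index(pages):
    os.makedirs(INDEX_DIR, exist_ok=True)
    ix = index.create_in(INDEX_DIR, page_schema, indexname=PAGE_IDX_NAME)
    with ix.writer() as writer:
        for page_id, title in pages:
            writer.add_document(id=page_id, title=title)
    return ix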


def measure_execution_time(enabled):
    def execution_time(func):
        def wrapper(*args, **kwargs):
            start = None
            if enabled:
                start = datetime.datetime.now()
            result = func(*args, **kwargs)
            if start:
                print(f'{func.__name__}() took {datetime.datetime.now() - start}')
            return result

        wrapper.__doc__ = func.__doc__
Example No. 19
def indexer():
    charmap = charset_table_to_dict(default_charset)
    my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) | StopFilter()

    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True,
                                 analyzer=my_analyzer,
                                 spelling=True),
                    data=STORED,
                    tags=KEYWORD(stored=True),
                    extension=TEXT(stored=True))

    if not os.path.exists("everywhere"):
        os.mkdir("everywhere")
    if not os.path.exists("pdf"):
        os.mkdir("pdf")
    if not os.path.exists("doc"):
        os.mkdir("doc")
    if not os.path.exists("tar"):
        os.mkdir("tar")
    if not os.path.exists("jpg"):
        os.mkdir("jpg")
    if not os.path.exists("forms"):
        os.mkdir("forms")

    i_a = index.create_in("everywhere", schema)
    writer_a = i_a.writer()

    i_b = index.create_in("pdf", schema)
    writer_b = i_b.writer()

    i_c = index.create_in("doc", schema)
    writer_c = i_c.writer()

    i_d = index.create_in("tar", schema)
    writer_d = i_d.writer()

    i_e = index.create_in("jpg", schema)
    writer_e = i_e.writer()

    i_f = index.create_in("forms", schema)
    writer_f = i_f.writer()

    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    specific = [
        '.jpg', '.exe', '.pdf', '.doc', '.zip', '.xls', 'pptx', 'docx', 'r.gz',
        '.iso', 'jpeg', '.gif', '.png'
    ]
    ignore = ['calendar', 'events', 'mailto']
    with open('intranet/crawled.txt', "r") as fp:
        num = 0
        for line in fp:
            num = num + 1
            print("Extracting link" + str(num))
            line = line.replace('\n', '')
            #if line[-4: ] not in specific:
            if all(item not in line.lower() for item in ignore):
                try:
                    if all(item not in line.lower() for item in specific):
                        print(line)
                        html = urlopen(line)
                        soup = BeautifulSoup(html, "html.parser")
                        #soup = BeautifulSoup(html, "html.parser")
                        for script in soup(["script", "style"]):
                            script.extract()

                        try:
                            heading = soup.title.string
                        except AttributeError:
                            heading = "line"
                    #print (str(heading))

                        try:
                            content = soup.body.get_text()

                        except AttributeError:
                            content = ""

                        tags = ""
                        try:
                            for h in soup.findAll(
                                ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']):
                                tags = tags + " " + h.string
                        except:
                            pass
                    else:
                        #pattern = re.compile('[\W_]+')
                        heading = line
                        #heading = pattern.sub(' ',heading)
                        #re.sub(r'[\W_]+','', heading)
                        #heading = heading.split()
                        content = line.split()
                        tags = ""

                    title = str(heading)
                    #print (title)
                    tags = str(tags)
                    content = str(content)
                    #print ("content")
                    url = str(line)
                    extension = str(line[-4:])
                    writer_a.add_document(url=url,
                                          title=title,
                                          data=content,
                                          content=content,
                                          tags=tags,
                                          extension=extension)
                    if "pdf" in line.lower():
                        writer_b.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to pdf")
                    elif (".doc"
                          in line.lower()) or (".ppt" in line.lower()) or (
                              ".xls" in line.lower()) or (
                                  "docx" in line.lower()) or (".ppt"
                                                              in line.lower()):
                        writer_c.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to doc")
                    elif (".exe"
                          in line.lower()) or (".iso" in line.lower()) or (
                              ".zip" in line.lower()) or ("r.gz"
                                                          in line.lower()):
                        writer_d.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to tar")
                    elif (".jpeg"
                          in line.lower()) or (".jpg" in line.lower()) or (
                              ".gif" in line.lower()) or (".png"
                                                          in line.lower()):
                        writer_e.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to jpg")
                    elif "form" in line.lower():
                        writer_f.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to form")
                    else:
                        print("adding to everywhere")

                    #writer_a.add_document(url=url, title=title, data=content, content=content, tags=tags)
                    print("added To whoosh")
                except urllib.error.HTTPError:
                    print("HTTP Error")
                    #	test = "True"
                except (ConnectionResetError, urllib.error.URLError):
                    print("Connection Reset Fail")
            else:
                print("ignored this url")
        writer_a.commit()
        writer_b.commit()
        writer_c.commit()
        writer_d.commit()
        writer_e.commit()
        writer_f.commit()
Example No. 20
def generateCombos(wordList):
    comboSet = set()
    formedSoFar = None
    for word in wordList:
        if formedSoFar:
            formedSoFar = (formedSoFar[0] + word[0], formedSoFar[1]
                           and word[1])
        else:
            formedSoFar = word
        comboSet.add(formedSoFar)
        comboSet.update(generateCombos(wordList[1:]))
    return comboSet


textAnalyzer = RegexTokenizer() | CharsetFilter(accent_map)
keywordAnalyzer = SpaceSeparatedTokenizer() | CharsetFilter(accent_map)

schema = Schema(
    ayah=STORED,
    simple_ayah=TEXT(stored=True, analyzer=textAnalyzer),
    surah_num=NUMERIC(stored=True),
    ayah_num=NUMERIC(stored=True),
    roots=KEYWORD(scorable=True, analyzer=keywordAnalyzer),
    decomposed_ayah=KEYWORD(scorable=True, analyzer=keywordAnalyzer),
    surah_name_ar=STORED,
    surah_name_en=STORED,
)

if not os.path.exists("whooshdir"):
    os.mkdir("whooshdir")