def ImprovedTokenizer():
    """
    Based on the whoosh RegexTokenizer. This is just a wrapper around the tokenizer's functionality.
    """
    chain = RegexTokenizer() | LowercaseFilter() | StopFilter(
        stoplist=STOP_WORDS, minsize=2)
    return chain
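A minimal usage sketch for the chain above (assuming STOP_WORDS is defined in the surrounding module; the sample sentence is made up): the returned analyzer is callable on a string and yields whoosh Token objects.

# Hedged usage example, not part of the original snippet.
analyzer = ImprovedTokenizer()
tokens = [t.text for t in analyzer("The Quick Brown Foxes are jumping")]
# tokens holds lowercased terms, with stop words and one-character tokens removed.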
Example #2
def text_treat(path):
    conto = {
        "titulo": "",
        "categoria": "",
        "texto": [],
        "ano": "",
        "full": [],
        "tokens": []
    }
    with open(path, 'r', encoding="ISO-8859-1") as arquivo:
        conto["full"] = arquivo.readlines()

    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    # example: Poesia, Americanas, 1875

    p = re.compile(r'[/.,]')
    inf = p.split(conto["full"][0])
    #inf = conto["full"][0].split(r"[/.,]")
    conto["categoria"] = inf[0]
    conto["titulo"] = inf[1]
    conto["ano"] = inf[2].replace("\n", "")

    for i in range(len(conto["full"])):
        conto["texto"].append(conto["full"][i].replace('\n', ''))

        # remove stop words
        for token in analyzer(conto["texto"][i]):
            conto["tokens"].append(token.text)
        #conto["tokens"] = remove_stop_words(conto["texto"][i])
    return conto
Example #3
def NERAnalyzer(
    ne_types=QA_NE_TYPES,
    expression=default_pattern,
    stoplist=QA_STOPWORDS,
    minsize=2,
    maxsize=None,
    gaps=False,
):
    """Named Entity centric version of StandardAnalyzer

    :param ne_types: list/set of named entities to keep
    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :return: analyzer to be used with a whoosh_utils index
    """
    chain = NERTokenizer(ne_types, expression, gaps)
    chain |= LowercaseFilter()
    chain |= StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)

    return chain
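A minimal usage sketch (hypothetical field name; assumes the module-level defaults QA_NE_TYPES, default_pattern and QA_STOPWORDS shown above are available):

# Hedged example: plug the NER-aware analyzer into a whoosh schema.
from whoosh.fields import Schema, TEXT

analyzer = NERAnalyzer()  # uses the defaults declared above
schema = Schema(content=TEXT(analyzer=analyzer, stored=True))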
Example #4
def get_schema():
    """ Return a schema used for indexing document """
    analyzer = MyVietnameseTokenizer() | LowercaseFilter() | StopFilter(get_stopword_list())
    return Schema(title=TEXT(analyzer=analyzer, stored=True, field_boost=1.5),
                  path=ID(unique=True, stored=True),
                  time=STORED,
                  content=TEXT(analyzer=analyzer, stored=True))
Example #5
def ChineseAnalyzer(stoplist=STOP_WORDS,
                    minsize=1,
                    stemfn=stem,
                    cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
Example #6
def GensimAnalyzer(stoplist=STOP_WORDS,
                   minsize=1,
                   stemfn=stem,
                   cachesize=50000):
    return (GensimTokenizer() | LowercaseFilter()
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
            | StopFilter(stoplist=stoplist, minsize=minsize))
def get_search_terms(text):
    '''
    Splits up a text into tokens, drops non-useful ones and returns a set of tokens

    @type   text: String
    @param  text: A unicode string to split up in tokens

    @rtype: Set of strings
    @return: A set of useful unique tokens appearing in the text
    '''

    stoplist = ['and', 'is', 'it', 'an', 'as', 'at', 'have', 'in', 'yet', 'if',
                'from', 'for', 'when', 'by', 'to', 'you', 'be', 'we', 'that', 'may',
                'not', 'with', 'tbd', 'a', 'on', 'your', 'this', 'of', 'us', 'will',
                'can', 'the', 'or', 'are', 'up', 'down', 'ip', ]

    analyzer = SpaceSeparatedTokenizer() | StopFilter(stoplist=stoplist)

    tokens = set([x.text for x in analyzer(text)])

    # TODO: When we go to whoosh 2.x we can drop the following and use a whoosh
    # SubstitutionFilter to the analyzer above
    tokens = set([re.sub(r'[()/]', '', x) for x in tokens])

    return tokens
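A hedged sketch of the whoosh SubstitutionFilter approach the TODO above refers to. The function name is hypothetical and the character class is an assumption carried over from the re.sub call.

# Hedged sketch only, not the original implementation.
from whoosh.analysis import SpaceSeparatedTokenizer, StopFilter, SubstitutionFilter

def get_search_terms_v2(text, stoplist):
    # Let the analyzer strip parentheses and slashes instead of a post-pass.
    analyzer = (SpaceSeparatedTokenizer()
                | StopFilter(stoplist=stoplist)
                | SubstitutionFilter(r"[()/]", ""))
    return set(t.text for t in analyzer(text))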
Example #8
def remove_stop_words(str):
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    r = []
    for token in analyzer(str):
        r.append(token.text)
    return r
Example #9
def create_analyzer():
    conf = config.get_config()
    # Build the chain incrementally; the filter order matches the original
    # branches: StopFilter, CharsetFilter, StemFilter, NgramFilter.
    analyzer = RegexTokenizer()
    if conf['STOPWORDS']:
        analyzer = analyzer | StopFilter()
    if conf['CHARACTERS_FOLDING']:
        analyzer = analyzer | CharsetFilter(accent_map)
    if conf['STEMMING']:
        analyzer = analyzer | StemFilter()
    if conf['QGRAMS']:
        analyzer = analyzer | NgramFilter(minsize=conf['QNUM_MIN'],
                                          maxsize=conf['QNUM_MAX'])
    log.print_debug(TAG, "Analyzer created")
    return analyzer
Example #10
    def __init__(self, index_path, names=None):
        self._analyzer = (SpaceSeparatedTokenizer() | LowercaseFilter()
                          | StopFilter(minsize=1, stoplist=stoplist)
                          | StemFilter())
        if index.exists_in(index_path):
            self._ix = index.open_dir(index_path)
        else:
            self.build_index(index_path, names)
        self._qp = QueryParser("title", self._ix.schema, plugins=[])
Example #11
    def schema(self):
        my_analyzer = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
        schema = Schema(
            h=TEXT(stored=True, analyzer=my_analyzer),
            gnx=ID(stored=True),
            b=TEXT(analyzer=my_analyzer),
            parent=ID(stored=True),
            doc=ID(stored=True),
        )
        return schema
Example #12
    def __init__(self, path, schema):
        self.analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
        self.schema = schema

        if not os.path.exists("index"):
            # make an index folder if one does not exist
            os.mkdir("index")
            index.create_in("index", self.schema)
        self.ix = index.open_dir("index")
Example #13
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP) | StopFilter(stoplist=STOP)
    schema = Schema(title=TEXT(analyzer=analyzer, stored=True, sortable=True),
                    content=TEXT(analyzer=analyzer, stored=True,
                                 sortable=True),
                    tags=KEYWORD(commas=True, stored=True),
                    author=TEXT(stored=True),
                    uid=ID(unique=True, stored=True),
                    lastedit_date=DATETIME(sortable=True, stored=True))
    return schema
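A minimal sketch of how the schema above might be used to create and populate an index. The directory name and field values are hypothetical.

# Hedged usage example, not part of the original snippet.
import os
from whoosh import index

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", get_schema())
writer = ix.writer()
writer.add_document(title="Hello", content="Hello world", uid="1")
writer.commit()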
Example #14
    def __determine_analyzer(self, mode):
        tokenizer = StanTokenizer()
        return {
            'normal':
                tokenizer | PunctuationFilter() | StanfordLemmatizerFilter()
                | LowercaseFilter(),
            'author topic modelling':
                tokenizer | PunctuationFilter() | StanfordLemmatizerFilter()
                | LowercaseFilter() | StopFilter(),
        }[mode]
Example #15
def CleanupStandardAnalyzer(expression=default_pattern,
                            stoplist=STOP_WORDS,
                            minsize=2,
                            maxsize=None,
                            gaps=False):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain
Example #16
    def build_schema(self, fields):
        schema = super(SpanishWhooshSearchBackend, self).build_schema(fields)
        stemmer_sp = SpanishStemmer()
        stemming_analyzer = StemmingAnalyzer(stemfn=stemmer_sp.stem)

        stop_filter = StopFilter(stoplist=STOPWORDS, minsize=2)

        for name, field in schema[1].items():
            if isinstance(field, TEXT):
                # field.analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
                field.analyzer = stemming_analyzer | stop_filter

        return schema
Example #17
    def __init__(self):

        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
                   StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each document can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Creates the index folder and Whoosh index files if they don't exist,
        # and loads the index in either case
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}
Example #18
def LemmatizingAnalyzer(stoplist=STOP_WORDS, minsize=2, maxsize=None):
    """
    Analizzatore che effettua tokenizzazione, lowercase, rimozione stopword e lemmatizzazione.
    
    :param stoplist: lista di stopword. E' possibile effettuare l'unione con altre un altra lista
    :param minsize: Parole più piccole di questo valore vengono eliminate
    :param maxsize: parole più grandi di questo valore vengono eliminate
    """
    ret = RegexTokenizer(expression=default_pattern, gaps=False)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | LemmatizerFilter()
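A short usage sketch (assumes the project's LemmatizerFilter, default_pattern and STOP_WORDS used above are importable; the sample sentence is made up):

# Hedged example: run the lemmatizing chain over a sample sentence.
analyzer = LemmatizingAnalyzer()
tokens = [t.text for t in analyzer("The cats were running across the gardens")]
# Expect lowercased, stop-word-free, lemmatized terms such as "cat", "run", "garden"
# (the exact output depends on the project's LemmatizerFilter).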

                
Example #19
def CleanupStemmingAnalyzer(expression=default_pattern,
                            stoplist=STOP_WORDS,
                            minsize=2,
                            maxsize=None,
                            gaps=False,
                            stemfn=stem,
                            ignore=None,
                            cachesize=50000):

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(
        stemfn=stemfn, ignore=ignore, cachesize=cachesize)
def queryIndex(query):
    tokenizer = RegexTokenizer()
    return_list = []

    # Removing stop words
    with open("../smartStopList.txt", "r") as fp:
        line = fp.readline()
        words = []
        while line:
            words.append(line.replace('\n', ''))
            line = fp.readline()

    stopper = StopFilter(stoplist=frozenset(words))
    tokens = stopper(tokenizer(query))

    for t in tokens:
        t.text = t.text.lower()  # Converting to lower case
        s = stem(t.text)  # stemming
        if len(s) > 2:
            return_list.append(s)
    return return_list
Example #21
def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1):
    return ChineseTokenizer() | StopFilter(stoplist=stoplist, minsize=minsize)
Example #22
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)

    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(
            search_text, all_full)

        # For each search, in the searched-words table: if the word already exists, its count is incremented by 1;
        # otherwise it is inserted with a value of 1.
        # If the search input is a phrase rather than a single word, Stop and Stemming filters are applied
        # and then the keywords/tokens are extracted.
        """
        “Stop” words are words that are so common it’s often counter-productive to index them, such as “and”, 
        “or”, “if”, etc. The provided analysis.StopFilter lets you filter out stop words, and includes a default 
        list of common stop words.
        Stemming is a heuristic process of removing suffixes (and sometimes prefixes) from words to arrive (hopefully, 
        most of the time) at the base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(
                set([
                    token.text for token in a_filters(
                        analyzer(search_text, no_morph=True))
                ]))
        else:
            keywords = [search_text]

        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full

    # Update the most-searched words
    # Update the total number of searches and the number of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs

    # Pagination of the thesis list
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)
    the_data = {
        'tesis_list': render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        # serializers.serialize("json", [x for x in total_full]),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    # the_data = serializers.serialize("json", [x for x in total_full])
    return JsonResponse(the_data)
Example #23
def remove_stop_words(str):
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(
        lang="portuguese")
    r = []
    for token in analyzer(str):
        r.append(token.text)
    return r


#---------------------------------------------------------------------

# variables
arqs = []
categorias = ("contos", "critica", "cronica", "miscelanea", "poesia",
              "romance", "teatro", "traducao")
stop_words = get_stop_words('portuguese')
analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="portuguese")
index_path = "whoosh_index"
print(
    "------------------------------------------------------------------------------"
)
print(
    "----------------------------buscador da vava----------------------------------"
)
print(
    "------------------------------------------------------------------------------\n\n\n"
)
# Schema
# create it with content only; the information is separated later
schema = Schema(caminho=TEXT,
                titulo=TEXT(stored=True),
                ano=TEXT,
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":

            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | Lemmatizer())
            pos_tagger = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                          | PosTagger())
            wordnetsyn1 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets())
            wordnetsyn2 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets1())
            wordnetsyn3 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets2())
            wordnetsyn4 = (RegexTokenizer() | StopFilter() | LowercaseFilter()
                           | WordNetSynsets3())

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()

            print_index_details(ix)

            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")

            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
Example #25
          imageURL text,
          price numeric,
          rating numeric,
          noOfReviews numeric,
          savings numeric,
          percentageSavings numeric,
          productDesc text,
          reviewPolarity numeric,
          countryOfOrigin text,
          overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()
# does stemming, removes accents (so you can match words like cafe, facade, etc.) and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)
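A short, hedged illustration of what the analyzer above does; the sample string is made up.

# Hedged example: run the combined analyzer chain over a made-up snippet.
sample = "The façade of the café was reviewed by customers"
print([t.text for t in hsn_analyzer(sample)])
# Stop words ("the", "of", "was", "by") are dropped, the remaining terms are
# stemmed, and accents are folded (café -> cafe).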
Example #26
class CustomFuzzyTerm(FuzzyTerm):
    """
    Custom FuzzyTerm query parser to set a custom maxdist
    """
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)  # boost/maxdist arguments are ignored; maxdist is fixed at 2


logger = logging.getLogger("indexer" + __name__)

##==========================={Index-Schema}=====================================

chfilter = CharsetFilter(accent_map)
stoplist = stoplists["en"].union(stoplists["ru"])
analyzer = (RegexTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist) | chfilter)

# Define the schema
keywordType = KEYWORD(lowercase=True, scorable=True)


def add_fields(schema):
    """
    * -------------{Function}---------------
    * Add dynamic fields so each document can index its fields in
    * the same Whoosh index
    * -------------{returns}----------------
    * Whoosh Schema . . . 
    * -------------{params}-----------------
    * : whoosh.fields.Schema
    """
Example #27
#!/usr/bin/python
#
# Classes to aid parsing of jobs from the Hive recent jobs page
#
# Usage: cat saved.html | hive_job_parser.py
#

from bs4 import BeautifulSoup
import re, sys, datetime
from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, DATETIME
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

QUERY_ANALYZER = RegexTokenizer(r'\w+|\d+') | LowercaseFilter() | StopFilter()

class HiveJobListing(SchemaClass):
  '''Class to store the details associated with each Hive job'''

  job_url = ID(stored=True)
  title = TEXT(stored=True,analyzer=QUERY_ANALYZER)
  owner = KEYWORD(stored=True)
  completion_time = DATETIME(stored=True)
  query = TEXT(stored=True,analyzer=QUERY_ANALYZER)

  def __init__(self):
    self.job_url = None
    self.title = None
    self.owner = None
    self.completion_time = None
    self.query = None

  def __str__(self):
Example #28
def query_thread(queue, database, g_minus_d, e1_type, e2_type, index):
    idx = open_dir(index)
    regex_tokenize = re.compile(r'\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    stopper = StopFilter()
    count = 0

    with idx.searcher() as searcher:
        while True:
            r = queue.get_nowait()
            count += 1
            if count % 25000 == 0:
                print multiprocessing.current_process(), count, queue.qsize()

            if len(database[(r.ent1, r.ent2)]) == 0:
                # if it's not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                terms = list()
                for token in stopper(
                        tokenizer((r.between.decode("utf8")), renumber=True)):
                    terms.append(query.Term("sentence", token.text))

                #print terms
                t1 = query.Term("sentence", entity1)
                t3 = query.Term("sentence", entity2)

                query_terms = list()
                query_terms.append(t1)
                for t in terms:
                    query_terms.append(t)
                query_terms.append(t3)

                q1 = spans.SpanNear2(query_terms, slop=2, ordered=True)
                q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True)
                entities_r = searcher.search(q1)
                entities = searcher.search(q2)
                """
                print query_terms, len(entities_r)
                print [t1, t3], len(entities)
                print "\n"
                """

                #print entity1, '\t', r.between, '\t', entity2, len(entities_r), len(entities)

                try:
                    assert not len(entities_r) > len(entities)
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print query_terms
                    print [t1, t3]

                if len(entities) > 0:
                    pmi = float(len(entities_r)) / float(len(entities))
                    if pmi >= 0.5:
                        #print entity1, '\t', r.between, '\t', entity2, pmi
                        g_minus_d.append(r)

                if queue.empty() is True:
                    break
Example #29
#!/usr/bin/env python

import os

from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT
from whoosh.analysis import LowercaseFilter, RegexTokenizer, StopFilter

# Analyzers the schema will use
my_analizer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="es")
# Schema in which the title and ID are stored
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                num_noticia=TEXT(stored=True),
                doc=TEXT(stored=True),
                content=TEXT)  #(analyzer=my_analizer))
# Name of the directory where the index will be stored
idir = "index_dir"
# Create the directory where the index is stored
if not os.path.exists(idir):
    os.mkdir(idir)
ix = create_in(idir, schema)

# The writer will add the documents to the index
writer = ix.writer()

# Files to add
nomF = os.listdir("./enero")
for filename in nomF:
    f = open("./enero/" + filename, mode='r')
    f = str(f.read()).split("<DOC>")
Example #30
from docx import Document
import PyPDF2
from bs4 import BeautifulSoup
from whoosh import fields, index
from whoosh.analysis import SimpleAnalyzer, StopFilter

sys.path.append(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir,
                 os.pardir))
from searchEngine.seconfig import SearchEngineConfig
import time

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True),
                              path=fields.ID(stored=True, unique=True),
                              content=fields.TEXT(analyzer=SimpleAnalyzer()
                                                  | StopFilter(),
                                                  stored=False))
FILE_INDEXED_LIST = []


# Creates a list of all the files in the lookup directory
def list_all_files():
    file_name_list = []
    for path, subdirs, files in os.walk(
            SearchEngineConfig.DOCUMENT_LOOKUP_DIRECTORY):
        for name in files:
            extension = os.path.splitext(name)[1]
            if extension in SearchEngineConfig.SUPPORTED_EXTENSIONS:
                file_name_list.append(str(os.path.join(path, name)))
    return file_name_list