import mmap
import re

import indexutils


def generate_snippet_array(filename, query):

    # ANSI escape codes used to highlight matched query terms in the terminal
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    snippet = []
    query_dict = {}
    # invert the query structure so each term maps to the operators
    # (e.g. PHRASE, NOT) that apply to it
    for key, value in query.iteritems():
        for ikey, ivalue in value.iteritems():
            query_dict.setdefault(ikey, {})[key] = ivalue


    with open(filename, "r+") as f:
        real_snippet=""
        # memory-map the file, size 0 means whole file
        map1 = mmap.mmap(f.fileno(), 0)
        # go to the beginning of the file and read its contents
        map1.seek(0)
        snippet=" ".join(indexutils.tokenize_string_without_punctuations(str(map1[:]).replace("\n"," ")))
        # extract the title from between the TITLE markers
        match = re.match(r'.*TITLE(.*)TITLE', snippet)
        title = match.group(1) if match else ""

        # for each query term, highlight its occurrences in the title and text
        for term in query_dict:
            # normalise whitespace in the term used for highlighting
            query = " ".join(term.strip().split())
            title = title.replace(query, OKBLUE + query + ENDC)
            for keys in query_dict[term].keys():
                if keys == "PHRASE":
                    query = query.strip('"')
                if keys != "NOT":

                    # highlight the term inside the TEXT section
                    match = re.match(r'.*TEXT(.*)TEXT', snippet)
                    if match:
                        text = match.group(1)
                        real_snippet = text.replace(query, OKBLUE + query + ENDC)
                else:
                    # NOT terms are not highlighted; keep the raw document text
                    match = re.match(r'.*TEXT(.*)TEXT', snippet)
                    if match:
                        text = match.group(1)
                        real_snippet = text

        map1.close()

        # print the title, then the snippet wrapped at roughly 15 words per line
        counter = 0
        final_snippet = ""
        print "TITLE:" + title + "\n"
        for snippet_word in real_snippet.split():
            counter = counter + 1
            final_snippet = final_snippet + " " + snippet_word
            if counter % 15 == 0:
                print final_snippet
                final_snippet = ""
        if final_snippet:
            print final_snippet
    return snippet
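A minimal usage sketch, assuming a document whose title and body are wrapped in TITLE ... TITLE and TEXT ... TEXT markers (as the regexes above expect) and a query structure keyed first by operator and then by term. The file name, query shape, and values are illustrative guesses, not taken from the source, and running it still requires the indexutils module used above.

if __name__ == "__main__":
    # hypothetical document laid out the way the regexes above expect
    with open("sample.txt", "w") as f:
        f.write("TITLE Information Retrieval Basics TITLE "
                "TEXT snippets are built from the matching document text TEXT")
    # hypothetical query structure: operator -> {term: value}; the real shape
    # of the query argument is not shown in this excerpt
    sample_query = {"OR": {"retrieval": 1}}
    generate_snippet_array("sample.txt", sample_query)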
Example #2
    def tokenize_document_text(self, documentContent):
        return indexutils.tokenize_string_without_punctuations(documentContent)

    def process_query_removing_special_characters(self, word):
        return indexutils.tokenize_string_without_punctuations(word)
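Both helpers delegate to indexutils.tokenize_string_without_punctuations, whose implementation is not part of this excerpt. A minimal sketch of one plausible version, assuming it simply lower-cases the input and returns the alphanumeric tokens with punctuation stripped:

import re

def tokenize_string_without_punctuations(text):
    # e.g. "Hello, world!" -> ["hello", "world"]; an assumed behaviour,
    # not the actual indexutils implementation
    return re.findall(r"[A-Za-z0-9]+", text.lower())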