示例#1
0
def get_query():
    '''
        get_query()
        Helper that reads one query from the user and prepares it for search.

        The user first picks the input mode: "1" reads the query through
        audio_to_text(), anything else reads a typed line.  The raw query is
        written to 'temp_porter.txt', run through porter.inputs(), tokenized
        with nltk.word_tokenize() and finally passed to auto_correct().

        Side effects: overwrites 'temp_porter.txt' in the current directory
        and resets the global Q to the empty string.

        Returns whatever auto_correct() returns.  The original code compared
        that value against -1 (presumably a failure sentinel -- confirm in
        auto_correct) and returned it unchanged either way, so it is simply
        passed through here.
    '''
    global Q
    # Single-argument parenthesised form prints identically under the
    # Python 2 print statement.
    print("Audio input or text..????  1 for audio  and 2 for text ")
    choice = raw_input()
    try:
        use_audio = int(choice) == 1
    except ValueError:
        # Any answer other than 1 already meant "text"; treat a non-numeric
        # answer the same way instead of aborting with a traceback.
        use_audio = False

    if use_audio:
        s = audio_to_text()
    else:
        s = raw_input("Enter Query:  Ctrl+C to exit\n")

    # porter.inputs() works on file names, so the query goes through a
    # scratch file.  `with` guarantees the handle is closed (and the data
    # flushed) even if the write fails.
    with open('temp_porter.txt', "w") as f:
        f.write(s + "\n")

    # One file in, so only the first entry of the result is relevant.
    lis = porter.inputs(['temp_porter.txt'])[0]
    Q = ""
    return auto_correct(nltk.word_tokenize(lis))
示例#2
0
def fileread():
    '''
    Index every entry found in the "docs/" directory.

    Each directory entry is given a 1-based document number, recorded in the
    global doc_to_int mapping (number -> entry name).  The "docs/<name>"
    paths are handed to porter.inputs(); every item it returns is tokenized
    with nltk.word_tokenize(), prefixed with its 1-based position and passed
    to index().  The number of token lists is stored in the global NO_OF_DOC.
    '''
    global doc_to_int, NO_OF_DOC

    paths = []
    for doc_id, entry in enumerate(os.listdir("docs/"), 1):
        name = str(entry)
        paths.append("docs/" + name)
        doc_to_int.update({doc_id: name})

    stemmed_docs = porter.inputs(paths)

    # Every token list starts with its document number, followed by the
    # tokens of that document.
    tokens = [[number] + nltk.word_tokenize(text)
              for number, text in enumerate(stemmed_docs, 1)]

    NO_OF_DOC = len(tokens)
    for doc_tokens in tokens:
        index(doc_tokens)
    print("Done indexing\n")
    # NOTE(review): 'big.txt' is opened for writing (which truncates it) but
    # the handle is never used or closed in the visible code -- confirm
    # whether the rest of this function was cut off.
    fo = open('big.txt', "w")
示例#3
0
def get_query():
    '''
        get_query()
        Helper that reads one typed query and prepares it for search.

        The raw query is written to 'temp_porter.txt', run through
        porter.inputs(), tokenized with nltk.word_tokenize() and finally
        passed to auto_correct().

        Side effects: overwrites 'temp_porter.txt' in the current directory
        and resets the global Q to the empty string.

        Returns whatever auto_correct() returns.  The original code compared
        that value against -1 (presumably a failure sentinel -- confirm in
        auto_correct) and returned it unchanged either way, so it is simply
        passed through here.
    '''
    global Q
    s = input("Enter Query:  Ctrl+C to exit\n")

    # porter.inputs() works on file names, so the query goes through a
    # scratch file.  `with` guarantees the handle is closed (and the data
    # flushed) even if the write fails.
    with open('temp_porter.txt', "w") as f:
        f.write(s + "\n")

    # One file in, so only the first entry of the result is relevant.
    lis = porter.inputs(['temp_porter.txt'])[0]
    Q = ""
    return auto_correct(nltk.word_tokenize(lis))
示例#4
0
def fileread():
    '''
    Read the corpus files and index them.

    Builds the file names "1.txt" .. "50.txt" and passes them to
    porter.inputs() (described by the original author as the Porter
    algorithm; the helper is defined outside this view).  Every item it
    returns is tokenized with nltk.word_tokenize() (the original notes refer
    to the "punkt" library), prefixed with its 1-based document number and
    passed to index().  The number of token lists is stored in the global
    NO_OF_DOC.
    '''
    filename=[]
    # Exclusive upper bound for range() below; the corpus itself has 50 files.
    len_of_corpus=51    #actual size = 50
    for i in range(1,len_of_corpus):
        filename.append(str(i)+".txt")
        #filename=['1.txt','2.txt']
    lis = porter.inputs(filename)
    # Each entry of `tokens` is [document number, token, token, ...].
    tokens=[]
    n=1
    #n=len(lis)
    for l in lis:
        tokens.append([n]+nltk.word_tokenize(l))
        n+=1
    #print(len(tokens))
    global NO_OF_DOC
    NO_OF_DOC= len(tokens)
    for t in tokens:
        index(t)
    print("Done indexing\n")
    # NOTE(review): 'big.txt' is opened for writing (which truncates it) but
    # the handle is not used or closed in the visible code -- the function
    # may continue beyond this point; confirm.
    fo=open('big.txt',"w")