# Assumed module-level imports for the functions below; porter is the
# project-local Porter-stemmer wrapper, and NLTK's "punkt" tokenizer
# models must be installed for word_tokenize() to work.
import os
import nltk
import porter


def get_query():
    '''
    get_query()
    Helper function that reads in the requested query, either from
    audio input or from typed text.
    '''
    global Q
    choice = int(input("Audio input or text? 1 for audio, 2 for text: "))
    if choice == 1:
        s = audio_to_text()  # speech-to-text helper defined elsewhere
    else:
        s = input("Enter Query: Ctrl+C to exit\n")
    # Run the query through the same Porter stemmer used on the corpus.
    f = open('temp_porter.txt', "w")
    f.write(s + "\n")
    f.close()
    lis = porter.inputs(['temp_porter.txt'])
    lis = lis[0]
    Q = ""
    # Spell-correct the tokenized query; auto_correct() returns -1 on failure.
    lis = auto_correct(nltk.word_tokenize(lis))
    if lis == -1:
        return -1
    return lis
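# auto_correct() is defined elsewhere in the project. The sketch below is
# a guess at its behaviour, NOT the project's actual code: a Norvig-style
# one-edit spell corrector built from the word frequencies in big.txt
# (which this module writes out during indexing). The -1 failure path
# that get_query() checks for is omitted here.
import re
from collections import Counter

WORD_COUNTS = Counter(re.findall(r'[a-z]+', open('big.txt').read().lower()))

def _edits1(word):
    # All strings one delete/transpose/replace/insert away from `word`.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def auto_correct(tokens):
    corrected = []
    for t in tokens:
        w = t.lower()
        if w in WORD_COUNTS:
            corrected.append(w)
            continue
        # Fall back to the most frequent known word one edit away, if any.
        candidates = [e for e in _edits1(w) if e in WORD_COUNTS]
        corrected.append(max(candidates, key=WORD_COUNTS.get) if candidates else w)
    return corrected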
def fileread():
    '''
    fileread()
    Reads every file in the corpus and indexes it into the dictionary.
    Each document is stemmed with the Porter algorithm and split into a
    list of tokens using nltk.word_tokenize() (the "punkt" tokenizer).
    '''
    global doc_to_int, NO_OF_DOC
    filename = []
    lis_of_docs = os.listdir("docs/")
    len_of_corpus = 1  # actual corpus size = 50
    for i in lis_of_docs:
        filename.append("docs/" + str(i))
        doc_to_int[len_of_corpus] = str(i)  # map internal doc id -> file name
        len_of_corpus += 1
    lis = porter.inputs(filename)
    tokens = []
    n = 1
    for l in lis:
        # Prepend the document id so index() knows which document owns the tokens.
        tokens.append([n] + nltk.word_tokenize(l))
        n += 1
    NO_OF_DOC = len(tokens)
    for t in tokens:
        index(t)
    print("Done indexing\n")
    fo = open('big.txt', "w")  # spell-checker word list, written to later
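# index() is the other helper fileread() relies on; its definition is not
# shown in this section. A minimal sketch of what it is assumed to do:
# maintain a global inverted index mapping each stemmed term to the
# documents it occurs in, with per-document term frequencies.
from collections import defaultdict

inverted_index = defaultdict(dict)  # term -> {doc_id: term frequency}

def index(token_list):
    doc_id = token_list[0]  # fileread() stores the document id at position 0
    for term in token_list[1:]:
        postings = inverted_index[term]
        postings[doc_id] = postings.get(doc_id, 0) + 1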
# Text-only variant of get_query():
def get_query():
    '''
    get_query()
    Helper function that reads in the requested query as typed text.
    '''
    global Q
    s = input("Enter Query: Ctrl+C to exit\n")
    f = open('temp_porter.txt', "w")
    f.write(s + "\n")
    f.close()
    lis = porter.inputs(['temp_porter.txt'])
    lis = lis[0]
    Q = ""
    lis = auto_correct(nltk.word_tokenize(lis))
    if lis == -1:
        return -1
    return lis
# Variant of fileread() for a corpus of numbered files (1.txt .. 50.txt):
def fileread():
    '''
    fileread()
    Reads all the files in the corpus and indexes them into the dictionary.
    Each document is stemmed with the Porter algorithm and split into a
    list of tokens using nltk.word_tokenize() (the "punkt" tokenizer).
    '''
    global NO_OF_DOC
    filename = []
    len_of_corpus = 51  # actual corpus size = 50
    for i in range(1, len_of_corpus):
        filename.append(str(i) + ".txt")
    lis = porter.inputs(filename)
    tokens = []
    n = 1
    for l in lis:
        tokens.append([n] + nltk.word_tokenize(l))
        n += 1
    NO_OF_DOC = len(tokens)
    for t in tokens:
        index(t)
    print("Done indexing\n")
    fo = open('big.txt', "w")  # spell-checker word list, written to later
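# A hypothetical driver showing how the pieces above fit together: build
# the index once, then answer queries until the user interrupts with
# Ctrl+C. The ranking step is project code not shown in this section.
if __name__ == '__main__':
    fileread()
    while True:
        query = get_query()
        if query == -1:
            continue  # query could not be processed; ask again
        print(query)  # placeholder for the project's retrieval/ranking call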