# NOTE(review): this fragment was whitespace-mangled (all statements collapsed onto
# one physical line). Statement order and tokens are preserved; indentation is
# reconstructed from the `if __name__` / `with` headers — confirm against the
# original file.
# Tail of a merge step: join the two halves of a term's posting block and write
# "term=block" to the merged index file. `d_block_2`, `mfw`, and `term` are
# defined in the enclosing (out-of-view) scope — TODO confirm.
d_block_1 = d_block_1.rstrip('\n')
d_block = d_block_1 + "," + d_block_2
mfw.write(term + "=" + d_block)

# step 1: go through all the documents in batches of 1000 and find the tuples and unique terms
if __name__ == "__main__":
    startTime = datetime.now()  # wall-clock start for timing the indexing run
    # load the stopword list as one space-separated string
    stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
    with open(stopwords_path, 'r') as sf:
        stopList = sf.read().replace('\n', ' ')
    #print stopList
    # the stemmer object
    ps = PorterStemmer()
    # input collection directory and output directory for the tuple files
    path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/ap89_collection/"
    tupleFilePath = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/indexStemStop/tuples/"
    fileCount = 0
    uniqueTerms = {}  # term bookkeeping — exact value type not visible here; TODO confirm
    documents = {}    # per-document bookkeeping — TODO confirm use in the truncated remainder
    # SGML-style tags that delimit documents in the AP89 collection files
    doc_close_tag = "</DOC>"
    doc_id_tag = "<DOCNO>"
    text_open_tag = "<TEXT>"
    text_close_tag = "</TEXT>"
    text = ""            # accumulator for the current document's text
    doc_count = 0
    uniqueTermCount = 0
    totalCF = 0          # running total collection frequency across all terms
# NOTE(review): whitespace-mangled fragment; indentation reconstructed, tokens
# unchanged — confirm against the original file.
# Tail of a (truncated, out-of-view) loop mapping internal doc numbers to DOCNO
# ids; `doc`, `doc_num`, and `doc_name_id_map` come from that loop — TODO confirm.
doc_id = doc.split()[1]
doc_name_id_map[doc_num] = doc_id

# read per-document lengths (one "<doc_id> <length>" pair per line) and total them
sum_doc_len = 0
doc_len_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/document_length.txt"
doc_len_map = {}
with open(doc_len_file, 'r') as dl:
    for dl_line in iter(dl):
        doc_id = dl_line.split()[0]
        doc_len = dl_line.split()[1]
        # NOTE(review): length is stored as a str here, but a sibling script
        # stores int(doc_len) — downstream arithmetic must convert; confirm.
        doc_len_map[doc_id] = doc_len
        sum_doc_len += int(doc_len)
# average document length over the collection; `no_of_docs` is defined outside
# this view — TODO confirm it is set before this point
avg_len_d = Decimal(sum_doc_len) / Decimal(no_of_docs)

# the stemmer object
ps = PorterStemmer()

# converting the query terms to their stems and removing the stopwords from the queries
query_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/query_desc.51-100.short.txt"
stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
with open(stopwords_path, 'r') as sf:
    stopList = sf.read().replace('\n', ' ')
with open(query_file, 'r') as f:
    lines = f.readlines()
queries = { }  # map to store the comma separated stemmed query words against the query number
word_dfw_map = { }  # map that stores the term and the no of docs that have the term i.e. dfw
term_id_tf_map = { }  # map TF(w,d) that stores id_tf_map(doc_id,TF) for each query_word
# NOTE(review): whitespace-mangled fragment; indentation reconstructed, tokens
# unchanged; the trailing `for`/`if` is truncated mid-body in this view.
# load per-document lengths as ints; `doc_len_file` and `doc_len_map` are
# defined in the out-of-view portion above — TODO confirm
with open(doc_len_file, 'r') as dl:
    for dl_line in iter(dl):
        doc_id = dl_line.split()[0]
        doc_len = dl_line.split()[1]
        doc_len_map[doc_id] = int(doc_len)

# DEFINING CONSTANTS
C = 1500  # smoothing constant for the retrieval model — exact model not visible here; TODO confirm
# read vocabulary size V from the index parameter file (tab-separated "key\tvalue" line)
indexParamsFile = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/IndexNoStopStemmed/indexParams.txt"
ipf = open(indexParamsFile,'r')  # NOTE(review): handle is never closed — should use `with`
param = ipf.readline()
params = param.split("\t")
V = int(params[1])

# the stemmer object
ps = PorterStemmer();

# converting the query terms to their stems and removing the stopwords from the queries
query_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/query_desc.51-100.short.txt"
stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
with open(stopwords_path, 'r') as sf:
    stopList = sf.read().replace('\n', ' ')
with open(query_file, 'r') as f:
    lines = f.readlines()
queries = {}  # map to store the comma separated stemmed query words against the query number
term_doc_position_map = {}
doc_list = []
# process each non-blank query line (loop body continues beyond this fragment)
for query in lines:
    if not query == "\n":