def bag_of_words(sentence, words, show_details=True):
    # tokenize the pattern
    sentence_words = helpers.preprocess_text(sentence)
    # bag of words - vector with one slot per vocabulary word
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                # assign 1 if the current word occupies this vocabulary position
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)
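As a quick illustration of the vector this function returns, the self-contained sketch below exercises bag_of_words with a tiny hand-made vocabulary. The _FakeHelpers class is only a stand-in for the project's helpers module (assumed here to lower-case and tokenise), so the example can run on its own; it is not the real preprocessing code.

# Minimal sketch (assumes the bag_of_words definition above is in scope).
# _FakeHelpers replaces the project's helpers module purely for illustration.
import numpy as np


class _FakeHelpers:
    @staticmethod
    def preprocess_text(text):
        # naive tokenizer used only in this sketch
        return text.lower().split()


helpers = _FakeHelpers()

vocabulary = ['how', 'install', 'python', 'windows']
vector = bag_of_words("How do I install Python", vocabulary, show_details=False)
print(vector)  # -> [1 1 1 0]: a 1 for every vocabulary word present in the sentence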
intents = []
for (post_id, post) in dataset.items():
    child_titles = list(map(lambda child: child['Title'], post['Children']))
    child_titles.append(post['Title'])
    tag = post['Id']
    intents.append({
        'patterns': child_titles,
        'responses': [post['Id']],
        'tag': tag,
    })
    for title in child_titles:
        # Ensure set semantics so the unions below work regardless of what
        # preprocess_text returns
        title_tokens = set(helpers.preprocess_text(title))
        title_tokens = title_tokens.union(set(post['Tags']))
        title_tokens = title_tokens.union(set(post['BodyTokens']))
        # Add keywords to the set of words
        words.update(title_tokens)
        # Link the words to a tag
        documents.append((title_tokens, tag))
        # Add the tag to the list of classes
        classes.add(tag)
    counter += 1
    'inverted_index.pickle')

# Deserialize the documents and the inverted index from the output files
with open(docs_file, 'rb') as f:
    docs = pickle.load(f)
with open(inverted_index_file, 'rb') as f:
    inverted_index = pickle.load(f)

# Set of unique words in the collection
dictionary = set(inverted_index.keys())

# Get the query from the command line
query = sys.argv[1]

# Preprocess the query, keep only words that occur in the dictionary,
# and count the term frequencies
query = helpers.preprocess_text(query)
query = [word for word in query if word in dictionary]
query = Counter(query)

# Compute tf-idf weights for the query terms
for word, value in query.items():
    query[word] = inverted_index[word]['idf'] * (1 + math.log(value))

# Normalize the weights
helpers.normalize(query)

# Calculate the score of the query with respect to each document
scores = [[i, 0] for i in range(len(docs))]
for word, value in query.items():
    for doc in inverted_index[word]['postings_list']:
        index, weight = doc
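The listing is cut off at the unpacking of each posting. For orientation only, the sketch below shows how a ranking step of this kind typically continues: each posting's contribution is added to the score and the documents are sorted by that score. The (document index, weight) shape of the postings is inferred from the unpacking above; the continuation itself is an assumption, not the project's actual code.

# Hypothetical continuation (an assumption, for illustration only): accumulate
# each posting's contribution to the score, then rank documents by that score.
for word, value in query.items():
    for doc in inverted_index[word]['postings_list']:
        index, weight = doc
        scores[index][1] += value * weight  # query weight * document weight

# Highest-scoring documents first
scores.sort(key=lambda pair: pair[1], reverse=True)
for doc_index, score in scores[:10]:
    print(f"{score:.4f}\t{docs[doc_index]}")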
    title=TEXT(stored=True),
    tokens=KEYWORD(stored=True, commas=True, scorable=True))

if __name__ == "__main__":
    if os.path.exists("indexdir"):
        shutil.rmtree("indexdir")
    os.mkdir("indexdir")

    ix = index.create_in("indexdir", schema)
    writer = ix.writer()

    with open('../dataset.json') as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            terms = set(post['BodyTokens'])
            terms = terms.union(set(helpers.preprocess_text(post['Title'])))

            child_body_terms = reduce(
                lambda child_one, child_two: child_one.union(child_two),
                map(lambda child: set(child['BodyTokens']),
                    post['Children'][1:]))
            terms = terms.union(child_body_terms)

            child_title_terms = reduce(
                lambda child_one, child_two: child_one.union(child_two),
                map(lambda child: set(helpers.preprocess_text(child['Title'])),
                    post['Children']))
            terms = terms.union(child_title_terms)

            child_tags = reduce(
                lambda child_one, child_two: child_one.union(child_two),
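The listing above only builds the Whoosh index. As a hedged sketch of how such an index is usually queried, the snippet below opens the directory created above and searches the tokens field; the field names title and tokens come from the schema fragment, while the query text is purely illustrative.

# Illustrative sketch, not part of the project: open the Whoosh index built
# above and run a query against the 'tokens' field.
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")
with ix.searcher() as searcher:
    parser = QueryParser("tokens", schema=ix.schema)
    query = parser.parse("python install windows")
    for hit in searcher.search(query, limit=5):
        print(hit['title'])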
import json

import helpers

if __name__ == "__main__":
    with open("../dataset.json") as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            print(post['Title'])
            print(post['Children'][0]['Title'])
            print(' '.join(
                helpers.preprocess_text(post['Children'][0]['Title'])))
            print()
            # print(f"{response}, success: {response == row['Id']}")
            if success:
                successes += 1
            elif post_id in response:
                partial_success += 1
            else:
                print(
                    f"{response}, {test_title}, success: {response == post_id}"
                )
                for child in dataset[post_id]['Children']:
                    print(child['Title'], child['Tags'])
                print('-' * 10)
                for child in dataset[response[0]]['Children']:
                    print(child['Title'], child['Tags'])
                print()
        else:
            print(f"Failed to find response {post_id}: '{test_title}'")
            print(helpers.preprocess_text(test_title))

    print(f"{successes} successes")
    print(f"{successes / len(dataset)} success rate")
    print(f"{partial_success} partial successes")
    print(f"{partial_success / (matches - successes)} partial success rate")
    print(f"{matches} matches")
    print(f"{successes / matches} match success rate")
    print(f"{matches / len(dataset)} match rate")
print('Building Index')

# Parse the HTML markup and get all the documents
soup = helpers.parseCorpus()

# Create the data folder for the output files
if not os.path.exists(data_path):
    os.mkdir(data_path)

corpus = []
docs = []

# Count word frequencies per document and collect the titles of all docs
for doc_tag in soup.find_all('doc'):
    words = helpers.preprocess_text(doc_tag.text)
    bag_of_words = Counter(words)
    corpus.append(bag_of_words)
    docs.append(doc_tag["title"])

# Calculate the IDF of all words in the corpus
idf = helpers.compute_idf(corpus)

# Calculate and normalize the weights of all words in each document
for doc in corpus:
    helpers.compute_weights(idf, doc)
    helpers.normalize(doc)

# Build the inverted index of documents with idf and postings lists
inverted_index = helpers.build_inverted_index(idf, corpus)
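To make the hand-off between this build step and the query script above concrete, the snippet below shows the entry shape the query code appears to expect from build_inverted_index: an idf value plus a postings list of (document index, normalized weight) pairs. The structure is inferred from how the query script reads the index; the concrete numbers are made up for illustration.

# Assumed shape of one inverted-index entry, inferred from the lookups
# inverted_index[word]['idf'] and inverted_index[word]['postings_list']
# in the query script. The values below are illustrative only.
example_entry = {
    'idf': 1.73,
    'postings_list': [
        (0, 0.42),  # (document index into docs, normalized tf-idf weight)
        (5, 0.19),
    ],
}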