Example #1
import numpy as np

import helpers


def bag_of_words(sentence, words, show_details=True):
    # tokenize the pattern
    sentence_words = helpers.preprocess_text(sentence)
    # bag of words: vector with one slot per vocabulary word
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                # mark this position with 1 if the word is in the vocabulary
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)
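A minimal usage sketch (the vocabulary and sentence below are invented, and the exact output depends on what helpers.preprocess_text returns):

# Hypothetical vocabulary and sentence, for illustration only
vocabulary = ["install", "python", "windows", "error", "import"]
vector = bag_of_words("How do I install Python?", vocabulary, show_details=False)
# vector might be array([1, 1, 0, 0, 0]), depending on how
# helpers.preprocess_text tokenizes and normalizes the sentence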
Example #2
    intents = []
    for (post_id, post) in dataset.items():
        # Patterns for the intent: all child titles plus the post's own title
        child_titles = list(map(lambda child: child['Title'],
                                post['Children']))
        child_titles.append(post['Title'])

        tag = post['Id']

        intents.append({
            'patterns': child_titles,
            'responses': [post['Id']],
            'tag': tag,
        })

        for title in child_titles:
            title_tokens = helpers.preprocess_text(title)

            # Enrich the title tokens with the post's tags and body tokens
            title_tokens = title_tokens.union(set(post['Tags']))
            title_tokens = title_tokens.union(set(post['BodyTokens']))

            # Add keywords to set of words
            words.update(title_tokens)

            # Link the tokens to the tag
            documents.append((title_tokens, tag))

            # Add the tag to the set of classes
            classes.add(tag)

        counter += 1
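For orientation, a hypothetical dataset entry (all field values invented) and the intent record the loop above would append for it:

# Hypothetical input post, invented for illustration
post = {
    'Id': '42',
    'Title': 'How to parse JSON in Python?',
    'Tags': ['python', 'json'],
    'BodyTokens': ['parse', 'json', 'python'],
    'Children': [{'Title': 'Parsing a JSON string with the json module'}],
}

# Intent record the loop appends for this post:
# {
#     'patterns': ['Parsing a JSON string with the json module',
#                  'How to parse JSON in Python?'],
#     'responses': ['42'],
#     'tag': '42',
# }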
Example #3
                                   'inverted_index.pickle')

# Deserialize the documents and the inverted index from the pickled files
with open(docs_file, 'rb') as f:
    docs = pickle.load(f)
with open(inverted_index_file, 'rb') as f:
    inverted_index = pickle.load(f)

# Build the dictionary of unique words from the inverted index
dictionary = set(inverted_index.keys())

# Get query from command line
query = sys.argv[1]

# Preprocess the query, keep only words found in the dictionary, and count word frequencies
query = helpers.preprocess_text(query)
query = [word for word in query if word in dictionary]
query = Counter(query)

# Compute weights for words in query
for word, value in query.items():
    query[word] = inverted_index[word]['idf'] * (1 + math.log(value))

# Normalize the weights
helpers.normalize(query)

# Calculate the score of the query with respect to each document
scores = [[i, 0] for i in range(len(docs))]
for word, value in query.items():
    for doc in inverted_index[word]['postings_list']:
        index, weight = doc
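The snippet breaks off inside the scoring loop; below is a small self-contained sketch of the same accumulate-and-rank step with a toy query and inverted index (all values invented for illustration):

# Toy data, invented for illustration: normalized query weights and an inverted
# index mapping each word to {'idf': ..., 'postings_list': [(doc_index, weight), ...]}
docs = ['doc-a', 'doc-b']
query = {'python': 0.8, 'json': 0.6}
inverted_index = {
    'python': {'idf': 1.2, 'postings_list': [(0, 0.9), (1, 0.1)]},
    'json': {'idf': 0.7, 'postings_list': [(1, 0.8)]},
}

# Accumulate the dot product of query and document weights per document
scores = [[i, 0] for i in range(len(docs))]
for word, value in query.items():
    for index, weight in inverted_index[word]['postings_list']:
        scores[index][1] += value * weight

# Rank documents by score, best match first
scores.sort(key=lambda pair: pair[1], reverse=True)
for index, score in scores:
    print(docs[index], round(score, 3))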
Example #4
                title=TEXT(stored=True),
                tokens=KEYWORD(stored=True, commas=True, scorable=True))

if __name__ == "__main__":
    if os.path.exists("indexdir"):
        shutil.rmtree("indexdir")

    os.mkdir("indexdir")

    ix = index.create_in("indexdir", schema)
    writer = ix.writer()

    with open('../dataset.json') as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            # Collect terms from the post body and its preprocessed title
            terms = set(post['BodyTokens'])
            terms = terms.union(set(helpers.preprocess_text(post['Title'])))

            # Merge in the body tokens of all child posts except the first
            child_body_terms = reduce(
                lambda child_one, child_two: child_one.union(child_two),
                map(lambda child: set(child['BodyTokens']),
                    post['Children'][1:]))
            terms = terms.union(child_body_terms)

            # Merge in the preprocessed titles of all child posts
            child_title_terms = reduce(
                lambda child_one, child_two: child_one.union(child_two),
                map(lambda child: set(helpers.preprocess_text(child['Title'])),
                    post['Children']))
            terms = terms.union(child_title_terms)

            # Merge in the tags of all child posts
            child_tags = reduce(
                lambda child_one, child_two: child_one.union(child_two),
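The indexing snippet is cut off before documents are written; the following is a self-contained toy sketch of the same create/write/commit/search cycle with Whoosh (the schema mirrors the fields visible above, the id field is an assumption, and the document values and query string are invented):

import os
import shutil

from whoosh import index
from whoosh.fields import ID, KEYWORD, TEXT, Schema
from whoosh.qparser import QueryParser

# Toy schema mirroring the fields visible above (the id field is an assumption)
schema = Schema(id=ID(stored=True),
                title=TEXT(stored=True),
                tokens=KEYWORD(stored=True, commas=True, scorable=True))

if os.path.exists("toy_indexdir"):
    shutil.rmtree("toy_indexdir")
os.mkdir("toy_indexdir")

ix = index.create_in("toy_indexdir", schema)
writer = ix.writer()
writer.add_document(id="1",
                    title="How to parse JSON in Python?",
                    tokens="python,json,parse")
writer.commit()

# Query the KEYWORD field and print the stored titles of the hits
with ix.searcher() as searcher:
    query = QueryParser("tokens", ix.schema).parse("json")
    for hit in searcher.search(query, limit=5):
        print(hit['title'])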
Example #5
import json
import helpers

if __name__ == "__main__":
    with open("../dataset.json") as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            print(post['Title'])
            print(post['Children'][0]['Title'])
            print(' '.join(
                helpers.preprocess_text(post['Children'][0]['Title'])))
            print()
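helpers.preprocess_text itself is not shown in these examples; the sketch below is only an assumption about what such a helper commonly does (lowercasing, tokenizing, dropping stopwords and punctuation, lemmatizing), written with NLTK:

import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Requires the NLTK 'punkt', 'stopwords' and 'wordnet' data packages
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Assumed behavior: lowercase, tokenize, drop stopwords/punctuation, lemmatize."""
    tokens = word_tokenize(text.lower())
    return [
        LEMMATIZER.lemmatize(token) for token in tokens
        if token not in STOP_WORDS and token not in string.punctuation
    ]

Note that Example #2 calls .union on the result, so the real helper may return a set of tokens rather than a list.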
Example #6
            # print(f"{response}, success: {response == row['Id']}")

            if success:
                successes += 1
            elif post_id in response:
                partial_success += 1
            else:
                print(
                    f"{response}, {test_title}, success: {response == post_id}"
                )
                for child in dataset[post_id]['Children']:
                    print(child['Title'], child['Tags'])
                print('-' * 10)
                for child in dataset[response[0]]['Children']:
                    print(child['Title'], child['Tags'])
                print()

        else:
            print(f"Failed to find response {post_id}: '{test_title}'")
            print(helpers.preprocess_text(test_title))

    print(f"{successes} successes")
    print(f"{successes / len(dataset)} success rate")

    print(f"{partial_success} partial successes")
    print(f"{partial_success / (matches - successes)} partial success rate")

    print(f"{matches} matches")
    print(f"{successes / matches} match success rate")

    print(f"{matches / len(dataset)} match rate")
Example #7
print('Building Index')

# Parse the HTML markup and get all the documents
soup = helpers.parseCorpus()

# Create data folder for the output files
if not os.path.exists(data_path):
    os.mkdir(data_path)

corpus = []
docs = []

# Update word frequencies per document and get titles of all docs
for doc_tag in soup.find_all('doc'):
    words = helpers.preprocess_text(doc_tag.text)
    bag_of_words = Counter(words)
    corpus.append(bag_of_words)
    docs.append(doc_tag["title"])

# Calculate IDF of all words in corpus
idf = helpers.compute_idf(corpus)

# Calculate and normalize the weights of all words in each document
for doc in corpus:
    helpers.compute_weights(idf, doc)
    helpers.normalize(doc)

# Build the inverted index of documents with IDF values and postings lists
inverted_index = helpers.build_inverted_index(idf, corpus)
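Example #3 later deserializes the documents and the inverted index from pickle files; the sketch below continues the script above with the matching serialization step ('inverted_index.pickle' appears in Example #3, while 'docs.pickle' is a hypothetical name):

import pickle

# 'docs.pickle' is a hypothetical file name; Example #3 only shows 'inverted_index.pickle'
with open(os.path.join(data_path, 'docs.pickle'), 'wb') as f:
    pickle.dump(docs, f)
with open(os.path.join(data_path, 'inverted_index.pickle'), 'wb') as f:
    pickle.dump(inverted_index, f)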