Example no. 1
def search(model, original_query, modified_query, ctx, topic, relevance=None):
    corpus_accessor = CorpusAccessor(ctx)
    results = None
    if model == "VSM":
        print("Calling VSM with query: " + modified_query)
        vector_model = VectorSpaceModel(ctx)
        results = vector_model.search(ctx, modified_query, topic, relevance)
        documents = corpus_accessor.access(ctx, [r[0] for r in results])
        scores = ["{:.4f}".format(r[1]) for r in results]
        results = format_results(documents, scores, ctx)
        results = set_relevances(ctx, original_query, results)
    elif model == "Boolean":
        print("Calling Boolean with query: " + modified_query)
        parser = Parser(ctx)
        parsed = parser.parse(modified_query)
        data = Evaluator(ctx, parsed).evaluate()
        documents = corpus_accessor.access(ctx, data)
        if topic != "ALL TOPICS":
            filtered_documents = []
            for d in documents:
                if topic in d.topics:
                    filtered_documents.append(d)
            documents = filtered_documents
        scores = [1] * len(documents)
        results = format_results(documents, scores, ctx)
    return results
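
A minimal usage sketch for this dispatcher, assuming search, format_results, and set_relevances live in the same module and using the Context paths shown in the later examples; no query expansion is applied here, so the original and modified queries are identical:

from os import path

from pkg.context import Context

corpus_path = "data/corpus/UofO_Courses.yaml"
dictionary_path = "data/dictionary/UofOCourses.txt"
inverted_index_path = path.realpath("data/index/UofO_Courses.yaml")

ctx = Context(corpus_path, dictionary_path, inverted_index_path)

# "ALL TOPICS" disables topic filtering; an empty relevance list skips Rocchio feedback
results = search("VSM", "health care", "health care", ctx, "ALL TOPICS", relevance=[])
for r in results:
    print(r)
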
Example no. 2
def setup(self, ctx):
    self.ctx = ctx
    self.corpus_accessor = CorpusAccessor(ctx)
    self.weighted_index_accessor = WeightedIndexAccessor(ctx)
    self.index_accessor = IndexAccessor(ctx)
    # for preprocessing the query, i.e. stopwords, stemming, etc.
    self.normalize_funcs = context.normalizer_funcs_for_context(ctx)
    self.filter_funcs = context.filter_funcs_for_context(ctx)
Example no. 3
from os import path

from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor
from pkg.dictionary import Dictionary
from pkg.index import IndexAccessor, BigramIndexAccessor, WeightedIndexAccessor


def construct_context(values):
    if values["_uottawa_"]:
        corpus_path = path.abspath("data/corpus/UofO_Courses.yaml")
        dictionary_path = path.abspath("data/dictionary/UofOCourses.txt")
        inverted_index_path = path.abspath("data/index/UofO_Courses.yaml")
    elif values["_reuters_"]:
        corpus_path = path.abspath("data/corpus/reuters.yaml")
        dictionary_path = path.abspath("data/dictionary/reuters.txt")
        inverted_index_path = path.abspath("data/index/reuters.yaml")
    else:
        # neither corpus was selected; fail loudly instead of hitting a NameError below
        raise ValueError("No corpus selected: expected '_uottawa_' or '_reuters_' to be set")

    ctx = Context(
        corpus_path,
        dictionary_path,
        inverted_index_path,
        enable_stopwords=values["_stopword_"],
        enable_stemming=values["_stemming_"],
        enable_normalization=values["_normalization_"],
    )

    # eager load if not already in memory
    CorpusAccessor(ctx)
    Dictionary(ctx)
    IndexAccessor(ctx)
    BigramIndexAccessor(ctx)
    WeightedIndexAccessor(ctx)
    return ctx
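
A minimal sketch of the values mapping this function expects; the keys mirror those referenced above, and in the real application they presumably come from the user-interface settings form (an assumption here):

# hypothetical selections, e.g. as produced by the UI's settings form
values = {
    "_uottawa_": True,        # search the UofO courses corpus
    "_reuters_": False,       # or the Reuters corpus
    "_stopword_": True,       # enable stopword removal
    "_stemming_": True,       # enable stemming
    "_normalization_": True,  # enable normalization
}

ctx = construct_context(values)
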
Example no. 4
import nltk

print("Ensuring nltk libraries exist...")
nltk.download("stopwords")
nltk.download("wordnet")

from os import path

from pkg.userinterface import userinterface as ui
from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor
from pkg.dictionary import Dictionary
from pkg.index import IndexAccessor, BigramIndexAccessor, WeightedIndexAccessor

print("Loading default context...")
corpus_path = "data/corpus/UofO_Courses.yaml"
dictionary_path = "data/dictionary/UofOCourses.txt"
index_path = path.realpath("data/index/UofO_Courses.yaml")

ctx = Context(corpus_path, dictionary_path, index_path)

# We do this to eager load our singleton classes into memory to speed up execution during actual search queries
print("Loading corpus...")
corpus = CorpusAccessor(ctx)
print("Loading dictionary")
dictionary = Dictionary(ctx)
print("Loading indices...")
index = IndexAccessor(ctx)
bigram_index = BigramIndexAccessor(ctx)
weighted_index = WeightedIndexAccessor(ctx)
print("Done!")
print("Launch User Interface")
ui.launch()
print("Goodbye!")
Example no. 5
from os import path

from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor

corpus_path = path.realpath("data/corpus/UofO_Courses.yaml")

ctx = Context(corpus_path, "", "")

print("Initializing CorpusAccessor")
corpus_accessor = CorpusAccessor(ctx)

print("Accessing docs:")

docs = corpus_accessor.access(ctx, [587, 588, 589])

for d in docs:
    print(d)

print("\nTry initializing again with same corpus (doesn't re-load)")
corpus_accessor = CorpusAccessor(ctx)

print("Accessing docs:")

docs = corpus_accessor.access(ctx, [590, 591, 592])

for d in docs:
    print(d)
Example no. 6
def __init__(self, ctx, k=5):
    self.ctx = ctx
    self.corpus_accessor = CorpusAccessor(ctx)
    self.training_set = []
    self.unclassified_set = []
    self.k = k
Example no. 7
import os
import sys
import time
from collections import Counter
from itertools import chain

# divide is assumed to come from more_itertools; the YAML helpers from PyYAML
from more_itertools import divide
from yaml import load_all, dump_all, Loader, Dumper

from pkg.corpusaccess import CorpusAccessor
from pkg.vsm import VectorSpaceModel


class TopicLearner:
    def __init__(self, ctx, k=5):
        self.ctx = ctx
        self.corpus_accessor = CorpusAccessor(ctx)
        self.training_set = []
        self.unclassified_set = []
        self.k = k

    def learn(self):
        self._partition_articles()
        self._classify()

    # partition documents between those that have topics (TRAINING SET) and those that don't
    def _partition_articles(self):
        with open(self.ctx.corpus_path(), "r") as corpus_handle:
            corpus_stream = load_all(corpus_handle, Loader=Loader)
            for article in corpus_stream:
                if len(article.topics) > 0:
                    self.training_set.append(article)
                else:
                    self.unclassified_set.append(article)
        print(
            f"Size of training set: {len(self.training_set)}\nSize of articles to categorize: {len(self.unclassified_set)}"
        )

    # Using the training set, classify documents with missing topics
    def _classify(self):
        partitions = [list(d) for d in divide(100, self.unclassified_set)]
        print("loading VSM")
        start = time.time()
        vsm = VectorSpaceModel(self.ctx)
        end = time.time()
        print(f"loading VSM took {end - start} seconds")
        for p in partitions:
            self._run(p, vsm)
        # Now add the training set, we now have a full corpus with topics assigned
        with open(f"{self.ctx.corpus_path()}_topics.yaml", "a") as outfile:
            dump_all(
                self.training_set,
                outfile,
                explicit_start=True,
                default_flow_style=False,
                sort_keys=False,
                indent=2,
                Dumper=Dumper,
            )

    # Unfortunately, doing this all in one process causes memory use to grow without bound until the process gets OOM-killed.
    # I made some attempts to reason about where the memory growth occurs; it is most likely due to the size of the
    # generated output. However, even when partitioning into smaller runs (and appending to the output as needed), the OOM kill still occurred.
    # In the end, I opted to simply fork each run so that memory is cleaned up when the child process dies. It's quite hacky and only works
    # on UNIX systems, so this code won't run properly on Windows.
    def _run(self, p, vsm):
        i = 1
        pid = os.fork()
        if pid != 0:
            os.waitpid(0, 0)
            print(f"Child process is finished")
            return
        # otherwise we are in child process
        for article in p:
            print(f"{i}/{len(self.unclassified_set)}")
            i += 1
            results = vsm.search(self.ctx, article.read_queryable())
            documents = self.corpus_accessor.access(self.ctx,
                                                    [r[0] for r in results])
            documents = [d for d in documents if len(d.topics) > 0]
            documents = documents[:min(5, len(documents))]
            if len(documents) == 0:
                print(
                    f"No possible topic neighbours for {article.title}, skipping assignment"
                )
                continue
            self._assign_topics(article, documents)
            print(f"New topics for {article.title}: {article.topics}")
        with open(f"{self.ctx.corpus_path()}_topics.yaml", "a") as outfile:
            dump_all(
                p,
                outfile,
                explicit_start=True,
                default_flow_style=False,
                sort_keys=False,
                indent=2,
                Dumper=Dumper,
            )
            print("WROTE")
        sys.exit()

    # Given a set of documents that are the nearest neighbours of article, extract the relevant topics
    # Strategy: from the N nearest neighbours, assign topics that appear in the majority of them
    def _assign_topics(self, article, documents):
        topics = chain.from_iterable([list(d.topics) for d in documents])
        topics_with_occurrences = Counter(topics)
        article.topics = [
            k for (k, val) in topics_with_occurrences.items()
            if val > self.k / 2
        ]  # majority
        if len(article.topics) == 0:
            # If no clear majority topics, just take the most common topic(s)
            print(f"No majority topics from {topics_with_occurrences}")
            most_common = max(topics_with_occurrences.values())
            article.topics = [
                k for (k, val) in topics_with_occurrences.items()
                if val == most_common
            ]
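
A minimal sketch of driving the learner, assuming the Reuters paths shown in the construct_context example; learn() partitions the corpus, classifies the articles without topics in forked batches, and appends everything to <corpus_path>_topics.yaml:

from os import path

from pkg.context import Context

ctx = Context(
    path.realpath("data/corpus/reuters.yaml"),
    path.realpath("data/dictionary/reuters.txt"),
    path.realpath("data/index/reuters.yaml"),
)

# the k = 5 nearest neighbours (by VSM similarity) vote on topics for each unclassified article
learner = TopicLearner(ctx, k=5)
learner.learn()
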
Example no. 8
from os import path
from pkg.vsm import VectorSpaceModel

from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor

corpus_path = "data/corpus/UofO_Courses.yaml"
dictionary_path = "data/dictionary/UofOCourses.txt"
inverted_index_path = path.realpath("data/index/UofO_Courses.yaml")

ctx = Context(corpus_path, dictionary_path, inverted_index_path)

corpus_accessor = CorpusAccessor(ctx)

print("Initializing VSM")
vector_model = VectorSpaceModel(ctx)

query = "health care"
print("VSM results for query: '" + query + "'")

results = vector_model.search(ctx, query)

for r in results:
    doc = corpus_accessor.access(ctx, [r[0]])
    print(str(doc[0]) + " >>> " + "{:.4f}".format(r[1]))
Example no. 9
from collections import defaultdict

from pkg.corpusaccess import CorpusAccessor
from pkg.index import IndexAccessor, WeightedIndexAccessor
# the context module is assumed to be pkg.context, imported here as a module
from pkg import context


class VectorSpaceModel:
    k = 15  # return the top k results, where k is between 10 and 20

    def __init__(self, ctx):
        self.setup(ctx)

    def setup(self, ctx):
        self.ctx = ctx
        self.corpus_accessor = CorpusAccessor(ctx)
        self.weighted_index_accessor = WeightedIndexAccessor(ctx)
        self.index_accessor = IndexAccessor(ctx)
        # for preprocessing the query, i.e. stopwords, stemming, etc.
        self.normalize_funcs = context.normalizer_funcs_for_context(ctx)
        self.filter_funcs = context.filter_funcs_for_context(ctx)

    def search(self, ctx, query, topic="ALL TOPICS", relevance=None):
        # other examples call search(ctx, query) with only a query, so the topic
        # filter and relevance feedback default to "off"
        relevance = relevance or []

        # ensure accessors exist
        self.setup(ctx)

        # preprocess the query
        query_terms = self.__clean_query(query)

        # get all docs that have at least one of the query terms in them
        matched_doc_ids = set()
        for term in query_terms:
            docIDs = self.index_accessor.access(ctx, term).doc_ids
            for doc in docIDs:
                matched_doc_ids.add(doc)

        filtered_doc_ids = set()
        if ctx.corpus_type() == "reuters" and topic != "ALL TOPICS":
            for doc in self.corpus_accessor.access(ctx, matched_doc_ids):
                if topic in doc.topics:
                    filtered_doc_ids.add(doc.id)
        else:
            filtered_doc_ids = matched_doc_ids

        if len(filtered_doc_ids) == 0:
            return []

        weights = {}

        # do Rocchio relevance feedback if relevance judgements are present;
        # each relevance entry carries the document id at index 1 and the
        # document's raw text at index 4
        if len(relevance) > 0:
            relevant_doc_ids = [doc[1] for doc in relevance]
            not_relevant_doc_ids = [
                docID for docID in filtered_doc_ids if docID not in relevant_doc_ids
            ]

            # raw term counts over the relevant documents' text
            raw_relevant = [
                (r, k[4].count(r)) for k in relevance for r in k[4].split(" ")
            ]

            beta_coefficient = 1.0 / (len(relevance))
            betas = defaultdict(lambda: 0)
            for raw in raw_relevant:
                betas[raw[0]] += raw[1]
            for query_term in query_terms:
                betas[query_term] += 1

            beta_result = {k: beta_coefficient * v for (k, v) in betas.items()}
            print(f"betas: {beta_result}")

            # guard against dividing by zero when every filtered doc was marked relevant
            gamma_coefficient = 1.0 / max(len(filtered_doc_ids) - len(relevant_doc_ids), 1)
            docs = self.corpus_accessor.access(self.ctx, not_relevant_doc_ids)
            raw_not_relevant = [
                (r, k.read_queryable().count(r))
                for k in docs
                for r in k.read_queryable().split(" ")
            ]
            gammas = defaultdict(lambda: 0)
            for raw in raw_not_relevant:
                gammas[raw[0]] += raw[1]
            gamma_result = {k: gamma_coefficient * v for (k, v) in gammas.items()}
            print(f"gammas: {gamma_result}")

            query_vector = {k: 1 for k in query_terms}

            # We now have the original query vector, as well as the relevant and non-relevant biases
            for docID in filtered_doc_ids:
                weight = 0
                for term in query_terms:
                    # a query term may not occur in any non-relevant document, so
                    # fall back to 0 instead of raising a KeyError on gamma_result
                    term_weight = (
                        query_vector[term]
                        + beta_result[term]
                        - gamma_result.get(term, 0)
                    )
                    weight += (
                        term_weight
                        * self.weighted_index_accessor.access(ctx, term)[docID]
                    )
                weights[docID] = weight

        else:
            # calculate similarity between query and document (dot product)
            for docID in filtered_doc_ids:
                weight = 0
                for term in query_terms:
                    weight += self.weighted_index_accessor.access(ctx, term)[docID]
                weights[docID] = weight


        sorted_weights = sorted(weights.items(), reverse=True, key=lambda kv: kv[1])
        # SOURCE: https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value

        if len(sorted_weights) < self.k:
            return sorted_weights
        else:
            return sorted_weights[: self.k]

    # performs preprocessing on query
    def __clean_query(self, query):
        # split query on space
        terms = query.split()

        # apply normalizations/filters as specified in ctx
        results = []
        for term in terms:
            for normalize_func in self.normalize_funcs:
                term = normalize_func(term)
            # filter funcs operate on sets of terms; a term may be filtered out entirely
            term = {term}

            for filter_func in self.filter_funcs:
                term = filter_func(term)

            if term:
                results.append(next(iter(term)))

        return results
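
A minimal sketch of a relevance-feedback query under the assumptions noted in the comments above, namely that each relevance entry carries the document id at index 1 and the document's raw text at index 4 (the remaining positions are placeholders here):

from pkg.context import Context

ctx = Context(
    "data/corpus/UofO_Courses.yaml",
    "data/dictionary/UofOCourses.txt",
    "data/index/UofO_Courses.yaml",
)
vsm = VectorSpaceModel(ctx)

# first pass with no feedback
first_pass = vsm.search(ctx, "health care")

# hypothetical feedback: the user marked document 587 as relevant;
# index 1 is the doc id, index 4 the raw text used for the beta weights
relevance = [("health care", 587, None, None, "health care clinics and hospitals")]

refined = vsm.search(ctx, "health care", "ALL TOPICS", relevance)
for doc_id, score in refined:
    print(doc_id, "{:.4f}".format(score))
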