예제 #1
0
def construct_context(values):
    if values["_uottawa_"]:
        corpus_path = path.abspath("data/corpus/UofO_Courses.yaml")
        dictionary_path = path.abspath("data/dictionary/UofOCourses.txt")
        inverted_index_path = path.abspath("data/index/UofO_Courses.yaml")
    elif values["_reuters_"]:
        corpus_path = path.abspath("data/corpus/reuters.yaml")
        dictionary_path = path.abspath("data/dictionary/reuters.txt")
        inverted_index_path = path.abspath("data/index/reuters.yaml")

    ctx = Context(
        corpus_path,
        dictionary_path,
        inverted_index_path,
        enable_stopwords=values["_stopword_"],
        enable_stemming=values["_stemming_"],
        enable_normalization=values["_normalization_"],
    )

    # eager load if not already in memory
    CorpusAccessor(ctx)
    Dictionary(ctx)
    IndexAccessor(ctx)
    BigramIndexAccessor(ctx)
    WeightedIndexAccessor(ctx)
    return ctx
예제 #2
0
import nltk
print("Ensuring nltk libraries exist...")
nltk.download("stopwords")
nltk.download('wordnet')
from pkg.userinterface import userinterface as ui
from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor
from pkg.dictionary import Dictionary
from pkg.index import IndexAccessor, BigramIndexAccessor, WeightedIndexAccessor

print("Loading default context...")
corpus_path = "data/corpus/UofO_Courses.yaml"
dictionary_path = "data/dictionary/UofOCourses.txt"
index_path = path.realpath("data/index/UofO_Courses.yaml")

ctx = Context(corpus_path, dictionary_path, index_path)

# We do this to eager load our singleton classes into memory to speed up execution during actual search queries
print("Loading corpus...")
corpus = CorpusAccessor(ctx)
print("Loading dictionary")
dictionary = Dictionary(ctx)
print("Loading indices...")
index = IndexAccessor(ctx)
bigram_index = BigramIndexAccessor(ctx)
weighted_index = WeightedIndexAccessor(ctx)
print("Done!")
print("Launch User Interface")
ui.launch()
print("Goodbye!")
예제 #3
0
from os import path

from pkg.bigram_lang_model import ReutersBigramLangModel
from pkg.context import Context

corpus_path = "data/corpus/reuters.yaml"

ctx = Context(corpus_path, "None", "None")

ReutersBigramLangModel.generate(ctx)
예제 #4
0
def launch():
    # results table info
    headings = ["Relevance", "DocID", "Title", "Topic", "Excerpt", "Score"]
    data = []
    relevance = defaultdict(lambda: [])
    # query data for edit distance and 'resending' query
    original_query = ""
    original_values = []
    updated_query = ""
    suggestions = []
    ctx = Context("", "", "")

    # for query suggestions
    next_terms = []

    # Reuters topics
    # get topics from "all-topics-strings.lc.txt"
    topic_file = path.abspath(
        path.join("data", "raw", "reuters", "all-topics-strings.lc.txt")
    )
    topics = []
    with open(topic_file) as f:
        topics = ["ALL TOPICS"] + f.read().splitlines()

    # built in colour scheme
    sg.theme("Reddit")

    # settings layouts
    model_layout = [
        [
            sg.Radio(
                "Boolean", "model", default=True, font=("Arial", 14), key="_boolean_"
            )
        ],
        [sg.Radio("Vector Space", "model", font=("Arial", 14), key="_vsm_")],
    ]

    corpus_layout = [
        [
            sg.Radio(
                "uOttawa Catalogue",
                "corpus",
                default=True,
                font=("Arial", 14),
                key="_uottawa_",
                enable_events=True,
            )
        ],
        [
            sg.Radio(
                "Reuters",
                "corpus",
                font=("Arial", 14),
                key="_reuters_",
                enable_events=True,
            )
        ],
    ]

    dictionary_layout = [
        [
            sg.Checkbox(
                "Stopword Removal", default=True, font=("Arial", 14), key="_stopword_"
            )
        ],
        [sg.Checkbox("Stemming", default=False, font=("Arial", 14), key="_stemming_")],
        [
            sg.Checkbox(
                "Normalization", default=True, font=("Arial", 14), key="_normalization_"
            )
        ],
    ]

    # window layout
    layout = [
        [sg.Text("Minerva Search Engine", font=("Arial", 22, "bold"))],
        [
            sg.Text("Query:", font=("Arial", 14)),
            sg.InputText(
                "", font=("Arial", 14), focus=True, enable_events=True, key="_query_",
            ),
            sg.Button("Search", font=("Arial", 14), bind_return_key=True),
        ],
        [
            sg.Text("Next: ", font=("Arial", 14)),
            sg.Listbox(
                values=next_terms,
                size=(30, 3),
                font=("Arial", 14),
                key="_next_",
                enable_events=True,
            ),
            sg.Text(
                "Click on a word to add it to your query.",
                font=("Arial", 12, "italic"),
            ),
        ],
        [
            sg.Text("Topic:", font=("Arial", 14)),
            sg.Combo(
                topics,
                font=("Arial", 14),
                readonly=True,
                disabled=True,
                default_value="ALL TOPICS",
                key="_topics_",
            ),
        ],
        [sg.Text("")],
        [
            sg.Frame("Corpus", corpus_layout, font=("Arial", 16, "bold")),
            sg.Frame("Model", model_layout, font=("Arial", 16, "bold")),
            sg.Frame(
                "Dictionary Building", dictionary_layout, font=("Arial", 16, "bold")
            ),
        ],
        [sg.Text("")],
        [sg.Text("Results", font=("Arial", 16, "bold"))],
        [
            sg.Button(
                "Showing results for <updated_query>. Click here to search for <original_query>.",
                font=("Arial", 12),
                size=(64, 1),
                visible=False,
                disabled_button_color=("white", None),
                key="_resend_",
            )
        ],
        [sg.Text("", font=("Arial", 5))],
        [
            sg.Button(
                "Click here to see more suggestions.",
                font=("Arial", 12),
                visible=False,
                pad=((0, 50)),
                key="_suggestions_",
            )
        ],
        [sg.Text("")],
        [
            sg.Table(
                values=data,
                headings=headings,
                font=("Arial", 12),
                header_font=("Arial", 14, "bold"),
                bind_return_key=True,
                num_rows=8,
                alternating_row_color="#d3d3d3",
                auto_size_columns=False,
                col_widths=[8, 8, 12, 8, 32, 8],
                justification="center",
                key="_table_",
            )
        ],
        [
            sg.Text(
                "Double click on a row to view the full text.",
                font=("Arial", 12, "italic"),
            )
        ],
        [sg.Text("Relevant docs for this query", font=("Arial", 12))],
        [
            sg.Table(
                values=relevance,
                headings=headings,
                font=("Arial", 12),
                header_font=("Arial", 14, "bold"),
                bind_return_key=True,
                num_rows=4,
                alternating_row_color="#d3d3d3",
                auto_size_columns=False,
                col_widths=[8, 8, 12, 8, 32, 8],
                justification="center",
                key="_relevance_",
            )
        ],
        [sg.Button("Exit", font=("Arial", 14), button_color=("white", "grey"))],
    ]

    # creating window
    window = sg.Window("Minerva Search Engine", layout)

    # popup that shows full text of document
    def DocPopup(query, doc):
        text = ""
        if doc[0] == "not relevant":
            doc[0] = "relevant"
            RelevanceFeedback().set_relevant(query, doc)

        sections = ["DocID", "Title", "Topics", "Full Text"]
        for i in range(len(sections)):
            text += sections[i] + ": " + str(doc[i+1]) + "\n"

        return sg.PopupScrolled(
            text, title=doc[1], font=("Arial", 12), size=(64, 15), keep_on_top=True
        )

    # popup that shows top N suggestions per query term
    def SuggestionPopup(suggestions):
        text = ""

        for term in suggestions:
            if suggestions[term] != []:
                text += term + " : "
                for s in suggestions[term]:
                    text += s + ", "
                text += "\n"

        return sg.PopupScrolled(
            text,
            title="Suggestions for " + original_query,
            font=("Arial", 12),
            size=(64, None),
            keep_on_top=True,
        )

    def ExpansionPopup(expansions):
        text = "Accept these expansions?\n"

        for term in expansions.items():
            text += term[0] + " -- "
            for t in term[1]:
                text += t + ", "
            text += "\n"

        sg.theme("BrownBlue")

        return sg.popup_scrolled(
            text, title="Expansions", font=("Arial", 12, "bold"), size=(64, 5), keep_on_top=True, yes_no=True
        )

    # turns edit distance UI elements on or off
    def toggle_resend(toggle):
        if toggle:
            # turn resend on
            window["_resend_"].set_size(
                (len(original_query) + len(updated_query) + 40, None)
            )
            window["_resend_"].Update(
                text=(
                    "Showing results for '"
                    + updated_query
                    + "'. Click here to search for '"
                    + original_query
                    + "'."
                ),
                disabled=False,
                visible=True,
            )
            window["_suggestions_"].Update(visible=True)
        else:
            # turn resend off
            window["_resend_"].set_size((len(original_query) + 25, None))
            window["_resend_"].Update(
                text=("Showing results for '" + original_query + "'."),
                disabled=True,
                visible=True,
            )
            window["_suggestions_"].Update(visible=False)

    # event loop
    while True:
        event, values = window.Read()
        if event is None:
            break
        elif event is "Exit":
            print("Exiting")
            window.Close()

        elif event is "Search":
            original_query = values["_query_"]
            original_values = values
            print("Search for query: " + str(original_query))

            topic = values["_topics_"]
            print("Chosen topic: " + topic)

            # create context object
            ctx = construct_context(values)

            # get weighted edit distance suggestions for query
            suggestions = EditDistance(ctx).edit_distance(original_query)

            # update edit distance related UI elements
            if not suggestions:
                # if theres no suggestions (all query terms were in dictionary or regex terms), don't display suggestion related UI elements
                toggle_resend(False)
                updated_query = original_query
            else:
                # if theres suggestions (one or more query term was not in dictionary)
                # construct the new query
                updated_query = ""
                for term in original_query.split():
                    if not (term in suggestions):
                        updated_query += term + " "
                    else:
                        updated_query += suggestions[term][0] + " "
                toggle_resend(True)
                print("Corrected query: " + updated_query)

            print("Getting expansions")
            expanded_query = updated_query
            expansions = Expansion(ctx).expand(expanded_query)

            do_expansion = "No"
            if expansions != {}:
                do_expansion = ExpansionPopup(expansions)
                print(do_expansion)
                sg.theme("Reddit")

            if do_expansion == "Yes":
                expanded_query = mix_in(expanded_query, expansions, values)

            # use chosen model to search corpus
            if values["_boolean_"]:
                data = search("Boolean", original_query, expanded_query, ctx, topic)
            elif values["_vsm_"]:
                data = search(
                    "VSM",
                    original_query,
                    expanded_query,
                    ctx,
                    topic,
                    relevance=relevance[original_query],
                )
            else:
                data = []

            window["_table_"].Update(values=data)
            relevance[original_query] = RelevanceFeedback().access(original_query)
            window["_relevance_"].Update(values=relevance[original_query])

        elif event is "_resend_":
            print("Resending query: " + original_query)

            # don't display suggestion related UI elements
            toggle_resend(False)

            print("Getting expansions")
            expanded_query = original_query
            expansions = Expansion(ctx).expand(expanded_query)

            do_expansion = "No"
            if expansions != {}:
                do_expansion = ExpansionPopup(expansions)
                print(do_expansion)
                sg.theme("Reddit")

            if do_expansion == "Yes":
                expanded_query = mix_in(expanded_query, expansions, values)

            # redo search using chosen model to search corpus
            if original_values["_boolean_"]:
                data = search("Boolean", original_query, expanded_query, ctx, topic)
            elif original_values["_vsm_"]:
                data = search(
                    "VSM",
                    original_query,
                    expanded_query,
                    ctx,
                    topic,
                    relevance=relevance[original_query],
                )
            else:
                data = []

            window["_table_"].Update(values=data)
            window["_relevance_"].Update(values=relevance[original_query])

        elif event is "_table_":
            print("Opening document")
            try:
                doc = data[values[event][0]]
                DocPopup(original_query, doc)
                window["_table_"].Update(values=data)
                window["_relevance_"].Update(values=relevance[original_query])

            except IndexError:
                # so that clicking a weird part of the table doesn't crash the application
                pass

        elif event is "_relevance_":
            print("Removing relevant doc")
            try:
                doc = relevance[original_query][values[event][0]]
                for da in data:
                    if da[1] == doc[1]:
                        da[0] = "not relevant"
                doc[0] = "not relevant"
                relevance[original_query].remove(doc)
                RelevanceFeedback().unset_relevant(original_query, doc)
                window["_table_"].Update(values=data)
                window["_relevance_"].Update(values=relevance[original_query])
            except IndexError:
                # so that clicking a weird part of the table doesn't crash the application
                pass

        elif event is "_suggestions_":
            print("Displaying edit distance suggestions")
            SuggestionPopup(suggestions)

        elif event is "_uottawa_":
            # no topics for uOttawa corpus
            window["_topics_"].Update(disabled=True)

        elif event is "_reuters_":
            # enable topics for Reuters
            window["_topics_"].Update(disabled=False)
            window["_topics_"].Update(
                readonly=True
            )  # must be done in separate Update calls

        elif event in "_query_":
            query = values["_query_"]
            if query == "" or query[-1] == " ":
                ctx = construct_context(values)
                try:
                    next_terms = Completion(ctx).complete(query.split()[-1])
                    window["_next_"].Update(values=next_terms)
                except IndexError:
                    pass
            else:
                window["_next_"].Update(values=[])

        elif event is "_next_":
            next_term = values[event][0]
            print("Adding term '" + next_term + "' to query")
            new_query = window["_query_"].Get() + next_term + " "
            window["_query_"].Update(value=new_query)
            
            ctx = construct_context(values)
            try:
                next_terms = Completion(ctx).complete(next_term)
                window["_next_"].Update(values=next_terms)
            except IndexError:
                pass

        else:
            print(event)

    window.Close()
예제 #5
0
from os import path

from pkg.topiclearner import TopicLearner
from pkg.context import Context

corpus_path = "data/corpus/reuters.yaml"
dictionary_path = "data/dictionary/reuters.txt"
index_path = path.realpath("data/index/reuters.yaml")

ctx = Context(corpus_path,
              dictionary_path,
              index_path,
              enable_casefolding=False,
              enable_stopwords=False,
              enable_stemming=False,
              enable_normalization=False,
              remove_nonalphanumeric=False)

TopicLearner(ctx).learn()
예제 #6
0
from os import path

from pkg.context import Context
from pkg.corpusaccess import CorpusAccessor

corpus_path = path.realpath("data/corpus/UofO_Courses.yaml")

ctx = Context(corpus_path, "", "")

print("Initializing CorpusAccessor")
corpus_accessor = CorpusAccessor(ctx)

print("Accessing docs:")

docs = corpus_accessor.access(ctx, [587, 588, 589])

for d in docs:
    print(d)

print("\nTry initializing again with same corpus (doesn't re-load)")
corpus_accessor = CorpusAccessor(ctx)

print("Accessing docs:")

docs = corpus_accessor.access(ctx, [590, 591, 592])

for d in docs:
    print(d)
예제 #7
0
from os import path

from pkg.dictionary import DictBuilder
from pkg.context import Context

corpus_path = path.realpath("data/corpus/UofO_Courses.yaml")
dict_path = path.realpath("data/dictionary/UofOCourses.txt")
# corpus_handle = open("./test.yaml", "r")

ctx = Context(corpus_path, dict_path, "")
dict_builder = DictBuilder(ctx)
dict_builder.build()