Example #1
def stemWords(tokens):
    """Stems tokens."""
    stemmer = PorterStemmer()
    stemmedWords = []
    for token in tokens:
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        stemmedWords.append(stemmed)
    return stemmedWords
Example #2
def stemWords(input_tokens):
    """Stem each input token and return the stems as strings."""
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in input_tokens:
        stemmed_words.append(str(stemmer.stem(token, 0, len(token) - 1)))

    return stemmed_words
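
Examples #1 and #2 above rely on the classic porter.py implementation of the Porter algorithm, whose stem() method takes the token plus start and end indices; neither snippet shows the import. A minimal usage sketch, assuming that implementation is importable (the module name porter below is hypothetical):

# Minimal sketch; assumes the classic porter.py PorterStemmer whose
# stem(word, start, end) signature matches the calls above.
from porter import PorterStemmer  # hypothetical module name

print(stemWords(["running", "jumps", "easily"]))
# expected output along the lines of: ['run', 'jump', 'easili']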
Example #3
def add_tokens():
    """Return a posting list of all unique words in the collection."""

    # consider only title, data, author, category and post_text columns
    # reason: the url columns contain redundant information (title) & other columns are
    # numbers not useful to the vector space model

    title_file = "TUAW-dataset/data/title.txt"
    date_file = "TUAW-dataset/data/date.txt"
    author_file = "TUAW-dataset/data/author.txt"
    category_file = "TUAW-dataset/data/category.txt"
    post_text_file = "TUAW-dataset/data/post_text.txt"

    posting_list = {}
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))

    doc_id = -1
    total_num_docs = 0

    # read the same line of the files together
    # open(date_file) as date_fd, \
    with open(title_file) as title_fd, \
        open(author_file) as author_fd, \
        open(category_file) as category_fd, \
        open(post_text_file) as post_text_fd:
        lines = zip(title_fd, author_fd, category_fd, post_text_fd)
        for line in lines:
            total_num_docs += 1
            doc_id += 1  # == line_num
            if doc_id % 1000 == 999:
                print("Processed " + str(doc_id + 1) + " posts")

            # title + author + category + post_text
            line_string = (line[0].strip() + " " + line[1].strip() + " " +
                           line[2].strip() + " " + line[3].strip())

            # normalize the terms in the line == post
            term_list = normalize(line_string, stemmer, stopwords_set)

            # add every word to posting list
            for word in term_list:
                # type(posting list) == { term: [df, {doc_id: tf}] }
                if word in posting_list:
                    doc_dict = posting_list[word][1]
                    if doc_id in doc_dict:
                        doc_dict[doc_id] = doc_dict[doc_id] + 1
                    else:
                        posting_list[word][0] += 1
                        doc_dict[doc_id] = 1
                elif len(word) > 0:  # add only words of non-zero length, check again
                    temp_dict = {doc_id: 1}
                    posting_list[word] = [1, temp_dict]

    return (total_num_docs, posting_list)
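
add_tokens (and search in Example #6) delegate tokenisation to a normalize helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming it lowercases the text, strips punctuation, removes stopwords and applies the Porter stemmer with the same stem(word, start, end) signature used above:

import re

def normalize(text, stemmer, stopwords_set):
    # Hypothetical helper: the real implementation is not shown on this page.
    # Lowercase, keep only alphanumeric characters and whitespace,
    # drop stopwords, and stem whatever remains.
    text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    terms = []
    for token in text.split():
        if token in stopwords_set:
            continue
        terms.append(stemmer.stem(token, 0, len(token) - 1))
    return terms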
Example #4
def porter_stem(corp):
    """
    Builds a dictionary with words as keys and stems as the values.
    """
    from porterstemmer import PorterStemmer

    ps = PorterStemmer()
    psdict = {}
    for w in corp.words:
        psdict[w] = ps.stem(w)

    return psdict
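
A short usage sketch; corp is assumed to be any object exposing an iterable words attribute, which is not defined in the snippet above:

from types import SimpleNamespace

# Hypothetical corpus stand-in: any object with a .words iterable will do.
corp = SimpleNamespace(words=["connection", "connected", "connecting"])
print(porter_stem(corp))
# all three words should map to the same stem, e.g. 'connect'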
Example #5
def task1(input_file_name, output_file_name, stop_words_list):
    """Read each document line from @input_file_name, remove stop words, stem the
    remaining words, and write the result to @output_file_name."""
    # open the input file and the list of stop words and create output file
    f_input = open(input_file_name, "r")
    f_output = open(output_file_name, "w+")
    f_stop_words = open(stop_words_list, "r")

    list_lines = f_input.readlines()
    #list of stop words
    list_stop_words = f_stop_words.readlines()
    list_stop_words = list(map(lambda x: x.strip(), list_stop_words))

    #list of document names
    list_documents = []

    ps = PorterStemmer()

    for i in range(len(list_lines)):
        list_words = []  #list of words for a line
        list_words_stemming = []  #list of stemming words for a line

        list_documents.append(list_lines[i].split()[0])

        #remove all the \t and \n
        list_lines[i] = re.sub(r'\s', " ", list_lines[i])
        #change upper cases to lower cases
        list_lines[i] = list_lines[i].lower()
        #remove numbers
        list_lines[i] = list_lines[i].translate(str.maketrans('', '', digits))
        #remove punctuations
        list_lines[i] = re.sub(r'[^a-zA-Z0-9\s]', '', list_lines[i])

        for w in list_lines[i].split()[1:]:
            if w not in list_stop_words:
                list_words.append(w)

        for y in list_words:
            list_words_stemming.append(ps.stem(y, 0, len(y) - 1))

        # Write the document name in front of the content in the output file
        f_output.write(list_documents[i] + "\t")
        # Write the content of the document in the output file
        for z in list_words_stemming:
            f_output.write(z + " ")
        f_output.write("\n")

    # Close all the files
    f_output.close()
    f_input.close()
    f_stop_words.close()
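
A usage sketch for task1, assuming the input file holds one document per line (the document name first, then its text), the stop-word file holds one word per line, and that re, string.digits and a PorterStemmer with the stem(word, start, end) signature are already imported; the file names below are placeholders:

# Placeholder file names; task1 expects one document per line in the input,
# with the document name as the first whitespace-separated field.
task1("documents.txt", "documents_stemmed.txt", "stopwords.txt")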
Example #6
def search(query_string, k, line_num_dict, N):
    """Return top @k search results for @query_string from the corpus of @N documents using \
@line_num_dict as a lookup table."""
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))

    # normalize the query
    term_list = normalize(query_string, stemmer, stopwords_set)

    query_freq = {}  # number of occurrences of every unique term
    for term in term_list:
        if term in query_freq:
            query_freq[term] = query_freq[term] + 1
        elif len(term) > 0:  # add only term of non-zero length
            query_freq[term] = 1

    # retrieve only necessary posting lists in the order they appear in the file
    lines_to_get = []
    for term in query_freq.keys():
        if term in line_num_dict:  # skip query terms that never occur in the corpus
            lines_to_get += [line_num_dict[term]]
    lines_to_get.sort()

    # if no word in the query occurs in the data, the posting list will be empty
    if len(lines_to_get) == 0:
        print("No results found")
        sys.exit(0)

    posting_list = get_posting_list(lines_to_get)

    (weight_query, doc_dict) = calc_weights(query_freq, posting_list, N)

    top_k = get_top_k(weight_query, doc_dict, k)

    # result = doc_id + score + title + url
    title_file = "TUAW-dataset/data/title.txt"
    post_url_file = "TUAW-dataset/data/post_url.txt"

    # sort based on doc_id for efficient retrieval
    docs_to_get = []
    for doc_id in top_k.keys():
        docs_to_get += [doc_id]
    docs_to_get.sort()

    current_index = 0
    with open(title_file) as title_fd, open(post_url_file) as post_url_fd:
        lines = zip(title_fd, post_url_fd)
        for i, line in enumerate(lines):
            if i == docs_to_get[current_index]:
                title_string = "Title = " + line[0]
                post_url_string = "URL = " + line[1]
                top_k[i][1] = title_string + post_url_string
                current_index += 1
                if current_index == len(docs_to_get):
                    break

    # sort top_k based on score
    result = OrderedDict(
        sorted(top_k.items(), key=lambda t: t[1][0], reverse=True))

    # print output
    num_results = 1
    for doc_id, [score, details] in result.items():
        print(str(num_results) + ". Doc_ID = " + str(doc_id) + " ; Score = " +
              str(score))
        print(details)
        num_results += 1
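
search depends on several helpers not shown on this page (normalize, get_posting_list, calc_weights, get_top_k) and on a line_num_dict that maps each term to the line of its posting list in the index file. A hypothetical invocation, with inputs assumed to come from an earlier indexing step:

# Hypothetical invocation: line_num_dict (term -> line number of that term's
# posting list in the index file) and the document count N are assumed to come
# from an earlier indexing step; the dict below only illustrates the expected shape.
line_num_dict = {"iphon": 0, "batteri": 1}
search("iphone battery life", 10, line_num_dict, N=1234)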
Example #7
def stemWord(word):
    stemmer = PorterStemmer()
    return stemmer.stem(word, 0, len(word) - 1)
Example #8
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    #
    # Create an instance of the PorterStemmer object; we will call its stem method
    # to 'stem' the tokens extracted from the line.
    #
    p = PorterStemmer()

    # this replaces any tab characters with a space character in the line
    # read from the file
    line = line.replace('\t', ' ')
    line = line.strip()
    #line.encode('ascii', 'ignore')

    #
    # This routine splits the contents of the line into tokens
    l = splitchars(line)

    # for each token in the line process
    for elmt in l:
        # This statement removes the newline character if found
        elmt = elmt.replace('\n', '')

        # This statement converts all letters to lower case
        lowerElmt = elmt.lower().strip()

        #
        # Increment the counter of the number of tokens processed.  This value gives
        # the total size of the corpus as the number of token occurrences in the
        # entire collection.
        #
        tokens += 1

        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        #
        if len(lowerElmt) < 2:
            continue

        #
        # if the token is in the stopwords list then do not include it in the term
        # dictionary and do not index the term.
        #
        if (lowerElmt in stopwords):
            continue

        #
        # Check whether the term is a number; numbers are not added to the index.
        # We attempt to convert the term to an integer: if the term contains
        # non-numeric characters the conversion raises a ValueError, which we catch
        # and continue processing the term.  If the conversion succeeds, the term is
        # a number and we skip it (the continue statement moves on to the next token
        # from the 'for' statement).
        #
        try:
            dummy = int(lowerElmt)
        except ValueError:
            # Value is not a number so we can index it
            stemword = lowerElmt
        else:
            # value is a number so we will NOT add it to the index
            continue

        #
        # In the following short section of the code we call the Porter stemmer code
        # that we have included in our indexer process.  This algorithm stems the
        # tokens, which reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary
        # then add the term
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
            db[lowerElmt].docids = dict()
            db[lowerElmt].docs = 0

        # if the document is not currently in the postings
        # list for the term then add it
        #
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # Increment the counter that tracks the term frequency
        db[lowerElmt].docids[documents] += 1
    return l
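
parsetoken relies on a splitchars helper and on module-level globals (documents, tokens, terms, stopwords) that are set up elsewhere; Example #9 shows part of that setup. A minimal sketch of what splitchars might do, assuming it simply splits the line on runs of non-word characters with the chars pattern from Example #9:

import re

chars = re.compile(r'\W+')

def splitchars(line):
    # Hypothetical helper: split the line on runs of non-word characters.
    return chars.split(line)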
Example #9
# the database is a simple dictionary
database = {}

# regular expressions for: extracting words, extracting an ID from a path, checking for a hex value
chars = re.compile(r'\W+')
atLeast3Chars = re.compile(r'\w{3,}')
notDigit = re.compile(r'\D*')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# global counters
tokens = 0
documents = 0
terms = 0
stopWordsFound = 0
stemmer = PorterStemmer()


#
# We will create a term object for each unique instance of a term
#
class Term():
    termid = 0
    termfreq = 0
    docs = 0
    docids = {}

    # The code added:
    # ===================================================================
    # Calculate the inverse document frequency
    # ===================================================================
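
The snippet is truncated right where the comment announces the inverse-document-frequency calculation. A minimal sketch of what that calculation could look like for this Term class, assuming total_docs is the total number of documents indexed:

import math

def compute_idf(term, total_docs):
    # Hypothetical sketch: idf = log10(N / df), where df is the number of
    # documents containing the term (term.docs) and N is the corpus size.
    if term.docs == 0:
        return 0.0
    return math.log10(total_docs / term.docs)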