def stemWords(input_tokens):
    """Stem each token in input_tokens and return the list of stems."""
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in input_tokens:
        stemmed_words.append(str(stemmer.stem(token, 0, len(token) - 1)))

    return stemmed_words
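# Usage sketch (not part of the original example): stemWords assumes a classic
# Tartarus-style PorterStemmer in scope, whose stem(word, 0, len(word) - 1) call
# stems the whole word.  With such a stemmer available:
#
#     stemWords(["running", "flies", "easily"])   # roughly ['run', 'fli', 'easili']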
Example #2
def stemWords(tokens):
    """Stems tokens."""
    stemmer = PorterStemmer()
    stemmedWords = []
    for token in tokens:
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        stemmedWords.append(stemmed)
    return stemmedWords
Example #3
def porter_stem(corp):
    """
    Builds a dictionary with words as keys and stems as the values.
    """
    from porterstemmer import PorterStemmer

    ps = PorterStemmer()
    psdict = {}
    for w in corp.words:
        psdict[w] = ps.stem(w)
    
    return psdict
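# Usage sketch (assumed, not from the original project): porter_stem only needs an
# object exposing a .words iterable, so a tiny stand-in corpus is enough to try it.
from collections import namedtuple

ToyCorpus = namedtuple("ToyCorpus", ["words"])   # hypothetical stand-in for `corp`
print(porter_stem(ToyCorpus(words=["connection", "connected", "connecting"])))
# roughly: {'connection': 'connect', 'connected': 'connect', 'connecting': 'connect'}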
Example #4
def porter_stem(corp):
    """
    Builds a dictionary with words as keys and stems as the values.
    """
    from porterstemmer import PorterStemmer

    ps = PorterStemmer()
    psdict = {}
    for w in corp.words:
        psdict[w] = ps.stem(w)

    return psdict
Example #5
def task1(input_file_name, output_file_name, stop_words_list):

    # open the input file and the list of stop words and create output file
    f_input = open(input_file_name, "r")
    f_output = open(output_file_name, "w+")
    f_stop_words = open(stop_words_list, "r")

    list_lines = f_input.readlines()
    #list of stop words
    list_stop_words = f_stop_words.readlines()
    list_stop_words = list(map(lambda x: x.strip(), list_stop_words))

    #list of document names
    list_documents = []

    ps = PorterStemmer()

    for i in range(len(list_lines)):
        list_words = []  #list of words for a line
        list_words_stemming = []  #list of stemming words for a line

        list_documents.append(list_lines[i].split()[0])

        #replace all whitespace (tabs, newlines) with spaces
        list_lines[i] = re.sub(r'\s', " ", list_lines[i])
        #convert to lower case
        list_lines[i] = list_lines[i].lower()
        #remove digits (relies on `digits` from the string module)
        list_lines[i] = list_lines[i].translate(str.maketrans('', '', digits))
        #remove punctuation
        list_lines[i] = re.sub(r'[^a-zA-Z0-9\s]', '', list_lines[i])

        for w in list_lines[i].split()[1:]:
            if w not in list_stop_words:
                list_words.append(w)

        for y in list_words:
            list_words_stemming.append(ps.stem(y, 0, len(y) - 1))

        # Write the document name in front of the content in the output file
        f_output.write(list_documents[i] + "\t")
        # Write the content of the document in the output file
        for z in list_words_stemming:
            f_output.write(z + " ")
        f_output.write("\n")

    # Close all the files
    f_output.close()
    f_input.close()
    f_stop_words.close()
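# Usage sketch (hypothetical file names, not from the original source).  Each line
# of the input file is expected to start with a document name followed by its text;
# the output file gets one line per document: the name, a tab, then its stemmed,
# stop-word-filtered words.  task1 also relies on `re`, `digits` from the string
# module, and a Tartarus-style PorterStemmer being importable in the module.
#
#     task1("documents.txt", "documents_stemmed.txt", "stopwords.txt")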
Example #6
def add_tokens():
    """Return a posting list of all unique words in the collection."""

    # consider only the title, date, author, category and post_text columns
    # reason: the url column contains redundant information (the title) and the
    # remaining columns are numbers that are not useful to the vector space model

    title_file = "TUAW-dataset/data/title.txt"
    date_file = "TUAW-dataset/data/date.txt"
    author_file = "TUAW-dataset/data/author.txt"
    category_file = "TUAW-dataset/data/category.txt"
    post_text_file = "TUAW-dataset/data/post_text.txt"

    posting_list = {}
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))

    doc_id = -1
    total_num_docs = 0

    # read the same line of the files together
    # open(date_file) as date_fd, \
    with open(title_file) as title_fd, \
        open(author_file) as author_fd, \
        open(category_file) as category_fd, \
        open(post_text_file) as post_text_fd:
        lines = zip(title_fd, author_fd, category_fd, post_text_fd)
        for line in lines:
            total_num_docs += 1
            doc_id += 1  # == line_num
            if doc_id % 1000 == 999:
                print("Processed " + str(doc_id + 1) + " posts")

            # title + author + category + post_text
            line_string = (line[0].strip() + " " + line[1].strip() + " " +
                           line[2].strip() + " " + line[3].strip())

            # normalize the terms in the line == post
            term_list = normalize(line_string, stemmer, stopwords_set)

            # add every word to posting list
            for word in term_list:
                # type(posting list) == { term: [df, {doc_id: tf}] }
                if word in posting_list:
                    doc_dict = posting_list[word][1]
                    if doc_id in doc_dict:
                        doc_dict[doc_id] = doc_dict[doc_id] + 1
                    else:
                        posting_list[word][0] += 1
                        doc_dict[doc_id] = 1
                elif len(word) > 0:  # add only words of non-zero length, check again
                    temp_dict = {}
                    temp_dict[doc_id] = 1
                    posting_list[word] = [1, temp_dict]

    return (total_num_docs, posting_list)
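# normalize() is referenced above but not shown in this example; a plausible
# sketch, assuming it lowercases the text, strips everything but letters, drops
# stopwords, and stems each remaining token with the same 3-argument Porter
# stemmer used elsewhere (the original helper may differ):
import re

def normalize(text, stemmer, stopwords_set):
    text = re.sub(r"[^a-z\s]", " ", text.lower())   # keep letters and spaces only
    terms = []
    for token in text.split():
        if token in stopwords_set:
            continue
        terms.append(stemmer.stem(token, 0, len(token) - 1))
    return terms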
Example #7
def search(query_string, k, line_num_dict, N):
    """Return top @k search results for @query_string from the corpus of @N documents using \
@line_num_dict as a lookup table."""
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))

    # normalize the query
    term_list = normalize(query_string, stemmer, stopwords_set)

    query_freq = {}  # number of occurrences of every unique term
    for term in term_list:
        if term in query_freq:
            query_freq[term] = query_freq[term] + 1
        elif len(term) > 0:  # add only term of non-zero length
            query_freq[term] = 1

    # retrieve only necessary posting lists in the order they appear in the file
    lines_to_get = []
    for term in query_freq.keys():
        if term in line_num_dict:  # skip query terms that never occur in the corpus
            lines_to_get += [line_num_dict[term]]
    lines_to_get.sort()

    # if no word in the query occurs in the data, the posting list will be empty
    if len(lines_to_get) == 0:
        print("No results found")
        sys.exit(0)

    posting_list = get_posting_list(lines_to_get)

    (weight_query, doc_dict) = calc_weights(query_freq, posting_list, N)

    top_k = get_top_k(weight_query, doc_dict, k)

    # result = doc_id + score + title + url
    title_file = "TUAW-dataset/data/title.txt"
    post_url_file = "TUAW-dataset/data/post_url.txt"

    # sort based on doc_id for efficient retrieval
    docs_to_get = []
    for doc_id in top_k.keys():
        docs_to_get += [doc_id]
    docs_to_get.sort()

    current_index = 0
    with open(title_file) as title_fd, open(post_url_file) as post_url_fd:
        lines = zip(title_fd, post_url_fd)
        for i, line in enumerate(lines):
            if i == docs_to_get[current_index]:
                title_string = "Title = " + line[0]
                post_url_string = "URL = " + line[1]
                top_k[i][1] = title_string + post_url_string
                current_index += 1
                if current_index == len(docs_to_get):
                    break

    # sort top_k based on score
    result = OrderedDict(
        sorted(top_k.items(), key=lambda t: t[1][0], reverse=True))

    # print output
    num_results = 1
    for doc_id, (score, details) in result.items():
        print(
            str(num_results) + ". Doc_ID = " + str(doc_id) + " ; Score = " +
            str(score))
        print(details)
        num_results += 1
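# calc_weights() and get_top_k() are not shown in this example; a minimal sketch of
# one common tf-idf scoring scheme over the { term: [df, {doc_id: tf}] } posting-list
# layout described in Example #6 (the original helpers may weight and normalize
# differently):
import math
from collections import defaultdict

def score_query(query_freq, posting_list, N):
    scores = defaultdict(float)
    for term, qtf in query_freq.items():
        if term not in posting_list:
            continue
        df, doc_tfs = posting_list[term]
        idf = math.log10(N / df)                  # inverse document frequency
        w_query = (1 + math.log10(qtf)) * idf     # query-side tf-idf weight
        for doc_id, tf in doc_tfs.items():
            scores[doc_id] += w_query * (1 + math.log10(tf))
    return scores                                 # unnormalized similarity scores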
Example #8
def stemWord(word):
    stemmer = PorterStemmer()
    return stemmer.stem(word, 0, len(word) - 1)
Example #9
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    #
    # Create an instance of the Porter stemmer; we will call its stem method
    # to 'stem' the tokens extracted from the line.
    #
    p = PorterStemmer()

    # this replaces any tab characters with a space character in the line
    # read from the file
    line = line.replace('\t', ' ')
    line = line.strip()
    #line.encode('ascii', 'ignore')

    #
    # This routine splits the contents of the line into tokens
    l = splitchars(line)

    # process each token in the line
    for elmt in l:
        # This statement removes the newline character if found
        elmt = elmt.replace('\n', '')

        # This statement converts all letters to lower case
        lowerElmt = elmt.lower().strip()

        #
        # Increment the counter of the number of tokens processed.  This value will
        # provide the total size of the corpus in terms of the number of terms in the
        # entire collection
        #
        tokens += 1

        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        #
        if len(lowerElmt) < 2:
            continue

        #
        # if the token is in the stopwords list then do not include in the term
        # dictionary and do not index the term.
        #
        if (lowerElmt in stopwords):
            continue

        #
        # Skip numbers so they are not added to the index: try to convert the
        # term to an integer.  If the term contains non-numeric characters the
        # conversion raises ValueError and we keep the term for indexing; if it
        # succeeds, the term is a number and the continue statement moves on to
        # the next token from the 'for' loop.
        #
        try:
            dummy = int(lowerElmt)
        except ValueError:
            # Value is not a number so we can index it
            stemword = lowerElmt
        else:
            # value is a number so we will NOT add it to the index
            continue

        #
        # Call the Porter stemmer code included in our indexer process.  This
        # algorithm stems the tokens, which reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary
        # then add the term
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
            db[lowerElmt].docids = dict()
            db[lowerElmt].docs = 0

        # if the document is not currently in the postings
        # list for the term then add it
        #
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # Increment the counter that tracks the term frequency
        db[lowerElmt].docids[documents] += 1
    return l
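# splitchars() is referenced above but not defined in this example; a plausible
# sketch, assuming it splits the line on runs of non-word characters (the same idea
# as the `chars = re.compile(r'\W+')` pattern defined in Example #11):
import re

def splitchars(line):
    return re.split(r'\W+', line)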
Example #10
def generate_feature_csv(
    csv_out, csv_in="bechdel_full.csv", female_word_filename=None, female_name_filename=None, verbose=False
):
    """
    Given a csv file csv_in of features, 
    """

    if verbose:
        print("Generating basic features and booleans...")

    raw_data = pd.read_csv(csv_in)
    data = pd.DataFrame(index=raw_data.index)
    data["Bechdel_pass"] = [1 if x == "pass" else 0 for x in raw_data["Bechdel_rating"]]
    data["Year"] = raw_data["Year"]

    # Only 2 films have N/A votes and ratings. I think it's OK to just zero
    # their votes/ratings here

    data["imdbRating"] = [x if x != "N/A" else 0 for x in raw_data["imdbRating"]]
    data["imdbVotes"] = [int(re.sub(",", "", x)) if x != "N/A" else 0 for x in raw_data["imdbVotes"]]

    # Adding booleans for month (not present for all releases). The thinking is
    # that movie "types" are released in seasons - blockbusters in the summer,
    # Oscar winners near year's end - and this may impact Bechdel rating.

    release_months = [
        datetime.datetime.strptime(x, "%d %b %Y").month if x != "N/A" else None for x in raw_data["Released"]
    ]
    release_months = level_booleans(release_months, "Month", zeros_ones=True)
    for col in release_months.columns:
        data[col] = release_months[col]

    # Booleans for parental rating. Uses the rating_bucket function to deal
    # with the wide variety of rating types.

    rating_buckets = [rating_bucket(x) for x in raw_data["Rated"]]
    rating_buckets = level_booleans(rating_buckets, "Rating", zeros_ones=True)
    for col in rating_buckets.columns:
        data[col] = rating_buckets[col]

    # Genre membership, this was actually easy to process because they're
    # pretty clean

    genre_membership = level_booleans(raw_data["Genre"], "Genre", sep=", ", zeros_ones=True)
    for col in genre_membership.columns:
        data[col] = genre_membership[col]

    # Runtime in minutes

    runtime_re = re.compile(r"((?P<hr>\d+) h){0,1} {0,1}((?P<min>\d+) min){0,1}")
    runtime_mins = []
    runtime_na = []
    for runtime_str in raw_data["Runtime"]:
        if runtime_str == "N/A":
            runtime_mins.append(0)
            runtime_na.append(1)
        else:
            runtime_match = runtime_re.match(runtime_str)
            (runtime_hr, runtime_min) = runtime_match.group("hr"), runtime_match.group("min")
            if runtime_hr is None:
                runtime_hr = 0
            if runtime_min is None:
                runtime_min = 0
            runtime_mins.append(int(runtime_hr) * 60 + int(runtime_min))
            runtime_na.append(0)
    data["Runtime"] = runtime_mins
    data["Runtime_na"] = runtime_na

    if verbose:
        print("Generating word-based features (stemmed words and female names)...")

    # Porter-stemmed titles and plot summaries, and look for "female words"
    # (like 'she', 'woman', etc.)

    if female_word_filename is not None:
        ps = PorterStemmer()
        f = open(female_word_filename, "r")
        female_stems = set([ps.stem(x.strip().lower(), 0, len(x.strip()) - 1) for x in f])
        f.close()
        has_female_word = []
        for plot in raw_data["Title"] + " " + raw_data["Plot"]:
            if plot == "N/A":
                has_female_word.append(None)
            else:
                cur_has_female_word = 0
                plot_clean = re.sub(r"[^\w\s]", " ", plot).lower().strip()
                plot_words = re.split(r"\s+", plot_clean)
                plot_stems = [ps.stem(x, 0, len(x) - 1) for x in plot_words]
                for plot_stem in plot_stems:
                    if plot_stem in female_stems:
                        cur_has_female_word = 1
                        break
                has_female_word.append(cur_has_female_word)
        data["Female_word"] = has_female_word

    # Number of female names in the actor list: 0 or 1 (and anything not
    # flagged as either should be considered 2+)

    if female_name_filename is not None:
        f = open(female_name_filename, "r")
        female_nameset = set([x.strip().lower() for x in f])
        f.close()
        has_0_female_name = []
        has_1_female_name = []
        for actor_list in raw_data["Actors"]:
            if actor_list == "N/A":
                # again this issue only comes up twice
                has_0_female_name.append(0)
                has_1_female_name.append(0)
            else:
                actor_clean = re.sub(r"[^\w\s]", " ", actor_list).lower().strip()
                actor_names = re.split(r"\s+", actor_clean)
                female_name_count = 0
                for actor_name in actor_names:
                    if actor_name in female_nameset:
                        female_name_count += 1
                if female_name_count == 0:
                    has_0_female_name.append(1)
                    has_1_female_name.append(0)
                elif female_name_count == 1:
                    has_0_female_name.append(0)
                    has_1_female_name.append(1)
                else:
                    has_0_female_name.append(0)
                    has_1_female_name.append(0)
        data["Actress_0"] = has_0_female_name
        data["Actress_1"] = has_1_female_name

    data.to_csv(csv_out, index=False)

    if verbose:
        print("Feature generation complete, output to %s." % csv_out)
Example #11
# the database is a simple dictionary
database = {}

# regular expressions for extracting words, extracting the ID from a path, and checking for a hex value
chars = re.compile(r'\W+')
atLeast3Chars = re.compile(r'\w{3,}')
notDigit = re.compile(r'\D*')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# global counters
tokens = 0
documents = 0
terms = 0
stopWordsFound = 0
stemmer = PorterStemmer()


#
# We will create a term object for each unique instance of a term
#
class Term():
    termid = 0
    termfreq = 0
    docs = 0
    docids = {}

    # The code added:
    # ===================================================================
    # Calculate the inverse document frequency
    # ===================================================================
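    # A minimal sketch of a typical IDF computation (the code the comment above
    # announces is not shown in this excerpt); it assumes idf = log10(N / docs),
    # where N is the collection size and docs is this term's document frequency:
    def idf(self, num_documents):
        import math
        return math.log10(num_documents / self.docs) if self.docs else 0.0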