Exemplo n.º 1
0
def insert_song(link, lyrics):
    """Register a new song url in the current url map and store its lyrics.

    If the song table already holds an item whose id is ``link``, a notice is
    printed and nothing is written. Otherwise ``link`` is appended to the
    ``url_list`` of the map item named by ``get_current_map()`` and a new item
    ``{'id': link, 'lyrics': lyrics}`` is put into the song table.

    :param link: str for the url we are scraping lyrics from, becomes unique identifier for the song lyrics
    :param lyrics: the lyrics to store; written unchanged under the ``lyrics`` attribute
    :return: None
    """
    existing = song_table.get_item(Key={'id': link})
    if 'Item' in existing:
        print("{} already exists in song table".format(link))
        return

    map_name = get_current_map()
    # Deliberately not called `map`, which would shadow the builtin.
    map_item = song_table.get_item(Key={'id': map_name})['Item']

    try:
        url_list = list(map_item['url_list'])
        url_list.append(link)
        # The new link's 1-based position equals the list length after the append.
        print("{} is the #{} item in the song table".format(
            link, str(len(url_list))))

        # The map is updated before the song item itself is written.
        helper_methods.update_table(song_table, map_name, "url_list", url_list)
        song_table.put_item(Item={'id': link, 'lyrics': lyrics})
    except KeyError:
        print("Key Error")
def replace_words(words):
    """
    Given a dict of words and their replacements, replaces those words in the
    stored ``lyric_array`` of every song.

    A word is only replaced when its replacement already has an item in the
    word table; that item's ``num_occurrences`` is incremented once for every
    replacement made. Songs without an item or without a ``lyric_array`` are
    skipped.

    :param words: dict of words and replacements
    """
    song_urls = lyricsorter.get_song_url_list()
    for link in song_urls:
        response = song_table.get_item(Key={'id': link})
        # A missing song item or missing lyric_array simply means no lines.
        lyrics = response.get('Item', {}).get('lyric_array', [])

        changed = False
        for line in lyrics:
            for index, w in enumerate(line):
                if w not in words:
                    continue
                print(w)
                fix = words[w]
                print(fix)

                word_response = word_table.get_item(Key={'id': fix})
                if 'Item' not in word_response:
                    # The replacement is unknown to the word table: leave the
                    # lyric untouched.
                    continue

                num_occurrences = word_response['Item']['num_occurrences']
                print(link)
                print(line)
                helper_methods.update_table(word_table, fix,
                                            "num_occurrences",
                                            (num_occurrences + 1))
                line[index] = fix
                print(line)
                changed = True

        # Persist the edited lyrics once per song rather than rewriting the
        # whole array after every single replaced word.
        if changed:
            helper_methods.update_table(song_table, link, "lyric_array",
                                        lyrics)
def setup_word_table():
    """Set ``num_occurrences`` to 0 and ``slang`` to an empty list for every
    word in the DynamoDB word table.

    The words are taken from the first element of ``get_word_list()``.
    """
    vocabulary = get_word_list()[0]
    for entry in vocabulary:
        print(entry)
        # The tuple is rebuilt per word so each call receives its own fresh list.
        for attribute, initial in (("num_occurrences", 0), ("slang", [])):
            helper_methods.update_table(word_table, entry, attribute, initial)
Exemplo n.º 4
0
def get_words(link: str):
    """Split a stored song's lyrics into cleaned words and save them per line.

    The ``lyrics`` attribute of the song item with id ``link`` is split into
    lines. Punctuation is removed, lines are lower-cased, and lines that
    contain numbers or a colon are dropped. Every remaining line with more
    than three space-separated tokens is cleaned word by word through
    ``slang_cleaner``. The result, one list of words per kept line (empty for
    short lines), is written to the song item as ``lyric_array``.

    As a side effect every new word is appended to the module-level
    ``song_words`` list.

    :param link: id (url) of the song item to process
    """
    response = song_table.get_item(Key={'id': link})
    lyrics = str(response['Item']['lyrics'])

    # "." and "-" become spaces; "?", ",", '"' and "!" are dropped.
    punctuation = str.maketrans(".-", "  ", "?,\"!")

    print(link)
    cleaned_lines = []
    for raw_line in lyrics.split("\n"):
        line = raw_line.translate(punctuation).lower()

        # We only want the line if it doesn't have numbers, and doesn't have a
        # colon, which means it's not actual lyrics.
        # NOTE(review): the duplicate check compares the line *before*
        # parentheses are removed and whitespace stripped, against entries
        # stored *after* that cleanup — preserved from the original.
        if (line not in cleaned_lines and hasNumbers(line) is False
                and scraper.contains(line, ":") is False):
            cleaned_lines.append(remove_paranthases(line).strip())

    lyric_array = []
    for line in cleaned_lines:
        line_words = []
        tokens = line.split(" ")
        # We only want sentences with more than three words, because some of
        # the text in the lyrics isn't actual song lyrics.
        if len(tokens) > 3:
            for word in tokens:
                word = str(word).lower()

                # make sure we aren't adding html code
                if len(word) > 0 and scraper.contains(word, "&") is False:
                    if word[0] == '\'':
                        word = word[1:]

                    # replace slang words with their real equivalent
                    word = slang_cleaner.remove_weirdness(word)
                    word = slang_cleaner.clean_misspellings(word)
                    word = slang_cleaner.clean_slang(word)
                    line_words.append(word)
                    # add it to our list of words in the song
                    if word not in song_words and word != ",":
                        song_words.append(word)
        # Short lines keep an (empty) slot so indices match cleaned_lines.
        lyric_array.append(line_words)

    try:
        helper_methods.update_table(song_table, link, "lyric_array",
                                    lyric_array)
    except ClientError as err:
        # Still best-effort, but no longer silent about the lost write.
        print("Could not store lyric_array for {}: {}".format(link, err))
Exemplo n.º 5
0
def insert_words():
    """Insert every non-empty proper word through ``slang_cleaner.insert_word``
    and merge the known slang words into the ``slang_words`` item of the word
    table, stored sorted under ``words``."""
    for proper in proper_words:
        if not len(proper):
            continue
        slang_cleaner.insert_word(proper)

    slang_item = word_table.get_item(Key={'id': "slang_words"})['Item']
    merged = list(slang_item['words'])
    for candidate in slang_words:
        if candidate in merged:
            continue
        merged.append(candidate)

    merged.sort()
    helper_methods.update_table(word_table, "slang_words", "words", merged)
Exemplo n.º 6
0
def get_current_map():
    """Return the id of the map item that new song urls should be added to.

    The ids of all maps are read from the ``map_list`` of the ``mapmap`` item.
    The last map in that list is returned while its ``url_list`` holds fewer
    than 400 urls. Otherwise a new id ``map_<n>`` is appended to ``mapmap``,
    an item with an empty ``url_list`` is created for it, and the new id is
    returned.
    """
    index_item = song_table.get_item(Key={'id': "mapmap"})['Item']
    map_ids = list(index_item['map_list'])

    newest = map_ids[-1]
    newest_item = song_table.get_item(Key={'id': newest})['Item']
    if len(list(newest_item['url_list'])) < 400:
        return newest

    # The newest map is full: register a fresh one, then create it.
    fresh = "map_{}".format(len(map_ids) + 1)
    map_ids.append(fresh)
    helper_methods.update_table(song_table, "mapmap", "map_list", map_ids)
    song_table.put_item(Item={'id': fresh, "url_list": []})
    return fresh
Exemplo n.º 7
0
def populate_sentence_db():
    """Write the sentences of every rhyme word into the lyric table.

    Words are the keys of ``last_word_dict.json``. For each word accepted by
    ``helper_methods.check_phonetic_existance`` the sentences returned by
    ``get_all_sents`` are stored in the lyric table under consecutive integer
    ids. These continue from the running ``total`` kept in the item with id
    -1, which is updated in the same batch. Each sentence item carries ``id``,
    ``sent``, ``len`` and one attribute set to 1 per element of the sentence.
    The ``[first_id, last_id]`` range is stored on the word's rhyme-table item
    as ``sent_ids``.

    Words whose processing raises a botocore ``ClientError`` are reported and
    skipped. All written sentences are finally dumped to ``sent_array.json``.
    """
    with open('last_word_dict.json') as f:
        last_words = dict(json.load(f))

    # Build the key list once instead of on every loop iteration.
    words = list(last_words)
    sent_list = []

    # NOTE(review): iteration starts at index 1, so the first key is never
    # processed. Preserved from the original — confirm this is intentional.
    for i in range(1, len(words)):
        word = words[i]
        try:
            print(word)
            print(i)

            if not helper_methods.check_phonetic_existance(word):
                print("{} has no phonetic representation".format(word))
                continue

            sents = get_all_sents(word)
            response = lyric_table.get_item(Key={'id': -1})

            total = int(response['Item']['total'])
            start = total + 1

            with lyric_table.batch_writer() as batch:
                for sent in sents:
                    sentence = sents[sent]
                    total += 1

                    item = {w: 1 for w in sentence}
                    # Reserved attributes are assigned last so that a sentence
                    # element named "id", "sent" or "len" cannot overwrite the
                    # key, the sentence itself or its length.
                    item['id'] = total
                    item['sent'] = sentence
                    item['len'] = len(sentence)

                    batch.put_item(Item=item)
                    sent_list.append(sentence)
                batch.put_item(Item={'id': -1, 'total': total})
                helper_methods.update_table(rhyme_table, word, "sent_ids",
                                            [start, total])
        except botocore.exceptions.ClientError as err:
            # Best-effort: move on to the next word, but say so.
            print("Skipping {}: {}".format(word, err))

    with open('sent_array.json', 'w') as outfile:
        json.dump(sent_list, outfile, indent=2)
def assign_word_specificities():
    """Mark words in the word table whose selected part-of-speech tags contain
    no generic (function-word) tag.

    Reads ``data.json`` and, for each entry, checks the tags returned by
    ``get_max_json`` against a fixed list of generic tags. When none of them
    matches, attribute ``u`` of that word is set to 1 in the word table. The
    JSON file itself is only read, never written.
    """
    with open('data.json') as handle:
        tagged = json.load(handle)

    generic_tags = [
        "RB", "CD", "EX", "DT", "CC", "IN", "MD", "PDT", "RP", "UH", "TO",
        "PRP", "WDT", "WP", "WP$", "PRP$", "POS", "WRB"
    ]
    for entry in tagged:
        print(entry)
        # Evaluate every tag (no short-circuit), then decide.
        generic_hits = [
            len(tag) > 1 and tag in generic_tags
            for tag in get_max_json(tagged[entry])
        ]
        is_special = not any(generic_hits)

        print(is_special)

        if is_special:
            helper_methods.update_table(word_table, entry, "u", 1)
def fill_word_dict(x: int, y: int):
    """Count word occurrences and slang variants over a range of songs and
    store the results in the word table.

    The song urls at indices ``x`` up to (but not including) ``y``, bounded by
    the number of available urls, are processed. Their lyrics are cleaned line
    by line and word by word. For every cleaned word present in the word list,
    the in-memory ``num_occurrences`` is incremented and any differing
    pre-slang-cleanup spelling is recorded under ``slang``. Afterwards
    ``num_occurrences`` and ``slang`` of every word in the word list are
    written to the word table.

    :param x: index of the first song url to process
    :param y: index one past the last song url to process
    """
    song_urls = lyricsorter.get_song_url_list()
    words = get_word_list()
    word_list = words[0]
    word_dict = words[1]

    # Constant-time membership test; the list itself is kept for the ordered
    # write-back at the end.
    known_words = set(word_list)
    # "." and "-" become spaces; "?", ",", '"', "!" and "+" are dropped.
    punctuation = str.maketrans(".-", "  ", "?,\"!+")

    for index in range(x, min(y, len(song_urls))):
        link = song_urls[index]
        response = song_table.get_item(Key={'id': link})
        lyrics = str(response['Item']['lyrics'])

        # Progress output is 1-based.
        print(str(index + 1))

        cleaned_lines = []
        for raw_line in lyrics.split("\n"):
            line = raw_line.translate(punctuation).lower()

            # We only want the line if it doesn't have numbers, and doesn't
            # have a colon, which means it's not actual lyrics.
            if (line not in cleaned_lines
                    and lyricsorter.hasNumbers(line) is False
                    and scraper.contains(line, ":") is False):
                cleaned_lines.append(
                    lyricsorter.remove_paranthases(line).strip())

        for line in cleaned_lines:
            tokens = line.split(" ")
            # We only want sentences with more than three words, because some
            # of the text in the lyrics isn't actual song lyrics.
            if len(tokens) <= 3:
                continue
            for word in tokens:
                word = str(word).lower()

                # make sure we aren't adding html code
                if len(word) == 0 or scraper.contains(word, "&") is not False:
                    continue
                if word[0] == '\'':
                    word = word[1:]

                # replace slang words with their real equivalent
                word = slang_cleaner.remove_weirdness(word)
                slang_word = slang_cleaner.clean_misspellings(word)
                word = slang_cleaner.clean_slang(slang_word)
                if word not in known_words:
                    continue

                entry = word_dict[word]
                entry["num_occurrences"] = int(entry["num_occurrences"]) + 1
                # Remember the spelling seen before slang cleanup when it
                # differs from the cleaned word.
                if word != slang_word and slang_word not in entry["slang"]:
                    entry["slang"].append(slang_word)

    for i, word in enumerate(word_list):
        print("Inserting word #{} of {}".format(str(i), str(len(word_list))))
        helper_methods.update_table(word_table, word, "num_occurrences",
                                    int(word_dict[word]["num_occurrences"]))
        helper_methods.update_table(word_table, word, "slang",
                                    word_dict[word]["slang"])