def insert_song(link, lyrics):
    """Store a song's lyrics and register its link in the current url map.

    When an item with id ``link`` already exists in the song table, a notice
    is printed and nothing is written. Otherwise the link is appended to the
    ``url_list`` of the map named by ``get_current_map()`` and the lyrics are
    stored as a new item keyed by the link.

    :param link: str for the url we are scraping lyrics from, becomes the
        unique identifier for the song lyrics
    :param lyrics: str array with two parts, chorus and verse
    :return: None
    """
    if 'Item' in song_table.get_item(Key={'id': link}):
        print("{} already exists in song table".format(link))
        return

    map_name = get_current_map()
    map_item = song_table.get_item(Key={'id': map_name})['Item']
    try:
        urls = list(map_item['url_list'])
        urls.append(link)
        # The new link's 1-based position equals the list length after append.
        position = len(urls)
        print("{} is the #{} item in the song table".format(link, position))
        helper_methods.update_table(song_table, map_name, "url_list", urls)
        song_table.put_item(Item={'id': link, 'lyrics': lyrics})
    except KeyError:
        print("Key Error")
def replace_words(words):
    """Replace words in every song's stored ``lyric_array``.

    For each song url returned by ``lyricsorter.get_song_url_list()``, every
    word that is a key of ``words`` is swapped for its replacement, provided
    the replacement has an item in the word table. That item's
    ``num_occurrences`` is incremented once per swap.

    A song's ``lyric_array`` is written back only when at least one word was
    replaced, so songs without a ``lyric_array`` (or without any matching
    word) are left untouched instead of being rewritten.

    :param words: dict mapping a word to its replacement
    :return: None
    """
    for link in lyricsorter.get_song_url_list():
        song_response = song_table.get_item(Key={'id': link})
        try:
            lyrics = song_response['Item']['lyric_array']
        except KeyError:
            # No item for this link, or it has no lyric_array: nothing to fix.
            continue

        changed = False
        for line in lyrics:
            for index, w in enumerate(line):
                if w not in words:
                    continue
                print(w)
                fix = words[w]
                print(fix)
                word_response = word_table.get_item(Key={'id': fix})
                if 'Item' not in word_response:
                    # Replacement is not in the word table: keep the
                    # original word as it is.
                    continue
                num_occurrences = word_response["Item"]['num_occurrences']
                print(link)
                print(line)
                helper_methods.update_table(word_table, fix,
                                            "num_occurrences",
                                            num_occurrences + 1)
                line[index] = fix
                changed = True
                print(line)

        if changed:
            helper_methods.update_table(song_table, link, "lyric_array",
                                        lyrics)
def setup_word_table():
    """Initialize the per-word attributes in the DynamoDB word table.

    For every word in the first element returned by ``get_word_list()``,
    sets ``num_occurrences`` to 0 and ``slang`` to an empty list.
    """
    all_words = get_word_list()[0]
    for entry in all_words:
        print(entry)
        # A fresh tuple (and so a fresh empty list) is built for every word.
        for attribute, initial in (("num_occurrences", 0), ("slang", [])):
            helper_methods.update_table(word_table, entry, attribute, initial)
def get_words(link: str):
    """Split a stored song's lyrics into cleaned word lists and save them.

    The song item with id ``link`` is read from the song table and its
    ``lyrics`` are split into lines. Each line is stripped of punctuation and
    lower-cased. Lines for which ``hasNumbers`` or a ":" check is not
    ``False`` are dropped, and the rest are de-duplicated on their cleaned
    form (after ``remove_paranthases`` and ``strip``).

    Every kept line gets one row in the result. Rows of lines with three or
    fewer space-separated tokens stay empty. Other rows hold the line's
    words after they have been passed through the ``slang_cleaner`` helpers.
    Each word not yet present in the module-level ``song_words`` list is
    appended to it. The rows are stored on the song item as ``lyric_array``.

    :param link: url of the song, used as the song table item id
    :return: None
    """
    song = song_table.get_item(Key={'id': link})['Item']
    raw_lines = str(song['lyrics']).split("\n")
    print(link)

    # "." and "-" become spaces; ? , " and ! are deleted.
    punctuation = str.maketrans(".-", "  ", "?,\"!")

    cleaned_lines = []
    seen = set()
    for raw in raw_lines:
        line = raw.translate(punctuation).lower()
        # We only want the line if it doesn't have numbers and doesn't have
        # a colon, which means it's not actual lyrics.
        if hasNumbers(line) is not False:
            continue
        if scraper.contains(line, ":") is not False:
            continue
        cleaned = remove_paranthases(line).strip()
        # Compare the cleaned form, which is what gets stored; comparing the
        # raw line would miss repeats that differ only in stripped text.
        if cleaned in seen:
            continue
        seen.add(cleaned)
        cleaned_lines.append(cleaned)

    lyric_array = []
    for line in cleaned_lines:
        row = []
        lyric_array.append(row)
        tokens = line.split(" ")
        # We only want sentences with more than three words, because some of
        # the text in the lyrics isn't actual song lyrics.
        if len(tokens) <= 3:
            continue
        for word in tokens:
            word = str(word).lower()
            # Skip empty tokens and anything containing "&", to make sure we
            # aren't adding html code.
            if len(word) == 0 or scraper.contains(word, "&") is not False:
                continue
            if word[0] == "'":
                word = word[1:]
            # Replace slang words with their real equivalent.
            word = slang_cleaner.remove_weirdness(word)
            word = slang_cleaner.clean_misspellings(word)
            word = slang_cleaner.clean_slang(word)
            row.append(word)
            # Add it to our list of words in the song.
            if word not in song_words and word != ",":
                song_words.append(word)

    try:
        helper_methods.update_table(song_table, link, "lyric_array",
                                    lyric_array)
    except ClientError as err:
        # Best effort: a failed write must not stop the caller, but it should
        # at least be visible.
        print("Could not store lyric_array for {}: {}".format(link, err))
def insert_words():
    """Insert the proper words and extend the stored list of slang words.

    Every non-empty entry of ``proper_words`` is passed to
    ``slang_cleaner.insert_word``. The ``words`` list of the "slang_words"
    item in the word table is then extended with each entry of
    ``slang_words`` it does not already contain, sorted, and written back.
    """
    for candidate in proper_words:
        if len(candidate) == 0:
            continue
        slang_cleaner.insert_word(candidate)

    stored = word_table.get_item(Key={'id': "slang_words"})['Item']['words']
    merged = list(stored)
    for candidate in slang_words:
        if candidate in merged:
            continue
        merged.append(candidate)
    merged.sort()
    helper_methods.update_table(word_table, "slang_words", "words", merged)
def get_current_map(max_urls_per_map: int = 400):
    """Return the id of the url map that new song links should be added to.

    The "mapmap" item of the song table holds ``map_list``, the ordered ids
    of all url maps. The last map is returned while its ``url_list`` holds
    fewer than ``max_urls_per_map`` entries. Otherwise (or when ``map_list``
    is empty) a new empty map named ``"map_<n>"`` is created, with ``n``
    being the new number of maps, appended to ``map_list`` and returned.

    :param max_urls_per_map: number of urls at which a map counts as full
    :return: str id of the map to use
    """
    registry = song_table.get_item(Key={'id': "mapmap"})
    maps = list(registry['Item']['map_list'])

    if maps:
        last_map = maps[-1]
        current = song_table.get_item(Key={'id': last_map})
        if len(list(current['Item']['url_list'])) < max_urls_per_map:
            return last_map

    new_map = "map_" + str(len(maps) + 1)
    # Create the map item before registering it, so "mapmap" never lists a
    # map that does not exist if one of the two writes fails.
    song_table.put_item(Item={'id': new_map, "url_list": []})
    maps.append(new_map)
    helper_methods.update_table(song_table, "mapmap", "map_list", maps)
    return new_map
def populate_sentence_db():
    """Write the sentences for each last word into the lyric table.

    Reads ``last_word_dict.json`` and walks its keys. For every word that
    passes ``helper_methods.check_phonetic_existance``, the sentences from
    ``get_all_sents(word)`` are written to the lyric table with consecutive
    integer ids continuing from the ``total`` stored on the item with id -1.
    Each sentence item carries ``id``, ``sent``, ``len`` and one attribute
    set to 1 per word of the sentence. The updated ``total`` is written back
    in the same batch, and the id range ``[start, total]`` is stored as
    ``sent_ids`` for the word in the rhyme table.

    All sentences handled are finally dumped to ``sent_array.json``.

    :return: None
    """
    with open('last_word_dict.json') as f:
        last_words = dict(json.load(f))

    # Snapshot the keys once instead of rebuilding the list per iteration.
    words = list(last_words)
    sent_list = []

    # NOTE(review): iteration starts at index 1, so the first key is never
    # processed. Kept as is; confirm whether that is intentional.
    for i, word in enumerate(words[1:], start=1):
        try:
            print(word)
            print(i)
            if not helper_methods.check_phonetic_existance(word):
                print("{} has no phonetic representation".format(word))
                continue

            sents = get_all_sents(word)
            response = lyric_table.get_item(Key={'id': -1})
            total = int(response['Item']['total'])
            start = total + 1
            with lyric_table.batch_writer() as batch:
                for key in sents:
                    sent_words = sents[key]
                    total += 1
                    # Word flags first, reserved fields last: a lyric word
                    # such as "id", "sent" or "len" must never overwrite the
                    # item's key or metadata.
                    item = {w: 1 for w in sent_words}
                    item['id'] = total
                    item['sent'] = sent_words
                    item['len'] = len(sent_words)
                    batch.put_item(Item=item)
                    sent_list.append(sent_words)
                batch.put_item(Item={'id': -1, 'total': total})
            helper_methods.update_table(rhyme_table, word, "sent_ids",
                                        [start, total])
        except botocore.exceptions.ClientError as err:
            # Best effort: report the failure and carry on with the next word.
            print("Skipping {} after DynamoDB error: {}".format(word, err))

    with open('sent_array.json', 'w') as outfile:
        json.dump(sent_list, outfile, indent=2)
def assign_word_specificities():
    """Flag words whose top part-of-speech tags are not general ones.

    Reads the part-of-speech data in ``data.json`` (the file itself is not
    changed). For each entry, ``get_max_json`` is applied to its data. When
    none of the returned tags is in the fixed list of general tags, the
    entry's ``u`` attribute in the word table is set to 1.
    """
    general_tags = (
        "RB", "CD", "EX", "DT", "CC", "IN", "MD", "PDT", "RP", "UH", "TO",
        "PRP", "WDT", "WP", "WP$", "PRP$", "POS", "WRB",
    )

    with open('data.json') as handle:
        tagged = json.load(handle)

    for entry, tag_data in tagged.items():
        print(entry)
        top_tags = get_max_json(tag_data)
        is_general = any(
            len(tag) > 1 and tag in general_tags for tag in top_tags)
        special = not is_general
        print(special)
        if special:
            helper_methods.update_table(word_table, entry, "u", 1)
def fill_word_dict(x: int, y: int):
    """Count word occurrences and slang variants over a range of songs.

    Processes the songs at indices ``x`` up to (not including) ``y`` of
    ``lyricsorter.get_song_url_list()``, stopping early at the end of the
    list. Each song's lyrics are split into lines, stripped of punctuation,
    lower-cased, filtered (no numbers, no ":") and de-duplicated on their
    cleaned form. Lines with three or fewer space-separated tokens are
    ignored.

    Every remaining word is passed through the ``slang_cleaner`` helpers. If
    the result is one of the words from ``get_word_list()``, its
    ``num_occurrences`` is incremented and, when the pre-``clean_slang``
    spelling differs, that spelling is recorded once in its ``slang`` list.

    Afterwards ``num_occurrences`` and ``slang`` of every word in the word
    list are written to the word table.

    :param x: index of the first song to process
    :param y: index one past the last song to process
    :return: None
    """
    song_urls = lyricsorter.get_song_url_list()
    words = get_word_list()
    word_list = words[0]
    word_dict = words[1]
    # Set lookup instead of scanning the whole word list for every token.
    known_words = set(word_list)

    # "." and "-" become spaces; ? , " ! and + are deleted.
    punctuation = str.maketrans(".-", "  ", "?,\"!+")

    for index in range(x, min(y, len(song_urls))):
        link = song_urls[index]
        song = song_table.get_item(Key={'id': link})['Item']
        raw_lines = str(song['lyrics']).split("\n")
        # Progress output: 1-based position of the song being processed.
        print(str(index + 1))

        cleaned_lines = []
        seen = set()
        for raw in raw_lines:
            line = raw.translate(punctuation).lower()
            # We only want the line if it doesn't have numbers and doesn't
            # have a colon, which means it's not actual lyrics.
            if lyricsorter.hasNumbers(line) is not False:
                continue
            if scraper.contains(line, ":") is not False:
                continue
            cleaned = lyricsorter.remove_paranthases(line).strip()
            # Compare the cleaned form, which is what gets stored; comparing
            # the raw line would miss repeats that differ only in stripped
            # text.
            if cleaned in seen:
                continue
            seen.add(cleaned)
            cleaned_lines.append(cleaned)

        for line in cleaned_lines:
            tokens = line.split(" ")
            # We only want sentences with more than three words, because
            # some of the text in the lyrics isn't actual song lyrics.
            if len(tokens) <= 3:
                continue
            for word in tokens:
                word = str(word).lower()
                # Skip empty tokens and anything containing "&", to make
                # sure we aren't adding html code.
                if len(word) == 0 or scraper.contains(word, "&") is not False:
                    continue
                if word[0] == "'":
                    word = word[1:]
                # Replace slang words with their real equivalent.
                word = slang_cleaner.remove_weirdness(word)
                slang_word = slang_cleaner.clean_misspellings(word)
                word = slang_cleaner.clean_slang(slang_word)
                if word not in known_words:
                    continue
                entry = word_dict[word]
                entry["num_occurrences"] = int(entry["num_occurrences"]) + 1
                if word != slang_word and slang_word not in entry["slang"]:
                    entry["slang"].append(slang_word)

    total_words = len(word_list)
    for i, word in enumerate(word_list):
        print("Inserting word #{} of {}".format(str(i), str(total_words)))
        helper_methods.update_table(word_table, word, "num_occurrences",
                                    int(word_dict[word]["num_occurrences"]))
        helper_methods.update_table(word_table, word, "slang",
                                    word_dict[word]["slang"])