예제 #1
0
def _fill(csv_file_list):
    """
    Fill the database given the list of data CSVs.

    For every file, the first non-empty row is treated as the header and
    skipped. A row is only considered when it has as many columns as
    ``SCRIPT_ROW``, and it is only inserted when
    ``song_service.get_song_by_name_and_artist`` returns a falsy value for it.
    Any exception raised while handling a row is logged and the row skipped.
    :param csv_file_list: List of CSV file paths where the data is.
    :return: Database filled.
    """
    song_added = 0
    for csv_file_name in tqdm(csv_file_list, total=len(csv_file_list)):
        # newline='' hands line-ending handling over to the csv module, which
        # is required for line breaks inside quoted fields to parse correctly.
        with open(csv_file_name, 'r', newline='') as csv_file:
            # Stream the non-empty rows instead of loading the whole file.
            rows = (row for row in csv.reader(csv_file) if row)
            # Drop the header, i.e. the first non-empty row (if any).
            next(rows, None)
            for row in rows:
                try:
                    if len(row) != len(SCRIPT_ROW):
                        continue
                    song_name = row[SCRIPT_ROW.index('SONG_NAME')]
                    artist_name = row[SCRIPT_ROW.index('ARTIST_NAME')]
                    if song_service.get_song_by_name_and_artist(
                            song_name, artist_name):
                        continue
                    lyrics = row[SCRIPT_ROW.index('LYRICS')]
                    artist_url = row[SCRIPT_ROW.index('ARTIST_URL')]
                    song_url = row[SCRIPT_ROW.index('SONG_URL')]
                    song_id = song_service.add_song(
                        artist_name, song_name, lyrics, artist_url,
                        song_url)
                    # Only count the row when add_song returned a truthy id.
                    song_added += int(bool(song_id))
                except Exception as e:
                    # Best effort: a broken row must not abort the import.
                    log.warn(f'Skipping row due to [{e}]')
                    log.warn(f'Row: {row}')
    log.info(f'Songs added: [{song_added}]')
def _extract(id_list):
    """
    Extract the maximum distance given all the song identifiers.

    When ``SCRIPT_PARALLEL`` is truthy the work is spread over a process
    pool; otherwise the songs are processed one by one and the running
    maximum is saved through ``_save`` every 1000 songs (starting with the
    first one).
    :param id_list: List full of song identifiers.
    :return: Maximum distance (``0.0`` when the list is empty).
    """
    log.info('Extracting maximum distance...')
    song_count = len(id_list)
    if SCRIPT_PARALLEL:
        with ProcessPool(SCRIPT_PROCESS_AMOUNT) as pool:
            args_list = [(song_id, song_count) for song_id in id_list]
            distances = list(
                tqdm(pool.imap(__get_maximum_distance,
                               args_list,
                               chunksize=SCRIPT_CHUNK_SIZE),
                     total=len(args_list)))
            # default=0.0 keeps an empty identifier list from raising
            # ValueError and matches the result of the sequential branch.
            maximum_distance = max(distances, default=0.0)
    else:
        nmslib_index = Nmslib()
        nmslib_index.load(FILE_NAME_INDEX)
        maximum_distance = 0.0
        for idx, song_id in tqdm(enumerate(id_list), total=song_count):
            features = searcher.extract_features_from_song(song_id)
            if features is not None:
                song_maximum_distance = searcher.get_maximum_distance(
                    features, nmslib_index, song_count)
                if song_maximum_distance > maximum_distance:
                    maximum_distance = song_maximum_distance
            # Periodic checkpoint of the running maximum.
            if idx % 1000 == 0:
                _save(maximum_distance)
    log.info('Extracted!')
    return maximum_distance
예제 #3
0
파일: train.py 프로젝트: tabuckner/searchly
def _get_all_lyrics():
    """
    Fetch every song through the song service and collect its lyrics.
    :return: List with the lyrics of each song, in the order received.
    """
    log.info('Getting all songs...')
    lyrics_list = []
    for song in song_service.get_all_songs():
        lyrics_list.append(song.lyrics)
    log.info(f'Lyrics: [{len(lyrics_list)}]')
    return lyrics_list
def _save(maximum_distance):
    """
    Write the maximum distance, as text, to FILE_NAME_MAXIMUM_DISTANCE.

    The file is opened in write mode, so previous content is replaced.
    :param maximum_distance: Maximum distance to save.
    :return: Maximum distance saved.
    """
    log.info(f'Saving maximum distance: [{maximum_distance}]')
    with open(FILE_NAME_MAXIMUM_DISTANCE, 'w') as output_file:
        distance_text = str(maximum_distance)
        output_file.write(distance_text)
    log.info('Done!')
예제 #5
0
def _shape(lyrics_list):
    """
    Pass the lyrics list through ``word2vec.shape`` and log the result shape.
    :param lyrics_list: List of lyrics.
    :return: Shaped list, as returned by ``word2vec.shape``.
    """
    log.info('Shaping lyrics list...')
    shaped_lyrics = word2vec.shape(lyrics_list)
    log.info(f'Shaping done. Shape: [{shaped_lyrics.shape}]')
    return shaped_lyrics
예제 #6
0
def _get_csv_file_list(unzipping_output_folder):
    """
    Extract all the csv file paths given the generated folder.

    The folder is searched recursively, so CSV files placed directly in it
    or in any nested sub-folder are returned.
    :param unzipping_output_folder: Folder path.
    :return: List of CSV file paths.
    """
    # Without recursive=True, '**' behaves like a plain '*' and only files
    # exactly one directory below the folder would be found.
    csv_file_list = glob.glob(f'{unzipping_output_folder}/**/*.csv',
                              recursive=True)
    log.info(f'{len(csv_file_list)} CSV files extracted.')
    return csv_file_list
예제 #7
0
def _build(lyrics_list):
    """
    Fit a fresh Nmslib index on the given data and save it to
    FILE_NAME_INDEX.
    :param lyrics_list: Index needed data input.
    :return: NMSLIB index built and saved.
    """
    log.info('Building index...')
    nmslib_index = Nmslib()
    nmslib_index.fit(lyrics_list)
    nmslib_index.save(FILE_NAME_INDEX)
    log.info('Index built!')
예제 #8
0
파일: train.py 프로젝트: tabuckner/searchly
def _clean_lyrics(lyrics_list):
    """
    Run ``word2vec.clean_lyrics`` over every entry of the lyrics list.
    :param lyrics_list: List of lyrics to clean.
    :return: List with the cleaned lyrics, in the same order as the input.
    """
    log.info('Cleaning all lyrics...')
    progress = tqdm(lyrics_list, total=len(lyrics_list))
    cleaned = [word2vec.clean_lyrics(lyrics) for lyrics in progress]
    log.info(f'Lyrics cleaned: [{len(cleaned)}]')
    return cleaned
예제 #9
0
def _normalize_lyrics(lyrics_list, w2v_instance):
    """
    Normalize every lyrics entry with ``word2vec.normalize``.

    Entries whose normalization yields ``None`` are dropped. For each kept
    entry, its position in the returned list is registered through
    ``song_service.set_index_id``.
    :param lyrics_list: List of songs.
    :param w2v_instance: Trained word2vec model instance.
    :return: Normalized lyrics list.
    """
    log.info('Normalizing all lyrics...')
    normalized = []
    numbered_lyrics = enumerate(lyrics_list, start=1)
    for position, lyrics in tqdm(numbered_lyrics, total=len(lyrics_list)):
        result = word2vec.normalize(lyrics, w2v_instance)
        if result is None:
            continue
        # The index id of a kept entry is its slot in the output list.
        index_id = len(normalized)
        normalized.append(result)
        # NOTE(review): the 1-based input position is passed as the first
        # argument - presumably the song identifier; confirm with the
        # song service.
        song_service.set_index_id(position, index_id)
    log.info(f'Lyrics normalized: [{len(normalized)}]')
    return normalized
예제 #10
0
def _unzip():
    """
    Unzip the input file to the given folder.

    Both the archive path (``args.input_file``) and the destination
    (``args.unzipping_output_folder``) are read from the script arguments.
    :return: Folder path the archive was extracted into.
    :raises ValueError: If the input file name does not end with '.zip'.
    """
    # Explicit check instead of assert: asserts are removed under python -O,
    # which would silently disable this validation.
    if not args.input_file.endswith('.zip'):
        raise ValueError(
            f'Input file must be a .zip archive: [{args.input_file}]')
    log.info('Opening input file...')
    with zipfile.ZipFile(args.input_file, 'r') as zip_file:
        log.info(
            f'Extracting all file from [{args.input_file}] into [{args.unzipping_output_folder}]...'
        )
        zip_file.extractall(args.unzipping_output_folder)
    log.info('Unzipping done!')
    return args.unzipping_output_folder
예제 #11
0
파일: train.py 프로젝트: tabuckner/searchly
def _train(w2c_instance, lyrics_list):
    """
    Build the vocabulary, train the word2vec instance and save it.

    The example count and epoch count used for training are read from the
    instance itself after the vocabulary has been built.
    :param w2c_instance: Word2vec instance to train.
    :param lyrics_list: Data needed for training the instance.
    :return: Model trained and saved.
    """
    log.info('Introducing the vocabulary...')
    w2c_instance.build_vocab(lyrics_list)
    log.info('Vocabulary introduced.')

    total_examples = w2c_instance.corpus_count
    epochs = w2c_instance.epochs
    log.info(f'Lyrics count: [{total_examples}]')
    log.info(f'Epochs count: [{epochs}]')

    log.info('Start training...')
    w2c_instance.train(
        lyrics_list,
        total_examples=total_examples,
        epochs=epochs,
    )
    log.info('Trained!')

    log.info('Saving instance...')
    word2vec.save_w2v_instance(FILE_NAME_W2V, w2c_instance)
    log.info(f'Saved in [{FILE_NAME_W2V}]')