def test_generate(self):
        """Smoke test: generate a Turtle song graph from the small Discogs releases sample.

        Nothing is asserted; the test only passes if the generation completes
        without raising.
        """
        fancy_dataset = Dataset("A fancy dataset")
        xml_source = rel_path_to_file("../../files/releases_piece_small.xml", __file__)
        discogs_parser = DiscogsSongParser(file_path=xml_source, dataset=fancy_dataset)

        graph_generator = get_clean_graph_generator_mongo_repos()
        turtle_target = rel_path_to_file("../../files/out/large_discogs_release.ttl", __file__)
        graph_generator.generate_turtle_song_graph(file_path=turtle_target,
                                                   song_parser=discogs_parser)
 def test_entity_detection(self):
     """Check that parsing the mini_uso.tsv fixture yields exactly 8 songs."""
     parser = UsosSongParser(dataset=Dataset("some_dataset"),
                             source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
     # Only the number of yielded songs matters here, not their content.
     counter = sum(1 for _ in parser.parse_songs())
     # assertEqual is used because assertEquals is a deprecated alias.
     self.assertEqual(8, counter, "Unexpected number of songs. Expected 8 but parsed " + str(counter))
 def test_titles_detection(self):
     """Check that the titles parsed from mini_uso.tsv are exactly the expected ones.

     The test fails if a parsed canonical title (compared after stripping
     surrounding whitespace) is not expected, or if an expected title is
     never produced by the parser.
     """
     parser = UsosSongParser(dataset=Dataset("some_dataset"),
                             source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
     expected_titles = set([
         "kentucky woman",
         "why tell me why",
         "acapella",
         "human",
         "carry out",
         "don't stop the party",
         "whistle",
         "somebody that i used to know",
     ])
     found = set()
     unexpected = []
     for song in parser.parse_songs():
         title = song.canonical.strip()
         if title in expected_titles:
             found.add(title)
         else:
             # The raw (unstripped) value is what gets reported.
             unexpected.append(song.canonical)
     self.assertEqual(0, len(unexpected), "Some unexpected songs were parsed: " + str(unexpected))
     # Sorted so the failure message is stable between runs.
     not_found = sorted(expected_titles - found)
     self.assertEqual(0, len(not_found), "Some expected songs were not found: " + str(not_found))
 def test_writers_detection(self):
     """Check that the collaborators parsed from mini_uso.tsv are exactly the expected ones.

     Every collaboration of every parsed song is inspected. The test fails if
     a collaborator's canonical name (compared after stripping surrounding
     whitespace) is not expected, or if an expected name never shows up.
     """
     parser = UsosSongParser(dataset=Dataset("some_dataset"),
                             source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
     expected_writers = set([
         "Amy Heidemann",
         "Martin Johnson",
         "Nick Noonan",
         "Sam Hollander",
         "Brandon Flowers",
         "Dave Keuning",
         "Ronnie Vannucci",
         "Jerome Harmon",
         "Jim Beanz",
         "Justin Timberlake",
         "Timothy Clayton",
         "Timothy Mosley",
         "Backer",
         "Wally De",
     ])
     found = set()
     unexpected = []
     for song in parser.parse_songs():
         for a_coll in song.collaborations:
             writer_name = a_coll.collaborator.canonical.strip()
             if writer_name in expected_writers:
                 found.add(writer_name)
             else:
                 # The raw (unstripped) value is what gets reported.
                 unexpected.append(a_coll.collaborator.canonical)
     self.assertEqual(0, len(unexpected), "Some unexpected writers were parsed: " + str(unexpected))
     # Sorted so the failure message is stable between runs.
     not_found = sorted(expected_writers - found)
     self.assertEqual(0, len(not_found), "Some expected writers were not found: " + str(not_found))
 def test_artist_detection(self):
     """Check that the artists parsed from mini_uso.tsv are exactly the expected ones.

     Every artist of every parsed song is inspected. The test fails if an
     artist's canonical name (compared after stripping surrounding whitespace)
     is not expected, or if an expected name never shows up.
     """
     parser = UsosSongParser(dataset=Dataset("some_dataset"),
                             source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
     expected_artists = set([
         "Neil Diamond",
         "Anita Meyer",
         "Karmin",
         "The Killers",
         "Justin Timberlake",
         "Timbaland",
         "The Black Eyed Peas",
         "Flo Rida",
         "Gotye",
     ])
     found = set()
     unexpected = []
     for song in parser.parse_songs():
         for artist in song.artists:
             artist_name = artist.canonical.strip()
             if artist_name in expected_artists:
                 found.add(artist_name)
             else:
                 # The raw (unstripped) value is what gets reported.
                 unexpected.append(artist.canonical)
     self.assertEqual(0, len(unexpected), "Some unexpected artists were parsed: " + str(unexpected))
     # Sorted so the failure message is stable between runs.
     not_found = sorted(expected_artists - found)
     self.assertEqual(0, len(not_found), "Some expected artists were not found: " + str(not_found))
    def test_many_real_songs(self):
        """Smoke test: iterate the Discogs parser over discogs_releases.xml.

        Nothing is asserted. Iteration stops right after the 49,999th song,
        or earlier if the parser runs out of songs.
        """
        releases_xml = rel_path_to_file("../../files/discogs_releases.xml", __file__)
        discogs_parser = DiscogsSongParser(file_path=releases_xml,
                                           dataset=Dataset(title="TestDataset"))

        # Numbering starts at 2, so the 49,999th song is the first one numbered 50000.
        for next_number, _ in enumerate(discogs_parser.parse_songs(), 2):
            if next_number % 50000 == 0:  # 50.000
                break
    def test_many_songs_parsed(self):
        """Smoke test: iterate the USOS parser over bmat2heaven.tsv.

        Nothing is asserted. Iteration stops once 50000 songs have been
        consumed, or earlier if the parser runs out of songs.
        """
        uso_dataset = Dataset("some_dataset")
        uso_source = rel_path_to_file("../../files/in/bmat2heaven.tsv", __file__)
        usos_parser = UsosSongParser(dataset=uso_dataset, source_file=uso_source)

        for parsed_so_far, _ in enumerate(usos_parser.parse_songs(), 1):
            if parsed_so_far >= 50000:
                break
示例#8
0
def get_executer_memory_repos_file_rdflib_graph(str_json_config):
    """Build a QueryExecuter over the bundled ``files/mini_usos`` data.

    The graph is loaded from a Turtle file into an rdflib ``Graph``; the
    artist and song n-gram repositories and the entity counter are loaded
    from JSON files. A progress message is printed before each loading step.

    :param str_json_config: JSON text; it is parsed with ``json.loads`` and
        handed to ``translate_json_to_mera_match_config`` to obtain the
        match configuration.
    :return: a ``QueryExecuter`` wired to a ``MeraMatcher`` over those
        structures and a ``FormaterToJson`` formater.
    """
    # Graph from file
    print("loading graph")
    turtle_path = rel_path_to_file("files/mini_usos/mini_usos_graph.ttl", __file__)
    raw_graph = Graph()
    raw_graph.load(turtle_path, format="turtle")
    wrapped_graph = MeraRdflibGraph(raw_graph)

    # Memory artist repo from file
    print("loading artist ngrams")
    artists_json = rel_path_to_file("files/mini_usos/artist.json", __file__)
    artists_repo = MemoryEntityNgrams(base_entity_uri=base_entities_URI,
                                      type_of_entity_collection=ARTIST_COLLECTION,
                                      load_file=artists_json)

    # Memory song repo from file
    print("loading songs ngrams")
    songs_json = rel_path_to_file("files/mini_usos/song.json", __file__)
    songs_repo = MemoryEntityNgrams(base_entity_uri=base_entities_URI,
                                    type_of_entity_collection=SONG_COLLECTION,
                                    load_file=songs_json)

    # Memory counter repo from file
    print("loading repo counter")
    counter_json = rel_path_to_file("files/mini_usos/counter.json", __file__)
    counter_repo = MemoryEntityCounter(load_file=counter_json)

    # Matcher over those structures
    print("Loading matcher")
    match_config = translate_json_to_mera_match_config(json.loads(str_json_config))
    mera_matcher = MeraMatcher(graph=wrapped_graph,
                               artist_ngrams_repository=artists_repo,
                               song_ngrams_repository=songs_repo,
                               entity_counter_repository=counter_repo,
                               match_config=match_config)

    # Formater to json
    json_formater = FormaterToJson()

    # Executer over those structures
    print("loading executer")
    return QueryExecuter(matcher=mera_matcher,
                         formater=json_formater)
    def test_json_to_mera_match_config(self):
        """Check that base_config.json is translated into the expected match config values."""
        config_path = rel_path_to_file(rel_path="../../files/usos/base_config.json",
                                       base_file=__file__)
        with open(config_path, "r") as json_source:
            source_content = json.load(json_source)
        config_result = translate_json_to_mera_match_config(source_content)

        # assertEqual is used throughout because assertEquals is a deprecated alias.
        # Minimums per entity type.
        self.assertEqual(0.65, config_result.get_minimum_of_type("artist"))
        self.assertEqual(0.65, config_result.get_minimum_of_type("song"))

        # Top-k settings.
        self.assertEqual(40, config_result.top_k_blocking_function())
        self.assertEqual(5, config_result.top_k_results())

        # Settings attached to the "find_song" command.
        self.assertEqual(1.60, config_result.get_command_threshold("find_song"))
        self.assertEqual(0.80, config_result.get_command_relevance_of_a_type(command_name="find_song",
                                                                             target_type="artist"))
    def test_entity_detection(self):
        parser = DiscogsSongParser(file_path=rel_path_to_file("../../files/releases_piece_big.xml", __file__),
                                   dataset=Dataset(title="TestDataset"))

        counter_songs = 0
        counter_writers = 0
        for a_song in parser.parse_songs():
            counter_songs += 1
            for a_coll in a_song.collaborations:
                if a_coll.role == ROLE_WRITER:
                    counter_writers += 1
            for an_alt in a_song.alternative_titles:
                print a_song.canonical, an_alt

        self.assertEqual(281, counter_songs, msg="Expected 281 songs, but parsed " + str(counter_songs))
        self.assertEqual(427, counter_writers, msg="Expected 427 songs with writter, but parsed " + str(counter_writers))
__author__ = 'Dani'

from wmera.query_gen.query_generator_cwr import CWRQueryGenerator
from wmera.utils import rel_path_to_file

# Build the generator from the CWR works JSON file and write out the serialized MERA queries.
cwr_works_path = rel_path_to_file("../../files/in/cwr-json-to-mera-json/works_group_full.json",
                                  __file__)
query_gen = CWRQueryGenerator(queries_path=cwr_works_path,
                              config_path="Doesntmatteryet")
# NOTE(review): unlike the input path, this output path is not resolved through
# rel_path_to_file, so it presumably depends on the working directory - confirm.
serialized_queries_path = "../../files/out/cwr-json-to-mera-json/posible_queries.json"
query_gen.gen_srialized_mera_json(file_path=serialized_queries_path)


__author__ = 'Dani'

from wmera.mera_core.model.entities import Dataset
from wmera.utils import rel_path_to_file
from test.t_utils.t_factory import get_clean_graph_generator_memory_repos
from wmera.parsers.usos.usos_song_parser import UsosSongParser


# Generate the USOS song graph with memory-backed repositories, then save each repository.
uso_dataset = Dataset("Uso_bmat2heaven")
uso_source = rel_path_to_file("../../files/mini_usos/mini_bmat2heaven.tsv", __file__)
parser = UsosSongParser(dataset=uso_dataset, source_file=uso_source)

generator = get_clean_graph_generator_memory_repos()
graph_target = rel_path_to_file("../../files/out/usos_graph.ttl", __file__)
generator.generate_turtle_song_graph(file_path=graph_target,
                                     song_parser=parser,
                                     isolated=True)

# Save the content of the repositories held by the generator (non-public attributes).
artist_ngrams_target = rel_path_to_file("../../files/out/artist_ngrams_usos.json", __file__)
generator._repo_artists.save_content(artist_ngrams_target)

song_ngrams_target = rel_path_to_file("../../files/out/song_ngrams_usos.json", __file__)
generator._repo_songs.save_content(song_ngrams_target)

counter_target = rel_path_to_file("../../files/out/counter_usos.json", __file__)
generator._repo_counter.save_content(counter_target)



示例#13
0
 def test_generate_song(self):
     """Smoke test: generate a Turtle song graph from a FakeSongParser.

     Nothing is asserted; the test only passes if the generation completes
     without raising.
     """
     graph_generator = get_clean_graph_generator_mongo_repos()
     turtle_target = rel_path_to_file("../../files/out/test_song_gen.ttl", __file__)
     fake_parser = FakeSongParser(dataset=Dataset("A_Dataset"))
     graph_generator.generate_turtle_song_graph(file_path=turtle_target,
                                                song_parser=fake_parser)
__author__ = 'Dani'

from wmera.mera_core.model.entities import Dataset
from wmera.utils import rel_path_to_file
from test.t_utils.t_factory import get_clean_graph_generator_mongo_repos
from wmera.parsers.usos.usos_song_parser import UsosSongParser


# Generate the USOS song graph using the generator obtained from the mongo-repos factory.
uso_dataset = Dataset("Uso_bmat2heaven")
uso_source = rel_path_to_file("../../files/mini_usos/mini_bmat2heaven.tsv", __file__)
parser = UsosSongParser(dataset=uso_dataset, source_file=uso_source)

generator = get_clean_graph_generator_mongo_repos()
graph_target = rel_path_to_file("../../files/out/usos_graph.ttl", __file__)
generator.generate_turtle_song_graph(file_path=graph_target,
                                     song_parser=parser,
                                     isolated=True)
示例#15
0
    print len(art1), len(art2_3), len(art4_plus)

    with open("songs_1_artist.tsv", "w") as target_file:
        for song in art1:
            target_file.write(serialize_song_tsv_artist(song) + "\n")

    with open("songs_2_3_artist.tsv", "w") as target_file:
        for song in art2_3:
            target_file.write(serialize_song_tsv_artist(song) + "\n")

    with open("songs_4_plus_artist.tsv", "w") as target_file:
        for song in art4_plus:
            target_file.write(serialize_song_tsv_artist(song) + "\n")


# Value passed as ``total_songs`` to the extraction below.
num_songs = 45458287
# Disabled alternative values, kept for reference:
# num_songs = 100
# desired_songs = 3500


# Run the extraction over the Discogs releases file, requesting 3500 songs.
extract_random_tsv_songs(file_path=rel_path_to_file("../../files/discogs_releases.xml", __file__),
                         total_songs=num_songs,
                         desired_songs=3500)







    return result


def name_included_in_min_levenshtein_distance(name, target_et):
    """Tell whether ``name`` is at distance 1 or less from any element of ``target_et``.

    The distance is whatever ``levenshtein(element, name)`` returns.

    :param name: value compared against every element.
    :param target_et: iterable of candidate elements.
    :return: True as soon as one element is at distance <= 1; False
        otherwise, including when ``target_et`` is empty.
    """
    return any(levenshtein(candidate, name) <= 1 for candidate in target_et)



##################### ------------ Program ----------- #########################

# Start-up marker printed by the script (Spanish text, roughly "hey, it's me").
print "Eh que soy yo"

# Load the four word collections from the text files next to this script.
# NOTE(review): the results are combined with ``.union`` below, so they are
# presumably sets - confirm against set_of_words_from_file_separator.
clean_artist = set_of_words_from_file_separator(rel_path_to_file("files/clean_artist.txt",
                                                                 __file__))

noisy_artist = set_of_words_from_file_separator(rel_path_to_file("files/noisy_artist.txt",
                                                                 __file__))

clean_songs = set_of_words_from_file_separator(rel_path_to_file("files/clean_song.txt",
                                                                __file__))

noisy_songs = set_of_words_from_file_separator(rel_path_to_file("files/noisy_song.txt",
                                                                __file__))

# Merge the clean and noisy collections for artists and for songs.
artist_parsed = clean_artist.union(noisy_artist)
songs_parsed = clean_songs.union(noisy_songs)

noisy_queries = list_of_lists_of_words_from_file_lines(
    rel_path_to_file("files/noisy-musical-queries.txt",
from wmera.utils import rel_path_to_file

__author__ = 'Dani'

from test.t_utils.t_factory import get_clean_repo_counter_memory, get_clean_repo_song_memory, \
    get_clean_repo_artist_memory, get_clean_repo_artist_mongo, get_clean_repo_counter_mongo, \
    get_clean_repo_songs_mongo

from wmera.adapters.in_memory_to_mogno import dump_in_memory_ngrams_into_mongo_ngrams, \
    dump_in_memory_counter_into_mongo_counter


# Artist ngrams: load the in-memory repository from its JSON file and dump it into the mongo one.
mongo_artist = get_clean_repo_artist_mongo()
memory_artist = get_clean_repo_artist_memory()
artist_ngrams_path = rel_path_to_file("../../files/out/artist_ngrams_usos.json", __file__)
memory_artist.load_content(artist_ngrams_path)

dump_in_memory_ngrams_into_mongo_ngrams(in_memory_repo=memory_artist,
                                        mongo_repo=mongo_artist)
memory_artist = None  # Drop the reference so the memory can be reclaimed

# Song ngrams: same steps for the song repositories.
mongo_song = get_clean_repo_songs_mongo()
memory_song = get_clean_repo_song_memory()
song_ngrams_path = rel_path_to_file("../../files/out/song_ngrams_usos.json", __file__)
memory_song.load_content(song_ngrams_path)

dump_in_memory_ngrams_into_mongo_ngrams(in_memory_repo=memory_song,
                                        mongo_repo=mongo_song)
memory_song = None  # Drop the reference so the memory can be reclaimed
from wmera.mera_core.model.entities import  Dataset


class FakeFormater(MeraFormaterInterface):
    """Formater that prints every element it receives instead of formatting it."""

    def format_mera_results(self, list_of_dicts_with_list_of_base_results):
        """Print each element of the given iterable, one per line; returns None."""
        for entry in list_of_dicts_with_list_of_base_results:
            print(entry)


# Matcher built from the USOS graph, ngram and counter files.
usos_matcher = get_mera_matcher_with_data(graph_path="../../files/out/usos_graph.ttl",
                                          ngram_song_path="../../files/out/song_ngrams_usos.json",
                                          ngram_artist_path="../../files/out/artist_ngrams_usos.json",
                                          counter_path="../../files/out/counter_usos.json")
executer = QueryExecuter(matcher=usos_matcher,
                         formater=FormaterToJson())

# Disabled alternative, kept for reference:
# executer = QueryExecuter(matcher=get_empty_mera_matcher(),
#                          formater=FormaterToJson())

# Execute the queries stored in the file and print whatever the executer returns.
queries_path = rel_path_to_file("../../files/out/cwr-json-to-mera-json/posible_queries.json",
                                __file__)
res = executer.execute_queries_from_file(file_path=queries_path)
print(res)


# NOTE(review): the paths below are not resolved through rel_path_to_file, so they
# presumably depend on the working directory - confirm.
with open("../../files/in/mera_results_mini_usos.json", "r") as file_io:
    json_matches = file_io.read()
    enriched_dataset = Dataset(title="MiDatasetCWR")
    executer.introduce_json_matches_in_graph(json_matches_str=json_matches,
                                             dataset_obj=enriched_dataset,
                                             serialization_path="../../files/out/usos_graph_ENRICHED.ttl")


 def test_graph_repeated_artist(self):
     """Smoke test: generate a Turtle artist graph from a FakeRepeatedArtistsParser.

     Nothing is asserted; the test only passes if the generation completes
     without raising.
     """
     turtle_target = rel_path_to_file("../../files/out/repeated_artists_mini_graph.ttl", __file__)
     test_dataset = Dataset("MyTest", date="2000-feb-15")
     repeated_parser = FakeRepeatedArtistsParser(test_dataset)
     graph_generator = get_clean_graph_generator_mongo_repos()
     graph_generator.generate_turtle_artist_graph(turtle_target, repeated_parser)
示例#20
0
        result = {}
        with open("../files/consultas_aol/consultas-AOL.txt") as in_file:
            tmp_line = ""
            counter = 0
            for line in in_file:
                counter += 1
                if counter % 100000 == 0:
                    print counter
                tmp_line = line.split("\t")
                if len(tmp_line) == 2:
                    tmp_line = tmp_line[1].strip().lower()
                    for a_feat_var in self._feat_vars:
                        if a_feat_var in tmp_line:
                            # print tmp_line
                            if tmp_line in result:
                                result[tmp_line] += 1
                            else:
                                result[tmp_line] = 1
                            break
        return result






##############

# Build the extractor on the AOL queries directory and run it.
aol_queries_dir = rel_path_to_file("../files/consultas_aol/", __file__)
extractor = AolNoisyQueryExtractor(aol_queries_dir)
extractor.run()