Example #1
def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_client.py <inputfile>')
        sys.exit(2)

    if not user_token:
        print('Missing DISCOGS_TOKEN environment variable')
        sys.exit(2)

    start_time = time.time()

    count_artist = 0

    pathXML = argv[0]

    print('Parsing file {}\n'.format(pathXML))

    for artist in parse_artists(pathXML):
        count_artist += 1
        try:
            artist['images'] = get_images(artist['id'])
            print(u'Artist {0}: {1}\n'.format(count_artist, artist))
        except DiscoGSError:
            print(u'Cannot get images from Artist {0}: {1}\n'.format(
                count_artist, artist))

    elapsed_time = time.time() - start_time

    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
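Every example on this page relies on an hms_string timer formatter, and the examples below also use a print_percentage progress helper; neither is shown. A minimal sketch of both, assuming hms_string renders seconds as "h:mm:ss" and print_percentage rewrites a single status line (the bodies are inferred from usage, not taken from the original source):

import sys


def hms_string(sec_elapsed):
    # Format a duration in seconds as "h:mm:ss.ss".
    h = int(sec_elapsed // 3600)
    m = int((sec_elapsed % 3600) // 60)
    s = sec_elapsed % 60
    return '{}:{:02d}:{:05.2f}'.format(h, m, s)


def print_percentage(current, total):
    # Rewrite one status line with the completion percentage.
    sys.stdout.write('\r  {:.1f} %'.format(100 * current / total))
    sys.stdout.flush()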
Example #2
def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_xml_genre.py <inputfile>')
        sys.exit(2)

    start_time = time.time()

    count_artist = 0

    genres = set()

    pathXML = argv[0]

    print('Parsing release file {}\n'.format(pathXML))

    for artists in parse_genres(pathXML):
        for artist in artists:
            if artist['id']:
                genres.update(artist['genres'])
                count_artist += 1
                print(u'Artist {0}: {1}\n'.format(count_artist, artist))

    elapsed_time = time.time() - start_time

    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
    print('Genres Parsed: {}'.format(genres))
Example #3
def create_pagerank(C, L, I, k=1):
    """
    Power iteration over the web graph stored in CLI form.
    :param C: edge weights (1 / out-degree of the source page)
    :param L: row pointers; L[i]..L[i+1] delimits page i's edges in C and I
    :param I: target page id of each edge
    :param k: number of iterations
    :return: list of PageRank scores (indices are page ids)
    """
    start_time = time.time()

    n = len(L) - 1
    Pi = [1 / n for _ in range(n)]
    for _ in range(k):
        P = [0] * n
        for i in range(n):
            if L[i] == L[i + 1]:  # dangling page: spread its rank uniformly
                for j in range(n):
                    P[j] += 1 / n * Pi[i]
            else:
                for j in range(L[i], L[i + 1]):
                    P[I[j]] += C[j] * Pi[i]
            print_percentage(i, n)
        Pi = P  # feed this iteration's result into the next one

    print("     ** Finish create_pagerank()")
    elapsed_time = time.time() - start_time
    print("     Elapsed time create_pagerank() : {}".format(
        hms_string(elapsed_time)))
    return Pi
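To make the (C, L, I) layout concrete, here is a hand-built three-page graph run through create_pagerank: page 0 links to pages 1 and 2, page 1 links to page 2, and page 2 is dangling. The values are illustrative only:

# C holds one weight per edge (1 / out-degree of the source page),
# I the target page of each edge, and L[i]..L[i+1] delimits page i's edges.
C = [0.5, 0.5, 1.0]
L = [0, 2, 3, 3]  # L[2] == L[3]: page 2 has no outgoing links
I = [1, 2, 2]

ranks = create_pagerank(C, L, I, k=10)
# page 2, which every other page points to, should end up ranked highest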
Example #4
def main(argv):

    if not user_token:
        print('Missing NEXT_BIG_SOUND_TOKEN environment variable')
        sys.exit(2)

    max_releases = DEFAULT_MAX_RELEASES
    max_appearances = DEFAULT_MAX_APPEARANCES

    try:
        max_releases = int(argv[0])
        max_appearances = int(argv[1])
    except Exception:
        pass

    print('\nGetting {} releases with {} appearances\n'.format(
        max_releases, max_appearances))

    start_time = time.time()

    count_artist = 0

    for artist in get_artists(
            max_chart_releases=max_releases,
            max_chart_appearances=max_appearances,
    ):
        count_artist += 1
        print('Artist {0}: {1} \n'.format(count_artist, artist))

    elapsed_time = time.time() - start_time

    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
Example #5
def main(argv):

    if not user_token:
        print('Missing NEXT_BIG_SOUND_TOKEN environment variable')
        sys.exit(2)

    max_artists = DEFAULT_MAX_ARTISTS
    initial_artist = DEFAULT_INITIAL_ARTIST

    try:
        max_artists = int(argv[0])
        initial_artist = int(argv[1])
    except Exception:
        pass

    print('\nGenerating {} artist ids starting at {}\n'.format(
        max_artists, initial_artist))

    start_time = time.time()

    count_artist = 0
    artist_without_images = 0
    artist_without_genres = 0
    artist_without_social_media_links = 0

    for artist_id in range(initial_artist, initial_artist + max_artists):
        artist = get_artist(id=artist_id)

        if artist:
            count_artist += 1
            print('Artist {0}: {1} \n'.format(artist_id, artist))

            if not artist['images']:
                artist_without_images += 1

            if not artist['genres']:
                artist_without_genres += 1

            if not artist['social_media_links']:
                artist_without_social_media_links += 1

    elapsed_time = time.time() - start_time

    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists parsed beginning at {}: {} from {}'.format(
        initial_artist, count_artist, max_artists))
    print('Artists without images: {} from {}'.format(artist_without_images,
                                                      count_artist))
    print('Artists without genres: {} from {}'.format(artist_without_genres,
                                                      count_artist))
    print('Artists without social_media_links: {} from {}'.format(
        artist_without_social_media_links, count_artist))
Example #6
def parse_corpus(file_name, pages_count=252374):
    """
    :param file_name:
        XML file containing the pages data
    :param pages_count:
        expected number of pages (only used for the progress display)
    :return:
        List of tuples (id, title, content), one per page
    """
    start_time = time.time()

    pagelist_noclean = []
    total_pages_count = 0

    id = None
    title = None
    content = None

    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        tname = elem.tag

        if event == 'start':

            if tname == 'page':
                title = ''
                id = -1
                content = ''
        else:
            if tname == 'title':
                title = elem.text

            elif tname == 'id':
                id = int(elem.text)

            elif tname == 'text':
                content = elem.text

            elif tname == 'page':
                total_pages_count += 1
                pagelist_noclean.append((id, title, content))
                print_percentage(total_pages_count, pages_count)

            elem.clear()

    elapsed_time = time.time() - start_time
    print("  ** Finish parse corpus")
    print("  - Elapsed time parse corpus : {}".format(
        hms_string(elapsed_time)))

    return pagelist_noclean
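A hypothetical call, assuming a MediaWiki-style XML export in which each <page> element carries <title>, <id> and <text> children (the file name is illustrative):

pages = parse_corpus('corpus.xml', pages_count=252374)
page_id, title, wikitext = pages[0]  # raw, uncleaned wikitext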
Example #7
def create_clean_tokens_pagelist(pagelist_plaintext):
    start_time = time.time()
    pagelist_clean_tokens = []
    listsize = len(pagelist_plaintext)

    for i, (id, title, content) in enumerate(pagelist_plaintext):
        content_clean_tokens = get_clean_tokens(content, remove_section=True)
        pagelist_clean_tokens.append((id, title, content_clean_tokens))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create clean tokens pagelist")
    print("  - Elapsed time create clean tokens pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_clean_tokens
Example #8
def create_plaintext_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_plaintext = []
    listsize = len(pagelist_noclean)

    for i, (id, title, content) in enumerate(pagelist_noclean):
        text = wiki_to_paintext(content)
        pagelist_plaintext.append((id, title, text))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create plaintext pagelist")
    print("  - Elapsed time create plaintext pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext
Example #9
def create_links_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_links = []
    listsize = len(pagelist_noclean)

    for i, (id, title, content) in enumerate(pagelist_noclean):
        links = get_links(content)
        pagelist_links.append((id, title, links))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create links pagelist")
    print("  - Elapsed time create links pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_links
Example #10
def create_resume_pagelist(pagelist_plaintext):
    start_time = time.time()
    pagelist_plaintext_resume = []
    listsize = len(pagelist_plaintext)

    for i, (id, title, content) in enumerate(pagelist_plaintext):
        resume = get_resume(content)
        pagelist_plaintext_resume.append((id, title, resume))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create resume pagelist")
    print("  - Elapsed time create resume pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext_resume
Example #11
def create_cli(pagelist_links):
    """
    edge : [[Title]] in page content
    node : page id
    :param pagelist_links: list of pair containing (id, title, list links content)
    :return:
        Adjacency matrix of the web graph in CLI form
    """
    start_time = time.time()
    listsize = len(pagelist_links)
    dic = {}
    dic_edges = {}

    for id_list, (_, title, _) in enumerate(pagelist_links):
        dic[title] = id_list

    for _, id_list in dic.items():
        dic_edges[id_list] = [
            link for link in pagelist_links[id_list][2] if link in dic.keys()
        ]

    C = []
    L = [0]
    I = []

    for i, _ in enumerate(pagelist_links):
        links = dic_edges[i]
        edge_nb = len(links)
        val = 1 / edge_nb if edge_nb > 0 else 0

        for link in links:
            if link not in dic.keys():
                continue

            id_link = dic[link]
            C.append(val)
            I.append(id_link)

        L.append(L[-1] + edge_nb)
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create cli")
    print("  - Elapsed time create cli : {}".format(hms_string(elapsed_time)))
    return C, L, I
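A small illustrative run, assuming three pages titled 'A', 'B' and 'C' where A links to B and C, B links to C, and C links nowhere; it produces the same matrix used in the create_pagerank example above:

pagelist_links = [
    (10, 'A', ['B', 'C']),
    (11, 'B', ['C']),
    (12, 'C', []),
]
C, L, I = create_cli(pagelist_links)
# C == [0.5, 0.5, 1.0], L == [0, 2, 3, 3], I == [1, 2, 2]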
Example #12
def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_xml_artist.py <inputfile>')
        sys.exit(2)

    start_time = time.time()

    count_artist = 0

    pathXML = argv[0]

    print('Parsing artist file {}\n'.format(pathXML))

    for artist in parse_artists(pathXML):
        count_artist += 1
        print(u'Artist {0}: {1}\n'.format(count_artist, artist))

    elapsed_time = time.time() - start_time

    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
Example #13
def create_dico(pagelist_clean_tokens):
    """
    :param pagelist_clean_tokens: list of pages to parse
    :return:
        Dictionnary of ~200k most used words containing all the words from titles in form {word : ({page_id : TF_normalized}, IDF)}
    """
    start_time = time.time()

    dico_title = dict()
    dico_text = dict()
    listsize = len(pagelist_clean_tokens)

    for id, (_, title, content) in enumerate(pagelist_clean_tokens):
        # Tokeniser le titre
        title_clean = get_clean_tokens(title)

        # for word in title_lemmatized:
        for word in title_clean:

            if word not in dico_title.keys():  # word not in dict
                # title occurrences are weighted 10x relative to body text
                dico_title[word] = ({id: 10}, 0)
            else:  # word in dict
                if id not in dico_title[word][0].keys():  # page not in list
                    dico_title[word][0][id] = 10
                else:  # page already in list
                    dico_title[word][0][id] += 10

        for word in content:
            if word not in dico_text.keys():
                dico_text[word] = ({id: 1}, 0)
            else:
                if id not in dico_text[word][0].keys():  # page is not in list
                    dico_text[word][0][id] = 1
                else:  # page already in list
                    dico_text[word][0][id] += 1

        print_percentage(id, listsize)

    dico_title.update({
        key: value
        for key, value in sorted(list(dico_text.items()),
                                 key=lambda item: len(item[1][0].items()))
        [-200000:]
    })

    # for key, value in {key: value for key, value in
    #                    sorted(list(dico_text.items()), key=lambda item: len(item[1][0].items()))[-200000:]}.items():
    #     if key in dico_title.keys():
    #         for page, freq in dico_text[key][0].items():
    #             if page in dico_title[key][0].keys():
    #                 dico_title[key][0][page] += freq
    #             else:
    #                 dico_title[key][0][page] = freq
    #     else:
    #         dico_title[key] = value

    tf_norm = dict()  # normalized TF

    for word, (occ_dic, idf) in dico_title.items():
        for pageid, freq in occ_dic.items():
            if freq > 0:
                if pageid not in tf_norm.keys():
                    tf_norm[pageid] = (1 + math.log10(freq))**2
                else:
                    tf_norm[pageid] += (1 + math.log10(freq))**2

    # writing IDF and normalized TF
    for word in dico_title.keys():
        idf = math.log10(listsize / len(dico_title[word][0].keys()))
        dico_title[word] = (dico_title[word][0], idf)

        for page, tf in dico_title[word][0].items():
            dico_title[word][0][page] = tf / math.sqrt(tf_norm[page])

    elapsed_time = time.time() - start_time
    print("  ** Finish create dico")
    print("  - Elapsed time create dico : {}".format(hms_string(elapsed_time)))
    return dico_title
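A minimal sketch of how the returned structure could be queried to rank pages for a single-word query; the TF * IDF scoring below is an assumption for illustration, not part of the original code:

def score_pages(dico, word):
    # Rank pages for one query word by normalized TF weighted by IDF.
    if word not in dico:
        return []
    tf_by_page, idf = dico[word]
    scores = {page: tf * idf for page, tf in tf_by_page.items()}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)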
Example #14
        else:
            token = split[0]
            link = category = None
        links.append((token, link, category))
        return token

    text = re_link.sub(replace_links, text)

    # repeat for nested structures; performance-wise not ideal
    n = 1
    while n > 0:
        text, n = re_infobox.subn('', text)

    text = re_lf.sub('\n', text)
    return text.strip(' \n}{'), links, tuple(categories)


if __name__ == '__main__':
    t0 = time()
    print("Starting ...")
    scale = 1000
    parse_xml(
        IN_PATH,
        OUT_PATH + '_links_optimized',
        iterations=100,
        batch_size=100 * scale,
        print_every=10 * scale,
    )
    t1 = int(time() - t0)
    print("all done in", hms_string(t1))
Example #15
def _pregenerate_and_serialize():
    start_time = time.time()

    print(" * Start parse corpus")
    pagelist_noclean = parse_corpus(path_corpus_xml)
    print(" * Start serialize pagelist no clean")
    serialize(pagelist_noclean, path_pagelist_noclean)
    # print(" * Start deserialize pagelist no clean")
    # pagelist_noclean = deserialize(path_pagelist_noclean)

    print(" * Start create links pageslist")
    pagelist_links = create_links_pagelist(pagelist_noclean)
    print(" * Start serialize links pagelist")
    serialize(pagelist_links, path_pagelist_links)
    # print(" * Start deserialize links pagelist")
    # pagelist_links = deserialize(path_pagelist_links)

    print(" * Start create plaintext pagelist")
    pagelist_plaintext = create_plaintext_pagelist(pagelist_noclean)
    print(" * Start serialize pagelist plaintext")
    serialize(pagelist_plaintext, path_pagelist_plaintext)
    # print(" * Start deserialize pagelist plaintext")
    # pagelist_plaintext = deserialize(path_pagelist_plaintext)

    print(" * Start create resume pagelist")
    pagelist_plaintext_resume = create_resume_pagelist(pagelist_plaintext)
    print(" * Start serialize resume pagelist")
    serialize(pagelist_plaintext_resume, path_pagelist_plaintext_resume)
    # print(" * Start deserialize pagelist plaintext resume")
    # pagelist_plaintext_resume = deserialize(path_pagelist_plaintext_resume)

    print(" * Start create CLI")
    (C, L, I) = create_cli(pagelist_links)
    print(" * Start serialize CLI")
    serialize((C, L, I), path_cli)
    # print(" * Start deserialize CLI")
    # (C, L, I) = deserialize(path_cli)

    print(" * Start create clean tokens pagelist")
    pagelist_clean_tokens = create_clean_tokens_pagelist(pagelist_plaintext)
    print(" * Start serialize clean tokens pagelist")
    serialize(pagelist_clean_tokens, path_pagelist_clean_tokens)
    # print(" * Start deserialize clean tokens pagelist")
    # pagelist_clean_tokens = deserialize(path_pagelist_clean_tokens)

    print(" * Start create dico")
    dico = create_dico(pagelist_clean_tokens)
    print(" * Start serialize dico")
    serialize(dico, path_dico)
    # print(" * Start deserialize dico")
    # dico = deserialize(path_dico)

    print(" * Start create pagerank")
    pagerank = create_pagerank(C, L, I)
    print(" * Start serialize pagerank")
    serialize(pagerank, path_pagerank)
    # print(" * Start deserialize pagerank")
    # pagerank = deserialize(path_pagerank)

    print(" * Finish")
    elapsed_time = time.time() - start_time
    print(" Elapsed time: {}".format(hms_string(elapsed_time)))