def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_client.py <inputfile>')
        sys.exit(2)
    if not user_token:
        print('Missing DISCOGS_TOKEN environment variable')
        sys.exit(2)
    start_time = time.time()
    count_artist = 0
    pathXML = argv[0]
    print('Parsing file {}\n'.format(pathXML))
    for artist in parse_artists(pathXML):
        count_artist += 1
        try:
            artist['images'] = get_images(artist['id'])
            print(u'Artist {0}: {1}\n'.format(count_artist, artist))
        except DiscoGSError:
            print(u'Cannot get images from Artist {0}: {1}\n'.format(
                count_artist, artist))
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
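# hms_string() and print_percentage() are used throughout these scripts but
# defined elsewhere in the repo. A minimal sketch of the assumed behavior,
# inferred from the call sites (elapsed seconds in; counter and total in):
import sys
import time


def hms_string(sec_elapsed):
    # Format a duration in seconds as "H:MM:SS.SS" (assumed format).
    h = int(sec_elapsed / 3600)
    m = int((sec_elapsed % 3600) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def print_percentage(current, total):
    # Overwrite the console line with a completion percentage (assumed
    # behavior; the repo's version may differ).
    sys.stdout.write("\r{:.1f}%".format(100 * (current + 1) / total))
    sys.stdout.flush()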
def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_xml_genre.py <inputfile>')
        sys.exit(2)
    start_time = time.time()
    count_artist = 0
    genres = set()
    pathXML = argv[0]
    print('Parsing release file {}\n'.format(pathXML))
    for artists in parse_genres(pathXML):
        for artist in artists:
            if artist['id']:
                genres.update(artist['genres'])
                count_artist += 1
                print(u'Artist {0}: {1}\n'.format(count_artist, artist))
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
    print('Genres Parsed: {}'.format(genres))
def create_pagerank(C, L, I, k=1):
    """
    :param C: non-zero values of the adjacency matrix in CLI form
    :param L: row-pointer list (len(L) - 1 is the number of pages)
    :param I: column index (target page id) of each value in C
    :param k: number of iterations
    :return: list of PageRank scores (indices are page ids)
    """
    start_time = time.time()
    n = len(L) - 1
    Pi = [1 / n for _ in range(n)]  # uniform initial distribution
    P = [0] * n
    for _ in range(k):
        P = [0] * n  # recompute the distribution from scratch each iteration
        for i in range(n):
            if L[i] == L[i + 1]:  # empty line: page i has no outgoing links
                for j in range(n):
                    P[j] += 1 / n * Pi[i]
            else:
                for j in range(L[i], L[i + 1]):
                    P[I[j]] += C[j] * Pi[i]
            print_percentage(i, n)
        Pi = P  # feed this iteration's result into the next one
    print(" ** Finish create_pagerank()")
    elapsed_time = time.time() - start_time
    print(" Elapsed time create_pagerank() : {}".format(
        hms_string(elapsed_time)))
    return P
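# A small worked example of the CLI encoding consumed above (hypothetical
# data, in the format produced by create_cli() further down): a 3-page graph
# where page 0 links to pages 1 and 2, page 1 links to page 2, and page 2
# has no outgoing links (an "empty line" in the matrix).
#
#     C = [1 / 2, 1 / 2, 1.0]  # 1 / out-degree of the source page
#     L = [0, 2, 3, 3]         # L[i]:L[i+1] slices page i's entries in C and I
#     I = [1, 2, 2]            # target page id of each entry
#     ranks = create_pagerank(C, L, I, k=20)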
def main(argv):
    if not user_token:
        print('Missing NEXT_BIG_SOUND_TOKEN environment variable')
        sys.exit(2)
    max_releases = DEFAULT_MAX_RELEASES
    max_appearances = DEFAULT_MAX_APPEARANCES
    try:
        max_releases = int(argv[0])
        max_appearances = int(argv[1])
    except Exception:
        pass  # keep the defaults if the CLI arguments are missing or invalid
    print('\nGetting {} releases with {} appearances\n'.format(
        max_releases, max_appearances))
    start_time = time.time()
    count_artist = 0
    for artist in get_artists(
        max_chart_releases=max_releases,
        max_chart_appearances=max_appearances,
    ):
        count_artist += 1
        print('Artist {0}: {1} \n'.format(count_artist, artist))
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
def main(argv):
    if not user_token:
        print('Missing NEXT_BIG_SOUND_TOKEN environment variable')
        sys.exit(2)
    max_artists = DEFAULT_MAX_ARTISTS
    initial_artist = DEFAULT_INITIAL_ARTIST
    try:
        max_artists = int(argv[0])
        initial_artist = int(argv[1])
    except Exception:
        pass  # keep the defaults if the CLI arguments are missing or invalid
    print('\nGenerating {} artist ids starting at {}\n'.format(
        max_artists, initial_artist))
    start_time = time.time()
    count_artist = 0
    artist_without_images = 0
    artist_without_genres = 0
    artist_without_social_media_links = 0
    for artist_id in range(initial_artist, initial_artist + max_artists):
        artist = get_artist(id=artist_id)
        if artist:
            count_artist += 1
            print('Artist {0}: {1} \n'.format(artist_id, artist))
            if not artist['images']:
                artist_without_images += 1
            if not artist['genres']:
                artist_without_genres += 1
            if not artist['social_media_links']:
                artist_without_social_media_links += 1
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists parsed beginning at {}: {} from {}'.format(
        initial_artist, count_artist, max_artists))
    print('Artists without images: {} from {}'.format(
        artist_without_images, count_artist))
    print('Artists without genres: {} from {}'.format(
        artist_without_genres, count_artist))
    print('Artists without social_media_links: {} from {}'.format(
        artist_without_social_media_links, count_artist))
def parse_corpus(file_name, pages_count=252374):
    """
    :param file_name: XML file containing pages data
    :param pages_count: expected number of pages (for the progress display)
    :return: list of tuples containing (id, title, content) for each page
    """
    start_time = time.time()
    pagelist_noclean = []
    total_pages_count = 0
    id = None
    title = None
    content = None
    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        tname = elem.tag
        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                content = ''
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id':
                id = int(elem.text)
            elif tname == 'text':
                content = elem.text
            elif tname == 'page':
                total_pages_count += 1
                pagelist_noclean.append((id, title, content))
                print_percentage(total_pages_count, pages_count)
                elem.clear()  # free the finished element to keep memory flat
    elapsed_time = time.time() - start_time
    print(" ** Finish parse corpus")
    print(" - Elapsed time parse corpus : {}".format(
        hms_string(elapsed_time)))
    return pagelist_noclean
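# Note on the iterparse idiom above: elem.clear() after each closed <page>
# is what keeps memory flat while streaming a multi-gigabyte dump. The same
# pattern in isolation (hypothetical file name and tag):
#
#     import xml.etree.ElementTree as ET
#     for event, elem in ET.iterparse('dump.xml', events=('start', 'end')):
#         if event == 'end' and elem.tag == 'page':
#             ...  # consume the completed element here
#             elem.clear()  # drop children so the tree does not keep growing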
def create_clean_tokens_pagelist(pagelist_plaintext):
    start_time = time.time()
    pagelist_clean_tokens = []
    listsize = len(pagelist_plaintext)
    for i, (id, title, content) in enumerate(pagelist_plaintext):
        content_clean_tokens = get_clean_tokens(content, remove_section=True)
        pagelist_clean_tokens.append((id, title, content_clean_tokens))
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finish create clean tokens pagelist")
    print(" - Elapsed time create clean tokens pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_clean_tokens
def create_plaintext_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_plaintext = []
    listsize = len(pagelist_noclean)
    for i, (id, title, content) in enumerate(pagelist_noclean):
        text = wiki_to_paintext(content)
        pagelist_plaintext.append((id, title, text))
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finish create plaintext pagelist")
    print(" - Elapsed time create plaintext pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext
def create_links_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_links = []
    listsize = len(pagelist_noclean)
    for i, (id, title, content) in enumerate(pagelist_noclean):
        links = get_links(content)
        pagelist_links.append((id, title, links))
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finish create links pagelist")
    print(" - Elapsed time create links pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_links
def create_resume_pagelist(pagelist_plaintext):
    start_time = time.time()
    pagelist_plaintext_resume = []
    listsize = len(pagelist_plaintext)
    for i, (id, title, content) in enumerate(pagelist_plaintext):
        resume = get_resume(content)
        pagelist_plaintext_resume.append((id, title, resume))
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finish create resume pagelist")
    print(" - Elapsed time create resume pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext_resume
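# create_clean_tokens_pagelist(), create_plaintext_pagelist(),
# create_links_pagelist() and create_resume_pagelist() all follow the same
# map-with-progress pattern; a generic helper along these lines could replace
# all four (a sketch, not part of the original repo):
def _map_pagelist(pagelist, transform, label):
    start_time = time.time()
    result = []
    listsize = len(pagelist)
    for i, (id, title, content) in enumerate(pagelist):
        result.append((id, title, transform(content)))
        print_percentage(i, listsize)
    print(" ** Finish {}".format(label))
    print(" - Elapsed time {} : {}".format(
        label, hms_string(time.time() - start_time)))
    return result

# e.g. pagelist_links = _map_pagelist(pagelist_noclean, get_links,
#                                     "create links pagelist")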
def create_cli(pagelist_links):
    """
    edge : [[Title]] in page content
    node : page id
    :param pagelist_links: list of tuples containing (id, title, list of links)
    :return: adjacency matrix of the web graph in CLI form
    """
    start_time = time.time()
    listsize = len(pagelist_links)
    dic = {}        # title -> page index
    dic_edges = {}  # page index -> outgoing links that resolve to known titles
    for id_list, (_, title, _) in enumerate(pagelist_links):
        dic[title] = id_list
    for _, id_list in dic.items():
        dic_edges[id_list] = [
            link for link in pagelist_links[id_list][2] if link in dic
        ]
    C = []   # values: 1 / out-degree of the source page
    L = [0]  # row pointers
    I = []   # column indices (target page ids)
    for i, _ in enumerate(pagelist_links):
        links = dic_edges[i]
        edge_nb = len(links)
        val = 1 / edge_nb if edge_nb > 0 else 0
        for link in links:
            C.append(val)
            I.append(dic[link])
        L.append(L[-1] + edge_nb)
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finish create cli")
    print(" - Elapsed time create cli : {}".format(hms_string(elapsed_time)))
    return C, L, I
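# The (C, L, I) triplet returned above is the standard CSR (compressed
# sparse row) layout: C holds the non-zero values, L the row pointers and
# I the column indices. For reference only (scipy is not used by the repo),
# the equivalent scipy construction would be:
#
#     from scipy.sparse import csr_matrix
#     M = csr_matrix((C, I, L), shape=(len(L) - 1, len(L) - 1))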
def main(argv):
    if len(argv) < 1:
        print('Missing XML file input')
        print('test_xml_artist.py <inputfile>')
        sys.exit(2)
    start_time = time.time()
    count_artist = 0
    pathXML = argv[0]
    print('Parsing artist file {}\n'.format(pathXML))
    for artist in parse_artists(pathXML):
        count_artist += 1
        print(u'Artist {0}: {1}\n'.format(count_artist, artist))
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(hms_string(elapsed_time)))
    print('Artists Parsed: {}'.format(count_artist))
def create_dico(pagelist_clean_tokens):
    """
    :param pagelist_clean_tokens: list of pages to parse
    :return: dictionary of the ~200k most used words, plus all the words from
             titles, in the form {word: ({page_id: TF_normalized}, IDF)}
    """
    start_time = time.time()
    dico_title = dict()
    dico_text = dict()
    listsize = len(pagelist_clean_tokens)
    for id, (_, title, content) in enumerate(pagelist_clean_tokens):
        # Tokenize the title; title occurrences are weighted x10
        title_clean = get_clean_tokens(title)
        for word in title_clean:
            if word not in dico_title:  # word not in dict
                dico_title[word] = ({id: 10}, 0)
            elif id not in dico_title[word][0]:  # page is not in list
                dico_title[word][0][id] = 10
            else:  # page already in list
                dico_title[word][0][id] += 10
        for word in content:
            if word not in dico_text:
                dico_text[word] = ({id: 1}, 0)
            elif id not in dico_text[word][0]:  # page is not in list
                dico_text[word][0][id] = 1
            else:  # page already in list
                dico_text[word][0][id] += 1
        print_percentage(id, listsize)
    # Keep the 200k text words appearing in the most pages; title words that
    # also rank in this top slice are overwritten by their text counts
    dico_title.update({
        key: value
        for key, value in sorted(
            list(dico_text.items()),
            key=lambda item: len(item[1][0]))[-200000:]
    })
    tf_norm = dict()  # per-page squared norm of the log-scaled TF vector
    for word, (occ_dic, _) in dico_title.items():
        for pageid, freq in occ_dic.items():
            if freq > 0:
                if pageid not in tf_norm:
                    tf_norm[pageid] = (1 + math.log10(freq)) ** 2
                else:
                    tf_norm[pageid] += (1 + math.log10(freq)) ** 2
    # Write IDF and the normalized TF
    for word in dico_title:
        idf = math.log10(listsize / len(dico_title[word][0]))
        dico_title[word] = (dico_title[word][0], idf)
        for page, tf in dico_title[word][0].items():
            dico_title[word][0][page] = tf / math.sqrt(tf_norm[page])
    elapsed_time = time.time() - start_time
    print(" ** Finish create dico")
    print(" - Elapsed time create dico : {}".format(hms_string(elapsed_time)))
    return dico_title
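# Weighting computed above, for word w and page p with raw count freq(w, p),
# df(w) pages containing w, and N pages in total:
#
#     norm(p)             = sqrt( sum_v (1 + log10(freq(v, p)))^2 )
#     TF_normalized(w, p) = freq(w, p) / norm(p)
#     IDF(w)              = log10(N / df(w))
#
# i.e. cosine normalization over log-scaled term frequencies, with title
# occurrences counted at x10 weight.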
        # Tail of the wiki-markup parser: replace_links() resolves [[...]]
        # link tokens, then nested {{...}} templates are stripped below.
        else:
            token = split[0]
            link = category = None
        links.append((token, link, category))
        return token

    text = re_link.sub(replace_links, text)

    # Repeat for nested structures; performance-wise not perfect
    n = 1
    while n > 0:
        text, n = re_infobox.subn('', text)

    text = re_lf.sub('\n', text)
    return text.strip(' \n}{'), links, tuple(categories)


if __name__ == '__main__':
    t0 = time()
    print("Starting ...")
    scale = 1000
    parse_xml(
        IN_PATH,
        OUT_PATH + '_links_optimized',
        iterations=100,
        batch_size=100 * scale,
        print_every=10 * scale,
    )
    t1 = int(time() - t0)
    print("all done in", hms_string(t1))
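# The subn-until-zero loop above peels nested {{...}} templates off layer by
# layer: each pass removes the innermost matches, and the loop stops once a
# pass makes no substitution. A minimal illustration with a hypothetical
# innermost-template pattern (the repo's re_infobox is defined elsewhere):
#
#     import re
#     re_infobox = re.compile(r'\{\{[^{}]*\}\}')  # templates without nested braces
#     text = 'a {{outer {{inner}} template}} b'
#     n = 1
#     while n > 0:
#         text, n = re_infobox.subn('', text)
#     # text is now 'a  b'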
def _pregenerate_and_serialize():
    start_time = time.time()

    print(" * Start parse corpus")
    pagelist_noclean = parse_corpus(path_corpus_xml)
    print(" * Start serialize pagelist no clean")
    serialize(pagelist_noclean, path_pagelist_noclean)
    # print(" * Start deserialize pagelist no clean")
    # pagelist_noclean = deserialize(path_pagelist_noclean)

    print(" * Start create links pagelist")
    pagelist_links = create_links_pagelist(pagelist_noclean)
    print(" * Start serialize links pagelist")
    serialize(pagelist_links, path_pagelist_links)
    # print(" * Start deserialize links pagelist")
    # pagelist_links = deserialize(path_pagelist_links)

    print(" * Start create plaintext pagelist")
    pagelist_plaintext = create_plaintext_pagelist(pagelist_noclean)
    print(" * Start serialize pagelist plaintext")
    serialize(pagelist_plaintext, path_pagelist_plaintext)
    # print(" * Start deserialize pagelist plaintext")
    # pagelist_plaintext = deserialize(path_pagelist_plaintext)

    print(" * Start create resume pagelist")
    pagelist_plaintext_resume = create_resume_pagelist(pagelist_plaintext)
    print(" * Start serialize resume pagelist")
    serialize(pagelist_plaintext_resume, path_pagelist_plaintext_resume)
    # print(" * Start deserialize pagelist plaintext resume")
    # pagelist_plaintext_resume = deserialize(path_pagelist_plaintext_resume)

    print(" * Start create CLI")
    (C, L, I) = create_cli(pagelist_links)
    print(" * Start serialize CLI")
    serialize((C, L, I), path_cli)
    # print(" * Start deserialize CLI")
    # (C, L, I) = deserialize(path_cli)

    print(" * Start create clean tokens pagelist")
    pagelist_clean_tokens = create_clean_tokens_pagelist(pagelist_plaintext)
    print(" * Start serialize clean tokens pagelist")
    serialize(pagelist_clean_tokens, path_pagelist_clean_tokens)
    # print(" * Start deserialize clean tokens pagelist")
    # pagelist_clean_tokens = deserialize(path_pagelist_clean_tokens)

    print(" * Start create dico")
    dico = create_dico(pagelist_clean_tokens)
    print(" * Start serialize dico")
    serialize(dico, path_dico)
    # print(" * Start deserialize dico")
    # dico = deserialize(path_dico)

    print(" * Start create pagerank")
    pagerank = create_pagerank(C, L, I)
    print(" * Start serialize pagerank")
    serialize(pagerank, path_pagerank)
    # print(" * Start deserialize pagerank")
    # pagerank = deserialize(path_pagerank)

    print(" * Finish")
    elapsed_time = time.time() - start_time
    print(" Elapsed time: {}".format(hms_string(elapsed_time)))
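# serialize() and deserialize() are defined elsewhere in the repo; a minimal
# pickle-based sketch of the assumed contract (any object to a path, and
# back) would be:
import pickle


def serialize(obj, path):
    # Persist an intermediate pipeline artifact to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def deserialize(path):
    # Reload a previously serialized artifact.
    with open(path, 'rb') as f:
        return pickle.load(f)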