df = sum([unique_term in terms for terms in terms_by_doc_sets])  # hint: iterate over 'terms_by_doc_sets' and test for the presence of 'unique_term' (you may use a list comprehension); you'll get a list of booleans, sum it to get the count

# idf
idf[unique_term] = math.log10(float(n_doc + 1) / df)

if counter % 1e3 == 0:
    print(counter, "terms processed")

###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")
c_g = terms_to_graph(terms_by_doc, w, overspanning=False)

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")
all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")
c_g = ### fill the gap ### hint: use the terms_to_graph function with the proper arguments

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")
all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")

b = 0.003

features_degree = []
features_w_degree = []
features_closeness = []
features_w_closeness = []
features_twicw = []  # we try it only with unweighted degree
features_tfidf = []
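# (sketch, not from the original chunk) the loop that fills the feature
# lists above is cut off here; for each document graph in 'all_graphs',
# the centrality-based features might be computed with python-igraph as
# below. Note that igraph's closeness treats weights as distances, so
# the actual lab code may invert the co-occurrence weights first.

for g in all_graphs:
    features_degree.append(g.degree())                                 # unweighted degree
    features_w_degree.append(g.strength(weights=g.es['weight']))       # weighted degree
    features_closeness.append(g.closeness())                           # unweighted closeness
    features_w_closeness.append(g.closeness(weights=g.es['weight']))   # weighted closeness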
keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywds_gold_standard.append(keywds_stemmed_unique)

if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = []
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, 4))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
))  # remove duplicates (can happen due to n-gram breaking)
keywords_gold_standard.append(keywords_stemmed_unique)

# print progress
if counter % round(len(keyword_names) / 10) == 0:
    print(counter, 'files processed')

###############################
# keyword extraction with gow #
###############################

keywords_gow = []

for counter, abstract in enumerate(abstracts_cleaned):
    # create graph-of-words
    g = terms_to_graph(abstract, w=4)
    # decompose graph-of-words
    core_numbers = dict(zip(g.vs['name'], g.coreness()))
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
    # save results
    keywords_gow.append(keywords)
    # print progress
    if counter % round(len(abstracts_cleaned) / 10) == 0:
        print(counter, 'abstracts processed')

keywords_gow_w = []

for counter, abstract in enumerate(abstracts_cleaned):
keywords_stemmed = [stemmer.stem(keyword) for keyword in keywords]
keywords_stemmed_unique = list(set(keywords_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywords_gold_standard.append(keywords_stemmed_unique)

if counter % round(len(keyword_names) / 10) == 0:
    print(counter, 'files processed')

# In[5]:

##############################
# precompute graphs-of-words #
##############################

gs = [terms_to_graph(abstract, w=SWS) for abstract in abstracts_cleaned]

# In[6]:

##################################
# keyword extraction with k-core #
##################################

keywords_kc = []

for counter, g in enumerate(gs):
    core_numbers = dict(zip(g.vs['name'], g.coreness()))  # compute core numbers
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
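# (sketch, not from the original chunk) with the extracted 'keywords_kc'
# and the 'keywords_gold_standard' built above, the usual evaluation is
# macro-averaged precision/recall/F1 over documents; a minimal version
# (function name is illustrative):

def evaluate_sketch(extracted, gold):
    p_all, r_all, f_all = [], [], []
    for kws, truth in zip(extracted, gold):
        tp = len(set(kws) & set(truth))          # true positives
        p = tp / len(kws) if kws else 0.0        # precision
        r = tp / len(truth) if truth else 0.0    # recall
        f = 2 * p * r / (p + r) if (p + r) else 0.0
        p_all.append(p)
        r_all.append(r)
        f_all.append(f)
    n = len(p_all)
    return sum(p_all) / n, sum(r_all) / n, sum(f_all) / n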
idf[unique_term] = math.log10((len(terms_by_doc) + 1) / df)  ### fill the gap ### hint: use math.log10 and refer to the beginning of Section 2 in the handout

if counter % 1e3 == 0:
    print(counter, "terms processed")

###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")
c_g = terms_to_graph(terms_by_doc, w, overspanning=True)  ### fill the gap ### hint: use the terms_to_graph function with the proper arguments

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")
all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))
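# quick numeric check of the smoothed idf above: with len(terms_by_doc)
# = 100 documents and a term appearing in df = 10 of them,
# math.log10((100 + 1) / 10) ≈ 1.0043; a term present in every document
# gets log10(101/100) ≈ 0.0043, i.e. close to zero but never negative.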
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = 'A method for solution of systems of linear algebraic equations \
with m-dimensional lambda matrices. A system of linear algebraic \
equations with m-dimensional lambda matrices is considered. \
The proposed method of searching for the solution of this system \
lies in reducing it to a numerical system of a special kind.'

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, 4)

# number of edges
print("Number of edges:", len(g.es), "\n")

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens)), 'The number of nodes should be equal to the number of unique terms'

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
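# 'terms_to_graph' is provided with the lab material and not shown in
# these chunks. A minimal sketch of the idea for the single-document
# call above (assuming python-igraph; the real function also supports
# lists of documents and an 'overspanning' flag): link each term to the
# terms co-occurring with it in a sliding window of size w, weighting
# edges by co-occurrence counts.

import igraph

def terms_to_graph_sketch(terms, w):
    edge_counts = {}
    for i in range(1, len(terms)):
        # connect terms[i] to the (at most) w - 1 terms preceding it
        for j in range(max(0, i - w + 1), i):
            if terms[j] != terms[i]:  # no self-loops
                pair = (terms[j], terms[i])
                edge_counts[pair] = edge_counts.get(pair, 0) + 1
    triples = [(s, t, c) for (s, t), c in edge_counts.items()]
    return igraph.Graph.TupleList(triples, directed=True, weights=True)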
keywds_stemmed = [stemmer.stem(keywd) for keywd in keywds]
keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywds_gold_standard.append(keywds_stemmed_unique)

if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

gs = []
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, 4))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    kcore = core_dec(g, False)
    core_numbers = list(kcore.items())
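# 'core_dec' comes with the lab material. For the unweighted case
# (second argument False), a minimal sketch of an equivalent, assuming
# python-igraph, maps vertex names to igraph core numbers; this is the
# same dict(zip(g.vs['name'], g.coreness())) idiom used in the other
# chunks. The weighted (generalized) decomposition is not sketched here.

def core_dec_sketch(g, weighted=False):
    if not weighted:
        return dict(zip(g.vs['name'], g.coreness()))
    raise NotImplementedError('weighted k-core not sketched')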
set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywds_gold_standard.append(keywds_stemmed_unique)

if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
w = 4
gs = []
for ab in abstracts_cleaned:
    gs.append(terms_to_graph(ab, w))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
]  # remove stopwords (rare but may happen due to n-gram breaking)
keywds_stemmed = [stemmer.stem(keywd) for keywd in keywds]
keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywds_gold_standard.append(keywds_stemmed_unique)

if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = [terms_to_graph(abstract, 4) for abstract in abstracts_cleaned]

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
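    # (sketch, not in the original chunk) the 'pr' branch later in this
    # loop might keep the top 'my_percentage' of terms by PageRank score;
    # igraph's pagerank accepts the co-occurrence weights computed above:
    pr_scores = sorted(zip(g.vs['name'], g.pagerank(weights=g.es['weight'])),
                       key=lambda t: t[1], reverse=True)
    n_keep = max(1, int(round(len(pr_scores) * my_percentage)))
    keywords['pr'].append([term for term, _ in pr_scores[:n_keep]])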
if counter % 1e3 == 0:
    print(counter, "terms processed")

# In[3]:

###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")
c_g = terms_to_graph([all_unique_terms], w, overspanning=False)

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")
all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")
]  # remove stopwords (rare but may happen due to n-gram breaking)
keywords_stemmed = [stemmer.stem(keyword) for keyword in keywords]
keywords_stemmed_unique = list(set(keywords_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywords_gold_standard.append(keywords_stemmed_unique)

if counter % round(len(keyword_names) / 10) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

gs = [terms_to_graph(abstract_cleaned, w=4) for abstract_cleaned in abstracts_cleaned]  ### fill the gap ###

##################################
# keyword extraction with k-core #
##################################

keywords_kc = []

for counter, g in enumerate(gs):
    core_numbers = dict(zip(g.vs['name'], g.coreness()))  # compute core numbers
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = 'A method for solution of systems of linear algebraic equations \
with m-dimensional lambda matrices. A system of linear algebraic \
equations with m-dimensional lambda matrices is considered. \
The proposed method of searching for the solution of this system \
lies in reducing it to a numerical system of a special kind.'

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, 4)

# number of edges
print(len(g.es))

# sanity check (should return True): the number of nodes should be equal to the number of unique terms
print(len(g.vs) == len(set(my_tokens)))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])

print(edge_weights)
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = '''A method for solution of systems of linear algebraic equations
with m-dimensional lambda matrices. A system of linear algebraic
equations with m-dimensional lambda matrices is considered.
The proposed method of searching for the solution of this system
lies in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', ' ')  # a triple-quoted string contains real newlines; replace them with spaces so that words are not glued together

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])

print(edge_weights)
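# 'clean_text_simple' also comes with the lab material and is not shown
# in these chunks. A minimal sketch of the usual pipeline, assuming NLTK
# (the exact implementation may differ): lowercase, strip punctuation,
# tokenize on whitespace, remove stopwords, then stem.

from nltk.stem import PorterStemmer

def clean_text_simple_sketch(text, my_stopwords, punct):
    text = text.lower()
    text = ''.join(ch if ch not in punct else ' ' for ch in text)  # strip punctuation
    tokens = text.split()                                          # whitespace tokenization
    tokens = [t for t in tokens if t not in my_stopwords]          # remove stopwords
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens]                       # stem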
keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
keywds_gold_standard.append(keywds_stemmed_unique)

if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

#%%
##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = [terms_to_graph(toks, 4) for toks in abstracts_cleaned]

#%%
##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
if counter % round(len(keywd_names) / 5) == 0:
    print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = []
window_size = 4  # 100
print('\n Building graphs with a window size of', window_size)
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, window_size))

##################################
# graph-based keyword extraction #
##################################

print('\n -> Graph based keyword extraction \n')

my_percentage = 0.33  # for PR and TF-IDF
method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    # core_numbers = dict(zip(g.vs['name'], g.coreness()))