def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()

    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to a disambiguation page!',
                    stop = True
                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphens into white space and retry
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()  # short lower-case keywords may be acronyms; retry upper-cased
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on Wikipedia!',
                stop = True
                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphens into white space and retry
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()  # short lower-case keywords may be acronyms; retry upper-cased
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
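# NOTE: the snippets in this file rely on a small pickling helper from
# utils.util whose implementation is not shown. A minimal sketch, assuming
# `enpickle` simply wraps cPickle and `unpickle` is its inverse:
import cPickle as pickle  # Python 2; the plain `pickle` module works too

def enpickle(obj, file_path):
    """Serialize `obj` to `file_path` (assumed behavior of utils.util.enpickle)."""
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def unpickle(file_path):
    """Load and return the object stored at `file_path`."""
    with open(file_path, 'rb') as f:
        return pickle.load(f)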
""" Computing similarity between each category. """ import logging from gensim.models import LdaModel from sklearn.metrics.pairwise import cosine_similarity from utils.util import enpickle __author__ = 'kensk8er' if __name__ == '__main__': # logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) logging.info('Loading the model...') model = LdaModel.load('result/model_wiki.lda') topics = [] for topic_id in range(model.num_topics): topics.append(model.return_topic(topicid=topic_id)) similarity = cosine_similarity(topics) enpickle(similarity, 'result/topic_similarity/lda_wiki.pkl')
vectorizer = Pipeline((
    ('hasher', hasher),
    ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
))

# calculate TF-IDF
print 'calculate TF-IDF...'
X = vectorizer.fit_transform(text)

# calculate cosine similarities between each text
print 'calculate cosine similarities...'
similarities = calculate_similarities(X)

print 'save similarities and indices...'
date_time = datetime.datetime.today().strftime("%m%d%H%M%S")
enpickle(similarities, 'cache/similarities_' + date_time + '.pkl')
enpickle(resume_indices, 'cache/resume_indices_' + date_time + '.pkl')
enpickle(job_indices, 'cache/job_indices_' + date_time + '.pkl')

# pick up the n most similar job posts and show them
print 'pick up', n_result, 'most similar job posts for each resume...'
results = get_n_most_similar_job_posts(similarity_matrix=similarities, n=n_result,
                                       resume_index_list=range(n_resume))  # resumes come after job posts

print 'show recommendation results for each resume:\n'
show_recommendation_results(result_lists=results, resume_indices=resume_indices, job_indices=job_indices)

# calculate each metric based on relevancy judgements
print 'load relevancy judgements...'
relevancy_judgements = unpickle('data/relevancy/relevancy.pkl')
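# NOTE: `calculate_similarities` is defined elsewhere in the project. A
# minimal sketch, assuming it computes pairwise cosine similarities over the
# rows of the document-term matrix:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarities(X):
    """Return the (n_docs, n_docs) cosine-similarity matrix for the rows of X."""
    return cosine_similarity(X)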
# Perform an IDF normalization on the output of HashingVectorizer
hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None, binary=False)
vectorizer = Pipeline((
    ('hasher', hasher),
    ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
))

# reduce the number of documents for now
#doc_lists = doc_lists[:400]
#doc_indices = doc_indices[:400]

# calculate TF-IDF
print 'calculate TF-IDF...'
X = vectorizer.fit_transform(doc_lists)

# perform LSA
print 'perform LSA...'
lsa = TruncatedSVD(n_components=300, algorithm='arpack')
X = np.matrix(lsa.fit_transform(X))

# calculate cosine similarities between each text
print 'calculate cosine similarities...'
similarities = calculate_similarities(X)

print 'save similarities and indices...'
#date_time = datetime.datetime.today().strftime("%m%d%H%M%S")
enpickle(similarities, 'result/similarities.pkl')
enpickle(doc_indices, 'result/indices.pkl')
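# NOTE: when tuning n_components for TruncatedSVD it can help to check how
# much variance the 300 retained components actually capture; this uses
# scikit-learn's standard attribute on the fitted estimator:
print 'explained variance:', lsa.explained_variance_ratio_.sum()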
file_names = glob('data/relevancy/*.csv')
for file_name in file_names:
    FILE = open(file_name, 'rb')
    reader = csv.reader(FILE)

    row_num = 0
    resume_name = ''
    for row in reader:
        if row_num == 0:
            resume_name = row[0]  # first row holds the resume name
        if row_num == 1:
            pass  # second row is a header; skip it
        if row_num >= 2:
            relevancy = row[0]
            job_url = row[1]

            # search job_name by job_url
            job_name = ''
            for key1, value1 in job_data.items():
                if value1['job_url'] == job_url:
                    job_name = key1

            relevancy_dict[(resume_name, job_name)] = relevancy
        row_num += 1
    FILE.close()

enpickle(relevancy_dict, 'data/relevancy/relevancy.pkl')
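# NOTE: the inner loop above rescans every job post for each CSV row. A
# hypothetical one-off reverse index makes each lookup O(1); `job_data` is
# assumed to map job_name -> {'job_url': ...} exactly as the loop implies:
url2job_name = dict((value['job_url'], key) for key, value in job_data.items())
job_name = url2job_name.get(job_url, '')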
def compute_topics_by_time(time2doc_ids, model, dictionary):
    N = len(time2doc_ids)

    logging.info('Performing inference on corpora...')
    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it a probability
    Z = p_z_d.shape[0]

    time2topics = [[0 for i in range(Z)] for j in range(N)]
    for time, doc_ids in time2doc_ids.items():  # FIXME: improve this for loop (not element-wise)
        for z in range(Z):
            for doc_id in doc_ids:
                if p_z_d[z, doc_id] > 0:
                    time2topics[time][z] += p_z_d[z, doc_id]

    return time2topics


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    interval = WEEK

    logging.info('Loading model...')
    model = LdaModel.load(fname='result/model.lda')

    logging.info('Loading dictionary...')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')

    logging.info('Sort documents by time...')
    time2docids = sort_by_time(dictionary, interval)

    logging.info('Compute topic distribution for each time...')
    time2topics = compute_topics_by_time(time2docids, model, dictionary)

    enpickle(time2topics, 'result/week2topics.pkl')
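# NOTE: a sketch addressing the FIXME above. Since the normalized entries of
# p_z_d are non-negative, the `> 0` guard is redundant and the triple loop
# reduces to one column-sum per time bucket:
import numpy as np

def compute_topics_by_time_vectorized(time2doc_ids, p_z_d):
    time2topics = np.zeros((len(time2doc_ids), p_z_d.shape[0]))
    for time, doc_ids in time2doc_ids.items():
        # sum the topic distributions of all documents in this time bucket
        time2topics[time] = p_z_d[:, list(doc_ids)].sum(axis=1)
    return time2topics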
print "computing P(w)..." p_w = gen_p_w(p_w_z, p_z) print "computing P(z,w)..." p_wz = gen_p_wz(p_w_z, p_z) print "computing P(z|w)..." p_z_w = gen_p_z_w(p_wz, p_w) # print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...' # p_w_z_w = gen_p_w_z_w(p_w_z, p_w) print "computing P(d)..." p_d = gen_p_d(p_d_z, p_z) print "computing P(z,d)..." p_dz = gen_p_dz(p_d_z, p_z) print "computing P(z|d)..." p_z_d = gen_p_z_d(p_dz, p_d) print "saving results into .pkl file..." enpickle(p_w, "result/plsa/p_w.pkl") enpickle(p_wz, "result/plsa/p_wz.pkl") enpickle(p_z_w, "result/plsa/p_z_w.pkl") # enpickle(p_w_z_w, 'result/plsa/p_w_z_w.pkl') enpickle(p_d, "result/plsa/p_d.pkl") enpickle(p_dz, "result/plsa/p_dz.pkl") enpickle(p_z_d, "result/plsa/p_z_d.pkl")
wiki = []
tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
for token in tokens:
    word, pos = token.split('/')
    wiki.append(word)

# convert compound words into one token
wiki = convert_compound(wiki)

# filter stop words, long words, and non-English words
wiki = [w for w in wiki if w not in stop_words and 2 <= len(w) <= 15
        and w.islower()]  # FIXME: this still allows non-English characters to be stored
new_wikis.append(wiki)
keywords.append(keyword)

print '\n'
enpickle(new_wikis, 'data/txt/processed_wiki.pkl')

logging.info('create dictionary and corpus...')
dictionary = corpora.Dictionary(new_wikis)
dictionary.docid2title = keywords

logging.info('filter unimportant words...')
dictionary.filter_extremes(no_below=1, no_above=0.2, keep_n=None)
dictionary.compactify()

logging.info('generate corpus...')
dictionary.corpus = [dictionary.doc2bow(wiki) for wiki in new_wikis]
dictionary.id2token = revdict(dictionary.token2id)
dictionary.save('data/dictionary/wiki_' + allowed_pos.pattern + '.dict')
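# NOTE: `convert_compound` is project-specific and not shown here. If its job
# is to merge frequent multi-word expressions into single tokens, a
# hypothetical alternative is gensim's Phrases model, trained over the
# tokenized wikis:
from gensim.models import Phrases

bigram = Phrases(new_wikis, min_count=5, threshold=10.0)
new_wikis = [bigram[wiki] for wiki in new_wikis]  # e.g. 'machine learning' -> 'machine_learning'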
    return text, from_name, date


def read_eml(directory_path):
    return_dict = {}
    file_names = glob(directory_path + '/' + '*.eml')

    count = 0
    file_num = len(file_names)
    for FILE in file_names:
        count += 1
        print '\r', count, '/', file_num,

        dir_name, file_name = os.path.split(FILE)
        # strip the '.eml' extension; rstrip('.eml') would wrongly trim
        # trailing 'e'/'m'/'l'/'.' characters from the file name itself
        file_name = os.path.splitext(file_name)[0]

        eml_file = open(FILE, 'r')
        text, from_name, date = parse_eml_txt(eml_file)
        eml_file.close()

        return_dict[file_name] = {}
        return_dict[file_name]['text'] = text
        return_dict[file_name]['from'] = from_name
        return_dict[file_name]['date'] = date

    return return_dict


if __name__ == '__main__':
    print 'reading eml files and converting them into text data...'
    documents = read_eml('data/eml')

    print 'save them into .pkl file...'
    enpickle(documents, 'data/txt/documents.pkl')
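# NOTE: `parse_eml_txt` is defined elsewhere; a minimal sketch using the
# standard-library `email` module, assuming it returns the plain-text body,
# the From header, and the Date header:
import email

def parse_eml_txt(eml_file):
    msg = email.message_from_file(eml_file)
    text = ''
    for part in msg.walk():
        # concatenate all plain-text parts of the message body
        if part.get_content_type() == 'text/plain':
            text += part.get_payload(decode=True) or ''
    return text, msg.get('From', ''), msg.get('Date', '')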
    return from2docids


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    interval = WEEK  # only WEEK is implemented for now

    model = LdaModel.load('result/model_wiki.lda')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')
    from2docids = convert_docid2from_from2docids(dictionary.docid2from)
    time2docids = sort_by_time(dictionary.docid2date, interval)

    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it a probability

    # iterate over every interval
    from_similarity = {}
    for time in range(max(time2docids.keys())):
        print('\ncompute similarity for time = ' + str(time) + '...')
        from_vectors, from_frequencies = create_from_vectors(p_z_d, from2docids, time2docids, time)
        from_matrix, from_indices = convert_from_vectors(from_vectors)
        similarities = compute_similarity(from_matrix)
        id_frequencies = convert_from_id(from_frequencies, from_indices)
        from_similarity[time] = {'similarity': similarities,
                                 'id2from': from_indices,
                                 'frequency': id_frequencies,
                                 'topic': from_matrix}

    enpickle(from_similarity, 'result/from_similarity_wiki.pkl')
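# NOTE: `convert_docid2from_from2docids` is not shown; judging by its name it
# inverts the docid -> sender mapping. A minimal sketch under that assumption:
from collections import defaultdict

def convert_docid2from_from2docids(docid2from):
    """Invert {doc_id: from_name} into {from_name: [doc_ids]}."""
    from2docids = defaultdict(list)
    for doc_id, from_name in docid2from.items():
        from2docids[from_name].append(doc_id)
    return from2docids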