Example #1
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
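The snippets on this page lean on a few helpers that are never shown (enpickle, unpickle, get_keywords, imported from utils.util in Example #3). Judging from how they are called, the first two are thin pickle wrappers and the third returns the list of keywords to crawl; a minimal sketch under those assumptions, with the keyword list purely hypothetical:

import pickle


def enpickle(obj, path):
    # assumed helper: serialize an object to the given .pkl path
    with open(path, 'wb') as pkl_file:
        pickle.dump(obj, pkl_file)


def unpickle(path):
    # assumed helper: load a previously pickled object
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def get_keywords():
    # hypothetical stand-in: return the keyword strings to look up on Wikipedia
    return ['topic model', 'latent dirichlet allocation', 'tf-idf']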
Example #2
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
Example #3
"""
Compute the similarity between each pair of categories.
"""
import logging
from gensim.models import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
from utils.util import enpickle

__author__ = 'kensk8er'


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    logging.info('Loading the model...')
    model = LdaModel.load('result/model_wiki.lda')
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    similarity = cosine_similarity(topics)
    enpickle(similarity, 'result/topic_similarity/lda_wiki.pkl')
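return_topic is not part of gensim's stock LdaModel, so it is presumably a project-specific helper that returns one topic's term distribution. With a recent gensim release the same topic-to-topic similarity matrix can be built from get_topics(), which returns the full topic-term matrix; a sketch under that assumption:

from gensim.models import LdaModel
from sklearn.metrics.pairwise import cosine_similarity

model = LdaModel.load('result/model_wiki.lda')
topic_term = model.get_topics()  # (num_topics, num_terms), each row sums to 1
similarity = cosine_similarity(topic_term)  # (num_topics, num_topics)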
Example #4
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
    ))

    # calculate TF-IDF
    print 'calculate TF-IDF...'
    X = vectorizer.fit_transform(text)

    # calculate cosine similarities between each text
    print 'calculate cosine similarities...'
    similarities = calculate_similarities(X)

    print 'save similarities and indices...'
    date_time = datetime.datetime.today().strftime("%m%d%H%M%S")
    enpickle(similarities, 'cache/similarities_' + date_time + '.pkl')
    enpickle(resume_indices, 'cache/resume_indices_' + date_time + '.pkl')
    enpickle(job_indices, 'cache/job_indices_' + date_time + '.pkl')

    # pick up n-most similar job posts and show them
    print 'pick up', n_result, 'most similar job posts for each resume...'
    results = get_n_most_similar_job_posts(similarity_matrix=similarities,
                                           n=n_result,
                                           resume_index_list=range(n_resume))  # resumes come after job posts

    print 'show recommendation results for each resume:\n'
    show_recommendation_results(result_lists=results, resume_indices=resume_indices, job_indices=job_indices)

    # calculate each metric based on relevancy judgements
    print 'load relevancy judgements...'
    relevancy_judgements = unpickle('data/relevancy/relevancy.pkl')
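calculate_similarities is not shown in this listing; since its output is later indexed by resume and job-post rows, it presumably returns the pairwise cosine-similarity matrix of the document vectors. A minimal sketch under that assumption:

from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarities(X):
    # assumed helper: pairwise cosine similarity between all rows of the
    # (sparse or dense) document-term matrix X
    return cosine_similarity(X)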
Example #5
    # Perform an IDF normalization on the output of HashingVectorizer
    hasher = HashingVectorizer(stop_words='english', non_negative=True,
                               norm=None, binary=False)
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
    ))

    # reduce the number of documents for now
    #doc_lists = doc_lists[:400]
    #doc_indices = doc_indices[:400]

    # calculate TF-IDF
    print 'calculate TF-IDF...'
    X = vectorizer.fit_transform(doc_lists)

    # perform LSA
    print 'perform LSA...'
    lsa = TruncatedSVD(n_components=300, algorithm='arpack')
    X = np.matrix(lsa.fit_transform(X))

    # calculate cosine similarities between each text
    print 'calculate cosine similarities...'
    similarities = calculate_similarities(X)

    print 'save similarities and indices...'
    #date_time = datetime.datetime.today().strftime("%m%d%H%M%S")
    enpickle(similarities, 'result/similarities.pkl')
    enpickle(doc_indices, 'result/indices.pkl')
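Because the LSA-reduced vectors are only used for cosine similarities, an equivalent variant (a common scikit-learn pattern, not necessarily what this project does) is to L2-normalize the TruncatedSVD output, after which cosine similarity is just a dot product:

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

lsa = make_pipeline(TruncatedSVD(n_components=300, algorithm='arpack'),
                    Normalizer(copy=False))
X_lsa = lsa.fit_transform(X)       # rows now have unit L2 norm
similarities = X_lsa.dot(X_lsa.T)  # equal to cosine similarity of the rows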

Example #6
    file_names = glob('data/relevancy/*.csv')
    for file_name in file_names:
        FILE = open(file_name, 'rb')
        reader = csv.reader(FILE)
        row_num = 0
        resume_name = ''

        for row in reader:
            if row_num == 0:
                resume_name = row[0]

            if row_num == 1:
                pass

            if row_num >= 2:
                relevancy = row[0]
                job_url = row[1]

                # search job_name by job_url
                job_name = ''
                for key1, value1 in job_data.items():
                    if value1['job_url'] == job_url:
                        job_name = key1

                relevancy_dict[(resume_name, job_name)] = relevancy

            row_num += 1

    enpickle(relevancy_dict, 'data/relevancy/relevancy.pkl')
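The inner loop above rescans every job_data entry for each CSV row just to map a job URL back to its name. If job_data maps job names to dicts with a 'job_url' field, as the snippet suggests, a reverse index built once avoids the repeated scan; a sketch under that assumption:

# build {job_url: job_name} once, before reading the CSV files
url2name = {}
for job_name, job_info in job_data.items():
    url2name[job_info['job_url']] = job_name

# then, inside the row loop:
# job_name = url2name.get(job_url, '')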
Example #7
def compute_topics_by_time(time2doc_ids, model, dictionary):
    N = len(time2doc_ids)
    logging.info('Performing inference on corpora...')
    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it probability
    Z = p_z_d.shape[0]
    time2topics = [[0 for i in range(Z)] for j in range(N)]

    for time, doc_ids in time2doc_ids.items():  # FIXME: improve this for loop (not element-wise)
        for z in range(Z):
            for doc_id in doc_ids:
                if p_z_d[z, doc_id] > 0:
                    time2topics[time][z] += p_z_d[z, doc_id]
    return time2topics


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    interval = WEEK
    logging.info('Loading model...')
    model = LdaModel.load(fname='result/model.lda')
    logging.info('Loading dictionary...')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')
    logging.info('Sort documents by time...')
    time2docids = sort_by_time(dictionary, interval)
    logging.info('Compute topic distribution for each time...')
    time2topics = compute_topics_by_time(time2docids, model, dictionary)
    enpickle(time2topics, 'result/week2topics.pkl')
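The FIXME in compute_topics_by_time can be addressed by summing columns of p_z_d directly instead of looping over topics and documents; the `> 0` guard drops out because the normalized inference output is non-negative. A sketch of that vectorized variant, assuming time slots are 0-based indices as in the original:

import numpy as np


def compute_topics_by_time_vectorized(time2doc_ids, p_z_d):
    # p_z_d: (num_topics, num_docs) array of P(z|d), columns already normalized
    time2topics = [np.zeros(p_z_d.shape[0]) for _ in range(len(time2doc_ids))]
    for time, doc_ids in time2doc_ids.items():
        # sum P(z|d) over all documents falling into this time slot
        time2topics[time] = p_z_d[:, list(doc_ids)].sum(axis=1)
    return time2topics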
Example #8
    print "computing P(w)..."
    p_w = gen_p_w(p_w_z, p_z)

    print "computing P(z,w)..."
    p_wz = gen_p_wz(p_w_z, p_z)

    print "computing P(z|w)..."
    p_z_w = gen_p_z_w(p_wz, p_w)

    # print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...'
    # p_w_z_w = gen_p_w_z_w(p_w_z, p_w)

    print "computing P(d)..."
    p_d = gen_p_d(p_d_z, p_z)

    print "computing P(z,d)..."
    p_dz = gen_p_dz(p_d_z, p_z)

    print "computing P(z|d)..."
    p_z_d = gen_p_z_d(p_dz, p_d)

    print "saving results into .pkl file..."
    enpickle(p_w, "result/plsa/p_w.pkl")
    enpickle(p_wz, "result/plsa/p_wz.pkl")
    enpickle(p_z_w, "result/plsa/p_z_w.pkl")
    # enpickle(p_w_z_w, 'result/plsa/p_w_z_w.pkl')
    enpickle(p_d, "result/plsa/p_d.pkl")
    enpickle(p_dz, "result/plsa/p_dz.pkl")
    enpickle(p_z_d, "result/plsa/p_z_d.pkl")
Example #9
    print 'computing P(w)...'
    p_w = gen_p_w(p_w_z, p_z)

    print 'computing P(z,w)...'
    p_wz = gen_p_wz(p_w_z, p_z)

    print 'computing P(z|w)...'
    p_z_w = gen_p_z_w(p_wz, p_w)

    #print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...'
    #p_w_z_w = gen_p_w_z_w(p_w_z, p_w)

    print 'computing P(d)...'
    p_d = gen_p_d(p_d_z, p_z)

    print 'computing P(z,d)...'
    p_dz = gen_p_dz(p_d_z, p_z)

    print 'computing P(z|d)...'
    p_z_d = gen_p_z_d(p_dz, p_d)

    print 'saving results into .pkl file...'
    enpickle(p_w, 'result/plsa/p_w.pkl')
    enpickle(p_wz, 'result/plsa/p_wz.pkl')
    enpickle(p_z_w, 'result/plsa/p_z_w.pkl')
    #enpickle(p_w_z_w, 'result/plsa/p_w_z_w.pkl')
    enpickle(p_d, 'result/plsa/p_d.pkl')
    enpickle(p_dz, 'result/plsa/p_dz.pkl')
    enpickle(p_z_d, 'result/plsa/p_z_d.pkl')
Example #10
    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it probability
    Z = p_z_d.shape[0]
    time2topics = [[0 for i in range(Z)] for j in range(N)]

    for time, doc_ids in time2doc_ids.items():  # FIXME: improve this for loop (not element-wise)
        for z in range(Z):
            for doc_id in doc_ids:
                if p_z_d[z, doc_id] > 0:
                    time2topics[time][z] += p_z_d[z, doc_id]
    return time2topics


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    interval = WEEK
    logging.info('Loading model...')
    model = LdaModel.load(fname='result/model.lda')
    logging.info('Loading dictionary...')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')
    logging.info('Sort documents by time...')
    time2docids = sort_by_time(dictionary, interval)
    logging.info('Compute topic distribution for each time...')
    time2topics = compute_topics_by_time(time2docids, model, dictionary)
    enpickle(time2topics, 'result/week2topics.pkl')
Example #11
        wiki = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            wiki.append(word)

        # convert compound word into one token
        wiki = convert_compound(wiki)

        # filter stop words, long words, and non-english words
        wiki = [w for w in wiki if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]  # FIXME: it allows non-english characters to be stored

        new_wikis.append(wiki)
        keywords.append(keyword)

    print '\n'
    enpickle(new_wikis, 'data/txt/processed_wiki.pkl')

    logging.info('create dictionary and corpus...')
    dictionary = corpora.Dictionary(new_wikis)
    dictionary.docid2title = keywords

    logging.info('filter unimportant words...')
    dictionary.filter_extremes(no_below=1, no_above=0.2, keep_n=None)
    dictionary.compactify()

    logging.info('generate corpus...')
    dictionary.corpus = [dictionary.doc2bow(wiki) for wiki in new_wikis]
    dictionary.id2token = revdict(dictionary.token2id)

    dictionary.save('data/dictionary/wiki_' + allowed_pos.pattern + '.dict')
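This snippet starts mid-loop; lemmatize matches gensim.utils.lemmatize (which needs the pattern package installed), and allowed_pos must be a compiled regex because allowed_pos.pattern is used in the dictionary filename. A sketch of how the surrounding setup presumably looks, with the NLTK stop-word source being an assumption:

import re

from gensim.utils import lemmatize  # requires the `pattern` package
from nltk.corpus import stopwords

allowed_pos = re.compile('(NN)')              # keep nouns, matching the saved filename
stop_words = set(stopwords.words('english'))  # assumed stop-word list

tokens = lemmatize(content='Topic models are statistical models.',
                   allowed_tags=allowed_pos)
# each token looks like 'model/NN'; the loop above splits it on '/'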
Example #12
    return text, from_name, date


def read_eml(directory_path):
    return_dict = {}
    file_names = glob(directory_path + '/' + '*.eml')
    count = 0
    file_num = len(file_names)

    for FILE in file_names:
        count += 1
        print '\r', count, '/', file_num,
        dir_name, file_name = os.path.split(FILE)
        file_name = os.path.splitext(file_name)[0]  # drop the .eml extension (rstrip('.eml') would strip any trailing '.', 'e', 'm', 'l' characters)
        eml_file = open(FILE, 'r')
        text, from_name, date = parse_eml_txt(eml_file)
        return_dict[file_name] = {}
        return_dict[file_name]['text'] = text
        return_dict[file_name]['from'] = from_name
        return_dict[file_name]['date'] = date

    return return_dict


if __name__ == '__main__':
    print 'reading eml files and converting them into text data...'
    documents = read_eml('data/eml')

    print 'save them into .pkl file...'
    enpickle(documents, 'data/txt/documents.pkl')
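parse_eml_txt is not shown; a minimal sketch of what it presumably does, built on the standard-library email package (plain-text body, From header, raw Date header), with no claim that the project parses dates this way:

import email


def parse_eml_txt(eml_file):
    # assumed helper: return (plain-text body, sender, date string) of an .eml file
    message = email.message_from_file(eml_file)
    text = ''
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            payload = part.get_payload(decode=True)
            if payload:
                text += payload.decode('utf-8', 'ignore')
    return text, message.get('From', ''), message.get('Date', '')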
Example #13
            word, pos = token.split('/')
            wiki.append(word)

        # convert compound word into one token
        wiki = convert_compound(wiki)

        # filter stop words, long words, and non-english words
        wiki = [
            w for w in wiki
            if w not in stop_words and 2 <= len(w) <= 15 and w.islower()
        ]  # FIXME: it allows non-english characters to be stored

        new_wikis.append(wiki)
        keywords.append(keyword)

    print '\n'
    enpickle(new_wikis, 'data/txt/processed_wiki.pkl')

    logging.info('create dictionary and corpus...')
    dictionary = corpora.Dictionary(new_wikis)
    dictionary.docid2title = keywords

    logging.info('filter unimportant words...')
    dictionary.filter_extremes(no_below=1, no_above=0.2, keep_n=None)
    dictionary.compactify()

    logging.info('generate corpus...')
    dictionary.corpus = [dictionary.doc2bow(wiki) for wiki in new_wikis]
    dictionary.id2token = revdict(dictionary.token2id)

    dictionary.save('data/dictionary/wiki_' + allowed_pos.pattern + '.dict')
Example #14
    return from2docids


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    interval = WEEK  # only WEEK is implemented for now

    model = LdaModel.load('result/model_wiki.lda')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')

    from2docids = convert_docid2from_from2docids(dictionary.docid2from)
    time2docids = sort_by_time(dictionary.docid2date, interval)

    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it probability

    # iterate over every interval
    from_similarity = {}
    for time in range(max(time2docids.keys())):
        print('\ncompute similarity for time = ' + str(time) + '...')
        from_vectors, from_frequencies = create_from_vectors(p_z_d, from2docids, time2docids, time)
        from_matrix, from_indices = convert_from_vectors(from_vectors)
        similarities = compute_similarity(from_matrix)
        id_frequencies = convert_from_id(from_frequencies, from_indices)
        from_similarity[time] = {'similarity': similarities, 'id2from': from_indices, 'frequency': id_frequencies,
                                 'topic': from_matrix}

    enpickle(from_similarity, 'result/from_similarity_wiki.pkl')
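Only the final return of convert_docid2from_from2docids appears in this listing; judging by the name, it inverts the dictionary's docid2from mapping into sender -> document ids. A sketch under that assumption:

from collections import defaultdict


def convert_docid2from_from2docids(docid2from):
    # assumed helper: invert {doc_id: from_name} into {from_name: [doc_id, ...]}
    from2docids = defaultdict(list)
    for doc_id, from_name in docid2from.items():
        from2docids[from_name].append(doc_id)
    return from2docids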
Example #15
                               non_negative=True,
                               norm=None,
                               binary=False)
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
    ))

    # reduce the number of documents for now
    #doc_lists = doc_lists[:400]
    #doc_indices = doc_indices[:400]

    # calculate TF-IDF
    print 'calculate TF-IDF...'
    X = vectorizer.fit_transform(doc_lists)

    # perform LSA
    print 'perform LSA...'
    lsa = TruncatedSVD(n_components=300, algorithm='arpack')
    X = np.matrix(lsa.fit_transform(X))

    # calculate cosine similarities between each text
    print 'calculate cosine similarities...'
    similarities = calculate_similarities(X)

    print 'save similarities and indices...'
    #date_time = datetime.datetime.today().strftime("%m%d%H%M%S")
    enpickle(similarities, 'result/similarities.pkl')
    enpickle(doc_indices, 'result/indices.pkl')