__author__ = 'biagio'

import argparse
import pprint as pp
import sys
import traceback

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from mongo_hc import MongoHC
import classifier as clf
from text_utils import TextUtils

# getAnnotation, get_abstract, get_entities_from_dandelion and the logfun
# logger are assumed to come from other modules of this project; their
# import lines are not shown in this file.

mongo = MongoHC('hc', 'test_new')


def test_text_vectorization():
    mongo_dataset = MongoHC('hc', 're0')
    data = [d for d in mongo_dataset.get_all(order_by='id_doc')]
    text = [d['text'] for d in data[1:2]]

    tfidf_vectorizer = TfidfVectorizer(max_df=1,
                                       max_features=200000,
                                       min_df=1,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2')

    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data

    # Rank features by inverse document frequency, rarest first. Note that
    # with a single input document every idf value is identical, so this
    # ordering is effectively arbitrary.
    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]
    print len(features)
    print tfidf_matrix.shape
    print top_features

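# A small companion sketch: ranking by idf_ (as above) orders terms by corpus
# rarity, which ties when there is only one document. Ranking by the tf-idf
# weights of a single matrix row gives per-document top terms instead. The
# function and parameter names here are illustrative, not part of the module.
def top_terms_for_row(tfidf_matrix, features, row=0, top_n=5):
    # Densify one row of the sparse matrix and sort its weights, highest first
    weights = tfidf_matrix[row].toarray().ravel()
    ranked = np.argsort(weights)[::-1]
    return [(features[i], weights[i])
            for i in ranked[:top_n] if weights[i] > 0]
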
def test(db, dataset):
    mongo = MongoHC(db, dataset)
    docs = [mongo.get_element_by_id(1114)]
    for doc in docs[:1]:
        logfun.info('#' * 80)
        logfun.info('Scanning document: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        entitySet, annotationsSorted, response = getAnnotation(doc['text'])
        # Abstract extraction, currently disabled:
        '''doc['abstracts'] = []
        for e in entitySet:
            logfun.info('Extracting abstract for entity %s' % e)
            abstract = get_abstract(e)
            if abstract:
                doc['abstracts'].append(abstract)
            else:
                logfun.warning('Abstract not found!')
            logfun.info('-' * 80)
        doc['entity_set'] = list(entitySet)'''
        pp.pprint(response)

def extract_entity(db, dataset):
    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)
    docs = [doc for doc in mongo_from.get_all(order_by='id_doc')]
    for doc in docs[:]:
        logfun.info('#' * 80)
        logfun.info('Scanning document: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for e in entitySet:
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
            doc['entity_set'] = list(entitySet)
            # Move the processed document to the destination collection
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as e:
            logfun.error('Something awful happened!')
            logfun.error(e)
            logfun.error(sys.exc_info()[2])

def extract_abstract_dandelion(db, dataset):
    mongo = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')
    docs = [doc for doc in mongo.get_all(order_by='id_doc')]
    for doc in docs:
        try:
            entities = [e['lod']['dbpedia']
                        for e in doc['dandelion']['annotations']]
            for e in entities:
                if mongo_dbpedia.get_element_by_mongo_id(e):
                    logfun.info('Entity already in database')
                    continue
                dbpedia = {}
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    dbpedia['_id'] = e
                    dbpedia['abstract'] = abstract
                    mongo_dbpedia.save_document(dbpedia)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as e:
            logfun.error('Something awful happened!')
            logfun.error(e)
            logfun.error(sys.exc_info()[2])

def extract_alchemy(db, dataset):
    mongo = MongoHC(db, dataset)
    docs = [doc for doc in mongo.get_doc_with_no_key('alchemy_response')]
    for doc in docs:
        try:
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['alchemy_response'] = response
            mongo.save_document(doc)
        except Exception as e:
            logfun.error('Something awful happened!')
            logfun.error(e)
            logfun.error(sys.exc_info()[2])

def extract_dandelion(db, dataset):
    mongo = MongoHC(db, dataset)
    docs = [doc for doc in mongo.get_doc_with_no_key('dandelion',
                                                     order_by='id_doc')]
    for doc in docs:
        try:
            dan = get_entities_from_dandelion(doc['text'])
            logfun.info(dan['timestamp'])
            doc['dandelion'] = dan
            mongo.save_document(doc)
        except Exception as e:
            logfun.error(traceback.format_exc())

def test_fabio(db, dataset, gamma=0.5, ranking_metric='pr', lsa=False,
               save=False):
    mongo_result = MongoHC(db, 'test_fabio')
    result = clf.cluster_fabio(db, dataset, gamma=gamma,
                               ranking_metric=ranking_metric, with_lsa=lsa)
    if save:
        mongo_result.save_document(result)
    pp.pprint(result)

def entities_distribution(db, dataset):
    mongo = MongoHC(db, dataset)
    data = [doc for doc in mongo.get_all(order_by='id_doc')]
    entities = set()
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities.add(e['text'])
    # Count how often each entity surface form occurs across the corpus
    entities_dict = {e: 0 for e in entities}
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities_dict[e['text']] += 1
    return entities_dict, entities

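# A minimal usage sketch for entities_distribution: print the most frequent
# entities. The 'hc'/'re0' defaults are assumptions borrowed from calls
# elsewhere in this module; the function name itself is illustrative.
def print_top_entities(db='hc', dataset='re0', top_n=10):
    entities_dict, entities = entities_distribution(db, dataset)
    # Sort entity surface forms by descending occurrence count
    ranked = sorted(entities_dict.items(), key=lambda item: item[1],
                    reverse=True)
    for entity, count in ranked[:top_n]:
        print '%s: %d' % (entity, count)
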
def extract_abstract(db, dataset):
    mongo = MongoHC(db, dataset)
    docs = [doc for doc in mongo.get_empty_abstract()]
    for doc in docs:
        try:
            for e in doc['entity_set']:
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
            mongo.save_document(doc)
        except Exception as e:
            logfun.error('Something awful happened!')
            logfun.error(e)
            logfun.error(sys.exc_info()[2])

def first_test_re0():
    for i, g in enumerate(np.arange(0.1, 1, 0.01)):
        result = clf.cluster_alchemy('re1', gamma=g, filter=True)
        pp.pprint(result)
        result['n_attempt'] = i + 1
        result['test'] = 'fourth'
        mongo.save_document(result)


def test_without_entity():
    result = clf.cluster_alchemy('re1', gamma=1)
    pp.pprint(result)
    result['test'] = 'baseline'
    mongo.save_document(result)


def test_bow():
    result = clf.scipy_algo('re0')

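# A minimal command-line entry point, sketched on the assumption that this
# module is meant to be run as a script (argparse is imported above but was
# otherwise unused). The task names and defaults are illustrative; they map
# onto the extraction functions defined in this module.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Entity extraction and clustering test utilities')
    parser.add_argument('task',
                        choices=['alchemy', 'dandelion', 'abstract', 'entity'],
                        help='which extraction step to run')
    parser.add_argument('--db', default='hc', help='MongoDB database name')
    parser.add_argument('--dataset', default='re0',
                        help='dataset collection name')
    args = parser.parse_args()

    tasks = {
        'alchemy': extract_alchemy,
        'dandelion': extract_dandelion,
        'abstract': extract_abstract,
        'entity': extract_entity,
    }
    tasks[args.task](args.db, args.dataset)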