def build_features(article):
    """Build a per-ngram feature vector for every collocation of an article.

    Relies on module-level names: defaultdict, nx (networkx), RULES_DICT_START, RULES_DICT_END.
    """
    from axel.stats.scores import compress_pos_tag

    # Size of the DBpedia connected component each node belongs to,
    # plus the set of nodes that share a component with a DBLP-sourced ngram.
    component_size_dict = defaultdict(lambda: 0)
    dbpedia_graph = article.dbpedia_graph(redirects=True)
    dblp_component_set = set()
    for component in nx.connected_components(dbpedia_graph):
        nodes = [node for node in component if 'Category' not in node]
        stats_ngrams = article.CollocationModel.COLLECTION_MODEL.objects.filter(ngram__in=nodes)
        is_dblp_inside = bool([True for ngram in stats_ngrams if 'dblp' in ngram.source])
        if is_dblp_inside:
            dblp_component_set.update(nodes)
        comp_len = len(nodes)
        for node in component:
            component_size_dict[node] = comp_len

    features = []
    for colloc in article.CollocationModel.objects.filter(article=article):
        max_pos_tag = colloc.max_pos_tag
        collection_ngram = colloc.COLLECTION_MODEL.objects.get(ngram=colloc.ngram)
        pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
        pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))
        feature = [
            int(colloc.ngram in dblp_component_set),
            'dblp' in collection_ngram.source,
            component_size_dict[colloc.ngram],
            int('NN_STARTS' == pos_tag_start),
            int('JJ_STARTS' == pos_tag_start),
            int('NN_ENDS' == pos_tag_end),
            int('VB_ENDS' == pos_tag_end),
        ]
        features.append((colloc.ngram, feature))
    return features
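# A minimal usage sketch, not part of the original module: it assumes relevance
# labels are available per (article, ngram) pair through a hypothetical `label_of`
# callable, and shows how the vectors returned by build_features() could be fed
# to a scikit-learn classifier. Names `labelled_articles` and `label_of` are
# placeholders, not part of the original code.
def _example_train_from_features(labelled_articles, label_of):
    """Sketch: train a decision tree on build_features() output (assumed labels)."""
    from sklearn.tree import DecisionTreeClassifier
    X, y = [], []
    for art in labelled_articles:
        for ngram, feature in build_features(art):
            X.append(feature)
            y.append(label_of(art, ngram))  # hypothetical relevance lookup
    clf = DecisionTreeClassifier(max_depth=5, min_samples_split=50)
    clf.fit(X, y)
    return clf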
def get_context_data(self, **kwargs):
    """Add nodes and links to the context"""
    context = super(NgramPOSView, self).get_context_data(**kwargs)
    model = _get_model_from_string(self.kwargs["model_name"])
    ct = ContentType.objects.get_for_model(model, for_concrete_model=False)
    rules_dict = self._parse_rules()

    relevant_ids = set(
        TaggedCollection.objects.filter(content_type=ct, is_relevant=True)
        .values_list("object_id", flat=True)
    )
    irrelevant_ids = set(
        TaggedCollection.objects.filter(content_type=ct, is_relevant=False)
        .values_list("object_id", flat=True)
    )

    all_tags = set()
    correct_data = defaultdict(lambda: 0)
    incorrect_data = defaultdict(lambda: 0)
    unjudged_data = defaultdict(lambda: 0)
    for obj in model.objects.all():
        # Most frequent POS tagging of the ngram, compressed with the regex rules
        max_pos_tag = " ".join(max(obj.pos_tag, key=lambda x: x[1])[0])
        tag = str(scores.compress_pos_tag(max_pos_tag, rules_dict))
        all_tags.add(tag)
        if obj.id in relevant_ids:
            correct_data[tag] += 1
        elif obj.id in irrelevant_ids:
            incorrect_data[tag] += 1
        else:
            unjudged_data[tag] += 1

    all_tags = sorted(all_tags)
    context["categories"] = all_tags
    # Tags judged relevant (resp. irrelevant) roughly 10x more often than the
    # opposite judgement; the +0.1 avoids division by zero
    context["top_relevant_data"] = [
        (tag, correct_data[tag], incorrect_data[tag], unjudged_data[tag])
        for tag in all_tags
        if int(correct_data[tag] / (incorrect_data[tag] + 0.1)) > 10
    ]
    context["top_irrelevant_data"] = [
        (tag, correct_data[tag], incorrect_data[tag], unjudged_data[tag])
        for tag in all_tags
        if int(incorrect_data[tag] / (correct_data[tag] + 0.1)) > 10
    ]
    context["correct_data"] = [correct_data[tag] for tag in all_tags]
    context["incorrect_data"] = [incorrect_data[tag] for tag in all_tags]
    context["unjudged_data"] = [unjudged_data[tag] for tag in all_tags]
    # Add regex groups to populate forms
    context["regex_groups"] = self.regex_groups
    return context
def fit_ml_algo(self, scored_ngrams, cv_num):
    """
    :param scored_ngrams: list of tuples of type (ngram, score) after initial scoring
    """
    # 1. Calculate scores with float numbers for ngram bindings, as a dict
    collection = []
    collection_labels = []
    component_size_dict = {}
    dblp_component_dict = defaultdict(lambda: set())

    # Calculate max pos tag count and build pos_tag_list
    start_pos_tag_list = []
    end_pos_tag_list = []
    pos_tag_list = []

    # Populate component size dict
    for article in Article.objects.filter(cluster_id=self.cluster_id):
        temp_dict = defaultdict(lambda: 0)
        dbpedia_graph = article.dbpedia_graph(redirects=self.redirects)
        for component in nx.connected_components(dbpedia_graph):
            nodes = [node for node in component if 'Category' not in node]
            stats_ngrams = self.StatsModel.objects.filter(ngram__in=nodes)
            is_dblp_inside = bool([True for ngram in stats_ngrams if 'dblp' in ngram.source])
            # ScienceWISE
            #is_dblp_inside = bool([True for ngram in stats_ngrams if ngram.is_ontological])
            if is_dblp_inside:
                dblp_component_dict[article.id].update(nodes)
            comp_len = len(nodes)
            for node in component:
                temp_dict[node] = comp_len
        component_size_dict[article.id] = temp_dict

    # Enumerate the POS tag vocabularies (compressed start/end tags and full tags)
    if self.global_pos_tag:
        queryset = self.StatsModel.objects.all()
    else:
        queryset = self.Model.objects.all()
    for ngram in queryset:
        max_pos_tag = ngram.max_pos_tag
        pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
        pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))
        if pos_tag_start not in start_pos_tag_list:
            start_pos_tag_list.append(pos_tag_start)
        if pos_tag_end not in end_pos_tag_list:
            end_pos_tag_list.append(pos_tag_end)
        if max_pos_tag not in pos_tag_list:
            pos_tag_list.append(max_pos_tag)
    max_pos_tag_start_len = len(start_pos_tag_list)
    max_pos_tag_end_len = len(end_pos_tag_list)
    max_pos_tag_len = len(pos_tag_list)

    # 2. Iterate through all ngrams, add scores - POS tag (to number), DBLP, DBPEDIA, IS_REL
    for article, score_dict in scored_ngrams:
        ngram = score_dict['ngram']
        collection_ngram = score_dict['collection_ngram']
        # POS TAG enumeration
        if self.global_pos_tag:
            max_pos_tag = collection_ngram.max_pos_tag
            pos_tag_prev = collection_ngram.pos_tag_prev
            pos_tag_after = collection_ngram.pos_tag_after
        else:
            max_pos_tag = ngram.max_pos_tag
            pos_tag_prev = ngram.pos_tag_prev
            pos_tag_after = ngram.pos_tag_after
        pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
        pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))
        pos_tag_extra = set([' '.join(set(tags)) for tags in zip(*ngram.pos_tag)[0]])
        wiki_edges_count = len(article.wikilinks_graph.edges([ngram.ngram]))
        feature = [
            ngram.ngram in article.wiki_text_index,
            ngram.ngram in dblp_component_dict[article.id],
            ngram.ngram.isupper(),
            'dblp' in collection_ngram.source,
            component_size_dict[article.id][ngram.ngram],
            wiki_edges_count,
            #collection_ngram.is_ontological,
            #'dbpedia' in collection_ngram.source,
            'wiki_redirect' in collection_ngram.source,
            bool({'.', ',', ':', ';'}.intersection(zip(*pos_tag_prev)[0])),
            bool({'.', ',', ':', ';'}.intersection(zip(*pos_tag_after)[0])),
            len(ngram.ngram.split()),
            score_dict['participation_count']
        ]
        if not self.uncompressed:
            # extend with compressed part of speech (one-hot over start/end tag classes)
            extended_feature = [1 if i == start_pos_tag_list.index(pos_tag_start) else 0
                                for i in range(max_pos_tag_start_len)]
            feature.extend(extended_feature)
            extended_feature = [1 if i == end_pos_tag_list.index(pos_tag_end) else 0
                                for i in range(max_pos_tag_end_len)]
            feature.extend(extended_feature)
        else:
            # Normal part of speech (one-hot over full tag sequences)
            extended_feature = [1 if i == pos_tag_list.index(max_pos_tag) else 0
                                for i in range(max_pos_tag_len)]
            feature.extend(extended_feature)
        collection.append(feature)
        collection_labels.append(score_dict['is_rel'])

    feature_names = [
        'is_wiki_text',
        'dblp_inside',
        'is_upper',
        'dblp',
        'comp_size',
        'wikilinks',
        #'ScienceWISE',
        #'is_wiki',
        'is_redirect',
        'punkt_prev',
        'punkt_after',
        'word_len',
        'part_count'
    ]
    if not self.uncompressed:
        feature_names.extend(start_pos_tag_list)
        feature_names.extend(end_pos_tag_list)
    else:
        feature_names.extend(pos_tag_list)

    # Feature selection: keep the most important features according to an ExtraTrees model
    from sklearn.ensemble import ExtraTreesClassifier
    e_clf = ExtraTreesClassifier(random_state=0, compute_importances=True, n_estimators=100)
    new_collection = e_clf.fit(collection, collection_labels).transform(collection)
    print sorted(zip(list(e_clf.feature_importances_), feature_names),
                 key=lambda x: x[0], reverse=True)[:new_collection.shape[1]]
    print new_collection.shape

    # Grid search over decision tree parameters, keeping the best F1 score
    datas = []
    for depth, min_split in ((5, 50), (5, 100), (5, 200), (3, 50), (3, 100), (3, 200)):
        print 'Parameters: depth {0}, split {1}'.format(depth, min_split)
        clf = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)
        #for tag, values in pos_tag_counts.iteritems():
        #    print tag, values[1]/values[0]
        #
        #clf.fit(new_collection, collection_labels)
        #import StringIO, pydot
        #from sklearn import tree
        #dot_data = StringIO.StringIO()
        #tree.export_graphviz(clf, out_file=dot_data, feature_names=feature_names)
        #graph = pydot.graph_from_dot_data(dot_data.getvalue())
        #graph.write_pdf("decision.pdf")
        #
        #for i, vector in enumerate(collection):
        #    value = clf.predict(vector)[0]
        #    if value != collection_labels[i] and value:
        #        print scored_ngrams[i][1]['ngram'], vector, value, collection_labels[i]

        # K-fold cross-validation
        print 'Performing cross validation'
        scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                  cv=cv_num, score_func=precision_score)
        print("Precision: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
        #print "Precision full scores (for t-test:): ", '\n'.join([str(score) for score in scores])
        scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                  cv=cv_num, score_func=recall_score)
        print("Recall: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
        #print "Recall full scores (for t-test:): ", '\n'.join([str(score) for score in scores])
        scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                  cv=cv_num, score_func=f1_score)
        # TODO: update recall with full collection labels
        print("F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
        #print "F1 full scores (for t-test:): ", '\n'.join([str(score) for score in scores])
        data = {'f1': scores.mean(), 'min_split': min_split, 'depth': depth}
        scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                  cv=cv_num)
        print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
        datas.append(data)

    # Pick the parameter combination with the best F1 score
    max_data = {'f1': 0}
    for data in datas:
        if max_data['f1'] < data['f1']:
            max_data = data
    print 'Best result:'
    print max_data
    print

    # Retrain on the full data with the best parameters and persist the classifier
    clf = DecisionTreeClassifier(max_depth=max_data['depth'], min_samples_split=max_data['min_split'])
    clf.fit(new_collection, collection_labels)
    pickle.dump(clf, open('ngram_clf.pcl', 'wb'))  # binary mode for pickle
    return max_data
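# A minimal follow-up sketch, not part of the original method: the classifier
# persisted above can be reloaded for later prediction. Note that inputs must go
# through the same feature construction and ExtraTrees-based selection that
# produced `new_collection` above before calling predict().
def _example_load_ngram_clf(path='ngram_clf.pcl'):
    """Sketch: reload the pickled DecisionTreeClassifier saved by fit_ml_algo()."""
    import pickle
    with open(path, 'rb') as clf_file:
        return pickle.load(clf_file)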