def evaluate_segmented_ml(language_model, classifier, dataset_fnames,
                          output_fname, segment_filtering=None):
    """
    Produces an output file that contains the ranking of document pairs and
    predicted relevance labels. This is done by computing similarity between
    segments, approximating the relevance of a document pair using a
    pre-learned classifier, and producing a ranking based on the classifier
    certainty.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.

    Note that the ML approach expects all training samples to have the same
    number of active segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames,
                                       segment_filtering=segment_filtering)):
            # Build one feature per pair of active <OrgQuestion> and
            # <Thread> segments.
            results = []
            for orgquestion_segment in orgquestion.segments:
                if not orgquestion_segment.active:
                    continue
                for thread_segment in thread.segments:
                    if not thread_segment.active:
                        continue
                    results.append(
                        language_model.similarity(orgquestion_segment,
                                                  thread_segment))
            test_score = classifier.decision_function([results])[0]
            test_class = classifier.predict([results])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))
def train_segmented_ml(language_model, dataset_fnames, segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    This is done by computing similarity between segments and then learning
    to classify the segment similarities as relevant / non-relevant.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.

    Note that the ML approach expects all training samples to have the same
    number of active segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        results = []
        for orgquestion_segment in orgquestion.segments:
            if not orgquestion_segment.active:
                continue
            for thread_segment in thread.segments:
                if not thread_segment.active:
                    continue
                results.append(
                    language_model.similarity(orgquestion_segment,
                                              thread_segment))
        training_scores.append(results)
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
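# A minimal usage sketch of the segmented ML pipeline above: train the
# classifier on one dataset, then rank and label another. The file names and
# the language_model construction are hypothetical and only illustrate the
# calling convention:
#
#     classifier = train_segmented_ml(language_model, ["train_dataset.xml"])
#     evaluate_segmented_ml(language_model, classifier, ["dev_dataset.xml"],
#                           "segmented_ml_predictions.tsv")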
def produce_gold_results(dataset_fnames, output_fname):
    """
    Produces gold results from the input (dev) datasets and stores the
    results in an output file.
    """
    with open(output_fname, "wt") as output_file:
        orgquestion_ids = []
        orgquestion_threads = {}
        for orgquestion, (thread, relevant) in zip(
                segment_orgquestions(dataset_fnames),
                segment_threads(dataset_fnames)):
            if orgquestion.id not in orgquestion_threads:
                orgquestion_threads[orgquestion.id] = []
                orgquestion_ids.append(orgquestion.id)
            orgquestion_threads[orgquestion.id].append((relevant, thread.id))
        for orgquestion_id in orgquestion_ids:
            threads = orgquestion_threads[orgquestion_id]
            # Sort the threads by relevance to obtain the ranks, then
            # restore the original thread order with the ranks attached.
            sorted_threads = sorted(enumerate(threads),
                                    key=lambda thread: thread[1][0],
                                    reverse=True)
            for rank, (_, (relevant, thread_id)) \
                    in sorted(enumerate(sorted_threads),
                              key=lambda thread: thread[1][0]):
                gold_score = (len(sorted_threads) - rank) / len(sorted_threads)
                output_file.write(
                    "%s\t%s\t%d\t%s\t%s\n" %
                    (orgquestion_id, thread_id, rank + 1, gold_score,
                     "true" if relevant else "false"))
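# A toy illustration (hypothetical data) of the rank assignment idiom used
# in produce_gold_results: sort by relevance to obtain ranks, then restore
# the original order so that the output lines follow the input order:
#
#     threads = [(False, "T1"), (True, "T2"), (False, "T3")]
#     sorted_threads = sorted(enumerate(threads),
#                             key=lambda thread: thread[1][0], reverse=True)
#     # sorted_threads == [(1, (True, "T2")), (0, (False, "T1")),
#     #                    (2, (False, "T3"))]
#     for rank, (_, (relevant, thread_id)) \
#             in sorted(enumerate(sorted_threads),
#                       key=lambda thread: thread[1][0]):
#         print(rank + 1, thread_id, relevant)
#     # 2 T1 False
#     # 1 T2 True
#     # 3 T3 False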
def evaluate(language_model, dataset_fnames, output_fname):
    """Produces an output file that contains the ranking of document pairs."""
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames)):
            test_score = language_model.compare(orgquestion, thread)
            output_file.write("%s\t%s\t0\t%s\ttrue\n" %
                              (orgquestion.id, thread.id, repr(test_score)))
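# The evaluate* functions in this module emit one tab-separated line per
# document pair: the <OrgQuestion> id, the <Thread> id, a rank placeholder
# (0), the model score, and a binary relevance label; produce_gold_results
# writes the same five columns with true ranks and gold scores. A
# hypothetical prediction line (the ids are made up) might look like:
#
#     Q318 <TAB> Q318_R1 <TAB> 0 <TAB> 0.7310585786300049 <TAB> true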
def evaluate_segmented_aggregation(language_model, classifier, dataset_fnames,
                                   output_fname, aggregate_tier1_segments,
                                   aggregate_tier2_segments, thread_first=True,
                                   segment_filtering=None):
    """
    Produces an output file that contains the ranking of document pairs and
    predicted relevance labels. The segmented non-ML version computes
    similarity between segments and then performs a reduction step to derive
    document similarity.

    aggregate_tier1_segments and aggregate_tier2_segments are the reduction
    callbacks applied first over the tier-1 and then over the tier-2 segment
    similarities.

    If thread_first is True, the reduction is first performed over <Thread>
    segments and then over <OrgQuestion> segments rather than the other way
    around.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames,
                                       segment_filtering=segment_filtering)):
            results = []
            tier1 = thread if thread_first else orgquestion
            tier2 = orgquestion if thread_first else thread
            for tier2_segment in tier2.segments:
                subresults = []
                for tier1_segment in tier1.segments:
                    orgquestion_segment = \
                        tier2_segment if thread_first else tier1_segment
                    thread_segment = \
                        tier1_segment if thread_first else tier2_segment
                    subresults.append([
                        language_model.similarity(orgquestion_segment,
                                                  thread_segment),
                        tier2_segment, tier1_segment])
                subresults_aggregate = aggregate_tier1_segments(
                    subresults, language_model)
                LOGGER.debug("Aggregating subresults: %s -> %s", subresults,
                             subresults_aggregate)
                results.append(subresults_aggregate)
            results_aggregate = aggregate_tier2_segments(results,
                                                         language_model)
            LOGGER.debug("Aggregating results: %s -> %s", results,
                         results_aggregate)
            test_score = results_aggregate[0]
            test_class = classifier.predict([[test_score]])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))
def train_segmented_aggregation(language_model, dataset_fnames,
                                aggregate_tier1_segments,
                                aggregate_tier2_segments, thread_first=True,
                                segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    The segmented non-ML version computes similarity between segments and
    then performs a reduction step to derive document similarity.

    aggregate_tier1_segments and aggregate_tier2_segments are the reduction
    callbacks applied first over the tier-1 and then over the tier-2 segment
    similarities.

    If thread_first is True, the reduction is first performed over <Thread>
    segments and then over <OrgQuestion> segments rather than the other way
    around.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        results = []
        tier1 = thread if thread_first else orgquestion
        tier2 = orgquestion if thread_first else thread
        for tier2_segment in tier2.segments:
            subresults = []
            for tier1_segment in tier1.segments:
                orgquestion_segment = \
                    tier2_segment if thread_first else tier1_segment
                thread_segment = \
                    tier1_segment if thread_first else tier2_segment
                subresults.append([
                    language_model.similarity(orgquestion_segment,
                                              thread_segment),
                    tier2_segment, tier1_segment])
            subresults_aggregate = aggregate_tier1_segments(
                subresults, language_model)
            LOGGER.debug("Aggregating subresults: %s -> %s", subresults,
                         subresults_aggregate)
            results.append(subresults_aggregate)
        results_aggregate = aggregate_tier2_segments(results, language_model)
        LOGGER.debug("Aggregating results: %s -> %s", results,
                     results_aggregate)
        # Keep only the aggregated similarity score as the single feature,
        # matching the [[test_score]] feature vectors used in
        # evaluate_segmented_aggregation.
        training_scores.append([results_aggregate[0]])
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
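# A sketch of aggregation callbacks compatible with the two functions above.
# Each callback receives a list of [similarity, tier2_segment, tier1_segment]
# triples (the subresults built in the loops) together with the language
# model, and returns a single such triple, whose first element is later read
# as the aggregated score. These particular reductions are illustrative
# assumptions, not the project's actual callbacks.
def aggregate_segments_max(subresults, language_model):
    """Keeps the triple with the highest segment similarity."""
    return max(subresults, key=lambda subresult: subresult[0])


def aggregate_segments_mean(subresults, language_model):
    """Averages the similarities; the segments of the first triple are kept
    as representatives."""
    mean_score = sum(subresult[0] for subresult in subresults) / len(subresults)
    return [mean_score, subresults[0][1], subresults[0][2]]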
def evaluate_nonsegmented(language_model, classifier, dataset_fnames,
                          output_fname, segment_filtering=None):
    """
    Produces an output file that contains the ranking of document pairs and
    predicted relevance labels. The non-segmented version disregards
    segmentation and computes similarity directly between documents.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames,
                                       segment_filtering=segment_filtering)):
            test_score = language_model.similarity(orgquestion, thread)
            test_class = classifier.predict([[test_score]])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))
def train_nonsegmented(language_model, dataset_fnames, segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    The non-segmented version disregards segmentation and computes similarity
    directly between documents.

    If segment_filtering is not None, a text summarization technique is used
    for the filtering of <Thread> segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        training_scores.append(
            [language_model.similarity(orgquestion, thread)])
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
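# The non-segmented variant pairs up analogously to the segmented ML
# pipeline sketched earlier (the file names below are again hypothetical):
#
#     classifier = train_nonsegmented(language_model, ["train_dataset.xml"])
#     evaluate_nonsegmented(language_model, classifier, ["dev_dataset.xml"],
#                           "nonsegmented_predictions.tsv")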
def __init__(self, base_term_weighting="tfidf_ntc_ntc",
             extra_term_weighting=None):
    """
    Sets up a tf-idf language model using the unannotated SemEval 2016/2017
    Task 3 dataset.

    base_term_weighting is either "tfidf_xxx_xxx" (optionally
    "tfidf_xxx_s=<slope>_xxx" with a pivoted normalization slope), where the
    two "xxx" triples stand for the document and query parts of the tf-idf
    SMART notation described in (Salton, Gerard. 1971a) and determine the
    base term weights during the vectorization, or it is
    "bm25_k1=<k1>_k3=<k3>_b=<b>", which specifies that the probabilistic
    Okapi BM25 scoring with the given parameters will be used instead. You
    can toggle between the two via self.use_tfidf.

    extra_term_weighting specifies additional weighting factors, which stack
    multiplicatively on top of the base term weights during token list
    vectorization. If extra_term_weighting is "godwin", the base weight of a
    term is multiplied by a factor inversely proportional to the sum of the
    positions at which the term appears in the document. If
    extra_term_weighting is "murataetal00_A" or "murataetal00_B", the base
    weight of a term t is multiplied by a factor K_location(d, t) described
    in (Murata et al., 2000) with constants taken for system A or B from
    section 3.
    """
    file_handler = logging.FileHandler(LOG_FNAME, encoding='utf8')
    logging.getLogger().addHandler(file_handler)

    # Parse the configuration.
    if re.match(r"tfidf_", base_term_weighting):
        self.use_tfidf = True
    else:
        assert re.match(r"bm25", base_term_weighting)
        self.use_tfidf = False
        self.bm25_k1, self.bm25_k3, self.bm25_b = \
            re.match(r"bm25_k1=([0-9](?:\.[0-9]*)?)"
                     r"_k3=([0-9]*(?:\.[0-9]*)?)"
                     r"_b=([0-9](?:\.[0-9]*)?)",
                     base_term_weighting).groups()
        self.bm25_k1 = float(self.bm25_k1)
        self.bm25_k3 = float(self.bm25_k3)
        self.bm25_b = float(self.bm25_b)
    if self.use_tfidf:
        assert extra_term_weighting in (None, "godwin", "murataetal00_A",
                                        "murataetal00_B")
    else:
        assert extra_term_weighting is None
    self.extra_term_weighting = extra_term_weighting
    if self.use_tfidf:
        self.tfidf_result = {}
        self.tfidf_query = {}
        self.tfidf_result["tf"], self.tfidf_result["df"], \
            self.tfidf_result["norm"], self.tfidf_slope, \
            self.tfidf_query["tf"], self.tfidf_query["df"], \
            self.tfidf_query["norm"] = \
            re.match(r"tfidf_(.)(.)(.)(?:_s=([0-9](?:\.[0-9]*)?))?_(.)(.)(.)",
                     base_term_weighting).groups()
        self.tfidf_result["tf"] = TF_WEIGHTING_METHOD_MAP[
            self.tfidf_result["tf"]]
        self.tfidf_result["df"] = DF_WEIGHTING_METHOD_MAP[
            self.tfidf_result["df"]]
        self.tfidf_result["norm"] = NORMALIZATION_METHOD_MAP[
            self.tfidf_result["norm"]]
        if self.tfidf_result["norm"] in (norm_u, norm_b):
            assert self.tfidf_slope is not None
        if self.tfidf_slope is not None:
            self.tfidf_slope = float(self.tfidf_slope)
        self.tfidf_query["tf"] = TF_WEIGHTING_METHOD_MAP[
            self.tfidf_query["tf"]]
        self.tfidf_query["df"] = DF_WEIGHTING_METHOD_MAP[
            self.tfidf_query["df"]]
        self.tfidf_query["norm"] = NORMALIZATION_METHOD_MAP[
            self.tfidf_query["norm"]]
        assert self.tfidf_query["norm"] not in (norm_u, norm_b)
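    # For illustration, the configuration strings decompose under the above
    # regular expressions as follows (the example strings are hypothetical
    # configurations):
    #
    #     >>> re.match(r"tfidf_(.)(.)(.)(?:_s=([0-9](?:\.[0-9]*)?))?_(.)(.)(.)",
    #     ...          "tfidf_ltu_s=0.2_lnc").groups()
    #     ('l', 't', 'u', '0.2', 'l', 'n', 'c')
    #     >>> re.match(r"bm25_k1=([0-9](?:\.[0-9]*)?)_k3=([0-9]*(?:\.[0-9]*)?)"
    #     ...          r"_b=([0-9](?:\.[0-9]*)?)",
    #     ...          "bm25_k1=1.2_k3=1000_b=0.75").groups()
    #     ('1.2', '1000', '0.75')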
    # Prepare the BM25 scoring model.
    try:
        with open(BM25_STATS_FNAME, "rb") as file:
            self.bm25_avdl = load(file)
    except IOError:
        self.bm25_avdl = {}
        LOGGER.info("preparing the bm25 scoring function statistics")
        self.bm25_avdl["documents"] = mean(
            [sum(len(token) for token in document.tokens)
             for document, _ in segment_threads([UNANNOTATED_DATASET_FNAME])])
        LOGGER.info("average document length: %f", self.bm25_avdl["documents"])
        self.bm25_avdl["qsubjects"] = mean(
            [sum(len(token) for token in segment.tokens)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment == segment.document.qsubject])
        LOGGER.info("average qsubject segment length: %f",
                    self.bm25_avdl["qsubjects"])
        self.bm25_avdl["qbodies"] = mean(
            [sum(len(token) for token in segment.tokens)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment == segment.document.qbody])
        LOGGER.info("average qbody segment length: %f",
                    self.bm25_avdl["qbodies"])
        self.bm25_avdl["comments"] = mean(
            [sum(len(token) for token in segment.tokens)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment != segment.document.qsubject
             and segment != segment.document.qbody])
        LOGGER.info("average comment segment length: %f",
                    self.bm25_avdl["comments"])
        with open(BM25_STATS_FNAME, "wb") as file:
            dump(self.bm25_avdl, file)
        LOGGER.info("done preparing the bm25 scoring function statistics")

    # Prepare the pivoted document normalization tf-idf statistics.
    try:
        with open(PIVOT_STATS_FNAME, "rb") as file:
            self.pivot_stats = load(file)
    except IOError:
        self.pivot_stats = {}
        LOGGER.info("preparing the pivoted document normalization "
                    "tf-idf statistics")
        self.pivot_stats["documents"] = {}
        self.pivot_stats["documents"]["avgb"] = self.bm25_avdl["documents"]
        self.pivot_stats["documents"]["avgu"] = mean(
            [len(document.terms)
             for document, _ in segment_threads([UNANNOTATED_DATASET_FNAME])])
        LOGGER.info("average document length: %f",
                    self.pivot_stats["documents"]["avgb"])
        LOGGER.info("average document unique terms: %f",
                    self.pivot_stats["documents"]["avgu"])
        self.pivot_stats["qsubjects"] = {}
        self.pivot_stats["qsubjects"]["avgb"] = self.bm25_avdl["qsubjects"]
        self.pivot_stats["qsubjects"]["avgu"] = mean(
            [len(segment.terms)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment == segment.document.qsubject])
        LOGGER.info("average qsubject segment length: %f",
                    self.pivot_stats["qsubjects"]["avgb"])
        LOGGER.info("average qsubject segment unique terms: %f",
                    self.pivot_stats["qsubjects"]["avgu"])
        self.pivot_stats["qbodies"] = {}
        self.pivot_stats["qbodies"]["avgb"] = self.bm25_avdl["qbodies"]
        self.pivot_stats["qbodies"]["avgu"] = mean(
            [len(segment.terms)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment == segment.document.qbody])
        LOGGER.info("average qbody segment length: %f",
                    self.pivot_stats["qbodies"]["avgb"])
        LOGGER.info("average qbody segment unique terms: %f",
                    self.pivot_stats["qbodies"]["avgu"])
        self.pivot_stats["comments"] = {}
        self.pivot_stats["comments"]["avgb"] = self.bm25_avdl["comments"]
        self.pivot_stats["comments"]["avgu"] = mean(
            [len(segment.terms)
             for segment in chain.from_iterable(
                 document.segments for document, _
                 in segment_threads([UNANNOTATED_DATASET_FNAME]))
             if segment != segment.document.qsubject
             and segment != segment.document.qbody])
        LOGGER.info("average comment segment length: %f",
                    self.pivot_stats["comments"]["avgb"])
        LOGGER.info("average comment segment unique terms: %f",
                    self.pivot_stats["comments"]["avgu"])
        with open(PIVOT_STATS_FNAME, "wb") as file:
            dump(self.pivot_stats, file)
        LOGGER.info("done preparing the pivoted document normalization "
                    "tf-idf statistics")

    # Prepare the dictionary.
    try:
        self.dictionary = corpora.Dictionary.load(DICTIONARY_FNAME, mmap='r')
    except IOError:
        self.dictionary = corpora.Dictionary(
            segment for segment in chain.from_iterable(
                document.segments for document, _
                in segment_threads([UNANNOTATED_DATASET_FNAME])))
        self.dictionary.save(DICTIONARY_FNAME)

    logging.getLogger().removeHandler(file_handler)
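# For orientation, a textbook formulation of the Okapi BM25 term weight that
# average document lengths such as the avdl statistics above feed into. This
# is a reference sketch, not necessarily the exact scoring used elsewhere in
# this model.
from math import log

def bm25_term_weight(tf, df, num_docs, doc_len, avdl, k1=1.2, b=0.75):
    """Okapi BM25 weight of a term with in-document frequency tf and document
    frequency df, in a document of length doc_len, over a collection of
    num_docs documents with average document length avdl."""
    idf = log((num_docs - df + 0.5) / (df + 0.5))
    return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avdl))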