from collections import Counter, defaultdict


def create_cluster_obj(cluster_id, cluster_type, mentions, default_label, default_facet):
    nlp = get_item("spacy")
    ents_counter = Counter()
    pos_counter = Counter()
    labels_to_mentions = defaultdict(list)
    pos_to_mentions = defaultdict(list)
    unique_sents_ids = set()
    for mention in mentions:
        clean_mention = clean_text(mention.token)
        doc = nlp(mention.token)
        label = "NO_LABEL"
        pos = "NO_LABEL"
        for ent in doc.ents:
            # Only if the whole string is an entity
            if clean_text(ent.text) == clean_mention:
                label = ent.label_
        for token in doc:
            # Only if the mention is a single token do we record its POS tag
            if clean_text(token.text) == clean_mention:
                pos = token.pos_
        ents_counter[label] += 1
        pos_counter[pos] += 1
        labels_to_mentions[label].append(mention)
        pos_to_mentions[pos].append(mention)
        unique_sents_ids.add(f"{mention.doc_id} {mention.sent_idx}")
    most_representative_mention, ner_label = _choose_most_representative_mention(
        mentions, ents_counter, labels_to_mentions)
    cluster_label = LABELS_MAP.get(ner_label, default_label)
    cluster_facet = FACETS_MAP.get(ner_label, default_facet)
    pos_label = None
    if pos_counter:
        pos_label = pos_counter.most_common(1)[0][0]
    return Cluster(cluster_id, cluster_type, mentions, pos_label, cluster_label,
                   cluster_facet, most_representative_mention, len(mentions),
                   len(unique_sents_ids))

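# Illustrative usage sketch (not part of the original module). The Mention
# keyword arguments are assumptions inferred from the attributes the function
# reads above (token, doc_id, sent_idx); COREF_TYPE_ENTITIES comes from
# QFSE.consts elsewhere in the project.
#
#     mentions = [Mention(token="Barack Obama", doc_id="doc1", sent_idx=0),
#                 Mention(token="Obama", doc_id="doc2", sent_idx=3)]
#     cluster = create_cluster_obj("cluster_0", COREF_TYPE_ENTITIES, mentions,
#                                  default_label="OTHER", default_facet="OTHER")
#     # Both mentions tally under the PERSON entity label, so the cluster's
#     # label and facet are taken from LABELS_MAP["PERSON"] / FACETS_MAP["PERSON"].
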
def _initSummarySpacyObject(self):
    nlp = get_item("spacy")
    # Use only the top summary sentences per document, to cut processing time
    # significantly compared to processing the full corpus:
    perDocSummTexts = []
    for doc in self.corpus.documents:
        docSumm = ''
        for sentText in doc.topSentencesText:
            sentText = sentText.strip()
            if not sentText:
                continue  # guard against empty sentences (would crash the [-1] check)
            if sentText[-1] != '.':
                docSumm += sentText + '. '
            else:
                docSumm += sentText + ' '
        perDocSummTexts.append(docSumm)
    # Create a SpaCy object for the concatenated summaries of all the documents:
    return nlp(' '.join(perDocSummTexts))

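# Illustrative note (not in the original): the normalization above makes sure
# every top sentence ends with a period before concatenation, e.g.
#     ["The vote passed", "Turnout was high."] -> "The vote passed. Turnout was high. "
# so SpaCy's sentence boundaries stay intact in the concatenated summary doc.
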
def _initDoc(self):
    nlp = get_item("spacy")
    self.spacyDoc = nlp(self.text)
    self.tokens = [t.text for t in self.spacyDoc]
    self.topSentencesText = [
        sent.text for sent in self.spacyDoc._.textrank.summary(
            limit_phrases=20, limit_sentences=NUMBER_OF_TOP_SENTENCES_KEPT)
    ]
    # Sentence tokenization is done with SpaCy, for consistency across all variants.
    if self.representationStyle == REPRESENTATION_STYLE_SPACY:
        # Since it is time-consuming to compute a SpaCy object per sentence, we pass
        # the precomputed sentence vector into each Sentence instead:
        self.sentences = []
        for sentIdx, sentSpacyObj in enumerate(self.spacyDoc.sents):
            sentence = Sentence(self.id, sentIdx, sentSpacyObj.text,
                                self.representationStyle,
                                doNotInitRepresentation=True,
                                spacy_rep=sentSpacyObj)
            sentence.setRepresentation(sentSpacyObj.vector)
            self.sentences.append(sentence)
    else:
        # In all other cases, as it should be for correct code, the representations
        # are computed within the Sentence object itself:
        self.sentences = [
            Sentence(self.id, sentIdx, sentSpacyObj.text, self.representationStyle,
                     spacy_rep=sentSpacyObj)
            for sentIdx, sentSpacyObj in enumerate(self.spacyDoc.sents)
        ]

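# Illustrative note (not in the original): self.spacyDoc._.textrank assumes the
# pytextrank pipeline component was registered on the shared "spacy" model at
# load time, roughly (spaCy 3.x / pytextrank 3.x API; the model name here is
# an assumption):
#
#     import spacy
#     import pytextrank
#     nlp = spacy.load("en_core_web_lg")
#     nlp.add_pipe("textrank")  # registers the doc._.textrank extension
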
def __initRepresentation(self):
    nlp = get_item("spacy")
    # Drop stop words and punctuation before computing the representation:
    text = "".join([
        x.text_with_ws for x in nlp(self.text)
        if x.text not in STOP_WORDS and x.text.lower() not in PUNCTUATION
    ])
    if self.representationStyle == REPRESENTATION_STYLE_SPACY:
        self.representation = nlp(text).vector  # the vector of a SpaCy doc object
    elif self.representationStyle == REPRESENTATION_STYLE_BERT:
        # bert_embedder: module-level sentence encoder (e.g., sentence-transformers)
        self.representation = bert_embedder.encode([text])[0]  # a numpy vector
    elif self.representationStyle == REPRESENTATION_STYLE_W2V:
        # Average the word vectors of the content tokens:
        wordVectors = [
            nlp.vocab.get_vector(w) for w in self.tokens
            if w not in STOP_WORDS and w not in PUNCTUATION
            and nlp.vocab.has_vector(w)
        ]
        if len(wordVectors) > 0:
            self.representation = np.mean(wordVectors, axis=0)
        else:
            # No token has a word vector; fall back to a random 300-dim vector:
            self.representation = np.random.uniform(-1, 1, (300,))
    else:
        self.representation = None

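# Illustrative note (not in the original): with a vectors-enabled model such as
# en_core_web_lg (the model choice is an assumption), nlp(text).vector is the
# mean of the token vectors, so the SPACY and W2V branches differ mainly in
# which tokens get filtered and in the fallback when no token has a vector.
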
import os
from dataclasses import dataclass
from typing import Optional, Set, Dict, List

from QFSE.Corpus import Corpus
import data.Config as config
from QFSE.Utilities import REPRESENTATION_STYLE_SPACY, REPRESENTATION_STYLE_BERT, get_item, loadBert
from QFSE.coref.models import Mention
from QFSE.coref.utils import convert_corpus_to_coref_input_format, get_coref_clusters
from QFSE.models import DocSent, Cluster, ClusterUserWrapper
from QFSE.propositions.utils import get_proposition_clusters
from QFSE.consts import COREF_TYPE_EVENTS, COREF_TYPE_PROPOSITIONS, COREF_TYPE_ENTITIES

# The SpaCy and BERT objects must be loaded before anything else, so that classes
# using them get the initialized objects. They are initialized only when needed,
# since these init processes take a long time.
REPRESENTATION_STYLE = REPRESENTATION_STYLE_SPACY  # alternatives: REPRESENTATION_STYLE_W2V, REPRESENTATION_STYLE_BERT
get_item("spacy")
if REPRESENTATION_STYLE == REPRESENTATION_STYLE_BERT:
    loadBert()


class CorpusRegistry:
    def __init__(self):
        self._registry = {}

    def get_corpus(self, topicId) -> Optional[Corpus]:
        if topicId not in self._registry:
            # Make sure the topic ID is valid:
            if topicId in config.CORPORA_LOCATIONS:
                referenceSummsFolder = os.path.join(
                    config.CORPORA_LOCATIONS[topicId],
                    config.CORPUS_REFSUMMS_RELATIVE_PATH)

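# Illustrative note (not in the original): get_corpus is cut off above; the
# registry pattern it implements is build-once-then-cache, roughly:
#
#     registry = CorpusRegistry()
#     corpus = registry.get_corpus("topic_1")   # built and cached on first call
#     same = registry.get_corpus("topic_1")     # served from self._registry
#     missing = registry.get_corpus("no_such")  # presumably None for unknown topics
#
# The topic IDs are made up; the caching and None-for-unknown behavior are
# inferred from the Optional[Corpus] return type and the
# `topicId not in self._registry` / CORPORA_LOCATIONS checks.
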
def getQuerySummaryJson(self, clientJson):
    clientId = clientJson['clientId']
    topicId = clientJson['request_query']['topicId']
    clusters_query = self._get_clusters_query_from_request(
        clientJson['request_query'])
    query = clientJson['request_query']['query']
    if not m_infoManager.clientInitialized(clientId):
        return self.getErrorJson('Unknown client. Please reload the page.')
    if topicId != m_infoManager.getTopicId(clientId):
        return self.getErrorJson(
            'Topic ID not yet initialized by client: {}'.format(topicId))

    reply_query = {}
    corpus_registry: CorpusRegistry = get_item("corpus_registry")
    corpus: Corpus = corpus_registry.get_corpus(topicId)
    doc_sent_indices: Optional[Set[DocSent]] = None
    query_result_wrapper = None
    if clusters_query:
        query_registry: QueryRegistry = get_item("query_registry")
        query_result = query_registry.get_query(clusters_query)
        query_results_analyzer = m_infoManager.get_query_results_analyzer(
            clientId)
        if query_result is None:
            # The query was not cached; collect the sentences covered by the
            # requested clusters and summarize them.
            sentences = []
            doc_sent_indices = self._clusters_query_to_doc_sent_indices(
                clusters_query, corpus)
            if any(doc_sent_indices):
                doc_sent_indices_to_use = set.intersection(*doc_sent_indices)
                sentences = self._get_sentences_for_query(
                    doc_sent_indices_to_use, corpus)
            query_result = QueryResult(
                [], clusters_query,
                [QueryResultSentence(
                    self._split_sent_text_to_tokens(
                        sent, is_original_sentences=True),
                    sent.docId, sent.sentIndex) for sent in sentences],
                datetime.utcnow().isoformat())
            if any(sentences):
                if len(sentences) > 1:
                    summarizer = get_item("bart_summarizer")
                    summary_sents = summarizer.summarize(sentences)
                else:
                    # No need to summarize a single sentence
                    summary_sents = [sent.spacy_rep for sent in sentences]
                query_result.result_sentences = [
                    QueryResultSentence(
                        self._split_sent_text_to_tokens(
                            sent, is_original_sentences=False,
                            original_sentences=sentences))
                    for sent in summary_sents]
            query_registry.save_query(query_result)

        # Save queries and mark similar sentences to those used
        # query_results_analyzer.analyze_repeating(query_result)
        query_idx = query_results_analyzer.add_query_results(query_result)
        query_result_wrapper = QueryResultUserWrapper(query_result, query_idx)
        reply_query = {
            "queryResult": query_result_wrapper.custom_to_dict(),
            "textLength": 0,
        }
        doc_sent_indices = query_result.get_doc_sent_indices()

    m_infoManager.add_ui_action_log(
        clientId,
        UIAction(
            "query",
            {"query_idx": query_result_wrapper.query_idx
             if query_result_wrapper is not None else None},
            datetime.utcnow().isoformat()))

    # Always return the clusters, even if the query is None
    reply_query = {
        **reply_query,
        "corefClustersMetas": get_clusters_filtered(
            corpus.coref_clusters[COREF_TYPE_ENTITIES], doc_sent_indices),
        "eventsClustersMetas": get_clusters_filtered(
            corpus.coref_clusters[COREF_TYPE_EVENTS], doc_sent_indices),
        "propositionClustersMetas": get_clusters_filtered(
            corpus.coref_clusters[COREF_TYPE_PROPOSITIONS], doc_sent_indices),
    }
    return json.dumps({"reply_query": reply_query})

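# Illustrative request payload (not in the original; values are made up, the
# field names are the ones read above):
#
#     {
#         "clientId": "client-123",
#         "request_query": {
#             "topicId": "topic_1",
#             "query": "election results",
#             ...  # cluster-selection fields consumed by
#                  # _get_clusters_query_from_request
#         }
#     }
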
def getInitialSummaryJson(self, clientJson):
    clientId = clientJson['clientId']
    request = clientJson['request_get_initial_summary']
    topicId = request['topicId']
    questionnaireBatchIndex = request['questionnaireBatchIndex']
    timeAllowed = request['timeAllowed']
    assignmentId = request['assignmentId']
    hitId = request['hitId']
    workerId = request['workerId']
    turkSubmitTo = request['turkSubmitTo']

    corpus_registry: CorpusRegistry = get_item("corpus_registry")
    corpus = corpus_registry.get_corpus(topicId)
    if corpus is None:
        return self.getErrorJson('Topic ID not supported: {}'.format(topicId))

    m_infoManager.initClient(clientId, corpus, None, 0, None, topicId,
                             questionnaireBatchIndex, timeAllowed, assignmentId,
                             hitId, workerId, turkSubmitTo,
                             QueryResultsAnalyzer())
    topicName = topicId
    m_infoManager.add_ui_action_log(
        clientId,
        UIAction("initial", {"topic_id": topicId},
                 datetime.utcnow().isoformat()))
    reply = {
        "reply_get_initial_summary": {
            "summary": [],
            "keyPhraseList": [],
            "topicName": topicName,
            "topicId": topicId,
            "documentsMetas": {
                x.id: {"id": x.id, "num_sents": len(x.sentences)}
                for x in corpus.documents
            },
            "corefClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_ENTITIES]),
            "eventsClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_EVENTS]),
            "propositionClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_PROPOSITIONS]),
            "numDocuments": str(len(corpus.documents)),
            "questionnaire": [],
            "timeAllowed": str(timeAllowed),
            "textLength": ""
        }
    }
    return json.dumps(reply)

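# Illustrative request payload (not in the original; values are made up, and the
# assignment/hit/worker/turkSubmitTo fields follow Amazon Mechanical Turk's
# external-question parameters):
#
#     {
#         "clientId": "client-123",
#         "request_get_initial_summary": {
#             "topicId": "topic_1",
#             "questionnaireBatchIndex": 0,
#             "timeAllowed": 600,
#             "assignmentId": "ASSIGNMENT_ID_NOT_AVAILABLE",
#             "hitId": "3XYZ...",
#             "workerId": "A2ABC...",
#             "turkSubmitTo": "https://workersandbox.mturk.com"
#         }
#     }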