def store_pas_duc_dataset():
    """
    Load all the DUC documents and summaries, process them and store them.
    """
    docs_pas_lists = []
    refs_pas_lists = []
    docs, references, _ = get_duc()

    # For each document the pas_list is extracted after cleaning the text and tokenizing it.
    for i, doc in enumerate(docs):
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        # Splitting sentences (by dot).
        sentences = tokens(doc)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        pas_list = extract_pas(sentences, "duc")
        docs_pas_lists.append(pas_list)

    # The list of pas lists is then stored.
    with open(os.getcwd() + "/dataset/duc/duc_docs_pas.dat", "wb") as dest_f:
        pickle.dump(docs_pas_lists, dest_f)

    # Same for reference summaries...
    for i, ref in enumerate(references):
        print("Processing ref " + str(i) + "/" + str(len(references)))
        # Splitting sentences (by dot).
        sentences = tokens(ref)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        pas_list = extract_pas(sentences, "duc", keep_all=True)
        refs_pas_lists.append(pas_list)

    with open(os.getcwd() + "/dataset/duc/duc_refs_pas.dat", "wb") as dest_f:
        pickle.dump(refs_pas_lists, dest_f)
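# A minimal read-back sketch for the pickles written above, assuming the same
# working directory; load_duc_pas_lists is a hypothetical helper, not part of
# the original module, and the file paths are the ones used by
# store_pas_duc_dataset().
import os
import pickle

def load_duc_pas_lists():
    # Each file holds a list with one PAS list per document/reference.
    with open(os.getcwd() + "/dataset/duc/duc_docs_pas.dat", "rb") as src_f:
        docs_pas_lists = pickle.load(src_f)
    with open(os.getcwd() + "/dataset/duc/duc_refs_pas.dat", "rb") as src_f:
        refs_pas_lists = pickle.load(src_f)
    return docs_pas_lists, refs_pas_lists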
def __init__(self, input_text, summary_length, anaphora_resolution, model_name, quiet):
    super().__init__(input_text, summary_length, anaphora_resolution, model_name, quiet)

    if not self.quiet:
        print("Processing text...")
    sentences = tokens(self.input_text)
    sentences = [text_cleanup(sentence) for sentence in sentences]

    if self.anaphora_resolution:
        if not self.quiet:
            print("Resolving anaphora...")
        sentences = resolve_anaphora(sentences)

    if not self.quiet:
        print("Extracting Predicate Argument Structures...")
    pas_list = extract_pas(sentences)

    if self.anaphora_resolution:
        if not self.quiet:
            print("Resolving anaphora...")
        resolve_anaphora_pas_list(pas_list)

    self.pas_list = pas_list
def store_pas_nyt_dataset(nyt_path, min_pas, max_pas):
    """
    Load NYT documents and summaries, process them and store them.
    Only the documents numbered between min_pas and max_pas are processed.

    :param nyt_path: path to the raw NYT dataset.
    :param min_pas: first document number.
    :param max_pas: last document number.
    """
    docs_pas_lists = []
    refs_pas_lists = []
    docs, references = get_nyt(nyt_path, min_pas, max_pas)

    for i in range(len(docs)):
        start_time = time()
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        doc = docs[i]
        ref = references[i]

        # Splitting sentences (by dot).
        sentences = tokens(doc)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        doc_pas_list = extract_pas(sentences, "nyt")

        # Splitting sentences (by dot).
        sentences = tokens(ref)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        ref_pas_list = extract_pas(sentences, "nyt", keep_all=True)

        # Keep only documents with more than five PAS and at least as many PAS as their reference.
        if len(doc_pas_list) > 5 and len(doc_pas_list) >= len(ref_pas_list):
            refs_pas_lists.append(ref_pas_list)
            docs_pas_lists.append(doc_pas_list)

        timer(str(i) + " processed in:", start_time)

    # PAS lists are stored.
    with open(os.getcwd() + "/dataset/nyt/nyt_refs" + str(min_pas) + "-" + str(max_pas) + "_pas.dat",
              "wb") as dest_f:
        pickle.dump(refs_pas_lists, dest_f)
    with open(os.getcwd() + "/dataset/nyt/nyt_docs" + str(min_pas) + "-" + str(max_pas) + "_pas.dat",
              "wb") as dest_f:
        pickle.dump(docs_pas_lists, dest_f)
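# A hedged usage sketch for the NYT variant: the corpus path and the document
# range below are illustrative placeholders, and the function is assumed to be
# importable from this module.
if __name__ == "__main__":
    # Process documents 0-1000 of a local copy of the raw NYT corpus.
    store_pas_nyt_dataset("/path/to/nyt_corpus", 0, 1000)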
def __init__(self, input_text, summary_length, anaphora_resolution, model_name, quiet):
    super().__init__(input_text, summary_length, anaphora_resolution, model_name, quiet)

    if not self.quiet:
        print("Processing text...")
    sentences = tokens(self.input_text)
    sentences = [text_cleanup(sentence) for sentence in sentences]

    if self.anaphora_resolution:
        if not self.quiet:
            print("Resolving anaphora...")
        sentences = resolve_anaphora(sentences)

    self.sentences = sentences
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(message)s',
    )
    hsm = HSM()

    """
    Step 2.2. Monkey patch the HSM.encrypt() method to measure encryption time
    with the decorator implemented in the utils module
    """
    ### Block implemented by student

    ### Block implemented by student

    for fruit in utils.tokens():
        logging.debug(
            f"'{fruit}' encrypted is '{hsm.encrypt(fruit).decode('ascii')}'\n")
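# A minimal sketch of the monkey-patching technique the step above asks for,
# assuming HSM is in scope as in main(). The timing decorator here (timed) is
# defined locally for illustration; the exercise's actual decorator lives in the
# utils module and is not shown in this snippet.
import functools
import logging
import time

def timed(func):
    # Wrap func so each call logs how long it took.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        logging.debug(f"{func.__name__} took {time.perf_counter() - start:.6f}s")
        return result
    return wrapper

# Monkey patch: rebinding the method on the class affects every HSM instance.
HSM.encrypt = timed(HSM.encrypt)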
def compute_idfs(doc_list, dest_path):
    """
    Compute idfs given a document list, storing them in the specified destination file.

    :param doc_list: list of documents from which terms are extracted.
    :param dest_path: path in which to store the idfs file.
    """
    docs_number = len(doc_list)
    stems = []
    doc_stems = {}
    for doc_index, doc in enumerate(doc_list):
        doc_stems[doc_index] = []
        for sent in tokens(doc):
            doc_stems[doc_index].extend(stem_and_stopword(sent))
        stems.extend(doc_stems[doc_index])

    # Terms are the stems (taken only once) which appear in the document list.
    terms = list(set(stems))

    idfs = {}
    terms_dim = len(terms)
    term_index = 0
    for term in terms:
        term_index += 1
        term_count = 0
        # Counting how many documents contain the term.
        for doc_index in range(docs_number):
            if term in doc_stems[doc_index]:
                term_count += 1
        idf = math.log10(docs_number / term_count)
        idfs[term] = idf
        print("{:.3%}".format(term_index / terms_dim))

    with open(dest_path, "wb") as dest_file:
        pickle.dump(idfs, dest_file)
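# A tiny worked example of the idf formula used above, idf = log10(docs_number /
# term_count): with 4 documents and a stem appearing in 2 of them,
# idf = log10(4 / 2) ≈ 0.301, while a stem present in every document gets
# idf = log10(4 / 4) = 0, i.e. no discriminative weight.
import math
assert abs(math.log10(4 / 2) - 0.30103) < 1e-4
assert math.log10(4 / 4) == 0.0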
docs = []
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    if ns != '0':
        continue
    if not text:
        continue  # wtf
    if text[:9].lower() == '#redirect':
        continue

    text = unwiki(text)
    itokens = list(itokenise(text))
    itokens_title = list(itokenise(title))
    tokens = normalise(utils.tokens(text, itokens))
    tokens_title = negate_tokens(normalise(utils.tokens(title, itokens_title)))
    tokens_all = tokens_title + tokens
    if not tokens_all:
        continue

    article_tokens = Counter()
    thisdoc_postings = defaultdict(lambda: [])
    for i, w in tokens_all:
        article_tokens[w] += 1
        thisdoc_postings[w].append(i)
    for w, l in thisdoc_postings.iteritems():
        postings[w].append((sha1, l))

    docs.append({
def store_full_sentence_matrices(index, ref):
    """
    Storing matrices for the extractive summarization task.
    """
    if index < 0:
        docs, references, _ = get_duc()
        doc_path = "/dataset/duc/duc_doc_sent_matrix.dat"
        ref_path = "/dataset/duc/duc_ref_sent_matrix.dat"
    else:
        docs_pas_lists, refs_pas_lists = get_pas_lists(index)
        docs = get_sources_from_pas_lists(docs_pas_lists)
        references = get_sources_from_pas_lists(refs_pas_lists)
        dataset_path = "/dataset/nyt/" + str(index) + "/nyt" + str(index)
        doc_path = dataset_path + "_doc_sent_matrix.dat"
        ref_path = dataset_path + "_ref_sent_matrix.dat"

    docs_no = len(docs)  # First dimension, number of documents.
    # Second dimension, max document length (sparse), fixed in the case of NYT.
    max_sent_no = 200
    # Third dimension, vector representation dimension.
    sent_vec_len = 134

    # The matrices are initialized as zeros, then filled in with a vector for each document sentence.
    refs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))
    docs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))

    # For each document the sentences are extracted after cleaning and tokenizing the text.
    if ref:
        doc_list = references
    else:
        doc_list = docs

    for i in range(len(doc_list)):
        doc = doc_list[i]
        print("Processing doc " + str(i) + "/" + str(len(doc_list)))
        doc = text_cleanup(doc)
        # Splitting sentences (by dot).
        sentences = tokens(doc)
        embeddings = sentence_embeddings(sentences)
        centr_scores = centrality_scores(embeddings)
        tf_idfs = tf_idf(sentences, os.getcwd() + "/dataset/duc/duc_idfs.dat")

        # Features: position score, sentence length score, tf-idf, numerical data, centrality, title similarity.
        for j in range(len(sentences)):
            sent = sentences[j]
            position_score = (len(sentences) - j) / len(sentences)
            length_score = len(sent) / max(len(snt) for snt in sentences)
            tf_idf_score = 0
            numerical_score = 0
            centrality_score = centr_scores[j]
            title_sim_score = np.inner(np.array(embeddings[j]), np.array(embeddings[-1]))

            # Computing the tf-idf and numerical scores.
            terms = list(set(stem_and_stopword(sent)))
            for term in terms:
                # Due to preprocessing errors a term may be missing from the tf-idf dictionary.
                if term in tf_idfs.keys():
                    tf_idf_score += tf_idfs[term]
                if term.isdigit():
                    numerical_score += 1

            # Preprocessing errors may leave zero terms, so avoid division by zero.
            if len(terms):
                tf_idf_score /= len(terms)
            else:
                tf_idf_score = 0

            features = np.append([position_score, length_score, tf_idf_score,
                                  numerical_score, centrality_score, title_sim_score],
                                 embeddings[j])
            if ref:
                refs_3d_matrix[i, j, :] = features
            else:
                docs_3d_matrix[i, j, :] = features

    # Storing the matrices in the appropriate file, depending on the scoring system.
    if ref:
        with open(os.getcwd() + ref_path, "wb") as dest_f:
            pickle.dump(refs_3d_matrix, dest_f)
    else:
        with open(os.getcwd() + doc_path, "wb") as dest_f:
            pickle.dump(docs_3d_matrix, dest_f)
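# A hedged note on the stored layout: each matrix has shape (docs_no, 200, 134);
# row j of document i holds the six handcrafted scores (position, length,
# tf-idf, numerical, centrality, title similarity) followed by the sentence
# embedding, presumably 128-dimensional since 134 - 6 = 128. A minimal
# read-back sketch for the DUC document matrix:
import os
import pickle

with open(os.getcwd() + "/dataset/duc/duc_doc_sent_matrix.dat", "rb") as src_f:
    docs_3d_matrix = pickle.load(src_f)
print(docs_3d_matrix.shape)      # (docs_no, 200, 134)
print(docs_3d_matrix[0, 0, :6])  # handcrafted feature scores of the first sentence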
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    if ns != '0':
        continue
    if not text:
        continue  # wtf
    if text[:9].lower() == '#redirect':
        continue
    processed += 1

    text = unwiki(text)
    tokens = normalise_gently(filter(good, utils.tokens(text)))
    tokens_title = normalise_gently(filter(good, utils.tokens(title)))
    round_tokens |= set(tokens_title) | set(tokens)

for w in round_tokens:
    record = bdata.records.add()
    record.key = w
    record.value.parts.append('')
    del record.value.parts[:]

t2 = time()

# Index
iserver.feedData(bdata, deadline_ms=10)
def __init__(self, query, mongo_cred, server='tcp://*****:*****', store_path=None):
    # Parts of the original signature and of the connection string are redacted
    # in the source; store_path is presumed to be a constructor argument since
    # it is passed to IndexServer below.
    MONGO_ADDRESS = '{user}:{password}@{host}/{db}'.format(
        user=mongo_cred['user'], password=mongo_cred['password'],
        host=mongo_cred['host'], db=mongo_cred['db'])
    self.mongo = MongoClient(MONGO_ADDRESS)
    self.db = self.mongo[mongo_cred['db']]
    index = self.index = IndexServer(server, store_path)

    self._TIME()
    query_tokens = map(self.correct_token, tokens(query))
    querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
    querysets = filter(lambda s: s, querysets)
    if not querysets:
        raise NotEnoughEntropy()
    self._TIME('proc')

    kw_docsets = defaultdict(lambda: frozenset())
    doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
    self.freq = freq = defaultdict(lambda: Counter())
    docs = None

    for queryset in querysets:
        matched_docs = set()
        for kw in queryset:
            self._TIME()
            # Query the index, retrying with longer deadlines on timeout.
            try:
                res = index.query(kw, max_mistakes=0, timeout=3)
            except rpcz.RpcDeadlineExceeded:
                try:
                    res = index.query(kw, max_mistakes=0, timeout=4)
                except rpcz.RpcDeadlineExceeded:
                    res = index.query(kw, max_mistakes=0, timeout=5)
            if res.exact_total == 0:
                # Fall back to fuzzy matching when there is no exact hit.
                try:
                    res = index.query(kw, max_mistakes=1, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    self.extraquery_deadline = True
            self._TIME('index')

            for record in res.values:
                key = record.key
                if key in kw_docsets:
                    matched_docs |= kw_docsets[key]
                    continue
                data = record.value.parts
                docpostings = map(cPickle.loads, data)
                key_set = set()
                for (sha1, positions) in docpostings:
                    key_set.add(sha1)
                    matched_docs.add(sha1)
                    doc_poslists[sha1][key].append(positions)
                    freq[key][sha1] += len(positions)
                kw_docsets[key] = frozenset(key_set)
            self._TIME('proc')

        if docs is None:
            docs = matched_docs
        else:
            docs &= matched_docs
        if not docs:
            break
    self._TIME('proc')

    doc_count = Counter()
    doc_count.update({kw: len(freq[kw]) for kw in freq})
    N = self.N = self.db.articles.count()
    idf = {kw: max(0.4, log((N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5)))
           for kw in freq}
    self.poslists = {sha1: merge_sorted([l for klists in doc_poslists[sha1].values()
                                         for l in klists])
                     for sha1 in docs}
    self._TIME('proc')

    # Here comes BM25 to save the world!
    scores = []
    avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
    doc_headers = self.db.articles.find({'_id': {'$in': list(docs)}, 'size': {'$gt': 0}},
                                        {'size': 1, 'title': 1})
    query_tokens = set([t for qs in query_tokens for t in qs])
    for d in doc_headers:
        score = 0
        sha1 = d['_id']
        size = d['size']
        title = d['title']
        for kw in freq:
            m = (freq[kw][sha1] / size * (k1 + 1)) / \
                (freq[kw][sha1] / size + k1 * (1 - b + b * size / avg_size))
            score += idf[kw] * m

        # Prioritise title matches (our own heuristic).
        keywords_bag = Counter(query_tokens)
        title_tokens = normalise_gently(tokens(title))
        title_bag = Counter(title_tokens)
        both = keywords_bag & title_bag
        both_c = sum(both.values())
        ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
        score += 10 * ratio

        tokens_title = normalise_drop(title_tokens)
        title_set = set(tokens_title)
        both = set(freq.keys()) & title_set
        ratio = len(both) / len(freq)
        score += 10 * ratio

        scores.append((sha1, score))

    self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
    self._TIME('ranking')
    self.results = map(lambda p: p[0], self.scores)
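# A small worked example of the BM25-style idf used above,
# idf(kw) = max(0.4, log((N - df + 0.5) / (df + 0.5))), with the natural log as
# in the code: for N = 1,000,000 articles and a keyword matching df = 100 of
# them, idf ≈ 9.2, while a keyword present in half the collection hits the 0.4
# floor.
from math import log
print(max(0.4, log((1000000 - 100 + 0.5) / (100 + 0.5))))        # ≈ 9.205
print(max(0.4, log((1000000 - 500000 + 0.5) / (500000 + 0.5))))  # 0.4 (log term is 0)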
processed = 0
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    if ns != '0':
        continue
    if not text:
        continue  # wtf
    if text[:9].lower() == '#redirect':
        continue
    processed += 1

    text = unwiki(text)
    tokens = normalise_gently(filter(good, utils.tokens(text)))
    tokens_title = normalise_gently(filter(good, utils.tokens(title)))
    round_tokens |= set(tokens_title) | set(tokens)

for w in round_tokens:
    record = bdata.records.add()
    record.key = w
    record.value.parts.append('')
    del record.value.parts[:]

t2 = time()

# Index
iserver.feedData(bdata, deadline_ms=10)
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    if ns != '0':
        continue
    if not text:
        continue  # wtf
    if text[:9].lower() == '#redirect':
        continue

    text = unwiki(text)
    itokens = list(itokenise(text))
    itokens_title = list(itokenise(title))
    tokens = normalise(utils.tokens(text, itokens))
    tokens_title = negate_tokens(normalise(utils.tokens(title, itokens_title)))
    tokens_all = tokens_title + tokens
    if not tokens_all:
        continue

    article_tokens = Counter()
    thisdoc_postings = defaultdict(lambda: [])
    for i, w in tokens_all:
        article_tokens[w] += 1
        thisdoc_postings[w].append(i)
    for w, l in thisdoc_postings.iteritems():
        postings[w].append((sha1, l))

    docs.append({
        '_id': sha1,
def __init__(self, query, mongo_cred, server='tcp://*****:*****', store_path=None):
    # Parts of the original signature and of the connection string are redacted
    # in the source; store_path is presumed to be a constructor argument since
    # it is passed to IndexServer below.
    MONGO_ADDRESS = '{user}:{password}@{host}/{db}'.format(
        user=mongo_cred['user'], password=mongo_cred['password'],
        host=mongo_cred['host'], db=mongo_cred['db'])
    self.mongo = MongoClient(MONGO_ADDRESS)
    self.db = self.mongo[mongo_cred['db']]
    index = self.index = IndexServer(server, store_path)

    self._TIME()
    query_tokens = map(self.correct_token, tokens(query))
    querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
    querysets = filter(lambda s: s, querysets)
    if not querysets:
        raise NotEnoughEntropy()
    self._TIME('proc')

    kw_docsets = defaultdict(lambda: frozenset())
    doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
    self.freq = freq = defaultdict(lambda: Counter())
    docs = None

    for queryset in querysets:
        matched_docs = set()
        for kw in queryset:
            self._TIME()
            # Query the index, retrying with longer deadlines on timeout.
            try:
                res = index.query(kw, max_mistakes=0, timeout=3)
            except rpcz.RpcDeadlineExceeded:
                try:
                    res = index.query(kw, max_mistakes=0, timeout=4)
                except rpcz.RpcDeadlineExceeded:
                    res = index.query(kw, max_mistakes=0, timeout=5)
            if res.exact_total == 0:
                # Fall back to fuzzy matching when there is no exact hit.
                try:
                    res = index.query(kw, max_mistakes=1, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    self.extraquery_deadline = True
            self._TIME('index')

            for record in res.values:
                key = record.key
                if key in kw_docsets:
                    matched_docs |= kw_docsets[key]
                    continue
                data = record.value.parts
                docpostings = map(cPickle.loads, data)
                key_set = set()
                for (sha1, positions) in docpostings:
                    key_set.add(sha1)
                    matched_docs.add(sha1)
                    doc_poslists[sha1][key].append(positions)
                    freq[key][sha1] += len(positions)
                kw_docsets[key] = frozenset(key_set)
            self._TIME('proc')

        if docs is None:
            docs = matched_docs
        else:
            docs &= matched_docs
        if not docs:
            break
    self._TIME('proc')

    doc_count = Counter()
    doc_count.update({kw: len(freq[kw]) for kw in freq})
    N = self.N = self.db.articles.count()
    idf = {kw: max(0.4, log((N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5)))
           for kw in freq}
    self.poslists = {sha1: merge_sorted([l for klists in doc_poslists[sha1].values()
                                         for l in klists])
                     for sha1 in docs}
    self._TIME('proc')

    # Here comes BM25 to save the world!
    scores = []
    avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
    doc_headers = self.db.articles.find({'_id': {'$in': list(docs)}, 'size': {'$gt': 0}},
                                        {'size': 1, 'title': 1})
    query_tokens = set([t for qs in query_tokens for t in qs])
    for d in doc_headers:
        score = 0
        sha1 = d['_id']
        size = d['size']
        title = d['title']
        for kw in freq:
            m = (freq[kw][sha1] / size * (k1 + 1)) / \
                (freq[kw][sha1] / size + k1 * (1 - b + b * size / avg_size))
            score += idf[kw] * m

        # Prioritise title matches (our own heuristic).
        keywords_bag = Counter(query_tokens)
        title_tokens = normalise_gently(tokens(title))
        title_bag = Counter(title_tokens)
        both = keywords_bag & title_bag
        both_c = sum(both.values())
        ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
        score += 10 * ratio

        tokens_title = normalise_drop(title_tokens)
        title_set = set(tokens_title)
        both = set(freq.keys()) & title_set
        ratio = len(both) / len(freq)
        score += 10 * ratio

        scores.append((sha1, score))

    self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
    self._TIME('ranking')
    self.results = map(lambda p: p[0], self.scores)
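# A worked example of the title-match bonus above: with query tokens
# {"apple", "pie"} and title tokens ["apple", "recipe"], the bags share one
# token, so ratio = 1 / (2 + 2 - 1) = 1/3 and the document gains 10 * 1/3 ≈ 3.33
# points on top of its BM25 score.
from collections import Counter
keywords_bag = Counter({"apple": 1, "pie": 1})
title_bag = Counter({"apple": 1, "recipe": 1})
both_c = sum((keywords_bag & title_bag).values())  # 1
ratio = both_c / float(2 + 2 - both_c)             # 0.333...
print(10 * ratio)                                  # ≈ 3.33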
def vocabulary(self):
    return tokens(self.get_all_text())

def received_vocabulary(self):
    return tokens(self.get_received_text())

def sent_vocabulary(self):
    return tokens(self.get_sent_text())