Example No. 1
def store_pas_duc_dataset():
    """
    Load all the DUC documents and summaries, process them and store them.
    """
    docs_pas_lists = []
    refs_pas_lists = []

    docs, references, _ = get_duc()
    # For each document, the PAS list is extracted after cleaning and tokenizing the text.
    for i, doc in enumerate(docs):
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        # Splitting sentences (by dot).
        sentences = tokens(doc)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        pas_list = extract_pas(sentences, "duc")
        docs_pas_lists.append(pas_list)

    # The list of pas lists is then stored.
    with open(os.getcwd() + "/dataset/duc/duc_docs_pas.dat", "wb") as dest_f:
        pickle.dump(docs_pas_lists, dest_f)

    # Same for reference summaries...
    for i, ref in enumerate(references):
        print("Processing ref " + str(i) + "/" + str(len(references)))
        # Splitting sentences (by dot).
        sentences = tokens(ref)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        pas_list = extract_pas(sentences, "duc", keep_all=True)
        refs_pas_lists.append(pas_list)

    with open(os.getcwd() + "/dataset/duc/duc_refs_pas.dat", "wb") as dest_f:
        pickle.dump(refs_pas_lists, dest_f)
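
The `tokens` call used here (and in the similar examples below) is not shown in the source; the recurring "Splitting sentences (by dot)." comment suggests a simple dot-based sentence splitter. A minimal stand-in under that assumption, not the project's actual tokenizer:

def tokens(text):
    # Assumed behaviour: split the text on dots and drop empty fragments.
    return [part.strip() for part in text.split(".") if part.strip()]

# Hypothetical check:
# tokens("First sentence. Second sentence.")  ->  ["First sentence", "Second sentence"]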
Example No. 2
    def __init__(self, input_text, summary_length, anaphora_resolution,
                 model_name, quiet):
        super().__init__(input_text, summary_length, anaphora_resolution,
                         model_name, quiet)

        if not self.quiet:
            print("Processing text...")
        sentences = tokens(self.input_text)
        sentences = [text_cleanup(sentence) for sentence in sentences]

        if self.anaphora_resolution:
            if not self.quiet:
                print("Resolving anaphora...")
            sentences = resolve_anaphora(sentences)

        if not self.quiet:
            print("Extracting Predicate Argument Structures...")
        pas_list = extract_pas(sentences)

        if self.anaphora_resolution:
            if not self.quiet:
                print("Resolving anaphora...")
            resolve_anaphora_pas_list(pas_list)

        self.pas_list = pas_list
Example No. 3
def store_pas_nyt_dataset(nyt_path, min_pas, max_pas):
    """
    Load NYT documents and summaries, process them and store them.
    Process a number of documents between min_pas and max_pas.

    :param nyt_path: path to nyt raw dataset.
    :param min_pas: first document number.
    :param max_pas: last document number
    """
    docs_pas_lists = []
    refs_pas_lists = []

    docs, references = get_nyt(nyt_path, min_pas, max_pas)

    for i in range(len(docs)):
        start_time = time()
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        doc = docs[i]
        ref = references[i]

        # Splitting sentences (by dot).
        sentences = tokens(doc)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        doc_pas_list = extract_pas(sentences, "nyt")

        # Splitting sentences (by dot).
        sentences = tokens(ref)
        sentences = [text_cleanup(sentence) for sentence in sentences]
        ref_pas_list = extract_pas(sentences, "nyt", keep_all=True)

        if len(doc_pas_list) > 5 and len(doc_pas_list) >= len(ref_pas_list):
            refs_pas_lists.append(ref_pas_list)
            docs_pas_lists.append(doc_pas_list)
        timer(str(i) + " processed in:", start_time)

    # PAS lists are stored.
    with open(
            os.getcwd() + "/dataset/nyt/nyt_refs" + str(min_pas) + "-" +
            str(max_pas) + "_pas.dat", "wb") as dest_f:
        pickle.dump(refs_pas_lists, dest_f)
    with open(
            os.getcwd() + "/dataset/nyt/nyt_docs" + str(min_pas) + "-" +
            str(max_pas) + "_pas.dat", "wb") as dest_f:
        pickle.dump(docs_pas_lists, dest_f)
Example No. 4
    def __init__(self, input_text, summary_length, anaphora_resolution,
                 model_name, quiet):
        super().__init__(input_text, summary_length, anaphora_resolution,
                         model_name, quiet)

        if not self.quiet:
            print("Processing text...")
        sentences = tokens(self.input_text)
        sentences = [text_cleanup(sentence) for sentence in sentences]

        if self.anaphora_resolution:
            if not self.quiet:
                print("Resolving anaphora...")
            sentences = resolve_anaphora(sentences)

        self.sentences = sentences
Example No. 5
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(message)s',
    )

    hsm = HSM()
    """
    Step 2.2. Monkey patch HSM.encrypt() method to measure encryption time with decorator
              implemented in utils module
    """
    ### Block implemented by student
    ### Block implemented by student

    for fruit in utils.tokens():
        logging.debug(
            f"'{fruit}' encrypted is '{hsm.encrypt(fruit).decode('ascii')}'\n")
Example No. 6
def compute_idfs(doc_list, dest_path):
    """
    Compute idfs given a document list, storing them in the specified destination file.

    :param doc_list: list of documents from which terms are extracted.
    :param dest_path: path of the file where the idfs are stored.
    """
    docs_number = len(doc_list)
    stems = []
    doc_stems = {}

    for doc_index, doc in enumerate(doc_list):
        doc_stems[doc_index] = []
        for sent in tokens(doc):
            doc_stems[doc_index].extend(stem_and_stopword(sent))
        stems.extend(doc_stems[doc_index])

    # Terms are the stems (taken only once) that appear in the document list.
    terms = list(set(stems))

    idfs = {}

    terms_dim = len(terms)
    term_index = 0
    for term in terms:
        term_index += 1
        term_count = 0
        # Counting how many documents contain the term.
        for doc_stem_list in doc_stems.values():
            if term in doc_stem_list:
                term_count += 1

        idf = math.log10(docs_number / term_count)
        idfs[term] = idf
        print("{:.3%}".format(term_index / terms_dim))

    with open(dest_path, "wb") as dest_file:
        pickle.dump(idfs, dest_file)
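
As a quick sanity check of the idf formula above: with 100 documents, a stem appearing in 10 of them gets math.log10(100 / 10) = 1.0, while a stem present in every document gets log10(1) = 0. The toy numbers below are illustrative only:

import math

docs_number = 100
for term_count in (1, 10, 100):
    print(term_count, math.log10(docs_number / term_count))
# -> 1 2.0, 10 1.0, 100 0.0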
Example No. 7
            docs = []

            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue
                if not text: continue  # wtf
                if text[:9].lower() == ('#redirect'): continue

                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                tokens_title = negate_tokens(
                    normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                article_tokens = Counter()

                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                for w, l in thisdoc_postings.iteritems():
                    postings[w].append((sha1, l))

                docs.append({
Example No. 8
def store_full_sentence_matrices(index, ref):
    """
    Store the sentence matrices used for the extractive summarization task.
    """
    if index < 0:
        docs, references, _ = get_duc()
        doc_path = "/dataset/duc/duc_doc_sent_matrix.dat"
        ref_path = "/dataset/duc/duc_ref_sent_matrix.dat"
    else:
        docs_pas_lists, refs_pas_lists = get_pas_lists(index)
        docs = get_sources_from_pas_lists(docs_pas_lists)
        references = get_sources_from_pas_lists(refs_pas_lists)
        dataset_path = "/dataset/nyt/" + str(index) + "/nyt" + str(index)
        doc_path = dataset_path + "_doc_sent_matrix.dat"
        ref_path = dataset_path + "_ref_sent_matrix.dat"

    docs_no = len(docs)  # First dimension, documents number.
    # Second dimension, max document length (sparse), fixed in case of nyt.
    max_sent_no = 200
    # Third dimension, vector representation dimension.
    sent_vec_len = 134

    # The matrices are initialized with zeros, then filled in with a vector for each document sentence.
    refs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))
    docs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))

    # Select the source documents or the reference summaries, depending on the ref flag.
    if ref:
        doc_list = references
    else:
        doc_list = docs

    for i in range(len(doc_list)):
        doc = doc_list[i]
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        doc = text_cleanup(doc)
        # Splitting sentences (by dot).
        sentences = tokens(doc)
        embeddings = sentence_embeddings(sentences)
        centr_scores = centrality_scores(embeddings)
        tf_idfs = tf_idf(sentences, os.getcwd() + "/dataset/duc/duc_idfs.dat")
        # Features: position score, relative sentence length score, tf_idf, numerical data, centrality, title similarity.
        for j in range(len(sentences)):
            sent = sentences[j]

            position_score = (len(sentences) - j) / len(sentences)
            length_score = len(sent) / max(len(snt) for snt in sentences)
            tf_idf_score = 0
            numerical_score = 0
            centrality_score = centr_scores[j]
            title_sim_score = np.inner(np.array(embeddings[j]),
                                       np.array(embeddings[-1]))

            # Computing centrality and tf_idf score.
            terms = list(set(stem_and_stopword(sent)))
            for term in terms:
                # Due to preprocessing errors, some terms may not be present in the tf_idf dictionary.
                if term in tf_idfs:
                    tf_idf_score += tf_idfs[term]

                if term.isdigit():
                    numerical_score += 1

            # Some errors in the preprocessing may lead to zero terms, so it is necessary to avoid division by zero.
            if len(terms):
                tf_idf_score /= len(terms)
            else:
                tf_idf_score = 0

            if ref:
                refs_3d_matrix[i, j, :] = np.append([
                    position_score, length_score, tf_idf_score,
                    numerical_score, centrality_score, title_sim_score
                ], embeddings[j])
            else:
                docs_3d_matrix[i, j, :] = np.append([
                    position_score, length_score, tf_idf_score,
                    numerical_score, centrality_score, title_sim_score
                ], embeddings[j])

    # Storing the matrices in the appropriate file, depending on the scoring system.
    if ref:
        with open(os.getcwd() + ref_path, "wb") as dest_f:
            pickle.dump(refs_3d_matrix, dest_f)
    else:
        with open(os.getcwd() + doc_path, "wb") as dest_f:
            pickle.dump(docs_3d_matrix, dest_f)
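
Each matrix row therefore packs six hand-crafted scores followed by the sentence embedding; with sent_vec_len = 134 this implies 128-dimensional embeddings, which is an inference from the constants above rather than something the snippet states. A toy sketch of the row layout, with placeholder values:

import numpy as np

scores = [0.9, 0.5, 0.12, 0.0, 0.03, 0.41]  # position, length, tf_idf, numerical, centrality, title similarity
embedding = np.zeros(128)                   # assumed embedding size: 134 - 6
row = np.append(scores, embedding)
assert row.shape == (134,)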
Example No. 9
            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue
                if not text: continue # wtf
                if text[:9].lower() == ('#redirect'): continue

                processed += 1


                text = unwiki(text)

                tokens = normalise_gently(filter(good, utils.tokens(text)))
                tokens_title = normalise_gently(filter(good, utils.tokens(title)))
                round_tokens |= set(tokens_title) | set(tokens)


            for w in round_tokens:
                record = bdata.records.add()
                record.key = w
                record.value.parts.append('')
                del record.value.parts[:]

            t2 = time()

            # Index
            iserver.feedData(bdata, deadline_ms=10)
Example No. 10
    def __init__(self, query, mongo_cred, server='tcp://*****:*****@{host}/{db}'.format(user=mongo_cred['user'], password=mongo_cred['password'],
                                                                         host=mongo_cred['host'], db=mongo_cred['db'])
        self.mongo = MongoClient(MONGO_ADDRESS)
        self.db = self.mongo[mongo_cred['db']]

        index = self.index = IndexServer(server, store_path)


        self._TIME()
        query_tokens = map(self.correct_token, tokens(query))

        querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
        querysets = filter(lambda s: s, querysets)
        if not querysets: raise NotEnoughEntropy()
        self._TIME('proc')

        kw_docsets = defaultdict(lambda: frozenset())
        doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
        self.freq = freq = defaultdict(lambda: Counter())
        docs = None

        for queryset in querysets:
            matched_docs = set()

            for kw in queryset:
                self._TIME()
                try:
                    res = index.query(kw, max_mistakes=0, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    try:
                        res = index.query(kw, max_mistakes=0, timeout=4)
                    except rpcz.RpcDeadlineExceeded:
                        res = index.query(kw, max_mistakes=0, timeout=5)

                if res.exact_total == 0:
                    try:
                        res = index.query(kw, max_mistakes=1, timeout=3)
                    except rpcz.RpcDeadlineExceeded:
                        self.extraquery_deadline = True
                self._TIME('index')

                for record in res.values:
                    key = record.key
                    if key in kw_docsets:
                        matched_docs |= kw_docsets[key]
                        continue
                    data = record.value.parts

                    docpostings = map(cPickle.loads, data)

                    key_set = set()
                    for (sha1, positions) in docpostings:
                        key_set.add(sha1)
                        matched_docs.add(sha1)
                        doc_poslists[sha1][key].append(positions)
                        freq[key][sha1] += len(positions)
                    kw_docsets[key] = frozenset(key_set)
                self._TIME('proc')
            if docs is None:
                docs = matched_docs
            else:
                docs &= matched_docs
            if not docs:
                break
            self._TIME('proc')


        doc_count = Counter()
        doc_count.update({kw: len(freq[kw]) for kw in freq})

        N = self.N = self.db.articles.count()
        idf = {kw: max(0.4, log((N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5))) for kw in freq}

        self.poslists = {sha1: merge_sorted([l for klists in doc_poslists[sha1].values() for l in klists]) for sha1 in docs}
        self._TIME('proc')

        # Here comes BM25 to save the world!
        scores = []
        avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
        doc_headers = self.db.articles.find({'_id': {'$in': list(docs)}, 'size': {'$gt': 0}}, {'size':1, 'title':1})
        query_tokens = set([t for qs in query_tokens for t in qs])
        for d in doc_headers:
            score = 0

            sha1 = d['_id']
            size = d['size']
            title = d['title']

            for kw in freq:
                m = (freq[kw][sha1] / size  * (k1 + 1)) / (freq[kw][sha1] / size + k1 * (1 - b + b * size / avg_size))
                score += idf[kw] * m

            # Prioritise title matches (our own heuristic)
            keywords_bag = Counter(query_tokens)
            title_tokens = normalise_gently(tokens(title))
            title_bag = Counter(title_tokens)
            both = keywords_bag & title_bag
            both_c = sum(both.values())
            ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
            score += 10 * ratio

            tokens_title = normalise_drop(title_tokens)
            title_set = set(tokens_title)
            both = set(freq.keys()) & title_set
            ratio = len(both) / len(freq)
            score += 10 * ratio

            scores.append((sha1, score))

        self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
        self._TIME('ranking')
        self.results = map(lambda p: p[0], self.scores)
Example No. 11
            processed = 0

            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue
                if not text: continue  # wtf
                if text[:9].lower() == ('#redirect'): continue

                processed += 1

                text = unwiki(text)

                tokens = normalise_gently(filter(good, utils.tokens(text)))
                tokens_title = normalise_gently(
                    filter(good, utils.tokens(title)))
                round_tokens |= set(tokens_title) | set(tokens)

            for w in round_tokens:
                record = bdata.records.add()
                record.key = w
                record.value.parts.append('')
                del record.value.parts[:]

            t2 = time()

            # Index
            iserver.feedData(bdata, deadline_ms=10)
Example No. 12
            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue
                if not text: continue # wtf
                if text[:9].lower() == ('#redirect'): continue


                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                tokens_title = negate_tokens(normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                article_tokens = Counter()

                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                for w, l in thisdoc_postings.iteritems():
                    postings[w].append((sha1, l))

                docs.append({
                    '_id': sha1,
Example No. 13
    def __init__(self,
                 query,
                 mongo_cred,
                 server='tcp://*****:*****@{host}/{db}'.format(
            user=mongo_cred['user'],
            password=mongo_cred['password'],
            host=mongo_cred['host'],
            db=mongo_cred['db'])
        self.mongo = MongoClient(MONGO_ADDRESS)
        self.db = self.mongo[mongo_cred['db']]

        index = self.index = IndexServer(server, store_path)

        self._TIME()
        query_tokens = map(self.correct_token, tokens(query))

        querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
        querysets = filter(lambda s: s, querysets)
        if not querysets: raise NotEnoughEntropy()
        self._TIME('proc')

        kw_docsets = defaultdict(lambda: frozenset())
        doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
        self.freq = freq = defaultdict(lambda: Counter())
        docs = None

        for queryset in querysets:
            matched_docs = set()

            for kw in queryset:
                self._TIME()
                try:
                    res = index.query(kw, max_mistakes=0, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    try:
                        res = index.query(kw, max_mistakes=0, timeout=4)
                    except rpcz.RpcDeadlineExceeded:
                        res = index.query(kw, max_mistakes=0, timeout=5)

                if res.exact_total == 0:
                    try:
                        res = index.query(kw, max_mistakes=1, timeout=3)
                    except rpcz.RpcDeadlineExceeded:
                        self.extraquery_deadline = True
                self._TIME('index')

                for record in res.values:
                    key = record.key
                    if key in kw_docsets:
                        matched_docs |= kw_docsets[key]
                        continue
                    data = record.value.parts

                    docpostings = map(cPickle.loads, data)

                    key_set = set()
                    for (sha1, positions) in docpostings:
                        key_set.add(sha1)
                        matched_docs.add(sha1)
                        doc_poslists[sha1][key].append(positions)
                        freq[key][sha1] += len(positions)
                    kw_docsets[key] = frozenset(key_set)
                self._TIME('proc')
            if docs is None:
                docs = matched_docs
            else:
                docs &= matched_docs
            if not docs:
                break
            self._TIME('proc')

        doc_count = Counter()
        doc_count.update({kw: len(freq[kw]) for kw in freq})

        N = self.N = self.db.articles.count()
        idf = {
            kw: max(0.4, log(
                (N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5)))
            for kw in freq
        }

        self.poslists = {
            sha1: merge_sorted(
                [l for klists in doc_poslists[sha1].values() for l in klists])
            for sha1 in docs
        }
        self._TIME('proc')

        # Here comes BM25 to save the world!
        scores = []
        avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
        doc_headers = self.db.articles.find(
            {
                '_id': {
                    '$in': list(docs)
                },
                'size': {
                    '$gt': 0
                }
            }, {
                'size': 1,
                'title': 1
            })
        query_tokens = set([t for qs in query_tokens for t in qs])
        for d in doc_headers:
            score = 0

            sha1 = d['_id']
            size = d['size']
            title = d['title']

            for kw in freq:
                m = (freq[kw][sha1] / size *
                     (k1 + 1)) / (freq[kw][sha1] / size + k1 *
                                  (1 - b + b * size / avg_size))
                score += idf[kw] * m

            # Prioritise title matches (our own heuristic)
            keywords_bag = Counter(query_tokens)
            title_tokens = normalise_gently(tokens(title))
            title_bag = Counter(title_tokens)
            both = keywords_bag & title_bag
            both_c = sum(both.values())
            ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
            score += 10 * ratio

            tokens_title = normalise_drop(title_tokens)
            title_set = set(tokens_title)
            both = set(freq.keys()) & title_set
            ratio = len(both) / len(freq)
            score += 10 * ratio

            scores.append((sha1, score))

        self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
        self._TIME('ranking')
        self.results = map(lambda p: p[0], self.scores)
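
The scoring loop above is the standard BM25 term-saturation formula applied to the length-normalised frequency freq[kw][sha1] / size; the constants k1 and b are used but not defined in this snippet (typical BM25 defaults are roughly k1 = 1.2-2.0 and b = 0.75). A standalone sketch of the per-term contribution, with assumed constants:

K1, B = 1.5, 0.75  # assumed values; the snippet's k1 and b come from elsewhere in the module

def bm25_term(raw_count, doc_size, avg_size, idf, k1=K1, b=B):
    # Mirrors the snippet: f is the raw term count normalised by document size.
    f = raw_count / doc_size
    return idf * (f * (k1 + 1)) / (f + k1 * (1 - b + b * doc_size / avg_size))

# Hypothetical usage: a term occurring 3 times in a 100-token document, corpus average 120 tokens.
# contribution = bm25_term(3, 100, 120, idf=2.1)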
Example No. 14
    def vocabulary(self):
        return tokens(self.get_all_text())
Example No. 15
    def received_vocabulary(self):
        return tokens(self.get_received_text())
Example No. 16
    def sent_vocabulary(self):
        return tokens(self.get_sent_text())