Exemplo n.º 1
Arquivo: base.py Projeto: zuacubd/pke
    def read_raw_document(self, stemmer='porter'):
        """ Read the raw input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter
                    (can be set to None for using word surface forms instead of
                    stems).
        """

        # parse the document using the raw text parser
        parse = RawTextReader(self.input_file)

        # build the stemming function once instead of once per word
        stem_word = Stemmer(stemmer).stem if stemmer is not None else None

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            # NOTE(review): this statement was missing from the snippet; it
            # assumes Sentence accepts the token list as `words` -- confirm
            # against the Sentence class used by this module.
            self.sentences.append(Sentence(words=sentence['words']))
            current = self.sentences[i]

            # add the POS
            current.pos = sentence['POS']

            # add the stems
            if stem_word is not None:
                stems = [stem_word(word) for word in current.words]

            # otherwise computations are performed on surface forms
            else:
                # copy the list: sharing it with `words` would let the
                # lowercasing below silently rewrite the surface forms too
                stems = list(current.words)

            # lowercase the stems/lemmas
            current.stems = [stem.lower() for stem in stems]
Exemplo n.º 2
class Sentence:
    """Sentence record: raw text, its tokens and dictionary-indexed trigrams."""

    # One Stemmer instance created at class-definition time and shared by all
    # instances; it is not referenced by the code visible in this class.
    stemmer = Stemmer()

    def __init__(self, dictionary, startIndex: int, endIndex: int, sent: str,
                 start: int, end: int):
        """Store the given fields and derive `words` and `nGrams` from `sent`.

        `dictionary` is not stored; it is only handed to
        wordsToTrigramsWithIndices.
        """
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        # `sent` must be assigned before this call because sentToWords reads it
        self.words = self.sentToWords()
        self.nGrams = self.wordsToTrigramsWithIndices(dictionary)
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        """Return the processed word list for this sentence."""
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        # NOTE(review): the call below is truncated in this copy of the file
        # (its arguments and closing parenthesis are missing) and must be
        # restored from the original project before this class can be used.
        return wordsToStemmed(

    def wordsToTrigramsWithIndices(self, dictionary):
        """Return trigrams over (index, word) pairs built from `self.words`.

        The index is looked up in `dictionary.wordsToIndices`; words that are
        not present there get index -1.
        """
        def getIndexedTuple(word: str):
            # -1 marks a word that the dictionary does not know
            index = -1
            if word in dictionary.wordsToIndices:
                index = dictionary.wordsToIndices[word]
            return (index, word)

        return list(trigrams(list(map(getIndexedTuple, self.words))))
Exemplo n.º 3
    def read_preprocessed_document(self, stemmer='porter', sep='/'):
        """ Read the preprocessed input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter.
                    When None, the surface forms are used instead of stems.
                sep (str): the separator of the tagged word, defaults to /.
        """

        # parse the document using the preprocessed text parser
        parse = PreProcessedTextReader(self.input_file, sep=sep)

        # build the stemming function once instead of once per word
        stem_word = Stemmer(stemmer).stem if stemmer is not None else None

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            # NOTE(review): this statement was missing from the snippet; it
            # assumes Sentence accepts the token list as `words` -- confirm
            # against the Sentence class used by this module.
            self.sentences.append(Sentence(words=sentence['words']))
            current = self.sentences[i]

            # add the POS
            current.pos = sentence['POS']

            # add the stems (a copy of the surface forms when no stemmer is
            # requested, so that lowercasing never touches `words`)
            if stem_word is None:
                stems = list(current.words)
            else:
                stems = [stem_word(word) for word in current.words]

            # lowercase the stems/lemmas
            current.stems = [stem.lower() for stem in stems]
Exemplo n.º 4
Arquivo: utils.py Projeto: zuacubd/pke
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    reference_stemming=False,
                    stemmer='porter'):
    """ Load a reference file and returns a dictionary.

        Each non-blank line is expected to look like
        ``<doc_id><sep_doc_id><kp1><sep_ref_keyphrases><kp2>...``; a keyphrase
        containing ``+`` is split into its alternative variants, each stored
        as a separate reference.

        NOTE(review): the parameter list was truncated in this copy of the
        file. The names are the ones the body uses; the order and default
        values are assumed -- confirm against the callers.

        Args:
            input_file (str): path to the reference file (UTF-8).
            sep_doc_id (str): separator between the document id and its
                keyphrases.
            sep_ref_keyphrases (str): separator between keyphrases.
            reference_stemming (bool): whether the keyphrases are stemmed.
            stemmer (str): name of the stemmer to use when stemming.

        Returns:
            defaultdict(list): document id -> list of reference keyphrases.
    """

    logging.info('loading reference keyphrases from ' + input_file)

    references = defaultdict(list)

    # build the stemming function once, and only when it is needed
    stem_word = Stemmer(stemmer).stem if reference_stemming else None

    with codecs.open(input_file, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank (e.g. trailing) line carries no reference
                continue
            cols = line.split(sep_doc_id)
            doc_id = cols[0].strip()
            keyphrases = cols[1].strip().split(sep_ref_keyphrases)

            # expand the '+'-joined variants into individual references
            new_references = []
            for v in keyphrases:
                if '+' in v:
                    new_references.extend(v.split('+'))
                else:
                    new_references.append(v)

            # stem only what was just read, so that a document id spread over
            # several lines never has its earlier entries stemmed twice
            if stem_word is not None:
                new_references = [
                    ' '.join(stem_word(u) for u in k.split())
                    for k in new_references
                ]

            references[doc_id].extend(new_references)

    return references
Exemplo n.º 5
    def read_raw_document(self, stemmer='porter'):
        """ Read the raw input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter.
        """

        # parse the document using the raw text parser
        parse = RawTextReader(self.input_file)

        # build the stemming function once instead of once per word
        stem_word = Stemmer(stemmer).stem

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            # NOTE(review): this statement was missing from the snippet; it
            # assumes Sentence accepts the token list as `words` -- confirm
            # against the Sentence class used by this module.
            self.sentences.append(Sentence(words=sentence['words']))
            current = self.sentences[i]

            # add the POS
            current.pos = sentence['POS']

            # add the stems, lowercased
            current.stems = [stem_word(word).lower() for word in current.words]
def tokenize_lowercase(text):
    """
    Tokenize, stem and convert to lower case the text of documents
    :param text: text of a specific document
    :return: formatted text (the processed tokens joined by single spaces)
    """
    stem = Stemmer('porter').stem  # build the stemmer once, not once per token
    words = word_tokenize(text)  # tokenize document text
    # lower-case each token first, then stem it
    return ' '.join(stem(word_token.lower()) for word_token in words)
Exemplo n.º 7
class Sentence:
    """A sentence with its index, raw text, tokens, token trigrams and span."""

    # helpers instantiated once at class-definition time and shared by every
    # instance
    stemmer = Stemmer()
    lemmater = WordNetLemmatizer()

    def __init__(self, index: int, sent: str, start: int, end: int):
        """Store the fields and derive `words` / `nGrams` from `sent`."""
        self.index, self.sent = index, sent
        # sentToWords reads self.sent, so it has to run after the line above
        tokens = self.sentToWords()
        self.words = tokens
        self.nGrams = list(trigrams(tokens))
        self.start, self.end = start, end

    def sentToWords(self) -> List[str]:
        """Return the result of word_tokenize applied to the raw sentence."""
        raw_text = self.sent
        return word_tokenize(raw_text)
Exemplo n.º 8
    def read_corenlp_document(self, use_lemmas=False, stemmer='porter'):
        """ Read the input file in CoreNLP XML format and populate the sentence
            list.

            Args:
                use_lemmas (bool): whether lemmas from stanford corenlp are used
                    instead of stems (computed by nltk), defaults to False.
                stemmer (str): the stemmer in nltk to use (if used), defaults
                    to porter (can be set to None for using word surface forms
                    instead of stems).
        """

        # keys that get dedicated attributes; every other key goes to `meta`
        core_keys = frozenset(['POS', 'lemmas', 'words'])

        # build the stemming function once, and only when it will be used
        stem_word = None
        if not use_lemmas and stemmer is not None:
            stem_word = Stemmer(stemmer).stem

        # parse the document using the Minimal CoreNLP parser
        parse = MinimalCoreNLPParser(self.input_file)

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            # NOTE(review): this statement was missing from the snippet; it
            # assumes Sentence accepts the token list as `words` -- confirm
            # against the Sentence class used by this module.
            self.sentences.append(Sentence(words=sentence['words']))
            current = self.sentences[i]

            # add the POS
            current.pos = sentence['POS']

            if use_lemmas:
                # use the lemmas computed by CoreNLP
                stems = sentence['lemmas']
            elif stem_word is not None:
                # stemming is performed
                stems = [stem_word(word) for word in current.words]
            else:
                # all computations are performed on surface forms
                stems = current.words

            # lowercase the stems/lemmas into a fresh list, so that neither
            # `words` nor the parser's own lists are modified in place
            current.stems = [stem.lower() for stem in stems]

            # add the meta-information
            for (k, infos) in sentence.items():
                if k not in core_keys:
                    current.meta[k] = infos
Exemplo n.º 9
def extract_keyphrases(data):
    """
    Extract the 10 best TF-IDF keyphrases of every document with pke.

    NOTE(review): several statements were missing from this copy of the file
    (the start of the gold-keyphrase append, the `load_document` call and the
    candidate weighting). They are restored from the sibling
    MultipartiteRank variant and from pke's documented TfIdf usage; the extra
    keyword arguments originally passed to `load_document` are unknown, so
    pke's defaults are used -- confirm against the original script.

    :param data: table-like object with an 'abstract' column (document text)
                 and a 'keyword' column (gold keyphrases separated by ';')
    :return: (pred_keyphrases, gold_keyphrases) - for each document, the
             predicted keyphrases as token lists and the gold keyphrases as
             lists of stemmed tokens
    """
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents

    # loop-invariant helpers: build them once instead of once per document
    stem = Stemmer('porter').stem
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    df = None  # document frequencies, loaded lazily on the first document

    for indx, abstract_document in enumerate(data['abstract']):
        # split gold keywords to separate them from one another and stem them
        gold_keyphrases.append([
            [stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])

        # ======================================================================================================================
        # TF-IDF Extractor
        # ======================================================================================================================

        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()

        # 2. load the content of the document.
        extractor.load_document(input=abstract_document)

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stoplist)

        # 4. weight the candidates using a `tf` x `idf`
        if df is None:
            # `input_file` is not defined in this function; it is expected to
            # be a module-level name holding the document-frequency file path
            df = pke.load_document_frequency_file(input_file=input_file)
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase (first position -> [0]) and discard the score
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    return pred_keyphrases, gold_keyphrases
Exemplo n.º 10
class Sentence:
    """Sentence record: index range, raw text, tokens, token trigrams, span."""

    # One Stemmer instance created at class-definition time and shared by all
    # instances; it is not referenced by the code visible in this class.
    stemmer = Stemmer()

    def __init__(self, startIndex: int, endIndex: int, sent: str, start: int,
                 end: int):
        """Store the given fields and derive `words` and `nGrams` from `sent`."""
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        # `sent` must be assigned before this call because sentToWords reads it
        self.words = self.sentToWords()
        self.nGrams = list(trigrams(self.words))
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        """Return the processed word list for this sentence."""
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        # NOTE(review): the call below is truncated in this copy of the file
        # (its arguments and closing parenthesis are missing) and must be
        # restored from the original project before this class can be used.
        return wordsToStemmed(
def extract_keyphrases(data):
    """
    Extract the 10 best MultipartiteRank keyphrases of every document with pke.

    NOTE(review): the `load_document` and `candidate_weighting` calls were
    missing from this copy of the file. They are restored from pke's
    documented MultipartiteRank usage (alpha=1.1, threshold=0.74,
    method='average'); any extra `load_document` keyword arguments the
    original passed are unknown -- confirm against the original script.

    :param data: table-like object with an 'abstract' column (document text)
                 and a 'keyword' column (gold keyphrases separated by ';')
    :return: (pred_keyphrases, gold_keyphrases) - for each document, the
             predicted keyphrases as token lists and the gold keyphrases as
             lists of stemmed tokens
    """
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents

    # loop-invariant helpers: build them once instead of once per document
    stem = Stemmer('porter').stem
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')

    for indx, abstract_document in enumerate(data['abstract']):
        # split gold keywords to separate them from one another and stem them
        gold_keyphrases.append([
            [stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])

        # ======================================================================================================================
        # MultipartiteRank Extractor
        # ======================================================================================================================

        # 1. create a MultipartiteRank extractor.
        extractor = pke.unsupervised.MultipartiteRank()

        # 2. load the content of the document.
        extractor.load_document(input=abstract_document)

        # 3. select the longest sequences of nouns and adjectives, that do
        #    not contain punctuation marks or stopwords as candidates.
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # 4. build the Multipartite graph and rank candidates using random walk,
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.74,
                                      method='average')

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase and discard the score
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    return pred_keyphrases, gold_keyphrases
Exemplo n.º 12
def extract_keywords(sentence):
    """
    Return the stemmed keywords of a sentence followed by their bigrams.

    The sentence is lower-cased, "n't" is expanded to " not", the text is
    passed through `clean_string`, English stopwords are removed (except a
    few negation/intensity words that are kept), and the remaining words are
    stemmed.

    :param sentence: the raw sentence text
    :return: list of stems followed by the "stem1 stem2" bigrams of
             consecutive stems
    """
    sentence = sentence.lower()

    # words that are in the stopword list but must be kept
    not_stopw = {
        "no", "nor", "not", "over", "under", "again", "further", "but",
        "against", "too", "very"
    }
    # NOTE(review): the body of the original `for x in not_stopw:` loop was
    # missing; it presumably removed each word from `stopw`. Filtering has
    # the same effect and does not fail when a word is absent from the list.
    stopw = [w for w in stopwords.words('english') if w not in not_stopw]

    # escape each word so it is always matched literally inside the pattern
    pattern = re.compile(
        r'\b(' + r'|'.join(re.escape(w) for w in stopw) + r')\b\s*')

    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)

    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    # pair every stem with its successor
    b = [first + " " + second for first, second in zip(s, s[1:])]
    return s + b
# stem, tokenize and lower case keyphrases and keep them categorized by document
_kp_digits_only = re.compile(r'^\d+$')  # raw string: '\d' is not a valid escape in a plain literal
_kp_stemmer = Stemmer('porter')  # one stemmer for the whole pass instead of one per token
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        keyphrase = keyphrase.strip()  # remove whitespaces
        if not keyphrase:  # skip empty keyphrases
            continue
        tokens = word_tokenize(keyphrase)  # tokenize
        # Replace the pure digit terms with DIGIT_REPL
        tokens = [
            'DIGIT_REPL' if _kp_digits_only.match(tok) else tok
            for tok in tokens
        ]
        # stem + lower case (applied after the replacement, as in the original)
        tokens = [_kp_stemmer.stem(keyword.lower()) for keyword in tokens]
        # NOTE(review): the statement storing the joined keyphrase was missing
        # from this copy of the file; appending it to `keyphrases_list` is the
        # only use consistent with the assignment below -- confirm.
        keyphrases_list.append(' '.join(tokens))

    data['keyword'].iat[index] = keyphrases_list

# ======================================================================================================================
# Count logistics
# ======================================================================================================================

# Running totals, all starting at zero:
#   keywords_in_title              - keywords found in the title
#   keywords_in_abstract           - keywords found in the abstract
#   keywords_in_title_abstract     - keywords found in the title or the abstract
#   keywords_in_title_NOT_abstract - keywords found in the title BUT NOT in the abstract
#   total_keywords                 - all keywords
keywords_in_title = keywords_in_abstract = 0
keywords_in_title_abstract = keywords_in_title_NOT_abstract = 0
total_keywords = 0
Exemplo n.º 14
print('tokenization - abstract finish')

# stem, tokenize and lower case keyphrases and keep them categorized by document
_digits_only = re.compile(r'^\d+$')  # raw string: '\d' is not a valid escape in a plain literal
_porter = Stemmer('porter')  # one stemmer for the whole pass instead of one per token
for index, list_of_keyphrases in enumerate(data['keywords']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        keyphrase = keyphrase.strip()  # remove whitespaces
        if not keyphrase:  # skip empty keyphrases
            continue
        tokens = word_tokenize(keyphrase)  # tokenize
        # Replace the pure digit terms with DIGIT_REPL
        tokens = ['DIGIT_REPL' if _digits_only.match(tok) else tok for tok in tokens]
        # stem + lower case (applied after the replacement, as before)
        keyphrases_list.append([_porter.stem(keyword.lower()) for keyword in tokens])
    data['keywords'].iat[index] = keyphrases_list

# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

data['abstract'].to_csv(x_text_filename, index=False)  # save the preprocessed document text

# rename column "keywords" to "keyword" for uniformity between datasets
data.rename(columns={"keywords": "keyword"}, inplace=True)
data[['keyword', 'assemble_documents_index']].to_csv(y_text_filename, index=False)  # save the preprocessed keyphrases

def evaluation(y_pred=None, y_test=None, x_test=None, x_filename=None, y_filename=None, paragraph_assemble_docs=None):
    Evaluate the performance
    :param y_pred: the predicted labels
    :param y_test: the test labels
    :param x_filename: the name of the GOLD document text file - NEED TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param y_filename: the name of the GOLD keyphrase file - NEED TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param paragraph_assemble_docs: (ONLY FOR UNSUPERVISED METHODS) the indices to re-assemble first 3 paragraphs
    :return: -

    if y_test is None:  # evaluate the Bi-LSTM-CRF + unsupervised methods
        # ======================================================================================================================
        # Load all validation target data (y_test\labels) data on memory (needed for evaluation)
        # ======================================================================================================================

        # read preprocessed document text (x) and preprocessed keyphrases (y)
        x_test = pd.read_csv(x_filename, encoding="utf8")
        y_test = pd.read_csv(y_filename, encoding="utf8")

        # translate string back to list of lists (when reading dataframe, lists of lists are read as strings)
        x_test['abstract'] = x_test['abstract'].map(ast.literal_eval)
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = y_test['assemble_documents_index']
        y_test = y_test['keyword'].map(ast.literal_eval)

        # print(x_test)

        # ======================================================================================================================
        # Convert y_test and y_pred from categorical (two columns, 1 for each label) to a single value label (1 column)
        # ======================================================================================================================

        def pred2label(all_abstract_preds):
            """
            Converts prediction set and test/validation set from two columns (one for each label value)
            to just one column with the number of the corresponding label
            [ initial array: [1, 0] => final array: [0] ]   -   [ initial array: [0, 1] => final array: [1] ]
            :param all_abstract_preds: array with predictions or test/validation set [documents/abstracts, number of words]
            :return: one list per document/abstract holding the predicted label of each of its words
            """
            # the position of the max value is corresponding to the actual label value (0: Non-KP, 1: KP)
            # NOTE(review): the statement that stored each document's labels was
            # missing; one list per document is kept because the caller
            # iterates the result document by document and then word by word.
            return [
                [np.argmax(word_pred) for word_pred in abstract_preds]
                for abstract_preds in all_abstract_preds
            ]

        # print('BEFORE y_pred', y_pred)
        y_pred = pred2label(y_pred)  # convert y_pred from categorical (two columns, 1 for each label) to a single value label
        # print('AFTER y_pred', y_pred)

        # ======================================================================================================================
        # Extract keyphrases from the predicted set
        # ======================================================================================================================

        pred_keyphrase_list = []  # save all predicted keyphrases
        for doc_index, doc_prediction in enumerate(y_pred):  # iterate through predictions for documents
            document_keyphrases = []  # save the keyphrases of a document
            consecutive_keywords = []  # save consecutive keywords that form a keyphrase
            for word_index, word_prediction in enumerate(doc_prediction):  # iterate through predictions for WORDS of documents
                if word_index >= len(x_test['abstract'][doc_index]):
                    break  # check if the abstract reached to an end (padding adds more dummy words non existing in real abstract)
                if word_index:  # check if this is the FIRST WORD in the abstract [to avoid negative index value]
                    if doc_prediction[word_index - 1]:  # check if the previous word is a keyword
                        if word_prediction:  # check if the current word is a keyword
                            #                        print(x_test['abstract'][doc_index])
                            #                        print(x_test['abstract'][doc_index][word_index])
                        if len(consecutive_keywords):  # save keyword list if exists (not empty list)
                        consecutive_keywords = []  # re-initialize (empty) list
                        if word_prediction:  # check if the current word is a keyword
                else:  # save the FIRST WORD of the abstract if it is a keyword
                    if word_prediction:  # check if the current word is a keyword
                        #               print('HEREEEE', doc_index, word_index)
                        #               print(x_test['abstract'][doc_index])

            if len(consecutive_keywords):  # save the keywords that occur in the END of the abstract, if they exist (not empty list)

    else:  # evaluate the unsupervised methods that use .xml files
        # tokenize the text
        x_test['abstract'] = x_test['abstract'].apply(lambda row: row.split())

        # define pred_keyphrase_list - contains words
        pred_keyphrase_list = y_pred
        # define y_test if full-text in paragraphs/stentences
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = paragraph_assemble_docs

   # print(pred_keyphrase_list)

    here = [1 if any(doc) else 0 for doc in y_pred]
    print('\ny_pred', np.array(y_pred, dtype=object).shape)
    if any(here):
        print('THERE ARE KEYPHRASES')

    # ======================================================================================================================
    # Calculate metrics
    # ======================================================================================================================

    def calculate_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics
        :param y_test_set: GOLD set
        :param pred_keyphrase_list_set: PREDICTION set
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        TP = 0  # True Positive
        FP = 0  # False Positive
        FN = 0  # False Negative
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            doc_gold = y_test_set[index_pred]
            # FN: keyphrases that exist in GOLD but not in PREDICTED
            FN += sum(1 for key_test in doc_gold if key_test not in doc_pred)
            for key_pred in doc_pred:
                if key_pred in doc_gold:  # TP: keyphrases that exist both in PREDICTED and GOLD
                    TP += 1
                else:  # FP: keyphrases that exist in PREDICTED but not in GOLD
                    FP += 1

        # each ratio stays 0 when its denominator would be 0
        precision = TP / (TP + FP) if (TP + FP) else 0
        recall = TP / (TP + FN) if (TP + FN) else 0
        f1_score = 0
        if precision + recall:
            f1_score = 2 * (precision * recall) / (precision + recall)

        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)

    # ======================================================================================================================
    # Calculate NEW metrics (semi-exact matching)
    # ======================================================================================================================

    def calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics (semi-exact matching)

        For every gold keyphrase, the predicted keyphrase with the highest
        average of (gold coverage, predicted coverage) is looked up; the pair
        counts as a match when that average exceeds 0.5.

        NOTE(review): several statements were missing from this copy of the
        file (the `else:` branches, the `gold_coverage_ratio_list`
        bookkeeping and the per-document accumulation into
        `pred_list`/`gold_list`); they are restored from the names and
        comments that survived -- confirm against the original script.

        :param y_test_set: GOLD set (keyphrases as strings)
        :param pred_keyphrase_list_set: PREDICTION set (keyphrases as strings)
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        # each entry represents a keyphrase: 0 means it has no match, a non-zero value means it has one
        pred_list = []  # contains 0, 1 for predicted keyphrases depending on if a predicted keyphrase matches with a gold one
        gold_list = []  # contains 0 or the gold coverage ratio for gold keyphrases depending on if a gold keyphrase matches with a predicted one
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            doc_gold = y_test_set[index_pred]
            pred_kps = [0] * len(doc_pred)  # initialize the list with 0s and length equal to the total predicted keyphrases
            gold_kps = [0.0] * len(doc_gold)  # initialize the list with 0s and length equal to the total gold keyphrases

            if doc_pred:  # an empty prediction set is already handled by the initialization above
                # find if the gold keyphrases exist in the predicted set, and if so mark which gold and predicted keyphrases have a match
                for gold_kp_index, gold_keyphr in enumerate(doc_gold):
                    gold_keyphrase_tokens = gold_keyphr.split()
                    avg_coverage_ratio_list = []
                    gold_coverage_ratio_list = []
                    for pred_kp in doc_pred:
                        pred_kp_tokens = pred_kp.split()
                        # gold keyword coverage of a predicted keyphrase; note that
                        # `in` is a substring test on the predicted keyphrase string.
                        # a gold keyword might exist multiple times in a pred keyphrase, but with this approach we assume that it does not as this happens rarely
                        kw_coverage = sum(
                            1 for keyword_gold in gold_keyphrase_tokens
                            if keyword_gold in pred_kp
                        )
                        # ratio of the covered predicted kp (0 for an empty keyphrase)
                        pred_coverage_ratio = kw_coverage / len(pred_kp_tokens) if pred_kp_tokens else 0
                        # ratio of the covered gold kp (0 for an empty keyphrase)
                        gold_coverage_ratio = kw_coverage / len(gold_keyphrase_tokens) if gold_keyphrase_tokens else 0
                        gold_coverage_ratio_list.append(gold_coverage_ratio)
                        avg_coverage_ratio_list.append((gold_coverage_ratio + pred_coverage_ratio) / 2)
                    # find the max average coverage ratio and its position on the list
                    max_index, max_avg_coverage_ratio = max(enumerate(avg_coverage_ratio_list), key=itemgetter(1))
                    if max_avg_coverage_ratio > 0.5:
                        # credit the gold kp with its coverage ratio for the best-matching prediction
                        gold_kps[gold_kp_index] = gold_coverage_ratio_list[max_index]
                        pred_kps[max_index] = 1  # set 1 the predicted kp that was matched with a gold one

            # save the kp predicted/gold matches of each document
            pred_list.extend(pred_kps)
            gold_list.extend(gold_kps)

        FN = gold_list.count(0)  # False Negative: keyphrases that exist in GOLD but not in PREDICTED
        TP = sum(gold_list)  # True Positive: summed coverage of the matched GOLD keyphrases
        FP = pred_list.count(0)  # False Positive: keyphrases that exist in PREDICTED but not in GOLD

        # each ratio stays 0 when its denominator would be 0
        precision = TP / (TP + FP) if (TP + FP) else 0
        recall = TP / (TP + FN) if (TP + FN) else 0
        f1_score = 0
        if precision + recall:
            f1_score = 2 * (precision * recall) / (precision + recall)

        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)

    # ======================================================================================================================
    # Get the SETS of (unique) keyphrases for predicted and gold set
    # ======================================================================================================================

    # assemble the sentences of a document into a whole document again (only for the SENTEC and PARAGRAPH)
    if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document
        gold_same_document_keyphrases = []  # save the gold keyphrases that are from the same document (only for the SENTEC and PARAGRAPH)
        gold_extraction_same_document_keyphrases = []  # save the gold keyphrases that are from the same document - extraction (only for the SENTEC and PARAGRAPH)
        pred_same_document_keyphrases = []  # save the pred keyphrases that are from the same document (only for the SENTEC and PARAGRAPH)

        for doc_index, doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            # y gold set
            gold_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y gold set - extraction
            gold_document_keyphrases_extraction = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y predicted
            pred_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])

            for tokenized_keyphrase in doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied


                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION

            for tokenized_keyphrase in pred_keyphrase_list[doc_index]:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING

            # check if the previous sentence is in the same document (has the same document id) as the current
            if doc_index == 0:
                # print('we are in the 1st document')
            elif assembl_docs[doc_index] == assembl_docs[doc_index - 1]:
                # print('we are in the same document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
            else:  # different documents
                # print('CHANGED document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
                # save keyphrases for the previous document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once

                # create the new document keyphrase set
                gold_same_document_keyphrases = gold_document_keyphrases
                gold_extraction_same_document_keyphrases = gold_document_keyphrases_extraction
                pred_same_document_keyphrases = pred_document_keyphrases

            # save the keyphrases for the last document
            if (doc_index + 2) > len(pred_keyphrase_list):  # (+2 because counting starts from 0 and we want the next element as well)
                # save keyphrases for the current document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once

        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)

    else:  # for the full-text documents
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        for doc_index, test_doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            extraction_document_keyphrases = []  # save the keyphrases that exist in text (extraction) of a document as strings (each keyphrase -> string)
            document_keyphrases = []  # save all keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])

            for tokenized_keyphrase in test_doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied


                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION
                #            print(document_keyphrases)
            y_test_set.append(set(document_keyphrases))  # get each keyphrase just once
            y_test_set_extraction.append(set(extraction_document_keyphrases))  # get each keyphrase just once

        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)

        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document
        for doc in pred_keyphrase_list:  # get the set of PREDICTED keyphrases for each document
            document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            for tokenized_keyphrase in doc:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING
            pred_keyphrase_list_set.append(set(document_keyphrases))  # get each keyphrase just once

    # print y_test and y_pred

    #for i in range(len(pred_keyphrase_list_set)):
    for i in range(10):
        print('pred', pred_keyphrase_list_set[i])
        print('test', y_test_set[i])
        print('extraction test', y_test_set_extraction[i])

    # ======================================================================================================================
    # Exact Match - Model Evaluation
    # ======================================================================================================================

    # Exact Match: the keyphrases must be given as whole strings

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Exact Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set, pred_keyphrase_list_set, 'Exact Match')

    # ======================================================================================================================
    # NEW METHOD - Semi-Exact Match - Model Evaluation
    # ======================================================================================================================

    # extraction - only GOLD KPs existing in text
    calculate_semi_exact_match_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Semi-exact Match - Extraction')
    # all GOLD KPs
    calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, 'Semi-exact Match')

    # ======================================================================================================================
    # Partial Match - Model Evaluation
    # ======================================================================================================================

    # Partial Match: the keyphrases must be given as a set of words

    # Get the sets of all gold keyphrases
    y_test_set_partial = []
    for doc in y_test_set:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:

    # Get the sets of all gold keyphrases existing in text (extraction)
    y_test_set_partial_extraction = []
    for doc in y_test_set_extraction:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:

    # Get the sets of all predicted keyphrases
    pred_keyphrase_list_set_partial = []
    for doc in pred_keyphrase_list_set:  # get the set of PREDICTED keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_partial_extraction, pred_keyphrase_list_set_partial, 'Partial Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set_partial, pred_keyphrase_list_set_partial, 'Partial Match')
def semeval_summarized_statistics():
    """Count the SemEval-2010 gold keyphrases that occur in the summarized text.

    Reads the summarized SemEval-2010 CSV (the ``title``, ``abstract`` and
    ``keyword`` columns are used), joins title and abstract into one text,
    pre-processes the text and the ';'-separated keyphrases, and prints how
    many keyphrases are found in their document's text, the total number of
    keyphrases, and the ratio of the two.

    Returns:
        int: the number of gold keyphrases found (``in``) in the
        pre-processed text of their own document.
    """
    # TEST data to evaluate the final model
    file = '..\\data\\benchmark_data\\summarization_experiment\\SemEval-2010_summarized.csv'

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    digit_only = re.compile(r'^\d+$')
    # One stemmer for the whole run instead of a new instance per token.
    stemmer = Stemmer('porter')

    def replace_digit_tokens(tokens):
        """Replace every pure-digit token with the DIGIT_REPL placeholder."""
        return [
            tok if not digit_only.match(tok) else 'DIGIT_REPL'
            for tok in tokens
        ]

    def stem_keyphrases(list_of_keyphrases):
        """Return the non-empty keyphrases as stemmed, lower-cased strings."""
        # NOTE(review): the stemming expression and the append were missing
        # from this function's text; they are restored from the parallel
        # keyphrase loop that uses Stemmer('porter').stem(keyword.lower())
        # -- confirm against the original project.
        keyphrases_list = []
        for keyphrase in list_of_keyphrases:
            keyphrase = keyphrase.strip()  # remove whitespaces
            if not keyphrase:  # nothing left to tokenize
                continue
            tokens = replace_digit_tokens(word_tokenize(keyphrase))
            tokens = [stemmer.stem(keyword.lower()) for keyword in tokens]
            keyphrases_list.append(' '.join(tokens))
        return keyphrases_list

    # ======================================================================================================================
    # Read data
    # ======================================================================================================================

    data = pd.read_csv(file, encoding="utf8")

    # ======================================================================================================================
    # Split keyphrases list of keyphrases from string that contains all the keyphrases
    # ======================================================================================================================

    # split keywords to separate them from one another
    data['keyword'] = data['keyword'].apply(
        lambda keywords: keywords.split(';'))

    # ======================================================================================================================
    # Combine the title and the abstract (+ remove '\n')
    # ======================================================================================================================

    data['abstract'] = [
        (title + ' ' + abstract).replace('\n', ' ')
        for title, abstract in zip(data['title'], data['abstract'])
    ]

    # ======================================================================================================================
    # Remove Contractions (pre-processing)
    # ======================================================================================================================

    # substitute contractions with full words
    data['abstract'] = data['abstract'].apply(replace_contractions)
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyphrases:
        [replace_contractions(keyphrase) for keyphrase in set_of_keyphrases])

    # ======================================================================================================================
    # Remove punctuation (with whitespace) + digits (from ABSTRACT) + clean empty strings
    # ======================================================================================================================

    # remove parenthesis, brackets and their contents
    data['abstract'] = data['abstract'].apply(remove_brackets_and_contents)

    # remove references of publications (in document text)
    data['abstract'] = data['abstract'].apply(remove_references)

    # remove punctuation
    data['abstract'] = data['abstract'].apply(remove_punct_and_non_ascii)
    data['keyword'] = data['keyword'].apply(keyword_remove_punct_and_non_ascii)

    # Replace the pure digit terms with DIGIT_REPL
    data['abstract'] = data['abstract'].apply(
        lambda text: " ".join(replace_digit_tokens(text.split())))
    print('convert digits - abstract finish')

    # remove rows whose text is empty after stripping
    data = data[data['abstract'].str.strip().astype(bool)]
    data = data.reset_index(drop=True)

    # remove empty keyphrases
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyws:
        [key_text for key_text in set_of_keyws if key_text.strip()])
    # remove rows with empty keyphrases; re-number the rows so positions and
    # labels agree again after the filter
    data = data[data['keyword'].map(len) > 0]
    data = data.reset_index(drop=True)

    # ======================================================================================================================
    # Tokenize each sentence + remove digits (from KEYPHRASES)
    # ======================================================================================================================

    # tokenize text
    data['abstract'] = data['abstract'].apply(tokenize_lowercase)
    print('tokenization - abstract finish')

    # stem, tokenize and lower case keyphrases and keep them categorized by document
    data['keyword'] = data['keyword'].apply(stem_keyphrases)

    # ======================================================================================================================
    # Count logistics
    # ======================================================================================================================

    semeval_keywords_in_summary = 0  # the count of keywords in abstract
    semeval_total_keywords = 0  # the count of all keywords
    # Pair the columns positionally; a label lookup with the enumerate
    # counter breaks as soon as a row has been filtered out.
    for keywords, abstract in zip(data['keyword'], data['abstract']):
        semeval_total_keywords += len(keywords)
        semeval_keywords_in_summary += sum(
            1 for keyword in keywords if keyword in abstract)

    print('SemEval summarized: ', semeval_keywords_in_summary)
    print('SemEval summarized - total keyphrases: ', semeval_total_keywords)

    # Avoid ZeroDivisionError when no keyphrase survived the filtering.
    if semeval_total_keywords:
        ratio = semeval_keywords_in_summary / semeval_total_keywords
    else:
        ratio = 0.0
    print('SemEval summarized - count of keywords in abstract: ', ratio)

    return semeval_keywords_in_summary
Exemplo n.º 17
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import json
from nltk.stem.snowball import SnowballStemmer as Stemmer

# Maps each document id to its list of gold keyphrases; every keyphrase is a
# one-element list holding the stemmed, lower-cased phrase.
references = {}

# Build the stemmer once instead of once per word.
porter_stemmer = Stemmer('porter')

# sys.argv[1]: directory holding the *.key files (one keyphrase per line)
# sys.argv[2]: path of the JSON file to write
for input_file in glob.glob(sys.argv[1] + '/*.key'):
    # os.path.basename copes with the platform's path separator, unlike
    # splitting on '/'; the id is the file name up to its first dot.
    file_id = os.path.basename(input_file).split('.')[0]
    keyphrases = []
    with open(input_file, 'r') as f:
        for line in f:
            words = line.strip().split()
            if not words:
                # A blank line would otherwise add an empty keyphrase [''].
                continue
            stems = [porter_stemmer.stem(w.lower()) for w in words]
            keyphrases.append([' '.join(stems)])
    references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
Exemplo n.º 18
# Tokenize, stem and lower-case the keyphrases of every document; each
# keyphrase becomes a list of stems and the lists stay grouped per document.
# NOTE(review): this snippet was truncated (an unterminated comprehension and
# a dangling "])"); the closing bracket and the
# "keyphrases_list.append([" head are reconstructed -- confirm against the
# original project.
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # skip keyphrases that are empty after stripping
            tokens = word_tokenize(keyphrase)  # tokenize
            # Replace the pure digit terms with DIGIT_REPL
            # (raw string: '\d' is an invalid escape in a plain literal)
            tokens = [
                tok if not re.match(r'^\d+$', tok) else 'DIGIT_REPL'
                for tok in tokens
            ]
            keyphrases_list.append([
                Stemmer('porter').stem(keyword.lower()) for keyword in tokens
            ])  # stem + lower case
    data['keyword'].iat[index] = keyphrases_list
#    print('THESE ARE THE KEYPHRASE LIST', len(keyphrases_list), keyphrases_list)

# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

                        index=False)  # save the preprocessed document text
                       index=False)  # save the preprocessed keyphrases

# ======================================================================================================================
# Give labels to each word of Abstract (fulltext) - keyword (KP) or Non-keyword (Non-KP)
Exemplo n.º 19
        doc_id = file_id.split('/')[-1][:-5]

        # print('Loading {}'.format(doc_id))

        with open(file_id, 'r') as f:
            lines = f.readlines()
            tags[doc_id].update([l.lower().strip() for l in lines])

# Maps each document id to its reference keyphrases.
references = {}

# NOTE(review): this snippet was truncated (empty "if" body, missing "else:"
# and closing bracket); the append and the else branch are reconstructed from
# the surrounding statements -- confirm against the original project.
for doc_id in tags:

    # group tags by stem; a tag is repeated once per occurrence counted in
    # tags[doc_id]
    stem_to_tag = collections.defaultdict(list)
    for tag in tags[doc_id]:
        stem = [Stemmer('porter').stem(w) for w in tag.split()]
        stem_to_tag[' '.join(stem)].extend([tag] * tags[doc_id][tag])

    # keep only the stems that were collected more than once
    valid_tags = [tag for tag in stem_to_tag if len(stem_to_tag[tag]) > 1]

    if len(valid_tags):
        if sys.argv[3] == 'stem':
            # one single-element list per stemmed form
            references[doc_id] = [[t] for t in valid_tags]
        else:
            # the distinct surface forms that share each stem
            references[doc_id] = [
                list(set(stem_to_tag[t])) for t in valid_tags
            ]
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import codecs
import json
from nltk.stem.snowball import SnowballStemmer as Stemmer

# Maps each document id to its list of gold keyphrases; every keyphrase is a
# one-element list holding either the stemmed or the lower-cased phrase.
references = {}

# Build the stemmer once instead of once per word.
portuguese_stemmer = Stemmer('portuguese')

# sys.argv[1]: directory holding the *.key files (one keyphrase per line)
# sys.argv[2]: path of the JSON file to write
# sys.argv[3]: "stem" to store stemmed keyphrases; anything else (or a
#              missing argument) stores the lower-cased surface forms
use_stems = len(sys.argv) > 3 and sys.argv[3] == "stem"

for input_file in glob.glob(sys.argv[1] + '/*.key'):
    # File name without its last extension; os.path.basename copes with the
    # platform's path separator, unlike splitting on '/'.
    file_id = os.path.basename(input_file).rsplit('.', 1)[0]
    keyphrases = []
    with codecs.open(input_file, 'r', 'iso-8859-1') as f:
        for line in f:
            words = line.strip().split()
            if not words:
                # A blank line would otherwise add an empty keyphrase [''].
                continue
            if use_stems:
                stems = [portuguese_stemmer.stem(w.lower()) for w in words]
                keyphrases.append([' '.join(stems)])
            else:
                # Without this branch "stem" mode stored both forms and every
                # other mode stored nothing.
                keyphrases.append([' '.join([w.lower() for w in words])])
    references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
Exemplo n.º 21
import argparse
import nltk
import math
import re
import random
import scipy
import pickle
import sys

from collections import Counter
from nltk.stem.snowball import SnowballStemmer as Stemmer
from scipy.special import expit
# from stemming.porter2 import stem

# Module-level state.

# Matches any character that is neither an ASCII letter nor a space.
regex = re.compile('[^a-zA-Z ]')

# Snowball stemmer configured for English.
stemmer = Stemmer("english")

# Start empty / at zero when the module is loaded.
dictionary_counts = {}
dictionary_indices = {}
total_documents = 0

def sigmoid(x):
    """Return the logistic sigmoid of ``x``, computed by ``scipy.special.expit``."""
    activation = expit(x)
    return activation

def sigmoid_derivative(x):
    """Return ``sigmoid(x) * (1 - sigmoid(x))``, the sigmoid's derivative at ``x``."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

Exemplo n.º 22
# Gold keyphrases per document id: lower-cased surface forms and their
# Porter-stemmed counterparts. Every keyphrase is a one-element list.
references = {}
stemmed_references = {}

# Build the stemmer once instead of once per word.
porter_stemmer = Stemmer('porter')

# sys.argv[1]: input file with one JSON document per line
# sys.argv[2]: directory receiving one <file_id>.txt per document
# sys.argv[3]: output JSON for the surface-form references
# sys.argv[4]: output JSON for the stemmed references
with open(sys.argv[1], 'r') as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for file_number, line in enumerate(f):
        if not line.strip():
            # json.loads raises on a blank line (e.g. a trailing newline).
            continue
        document = json.loads(line)
        file_id = '{0:05d}'.format(file_number)  # zero-padded line number
        output_file = sys.argv[2] + '/{}.txt'.format(file_id)

        logging.info("writting file %s", output_file)
        with codecs.open(output_file, 'w', 'utf-8') as o:
            o.write(document['title'] + "\n\n")

        references[file_id] = []
        stemmed_references[file_id] = []

        for keyphrase in document['keyword'].split(';'):
            words = keyphrase.lower().strip().split()
            if not words:
                # A trailing or doubled ';' would otherwise add an empty
                # keyphrase [''] to both reference sets.
                continue
            stems = [porter_stemmer.stem(w) for w in words]
            references[file_id].append([' '.join(words)])
            stemmed_references[file_id].append([' '.join(stems)])

with open(sys.argv[3], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)

with open(sys.argv[4], 'w') as o:
    json.dump(stemmed_references, o, sort_keys=True, indent=4)

# ======================================================================================================================
# Format keyphrases and retrieve document text
# ======================================================================================================================

# Per-document containers filled while the documents are processed.
list_of_document_title = []  # save the title of documents
list_of_document_abstract = []  # save the abstract of documents
list_of_document_text = []  # save the body of documents
# The document loop below appends to gold_keyphrases, so it has to exist; the
# initialisation had been commented out.
# NOTE(review): drop this line if gold_keyphrases is already populated
# earlier in the original script.
gold_keyphrases = []  # save the gold keyphrases of documents
pred_keyphrases = []  # save the predicted keyphrases of documents
for indx, abstract_document in enumerate(data['abstract']):
    # print('train_test_combined/' + key + '.xml')
    # print(keyphrases_dictionary[key])

    gold_keyphrases.append([[Stemmer('porter').stem(keyword) for keyword in keyphrase.split()] for keyphrase in data['keyword'][indx].split(';')])  # split gold keywords to separate them from one another

# ======================================================================================================================
# MultipartiteRank Extractor
# ======================================================================================================================

    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()

    # 2. load the content of the document.

    # 3. select the longest sequences of nouns and adjectives, that do