Пример #1
0
def main():
    '''
    Compare two summarization approaches over the 'textos-fonte' collection.

    For every file in the directory, a summary is produced with each of the
    two vectorizers and handed to compareApproaches(); one line of
    statistics is printed per file, followed by the averaged score of each
    approach.
    '''
    source_dir = 'textos-fonte'

    # initialize custom vectorizers with all documents collection
    vectorizer1 = CustomVectorizer(input='fromfiles',
                                   stopwords=stopwords,
                                   encoding='iso-8859-1')
    vectorizer2 = CustomVectorizer(input='fromfiles',
                                   stopwords=stopwords,
                                   encoding='iso-8859-1',
                                   bigrams=True,
                                   nphrases=True)

    # List the directory once so that fitting, the per-file loop and the
    # final average all work on the same set of files.
    filenames = listdir(source_dir)
    documents = [source_dir + '/' + d for d in filenames]
    vectorizer1.fit(documents)
    # NOTE(review): only vectorizer1 is fitted on the collection here;
    # vectorizer2 is presumably fitted inside summary() -- confirm.
    vectorizer1._input = 'content'
    vectorizer2._input = 'content'

    # print all statistics
    MAP1 = 0
    MAP2 = 0
    print(
        'File\t\t\tPrecision Alternative\tRecall Alternative\tF1 Alternative\tPrecision Improved\tRecall Improved\tF1 Improved'
    )
    for doc in filenames:
        path = source_dir + '/' + doc
        # stats is indexed 0..6 and concatenated with '+', so every entry
        # is expected to be a string (file label followed by six scores).
        stats = compareApproaches(doc, summary(vectorizer1, path, True),
                                  summary(vectorizer2, path))
        MAP1 += float(stats[1])
        MAP2 += float(stats[4])
        print(stats[0] + '\t\t' + stats[1] + '\t\t' + stats[2] + '\t' +
              stats[3] + '\t' + stats[4] + '\t\t' + stats[5] + '\t\t' +
              stats[6])

    # Average over the number of documents actually processed rather than
    # assuming the collection always contains exactly 100 files.
    total = len(filenames)
    mean1 = MAP1 / total if total else 0.0
    mean2 = MAP2 / total if total else 0.0
    print('\nMAP Score for Alternative Approach: ' + str(mean1))
    print('\nMAP Score for Improved Approach: ' + str(mean2))
def summarizeDocument(file):
    '''
    Build an extractive summary of the document stored at `file`.

    The text is split into sentences, every sentence and the whole document
    are mapped into the same tf-idf space, and the (at most) five sentences
    most similar to the document are selected.

    Returns the selected sentences as a list, in the order in which they
    appear in the document.
    '''
    # Parse the file for sentences. The context manager closes the handle
    # even if reading fails ('utf' is a Python codec alias for UTF-8).
    with io.open(file, 'r', encoding='utf') as handle:
        doc = handle.read()
    doc = preprocessor(doc)
    sentences = sent_tokenize(doc)

    # Compute vector space representations of every sentence. Each sentence
    # is treated as a document, so sentence frequencies are used.
    #
    # NOTE(review), kept from the original author: the tf-idf values
    # computed by the vectorizer may not match what is requested. The
    # documentation consulted states that tf is the raw, non-normalized
    # count of each word per doc/sentence and that 1 is added to all idf
    # values. To meet the requirements, each term's tf would have to be
    # divided by the maximal tf in that doc/sentence, possibly starting
    # from a plain count vectorizer. Confirm whether this still applies to
    # CustomVectorizer.
    vectorizer = CustomVectorizer(input='content',
                                  stopwords=list(stopwords.words('english')))
    vectorizer.fit(sentences)
    vectors = vectorizer.transform_tfidf(sentences)

    # Treat the whole document as a single "sentence" and model it in the
    # same feature space as the individual sentences.
    docVector = vectorizer.transform_tfidf([doc])

    # Similarity of every sentence vector to the document vector.
    sim = [similarity(vector, docVector[0]) for vector in vectors]

    # Keep the five highest-scoring (index, score) pairs, then sort them
    # again so the pairs are back in document order (indices are unique, so
    # the scores never take part in that second comparison).
    summary = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    summary.sort()

    # Return the list of selected sentences.
    return [sentences[index] for index, _ in summary]
Пример #3
0
def main():
    '''
    Evaluate the summarizer over the 'textos-fonte' collection.

    The vectorizer is fitted on every file of the directory; then, for each
    file, a summary is produced and handed to calculateStats(). One line of
    statistics is printed per file, followed by the averaged score.
    '''
    source_dir = 'textos-fonte'

    # initialize custom vectorizer with all documents collection
    vectorizer = CustomVectorizer(input='fromfiles',
                                  stopwords=stopwords,
                                  encoding='iso-8859-1')

    # List the directory once so that fitting, the per-file loop and the
    # final average all work on the same set of files.
    filenames = listdir(source_dir)
    documents = [source_dir + '/' + d for d in filenames]
    vectorizer.fit(documents)
    vectorizer._input = 'content'

    # print all statistics
    MAP = 0
    print('File\t\t\tPrecision\tRecall\t\tF1 Score')
    for doc in filenames:
        path = source_dir + '/' + doc
        # stats entries are concatenated with '+', so they are expected to
        # be strings; the first one is the value accumulated into MAP.
        stats = calculateStats(path, summary(vectorizer, path))
        MAP += float(stats[0])
        print(doc + '\t\t' + stats[0] + '\t' + stats[1] + '\t' + stats[2])

    # Average over the number of documents actually processed rather than
    # assuming the collection always contains exactly 100 files.
    total = len(filenames)
    mean = MAP / total if total else 0.0
    print('\nMAP Score: ' + str(mean))
def extract_features(source):
    '''
    Compute per-sentence features for the text `source`.

    Returns a 4-tuple (sentences, positions, similarities, noun_counts):
      - sentences: the filtered sentences of `source`;
      - positions: the 1-based position of every sentence;
      - similarities: similarity between every sentence's tf-idf vector
        and the tf-idf vector of the whole text;
      - noun_counts: how many tokens of every sentence carry a noun tag.
    '''
    # Part-of-speech tags that are counted under each label.
    tags_by_label = {
        'nouns': ('N', 'NPROP', 'PROPESS'),
        'verbs': ('V', 'VAUX', 'ADV', 'ADV-KS', 'ADV-KS-REL'),
        'adjectives': ('ADJ',),
    }

    def count_tags(tags, label):
        # Number of entries in `tags` that belong to `label`; an unknown
        # label counts nothing.
        frequencies = Counter(tags)
        return sum(frequencies[tag] for tag in tags_by_label.get(label, ()))

    sentences = filter_list(sent_tokenize(source))

    positions = list(range(1, len(sentences) + 1))

    # The whole document must be split into sentences before fitting.
    # TODO -> filter
    vectorizer = CustomVectorizer()
    vectorizer.fit(sentences)

    source_vector = vectorizer.transform_tfidf([source])[0]
    sentence_vectors = vectorizer.transform_tfidf(sentences)

    similarities = []
    for sentence_vector in sentence_vectors:
        similarities.append(similarity(sentence_vector, source_vector))

    tagger = customtagger.load_tagger()

    noun_counts = []
    for sentence in sentences:
        tagged_tokens = tagger.tag(filter_list(word_tokenize(sentence)))
        # Every tagged token is indexable; position 1 holds its tag.
        sentence_tags = [pair[1] for pair in tagged_tokens]
        noun_counts.append(count_tags(sentence_tags, 'nouns'))

    # Verb and adjective counts are supported by count_tags() but are
    # currently not part of the returned features.
    return sentences, positions, similarities, noun_counts
    "http://www.latimes.com/world/rss2.0.xml", file)

# Close the handle that was bound to `file` earlier in the script.
file.close()

print('(done)')

print('\nSummarizing...')

# Read the news text back; the context manager closes the handle even if
# reading fails.
with open('worldnews.txt', encoding='utf-8') as file:
    text = file.read()

sents = sent_tokenize(text)

# NOTE(review): stopwords.words() is called without a language; if this is
# NLTK's stopwords corpus, that yields the stop words of every language --
# confirm this is intended.
vectorizer = CustomVectorizer(stopwords=stopwords.words())

vectorizer.fit(sents)
vecs = vectorizer.transform_tfidf(sents)

# Undirected sentence graph as adjacency lists keyed by sentence index:
# two sentences are linked when their similarity exceeds the threshold.
sentence_count = len(vecs)
graph = {i: [] for i in range(sentence_count)}

threshold = 0.1
for i in range(sentence_count):
    # Only pairs with j > i are visited; the edge is recorded on both ends.
    for j in range(i + 1, sentence_count):
        if similarity(vecs[i], vecs[j]) > threshold:
            graph[i].append(j)
            graph[j].append(i)

# Drop any duplicated neighbours.
graph = {k: list(set(neighbours)) for k, neighbours in graph.items()}
Пример #6
0
        source_file = open(TeMario_originals + file,
                           'r',
                           encoding='iso-8859-1')
        target_file = open(TeMario_summaries + 'Ext-' + file,
                           'r',
                           encoding='iso-8859-1')

        source = pre_process(source_file.read())
        target = pre_process(target_file.read())

        source_file.close()
        target_file.close()

        sents = filter_list(sent_tokenize(source))

        vectorizer = CustomVectorizer()

        vectorizer.fit(sents)

        vecs = vectorizer.transform_tfidf(sents)
        source_score = vectorizer.transform_tfidf([source])[0]

        graph = defaultdict(lambda: [])

        weights_tfidf = defaultdict(lambda: {})
        weights_alternative = []  # TODO

        # Build graph
        threshold = 0.1
        for i, v1 in enumerate(vecs):
            for j, v2 in enumerate(vecs[i + 1:], start=i + 1):

if __name__ == '__main__':

    print('\nTesting adapted PageRank algorithm for sentence ranking and consequent text summarization.\n' +
          'A graph is built linking sentences with similarity bigger than a certain threshold.\n' +
          'This method is tested and evaluated on the "catalunha.txt" file, with a 0.1 threshold.\n')

    # Read the test document; the context manager closes the handle even if
    # reading or pre-processing fails.
    with open('catalunha.txt', encoding='utf-8') as file:
        source = pre_process(file.read())

    sents = filter_list(sent_tokenize(source))

    # NOTE(review): stopwords.words() is called without a language; if this
    # is NLTK's stopwords corpus, that yields the stop words of every
    # language -- confirm this is intended.
    vectorizer = CustomVectorizer(stopwords=stopwords.words())

    # Open question from the original author: fit on the sentences or on
    # the whole text?
    vectorizer.fit(sents)
    vecs = vectorizer.transform_tfidf(sents)

    # Undirected sentence graph as adjacency lists keyed by sentence index:
    # two sentences are linked when their similarity exceeds the threshold.
    sentence_count = len(vecs)
    graph = {i: [] for i in range(sentence_count)}

    threshold = 0.1
    for i in range(sentence_count):
        # Only pairs with j > i are visited; the edge is recorded on both
        # ends.
        for j in range(i + 1, sentence_count):
            if similarity(vecs[i], vecs[j]) > threshold:
                graph[i].append(j)
                graph[j].append(i)

    # Drop any duplicated neighbours.
    graph = {k: list(set(neighbours)) for k, neighbours in graph.items()}