示例#1
0
def getArticles(articleList):
    """Build per-article 'single set' dicts from (year, title) pairs.

    For each article the title is chunked and tagged with Stanford Open IE;
    titles that yield no tags are skipped.  The tagged subject is used to
    fetch a Wikipedia article, whose sentences are filtered down to those
    that mention a date.

    Args:
        articleList: iterable of (year, title) pairs.

    Returns:
        list of dicts {'title', 'sentences', 'year'}, one per article that
        made it through the whole pipeline; failing articles are skipped.
    """
    singleSets = []
    for article in articleList:
        try:
            chunks = gc.getChunks(article[1])
            tags = tag.getTags(article[1], chunks)
            if tags == []:
                continue  # no Open IE triples for this title -> skip it
            # The Stanford Open IE tags.  `relation`/`objects` are unused
            # here but the lookups validate the tag dict's shape (a missing
            # key raises and the article is skipped, as before).
            subject = tags['subject']
            relation = tags['relation']
            objects = tags['object'].split()

            content = wp.getArticle(subject)
            rawSentences = sent.getSentences(content)
            sentences = []
            for sentence in rawSentences:
                # Keep only sentences containing a date.
                # FIX: original had `!==` (JavaScript), a SyntaxError in Python.
                if hd.hasDate(sentence) != []:
                    sentences.append(sentence)
            # NOTE(review): listOfYears is assumed to be a module-level list
            # defined elsewhere in this file — confirm it exists; if it does
            # not, the NameError silently skips every article.
            listOfYears.append(article[0])
            SS = {'title': article[1], 'sentences': sentences, 'year': article[0]}
            singleSets.append(SS)
        except Exception:
            # Best-effort pipeline: skip any article whose processing fails.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            continue
    return singleSets
示例#2
0
def getArticle(article):
    """Build a 'single set' dict for one (year, title) article.

    Tags the title with Stanford Open IE, preferring the LAST triple and
    falling back to the FIRST if any step of processing the last one fails
    (original behavior).  The triple's subject selects a Wikipedia article;
    its sentences are kept when they contain any object word or any lexeme
    of the (lemmatized) relation.

    Args:
        article: (year, title) pair.

    Returns:
        dict {'title', 'sentences', 'year'} on success, or None if any
        stage of the pipeline fails.
    """
    try:
        tags = tag.getTags(article[1])
        # Prefer the last Open IE triple; on any failure retry with the
        # first.  (Deduplicates the two verbatim copies of this logic that
        # the original carried in try/except branches.)
        try:
            subject, relations, objects, content = _resolveTriple(tags[-1])
        except Exception:
            subject, relations, objects, content = _resolveTriple(tags[0])

        rawSentences = nltk.tokenize.sent_tokenize(content)
        sentences = []
        for sentence in rawSentences:
            # A sentence matching several words is appended several times;
            # the set() below deduplicates, as in the original.
            for word in objects:
                if word in sentence:
                    sentences.append(sentence)
            for word in relations:
                if word in sentence:
                    sentences.append(sentence)

        sentences = list(set(sentences))
        return {'title': article[1], 'sentences': sentences, 'year': article[0]}
    except Exception:
        # Narrowed from a bare `except:`; any pipeline failure yields None.
        return None


def _resolveTriple(tagEntry):
    """Expand one Open IE tag dict into (subject, relations, objects, content).

    relations = all lexemes of the lemmatized relation; objects = the object
    string split on single spaces; content = the Wikipedia article for the
    subject.  Raises whatever the underlying libraries raise.
    """
    subject = tagEntry['subject']
    relation = tagEntry['relation']
    objects = tagEntry['object'].split(' ')
    lemma = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(relation)
    relations = en.lexeme(lemma)
    content = wp.getArticle(subject)
    return subject, relations, objects, content
示例#3
0
import app.analytics.tag as tag
import app.parser.articleRetrieval.wikipediaParse as wp
import app.parser.sentences as sent
import app.analytics.sentenceFiltering.actionSentences as action
import app.analytics.functions.hasDate as hd
import app.analytics.functions.synonym as sn
import app.analytics.getFeatures as ft

articles = importArticles.getData()

sentences = []
count = 0
# Process only the first 10 articles (debugging slice, as in the original).
for article in articles[0:10]:
    # print() call form: with a single argument this prints identically on
    # Python 2 and 3 (original used Python-2-only `print x` statements).
    print(article)
    chunks = gc.getChunks(article[1])
    tags = tag.getTags(article[1], chunks)
    if tags == []:
        continue  # no Open IE triples for this title -> next article
    # The Stanford Open IE tags.
    subject = tags['subject']
    relation = tags['relation']
    objects = tags['object'].split()
    print(objects)
    print(relation)
    print(subject)

    # FIX: original rebound `article`, shadowing the loop variable; renamed
    # to `content` (the loop variable is not used again this iteration, so
    # behavior is unchanged).
    content = wp.getArticle(subject)
    sentences = sent.getSentences(content)

    features = ft.getFeatures(subject, objects, relation, sentences)