Example #1
 def getWords(self, text):
     lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
     lemmas.createLemmaText()
     lemmaText = lemmas.cleanText
     words = []
     if lemmaText and lemmaText != " ":
         lemmas.createLemmas()
         for w in lemmas.wordList:
             word = {}
             word['word'] = w.word
             word['tf'] = w.tf
             word['count'] = w.count
             word['pos'] = w.wtype
             words.append(word)
     return words
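
The snippets on this page all drive LemmatizeText through the same sequence of calls. For orientation, here is a minimal usage sketch; the import path and the sample sentence are assumptions, while the method and attribute names (createLemmaText, cleanText, createLemmas, wordList with word/tf/count/wtype) are taken from the examples themselves.

# Minimal usage sketch; the module path and sample text are hypothetical,
# the "FR"/"EN" language codes mirror the examples above.
from LemmatizeText import LemmatizeText

lemmas = LemmatizeText("Les chats dorment sur le canape", "FR")
lemmas.createLemmaText()           # build the lemmatized text
print lemmas.cleanText             # lemmatized string (may be "" or " ")

lemmas.createLemmas()              # build per-word statistics
for w in lemmas.wordList:
    # each entry carries the lemma, its term frequency, raw count and POS tag
    print w.word, w.tf, w.count, w.wtype
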
Example #2
def analysis_dashboard_page2():
    keywords = request.form['keyword']
    date = request.form['date']
    checked_genders = request.form.getlist('gender')
    checked_ages = request.form.getlist('age')
    print date, keywords, checked_genders, checked_ages
    lem = LemmatizeText(keywords)
    lem.createLemmaText()
    lem.createLemmas()
    wordList = []
    for word in lem.wordList:
        """
            If you want to use a regex,
            This example will construct a regex that contains the lemma
            similar in SQL to -> where word like '%f**k%'
        """
        #regex = re.compile(word.word, re.IGNORECASE)
        #wordList.append(regex)
        """
            this one will find only the tweets with the matching word
        """
        wordList.append(word.word)
    global query
    query = {}
    global query_pretty
    query_pretty = ""
    if wordList:
        query_pretty += "Keyword filter: "+' '.join(wordList)+"<br/>"
        query["words.word"] = { "$in": wordList }
    if date:
        query_pretty += "Date filter: "+date+"<br/>"
        start, end = date.split(" ") 
        query["date"] = { "$gt": start, "$lte": end }
    if checked_ages and 0 < len(checked_ages) < 6:
        query_pretty += "Age filter: "+' '.join(checked_ages)+"<br/>"
        query["age"] = { "$in": checked_ages }
    if checked_genders and len(checked_genders) == 1:
        query_pretty += "Gender filter: "+' '.join(checked_genders)+"<br/>"
        query["gender"] = checked_genders[0]
    if query:
        vocab = VocabularyIndex(dbname)
        vocab.createIndex(query)
    tweetCount = getTweetCount()
    return render_template('analysis.html', tweetCount=tweetCount, dates=date, keywords=' '.join(wordList))  
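
The filter assembled above uses MongoDB operators ($in, $gt, $lte); VocabularyIndex, getTweetCount and dbname belong to the surrounding project and are not shown here. As a hedged illustration only, assuming the tweets are stored in a MongoDB collection accessed through pymongo, a dict of this shape would be consumed roughly like this (database, collection and field values below are invented):

# Hypothetical standalone use of a filter dict shaped like the one above.
from pymongo import MongoClient

client = MongoClient()
tweets = client['tweetDB']['tweets']                      # hypothetical names

query = {
    "words.word": {"$in": ["chat", "chien"]},             # keyword filter
    "date": {"$gt": "2015-01-01", "$lte": "2015-12-31"},  # date range
    "age": {"$in": ["18-24", "25-34"]},                   # age buckets
    "gender": "male",
}
for tweet in tweets.find(query).limit(5):
    print tweet['_id'], tweet.get('lemmaText', '')
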
Example #3
 def getWords(self, text):
     lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
     lemmas.createLemmaText()
     lemmaText = lemmas.cleanText
     words = []
     if lemmaText and lemmaText != " ":
         lemmas.createLemmas()
         for w in lemmas.wordList:
             word = {}
             word['word'] = w.word
             word['tf'] = w.tf
             word['count'] = w.count
             word['pos'] = w.wtype
             words.append(word)
     return words
Example #4
def analysis_dashboard_page2():
    keywords = request.form['keyword']
    date = request.form['date']
    checked_genders = request.form.getlist('gender')
    checked_ages = request.form.getlist('age')
    print date, keywords, checked_genders, checked_ages
    lem = LemmatizeText(keywords)
    lem.createLemmaText()
    lem.createLemmas()
    wordList = []
    for word in lem.wordList:
        """
            If you want to use a regex,
            This example will construct a regex that contains the lemma
            similar in SQL to -> where word like '%f**k%'
        """
        #regex = re.compile(word.word, re.IGNORECASE)
        #wordList.append(regex)
        """
            this one will find only the tweets with the matching word
        """
        wordList.append(word.word)
    global query
    query = {}
    global query_pretty
    query_pretty = ""
    if wordList:
        query_pretty += "Keyword filter: " + ' '.join(wordList) + "<br/>"
        query["words.word"] = {"$in": wordList}
    if date:
        query_pretty += "Date filter: " + date + "<br/>"
        start, end = date.split(" ")
        query["date"] = {"$gt": start, "$lte": end}
    if checked_ages and 0 < len(checked_ages) < 6:
        query_pretty += "Age filter: " + ' '.join(checked_ages) + "<br/>"
        query["age"] = {"$in": checked_ages}
    if checked_genders and len(checked_genders) == 1:
        query_pretty += "Gender filter: " + ' '.join(checked_genders) + "<br/>"
        query["gender"] = checked_genders[0]
    if query:
        vocab = VocabularyIndex(dbname)
        vocab.createIndex(query)
    tweetCount = getTweetCount()
    return render_template('analysis.html',
                           tweetCount=tweetCount,
                           dates=date,
                           keywords=' '.join(wordList))
Example #5
    return lemmas


header, corpus = readCSV('RNTI_articles_export_fixed1347_ids.txt')

print header
idx = 0
for line in corpus:
    # language title
    if line[9] == 'fr':
        filename = 'texts/' + str(line[8]) + 'title'
        writeFile(filename, line[3])
        pos_title = extractPOS(filename)
        lemma_title = splitPos(pos_title)
    elif line[9] == 'en':
        lt = LemmatizeText(line[3])
        lt.createLemmaText()
        lemma_title = lt.cleanText
    # language abstract
    if line[10] == 'fr':
        filename = 'texts/' + str(line[8]) + 'abstract'
        writeFile(filename, line[4])
        pos_abstract = extractPOS(filename)
        lemma_abstract = splitPos(pos_abstract)
    elif line[10] == 'en':
        lt = LemmatizeText(line[4])
        lt.createLemmaText()
        lemma_abstract = lt.cleanText
    if line[9] == 'fr' and line[10] == 'fr':
        line[12] = lemma_title + ' ' + lemma_abstract
    if line[9] == 'en' and line[10] == 'en':
Example #6
def processElement_serial(elem, language, mode=0):
    document = dict()
    # get language
    if len(elem) >= 8:
        lang = elem[7]
    else:
        lang = language
    # get clean text
    try:
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang, mode=mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)

                # named entities:
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner

                # construct the document
                document['_id'] = elem[0]
                document['rawText'] = elem[1].encode('utf8').encode('string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.encode('utf8').encode('string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['author'] = elem[3]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # author age
                # these are the changes required for when we keep age as a number
                # age = elem[5].split('-')
                # document['age'] = int(age[1]) - int(age[0])
                document['age'] = elem[5]

                # these are the changes required for when we keep gender as a number
                # author gender - 1 male, 2 female, 0 unknown
                # document['gender'] = gender.get(elem[6], 0)
                document['gender'] = elem[6]

                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    except Exception as e:
        print e
    return document
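
One caveat when reusing this snippet: the 'string_escape' codec applied to rawText and cleanText exists only in Python 2 (the whole example is Python 2 code). A rough Python 3 stand-in for that escape-and-strip-newlines step might look like the sketch below; it is an assumption, not part of the original project.

# Hypothetical Python 3 counterpart of s.encode('utf8').encode('string_escape')
# followed by removing carriage returns and newlines.
def escape_raw_text(s):
    escaped = s.encode('unicode_escape').decode('ascii')   # backslash-escape
    # after escaping, line breaks appear as the two-character sequences \r and \n
    return escaped.replace('\\r', '').replace('\\n', '')

print(escape_raw_text("first line\nsecond line"))           # first linesecond line
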
Example #7
			lemmas += word + ' '
	return lemmas

header, corpus = readCSV('RNTI_articles_export_fixed1347_ids.txt')

print header
idx = 0
for line in corpus:
	# language title
	if line[9] == 'fr':
		filename = 'texts/'+str(line[8]) + 'title'
		writeFile(filename, line[3])
		pos_title = extractPOS(filename)
		lemma_title = splitPos(pos_title)
	elif line[9] == 'en':
		lt = LemmatizeText(line[3])
		lt.createLemmaText()
		lemma_title = lt.cleanText
	# language abstract
	if line[10] == 'fr':
		filename = 'texts/'+str(line[8]) + 'abstract'
		writeFile(filename, line[4])
		pos_abstract = extractPOS(filename)
		lemma_abstract = splitPos(pos_abstract)
	elif line[10] == 'en':
		lt = LemmatizeText(line[4])
		lt.createLemmaText()
		lemma_abstract = lt.cleanText
	if line[9] == 'fr' and line[10] == 'fr':
		line[12] = lemma_title + ' ' + lemma_abstract
	if line[9] == 'en' and line[10] == 'en':
Example #8
def process_element(elem):
    document = dict()
    if len(elem) == 9:
        try:
            # construct the document
            # rawText = elem[4].decode('latin-1').encode('utf-8')#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
            document['rawText'] = elem[4].encode('latin-1')#.encode('string_escape').replace('\r', '').replace('\n', '')
            document['series'] = elem[0]
            document['booktitle'] = elem[1]
            document['year'] = elem[2]
            document['title'] = elem[3].encode('latin-1')
            #authors
            authors = elem[5].split(',')
            #document['authors'] = [ {'name': author.strip(' ').decode('latin-1').encode('utf-8'), 'position': authors.index(author)} for author in authors]
            document['authors'] = [ {'name': author.strip(' ').encode('latin-1'), 'position': authors.index(author)} for author in authors]
            document['pdf1page'] = elem[6]
            document['pdfarticle'] = elem[7]
            document['_id'] = elem[8]

            try:
                lang = detect(elem[4].decode('latin-1')).upper()
            except Exception as e1:
                try:
                    lang = detect(elem[3].decode('latin-1')).upper()
                    print e1, 'here try 2'
                except Exception as e2:
                    lang = 'FR'
                    print e2, 'here try 3'
            document['language'] = lang

            
            if len(elem[4]) > 0:
                try:
                    cleanText = ct.cleanTextSimple(elem[4].encode('latin-1'), lang)
                    # if clean text exists
                    # print cleanText
                    if len(ct.removePunctuation(cleanText)) > 0:
                        # extract lemmas and part of speech
                        lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang)
                        lemmas.createLemmaText()
                        lemmaText = lemmas.cleanText
                        if lemmaText and lemmaText != " ":
                            lemmas.createLemmas()
                            words = []
                            for w in lemmas.wordList:
                                word = dict()
                                word['word'] = w.word
                                word['tf'] = w.tf
                                word['count'] = w.count
                                word['pos'] = w.wtype
                                words.append(word)

                            document['cleanText'] = cleanText#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
                            document['lemmaText'] = lemmaText
                            document['words'] = words
                except Exception as e:
                    print e, 'in lemmaText'
        except Exception as e:
            print e, 'here try 1', elem
    else:
        print 'here in else', elem
    return document
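
The detect() call has no import in this excerpt; its signature matches the langdetect package, which throws on empty or undetectable input, hence the nested try/except chain that falls back from the abstract to the title and finally to 'FR'. A compact sketch of that fallback, under the assumption that langdetect is indeed the library in use (sample strings are invented, and the broad Exception handling mirrors the original):

# Hedged sketch of the language-detection fallback in process_element,
# assuming detect() comes from the langdetect package.
from langdetect import detect

def guess_language(body, title, default='FR'):
    for candidate in (body, title):
        try:
            return detect(candidate).upper()   # e.g. 'FR', 'EN'
        except Exception:
            continue                           # empty or undetectable text
    return default

print guess_language('Ceci est un resume redige en francais.', 'Titre')
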
Example #9
def processElement_serial(elem, language, mode=0):
    document = dict()
    # get language
    if len(elem) >= 8:
        lang = elem[7]
    else:
        lang = language
    # get clean text
    try:
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText),
                                   language=lang,
                                   mode=mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)

                # named entities:
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner

                # construct the document
                document['_id'] = elem[0]
                document['rawText'] = elem[1].encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['author'] = elem[3]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # author age
                # these are the changes required for when we keep age as a number
                # age = elem[5].split('-')
                # document['age'] = int(age[1]) - int(age[0])
                document['age'] = elem[5]

                # these are the changes required for when we keep gender as a number
                # author gender - 1 male, 2 female, 0 unknown
                # document['gender'] = gender.get(elem[6], 0)
                document['gender'] = elem[6]

                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    except Exception as e:
        print e
    return document
Example #10
def processElement(elem):
    document = dict()
    # get language
    try:
        lang = detect(elem[1]).upper()
    except Exception as e:
        print e
        lang = ''
    if lang == 'EN':
        # get clean text
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang, mode=global_mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)

                # named entities:
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner

                # construct the document
                document['_id'] = int(elem[0])
                document['rawText'] = elem[1].decode('utf8').encode('utf8').encode('string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.decode('utf8').encode('utf8').encode('string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # author age
                # these are the changes required for when we keep age as a number
                author = dict()

                # these are the changes required for when we keep gender as a number
                # author gender - 1 male, 2 female, 0 unknown
                if authors.get(int(elem[3]), -1) == -1:
                    age = elem[5].split('-')
                    author['age'] = (int(age[1]) + int(age[0]))/2
                    author['authorid'] = int(elem[3])
                    author['genderid'] = genderid.get(elem[6], 0)
                    author['gender'] = gender.get(elem[6], 'unknown')
                    if author['genderid'] == 1:
                        global b_idx
                        author['firstname'] = names.boys[b_idx][0]
                        author['lastname'] = names.boys[b_idx][1]
                        b_idx += 1
                    elif author['genderid'] == 2:
                        global g_idx
                        author['firstname'] = names.girls[g_idx][0]
                        author['lastname'] = names.girls[g_idx][1]
                        g_idx += 1
                    authors[int(elem[3])] = author
                else:
                    author = authors[int(elem[3])]

                # print author
                document['authors'] = [author]

                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    return document
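
For clarity, the age handling above reduces a bucket string such as '18-24' to its midpoint using Python 2 integer division (the commented-out variant in examples #6 and #9 stores the bucket width instead). A minimal standalone illustration with a made-up bucket value:

# Standalone illustration of the age-midpoint computation used above.
def age_midpoint(bucket):
    low, high = bucket.split('-')
    return (int(low) + int(high)) / 2          # integer division under Python 2

print age_midpoint('18-24')                    # 21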