def getWords(self, text):
    lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
    lemmas.createLemmaText()
    lemmaText = lemmas.cleanText
    words = []
    if lemmaText and lemmaText != " ":
        lemmas.createLemmas()
        for w in lemmas.wordList:
            word = {}
            word['word'] = w.word
            word['tf'] = w.tf
            word['count'] = w.count
            word['pos'] = w.wtype
            words.append(word)
    return words
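# Usage sketch for getWords (assumption: the method lives on a class whose
# self.ct is a CleanText-style helper exposing removePunctuation(); the
# `analyzer` object below is illustrative, not taken from this excerpt):
#
# analyzer = TextAnalyzer()
# for w in analyzer.getWords("Les chats dorment sur le canape"):
#     print w['word'], w['pos'], w['tf'], w['count']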
def analysis_dashboard_page2():
    keywords = request.form['keyword']
    date = request.form['date']
    checked_genders = request.form.getlist('gender')
    checked_ages = request.form.getlist('age')
    print date, keywords, checked_genders, checked_ages
    # lemmatize the keywords so the filter matches the indexed lemmas
    lem = LemmatizeText(keywords)
    lem.createLemmaText()
    lem.createLemmas()
    wordList = []
    for word in lem.wordList:
        # To match substrings instead, build one regex per lemma; in SQL
        # terms this is similar to: WHERE word LIKE '%<lemma>%'
        # regex = re.compile(word.word, re.IGNORECASE)
        # wordList.append(regex)
        # This version finds only the tweets containing the exact lemma.
        wordList.append(word.word)
    global query
    query = {}
    global query_pretty
    query_pretty = ""
    if wordList:
        query_pretty += "Keyword filter: " + ' '.join(wordList) + "<br/>"
        query["words.word"] = {"$in": wordList}
    if date:
        # the form sends the date range as "<start> <end>"
        query_pretty += "Date filter: " + date + "<br/>"
        start, end = date.split(" ")
        query["date"] = {"$gt": start, "$lte": end}
    # filter on age only when some, but not all six, buckets are checked
    if checked_ages and 0 < len(checked_ages) < 6:
        query_pretty += "Age filter: " + ' '.join(checked_ages) + "<br/>"
        query["age"] = {"$in": checked_ages}
    # filter on gender only when exactly one box is checked
    if checked_genders and len(checked_genders) == 1:
        query_pretty += "Gender filter: " + ' '.join(checked_genders) + "<br/>"
        query["gender"] = checked_genders[0]
    if query:
        vocab = VocabularyIndex(dbname)
        vocab.createIndex(query)
    tweetCount = getTweetCount()
    return render_template('analysis.html', tweetCount=tweetCount,
                           dates=date, keywords=' '.join(wordList))
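# Hedged wiring sketch (assumption: the repo registers this view on a Flask
# app object; the URL rule below is illustrative, not taken from this
# excerpt):
#
# app.add_url_rule('/analysis2', 'analysis_dashboard_page2',
#                  analysis_dashboard_page2, methods=['POST'])
#
# The POST form is expected to carry 'keyword' (free text), 'date' as
# "<start> <end>", and multi-valued 'gender' / 'age' checkbox fields.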
        lemmas += word + ' '
    return lemmas


header, corpus = readCSV('RNTI_articles_export_fixed1347_ids.txt')
print header
idx = 0
for line in corpus:
    # title language
    if line[9] == 'fr':
        filename = 'texts/' + str(line[8]) + 'title'
        writeFile(filename, line[3])
        pos_title = extractPOS(filename)
        lemma_title = splitPos(pos_title)
    elif line[9] == 'en':
        lt = LemmatizeText(line[3])
        lt.createLemmaText()
        lemma_title = lt.cleanText
    # abstract language
    if line[10] == 'fr':
        filename = 'texts/' + str(line[8]) + 'abstract'
        writeFile(filename, line[4])
        pos_abstract = extractPOS(filename)
        lemma_abstract = splitPos(pos_abstract)
    elif line[10] == 'en':
        lt = LemmatizeText(line[4])
        lt.createLemmaText()
        lemma_abstract = lt.cleanText
    if line[9] == 'fr' and line[10] == 'fr':
        line[12] = lemma_title + ' ' + lemma_abstract
    if line[9] == 'en' and line[10] == 'en':
        # assumption: the English-only case mirrors the French branch; the
        # original excerpt is truncated at this point
        line[12] = lemma_title + ' ' + lemma_abstract
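# Hedged sketches of the I/O helpers the script above relies on (assumptions:
# readCSV yields a header row plus the remaining rows of a tab-separated
# file, and writeFile dumps one string to disk; the repo's real helpers may
# use a different delimiter or encoding, hence the _sketch suffix).
import csv

def readCSV_sketch(path):
    # read a tab-separated export and split off the header row
    with open(path, 'rb') as f:
        rows = list(csv.reader(f, delimiter='\t'))
    return rows[0], rows[1:]

def writeFile_sketch(path, content):
    # dump one string to disk for the external POS tagger to pick up
    with open(path, 'w') as f:
        f.write(content)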
def process_element(elem):
    document = dict()
    if len(elem) == 9:
        try:
            # construct the document
            # rawText = elem[4].decode('latin-1').encode('utf-8')
            document['rawText'] = elem[4].encode('latin-1')
            document['series'] = elem[0]
            document['booktitle'] = elem[1]
            document['year'] = elem[2]
            document['title'] = elem[3].encode('latin-1')
            # authors: position is the index in the comma-separated list
            authors = elem[5].split(',')
            document['authors'] = [{'name': author.strip(' ').encode('latin-1'),
                                    'position': idx}
                                   for idx, author in enumerate(authors)]
            document['pdf1page'] = elem[6]
            document['pdfarticle'] = elem[7]
            document['_id'] = elem[8]
            # detect language from the full text, falling back to the
            # title, then defaulting to French
            try:
                lang = detect(elem[4].decode('latin-1')).upper()
            except Exception as e1:
                try:
                    lang = detect(elem[3].decode('latin-1')).upper()
                    print e1, 'here, try 2'
                except Exception as e2:
                    lang = 'FR'
                    print e2, 'here, try 3'
            document['language'] = lang
            if len(elem[4]) > 0:
                try:
                    cleanText = ct.cleanTextSimple(elem[4].encode('latin-1'), lang)
                    # if clean text exists
                    if len(ct.removePunctuation(cleanText)) > 0:
                        # extract lemmas and part of speech
                        lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText),
                                               language=lang)
                        lemmas.createLemmaText()
                        lemmaText = lemmas.cleanText
                        if lemmaText and lemmaText != " ":
                            lemmas.createLemmas()
                            words = []
                            for w in lemmas.wordList:
                                word = dict()
                                word['word'] = w.word
                                word['tf'] = w.tf
                                word['count'] = w.count
                                word['pos'] = w.wtype
                                words.append(word)
                            document['cleanText'] = cleanText
                            document['lemmaText'] = lemmaText
                            document['words'] = words
                except Exception as e:
                    print e, 'in lemmaText'
        except Exception as e:
            print e, 'here, try 1', elem
    else:
        print 'here, in else', elem
    return document
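# Usage sketch for process_element (assumption: rows are 9-field entries of
# the RNTI export -- series, booktitle, year, title, raw text, comma-separated
# authors, two PDF links, id; the sample row and `collection` are invented
# for illustration):
#
# sample = ['RNTI-E-1', 'EGC 2004', '2004', 'Titre exemple', 'Texte brut',
#           'A. Dupont, B. Martin', 'page1.pdf', 'article.pdf', 1347]
# doc = process_element(sample)
# if doc:
#     collection.insert(doc)  # hypothetical pymongo collection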
def processElement_serial(elem, language, mode=0):
    document = dict()
    # get language: prefer the per-row language field when present
    if len(elem) >= 8:
        lang = elem[7]
    else:
        lang = language
    try:
        # get clean text
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText),
                                   language=lang, mode=mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)
                # named entities
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner
                # construct the document
                document['_id'] = elem[0]
                document['rawText'] = elem[1].encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['author'] = elem[3]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # author age; the change required once age is stored as a
                # number:
                # age = elem[5].split('-')
                # document['age'] = int(age[1]) - int(age[0])
                document['age'] = elem[5]
                # author gender; the change required once gender is stored
                # as a number (1 male, 2 female, 0 unknown):
                # document['gender'] = gender.get(elem[6], 0)
                document['gender'] = elem[6]
                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    except Exception as e:
        print e
    return document
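# Hedged driver sketch for processElement_serial (assumption: rows carry
# id, raw text, date, author, "x y" geo pair, age bucket, gender, and an
# optional trailing language code; the `rows` source below is illustrative):
#
# documents = []
# for row in rows:
#     doc = processElement_serial(row, 'EN', mode=0)
#     if doc:
#         documents.append(doc)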
def processElement(elem):
    document = dict()
    # get language
    try:
        lang = detect(elem[1]).upper()
    except Exception as e:
        print e
        lang = ''
    if lang == 'EN':
        # get clean text
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText),
                                   language=lang, mode=global_mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)
                # named entities
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner
                # construct the document
                document['_id'] = int(elem[0])
                document['rawText'] = elem[1].decode('utf8').encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.decode('utf8').encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # authors are anonymized once and cached in `authors`;
                # gender is kept both as id (1 male, 2 female, 0 unknown)
                # and as label
                author = dict()
                if authors.get(int(elem[3]), -1) == -1:
                    # age arrives as a "low-high" bucket; store the midpoint
                    age = elem[5].split('-')
                    author['age'] = (int(age[1]) + int(age[0])) / 2
                    author['authorid'] = int(elem[3])
                    author['genderid'] = genderid.get(elem[6], 0)
                    author['gender'] = gender.get(elem[6], 'unknown')
                    if author['genderid'] == 1:
                        global b_idx
                        author['firstname'] = names.boys[b_idx][0]
                        author['lastname'] = names.boys[b_idx][1]
                        b_idx += 1
                    elif author['genderid'] == 2:
                        global g_idx
                        author['firstname'] = names.girls[g_idx][0]
                        author['lastname'] = names.girls[g_idx][1]
                        g_idx += 1
                    authors[int(elem[3])] = author
                else:
                    author = authors[int(elem[3])]
                # print author
                document['authors'] = [author]
                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    return document
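# The function above assumes module-level state; a plausible shape for it
# (assumption only -- the repo's actual tables and name lists may differ):
#
# genderid = {'male': 1, 'female': 2}           # 0 is reserved for unknown
# gender = {'male': 'male', 'female': 'female'}
# authors = {}          # authorid -> anonymized author dict, filled lazily
# b_idx, g_idx = 0, 0   # cursors into names.boys / names.girls pseudonyms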