df["story_link"] = df["link"]


# creating a list of the unique country names from our files
uniqueCountryList = df.country_simple.unique()
# initializing lists to hold the lat and lng for country centroids
lat = []
lng = []

# looping through the list of unique countries, running them through
# google's geolocation API to get the lat and lng of the country's
# centroid point. This lets Google do the hard work of sorting out
# countries that are misspelled or which go by several spellings
# (i.e. DRC, DR Congo, Democratic Republic of Congo).
for i in uniqueCountryList:
    googleGeolocation = goog.address_locator(i)
    lat.append(googleGeolocation["lat"])
    lng.append(googleGeolocation["lng"])

# creating a dataframe with just the unique country names and their centroids
centroids = pd.DataFrame(data={"country_simple": uniqueCountryList, "lat": lat, "lng": lng})

df = pd.merge(df, centroids, how="left", on="country_simple", left_index=False)

regions = ["Across Regions", "LAC", "CEE", "WCA", "ESA", "MENA", "EAP", "Across SA"]

df["country"][(df["country"].str.contains("EAP"))] = "Across EAP"
df["country"][(df["country"].str.contains("ROSA"))] = "Across SA"
df["country"][(df["country"].str.contains("Across Region"))] = "Across Regions"
df["country"][(df["country"].str.contains("Across region"))] = "Across Regions"
df["country"][(df["country"].str.contains("WCAR"))] = "Across WCA"
예제 #2
0
def upload():
    if request.method == 'GET':
        if not session.get('log'):
            return redirect('login') 
        else:
            return render_template('upload.html')
    
    elif request.method == 'POST': 
        k = request.files['file']
        fname = secure_filename(k.filename)    
        k.save(os.path.join(UPLOAD_FOLDER, fname))
        
        

        
        
        c = open('app/Preprocess/data/countries.txt','rb')
        for line in c:
            unique_countries.add(line.strip())
        ############################################
        
        ############ methods ###############
        def get_bad_chars(df):
            s = set()
            for title in df.title.values:
                for c in title:
                    try:
                        c.decode('utf8')
                    except:
                        s.add(c)
            for story in df.story.values:
                for c in story:
                    try:
                        c.decode('utf8')
                    except:
                        s.add(c)
            return s
        
        def norm_text(text, s):
            return ''.join([c for c in text if c not in s])
        
        def get_date(doc, file_name):
            if file_name == 'UNICEF OPSCEN Brief – 29 December 2014.docx':
                date = datetime(year=2014, month=12, day=29)
            elif file_name == 'UNICEF OPSCEN Brief – 30 December 2014.docx': 
                date = datetime(year=2014, month=12, day=30)
            elif file_name == 'UNICEF OPSCEN Brief – 31 December 2014.docx':
                date = datetime(year=2014, month=12, day=31)
            else:
                date = doc.core_properties.modified
            return date
        
        def read_lines(file_name, document, element,flags,data,currs,unique_countries): 
            
            
            
            if sum([stop in element for stop in stop_at]) > 0:
                return 
        
            if len(element)< 3:
                return 
        
            if element in regions_codes:
                currs['region'] = regions_codes[element]
                flags['country'] = True
                return 
            
            if flags['link'] and 'http' in element:
                data['region'].append(currs['region'])
                data['country'].append(currs['country'])
                data['title'].append(currs['title'])
                data['story'].append(currs['story'])
                data['file_name'].append(file_name)
                data['link'].append(element)
                data['date'].append(get_date(document, file_name))
                flags['country'] = True
                flags['title'] = False
                flags['link'] = False
                currs['story'] = ''
                return
        
            if flags['country']:
                
                if element in unique_countries:
                    currs['country'] = element
                    flags['country'] = False
                    flags['title'] = True
                    return 
                else:
                    
                    flags['country'] = False
                    flags['title'] = True
        
            if flags['title']:
                currs['title'] = element
                flags['title'] = False
                flags['story'] = True
                return 
        
            if flags['story']:
                currs['story'] = element
                flags['link'] = True
                return
            
            return 
        
        def remove_punctuation(s):
            s=str(s)
            t = ''.join(l for l in s if l in string.ascii_letters or l == ' ')
            return t
        
        def check_token(token):
            return len(token)>1 and not re.search('[^a-z]',token)
        
        
        ############################################
        
        # reading the data
#         folder = 'OPSCEN Brief 2015/'
        
        try:
            f = open(os.path.join(UPLOAD_FOLDER, fname),'rb')
            document = Document(f)
            for p in document.paragraphs:
                a = p.text.split('\n')
                if len(a)> 1:
                    for element in a:
                        read_lines(fname, document, element.strip(),flags,data,currs,unique_countries)
                else:
                    read_lines(fname, document, p.text.strip(),flags,data,currs,unique_countries)
                
    
        except Exception as e:
            print str(e) + "file name: "+ fname
        f.close()
        
        # bulding the data frame
        df = pd.DataFrame(data)
        print data['title']
        df['story_id']=df.index
        
        
        # removing 5 stories with multiple urls
        ind = []
        for i,s in enumerate(df.story.values):
            if 'http' in s:
                ind.append(i)
        df = df.drop(df.index[[ind]])
        # print "removed "+ str(len(ind)) + " records"
        
        ind = []
        for i,s in enumerate(df.title.values):
            if 'http' in s:
                ind.append(i)
        df = df.drop(df.index[[ind]])
        # print "removed "+ str(len(ind)) + " records"
        
        ind = []
        for i,s in enumerate(df.title.values):
            if len(s.split())<3:
                ind.append(i)
        df = df.drop(df.index[[ind]])
        # print "removed "+ str(len(ind)) + " records"
        
        # remove non utf chars
        s = get_bad_chars(df)
        df.title = df.title.apply(lambda x: norm_text(x,s))
        df.story = df.story.apply(lambda x: norm_text(x,s))
        
        # print 'DATAFRAME STEP 1:'
        # print len(df)
        
        # countries_simple is added for the stripped country names. We preserve
        #countries, above, as they are entered in documents, for the display text
        countries_simple = []
        for curr_country in df['country'].values:
            formattedcountry = remove_punctuation(curr_country).strip().lower()
            if formattedcountry == 'car':
                formattedcountry = 'central african republic'
            if 'across' in formattedcountry:
                formattedcountry = ''
            if 'palestine' in formattedcountry:
                formattedcountry = 'israel'
            if 'dpr' in formattedcountry:
                formattedcountry = 'north korea'
            if formattedcountry == 'georgia':
                formattedcountry = 'republic of georgia'
            if 'sudan' in formattedcountry:
                formattedcountry  = 'sudan'
            if 'burundi' in formattedcountry:
                formattedcountry = 'burundi'
            countries_simple.append(formattedcountry)
        
        #adding the simplified country names to the dataframe for merging with 
        #centroid file created below.
        df['country_simple'] = countries_simple
        df['story_title'] = df['title']
        df['story_link'] = df['link']
        
        
        # creating a list of the unique country names from our files
        uniqueCountryList = df.country_simple.unique()
        # initializing lists to hold the lat and lng for country centroids
        lat = []
        lng = []
        
        # looping through the list of unique countries, running them through
        # google's geolocation API to get the lat and lng of the country's
        # centroid point. This lets Google do the hard work of sorting out
        # countries that are misspelled or which go by several spellings
        # (i.e. DRC, DR Congo, Democratic Republic of Congo). 
        for i in uniqueCountryList:
            googleGeolocation = goog.address_locator(i)
            lat.append(googleGeolocation['lat'])
            lng.append(googleGeolocation['lng'])
         
        
        # creating a dataframe with just the unique country names and their centroids
        centroids = pd.DataFrame(data={'country_simple': uniqueCountryList,
            'lat': lat, 'lng': lng})
        
        df = pd.merge(df, centroids, how='left', on='country_simple', left_index=False)
        regions = ['Across Regions', 'LAC', 'CEE', 'WCA', 'ESA',  'MENA', 'EAP','Across SA']
                   
#         df[['country']][(df['country'].str.contains('EAP'))]='Across EAP'
#         df[['country']][(df['country'].str.contains('ROSA'))]='Across SA'
#         df[['country']][(df['country'].str.contains('Across Region'))]='Across Regions'
#         df[['country']][(df['country'].str.contains('Across region'))]='Across Regions'
#         df[['country']][(df['country'].str.contains('WCAR'))]='Across WCA'
#         df[['country']][(df['country'].str.contains('Across West Africa'))]='Across WCA'
#         df[['country']][(df['country'].str.contains('WAC'))]='Across WCA'            
#             
#         df[['country']][(df['country'].str.contains('Palestine'))]='Israel/Palestine'
#         df[['country']][(df['country'].str.contains('Israel'))]='Israel/Palestine'
#         df[['country']][(df['country'].str.contains('Jerusalem'))]='Israel/Palestine'
#            
#          
#         regional_lat = 0
#         regional_lng = -136.4
#         for region in regions:
#              
#             df[['lat']][(df['country'].str.contains(region))] = regional_lat
#             df[['lng']][(df['country'].str.contains(region))] = regional_lng
#             regional_lat = regional_lat - 7.5
#             
        
        
        # print 'DATAFRAME STEP 2:'
        # print len(df)
        
        cats = pd.read_csv('app/Preprocess/data/news_stories_plus_LDA.csv', usecols=['row_index','link', 'title', '250topics_NER'], index_col=['row_index'])
        
        newsLDA = pd.merge(df, cats)
        
        newsLDA = newsLDA.drop_duplicates(['link', 'title'])
        
        # print 'STEP 3'
        # print len(newsLDA)
        # print newsLDA.head()
        # print newsLDA.columns
        
        newsLDA[str(250)+'_topic']=""
        for index, row in newsLDA.iterrows():
            try:
                maxes  = max(eval(newsLDA[str(250)+'topics_NER'][index]), key = lambda x: x[1])
                newsLDA[str(250)+'_topic'][index] = maxes[0]
        
            except:
                continue
        
        tops = pd.read_csv('app/Preprocess/data/LDATopics.csv')
        df = pd.merge(newsLDA, tops, left_on= '250_topic', right_on = 'Topic')
        df = df.drop('250_topic', axis = 1)
        word_dic = {'famine': 'food insecurity', 'food': 'food insecurity', 'harvest': 'food insecurity', 'hunger': 'food insecurity', 'drown': 'disaster', 'cyclone': 'disaster', 'tsunami': 'disaster', 'climate': 'disaster', 'flood': 'disaster', 'hurricane': 'disaster', 'storm': 'disaster', 'rain': 'disaster', 'wind': 'disaster', 'weather': 'disaster', 'typhoon': 'disaster', 'tornado': 'disaster', 'earthquake': 'disaster', 'MERS': 'disease', 'polio': 'disease', 'vaccine': 'disease', 'respiratory': 'disease', 'ebola': 'disease', 'disease': 'disease', 'dengue': 'disease', 'fever': 'disease', 'virus': 'disease', 'chikunguny': 'disease', 'Boko': 'conflict', 'diarrhea': 'disease', 'diarrhoea': 'disease', 'drought': 'water insecurity', 'water': 'water insecurity', 'refugee': 'population displacement', 'evacuate': 'population displacement', 'displace': 'population displacement', 'exodus': 'population displacement', 'flee': 'population displacement', 'HIV': 'disease', 'WHO': 'disease', 'health': 'disease','epidemic': 'disease', 'hospital': 'disease', 'health': 'disease', 'cholera': 'disease'}
        
        df['category2']=''
        df['category3']=''
        df['category4']=''
        stemmer = PorterStemmer()
        for index, row in df.iterrows():
            topics = []
            texts = [stemmer.stem(word) for word in word_tokenize(df.title[index]) if check_token(word)]
            for word in texts:
                if word in word_dic:
                    topics.append(word_dic[word])
            for i, topic in enumerate(topics):
                i = i+2
                df['category'+str(i)][index]=topic
        
        df['category'][(df['category'] == 'conflict') & (df['category2'] == 'disease')] = 'Political/social unrest'
        
        # print 'STEP 4'
        # print len(df)
        
        
        df['week_year']=''
        # df['week_year']=df['date'].map(lambda x:('0/{week}/{year}'.format(week=x.weekofyear,year=x.year)))
        df['week_year']=df['date'].map(lambda x: x.strftime('0/%U/%Y'))
        
        df = df.rename(columns={'category': 'category1'})
        df = df.rename(columns={'category2': 'category'})
        df = df.rename(columns={'category1': 'category2'})
        df['category'][df['category']==''] = df['category2']
        df['category2'][(df['category']==df['category2'])] = ''
        df = df.drop(['category4'], axis=1)
        
        df['date'] = df['date'].apply(lambda x: x.strftime('%m/%d/%Y'))

               
        a= pd.read_csv("app/static/data/news_stories_final.csv")
        result = a.append(df, ignore_index=True)
         
         
         
        result.to_csv('app/static/data/news_stories_final.csv', index_label='row_index', index=True, date_format='%m/%d/%Y')
#         df.to_csv('app/Preprocess/data/news_stories_final_2.csv', index_label='row_index', index=True, date_format='%m/%d/%Y')
        return redirect('/')