def downloadArticles(incidents):
    """Download missing articles for ``incidents``, updating it in place.

    ``incidents`` maps a source (handed unchanged to ``download_article``)
    to a dict of fields. Entries that already contain a ``"title"`` key are
    left untouched. For every other entry the article is downloaded:

    * on success the ``"title"``, ``"body"`` and ``"publishDate"`` fields
      are stored on the entry;
    * on failure the entry is removed from ``incidents``.

    Progress, the number of failed sources and the size of ``incidents``
    before and after pruning are printed to stdout. Returns ``None``.
    """
    numIncidents = len(incidents)
    failed_sources = []

    # enumerate() replaces the hand-maintained counter; numbering stays
    # 1-based and still counts entries that are skipped.
    for index, source in enumerate(incidents, 1):
        record = incidents[source]
        if "title" in record:
            continue

        print("%s / %s" % (index, numIncidents))
        # download_article returns a 5-tuple whose first element is a
        # success flag, followed by title, text and date; the last element
        # is not needed here.
        # NOTE(review): meaning of the (False, True) flags is defined by
        # download_article, which is not visible in this block.
        results = download_article(source, False, True)
        if results[0]:
            _, title, text, date, _ = results
            record["title"] = title
            record["body"] = text
            record["publishDate"] = date
        else:
            # Deleting while iterating the dict is unsafe, so defer it.
            failed_sources.append(source)

    print(len(incidents))
    print("Number of keys that are have no articles %s" % len(failed_sources))
    for source in failed_sources:
        del incidents[source]
    print(len(incidents))
def download_articles_from_query(query_text, original_text, search_engine_name):
    """Search for ``query_text`` and download every article that is found.

    Args:
        query_text: Query string handed to the search helper.
        original_text: Not used by the code in this block.
        search_engine_name: ``"google"`` or ``"bing"``; selects which
            ``get_related_urls_from_*`` helper supplies the URLs.

    Raises:
        ValueError: If ``search_engine_name`` is not a supported engine
            (previously this surfaced as an unbound-local error).

    URLs containing ``"newslocker"`` or ``"newsjs"`` are skipped. A failure
    on one URL is reported on stdout and does not stop the loop.

    NOTE(review): the collected texts, dates and URLs are neither returned
    nor read inside this block, so the function currently returns ``None``.
    """
    if search_engine_name == "google":
        article_urls = get_related_urls_from_google(query_text)
    elif search_engine_name == "bing":
        article_urls = get_related_urls_from_bing(query_text)
    else:
        raise ValueError(
            "Unsupported search engine: %r (expected 'google' or 'bing')"
            % (search_engine_name,)
        )

    article_texts = []
    article_dates = []
    downloaded_urls = []
    selected_urls = []

    for i, url in enumerate(article_urls, 1):
        print("Checking URL  %s" % i)
        try:
            if "newslocker" in url or "newsjs" in url:
                continue
            downloaded_article = download_article(url, False, False)
            article_text = downloaded_article[2]
            article_date = downloaded_article[3]
            article_title = downloaded_article[4]
            if article_text:
                article_texts.append(article_title + " " + article_text)
                if article_date is not None:
                    # Drop tzinfo so every stored date is naive.
                    article_date = article_date.replace(tzinfo=None)
                article_dates.append(article_date)
                downloaded_urls.append(url)
        except Exception as e:
            # Best effort: one bad URL must not abort the whole batch, but
            # the failure should at least be visible. repr() keeps the
            # message printable whatever the URL/exception text contains.
            print("Skipping URL %r: %r" % (url, e))
def download_articles_from_query(query_text, original_text, search_engine_name):
    """Look up ``query_text`` on a search engine and fetch the result pages.

    Args:
        query_text: Query string passed to the search helper.
        original_text: Accepted for interface compatibility; the code in
            this block does not read it.
        search_engine_name: Either ``'google'`` or ``'bing'``.

    Raises:
        ValueError: For any other ``search_engine_name``. Without this
            check the URL list would be unbound when the loop starts.

    Result URLs that contain ``"newslocker"`` or ``"newsjs"`` are not
    downloaded. Errors raised while handling a single URL are printed and
    the loop moves on to the next URL.

    NOTE(review): nothing in this block returns or consumes the lists that
    are filled below; the function returns ``None``.
    """
    if search_engine_name == 'google':
        article_urls = get_related_urls_from_google(query_text)
    elif search_engine_name == 'bing':
        article_urls = get_related_urls_from_bing(query_text)
    else:
        raise ValueError(
            "Unsupported search engine: %r (expected 'google' or 'bing')"
            % (search_engine_name,)
        )

    article_texts = []
    article_dates = []
    downloaded_urls = []
    selected_urls = []

    for position, url in enumerate(article_urls, 1):
        print("Checking URL  %s" % position)
        try:
            blocked = "newslocker" in url or "newsjs" in url
            if not blocked:
                downloaded_article = download_article(url, False, False)
                # Indices 2, 3 and 4 hold text, date and title respectively.
                article_text = downloaded_article[2]
                article_date = downloaded_article[3]
                article_title = downloaded_article[4]
                if article_text:
                    article_texts.append(article_title + " " + article_text)
                    if article_date is not None:
                        # Store naive datetimes only.
                        article_date = article_date.replace(tzinfo=None)
                    article_dates.append(article_date)
                    downloaded_urls.append(url)
        except Exception as error:
            # Keep the best-effort behaviour, but report the failure
            # instead of discarding it silently.
            print("Skipping URL %r: %r" % (url, error))
def test_queries(event_dict): """ Meta: 1) date, 2) shooter_name, 3) killed_num, 4) wounded_num, 5) location Query types: 1. (title) 2. "Shooting in [location]" 3. "Shooting in [location] on [date]" 4. "Shooting in [location] on [date], [killed_num] killed" 5. "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded" 6. "Shooting in [location] on [date] by [shooter name]" 7. "Shooting in [location] on [date] by [shooter name], [killed_num] killed" 8. "Shooting in [location] on [date] by [shooter name], [killed_num] killed, [wounded_num] wounded" 9. 10. """ # meta = ['[date]', '[shooter_name]', '[killed_num]', '[wounded_num]', '[location]'] # query_types = [] # ["Shooting in [location]", "Shooting in [location] on [date]", "Shooting in [location] on [date], [killed_num] killed", # "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded", "Shooting in [location] on [date] by [shooter_name]", # "Shooting in [location] on [date] by [shooter_name], [killed_num] killed", # "Shooting in [location] on [date] by [shooter_name], [killed_num] killed, [wounded_num] wounded"] query_scores = {} query_scores_ratios = {} count = 0 for metadata, urls in event_dict.items(): if urls is None or urls == []: continue status, title, text, date, title = download_article(urls[0], False, False) if title is None or len(title) < 5: continue print "Event count:", count print "Title:", title print "Original URL set:", urls print # urls = set(urls) city = metadata[4] query_types_with_title = [ " ".join([city, title]), " ".join([title, city]), " ".join(title.split()[:10]), ] # query_types[:] query_types_with_title.insert(0, title) results = {} results_ratios = {} for i, query_format in enumerate(query_types_with_title): # query = replace_with_metadata(query_format, meta, metadata) query = query_format # article_urls_google = set(get_related_urls_from_google(query)) article_urls_bing = get_related_urls_from_bing(query) print "Query used:", query for url 
in article_urls_bing: print url.encode("ascii", "ignore") # query_scores[i] = query_scores.get(i,0) + len(article_urls_google.intersection(urls)) query_scores[i] = query_scores.get(i, 0) + count_of_originals(article_urls_bing, urls) query_scores_ratios[i] = query_scores_ratios.get(i, 0) + count_of_ratio_relevance(article_urls_bing, urls) results[i] = query_scores[i] results_ratios[i] = query_scores_ratios[i] print count += 1 print results print results_ratios print print count return query_scores
def test_queries(event_dict): ''' Meta: 1) date, 2) shooter_name, 3) killed_num, 4) wounded_num, 5) location Query types: 1. (title) 2. "Shooting in [location]" 3. "Shooting in [location] on [date]" 4. "Shooting in [location] on [date], [killed_num] killed" 5. "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded" 6. "Shooting in [location] on [date] by [shooter name]" 7. "Shooting in [location] on [date] by [shooter name], [killed_num] killed" 8. "Shooting in [location] on [date] by [shooter name], [killed_num] killed, [wounded_num] wounded" 9. 10. ''' #meta = ['[date]', '[shooter_name]', '[killed_num]', '[wounded_num]', '[location]'] #query_types = [] #["Shooting in [location]", "Shooting in [location] on [date]", "Shooting in [location] on [date], [killed_num] killed", #"Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded", "Shooting in [location] on [date] by [shooter_name]", #"Shooting in [location] on [date] by [shooter_name], [killed_num] killed", #"Shooting in [location] on [date] by [shooter_name], [killed_num] killed, [wounded_num] wounded"] query_scores = {} query_scores_ratios = {} count = 0 for metadata, urls in event_dict.items(): if urls is None or urls == []: continue status, title, text, date, title = download_article( urls[0], False, False) if title is None or len(title) < 5: continue print "Event count:", count print "Title:", title print "Original URL set:", urls print #urls = set(urls) city = metadata[4] query_types_with_title = [ " ".join([city, title]), " ".join([title, city]), " ".join(title.split()[:10]) ] #query_types[:] query_types_with_title.insert(0, title) results = {} results_ratios = {} for i, query_format in enumerate(query_types_with_title): #query = replace_with_metadata(query_format, meta, metadata) query = query_format #article_urls_google = set(get_related_urls_from_google(query)) article_urls_bing = get_related_urls_from_bing(query) print "Query used:", query for url in 
article_urls_bing: print url.encode("ascii", "ignore") #query_scores[i] = query_scores.get(i,0) + len(article_urls_google.intersection(urls)) query_scores[i] = query_scores.get(i, 0) + count_of_originals( article_urls_bing, urls) query_scores_ratios[i] = query_scores_ratios.get( i, 0) + count_of_ratio_relevance(article_urls_bing, urls) results[i] = query_scores[i] results_ratios[i] = query_scores_ratios[i] print count += 1 print results print results_ratios print print count return query_scores