def build_dictionary(self, link, pos):
    """Count keyword occurrences for a single link.

    Args:
        self: Not used by this function; kept so existing callers that pass
            it keep working.
        link: Value handed to ``lxrTest.generateKeywords``.
        pos: Returned unchanged as the first element of the result
            (presumably the caller's index for this link -- TODO confirm).

    Returns:
        A ``(pos, link, counts)`` tuple, where ``counts`` is a plain ``dict``
        mapping each keyword yielded by ``lxrTest.generateKeywords(link)`` to
        the number of times it was yielded.
    """
    counts = {}
    for key in lxrTest.generateKeywords(link):
        # One lookup via dict.get instead of ``in d.keys()`` plus indexing.
        counts[key] = counts.get(key, 0) + 1
    return pos, link, counts
def graball():
    """Collect front-page links from a fixed list of news sites and print keyword counts.

    For every source, ``grabheadline.grabfront`` is called and its three
    results (links, titles, categories) are accumulated. For every collected
    link, ``lxrTest.generateKeywords`` is called and each keyword it yields
    is tallied into one overall count.

    Printed, in order:
        * the elapsed seconds for each source fetch,
        * each link followed by the elapsed seconds for its keyword pass,
        * the ``(keyword, count)`` pairs sorted by count, highest first,
        * the total elapsed seconds.

    Returns:
        None. Results are only printed.
    """
    # NOTE: takes around 160-180s for 200 articles, way too slow.
    # perf_counter is monotonic, so the printed durations cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    list_news = [
        'https://nytimes.com',
        'https://reuters.com',
        'https://bbc.com',
        'https://www.theguardian.com/international',
        'https://apnews.com',
        'https://latimes.com',
        'http://huffpost.com',
        'https://www.npr.org/',
    ]
    links = []
    # titles, categories and bags_key are accumulated but not read again
    # inside this function.
    titles = []
    categories = []
    bags_key = []
    keywords = {}

    for source in list_news:
        source_started = time.perf_counter()
        templinks, temptitles, tempcategories = grabheadline.grabfront(source)
        links.extend(templinks)
        titles.extend(temptitles)
        categories.extend(tempcategories)
        print(time.perf_counter() - source_started)

    for link in links:
        print(link)
        link_started = time.perf_counter()
        currkeys = lxrTest.generateKeywords(link)
        bags_key.append(currkeys)
        for key in currkeys:
            keywords[key] = keywords.get(key, 0) + 1
        print(time.perf_counter() - link_started)

    # sorted() is stable, so keywords with equal counts keep first-seen order.
    sred = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
    print(sred)
    print(time.perf_counter() - started)