Пример #1
0
 def build_dictionary(self, link, pos):
     """Count how often each keyword occurs for a single link.

     Keywords are obtained from ``lxrTest.generateKeywords(link)``; every
     distinct keyword is mapped to the number of times it appears in that
     result.

     Args:
         link: Value handed to ``lxrTest.generateKeywords`` (presumably an
             article URL -- confirm against callers).
         pos: Value passed through unchanged as the first element of the
             result (presumably the link's position in a work list --
             confirm against callers).

     Returns:
         A ``(pos, link, counts)`` tuple, where ``counts`` is a plain dict
         mapping each keyword to its occurrence count.
     """
     counts = {}
     for keyword in lxrTest.generateKeywords(link):
         # dict.get replaces the separate "is it already there?" test and
         # the follow-up read with a single lookup.
         counts[keyword] = counts.get(keyword, 0) + 1
     return pos, link, counts
Пример #2
0
def graball():
    """Scrape several news front pages and print aggregate keyword counts.

    For every source URL, ``grabheadline.grabfront`` supplies three
    sequences (links, titles, categories) which are accumulated across all
    sources. For every collected link, ``lxrTest.generateKeywords`` supplies
    keywords, which are tallied across all links.

    Printed output, in order:
        * the seconds spent on each source;
        * each link, followed by the seconds spent on it;
        * the ``(keyword, count)`` pairs sorted by descending count;
        * the total seconds elapsed.

    The code shown here returns nothing. ``titles``, ``categories`` and
    ``bags_key`` (the per-link keyword results) are accumulated but not read
    within the visible part of the function.
    """
    # Takes around 160-180s for 200 articles, way too slow
    prev = time.time()
    list_news = ['https://nytimes.com', 'https://reuters.com', 'https://bbc.com',
                 'https://www.theguardian.com/international', 'https://apnews.com',
                 'https://latimes.com', 'http://huffpost.com', 'https://www.npr.org/']
    links = []
    titles = []
    categories = []
    # Start empty rather than seeding a dummy element that has to be sliced
    # off again afterwards.
    bags_key = []
    keywords = {}
    for source in list_news:
        prevt = time.time()
        templinks, temptitles, tempcategories = grabheadline.grabfront(source)
        links.extend(templinks)
        titles.extend(temptitles)
        categories.extend(tempcategories)
        print(time.time() - prevt)

    for link in links:
        print(link)
        prevt = time.time()
        currkeys = lxrTest.generateKeywords(link)
        bags_key.append(currkeys)
        for key in currkeys:
            # Single lookup instead of a membership test plus a read.
            keywords[key] = keywords.get(key, 0) + 1
        print(time.time() - prevt)
    # Most frequent keywords first.
    sred = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
    print(sred)
    print(time.time() - prev)