예제 #1
0
# Known article layouts, one tuple per site:
#   (domain,
#    class value of the article title h1 tag,
#    class value of the article body tag)
formats = [
    ("foxnews.com", "entry-title", "article-text"),
    ("techcrunch.com", "headline", "body-copy"),
    ("nbcnews.com", "gl_headline", "articleText"),
]




if __name__ == '__main__':
    # Imported here because only this script entry point needs it.
    import os

    # Register every known site layout with the tokenizer; each tuple in
    # ``formats`` is unpacked into the positional arguments of add_format.
    tokenizer = Tokenizer()
    for pattern in formats:
        tokenizer.add_format(*pattern)

    # The input file is the first command-line argument when one is given,
    # otherwise the default input set.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = 'foxNewsInputSet.txt'

    # Derive the output name by replacing the input's extension with ".json"
    # (or appending ".json" when there is no extension). splitext only looks
    # at the last path component, so dots in directory parts such as
    # "./urls.txt" or "data.v2/urls" no longer truncate the path.
    output_file = os.path.splitext(filename)[0] + '.json'

    # Read one entry per line, dropping only the newline characters; the
    # with-block guarantees the file handle is closed once the list is built.
    with open(filename) as url_file:
        url_list = [line.strip('\n') for line in url_file]

    #initialize our list for the tokenized content