def build_tagged():
    """Yield a TaggedDocument for every unique news text in the live news collection."""
    logger = thesis_logging.get_logger('preprocess')
    latest = 0
    count = 0
    index = 1
    news_collection = database.news_collection()
    duplicated_doc = {}
    while True:
        documents = news_collection.find({'created_at': {'$gte': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['created_at']
                if not doc.get('text'):
                    print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                          doc['reference'])
                    continue
                content = doc['text']
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    logger.info(nltk.word_tokenize(content.lower()))
                    yield TaggedDocument(words=nltk.word_tokenize(content.lower()),
                                         tags=[index])
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))

def build_news_tagged():
    """Yield a TaggedDocument for every unique, non-trivial text in the training collection."""
    logger = thesis_logging.get_logger('preprocess')
    latest = ObjectId("5950c589f296532a806e4f31")
    count = 0
    index = 1
    news_collection = train_collection
    duplicated_doc = {}
    while True:
        documents = news_collection.find({'_id': {'$gt': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['_id']
                if not doc.get('text'):
                    # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                    #       doc['reference'])
                    continue
                content = doc['text']
                if len(content) < 100:
                    # logger.info('Ignore small content, Count ' + str(count))
                    continue
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    # logger.info(nltk.word_tokenize(content.lower()))
                    yield TaggedDocument(words=nltk.word_tokenize(content.lower()),
                                         tags=['news_' + str(doc['id'])])
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))

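# A minimal training sketch, not part of the original pipeline: the generators above
# yield gensim TaggedDocument objects, so they can feed a Doc2Vec model directly.
# The hyperparameters and the output path below are illustrative assumptions, and the
# corpus is materialised with list() because Doc2Vec needs a re-iterable corpus.
def train_doc2vec_sketch():
    from gensim.models.doc2vec import Doc2Vec
    corpus = list(build_news_tagged())
    model = Doc2Vec(corpus, vector_size=300, window=5, min_count=2, workers=4, epochs=10)
    model.save('./models/news_doc2vec.model')  # hypothetical output location
    return model
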
def write_web_news():
    """Dump deduplicated article texts, grouped by news source, into ./datasets/insensitive/news/."""
    logger = thesis_logging.get_logger('preprocess')
    link_list = [
        'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st', 'abcn.ws', 'nbcbay.com',
        'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws', 'huffingtonpost.com', 'cnb.cx', 'cnnmon.ie',
        'huffp.st', 'forbes.com', 'telegraph.co', 'cnn.com', 'trib.al', 'express.co', 'gu.com',
        'bloom.bg', 'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost', 'news.sky.com'
    ]
    for source in link_list:
        # latest = ObjectId("59abbfedf296532f80d18a47")  # dyncorp
        # latest = ObjectId("59abc7e2f296532ad483f4b6")  # lds
        # latest = ObjectId("59acc20df296533c88dbaed6")  # tm
        latest = ObjectId("5942946efe43ad1da80b1a79")  # news
        index = 0
        path_file = './datasets/insensitive/news/' + source.replace('.', '_') + '_'
        train_collection = news_database.news_collection()
        duplicated_doc = {}
        while True:
            documents = train_collection.find({
                '_id': {'$gt': latest},
                'reference': {'$regex': '.*' + source + '.*'}
            })
            if documents.count() == 0:
                break
            for doc in documents:
                try:
                    latest = doc['_id']
                    if not doc.get('text'):
                        # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                        #       doc['reference'])
                        continue
                    content = doc['text']
                    if len(content) < 1000:
                        # logger.info('Ignore small content, Count ' + str(count))
                        continue
                    title = doc['title']
                    if len(title) > 60:
                        title = title[0:60]
                    title = "".join(x for x in title if x.isalnum())
                    if content not in duplicated_doc:
                        duplicated_doc[content] = True
                        index += 1
                        # logger.info(nltk.word_tokenize(content.lower()))
                        with open(path_file + title + '.txt', 'w', encoding="utf-8") as doc_file:
                            doc_file.write(doc['text'])
                except Exception as e:
                    logger.error(doc['reference'] + ' : ' + str(e))
        print(source, index)

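# A small companion sketch (an assumed helper, not in the original code): read back the
# per-source .txt files written by write_web_news() and turn them into TaggedDocument
# objects, mirroring the tagging style used by the generators above.
def read_web_news_tagged(folder='./datasets/insensitive/news/'):
    import os
    for name in os.listdir(folder):
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(folder, name), encoding='utf-8') as doc_file:
            content = doc_file.read()
        # The file name (without extension) doubles as the document tag.
        yield TaggedDocument(words=nltk.word_tokenize(content.lower()),
                             tags=[name[:-4]])
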
import os
import warnings
from threading import Thread

from twitter_dev import *
from spider import *
import thesis_logging
from thesis_nlp.convert import NewsConverter

warnings.filterwarnings("ignore", category=DeprecationWarning)

logger = thesis_logging.get_logger()
if not os.path.exists(DATA_FOLDER):
    try:
        if not TwitterDev.prompt_init():
            exit()
    except Exception as e:
        logger.error('Finish thesis: ' + str(e))
        exit(1)

# new filter: 1498356805
# old filter: 1497997330
username = input('Username: ')
logger.info('User ' + username + " requests authentication")
twitter_dev = TwitterDev(DATA_FOLDER + '/' + username)
news_converter = NewsConverter()
crawler = Thread(target=crawl_feeds, args=(twitter_dev, 600000))  # milliseconds 1497987747
locator = Thread(target=locate_feeds, args=(news_converter, 1499792848))
crawler.start()
locator.start()
crawler.join()
locator.join()

def locate_feeds(news_converter: NewsConverter, latest: int = 0):
    """Fetch, vectorise and geolocate crawled news documents, updating MongoDB in place."""
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = database.news_collection()

    class VectorConverter(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.vector = []

        def run(self):
            self.vector = news_converter.convert_doc_to_vector(self.text).tolist()

    class GeographyExtractor(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.places = []
            self.people = []
            self.organs = []

        def run(self):
            context = geograpy.get_place_context(text=self.text)
            self.places = context.places
            self.people = context.people
            self.organs = context.organs

    class PageParser(Thread):
        def __init__(self, tweet_id, url, collection):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url
            self.collection = collection

        def run(self):
            try:
                logger.info('Parse ' + self.url)
                article = Article(self.url)
                article.download()
                if article.download_exception_msg and "404" in article.download_exception_msg:
                    logger.error('404 not found, delete... ' + self.url)
                    news_collection.remove({"id": self.tweet_id})
                    return
                article.parse()
                ignore_list = ["twitter.com", "youtube.com", "facebook.com", "instagram.com"]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return
                logger.info('Title for ' + article.top_image + ' - ' + article.canonical_link + '\n' +
                            article.title + '\n\n')
                logger.info('Latest: ' + str(latest))
                vector_converter = VectorConverter(article.text)
                geography_extractor = GeographyExtractor(article.text)
                vector_converter.start()
                geography_extractor.start()
                geography_extractor.join()
                vector_converter.join()
                vector = vector_converter.vector
                news_collection.update_one({'id': self.tweet_id}, {
                    '$set': {
                        'places': geography_extractor.places,
                        'people': geography_extractor.people,
                        'organs': geography_extractor.organs,
                        'vector': vector,
                        'title': article.title,
                        'text': article.text,
                        'image': article.top_image
                    }
                })
                for place in geography_extractor.places:
                    self.collection.update_one({'place': place}, {'$inc': {'count': 1}}, upsert=True)
            except Exception as e:
                logger.error(str(e))

    location_collection = database.location_collection()
    duplicate_urls = {}
    tasks = []
    while True:
        documents = news_collection.find({'created_at': {'$gte': latest}}).limit(100)
        logger.info('Found ' + str(documents.count()) + ' after ' + str(latest))
        # Clean up remaining tasks
        if len(tasks) != 0:
            logger.info('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()
        # A single hit means only the already-processed document matched '$gte', i.e. nothing new.
        if documents.count() == 1:
            if crawler_finish:
                break
            logger.warning('Nap and back in 500 seconds')
            time.sleep(500)
            continue
        logger.info('Start Locating')
        index = 0
        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['created_at']
                image = doc.get('image')
                if image is not None:
                    logger.info('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    logger.info('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue
                thread = PageParser(doc['id'], ref, location_collection)
                tasks.append(thread)
                thread.start()
                time.sleep(7)
                index += 1
                if index % 5 == 0:
                    for task in tasks:
                        task.join()
                    tasks.clear()
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))

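# Illustrative follow-up (an assumed helper, not in the original module): once locate_feeds()
# has stored a 'vector' for each news document, similar articles can be ranked by cosine
# similarity against an arbitrary query text.
def find_similar_news(news_converter: NewsConverter, query_text, top_n=5):
    import numpy as np
    query = np.array(news_converter.convert_doc_to_vector(query_text))
    scored = []
    for doc in database.news_collection().find({'vector': {'$exists': True}}):
        vec = np.array(doc['vector'])
        denom = np.linalg.norm(query) * np.linalg.norm(vec)
        if denom == 0:
            continue
        scored.append((float(np.dot(query, vec) / denom), doc.get('title'), doc.get('reference')))
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:top_n]
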
def crawl_feeds(dev: TwitterDev, duration: int = 0):
    """Stream tweets from followed accounts and store whitelisted news links in MongoDB."""
    global crawler_finish
    logger = thesis_logging.get_logger('crawler')
    while True:
        try:
            if dev is None:
                logger.error('There is no Twitter developer account detected.')
                return
            news_collection = database.news_collection()
            logger.info('ok')
            user_id = dev.api.VerifyCredentials()
            logger.info('Twitter Auth: ' + str(user_id.AsJsonString()))
            friends = dev.api.GetFriendIDs(user_id, stringify_ids=True)
            logger.info('Friends: ' + str(friends))
            logger.info('Start crawling')
            start = int(round(time.time()) * 1000)
            link_list = [
                'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st', 'abcn.ws', 'nbcbay.com',
                'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws', 'huffingtonpost.com', 'cnb.cx', 'cnnmon.ie',
                'huffp.st', 'forbes.com', 'telegraph.co', 'cnn.com', 'trib.al', 'express.co', 'gu.com',
                'bloom.bg', 'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost', 'news.sky.com'
            ]
            ignore_list = ['bit.ly', 'twitter', 'tinyurl', 'goo.gl', 'facebook.com']
            duplicate_urls = {}
            for status in dev.api.GetStreamFilter(follow=friends):
                urls = status['entities']['urls']
                if len(urls) == 0:
                    continue
                url = urls[0]['expanded_url']
                if url is None:
                    continue
                if not any(x in url for x in link_list):
                    logger.info('Skip link ' + url)
                    continue
                if news_collection.find({'reference': url}).count() > 0:
                    logger.info('Skip duplicated ' + url)
                    continue
                timestamp = int(time.mktime(time.strptime(status['created_at'],
                                                          '%a %b %d %H:%M:%S +0000 %Y')))
                document = {
                    'id': status['id'],
                    'created_at': timestamp,
                    'reference': url
                }
                news_collection.insert_one(document)
                logger.info('Insert ' + url + ' created at ' + str(timestamp))
                # if duration != 0 and int(round(time.time()) * 1000) - start > duration:
                #     break
        except Exception as e:
            logger.error(e)
        finally:
            # crawler_finish = True
            logger.info('Finish crawling')
            logger.info('Sleeping 5s to start again...')
            time.sleep(5)

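# Both crawl_feeds() and locate_feeds() declare `global crawler_finish`, but the flag itself
# does not appear in this excerpt; a module-level default is assumed here so the locator's
# shutdown check has something to read.
crawler_finish = False
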
def locate_feeds(latest=ObjectId("5950c589f296532a806e4f31")):
    """Download article text for training documents that do not yet have a 'text' field."""
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = train_database.train_collection()

    class PageParser(Thread):
        def __init__(self, tweet_id, url):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url

        def run(self):
            try:
                print('Parse ' + self.url)
                article = Article(self.url)
                article.download()
                # if article.download_exception_msg and "404" in article.download_exception_msg:
                #     logger.error('404 not found, delete... ' + self.url)
                #     news_collection.remove({"id": self.tweet_id})
                #     return
                # if article.download_exception_msg and "410" in article.download_exception_msg:
                #     logger.error('410 client error, delete... ' + self.url)
                #     news_collection.remove({"id": self.tweet_id})
                #     return
                article.parse()
                ignore_list = ["twitter.com", "youtube.com", "facebook.com", "instagram.com"]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return
                print('Title for ' + article.top_image + ' - ' + article.canonical_link + '\n' +
                      article.title + '\n\n')
                print('Latest: ' + str(latest))
                if news_collection.find({'$or': [{'title': article.title},
                                                 {'text': article.text}]}).count() > 0:
                    print('Duplicate, Ignore!')
                    news_collection.remove({"id": self.tweet_id})
                    return
                vector = 0
                news_collection.update_one({'id': self.tweet_id}, {'$set': {
                    'vector': vector,
                    'title': article.title,
                    'text': article.text,
                    'image': article.top_image}})
            except Exception as e:
                logger.error(str(e))

    tasks = []
    while True:
        print('Start Locating')
        # documents = news_collection.find({'_id': {'$gt': latest}}).limit(100)
        documents = news_collection.aggregate(
            [{'$match': {'text': {'$exists': False}}}, {'$sample': {'size': 100}}])
        # print('Found ' + str(documents.count()) + ' after ' + str(latest))
        # Clean up remaining tasks
        if len(tasks) != 0:
            print('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()
        # if documents.count() == 0:
        #     break
        index = 0
        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['_id']
                image = doc.get('image')
                if image is not None:
                    print('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    print('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue
                thread = PageParser(doc['id'], ref)
                tasks.append(thread)
                thread.start()
                time.sleep(8)
                index += 1
                if index % 5 == 0:
                    logger.info('Start to wait')
                    for task in tasks:
                        task.join()
                    logger.info('finish waiting')
                    tasks.clear()
            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))

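# A tiny progress check for the scraper above (an assumed helper, not in the original file):
# counts how many training documents already carry article text versus those still pending.
def report_scrape_progress():
    collection = train_database.train_collection()
    scraped = collection.find({'text': {'$exists': True}}).count()
    pending = collection.find({'text': {'$exists': False}}).count()
    print('scraped:', scraped, 'pending:', pending)
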