Example 1
def build_tagged():
    """Yield a gensim TaggedDocument for every unique, non-empty news text in the collection."""
    logger = thesis_logging.get_logger('preprocess')
    latest = 0
    count = 0
    index = 1

    news_collection = database.news_collection()
    duplicated_doc = {}
    while True:
        # use a strict $gt so the re-query eventually returns nothing and the loop can terminate
        documents = news_collection.find({'created_at': {'$gt': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['created_at']
                if not doc.get('text'):
                    print('Ignore',
                          'Count ' + str(count), 'Id ' + str(doc['id']),
                          str(doc['created_at']), doc['reference'])
                    continue
                content = doc['text']
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    logger.info(nltk.word_tokenize(content.lower()))
                    yield TaggedDocument(words=nltk.word_tokenize(
                        content.lower()),
                                         tags=[index])

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
Example 2
def build_news_tagged():
    """Yield a TaggedDocument per unique news text, tagged with the id of the source tweet."""
    logger = thesis_logging.get_logger('preprocess')
    latest = ObjectId("5950c589f296532a806e4f31")
    count = 0
    index = 1

    news_collection = train_collection
    duplicated_doc = {}
    while True:
        documents = news_collection.find({'_id': {'$gt': latest}})
        if documents.count() == 0:
            break
        for doc in documents:
            count += 1
            try:
                latest = doc['_id']
                if not doc.get('text'):
                    # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                    #      doc['reference'])
                    continue
                content = doc['text']
                if len(content) < 100:
                    # logger.info('Ignore small content, Count ' + str(count))
                    continue
                if content not in duplicated_doc:
                    duplicated_doc[content] = True
                    index += 1
                    # logger.info(nltk.word_tokenize(content.lower()))
                    yield TaggedDocument(words=nltk.word_tokenize(
                        content.lower()),
                                         tags=['news_' + str(doc['id'])])

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
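
The two generators above only produce gensim TaggedDocument objects; they are presumably meant to be fed into a Doc2Vec model. A minimal consumption sketch follows (the hyperparameters and the model path are illustrative assumptions, not taken from the source):

from gensim.models.doc2vec import Doc2Vec

# Doc2Vec iterates over the corpus several times, so the generator is materialized first.
corpus = list(build_news_tagged())
model = Doc2Vec(vector_size=300, window=8, min_count=5, workers=4)  # hyperparameters are assumptions
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=20)
model.save('./models/news_doc2vec.model')  # hypothetical output path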
Example 3
def write_web_news():
    """Write the text of each crawled news article to its own .txt file, grouped by source domain."""
    logger = thesis_logging.get_logger('preprocess')

    link_list = [
        'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs', 'wapo.st',
        'abcn.ws', 'nbcbay.com', 'bbc.in', 'huff.to', 'ti.me', 'cbsn.ws',
        'huffingtonpost.com', 'cnb.cx', 'cnnmon.ie', 'huffp.st', 'forbes.com',
        'telegraph.co', 'cnn.com', 'trib.al', 'express.co', 'gu.com',
        'bloom.bg', 'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost',
        'news.sky.com'
    ]
    for source in link_list:
        # latest = ObjectId("59abbfedf296532f80d18a47")  # dyncorp
        # latest = ObjectId("59abc7e2f296532ad483f4b6")  # lds
        # latest = ObjectId("59acc20df296533c88dbaed6")  # tm
        latest = ObjectId("5942946efe43ad1da80b1a79")  # news
        index = 0
        path_file = './datasets/insensitive/news/' + source.replace('.',
                                                                    '_') + '_'
        train_collection = news_database.news_collection()
        duplicated_doc = {}
        while True:
            documents = train_collection.find({
                '_id': {
                    '$gt': latest
                },
                'reference': {
                    '$regex': '.*' + source + '.*'
                }
            })
            if documents.count() == 0:
                break
            for doc in documents:
                try:
                    latest = doc['_id']
                    if not doc.get('text'):
                        # print('Ignore', 'Count ' + str(count), 'Id ' + str(doc['id']), str(doc['created_at']),
                        #      doc['reference'])
                        continue
                    content = doc['text']
                    if len(content) < 1000:
                        # logger.info('Ignore small content, Count ' + str(count))
                        continue
                    title = doc['title']
                    if len(title) > 60:
                        title = title[0:60]
                    title = "".join(x for x in title if x.isalnum())
                    if content not in duplicated_doc:
                        duplicated_doc[content] = True
                        index += 1
                        # logger.info(nltk.word_tokenize(content.lower()))
                        with open(path_file + title + '.txt',
                                  'w',
                                  encoding="utf-8") as doc_file:
                            doc_file.write(doc['text'])

                except Exception as e:
                    logger.error(doc['reference'] + ' : ' + str(e))
        print(source, index)
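
Several examples call helper modules (database, news_database, train_database / train_collection) that are not shown. A plausible sketch of such a helper, assuming pymongo 3.x (the examples rely on Cursor.count(), which was removed in pymongo 4) and a local MongoDB instance; the URI, database name and collection names are assumptions except where the examples themselves reference them:

# database.py -- hypothetical helper assumed by the examples
from pymongo import MongoClient

_client = MongoClient('mongodb://localhost:27017')  # connection URI is an assumption
_db = _client['thesis']                             # database name is an assumption


def news_collection():
    # crawled tweets/articles, queried by 'created_at', '_id' and 'reference'
    return _db['news']


def location_collection():
    # per-place counters upserted by locate_feeds
    return _db['locations']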
Example 4
from twitter_dev import *
from spider import *
import os
import warnings
import thesis_logging
from threading import Thread
from thesis_nlp.convert import NewsConverter

warnings.filterwarnings("ignore", category=DeprecationWarning)
logger = thesis_logging.get_logger()

if not os.path.exists(DATA_FOLDER):
    try:
        if not TwitterDev.prompt_init():
            exit()
    except Exception as e:
        logger.error('Finish thesis: ' + str(e))
        exit(1)
# new filter: 1498356805
# old filter: 1497997330
# this block was partially redacted in the source; reconstructed minimally
username = input('Username: ')
if username:
    logger.info('User ' + username + " requests authentication")
    twitter_dev = TwitterDev(DATA_FOLDER + '/' + username)
    news_converter = NewsConverter()
    crawler = Thread(target=crawl_feeds,
                     args=(twitter_dev, 600000))  #milliseconds 1497987747
    locator = Thread(target=locate_feeds, args=(news_converter, 1499792848))
    crawler.start()
    locator.start()
    crawler.join()
    locator.join()
Example 5
def locate_feeds(
    news_converter: NewsConverter,
    latest: int = 0,
):
    """Fetch crawled tweets, download each linked article, and store its text, vector and extracted places."""
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = database.news_collection()

    class VectorConverter(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.vector = []

        def run(self):
            self.vector = news_converter.convert_doc_to_vector(
                self.text).tolist()

    class GeographyExtractor(Thread):
        def __init__(self, text):
            super().__init__()
            self.text = text
            self.places = []
            self.people = []
            self.organs = []

        def run(self):
            context = geograpy.get_place_context(text=self.text)
            self.places = context.places
            self.people = context.people
            self.organs = context.organs

    class PageParser(Thread):
        def __init__(self, tweet_id, url, collection):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url
            self.collection = collection

        def run(self):
            try:
                logger.info('Parse ' + self.url)
                article = Article(self.url)
                article.download()
                if article.download_exception_msg and "404" in article.download_exception_msg:
                    logger.error('404 not found, delete... ' + self.url)
                    news_collection.remove({"id": self.tweet_id})
                    return
                article.parse()
                ignore_list = [
                    "twitter.com", "youtube.com", "facebook.com",
                    "instagram.com"
                ]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return
                logger.info('Title for ' + article.top_image + '  -  ' +
                            article.canonical_link + '\n' + article.title +
                            '\n\n')
                logger.info('Latest: ' + str(latest))
                vector_converter = VectorConverter(article.text)
                geography_extractor = GeographyExtractor(article.text)
                vector_converter.start()
                geography_extractor.start()
                geography_extractor.join()
                vector_converter.join()
                vector = vector_converter.vector
                news_collection.update_one({'id': self.tweet_id}, {
                    '$set': {
                        'places': geography_extractor.places,
                        'people': geography_extractor.people,
                        'organs': geography_extractor.organs,
                        'vector': vector,
                        'title': article.title,
                        'text': article.text,
                        'image': article.top_image
                    }
                })
                for place in geography_extractor.places:
                    self.collection.update_one({'place': place},
                                               {'$inc': {
                                                   'count': 1
                                               }},
                                               upsert=True)

            except Exception as e:
                logger.error(str(e))

    location_collection = database.location_collection()
    duplicate_urls = {}
    tasks = []
    while True:
        documents = news_collection.find({
            'created_at': {
                '$gte': latest
            }
        }).limit(100)
        logger.info('Found ' + str(documents.count()) + ' after ' +
                    str(latest))

        # Clean up remaining tasks
        if len(tasks) != 0:
            logger.info('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()

        # the $gte query always re-matches the last processed document,
        # so a count of 1 means nothing new has arrived yet
        if documents.count() == 1:
            if crawler_finish:
                break
            logger.warning('Nap and back in 500 seconds')
            time.sleep(500)
            continue

        logger.info('Start Locating')
        index = 0

        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['created_at']
                image = doc.get('image')

                if image is not None:
                    logger.info('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    logger.info('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue

                thread = PageParser(doc['id'], ref, location_collection)
                tasks.append(thread)
                thread.start()
                time.sleep(7)
                index += 1
                if index % 5 == 0:
                    for task in tasks:
                        task.join()
                    tasks.clear()

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))
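
NewsConverter.convert_doc_to_vector, used by the VectorConverter thread above, is not defined in these examples. A sketch of a plausible implementation, assuming it wraps a trained Doc2Vec model and tokenizes the same way the generators do (the class internals and model path are assumptions):

import nltk
from gensim.models.doc2vec import Doc2Vec


class NewsConverter:
    """Hypothetical reconstruction: embeds article text with a trained Doc2Vec model."""

    def __init__(self, model_path='./models/news_doc2vec.model'):  # path is an assumption
        self.model = Doc2Vec.load(model_path)

    def convert_doc_to_vector(self, text):
        # infer_vector returns a numpy array, which matches the .tolist() call in VectorConverter.run
        tokens = nltk.word_tokenize(text.lower())
        return self.model.infer_vector(tokens)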
Example 6
def crawl_feeds(dev: TwitterDev, duration: int = 0):
    """Stream tweets from followed accounts and store links pointing at known news domains in MongoDB."""
    global crawler_finish
    logger = thesis_logging.get_logger('crawler')
    while True:
        try:
            if dev is None:
                logger.error('There is no Twitter developer account detected.')
                return
            news_collection = database.news_collection()
            logger.info('ok')
            user_id = dev.api.VerifyCredentials()
            logger.info('Twitter Auth: ' + str(user_id.AsJsonString()))
            friends = dev.api.GetFriendIDs(user_id, stringify_ids=True)
            logger.info('Friends: ' + str(friends))
            logger.info('Start crawling')
            start = int(round(time.time()) * 1000)
            link_list = [
                'cnn.it', 'nyti.ms', 'nbcnews', 'apne.ws', 'reut.rs',
                'wapo.st', 'abcn.ws', 'nbcbay.com', 'bbc.in', 'huff.to',
                'ti.me', 'cbsn.ws', 'huffingtonpost.com', 'cnb.cx',
                'cnnmon.ie', 'huffp.st', 'forbes.com', 'telegraph.co',
                'cnn.com', 'trib.al', 'express.co', 'gu.com', 'bloom.bg',
                'hill.cm', 'natgeo.com', 'pbs.org', 'washingtonpost',
                'news.sky.com'
            ]
            ignore_list = [
                'bit.ly',
                'twitter',
                'tinyurl',
                'goo.gl',
                'facebook.com',
            ]
            duplicate_urls = {}
            for status in dev.api.GetStreamFilter(follow=friends):
                urls = status['entities']['urls']
                if len(urls) == 0:
                    continue
                url = urls[0]['expanded_url']

                if url is None:
                    continue

                if not any(x in url for x in link_list):
                    logger.info('Skip link ' + url)
                    continue

                if news_collection.find({'reference': url}).count() > 0:
                    logger.info('Skip duplicated ' + url)
                    continue

                timestamp = int(
                    time.mktime(
                        time.strptime(status['created_at'],
                                      '%a %b %d %H:%M:%S +0000 %Y')))

                document = {
                    'id': status['id'],
                    'created_at': timestamp,
                    'reference': url
                }
                news_collection.insert_one(document)
                logger.info('Insert ' + url + '  created at ' + str(timestamp))
                # if duration != 0 and int(round(time.time()) * 1000) - start > duration:
                #    break
        except Exception as e:
            logger.error(e)
        finally:
            # crawler_finish = True
            logger.info('Finish crawling')
            logger.info('Sleeping 5s to start again...')
            time.sleep(5)
Example 7
def locate_feeds(latest=ObjectId("5950c589f296532a806e4f31")):
    """Training-collection variant: download each article and store its title, text and image (the vector is left as a placeholder)."""
    global crawler_finish
    logger = thesis_logging.get_logger('locator')
    news_collection = train_database.train_collection()

    class PageParser(Thread):
        def __init__(self, tweet_id, url):
            super().__init__()
            self.tweet_id = tweet_id
            self.url = url

        def run(self):
            try:
                print('Parse ' + self.url)
                article = Article(self.url)
                article.download()

                # if article.download_exception_msg and "404" in article.download_exception_msg:
                #     logger.error('404 not found, delete... ' + self.url)
                #     news_collection.remove({"id": self.tweet_id})
                #     return
                # if article.download_exception_msg and "410" in article.download_exception_msg:
                #     logger.error('410 client error, delete... ' + self.url)
                #     news_collection.remove({"id": self.tweet_id})
                #     return
                article.parse()
                ignore_list = ["twitter.com", "youtube.com", "facebook.com", "instagram.com"]
                if any(x in article.canonical_link for x in ignore_list):
                    print('delete ' + article.canonical_link)
                    news_collection.remove({"id": self.tweet_id})
                    return

                print(
                    'Title for ' + article.top_image + '  -  ' + article.canonical_link + '\n' + article.title + '\n\n')
                print('Latest: ' + str(latest))

                if news_collection.find({'$or': [{'title': article.title}, {'text': article.text}]}).count() > 0:
                    print('Duplicate, Ignore!')
                    news_collection.remove({"id": self.tweet_id})
                    return

                vector = 0
                news_collection.update_one({'id': self.tweet_id},
                                           {'$set': {
                                                     'vector': vector,
                                                     'title': article.title,
                                                     'text': article.text,
                                                     'image': article.top_image}})
            except Exception as e:
                logger.error(str(e))

    tasks = []
    while True:
        print('Start Locating')
        # documents = news_collection.find({'_id': {'$gt': latest}}).limit(100)
        documents = news_collection.aggregate(
            [{'$match': {'text': {'$exists': False}}}, {'$sample': {'size': 100}}])
        # print('Found ' + str(documents.count()) + ' after ' + str(latest))

        # Clean up remaining tasks
        if len(tasks) != 0:
            print('Cleaning up remaining tasks')
            for task in tasks:
                task.join()
            tasks.clear()

        # if documents.count() == 0:
        #    break

        index = 0

        for doc in documents:
            try:
                ref = doc['reference']
                latest = doc['_id']
                image = doc.get('image')
                if image is not None:
                    print('image skip')
                    continue
                if news_collection.find({'reference': ref}).count() > 1:
                    print('delete duplicate ' + ref)
                    news_collection.remove({"id": doc['id']})
                    continue

                thread = PageParser(doc['id'], ref)
                tasks.append(thread)
                thread.start()
                time.sleep(8)
                index += 1
                if index % 5 == 0:
                    logger.info('Start to wait')
                    for task in tasks:
                        task.join()
                    logger.info('finish waiting')
                    tasks.clear()

            except Exception as e:
                logger.error(doc['reference'] + ' : ' + str(e))