def filterDocuments():
    # remove the rare words (those appearing at most five times, as computed by
    # getWordCounts) from every document in data.txt and write the result out
    # to filteredData.txt
    docFile = open(path+"/data.txt")
    toWrite = open(path+"/filteredData.txt", 'w')

    lessThanFive = Unpickler(open(path+"/lessFrequentWords.txt")).load()

    i=0
    for line in docFile:
        if i%10000==0: print i
        i+=1

        line = line.decode("utf-8")
        line = line.strip()
        if line:
            cat,doc = line.split("\t")
            words = Utility.getWords(doc)
            filtered = []
            for w in words:
                if w not in lessThanFive:
                    filtered.append(w.encode("utf-8"))

            toWrite.write(cat.encode("utf-8")+"\t"+(" ".join(filtered)))
            toWrite.write("\n")


    docFile.close()
    toWrite.close()


def getWordCounts():
    # count how many times each word appears in data.txt and pickle the set of
    # words that appear at most five times into lessFrequentWords.txt
    docFile = open(path+"/data.txt")
    wCounts = {}
    print "computing word frequencies"
    i=0
    for line in docFile:

        if i%10000==0: print i
        i+=1

        line = line.decode("utf-8")
        line = line.strip()
        if line:
            cat,doc = line.split("\t")
            words = Utility.getWords(doc)
            for w in words:
                wCounts[w] = wCounts.get(w, 0) + 1

    docFile.close()
    lessThanFive = set()
    for w,c in wCounts.items():
        if c<=5:
            lessThanFive.add(w)

    toWrite = open(path+"/lessFrequentWords.txt",'w')
    Pickler(toWrite).dump(lessThanFive)
    toWrite.close()
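

# The two preprocessing steps above have an implicit ordering: getWordCounts()
# has to run first so that lessFrequentWords.txt exists before filterDocuments()
# reads it back. A minimal driver sketch (this helper is not part of the
# original pipeline, it only illustrates the intended order):
def _preprocess_corpus_sketch():
    getWordCounts()      # writes path + "/lessFrequentWords.txt"
    filterDocuments()    # reads it and writes path + "/filteredData.txt"

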
def testNaiveBayes():
    # load the pickled Naive Bayes dictionaries and measure the classification
    # accuracy over the whole dataset
    begin = time.time()

    counter = 0

    fileToRead = open(naivebayes_classification.str_dict_word_in_cat)
    dict_words = Unpickler(fileToRead).load()
    fileToRead.close()

    fileToRead = open(naivebayes_classification.str_dict_cat_count)
    dict_cat_count = Unpickler(fileToRead).load()
    fileToRead.close()

    fileToRead = open(naivebayes_classification.str_dict_priors)
    dict_priors = Unpickler(fileToRead).load()
    fileToRead.close()



    numErrors = 0
    for line in open(Utility.path_dataset):

        parts = line.decode('utf-8').strip().split('\t')
        category = parts[0]
        text = parts[1]
        words = Utility.getWords(text)

        nb_category = get_NB_category(words, dict_words, dict_cat_count, dict_priors)

        if nb_category != category:
            #print 'correct category: %s my category: %s' % (category, nb_category)
            numErrors += 1

        counter += 1

        if counter % 5000 == 0:
            print 'counter: %d\n' % counter


    print 'time: %d\n' % (time.time() - begin)



    print 'accuracy: %f' % (100*(1 - numErrors*1.0/counter))
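

# get_NB_category is defined elsewhere in the project; the helper below is only
# a hedged sketch of the standard multinomial Naive Bayes scoring (log prior +
# Laplace-smoothed log likelihoods) over the three dictionaries that buildDicts
# produces. The name and the add-one smoothing are assumptions here, not
# necessarily what the project's get_NB_category actually does.
def _nb_category_sketch(words, dict_words, dict_cat_count, dict_priors):
    from math import log
    # vocabulary size across all categories, used in the smoothing denominator
    vocabulary = set()
    for cat in dict_words:
        vocabulary.update(dict_words[cat])
    best_cat, best_score = None, None
    for cat in dict_priors:
        score = log(dict_priors[cat])
        for w in words:
            count = dict_words[cat].get(w, 0)
            score += log((count + 1.0) / (dict_cat_count[cat] + len(vocabulary)))
        if best_score is None or score > best_score:
            best_cat, best_score = cat, score
    return best_cat

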
def buildDicts(file_name, save_dicts=False):
    """

    :param file_name:   the file that contains the pairs (category, sentence) where we build the dictionary from
    :param save_dicts:  variable indicating whether to save the dictionaries in files or just just to return them
    :return: returns    dict_cat_count, dict_word_in_cat,
    """
    counter = 0
    # priors (priors[cat] <- how many documents are in category cat)
    priors = {}
    # for a given category as a key, this returns how many words there are as a value
    dict_cat_count = {}
    # for a given category and a word as keys (dict_word_in_cat[category][word]),
    # we get how many times that word has appeared in that category
    dict_word_in_cat = {}


    for line in open(file_name):

        parts = line.decode('utf-8').strip().split('\t')
        category = parts[0]
        text = parts[1]

        priors[category] = 1 + priors.get(category, 0)

        words = Utility.getWords(text)

        dict_cat_count[category] = len(words) + dict_cat_count.get(category, 0)

        dict_word_in_cat.setdefault(category, {})

        for w in words:
            dict_word_in_cat[category][w] = 1 + dict_word_in_cat[category].get(w, 0)


        counter += 1

        if counter % 10000 == 0:
            print 'processed %d news items' % counter


    # turn the per-category document counts into prior probabilities
    for cat in priors:
        priors[cat] = (priors[cat] * 1.0) / counter
        print 'Category %s num words: %d' % (cat, dict_cat_count[cat])


    print 'num unique words: %d' % len(set(w for cat in dict_word_in_cat for w in dict_word_in_cat[cat]))

    if save_dicts:
        print 'Saving the dicts..'
        fileToWrite = open(str_dict_cat_count, 'w')
        Pickler(fileToWrite).dump(dict_cat_count)
        fileToWrite.close()


        fileToWrite = open(str_dict_word_in_cat, 'w')
        Pickler(fileToWrite).dump(dict_word_in_cat)
        fileToWrite.close()

        fileToWrite = open(str_dict_priors, 'w')
        Pickler(fileToWrite).dump(priors)
        fileToWrite.close()
        print 'Saved the dicts!'

    return dict_cat_count, dict_word_in_cat, priors
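

# A tiny worked example of the structures buildDicts returns. The temp file and
# the expected values below are illustrative assumptions (they presume that
# Utility.getWords is a plain tokenizer that keeps every word), not output
# captured from the real Utility.
def _build_dicts_example():
    import tempfile, os
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.txt')
    tmp.write(u'sport\tgood game good\npolitics\tthe vote\n'.encode('utf-8'))
    tmp.close()
    dict_cat_count, dict_word_in_cat, priors = buildDicts(tmp.name)
    # roughly expected, under the tokenizer assumption above:
    #   priors           -> {u'sport': 0.5, u'politics': 0.5}
    #   dict_cat_count   -> {u'sport': 3, u'politics': 2}
    #   dict_word_in_cat -> {u'sport': {u'good': 2, u'game': 1},
    #                        u'politics': {u'the': 1, u'vote': 1}}
    os.unlink(tmp.name)
    return dict_cat_count, dict_word_in_cat, priors

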
def getNewsPosts(source_object, web_page_url, dict_IDF):

    """
    The main function which crawls a particular link and returns the news posts that have been extracted from it as objects.
    Needs revising and (possibly) modifying the process of text extraction.

    :param source_object: the source object that wraps multiple web_page_urls (we need it to create the news post objects)
    :param web_page_url:  the web page url that we extract the information from
    :param dict_IDF:      the idf dictionary that we need to calculate tf_idf for a document
    :return: a list of news posts and a feedback string (for logging)
    """

    #feedback variable for logging
    feedback = ''

    try:
        #opening the url and reading the content
        c = urlopen(web_page_url)
        content = c.read()
        soup = BeautifulSoup(content)

        logging.debug('getNewsPosts: read the content of web_page_url')
        logging.debug('instantiated BeautifulSoup')

        #the list of object that we are going to return
        newsPosts = []

        for item in soup.findAll('item'):

            #in each item we have a link to the news that we would like to process

            try:
                #title of the news
                title   = item.find('title').string
                #link to the news
                link_url = item.find('link').string




                feedback += 'title: %s\n' % title
                feedback += 'link_url: %s\n' % link_url

                pub_date = item.find('pubdate')

                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)

                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))

                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    pub_date = date_milli
                    feedback += 'milli: %f\n' % date_milli

                else:
                    pub_date = 0
                    feedback += 'pub_date: None\n'


                same_news_posts = NewsPost.query(NewsPost.url == link_url).fetch()


                #we must not process the same news twice
                if same_news_posts is not None and len(same_news_posts) > 0:
                    feedback += 'There is/are already news post/s with this link. Continuing..\n'
                    feedback += '------------------------------\n'
                    continue


                img_url = None

                #we try to fetch the photo url directly from the rss feed, if not possible we will try later again
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')

                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')

                    if img_obj is not None:
                        img_url =  img_obj['src']


                #here we get the content of the news
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)


                title_words = Utility.getWords(title)
                title_words = filterTitles(title_words, web_page_url)

                # add the title words twice, because we consider the words in the title twice as important as the body words
                total_words = list(title_words)
                total_words.extend(title_words)


                #which paragraphs to take into consideration


                text = ''

                for script in innerSoup(['script', 'style']):
                    script.extract()

                if web_page_url in Utility.fetch_text_specifications:

                    specifications = Utility.fetch_text_specifications[web_page_url]


                    if isinstance(specifications, list): #we take the paragraphs

                        start = specifications[0]
                        end   = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]


                        for p in innerSoup.findAll('p')[start:end]:
                            text += '%s ' % p.text
                    else:

                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']

                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})

                        if 'nested_tag_type' in specifications:
                            #we need to go one level deeper
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)

                            new_sections = []

                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type,{ nested_attr_type:  nested_attr_value}, limit=limit))

                            sections = new_sections



                        for section in sections:
                            text += '%s ' % section.text



                description = text[:100]

                total_words.extend(Utility.getWords(text))

                num_words = len(total_words)


                if num_words < 7:
                    continue

                dict_news = {}
                for word in total_words:
                    dict_news[word] = 1 + dict_news.get(word, 0)




                #we are trying to get the image from the news
                if img_url is None:
                    imgs = innerSoup.findAll('img')

                    img_url = ''
                    if imgs is not None and len(imgs) > 0:
                        img_url = imgs[0]['src']

                #deal with the pictures with relative path to the web
                if (img_url is not None) and (len(img_url) > 0):
                    if img_url.find(source_object.url) != 0:
                        img_url = source_object.url + '/' + img_url


                feedback += 'img_url: %s\n' % img_url


                newsPost = NewsPost(parent=ndb.Key('NewsPost', link_url or "*notitle*"), url = link_url, host_page = web_page_url,
                                    title = title, dictWords = dict_news, numWords = num_words, words = total_words ,
                                    source_id = source_object.id, source_url = source_object.url,
                                    img_url = img_url, pub_date = pub_date, description = description)

                newsPost.calculate_tf_idf(dict_IDF)
                newsPost.put()
                newsPosts.append(newsPost)

                feedback += '------------------------------\n'
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message


        return newsPosts, feedback
    except Exception as inst:

        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message

        #if there is an exception, we return an empty list of news posts
        return [], feedback
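

# Both getNewsPosts above and parse_rss_feed below read the same configuration,
# Utility.fetch_text_specifications, which maps a feed/page url to either a
# paragraph slice or a tag selector. The entry below is a made-up illustration
# of the two shapes the code handles (the urls and values are placeholders,
# not real configuration):
_example_fetch_text_specifications = {
    # list form: take the <p> tags [start:end]; a single element means "from start to the last <p>"
    'http://example.com/rss': [2, 10],
    # dict form: select a tag by attribute, optionally drilling into a nested tag
    'http://example.org/rss': {
        'tag_type': 'div',
        'attribute_type': 'class',
        'attribute_value': 'article-body',
        'nested_tag_type': 'p',
        'nested_attribute_type': 'class',
        'nested_attribute_value': 'text',
        'limit': 50,          # optional, defaults to 1000 in the code above
        'recursive': True,    # optional, only read by parse_rss_feed
    },
}

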
def parse_rss_feed(rss_feed_url, stop_after=None):
    # fetches an rss feed, extracts the text of each item in the same way as
    # getNewsPosts, and returns a feedback string for logging
    feedback = ''
    #model = get_persisted_model()
    _,ind_to_cat = Utility.load_categories()
    #vectorizer = get_persisted_vectorizer()

    #feedback += 'In parse rss feed!!!\n'
    try:
        #opening the url and reading the content
        #feedback += 'trying to open the content %s\n' % rss_feed_url
        c = urlopen(rss_feed_url)
        #feedback += 'Opened the content\n'
        content = c.read()
        #feedback += 'Read the content'
        soup = BeautifulSoup(content)

        #feedback += 'Opened the content\n'
        ind = 0

        tuples = []
        for item in soup.findAll('item'):

            ind += 1
            #feedback += 'Item num: %d\n' % ind

            if (stop_after is not None) and ind > stop_after: break

            try:
                title   = item.find('title').string
                link_url = item.find('link').string
                description_object = item.find('description')

                description = ''

                for p in BeautifulSoup(description_object.string).findAll('p'):
                    description += '%s ' % p.text

                feedback += 'title: %s\n' % title.strip()
                feedback += 'link_url: %s\n' % link_url
                feedback += 'description: %s\n' % description

                pub_date = item.find('pubdate')

                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)

                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))

                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    feedback += 'milli: %f\n' % date_milli

                else:
                    feedback += 'pub_date: None\n'

                text = ''


                img_url = None

                #we try to fetch the photo url directly from the rss feed, if not possible we will try later again
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')

                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')

                    if img_obj is not None:
                        img_url =  img_obj['src']


                #here we get the content of the news
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)


                for script in innerSoup(['script', 'style']):
                    script.extract()


                if rss_feed_url in Utility.fetch_text_specifications:

                    specifications = Utility.fetch_text_specifications[rss_feed_url]


                    if isinstance(specifications, list): #we take the paragraphs

                        start = specifications[0]
                        end   = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]


                        for p in innerSoup.findAll('p')[start:end]:
                            text += p.text
                    else:
                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']

                        #feedback += 'tag_type: %s attr_type: %s attr_value: %s\n' % (tag_type, attr_type, attr_value)

                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})

                        #feedback += 'tags size: %d\n' % len(sections)


                        #if we need to go deeper
                        if 'nested_tag_type' in specifications:
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)
                            recursive = specifications.get('recursive', True)

                            new_sections = []

                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type,{ nested_attr_type:  nested_attr_value}, limit=limit,
                                                                    recursive=recursive))

                            sections = new_sections




                        for section in sections:
                            feedback += 'tag name: %s\n' % section.name
                            text += section.text






                #we are trying to get the image from the news
                if img_url is None:
                    imgs = innerSoup.findAll('img')

                    img_url = ''
                    if imgs is not None and len(imgs) > 0:
                        img_url = imgs[0]['src']



                feedback += 'img_url: %s\n' % img_url
                feedback += 'text: %s\n' % text.strip()

                #X_train = vectorizer.transform([text])
                #cat_ind = model.predict(X_train)
                #feedback +='CATEGORY: %s' % ind_to_cat[cat_ind[0]]

                feedback += '------------------------------\n'

                tuples.append((title,link_url, Utility.getWords(text)))
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message


        feedback += 'Number of posts: %d\n' % ind


        # go from the front: detect how far the fetched documents share the
        # same leading words (e.g. common site boilerplate); `front` is not
        # used further yet

        front = -1
        for ind in xrange(100):

            # a word at position ind is "everywhere" if every document is long
            # enough and they all agree on it
            everywhere = all(len(t[2]) > ind for t in tuples)
            if everywhere:
                for i in xrange(1, len(tuples)):
                    if tuples[i-1][2][ind] != tuples[i][2][ind]:
                        everywhere = False
                        break

            if not everywhere:
                front = 1 + ind
                break


        return feedback
    except Exception as inst:

        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message

        #if there is an exception, we return only the feedback collected so far
        return feedback
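

# Hedged sketch of what the "go from the front" block in parse_rss_feed appears
# to be after: the length of the leading run of words that every fetched
# document shares (for instance shared site boilerplate). The helper name and
# the standalone formulation are assumptions, not part of the original code.
def _common_prefix_length(word_lists, max_check=100):
    # e.g. _common_prefix_length([t[2] for t in tuples])
    if len(word_lists) < 2:
        return 0
    limit = min([len(ws) for ws in word_lists] + [max_check])
    for ind in xrange(limit):
        first = word_lists[0][ind]
        for ws in word_lists[1:]:
            if ws[ind] != first:
                return ind
    return limit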