def get_me_some_test_samples(file_path, prob_to_take=1.0):
    """
    :param file_path: the path of the file we load the data from
    :param prob_to_take: the probability with which each news item is taken as a test sample
    :return: returns X_test and Y_test
                X_test is the vectorized (TF-IDF) feature matrix of the selected news texts
                Y_test is the list of class ids, one for each row of X_test
    """

    cat_to_ind, ind_to_cat = Utility.load_categories()

    # load the persisted TF-IDF vectorizer (vectorizer_file_path is expected to be defined at module level)
    vectorizer = get_persisted_vectorizer(vectorizer_file_path)

    X_test = []
    Y_test = []

    for line in open(file_path):
        parts = line.decode(encoding="utf-8").strip().split("\t")
        category = parts[0]
        text = parts[1]

        if random.random() < prob_to_take:
            X_test.append(text)
            Y_test.append(cat_to_ind[category])

    X_test = vectorizer.transform(X_test)
    return X_test, Y_test
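
# A minimal usage sketch (the test file path below is an illustrative assumption):
#
#   X_test, Y_test = get_me_some_test_samples("data/test_set.tsv", prob_to_take=0.5)
#   print "Loaded %d test samples" % X_test.shape[0]
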
def create_model(model_file_name, vectorizer_file_name, training_percentage=0.93, max_iter=300, print_logs=False):
    """
    creating the model for logistic regression and storing it for later use
    :param model_file_name: the name of the file where the model will be stored
    :param vectorizer_file_name: the name of file where the vectorizer will be stored
    :param training_percentage: how much of the training set should we use
    :param max_iter: the maximum number of iterations for training
    :param print_logs: whether we should print the logs (such as time needed for training) or not
    :return: returns the model and the vectorizer
    """

    begin = time.time()
    t0 = time.time()

    # cat_to_ind maps a category name to its id,
    # ind_to_cat maps an id back to its category name
    cat_to_ind, ind_to_cat = Utility.load_categories()

    X_train = []
    Y_train = []

    for line in open(Utility.training_file_path):
        parts = line.decode(encoding="utf-8").strip().split("\t")
        category = parts[0]
        text = parts[1]

        # keep roughly `training_percentage` of the news items for training
        if random.random() < training_percentage:
            X_train.append(text)
            Y_train.append(cat_to_ind[category])

    if print_logs:
        print "Read the dataset for %d seconds" % (time.time() - t0)
    t0 = time.time()

    # build the TF-IDF representation of the training texts
    vectorizer = TfidfVectorizer(encoding="utf-8", lowercase=True, stop_words=Utility.stop_words)
    X_train = vectorizer.fit_transform(X_train)

    if print_logs:
        print "Vectorized the train and test set for %d seconds" % (time.time() - t0)

    classifier = linear_model.LogisticRegression(max_iter=max_iter)

    t0 = time.time()
    classifier.fit(X_train, Y_train)
    if print_logs:
        print "Trained the model for %d seconds" % (time.time() - t0)

    t0 = time.time()

    # persist the trained classifier and the vectorizer so they can be reloaded later
    file_to_write = open(model_file_name, "wb")
    Pickler(file_to_write).dump(classifier)
    file_to_write.close()

    file_to_write = open(vectorizer_file_name, "wb")
    Pickler(file_to_write).dump(vectorizer)
    file_to_write.close()

    if print_logs:
        print "Dumped the items for %d seconds." % (time.time() - t0)

    if print_logs:
        print "Total time: %d seconds" % (time.time() - begin)

    return classifier, vectorizer
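
# A minimal usage sketch for training and persisting the model (the file names
# below are illustrative assumptions):
#
#   model, vec = create_model("news_model.pkl", "news_vectorizer.pkl",
#                             training_percentage=0.9, print_logs=True)
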
def parse_rss_feed(rss_feed_url, stop_after=None):
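    """
    fetches an rss feed and collects a feedback log with the title, link, description,
    image and text of each item
    :param rss_feed_url: the url of the rss feed to parse
    :param stop_after: if given, stop after parsing this many items
    :return: the feedback string
    """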
    feedback = ''
    #model = get_persisted_model()
    _,ind_to_cat = Utility.load_categories()
    #vectorizer = get_persisted_vectorizer()

    #feedback += 'In parse rss feed!!!\n'
    try:
        #opening the url and reading the content
        #feedback += 'trying to open the content %s\n' % rss_feed_url
        c = urlopen(rss_feed_url)
        #feedback += 'Opened the content\n'
        content = c.read()
        #feedback += 'Read the content'
        soup = BeautifulSoup(content)

        #feedback += 'Opened the content\n'
        ind = 0

        tuples = []
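        # iterate over every <item> element of the feed, collecting feedback along the way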
        for item in soup.findAll('item'):

            ind += 1
            #feedback += 'Item num: %d\n' % ind

            if (stop_after is not None) and ind > stop_after: break

            try:
                title   = item.find('title').string
                link_url = item.find('link').string
                description_object = item.find('description')

                description = ''

                for p in BeautifulSoup(description_object.string).findAll('p'):
                    description += '%s ' % p.text

                feedback += 'title: %s\n' % title.strip()
                feedback += 'link_url: %s\n' % link_url
                feedback += 'description: %s\n' % description

                pub_date = item.find('pubdate')

                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)

                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))

                    # epoch is assumed to be a module-level datetime(1970, 1, 1) constant
                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    feedback += 'milli: %f\n' % date_milli

                else:
                    feedback += 'pub_date: None\n'

                text = ''


                img_url = None

                #we try to fetch the photo url directly from the rss feed; if that fails we fall back to the article page later
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')

                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')

                    if img_obj is not None:
                        img_url = img_obj['src']


                #here we get the content of the news
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)


                #remove script and style elements so that only the visible text remains
                for script in innerSoup(['script', 'style']):
                    script.extract()


                if rss_feed_url in Utility.fetch_text_specifications:

                    specifications = Utility.fetch_text_specifications[rss_feed_url]
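                    #a specification is either a paragraph slice such as [2] or [2, 10],
                    #or a dict describing where the article body lives, e.g. (illustrative values only):
                    #  {'tag_type': 'div', 'attribute_type': 'class', 'attribute_value': 'article-body'}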


                    if isinstance(specifications, list): #the list gives a [start] or [start, end] slice of the paragraphs

                        start = specifications[0]
                        end   = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]

                        for p in innerSoup.findAll('p')[start:end]:
                            text += p.text
                    else:
                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']

                        #feedback += 'tag_type: %s attr_type: %s attr_value: %s\n' % (tag_type, attr_type, attr_value)

                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})

                        #feedback += 'tags size: %d\n' % len(sections)


                        #if we need to go deeper
                        if 'nested_tag_type' in specifications:
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)
                            recursive = specifications.get('recursive', True)

                            new_sections = []

                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type,{ nested_attr_type:  nested_attr_value}, limit=limit,
                                                                    recursive=recursive))

                            sections = new_sections




                        for section in sections:
                            feedback += 'tag name: %s\n' % section.name
                            text += section.text






                #if the rss feed did not provide an image, fall back to the first image on the article page
                if img_url is None:
                    imgs = innerSoup.findAll('img')

                    img_url = ''
                    if len(imgs) > 0:
                        img_url = imgs[0]['src']



                feedback += 'img_url: %s\n' % img_url
                feedback += 'text: %s\n' % text.strip()

                #X_train = vectorizer.transform([text])
                #cat_ind = model.predict(X_train)
                #feedback +='CATEGORY: %s' % ind_to_cat[cat_ind[0]]

                feedback += '------------------------------\n'

                #keep (title, link, words) so that shared leading words can be detected below
                tuples.append((title, link_url, Utility.getWords(text)))
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message


        feedback += 'Number of posts: %d\n' % ind


        #go from the front: find the first word position where the posts' word lists differ;
        #shared leading words across all posts are likely site boilerplate (result not used further yet)

        front = -1
        for ind in xrange(100):

            everywhere = True
            for i in xrange(1, len(tuples)):
                if tuples[i - 1][2][ind] != tuples[i][2][ind]:
                    everywhere = False
                    break

            if not everywhere:
                front = 1 + ind
                break

        return feedback
    except Exception as inst:

        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message

        #if there is an exception, we return the feedback collected so far
        return feedback
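
# A minimal usage sketch (the feed url below is an illustrative assumption):
#
#   feedback = parse_rss_feed('http://example.com/rss', stop_after=5)
#   print feedback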