Example #1
    def scrape(self, url, parent):
        Logger.debug('Starting url scrape for {}'.format(url))
        config.last_url_and_parent = url + ', {}'.format('' if parent is None else parent)

        new_url = base_util.unreplace_dot_url(url)

        response = self.http.get(new_url)
        Logger.debug('Got URL')

        #if the bare "www." address returned no data, retry with an explicit
        #scheme, first without and then with the "www." prefix
        if not hasattr(response, 'data') and new_url.startswith('www.'):
            new_url = new_url.replace('www.', 'http://')
            response = self.http.get(new_url)

            if not hasattr(response, 'data'):
                new_url = new_url.replace('http://', 'http://www.')
                response = self.http.get(new_url)

        if hasattr(response, 'data'):
            body = base_util.utf_8_safe_decode(response.data)
        else:
            Logger.error('No data associated with ' + new_url)
            raise AttributeError(new_url + ':::No data')

        return body, new_url
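Both the scraper above and the preprocessing helpers below rely on base_util.utf_8_safe_decode, which is not shown on this page. A minimal sketch of what it presumably does, assuming it returns a str and signals failure with the AttributeError its callers catch (the name and behaviour are inferred, not confirmed):

def utf_8_safe_decode(data):
    #hypothetical helper: pass str through unchanged, decode bytes as utf-8,
    #and surface failures as AttributeError so callers can fall back
    if isinstance(data, str):
        return data
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError as ex:
        raise AttributeError('data is not valid utf-8') from ex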
Example #2
def preprocess(html_page):
    """
    Preprocess the html page: decode it to utf-8 and strip the tags

    :except AttributeError: caught internally; the raw page is used as a fallback
    :param html_page: the raw html page (bytes or str)
    :return: the page text with the html removed
    """
    try:
        new_html_page = utf_8_safe_decode(html_page)

    except AttributeError:
        print("Failed to convert page to utf-8, reverting to base page")
        new_html_page = html_page

    return remove_html(new_html_page)
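remove_html is likewise external to these snippets. A plausible stand-in built on the standard-library HTMLParser, purely to keep the examples self-contained (the project's real helper may behave differently):

from html.parser import HTMLParser

class _TextExtractor(HTMLParser):
    #collect only the text nodes; tags and attributes are discarded
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def remove_html(html_page):
    parser = _TextExtractor()
    parser.feed(html_page)
    return ' '.join(chunk.strip() for chunk in parser.chunks if chunk.strip())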
Example #3
def preprocess_getParser(html_page):
    """
    Preprocess the html page: decode it to utf-8, strip the tags, and wrap the
    result in a plaintext parser

    :except AttributeError: caught internally; the raw page is used as a fallback
    :param html_page: the raw html page (bytes or str)
    :return: a PlaintextParser built from the tag-free text
    """
    try:
        new_html_page = utf_8_safe_decode(html_page)

    except AttributeError:
        print("Failed to convert page to utf-8, reverting to base page")
        new_html_page = html_page

    pure_txt_page = remove_html(new_html_page)

    return plaintext.PlaintextParser.from_string(pure_txt_page, Tokenizer('english'))
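The Tokenizer('english') call and the PlaintextParser.from_string signature suggest these names come from the sumy summarization library. Assuming that is the case, the returned parser can be fed straight into one of sumy's summarizers, for example:

from sumy.parsers import plaintext
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

parser = preprocess_getParser(raw_html)            #raw_html fetched elsewhere
summarizer = LsaSummarizer()
for sentence in summarizer(parser.document, 5):    #top 5 sentences
    print(sentence)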
Example #4
def collect_bad_url():
    """
    Make bag-of-words representations for the websites in the bad url list

    :return:
    """

    queue = DBQueue_old("genre_bow")

    #don't trust anything
    summarizer = Summarizer()
    bow = BagOfWords()
    short_genre_to_genre = coll.ShortGenre()
    url_to_bow = coll.URLBow()
    start_pos = queue.get()

    for c, line in enumerate(open("bad_url_summarize_bow.txt")):
        if c < start_pos:
            continue

        url = line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url, c))

            url_obj = coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj, "original") or not url_obj["original"]:
                print("Not original")
                continue

            #request the page anyway; most of the bad urls come from failed page fetches
            data = Request().get_data(base_util.unreplace_dot_url(base_util.unreplace_dot_url(url_obj["url"])))

            if data is None:
                raise Exception('url {} has no page'.format(url))
            else:
                #only replace the stored page when the fresh fetch is bigger
                if not hasattr(url_obj, "page") or len(data) > len(url_obj["page"]):
                    print("updating data")
                    data = base_util.utf_8_safe_decode(data)

                    if not hasattr(url_obj, "page"):
                        url_obj.save(page=data)
                    else:
                        url_obj.update(page=data)
                    url_obj.reload()

            if len(data) > len(url_obj.page):
                raise Exception("Inconsistency between data and page data")

            #url_obj=repair.genre_to_genre_data(url_obj.document)

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list = []
            for g in url_obj.genre:
                normalized_string = base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True, add_to_set__genres=g)

            Logger.info("Getting bow rep")
            #get BOW representation of the summarized page
            bow_dict = bow.get_word_count(summarizer.summarize(url_obj.page if isinstance(url_obj.page, str) else base_util.utf_8_safe_decode(url_obj.page)))

            if len(bow_dict) < 20:
                raise Exception("Fewer than 20 words")

            Logger.info("Update count:" + str(bow_dict))

            #store the url bow in the urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"], bow=bow_dict, short_genres=genre_string_list)
            else:
                print('Bow already exists for url number {}'.format(url))

            queue.increment()
        except Exception as ex:
            Logger.error(url_obj['url'] + ":::" + str(ex), "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
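Both this routine and start_bow below depend on BagOfWords.get_word_count, which is defined elsewhere in the project. A minimal sketch of the kind of counting it presumably does, using collections.Counter (the tokenization details are an assumption):

from collections import Counter
import re

class BagOfWords:
    def get_word_count(self, text):
        #lowercase, split on non-letter characters, and count occurrences
        words = re.findall(r"[a-z']+", text.lower())
        return dict(Counter(words))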
Example #5
def start_bow():
    """
    Bag-of-words all webpages in the URLToGenre database

    Pipeline:
    1. Get the genre and page of a URLToGenre object from the mongodb URLQueue
    2. BOW the webpage in the URLToGenre object
    3. Shorten the genre
    4. Insert the bow words under that genre in the CategoricalBow mongodb table

    Repeat until the URLToGenre objects are exhausted

    :return: Nothing!
    """

    queue = DBQueue(None, "summarization")

    #don't trust anything
    summarizer = Summarizer()
    bow = BagOfWords()

    for url_obj in URLToGenre.objects.order_by("ref_index").no_cache():

        try:
            print('New url {}'.format(url_obj.ref_index))

            #skip condition: it is not an original url
            if not hasattr(url_obj, "original") or not url_obj["original"]:
                continue

            #skip condition: it does not have a page
            if not hasattr(url_obj, 'page'):
                raise Exception('url {} has no page'.format(url_obj.ref_index))

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list = []
            for g in url_obj.genre:
                normalized_string = base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)

            genre_string_list = list(set(genre_string_list))

            summarize_logger.info("Getting bow rep")
            #get BOW representation of the summarized page
            bow_dict = bow.get_word_count(summarizer.summarize(url_obj.page if isinstance(url_obj.page, str) else base_util.utf_8_safe_decode(url_obj.page)))

            summarize_logger.info("Update count:" + str(bow_dict))

            if len(bow_dict) == 0:
                raise Exception("No count available")

            #store the url bow in the urlbow table
            if len(Summary.objects(url=url_obj.ref_index)) == 0:
                Summary(url=url_obj.url, ref_index=url_obj.ref_index, attr_map=bow_dict, short_genres=genre_string_list).save()
            else:
                print('Bow already exists for url number {}'.format(url_obj.ref_index))

        except Exception as ex:
            summarize_logger.error(url_obj['url'] + ":::" + str(ex), "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow.txt")
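Step 3 of the pipeline ("Shorten the genre") goes through base_util.normalize_genre_string, which is also not shown here. One way such a helper could work, assuming DMOZ-style genre paths like "Top/Arts/Music/Jazz" (the path format and depth are guesses for illustration only):

def normalize_genre_string(genre, levels=2):
    #keep only the first few path components so closely related genres
    #collapse into the same short genre
    parts = [p for p in genre.strip('/').split('/') if p and p != 'Top']
    return '/'.join(parts[:levels])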