# NOTE: these callbacks assume `re`, the project's `formatdate` helper, and the
# *_VARS / GENERAL_CATEGORIES constant dicts are imported at the top of the spider module.
def parse_periodista(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[@itemprop="headline"]/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="per-item-page-part per-article-body"]//p/text()'
            '|//div[@class="per-item-page-part per-article-body"]//strong/text()'
            '|//div[@class="per-item-page-part per-article-body"]//p/*/text()'
        ).getall()
        # join the fragments, then mark real spaces (three spaces after the
        # character-level join) and strip the per-character padding
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        url = response.url
        date = response.xpath(
            '//div[@class="col-md-4 per-color-grey per-font-size-md per-padding-top-20"]/text()'
        ).get()
        final_date = formatdate(date)
        # flag to check later on whether the page has videos or photo galleries
        flag = re.search(r"binteo|foto", url)
        # check that we are in an article and that it doesn't have videos
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['SPORT'],
                "website": PERIODISTA_VARS['WEBSITE'],
                "title": re.sub(r'\t|\n|\r', "", title),
                "article_date": final_date,
                "author": PERIODISTA_VARS['WEBSITE'],
                "article_body": re.sub(r'\s\s\s', "", clear_characters),
                "url": url,
            }

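# The join/mark/strip/restore dance above (and in every callback below) collapses the
# per-character spaces introduced by the inner " ".join and squeezes runs of spaces in
# the scraped text; it also silently breaks if the literal word "space" ever occurs in
# an article. A rough, simpler equivalent as a reusable helper -- a sketch only, with a
# hypothetical name that is not part of the original spiders:
def clean_text(fragments):
    """Join xpath text fragments, drop non-breaking spaces, squeeze whitespace."""
    joined = " ".join(fragments).replace("\xa0", " ")
    # collapse any run of whitespace (spaces, tabs, newlines) to a single space
    return re.sub(r"\s+", " ", joined).strip()
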
def parse_cnn(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[@class="story-title"]/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath('//p/text()').getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = re.sub(
            r'\n|\t', "",
            response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get())
        final_date = formatdate(date)
        url = response.url
        yield {
            "subtopic": GENERAL_CATEGORIES['SPORT'],
            "website": CNN_VARS['WEBSITE'],
            "title": title,
            "article_date": final_date,
            "author": re.sub(
                r'\n|\t', "",
                response.xpath('//div[@class="story-author"]/text()').get()),
            "article_body": re.sub(r'\n', "", clear_characters),
            "url": url,
        }

def parse_gazzetta(self, response):
    # check that we are on an article URL
    title = response.xpath('//div[@class="field-item even"]/h1/text()').get()
    if title is not None:
        # extract the subtopic by splitting the URL on '/'
        # and keeping the third item of the resulting list
        url = response.url
        subtopic = url.split('/')[3]
        temp = response.xpath('//span[@itemprop="name"]/text()').get()
        # check whether there is an author
        if isinstance(temp, str):
            author = re.fullmatch(r'\W+', temp)
            if author is None:
                author = temp
            else:
                author = GAZZEETTA_VARS['WEBSITE']
        else:
            author = response.xpath('//h3[@class="blogger-social"]/a/text()').get()
        date = response.xpath('//div[@class="article_date"]/text()').get()
        final_date = formatdate(date)
        yield {
            "subtopic": subtopic,
            "website": GAZZEETTA_VARS['WEBSITE'],
            "title": title,
            "article_date": final_date,
            "author": author,
            "article_body": response.xpath(
                '//div[@itemprop="articleBody"]//p/text()|//p/a/text()|//p/strong/text()'
            ).getall(),  # |//div[@itemprop="articleBody"]//p/a/text()|div[@itemprop="articleBody"]//p/strong/text()
            "url": url,
        }

def parse_iefimerida(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1/span/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="field--name-body on-container"]//p/text()'
            '|//div[@class="field--name-body on-container"]/strong/text()'
            '|//div[@class="field--name-body on-container"]//p/*/text()'
            '|//div[@class="field--name-body on-container"]//li/text()'
            '|//div[@class="field--name-body on-container"]//h2/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        date = response.xpath('//span[@class="created"]/text()').get()
        final_date = formatdate(date)
        # check that we are in an article and that it doesn't have any images
        if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['FOOD'],
                "website": IEFIMERIDA_VARS['AUTHOR'],
                "title": title,
                "article_date": final_date,
                "author": IEFIMERIDA_VARS['AUTHOR'],
                "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                "url": url,
            }

def parse_thetoc(self, response):
    # check that we are on an article URL
    title = response.xpath('//div[@class="article-title"]//h1/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="article-content articleText"]//p/text()'
            '|//div[@class="article-content articleText"]//strong/text()'
            '|//div[@class="article-content articleText"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//span[@class="article-date"]/text()').get()
        final_date = THETOC_VARS['full_date'] + formatdate(date)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        # check that we are in an article and that it doesn't have images
        # (compare the cleaned text, not the raw fragment list, against the length threshold)
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['SPORT'],
                "website": THETOC_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": re.sub(
                    r'\n|\t', "",
                    response.xpath('//div[@class="author-social"]//h5/a/span[2]/text()').get()),
                "article_body": re.sub(r'\n|\t', "", clear_characters),
                "url": url,
            }

def parse_item(self, response):
    # get the article's text
    text = response.xpath(
        '//div[@class="story-content"]//p/text()'
        '|//div[@class="story-content"]//strong/text()'
        '|//div[@class="story-content"]//a/text()'
    ).getall()
    title = response.xpath('//h1[@class="story-title"]/text()').get()
    list_to_string = " ".join(" ".join(text))
    markspaces = re.sub("   ", "space", list_to_string)
    uneeded_spaces = re.sub(" ", "", markspaces)
    final_text = re.sub("space", " ", uneeded_spaces)
    url = response.url
    article_type = url.split('/')[5]
    # used below to make sure the article isn't a photo gallery
    contains_photos = re.search('Photos', final_text)
    date = response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get()
    final_date = formatdate(date)
    if article_type == CNN_VARS['ARTICLE_TYPE'] and contains_photos is None:
        yield {
            "subtopic": GENERAL_CATEGORIES['STYLE'],
            "website": CNN_VARS['WEBSITE'],
            "title": title,
            "article_date": final_date,
            "author": re.sub(
                r'\n|\t', "",
                response.xpath('//div[@class="story-author"]/text()').get()),
            "article_body": re.sub(r'\n|\t', "", final_text),
            "url": url,
        }

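# Several callbacks pick a fixed segment out of url.split('/') (e.g. [3] or [5]), which
# silently shifts if the scheme or a trailing slash changes. A sketch of a safer lookup
# with the standard library; path_segment is a hypothetical helper, not part of the
# original spiders (url.split('/')[3] corresponds to path_segment(url, 0)):
from urllib.parse import urlparse

def path_segment(url, index, default=None):
    """Return the index-th non-empty path segment of a URL, or `default` if missing."""
    segments = [part for part in urlparse(url).path.split('/') if part]
    return segments[index] if index < len(segments) else default
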
def parse_in(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[@class="entry-title black-c"]/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="main-content pos-rel article-wrapper"]//p/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//strong/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        date = response.xpath('//time/text()').get()
        final_date = formatdate(date)
        # check that we are in an article and that it doesn't have images
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['SPORT'],
                "website": IN_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": response.xpath('//span[@class="vcard author"]//a/text()').get(),
                "article_body": re.sub(r'\s\s\s', "", clear_characters),
                "url": url,
            }

def parse_sport24(self, response):
    # check that we are on an article URL
    title = response.xpath('//div[@class="storyContent"]/h1/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@itemprop="articleBody"]//p/text()'
            '|//div[@itemprop="articleBody"]//h3/text()'
            '|//div[@itemprop="articleBody"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        clear_escape = re.sub(r'\n|\t', "", clear_characters)
        date = response.xpath('//span[@class="byline_date"]/b/text()').get()
        final_date = formatdate(date)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_escape)
        url = response.url
        subtopic = url.split('/')[3]
        if flag is None:
            yield {
                "subtopic": subtopic,
                "website": SPORT24_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": response.xpath('//span[@class="byline_author"]/b/text()').get(),
                "article_body": clear_escape,
                "url": url,
            }

def parse_reader_item(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="article-summary"]//p/text()'
            '|//div[@class="article-body"]//p/text()'
            '|//div[@class="article-body"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//time/text()').get()
        final_date = formatdate(date)
        author = response.xpath('//p[@class="article-author"]/a/text()').get()
        if author is not None:
            author = re.sub("\xa0", "", author)
        else:
            author = READER_VARS['AUTHOR']
        url = response.url
        yield {
            "subtopic": GENERAL_CATEGORIES['SPORT'],
            "website": READER_VARS['AUTHOR'],
            "title": re.sub(r'\n|\t', "", title),
            "article_date": final_date,
            "author": author,
            "article_body": re.sub(r'\n|\t', "", clear_characters),
            "url": url,
        }

def parse_newpost(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[@class="article-title"]/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="article-main clearfix"]//p/text()'
            '|//div[@class="article-main clearfix"]//li/text()'
            '|//div[@class="article-main clearfix"]//p/a/strong/text()'
            '|//div[@class="article-main clearfix"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        date = response.xpath('//small[@class="article-created-time"]/text()').get().split('/')[0]
        final_date = formatdate(date)
        # check that we are in an article and that it doesn't have any images
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['FOOD'],
                "website": NEWPOST_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": NEWPOST_VARS['WEBSITE'],
                "article_body": re.sub(r'\s\s\s', "", clear_characters),
                "url": url,
            }

def parse_thepressproject(self, response):
    # check that we are on an article URL
    title = response.xpath(
        '//h1[@class="entry-title"]/text()|//h1[@class="entry-title"]/*/text()'
    ).get()
    if title is not None:
        # check that we are in the correct category
        sub = response.xpath('//div[@class="article-categories"]/a/text()').get()
        if sub == PRESSPROJECT_VARS['CATEGORY_WORLD']:
            # check that this article isn't a video article
            video_article = response.xpath('//i[@class="title-icon video-icon fab fa-youtube"]').get()
            if video_article is None:
                # fix the title's format
                list_to_string = " ".join(" ".join(title))
                no_whites = re.sub(r'\t|\n', "", list_to_string)
                markspaces = re.sub("   ", "space", no_whites)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_title = re.sub("space", " ", uneeded_spaces)
                # trim leading/trailing space left over from the marker step
                delete_front_space = re.sub("^ ", "", final_title)
                final_title = re.sub(" $", "", delete_front_space)
                # get the article's text
                text = response.xpath(
                    '//div[@id="maintext"]//p/text()'
                    '|//div[@id="maintext"]//strong/text()'
                    '|//div[@id="maintext"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("   ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)
                # flag to check later on whether we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url
                date = response.xpath('//div[@class="article-date"]/label[1]/text()').get()
                final_date = formatdate(date)
                # check that we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    yield {
                        "subtopic": GENERAL_CATEGORIES['WORLD'],
                        "website": PRESSPROJECT_VARS['AUTHOR'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": PRESSPROJECT_VARS['AUTHOR'],
                        "article_body": re.sub(r'\s\s\s', "", clear_characters),
                        "url": url,
                    }

def parse_popaganda(self, response):
    global popaganda_counter
    # check that we are on an article URL
    title = response.xpath('//h1/text()').get()
    if title is not None and popaganda_counter < 300:
        # check that we are in the correct category
        category = response.xpath('//div[@class="category"]/a/text()').get()
        if category == POPAGANDA_VARS['CATEGORY_CULTURE']:
            # fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
            text = response.xpath(
                '//div[@class="post-content newstrack-post-content"]//p/text()'
                '|//div[@class="post-content newstrack-post-content"]/p/strong/text()'
                '|//div[@class="post-content newstrack-post-content"]//h3/text()'
                '|//div[@class="post-content newstrack-post-content"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            author = response.xpath(
                '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
            ).get()
            if author is None:
                author = POPAGANDA_VARS['WEBSITE']
            date = response.xpath(
                '//div[@class="date"]/text()|//div[@class="fullscreen-date"]/text()'
            ).get()
            final_date = formatdate(date)
            # flag to check later on whether we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            # check that we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                popaganda_counter += 1
                yield {
                    "subtopic": POPAGANDA_VARS['CULTURE'],
                    "website": POPAGANDA_VARS['WEBSITE'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": POPAGANDA_VARS['WEBSITE'],
                    "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                    "url": url,
                }

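# popaganda_counter above (and the other *_counter globals below) caps a spider at 300
# yielded items via module-level state. A sketch of the same cap kept on the spider
# instance instead -- CappedSpiderMixin is a hypothetical name; Scrapy's built-in
# CLOSESPIDER_ITEMCOUNT setting is another way to enforce such a limit:
class CappedSpiderMixin:
    max_items = 300

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.items_scraped = 0

    def under_cap(self):
        """True while the spider may still yield items; bump the count on each yield."""
        return self.items_scraped < self.max_items
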
def parse_naftemporiki(self, response):
    global naftemporiki_counter
    # check that we are on an article URL
    title = response.xpath('//h2[@id="sTitle"]/text()').get()
    if title is not None and naftemporiki_counter < 300:
        # check that we are in the correct category
        subtopic = response.xpath('//span[@itemprop="articleSection"]/text()').get()
        if subtopic == NAFTEMPORIKI_VARS['CATEGORY_ENVIRONMENT']:
            # fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
            # get the article's text
            text = response.xpath(
                '//div[@class="entityMain article"]//p/text()'
                '|//div[@class="entityMain article"]/p/strong/text()'
                '|//div[@class="entityMain article"]//h3/text()'
                '|//div[@class="entityMain article"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            date = response.xpath('//div[@class="Date"]/text()').get()
            final_date = formatdate(date)
            # flag to check later on whether we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            # check that we are in an article and that it doesn't have images
            if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                naftemporiki_counter += 1
                yield {
                    "subtopic": response.xpath('//div[@class="Breadcrumb"]/a[2]/text()').get(),
                    "website": NAFTEMPORIKI_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": NAFTEMPORIKI_VARS['AUTHOR'],
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }

def parse_efsyn(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[1]/text()').get()
    if title is not None:
        # check that we are in the correct category
        subtopic = response.xpath('//article/a/@href').get()
        category = subtopic.split('/')[1]
        if category == EFSYN_VARS['CATEGORY_WORLD']:
            # fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
            # get the article's text
            text = response.xpath(
                '//div[@class="article__body js-resizable"]//p/text()'
                '|//div[@class="article__body js-resizable"]/p/strong/text()'
                '|//div[@class="article__body js-resizable"]//h3/text()'
                '|//div[@class="article__body js-resizable"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            # get the author
            author = response.xpath('//div[@class="article__author"]//a/text()').get()
            if author is None:
                author = response.xpath('//div[@class="article__author"]/span/text()').get()
            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)
            # flag to check later on whether we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            # check that we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": EFSYN_VARS['WORLD'],
                    "website": EFSYN_VARS['WEBSITE'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                    "url": url,
                }

def parse_kathimerini(self, response):
    # check that we are on an article URL
    title = response.xpath('//h2[@class="item-title"]/text()').get()
    if title is not None:
        # fix the title's format
        list_to_string = " ".join(" ".join(title))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        put_spaces_back = re.sub("space", " ", uneeded_spaces)
        final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
        text = response.xpath(
            '//div[@class="freetext"]//p/text()'
            '|//div[@class="freetext"]//strong/text()'
            '|//div[@class="freetext"]//h3/text()'
            '|//div[@class="freetext"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//time/text()').get()
        final_date = formatdate(date)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        author = response.xpath('//span[@class="item-author"]/a/text()').get()
        if author == KATHIMERINI_VARS['CATEGORY_AUTHOR']:
            author = KATHIMERINI_VARS['AUTHOR']
        # check that we are in an article and that it doesn't have images
        if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": response.xpath('//span[@class="item-category"]/a/text()').get(),
                "website": KATHIMERINI_VARS['AUTHOR'],
                "title": final_title,
                "article_date": final_date,
                "author": author,
                "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                "url": url,
            }

def parse_lifo(self, response):
    global lifo_counter
    # check that we are on an article URL
    title = response.xpath(
        '//h1[@itemprop="headline"]/text()'
        '|//meta[@itemprop="headline"]/text()'
        '|//h1/*/text()'
    ).get()
    if title is not None and lifo_counter < 300:
        # fix the title's format
        list_to_string = " ".join(" ".join(title))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        put_spaces_back = re.sub("space", " ", uneeded_spaces)
        final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
        date = response.xpath('//time/text()').get()
        final_date = formatdate(date)
        # get the article's text
        text = response.xpath(
            '//div[@class="clearfix wide bodycontent"]//p/text()'
            '|//div[@class="clearfix wide bodycontent"]/p/strong/text()'
            '|//div[@class="clearfix wide bodycontent"]//h3/text()'
            '|//div[@class="clearfix wide bodycontent"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        author = response.xpath(
            '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
        ).get()
        if author is None:
            author = LIFO_VARS['AUTHOR']
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        # check that we are in an article and that it doesn't have images
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            lifo_counter += 1
            yield {
                "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                "website": LIFO_VARS['AUTHOR'],
                "title": final_title,
                "article_date": final_date,
                "author": author,
                "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                "url": url,
            }

def parse_topontiki(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1/text()').get()
    if title is not None:
        sub = response.xpath('//h2/a[1]/text()').get()
        # check that we are in the correct category
        if sub == TOPONTIKI_VARS['CATEGORY_CULTURE']:
            # fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
            text = response.xpath(
                '//div[@class="field-item even"]//p/text()'
                '|//div[@class="field-item even"]//p/*/text()'
                '|//div[@class="field-item even"]//p//span/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = final_text.replace("\xa0", "")
            date = response.xpath('//span[@class="date"]/text()').get()
            final_date = formatdate(date)
            # flag to check later on whether we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            # check that we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['CULTURE'],
                    "website": TOPONTIKI_VARS['WEBSITE'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": response.xpath('//a[@class="author"]/text()').get(),
                    "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                    "url": url,
                }

def parse_protagon(self, response):
    global protagon_counter
    # check that we are on an article URL
    title = response.xpath('//h1[@class="entry-title"]/text()').get()
    if title is not None and protagon_counter < 300:
        # check that we are in the correct category
        sub = response.xpath('//span[@class="s_roumpr"]/a/text()').get()
        if sub == PROTAGON_VARS['CATEGORY_ECONOMICS']:
            text = response.xpath(
                '//div[@class="left-single-column "]//p/text()'
                '|//div[@class="left-single-column "]//strong/text()'
                '|//div[@class="left-single-column "]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("   ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            # flag to check later on whether we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            # capture the author's name as two word groups
            author = re.findall(
                r"(\w+).(\w+)",
                response.xpath('//strong[@class="generalbold uppercase"]/a/text()').get())
            list_to_tuple = author[0]
            author = ' '.join(list_to_tuple)
            date = response.xpath('//span[@class="generalight uppercase"]/text()').get()
            final_date = formatdate(date)
            # check that we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                protagon_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                    "website": PROTAGON_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s', "", clear_characters),
                    "url": url,
                }

def parse_cnn(self, response):
    global cnn_counter
    # check that we are on an article URL
    title = response.xpath('//h1[@class="story-title"]/text()').get()
    if title is not None and cnn_counter < 300:
        # get the article's text
        text = response.xpath(
            '//div[@class="story-content"]//p/text()'
            '|//div[@class="story-content"]//strong/text()'
            '|//div[@class="story-content"]//a/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = re.sub(
            r'\n|\t', "",
            response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get())
        final_date = formatdate(date)
        url = response.url
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']:
            cnn_counter += 1
            yield {
                "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                "website": CNN_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": re.sub(
                    r'\n|\t', "",
                    response.xpath('//div[@class="story-author"]/text()').get()),
                "article_body": re.sub(r'\n|\t', "", clear_characters),
                "url": url,
            }

def parseItemPS(self, response):
    global cnn_counter
    title = response.xpath('//h1[@class="story-title"]/text()').get()
    text = response.xpath(
        '//div[@class="story-content"]//p/text()'
        '|//div[@class="story-content"]//strong/text()'
        '|//div[@class="story-content"]//a/text()'
    ).getall()
    list_to_string = " ".join(" ".join(text))
    markspaces = re.sub("   ", "space", list_to_string)
    uneeded_spaces = re.sub(" ", "", markspaces)
    final_text = re.sub("space", " ", uneeded_spaces)
    clear_characters = re.sub("\xa0", "", final_text)
    date = response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get()
    final_date = formatdate(date)
    # check that this is an article and not a photo gallery
    url = response.url
    article_type = url.split('/')[5]
    contains_photos = re.search('Photos', clear_characters)
    if article_type == CNN_VARS['ARTICLE_TYPE'] and contains_photos is None and cnn_counter < 300:
        cnn_counter += 1
        yield {
            "subtopic": GENERAL_CATEGORIES['CULTURE'],
            "website": CNN_VARS['WEBSITE'],
            "title": title,
            "article_date": final_date,
            "author": re.sub(
                r'\n|\t', "",
                response.xpath('//div[@class="story-author"]/text()').get()),
            "article_body": re.sub(r'\n|\t', "", clear_characters),
            "url": url,
        }

def parse_tanea(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1[@class="entry-title black-c"]/text()').get()
    if title is not None:
        # fix the title's format
        list_to_string = " ".join(" ".join(title))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        put_spaces_back = re.sub("space", " ", uneeded_spaces)
        final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
        text = response.xpath(
            '//div[@class="main-content pos-rel article-wrapper"]//p/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//strong/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//h3/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//span[@class="firamedium postdate updated"]/text()').get()
        final_date = formatdate(date)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        subtopic = url.split('/')[7]
        if len(subtopic) > 15:
            subtopic = TANEA_VARS['CATEGORY_CULTURE']
        # check that we are in an article and that it doesn't have images
        if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['CULTURE'],
                "website": TANEA_VARS['AUTHOR'],
                "title": final_title,
                "article_date": final_date,
                "author": TANEA_VARS['AUTHOR'],
                "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                "url": url,
            }

def parse_insomnia(self, response):
    # check that we are on an article URL
    title = response.xpath('//div[@class="container"]//h1/text()').get()
    if title is not None:
        # get the article's text
        text = response.xpath(
            '//div[@class="the-content"]//p/text()'
            '|//div[@class="the-content"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        # find the subtopic through the URL
        url = response.url
        subtopic = url.split('/')[4]
        date = response.xpath('//span[@class="timestamp"]/text()').get()
        final_date = formatdate(date)
        # check that we are in an article and that it doesn't have images
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": subtopic,
                "website": INSOMNIA_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": response.xpath('//span[@class="author"]/a/text()').get(),
                "article_body": re.sub(r'\s\s\s|\n|\t', "", clear_characters),
                "url": response.url,
            }

def parse_thetoc(self, response):
    global thetoc_counter
    # check that we are on an article URL
    title = response.xpath('//div[@class="article-title"]//h1/text()').get()
    if title is not None and thetoc_counter < 300:
        text = response.xpath(
            '//div[@class="article-content articleText"]//p/text()'
            '|//div[@class="article-content articleText"]//strong/text()'
            '|//div[@class="article-content articleText"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//span[@class="article-date"]/text()').get()
        final_date = THETOC_VARS['full_date'] + formatdate(date)
        url = response.url
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']:
            thetoc_counter += 1
            yield {
                "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                "website": re.search(r"www.+\.gr", url).group(0),
                "title": title,
                "article_date": final_date,
                "author": re.sub(
                    r'\n|\t', "",
                    response.xpath('//div[@class="author-social"]//h5/a/span[2]/text()').get()),
                "article_body": re.sub(r'\n|\t', "", clear_characters),
                "url": url,
            }

def parse_tovima(self, response):
    global tovima_counter
    # check that we are on an article URL
    title = response.xpath('//h1[@class="entry-title thirty black-c zonabold"]/text()').get()
    if title is not None and tovima_counter < 300:
        # fix the title's format
        list_to_string = " ".join(" ".join(title))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        put_spaces_back = re.sub("space", " ", uneeded_spaces)
        final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)
        text = response.xpath(
            '//div[@class="main-content pos-rel article-wrapper"]//p/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//strong/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//h3/text()'
            '|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)
        date = response.xpath('//time/span/text()').get()
        final_date = formatdate(date)
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        url = response.url
        # check that we are in an article and that it doesn't have any images
        if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            tovima_counter += 1
            yield {
                "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                "website": TOVIMA_VARS['AUTHOR'],
                "title": final_title,
                "article_date": final_date,
                "author": TOVIMA_VARS['AUTHOR'],
                "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                "url": url,
            }

def parse_popaganda(self, response):
    # check that we are on an article URL
    title = response.xpath('//h1/text()').get()
    if title is not None:
        # fix the title's format
        list_to_string = " ".join(" ".join(title))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        put_spaces_back = re.sub("space", " ", uneeded_spaces)
        uneeded_escapes = re.sub(r'\n|\s\s\s', "", put_spaces_back)
        final_title = re.sub("\xa0", "", uneeded_escapes)
        # get the article's text
        text = response.xpath(
            '//div[@class="post-content big nxContent"]//p/text()'
            '|//div[@class="post-content big nxContent"]//strong/text()'
            '|//div[@class="post-content big nxContent"]//span/*/text()'
            '|//div[@class="post-content big nxContent"]//em/text()'
            '|//div[@class="post-content big nxContent"]//p/*/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("   ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub(r'\s\s\s|\n', "", final_text)
        date = response.xpath(
            '//div[@class="article_date"]/text()|//div[@class="fullscreen-date"]/text()'
        ).get()
        final_date = formatdate(date)
        author = response.xpath(
            '//div[@class="author-title"]/a/text()'
            '|//div[@itemprop="author-title"]/*/text()'
            '|//div[@class="fullscreen-author"]/a/text()'
        ).get()
        if author is None:
            author = POPAGANDA_VARS['WEBSITE']
        # flag to check later on whether we have tweets etc.
        flag = re.search(r"@", clear_characters)
        if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
            yield {
                "subtopic": POPAGANDA_VARS['FOOD'],
                "website": POPAGANDA_VARS['WEBSITE'],
                "title": final_title,
                "article_date": final_date,
                "author": re.sub(r'\n', "", author),
                "article_body": clear_characters.replace(" ", "", 1),
                "url": response.url,
            }

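# A sketch of how one of these callbacks is assumed to be wired into a crawler, using
# Scrapy's CrawlSpider rules. The class name, domain and start URL are illustrative,
# not taken from the original project; parse_popaganda is assumed to be defined on
# (or mixed into) this class:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class PopagandaSpider(CrawlSpider):
    name = "popaganda"
    allowed_domains = ["popaganda.gr"]
    start_urls = ["https://popaganda.gr/"]
    rules = (
        # follow internal links and hand every fetched page to the callback,
        # which yields an item only when the page looks like an article
        Rule(LinkExtractor(allow_domains=allowed_domains),
             callback="parse_popaganda", follow=True),
    )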