def parse_item_page(self, response):
    """Build a NewsItem from an article page using its Open Graph meta tags."""
    item_data = {
        "title": remove_unicode(
            response.xpath('//meta[@property="og:title"]/@content').extract()[0].strip()),
        # The first and last text nodes of the author span are dropped.
        "author": " ".join(
            response.xpath('//span[@class="author"]//text()').extract()[1:-1]).strip(),
        "date": parse(
            response.xpath('//meta[@property="article:published_time"]/@content')
            .extract()[0].strip(),
            fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S"),
        "description": remove_unicode(
            response.xpath('//meta[@property="og:description"]/@content').extract()[0].strip()),
        "content": self._get_content(response),
        "url": response.url,
    }
    yield NewsItem(**item_data)

def parse(self, response):
    """Build a NewsItem from a page that exposes Dublin Core (dc.*) meta tags."""
    item_data = {
        "title": remove_unicode(
            response.xpath('//meta[@name="dc.title"]/@content').extract()[0].strip()),
        "author": " ".join(
            response.xpath('//*[@class="article-source"]//text()').extract()).strip(),
        "date": parse(
            response.xpath('//meta[@name="dc.date"]/@content').extract()[0],
            fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S"),
        "description": remove_unicode(
            response.xpath('//meta[@name="dc.description"]/@content').extract()[0].strip()),
        "content": remove_unicode(
            ' '.join(response.xpath('//*[@class="article-body"]/p//text()').extract()).strip()),
        "url": response.url,
    }
    yield NewsItem(**item_data)

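# The parse callbacks in this excerpt are assumed to be methods of scrapy.Spider
# subclasses. A minimal sketch of the module-level imports they rely on follows;
# NewsItem and remove_unicode are project-local names not shown here, so their
# import paths are assumptions:
#
#   from dateutil.parser import parse
#   from myproject.items import NewsItem          # hypothetical path
#   from myproject.utils import remove_unicode    # hypothetical path
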
def _get_content(self, response, string="SUBSCRIBE"):
    """Join the article paragraph text, truncating at the first standalone
    occurrence of `string` (e.g. a "SUBSCRIBE" call-to-action)."""
    ps = response.xpath('//article//p//*[not(self::script)]//text()').extract()
    if string in ps:
        ps = ps[:ps.index(string)]
    ps = map(lambda s: s.strip(), ps)
    return remove_unicode(" ".join(ps).strip())

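# remove_unicode itself is not part of this excerpt. A minimal sketch of the
# behaviour the snippets above assume (dropping non-ASCII characters) could look
# like the following; the project's real implementation may differ.
def remove_unicode(text):
    # Encode to ASCII, silently discarding anything outside that range.
    return text.encode('ascii', 'ignore').decode('ascii')
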
def process(self, tweet):
    """Strip URLs, HTML entities and unicode from a tweet and build its byline."""
    removed_url = re.sub(r'http\S+', '', tweet.text)
    processed_text = html.unescape(removed_url)
    text = utils.remove_unicode(processed_text)
    tweet_user = tweet.user.screen_name
    tweet_time = self.process_tweet_time(tweet.created_at)
    return "Posted by @{} on {}:".format(tweet_user, tweet_time), text

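# Hypothetical usage sketch: process() expects an object shaped like a tweepy
# Status (with .text, .user.screen_name and .created_at) plus a companion
# process_tweet_time() helper on the same class; the names below are
# assumptions, not part of this excerpt.
#
#   byline, text = processor.process(status)
#   print(byline, text)   # "Posted by @user on <time>:" followed by the cleaned text
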
def parse(self, response):
    """Build a NewsItem from an article-feed page; skip pages that are not articles."""
    item = NewsItem()
    page_type = response.xpath('//meta[@property="og:type"]//@content').extract_first()
    if page_type is None or "article" not in page_type:
        return
    item['url'] = response.url
    item['date'] = parse(
        response.xpath('//*[@id="article-feed"]/article[1]//span[@class="timestamp"]').extract()[0],
        fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")
    try:
        item['author'] = " ".join(
            response.xpath('//*[@id="article-feed"]/article[1]//div[@class="author"]//text()')
            .extract()).strip()
    except IndexError:
        item['author'] = ''
    item['title'] = response.xpath('//meta[@property="og:title"]//@content').extract()[0].strip()
    item['description'] = response.xpath(
        '//meta[@property="og:description"]//@content').extract_first().rstrip()
    item['content'] = remove_unicode(' '.join(response.xpath(
        '//*[@id="article-feed"]/article[1]//*[@class="article-body"]'
        '//*[@itemprop="articleBody"]//text()').extract()).rstrip())
    yield item

def parse(self, response):
    """Build a NewsItem from a BBC responsive-news page; English articles only."""
    item = NewsItem()
    lang = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:locale"]//@content'
    ).extract_first()
    page_type = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:type"]//@content'
    ).extract_first()
    if lang is None or "en" not in lang or page_type is None or "article" not in page_type:
        return
    item['url'] = response.url
    try:
        item['date'] = datetime.utcfromtimestamp(float(
            response.xpath(
                '//div[@class="story-body"]//div[contains(@class,"date date--v2")]//@data-seconds'
            ).extract_first())).strftime("%Y-%m-%dT%H:%M:%S")
    except TypeError:
        item['date'] = ''
    try:
        _author = response.xpath(
            '//*//span[@class="byline__name"]//text()').extract_first()
        if _author is None:
            item['author'] = 'BBC News'
        else:
            # Drop a leading "By" from the byline, if present.
            _author_split = _author.split(" ")
            if _author_split[0] == "By":
                _author = " ".join(_author_split[1:])
            item['author'] = _author + " | BBC News"
        # " ".join(
        #     response.xpath('//*[@id="responsive-news"]//meta[@property="article:author"]//@content')
        #     .extract()[0]).strip()
        # returns https://www.facebook.com/bbcnews, so the byline span is used instead
    except IndexError:
        item['author'] = 'BBC News'
    item['title'] = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:title"]//@content'
    ).extract_first().strip()
    item['description'] = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:description"]//@content'
    ).extract_first().rstrip()
    item['content'] = remove_unicode(' '.join(
        response.xpath(
            '//div[@class="story-body"]//div[@property="articleBody"]//p//text()'
        ).extract()).rstrip())
    yield item

def read_data(filename: str, data_header: list, hmap: dict, all_eps=True) -> dict:
    """
    - Assumes that if a "FULL NAME" column exists, all rows will have a format
      of 'LastName, FirstName'.
    - Sometimes the header may have unicode (special) characters, so it is
      cleaned before use.
    - hmap is a map of the header fields with official MDS translations, where
      cleansing was required.
    - all_eps == False => closed episodes only.
    """
    # data_header = fix_headers(data_header)
    with open(filename, 'r') as csvfile:
        # Skip the raw header row; field names are supplied explicitly below.
        csvfile.readline()
        reader = csv.DictReader(csvfile, data_header)
        if MDS['FNAME'] not in data_header and "FULL NAME" in data_header:
            rows = _split_fullname(reader)
            reader = rows
            data_header.remove("FULL NAME")
            data_header.extend([MDS['FNAME'], MDS['LNAME']])
        clean_headers = {dh: remove_unicode(dh) for dh in data_header}
        # [ch for ch in clean_headers.values() if ch in data_header] == data_header  # True
        tmp_k = None
        result = []
        ii = 0
        for i, row in enumerate(reader):
            if "".join(row.values()) == '':
                logger.error(
                    f"\n\tFound blank row at {i}. Skipping to next row...")
                continue
            if not all_eps and not row[MDS['END_DATE']]:
                continue
            result.append({})
            for k, v in row.items():
                tmp_k = clean_headers[k]
                # if tmp_k in hmap:
                #     result[i][hmap[tmp_k]] = v
                # else:
                result[ii][tmp_k] = v
            ii = ii + 1
        # result = [{k: v for k, v in row.items()} for row in reader if hmap[k]]
    return {"episodes": result}

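# Hypothetical call sketch for read_data (the file name, header list and empty
# hmap are assumptions; MDS is the module-level mapping of official MDS column
# names referenced above):
#
#   header = ["FULL NAME", MDS['END_DATE']]
#   data = read_data("episodes.csv", header, hmap={}, all_eps=False)
#   print(len(data["episodes"]))
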
def _text_to_df(text_file):
    """
    Convert a raw E-Prime output text file into a pandas DataFrame.
    """
    # Load the text file as a list.
    with open(text_file, 'rb') as fo:
        text_data = list(fo)

    # Remove unicode characters.
    filtered_data = [
        remove_unicode(row.decode('utf-8', 'ignore')) for row in text_data
    ]

    # Determine where rows begin and end.
    start_index = [
        i for i, row in enumerate(filtered_data)
        if row == '*** LogFrame Start ***'
    ]
    end_index = [
        i for i, row in enumerate(filtered_data)
        if row == '*** LogFrame End ***'
    ]
    if len(start_index) != len(end_index) or start_index[0] >= end_index[0]:
        print('Warning: LogFrame Starts and Ends do not match up.')
    n_rows = min(len(start_index), len(end_index))

    # Find column headers and remove duplicates.
    headers = []
    data_by_rows = []
    for i in range(n_rows):
        one_row = filtered_data[start_index[i] + 1:end_index[i]]
        data_by_rows.append(one_row)
        for col_val in one_row:
            split_header_idx = col_val.index(':')
            headers.append(col_val[:split_header_idx])
    headers = list(OrderedDict.fromkeys(headers))

    # Preallocate list of lists composed of NULLs.
    data_matrix = np.empty((n_rows, len(headers)), dtype=object)
    data_matrix[:] = np.nan

    # Fill list of lists with relevant data from data_by_rows and headers.
    for i in range(n_rows):
        for cell_data in data_by_rows[i]:
            split_header_idx = cell_data.index(':')
            for k_header, header in enumerate(headers):
                if cell_data[:split_header_idx] == header:
                    data_matrix[i, k_header] = \
                        cell_data[split_header_idx + 1:].lstrip()

    df = pd.DataFrame(columns=headers, data=data_matrix)

    # Columns with one value at the beginning, the end, or end - 1 should be
    # filled with that value.
    for col in df.columns:
        non_nan_idx = np.where(df[col].values == df[col].values)[0]
        if len(non_nan_idx) == 1 and non_nan_idx[0] in [
                0, df.shape[0] - 1, df.shape[0] - 2
        ]:
            df.loc[:, col] = df.loc[non_nan_idx[0], col]

    return df
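
# Hypothetical usage sketch (the file name is an assumption): convert one
# E-Prime log to a DataFrame and persist it as CSV.
#
#   df = _text_to_df('subject-01-task.txt')
#   df.to_csv('subject-01-task.csv', index=False)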