def test_case_sensitive(self):
    """A case-sensitive model loads its exact tag and rejects other casings.

    The exact tag must load and serialise back to ``<CaseSensitive />``;
    the same name written with different casing must raise ``DataError``.
    """
    exact_tag = parse('<CaseSensitive />')
    wrong_case_tag = parse('<caSesensiTive />')

    # Should succeed
    case_sensitive_good = CaseSensitive(raw_value=exact_tag)

    # The rejection has to be asserted on CaseSensitive itself. The previous
    # version instantiated CaseInsensitive here, which never exercised the
    # case handling of the model this test is named after.
    with self.assertRaises(DataError):
        CaseSensitive(raw_value=wrong_case_tag)

    xml = unparse(case_sensitive_good.to_primitive())
    self.assertEqual(xml, '<CaseSensitive />')
def test_case_insensitive(self):
    """Exact and mixed-case tags both load and serialise to the canonical tag."""
    # Both spellings are expected to be accepted by the model.
    canonical = CaseInsensitive(raw_value=parse('<CaseInsensitive />'))
    mixed_case = CaseInsensitive(raw_value=parse('<caSeinsensiTive />'))

    # Whatever casing came in, the canonical tag name goes out.
    for model in (canonical, mixed_case):
        self.assertEqual(unparse(model.to_primitive()), '<CaseInsensitive />')
def get_news(query, days=1):
    """Fetch Google News business headlines for a query and score their polarity.

    Args:
        query: Search term interpolated into the Google News RSS URL.
        days: Look-back window, used as the ``when:<days>d`` search operator.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date`` (parsed to
        ``datetime``), ``Title``, ``Link`` and ``Polarity`` (TextBlob polarity
        rounded to two decimals). If the decoded feed lacks one of the
        expected keys, the ``KeyError`` instance is *returned* rather than
        raised; that contract is kept for existing callers.
    """
    link = "https://news.google.com/news/rss/headlines/section/topic/BUSINESS/search?q={}+when:{}d".format(
        query, days)

    # Fetch the feed exactly once; the previous version issued the same
    # request twice and threw the first response away.
    response = requests.get(link)
    res = json.loads(xmltojson.parse(response.text))

    try:
        items = res['rss']["channel"]['item']
        # NOTE(review): with xmltodict-style decoding a feed holding a single
        # <item> yields one mapping instead of a list; wrap it so the loop
        # below sees items rather than the mapping's keys.
        if isinstance(items, dict):
            items = [items]
        headlines = [
            {
                'Date': item['pubDate'],
                'Title': item['title'],
                'Link': item["link"],
            }
            for item in items
        ]
    except KeyError as e:
        return e

    news = pd.DataFrame(headlines)

    # Only polarity ends up in the result. Subjectivity used to be computed
    # for every headline and then discarded, so that work is dropped.
    news["Polarity"] = [
        round(TextBlob(title).sentiment.polarity, 2) for title in news["Title"]
    ]

    def parse_date(raw):
        # Keep the part after the weekday and trim the last four characters
        # (presumably a " GMT" suffix) before parsing.
        return datetime.datetime.strptime(
            raw.split(",")[1][1:-4], '%d %b %Y %H:%M:%S')

    news['Date'] = news["Date"].apply(parse_date)
    return news
def test_nested_child_list(self):
    """Only ``Bar`` children of ``Bars`` are collected, in document order."""
    document = ('<Foo2>'
                ' <Bars>'
                ' <Bar>Item1</Bar>'
                ' <Bar field1="2">Item2</Bar>'
                ' <Baz>Non-item</Baz>'
                ' <Bar>Item3</Bar>'
                ' </Bars>'
                '</Foo2>')
    input_json = parse(document)
    print(input_json)

    foo2 = Foo2(raw_value=input_json)
    self.assertEqual(len(foo2.bars), 3)

    # (field1, content) expected for each collected Bar; the Baz is skipped.
    expected_bars = ((None, 'Item1'), (2, 'Item2'), (None, 'Item3'))
    for index, (field1, content) in enumerate(expected_bars):
        if field1 is None:
            self.assertIsNone(foo2.bars[index].field1)
        else:
            self.assertEqual(foo2.bars[index].field1, field1)
        self.assertEqual(foo2.bars[index].content, content)

    # Serialising drops the non-Bar child and the whitespace.
    self.assertEqual(
        unparse(foo2.to_primitive()),
        '<Foo2><Bars><Bar>Item1</Bar><Bar field1="2">Item2</Bar><Bar>Item3</Bar></Bars></Foo2>'
    )
def test_inheritance(self):
    """Each child element is loaded as the model class matching its tag."""
    document = ('<Foo3>'
                ' <Child1 />'
                ' <Child2 field2="hi" />'
                ' <Child1 field1="3" />'
                ' <Parent />'
                '</Foo3>')
    input_json = parse(document)
    print(input_json)

    foo3 = Foo3(raw_value=input_json)
    self.assertEqual(len(foo3.children), 4)

    # Classes expected at each position, in document order.
    expected_types = (Child1, Child2, Child1, Parent)
    for index, expected_type in enumerate(expected_types):
        self.assertIsInstance(foo3.children[index], expected_type)

    # Attributes are only set on the children that declare them.
    self.assertIsNone(foo3.children[0].field1)
    self.assertEqual(foo3.children[1].field2, 'hi')
    self.assertEqual(foo3.children[2].field1, 3)

    self.assertEqual(
        unparse(foo3.to_primitive()),
        '<Foo3><Child1 /><Child2 field2="hi" /><Child1 field1="3" /><Parent /></Foo3>'
    )
def test_all(self):
    """Attributes, text and repeated children survive parse and unparse together."""
    document = '<a foo="2"><b bar="4" /><c>Hello</c><b /></a>'
    # One tree serves as the expected parse result and as the unparse input.
    tree = {
        'tag': 'a',
        'attrib': {
            'foo': '2',
        },
        'text': None,
        'children': [
            {'tag': 'b', 'attrib': {'bar': '4'}, 'text': None, 'children': []},
            {'tag': 'c', 'attrib': {}, 'text': 'Hello', 'children': []},
            {'tag': 'b', 'attrib': {}, 'text': None, 'children': []},
        ],
    }

    self.assertEqual(parse(document), tree)
    self.assertEqual(unparse(tree), document)
def crawl(year, start_num):
    '''Crawl the PubMed records published in ``year`` and save them in batches.

    Runs an Entrez history search for the year, then downloads the results
    50 records at a time, converts each XML batch to a dict and hands it to
    ``save_json``.

    Args:
        year: publication year used for both ends of the date range
        start_num: the starting index of crawling data, i.e. how many
            records have already been crawled

    Raises:
        urllib.error.HTTPError: if a batch still cannot be fetched after
            the last retry.
    '''
    Entrez.email = "*****@*****.**"
    time.sleep(1)
    handle = Entrez.esearch(
        db="pubmed",
        term='({}[Date - Publication] : {}[Date - Publication])'.format(
            year, year),
        usehistory='y')
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    total_num = int(record['Count'])
    webenv = record['WebEnv']
    query_key = record["QueryKey"]
    batch_size = 50
    max_attempts = 3

    print(
        'total num of crawling data: {}, finish crawled: {}, left crawled: {}'.
        format(total_num, start_num, total_num - start_num))
    if (total_num - start_num) != 0:
        print('>>> crawling data in {} year'.format(year))

    for start in range(start_num, total_num, batch_size):
        time.sleep(1)
        end = min(total_num, start + batch_size)
        print('total records: {}, downloading records {} to {} '.format(
            total_num, start + 1, end))

        # Retry the fetch after a pause. The previous version slept and then
        # fell through to read a handle that was unbound (first batch) or
        # already closed (later batches).
        for attempt in range(1, max_attempts + 1):
            try:
                fetch_handle = Entrez.efetch(db='pubmed',
                                             retmode='xml',
                                             retstart=start,
                                             retmax=batch_size,
                                             webenv=webenv,
                                             query_key=query_key)
                break
            except urllib.error.HTTPError as err:
                # Prints e.g. " HTTP Error 400: Bad Request", but reports
                # the status that actually occurred.
                print(' {}'.format(err))
                if attempt == max_attempts:
                    raise
                time.sleep(60)

        try:
            xml = fetch_handle.read()
        finally:
            # Close the handle even if reading fails.
            fetch_handle.close()

        parse_json = xmltojson.parse(xml)
        data_dict = json.loads(parse_json)
        save_json(data_dict, year, start + 1, end)
def x2j(c):
    """Convert the XML document ``c`` into a Python object via JSON.

    Args:
        c: XML input handed to ``xmltojson.parse``.

    Returns:
        The object decoded from the generated JSON, or ``None`` when the
        conversion fails (best effort, as before).
    """
    # Imported locally because the file-level import block is not part of
    # this definition.
    import logging

    try:
        return json.loads(xmltojson.parse(c))
    except Exception:
        # Stay best-effort, but leave a trace instead of discarding the
        # error. Debug level keeps the default output unchanged.
        logging.getLogger(__name__).debug(
            "XML to JSON conversion failed", exc_info=True)
        return None
def test_children(self):
    """Child elements keep their order, including repeated tags."""
    document = '<a><b /><c /><b /></a>'
    # One tree serves as the expected parse result and as the unparse input.
    tree = {
        'tag': 'a',
        'attrib': {},
        'text': None,
        'children': [
            {'tag': 'b', 'attrib': {}, 'text': None, 'children': []},
            {'tag': 'c', 'attrib': {}, 'text': None, 'children': []},
            {'tag': 'b', 'attrib': {}, 'text': None, 'children': []},
        ],
    }

    self.assertEqual(parse(document), tree)
    self.assertEqual(unparse(tree), document)
def test_minimal(self):
    """A lone empty element maps to a node with no attributes, text or children."""
    document = '<a />'
    node = {
        'tag': 'a',
        'attrib': {},
        'text': None,
        'children': [],
    }

    self.assertEqual(parse(document), node)
    self.assertEqual(unparse(node), document)
def test_text(self):
    """Text containing markup characters round-trips through parse and unparse.

    The XML literals carry the angle brackets as ``&lt;``/``&gt;`` entities.
    Written raw, ``<name>`` would be read as an unclosed child element, so
    the document would not be well-formed. The node's ``text`` holds the
    unescaped string.
    """
    document = '<a>Hello, &lt;name&gt;!</a>'
    node = {
        'tag': 'a',
        'attrib': {},
        'text': 'Hello, <name>!',
        'children': [],
    }

    self.assertEqual(parse(document), node)
    # NOTE(review): assumes unparse escapes both "<" and ">" in text, as the
    # standard ElementTree serialiser does; confirm against unparse.
    self.assertEqual(unparse(node), document)
def test_attrib(self):
    """A single attribute is kept as a string in the ``attrib`` mapping."""
    document = '<a foo="1" />'
    node = {
        'tag': 'a',
        'attrib': {
            'foo': '1'
        },
        'text': None,
        'children': [],
    }

    self.assertEqual(parse(document), node)
    self.assertEqual(unparse(node), document)
def test_model(self):
    """Attributes, text content and child elements populate the ``Foo`` model."""
    document = ('<Foo field1="23" field2="B">'
                ' Some content'
                ' <Bar>Hello!</Bar>'
                ' <Baz>Item1</Baz>'
                ' <Baz>Item2</Baz>'
                ' <Baz>Item3</Baz>'
                ' <Check/>'
                ' <BarNull1/>'
                '</Foo>')
    input_json = parse(document)
    print(input_json)

    foo = Foo(raw_value=input_json)

    # Expected value for each populated field, checked in this order.
    expected_fields = (
        ('field1', 23),
        ('field2', 'B'),
        ('bar', 'Hello!'),
        ('baz', ['Item1', 'Item2', 'Item3']),
        ('content', 'Some content'),
        ('check', True),
        ('bar_null1', 0),
    )
    for attribute, expected in expected_fields:
        self.assertEqual(getattr(foo, attribute), expected)

    # No <BarNull2/> element is present, so this field stays unset.
    self.assertIsNone(foo.bar_null2)
#!/usr/bin/python3
"""Parse markup with lxml's HTML parser and print it as a JSON document."""
from lxml import html, etree
import xmltojson
import json
import sys

# The markup comes from the first command-line argument when one is given,
# otherwise from standard input.
# NOTE(review): the argument is used as the markup itself, not as a path to
# read; confirm that this is the intended usage.
markup = sys.argv[1] if len(sys.argv) > 1 else sys.stdin.read()

# Let the HTML parser build a tree from the input, then serialise that tree
# again so the XML-to-JSON converter receives it.
serialized = etree.tostring(html.fromstring(markup))

# Decode and re-encode the converter's JSON so the output uses json.dumps'
# default formatting.
print(json.dumps(json.loads(xmltojson.parse(serialized))))