def test_case_sensitive(self):
        """A case-sensitive model loads the exactly-cased tag and rejects other casings."""
        input_json_good = parse('<CaseSensitive />')
        input_json_bad = parse('<caSesensiTive />')

        # Should succeed
        case_sensitive_good = CaseSensitive(raw_value=input_json_good)

        # Feed the wrongly-cased tag to CaseSensitive itself: rejecting it is
        # the behaviour this test is named for.
        with self.assertRaises(DataError):
            CaseSensitive(raw_value=input_json_bad)

        xml = unparse(case_sensitive_good.to_primitive())

        self.assertEqual(xml, '<CaseSensitive />')
    def test_case_insensitive(self):
        """Exact and mixed-case tags both load and serialise to '<CaseInsensitive />'."""
        exact_tree = parse('<CaseInsensitive />')
        mixed_tree = parse('<caSeinsensiTive />')

        # Neither construction is expected to raise.
        from_exact = CaseInsensitive(raw_value=exact_tree)
        from_mixed = CaseInsensitive(raw_value=mixed_tree)

        # Whatever casing came in, the same tag name is written back out.
        self.assertEqual(
            unparse(from_exact.to_primitive()), '<CaseInsensitive />')
        self.assertEqual(
            unparse(from_mixed.to_primitive()), '<CaseInsensitive />')
def get_news(query, days=1):
    """Fetch Google News business headlines for *query* and score their polarity.

    Args:
        query: Search term, inserted verbatim into the RSS search URL.
        days: Look-back window in days (the ``when:<days>d`` part of the URL).

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date`` (parsed into
        ``datetime`` objects), ``Title``, ``Link`` and ``Polarity`` (TextBlob
        polarity rounded to two decimals).  If the decoded feed lacks one of
        the expected keys, the caught ``KeyError`` instance is *returned*
        instead of raised, so callers must check the result type.
    """
    link = "https://news.google.com/news/rss/headlines/section/topic/BUSINESS/search?q={}+when:{}d".format(
        query, days)

    # Download the feed once and reuse the response body.
    response = requests.get(link)
    res = json.loads(xmltojson.parse(response.text))

    try:
        items = res['rss']["channel"]['item']
        # NOTE(review): xmltodict-style converters are expected to decode a
        # lone <item> as one mapping rather than a one-element list; wrap it
        # so the loop below never iterates over that mapping's keys.
        if isinstance(items, dict):
            items = [items]
        headlines = [
            {
                'Date': item['pubDate'],
                'Title': item['title'],
                'Link': item["link"],
            }
            for item in items
        ]
    except KeyError as e:
        return e

    # Naming the columns keeps them present even when no headline was found.
    news = pd.DataFrame(headlines, columns=['Date', 'Title', 'Link'])

    news["Polarity"] = [
        round(TextBlob(title).sentiment.polarity, 2) for title in news["Title"]
    ]

    def parse_pub_date(raw):
        # Keep what follows the first comma, minus the leading space and the
        # last four characters, e.g.
        # 'Mon, 06 Sep 2021 07:00:00 GMT' -> '06 Sep 2021 07:00:00'.
        return datetime.datetime.strptime(
            raw.split(",")[1][1:-4], '%d %b %Y %H:%M:%S')

    news['Date'] = news["Date"].apply(parse_pub_date)
    return news
    def test_nested_child_list(self):
        """The three <Bar> children under <Bars> are loaded; <Baz> is not among them."""
        markup = ''.join([
            '<Foo2>',
            '   <Bars>',
            '       <Bar>Item1</Bar>',
            '       <Bar field1="2">Item2</Bar>',
            '       <Baz>Non-item</Baz>',
            '       <Bar>Item3</Bar>',
            '   </Bars>',
            '</Foo2>',
        ])
        input_json = parse(markup)

        print(input_json)

        foo2 = Foo2(raw_value=input_json)

        # (field1, content) expected for each collected <Bar>, in order.
        expected_bars = [(None, 'Item1'), (2, 'Item2'), (None, 'Item3')]

        self.assertEqual(len(foo2.bars), 3)
        for index, (field1, content) in enumerate(expected_bars):
            if field1 is None:
                self.assertIsNone(foo2.bars[index].field1)
            else:
                self.assertEqual(foo2.bars[index].field1, field1)
            self.assertEqual(foo2.bars[index].content, content)

        # Serialising the model back must omit the <Baz> element.
        serialised = unparse(foo2.to_primitive())

        self.assertEqual(
            serialised,
            '<Foo2><Bars><Bar>Item1</Bar><Bar field1="2">Item2</Bar><Bar>Item3</Bar></Bars></Foo2>'
        )
    def test_inheritance(self):
        """Each child element is loaded as the class matching its tag and round-trips."""
        markup = ''.join([
            '<Foo3>',
            '   <Child1 />',
            '   <Child2 field2="hi" />',
            '   <Child1 field1="3" />',
            '   <Parent />',
            '</Foo3>',
        ])
        input_json = parse(markup)

        print(input_json)

        foo3 = Foo3(raw_value=input_json)

        # Expected class of every child, in document order.
        expected_types = [Child1, Child2, Child1, Parent]

        self.assertEqual(len(foo3.children), 4)
        for index, expected_type in enumerate(expected_types):
            self.assertIsInstance(foo3.children[index], expected_type)

        # Attribute values: absent on the first child, set on the next two.
        self.assertIsNone(foo3.children[0].field1)
        self.assertEqual(foo3.children[1].field2, 'hi')
        self.assertEqual(foo3.children[2].field1, 3)

        serialised = unparse(foo3.to_primitive())

        self.assertEqual(
            serialised,
            '<Foo3><Child1 /><Child2 field2="hi" /><Child1 field1="3" /><Parent /></Foo3>'
        )
示例#6
0
    def test_all(self):
        """Check parse() and unparse() on an element with attributes, text and children."""
        markup = '<a foo="2"><b bar="4" /><c>Hello</c><b /></a>'

        # Dict form of the markup above, children listed in document order.
        tree = {
            'tag': 'a',
            'attrib': {'foo': '2'},
            'text': None,
            'children': [
                {'tag': 'b', 'attrib': {'bar': '4'}, 'text': None, 'children': []},
                {'tag': 'c', 'attrib': {}, 'text': 'Hello', 'children': []},
                {'tag': 'b', 'attrib': {}, 'text': None, 'children': []},
            ],
        }

        # XML text -> dict form.
        self.assertEqual(parse(markup), tree)

        # Dict form -> XML text.
        self.assertEqual(unparse(tree), markup)
示例#7
0
def crawl(year, start_num):
    '''
    Crawl the PubMed records published in ``year`` and save them batch by batch.

    The search is run once with ``usehistory='y'``; the returned WebEnv and
    QueryKey are then used to fetch the result set in batches of 50, each
    batch being converted from XML to a dict and handed to ``save_json``.

    Args:
        year: publication year, used as both ends of the date-range query
        start_num: the starting index of crawling data (records before it
            are treated as already downloaded)

    Raises:
        urllib.error.HTTPError: if a batch still fails after all attempts
    '''

    Entrez.email = "*****@*****.**"
    time.sleep(1)
    handle = Entrez.esearch(
        db="pubmed",
        term='({}[Date - Publication] : {}[Date - Publication])'.format(
            year, year),
        usehistory='y')

    record = Entrez.read(handle)
    total_num = int(record['Count'])
    webenv = record['WebEnv']
    query_key = record["QueryKey"]

    batch_size = 50
    max_attempts = 3  # tries per batch before the HTTP error is re-raised
    retry_delay = 60  # seconds to wait after a failed request

    print(
        'total num of crawling data: {}, finish crawled: {}, left crawled: {}'.
        format(total_num, start_num, total_num - start_num))
    if (total_num - start_num) != 0:

        print('>>> crawling data in {} year'.format(year))
        for start in range(start_num, total_num, batch_size):

            time.sleep(1)
            end = min(total_num, start + batch_size)
            print('total records: {}, downloading records {} to {} '.format(
                total_num, start + 1, end))

            # Retry the request itself after an HTTP error.  Without a fresh
            # handle the read below would hit an unbound name on the first
            # batch, or the previous batch's closed handle on later ones.
            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db='pubmed',
                                                 retmode='xml',
                                                 retstart=start,
                                                 retmax=batch_size,
                                                 webenv=webenv,
                                                 query_key=query_key)
                    break
                except urllib.error.HTTPError as err:
                    print(' HTTP Error {}: {}'.format(err.code, err.reason))
                    if attempt == max_attempts:
                        raise
                    time.sleep(retry_delay)

            # Close the handle even when reading fails.
            try:
                xml = fetch_handle.read()
            finally:
                fetch_handle.close()

            parse_json = xmltojson.parse(xml)
            data_dict = json.loads(parse_json)

            save_json(data_dict, year, start + 1, end)
def x2j(c):
    """Convert the XML input ``c`` to Python data by way of its JSON form.

    Returns the decoded JSON value.  Any ``Exception`` raised during the
    conversion is swallowed and ``None`` is returned instead.
    """
    try:
        json_text = xmltojson.parse(c)
        return json.loads(json_text)
    except Exception:
        # Best-effort conversion: failures are reported only as None.
        return None
示例#9
0
    def test_children(self):
        """Check parse() and unparse() on an element with three empty children."""
        markup = '<a><b /><c /><b /></a>'

        def leaf(tag):
            # An element with no attributes, no text and no children.
            return {'tag': tag, 'attrib': {}, 'text': None, 'children': []}

        tree = {
            'tag': 'a',
            'attrib': {},
            'text': None,
            'children': [leaf('b'), leaf('c'), leaf('b')],
        }

        # XML text -> dict form.
        self.assertEqual(parse(markup), tree)

        # Dict form -> XML text.
        self.assertEqual(unparse(tree), markup)
示例#10
0
    def test_minimal(self):
        """Check parse() and unparse() on a single empty element."""
        markup = '<a />'
        tree = {'tag': 'a', 'attrib': {}, 'text': None, 'children': []}

        # XML text -> dict form.
        self.assertEqual(parse(markup), tree)

        # Dict form -> XML text.
        self.assertEqual(unparse(tree), markup)
示例#11
0
    def test_text(self):
        """Text is unescaped by parse() and escaped again by unparse()."""
        markup = '<a>Hello, &lt;name&gt;!</a>'
        tree = {
            'tag': 'a',
            'attrib': {},
            'text': 'Hello, <name>!',
            'children': [],
        }

        # Entities in the markup become literal '<' and '>' in the dict.
        self.assertEqual(parse(markup), tree)

        # Literal '<' and '>' in the dict become entities in the markup.
        self.assertEqual(unparse(tree), markup)
示例#12
0
    def test_attrib(self):
        """Check parse() and unparse() on an element with one attribute."""
        markup = '<a foo="1" />'
        tree = {
            'tag': 'a',
            'attrib': {'foo': '1'},
            'text': None,
            'children': [],
        }

        # XML text -> dict form; the attribute value stays a string.
        self.assertEqual(parse(markup), tree)

        # Dict form -> XML text.
        self.assertEqual(unparse(tree), markup)
    def test_model(self):
        """Load a <Foo> element with attributes, text and children into the Foo model."""
        markup = ''.join([
            '<Foo field1="23" field2="B">',
            '   Some content',
            '   <Bar>Hello!</Bar>',
            '   <Baz>Item1</Baz>',
            '   <Baz>Item2</Baz>',
            '   <Baz>Item3</Baz>',
            '   <Check/>',
            '   <BarNull1/>',
            '</Foo>',
        ])
        input_json = parse(markup)

        print(input_json)

        foo = Foo(raw_value=input_json)

        # Model attribute -> value expected after loading, checked in order.
        expected_values = [
            ('field1', 23),
            ('field2', 'B'),
            ('bar', 'Hello!'),
            ('baz', ['Item1', 'Item2', 'Item3']),
            ('content', 'Some content'),
            ('check', True),
            ('bar_null1', 0),
        ]
        for attribute, value in expected_values:
            self.assertEqual(getattr(foo, attribute), value)

        # There is no element for bar_null2 in the input.
        self.assertIsNone(foo.bar_null2)
示例#14
0
#!/usr/bin/python3

from lxml import html, etree
import xmltojson
import json
import sys


# Input comes from the first command-line argument when one is given,
# otherwise from standard input.
if len(sys.argv) > 1:
    # html.fromstring() parses markup rather than opening paths, so an
    # argument that names a readable file is replaced by that file's bytes.
    # An argument that cannot be opened is still used as literal markup.
    try:
        with open(sys.argv[1], 'rb') as source:
            file = source.read()
    except OSError:
        file = sys.argv[1]

else:
    file = sys.stdin.read()


# Parse the input as HTML and serialise the resulting tree.
xml_raw = html.fromstring(file)
xml = etree.tostring(xml_raw)


# Decode the string returned by xmltojson.parse() as JSON.
js = json.loads(xmltojson.parse(xml))


# Emit the result as compact JSON on standard output.
print(json.dumps(js))