Exemplo n.º 1
0
    def test_list_documents(self, mocked_get):
        """Listing documents should expose every field of the API response on
        the returned objects and bind each of them to the PyPLN session."""
        expected_documents = [self.example_document_1,
                              self.example_document_2]
        response = mocked_get.return_value
        response.status_code = 200
        response.json.return_value = {u'count': 2,
                                      u'next': None,
                                      u'previous': None,
                                      u'results': expected_documents}

        client = PyPLN(self.base_url, (self.user, self.password))
        documents = client.documents()

        mocked_get.assert_called_with(self.base_url + "/documents/")

        retrieved_documents = [documents[0], documents[1]]

        for expected, retrieved in zip(expected_documents,
                                       retrieved_documents):
            for key, value in expected.items():
                # `properties` is a method on the `Document` class, so the
                # raw value is compared against `properties_url` instead.
                attribute = 'properties_url' if key == 'properties' else key
                self.assertEqual(value, getattr(retrieved, attribute))

        # Document objects should link `session` object from PyPLN
        for retrieved in retrieved_documents:
            self.assertIs(retrieved.session, client.session)
Exemplo n.º 2
0
    def test_create_corpus(self, mocked_post):
        """Creating a corpus should POST the corpus data to `/corpora/` and
        return an object mirroring the API response."""
        response = mocked_post.return_value
        response.status_code = 201
        response.json.return_value = self.example_corpus

        client = PyPLN(self.base_url, (self.user, self.password))
        corpus = client.add_corpus(**self.corpus_data)

        expected_url = self.base_url + "/corpora/"
        mocked_post.assert_called_with(expected_url, data=self.corpus_data)

        for attribute, expected in self.example_corpus.items():
            self.assertEqual(expected, getattr(corpus, attribute))

        # Corpus objects should link `session` object from PyPLN
        self.assertIs(corpus.session, client.session)
Exemplo n.º 3
0
    def test_list_corpora(self, mocked_get):
        """Listing corpora should GET `/corpora/` and return objects that
        mirror the entries of the API response."""
        response = mocked_get.return_value
        response.status_code = 200
        response.json.return_value = {u'count': 1,
                                      u'next': None,
                                      u'previous': None,
                                      u'results': [self.example_corpus]}

        client = PyPLN(self.base_url, (self.user, self.password))
        corpora = client.corpora()

        expected_url = self.base_url + "/corpora/"
        mocked_get.assert_called_with(expected_url)

        for attribute, expected in self.example_corpus.items():
            self.assertEqual(expected, getattr(corpora[0], attribute))

        # Corpus objects should link `session` object from PyPLN
        self.assertIs(corpora[0].session, client.session)
Exemplo n.º 4
0
def main():
    """Upload pages stored in a MongoDB collection to a PyPLN corpus.

    Command-line arguments select the MongoDB collection
    (``host:port/db/collection``), the PyPLN installation and credentials,
    and the target corpus. The corpus is looked up by case-insensitive name
    and created when no match is found. Documents whose ``uploaded`` field is
    ``False`` are sent in batches of ``--pages-per-request`` and flagged
    ``uploaded: True`` once the batch has been sent; a progress line is
    rewritten on stdout after every batch.

    Exits with status 1 when the ``mongodb`` argument is malformed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mongodb',
                        help='MongoDB server/db/collection (format: '
                        'host:port/db/collection)')
    parser.add_argument('pypln',
                        help='Main URL to PyPLN installation. Example: '
                        'http://demo.pypln.org/')
    parser.add_argument('username',
                        help='Username to log-in PyPLN installtion')
    parser.add_argument('password',
                        help='Password to log-in PyPLN installtion')
    parser.add_argument('corpus',
                        help='Name of the corpus to upload documents to '
                        "(if doesn't exists, will be created)")
    # Let argparse convert and validate the numeric options so a bad value
    # produces a usage error instead of a ValueError traceback.
    parser.add_argument('--pages-per-request', type=int, default=10,
                        help='Number of pages to send in a single request')
    parser.add_argument('--max-pages', type=int,
                        help='Maximum number of pages to send')
    args = parser.parse_args()
    pages_per_request = args.pages_per_request

    mongo_config = regexp_mongodb.findall(args.mongodb)
    if not mongo_config:
        # Diagnostics belong on stderr; `sys.exit` is used because the bare
        # `exit` helper is only injected by the `site` module.
        sys.stderr.write('Error: "mongodb" should be in format '
                         'host:port/db/collection\n')
        sys.exit(1)

    # Single-argument parenthesised `print` behaves the same on Python 2 and
    # is also valid Python 3 syntax.
    print('Connecting to MongoDB...')
    mongo = dict(zip(('host', 'port', 'db', 'collection'), mongo_config[0]))
    connection = pymongo.Connection(host=mongo['host'],
                                    port=int(mongo['port']),
                                    safe=True)
    db = connection[mongo['db']]
    collection = db[mongo['collection']]

    print('Logging into PyPLN at {}...'.format(args.pypln))
    pypln = PyPLN(args.pypln)
    pypln.login(args.username, args.password)

    print('Selecting (or creating) corpus {}...'.format(args.corpus))
    wanted_name = args.corpus.lower()
    matches = [candidate for candidate in pypln.corpora()
               if candidate.name.lower() == wanted_name]
    if matches:
        corpus = matches[0]
        # fix a bug in pypln.api:
        corpus.url = '{}corpora/{}'.format(args.pypln, corpus.slug)
    else:
        corpus = pypln.add_corpus(name=args.corpus,
                                  description='Portuguese Wikipedia')

    print('Uploading...')
    query_filter = {'uploaded': False}
    total = float(collection.count())
    # Without --max-pages the whole collection is the target.
    max_pages = args.max_pages if args.max_pages is not None else total
    # NOTE(review): `counter` starts at the number of documents already
    # flagged as uploaded, so --max-pages caps the overall uploaded count
    # rather than the pages sent in this run -- confirm this is intended.
    counter = collection.find({'uploaded': True}).count()
    initial_counter = counter
    report = '\r  {:07d} / {:07d} ({:5.2f}%), {:10.3f}s ({:9.3f}p/s). ETA: {}'
    start_time = time.time()
    cursor = collection.find(query_filter, timeout=False)
    try:
        for pages in partition(cursor, pages_per_request):
            temp_files, filenames = [], []
            try:
                for page in pages:
                    temp_file = TemporaryFile()
                    # Register the file first so it is closed even if the
                    # write below fails.
                    temp_files.append(temp_file)
                    temp_file.write(page['text'].encode('utf-8'))
                    temp_file.seek(0)
                    filenames.append(u'{}.txt'.format(page['title']))
                corpus.add_documents(temp_files, filenames)
            finally:
                # Release the file descriptors of this batch right away
                # instead of waiting for garbage collection.
                for temp_file in temp_files:
                    temp_file.close()

            # Only flag the pages once the whole batch has been sent.
            for page in pages:
                collection.update({'_id': page['_id']},
                                  {'$set': {
                                      'uploaded': True
                                  }})

            counter += len(pages)
            percentual = 100 * (counter / total)
            delta_time = time.time() - start_time
            # Guard against a zero elapsed time on coarse clocks.
            if delta_time > 0:
                rate = (counter - initial_counter) / delta_time
            else:
                rate = 0.0
            if rate > 0:
                eta = timedelta(seconds=(max_pages - counter) / rate)
            else:
                eta = 'unknown'
            sys.stdout.write(
                report.format(counter, int(total), percentual, delta_time,
                              rate, eta))
            sys.stdout.flush()

            if max_pages and counter >= max_pages:
                break
        sys.stdout.write('\n')
    finally:
        # The cursor was opened with timeout=False, so it must be closed
        # explicitly even when the upload fails half-way.
        cursor.close()
Exemplo n.º 5
0
 def test_is_sending_pyplnapi_version_as_user_agent(self):
     """The session's User-Agent header should contain
     `pypln.api/<version>`."""
     client = PyPLN(self.base_url, (self.user, self.password))
     expected_fragment = 'pypln.api/{}'.format(__version__)
     user_agent = client.session.headers['User-Agent']
     self.assertIn(expected_fragment, user_agent)
Exemplo n.º 6
0
 def test_raise_an_error_if_auth_is_not_str_or_tuple(self):
     """Passing an `auth` value that is neither a tuple (basic auth) nor a
     string (token auth) should raise `TypeError`."""
     invalid_auth = 1
     self.assertRaises(TypeError, PyPLN, self.base_url, invalid_auth)
Exemplo n.º 7
0
 def test_token_auth_is_correctly_set(self):
     """A string `auth` should end up in the session's `Authorization`
     header as `Token <value>`."""
     token = 'ea92019a4bdf5d1c122c58b53de3e8d36fe9ae6a'
     client = PyPLN(self.base_url, token)
     authorization = client.session.headers['Authorization']
     expected_header = 'Token {}'.format(token)
     self.assertEqual(authorization, expected_header)
Exemplo n.º 8
0
 def test_basic_auth_is_correctly_set(self):
     """A `(user, password)` tuple should be stored as the session's
     `auth`."""
     user_and_password = (self.user, self.password)
     client = PyPLN(self.base_url, user_and_password)
     self.assertEqual(client.session.auth, user_and_password)
Exemplo n.º 9
0
    def test_listing_documents_fails_if_wrong_auth(self, mocked_get):
        """Listing documents should raise `RuntimeError` when the mocked
        GET answers with status 403."""
        mocked_get.return_value.status_code = 403
        client = PyPLN(self.base_url, ('wrong_user', 'my_precious'))

        with self.assertRaises(RuntimeError):
            client.documents()
Exemplo n.º 10
0
 def test_corpus_creation_fails_if_wrong_auth(self, mocked_post):
     """Creating a corpus should raise `RuntimeError` when the mocked POST
     answers with status 403."""
     mocked_post.return_value.status_code = 403
     # Build the client outside the `assertRaises` block so that only the
     # `add_corpus` call can satisfy the expectation; an error raised by the
     # constructor must fail the test instead of passing it.
     pypln = PyPLN(self.base_url, ('wrong_user', 'my_precious'))
     with self.assertRaises(RuntimeError):
         pypln.add_corpus(**self.corpus_data)