def test_read_mongo(mocker):
    """read_mongo forwards the (empty) pipeline to Collection.aggregate."""
    class FakeCollection:
        def aggregate(self, docs):
            return []

    name = 'ACollection'
    database = {name: FakeCollection()}
    spy = mocker.spy(database[name], 'aggregate')

    pdm.read_mongo(name, [], database)

    spy.assert_called_with([])
def test_read_mongo_chunksize(mocker):
    """chunksize is translated into aggregate's batchSize keyword."""
    class FakeCollection:
        def aggregate(self, docs, **kwargs):
            return []

    name = 'ACollection'
    size = 2
    database = {name: FakeCollection()}
    spy = mocker.spy(database[name], 'aggregate')

    pdm.read_mongo(name, [], database, chunksize=size)

    spy.assert_called_with([], batchSize=size)
def test_read_mongo_params(mocker):
    """Entries of ``extra`` are passed through as aggregate keyword args."""
    class CollectionStub:
        def aggregate(self, docs, **kwargs):
            pass

    name = 'ACollection'
    stub = mocker.Mock(CollectionStub)
    stub.aggregate.return_value = []

    pdm.read_mongo(name, [], {name: stub}, extra={'allowDiskUse': True})

    stub.aggregate.assert_called_with([], allowDiskUse=True)
def test_read_mongo_params_batch_size_and_chunksize_raises_value_error(mocker):
    """Passing both chunksize and extra['batchSize'] must be rejected."""
    class CollectionStub:
        def aggregate(self, docs, **kwargs):
            pass

    name = 'ACollection'
    stub = mocker.Mock(CollectionStub)
    stub.aggregate.return_value = []
    database = {name: stub}

    with pytest.raises(ValueError):
        pdm.read_mongo(name, [], database,
                       chunksize=30, extra={'batchSize': 20})
def mas_repetida():
    """Flask view: report the 50 most frequent words in the prepared tweets.

    Reads the ``palabra`` query parameter; when it equals ``'todas'``,
    returns a JSON mapping of the 50 most common words (with counts) found
    across all ``clean_tweets``; otherwise echoes the parameter back.

    NOTE(review): both branches swallow ValueError with ``pass``, so on
    error (or when ``palabra`` is None) the function falls through and
    implicitly returns None — Flask will raise on a None response. Confirm
    this path is unreachable in practice.
    """
    palabra = request.args.get('palabra', type=str)
    # Load the whole collection; presumably `db` is a module-level handle —
    # verify against the surrounding application setup.
    df = pdm.read_mongo("prepared_tweets", [], db)
    if palabra == 'todas':
        try:
            # Join every cleaned tweet into one text, split on whitespace,
            # and count word frequencies; keep the top 50.
            result = Counter(" ".join(
                df["clean_tweets"]).split()).most_common(50)
            # print(result)
            result_df = pd.DataFrame(result, columns=['Palabra', 'Frequencia'
                                                      ]).set_index('Palabra')
            print(result_df)
            # Serialize as {"Frequencia": {word: count, ...}}.
            result_df = result_df.to_json(orient='columns')
            return result_df
        except ValueError as e:
            pass
    else:
        try:
            # Echo the requested word; the try/except here is inert since
            # a bare return cannot raise ValueError.
            return palabra
        except ValueError as e:
            pass
def read(self, correction):
    """Read a full corrections collection and return it time-indexed.

    :param correction: name of the corrections DataFrame in the DB (str).
    :return: DataFrame sorted by its time index, or None when the
        collection is empty.
    """
    database = self.client[self.database_name]
    frame = pdm.read_mongo(correction, [], database)
    return self.sort_by_index(frame)
def read_at(self, correction, when, limit=1):
    """Read corrections around a given datetime index.

    :param correction: name of the corrections DataFrame in the DB (str).
    :param when: datetime at which to read, e.g.
        datetime(2020, 8, 12, 21, 4, 32, 7, tzinfo=pytz.utc)
    :param limit: number of indexes kept on each side of ``when``
        (limit=1 returns one index before and one after).
    :return: DataFrame sorted by its time index, or None when nothing
        was read from the database.
    """
    database = self.client[self.database_name]
    # Query the neighbourhoods on each side of `when` separately,
    # then stitch them together before sorting.
    frames = [
        pdm.read_mongo(correction, self.before_date_query(when, limit), database),
        pdm.read_mongo(correction, self.after_date_query(when, limit), database),
    ]
    return self.sort_by_index(pd.concat(frames))
def connect_mongo(self):
    """Load the "prepared_tweets" collection into a DataFrame.

    :return: the DataFrame on success, or None when connecting/reading
        fails (the error is printed, not raised).
    """
    try:
        database = MongoClient(MONGO_HOST).climateinfo
        frame = pdm.read_mongo("prepared_tweets", [], database)
        print(frame.dtypes)
        return frame
    except Exception as e:
        print(e)
def connect_mongo(self):
    """Load the "filtered_stream" collection into a DataFrame.

    :return: the DataFrame on success, or None when connecting/reading
        fails (the error is printed, not raised).
    """
    try:
        database = MongoClient(MONGO_HOST).climateinfo
        frame = pdm.read_mongo("filtered_stream", [], database)
        print(frame.head())
        return frame
    except Exception as e:
        print(e)
def test_read_mongo_index_col(mocker):
    """index_col promotes the named field to the DataFrame index."""
    rows = [
        {'t': '2020-01-01T00:00:00.000Z', 'v': 20},
        {'t': '2020-01-01T01:00:00.000Z', 'v': 15},
    ]

    class FakeCollection:
        def aggregate(self, docs, **kwargs):
            return rows

    name = 'ACollection'
    df = pdm.read_mongo(name, [], {name: FakeCollection()}, index_col='t')

    assert df.index[0] == '2020-01-01T00:00:00.000Z'
    assert df.v[0] == 20
def connect_mongo(self):
    """Load the "longfiltertweets" collection into a DataFrame.

    :return: the DataFrame on success, or None when connecting/reading
        fails (the error is printed, not raised).
    """
    try:
        client = MongoClient(MONGO_HOST)
        db = client.climateinfo
        # Store info from "longfiltertweets" collection into pandas dataframe
        df = pdm.read_mongo("longfiltertweets", [], db)
        print(df.head())
        return df
    # BUGFIX: this previously read `except Error as e` with `Error`
    # undefined, so any failure raised a NameError instead of being
    # reported. Catch Exception, matching the sibling connect_mongo
    # implementations.
    except Exception as e:
        print(e)
def read(self, correction):
    """Read a corrections collection and return it indexed by time.

    :param correction: name of the corrections DataFrame in the DB (str).
    :return: DataFrame with a UTC datetime index named 'time', or None
        when the database returned no rows.
    """
    frame = pdm.read_mongo(correction, [], self.database)
    if frame.size == 0:
        # Empty read: signal "no data" to the caller.
        return None
    # Drop Mongo's internal identifier before exposing the data.
    del frame['_id']
    # Normalize timestamps to timezone-aware UTC and index on them.
    frame['time'] = pd.to_datetime(frame['time'], utc=True)
    return frame.set_index('time')
def test_read_mongo_db_str(mocker):
    """A URI string for ``db`` is resolved through pymongo's Database."""
    rows = [
        {'t': '2020-01-01T00:00:00.000Z', 'v': 20},
        {'t': '2020-01-01T01:00:00.000Z', 'v': 15},
    ]

    class FakeCollection:
        def aggregate(self, query, **kwargs):
            return rows

    class FakeDatabase:
        def __getitem__(self, item):
            return FakeCollection()

    patched = mocker.patch("pymongo.database.Database")
    patched.return_value = FakeDatabase()

    name = 'ACollection'
    uri = "mongodb://localhost:27017/pd-mongo-sample-db"
    df = pdm.read_mongo(name, [], uri)

    assert df.index[0] == 0
    assert df.values[0][0] == '2020-01-01T00:00:00.000Z'
    assert not df.values.size == 2
import pandas as pd
import json
import operator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pdmongo as pdm
from nltk.corpus import stopwords

# Script: rank stored job postings by TF-IDF cosine similarity between
# each job's skills text and the first resume's skills, then print the
# best matches as JSON.

# Ensure the NLTK stopword corpus is available (downloads on first run).
nltk.download('stopwords')

# Jobs come from the "jobs" Mongo collection; resumes from a local CSV.
# NOTE(review): both are assumed to expose a 'Skills' column — confirm
# against the collection schema and CSV header.
jobsFile = pdm.read_mongo("jobs", [], "mongodb://localhost:27017/cmpe295")
resumesFile = pd.read_csv("resumes-data.csv")

stopset = set(stopwords.words('english'))
# Word 1- to 3-grams, English stopwords removed.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3),
                                   min_df=0, stop_words=stopset)

# Fit on job skills (cast to unicode), then project resume skills into
# the same vector space.
job_skills_matrix = tfidf_vectorizer.fit_transform(jobsFile['Skills'].astype('U'))
resume_skills_matrix = tfidf_vectorizer.transform(resumesFile['Skills'])

matchingJobsList = []
# Minimum similarity required to count a job as a match.
matchingRate = 0.0
for idx in range(len(jobsFile)):
    # 1x1 similarity matrix between job `idx` and the FIRST resume only.
    skills_similarity_score = cosine_similarity(job_skills_matrix[idx],
                                                resume_skills_matrix[0])
    if skills_similarity_score > matchingRate:
        matchingJobsList.append({
            "jid": idx,
            "score": skills_similarity_score[0][0]
        })

# NOTE(review): the name says "top ten" but the slice keeps 50 — confirm
# which limit is intended.
topTenMatchingJobs = sorted(matchingJobsList,
                            key=operator.itemgetter('score'),
                            reverse=True)[:50]
print(json.dumps(topTenMatchingJobs))
def resumen_valoracion():
    """Flask view: summarize manual sentiment labels of the prepared tweets.

    Reads the ``palabra`` query parameter. When it equals ``'todas'``,
    returns a JSON summary with the counts of tweets manually labeled
    Positivo / Negativo / Neutro; otherwise the counts are reported as 0.

    NOTE(review): both branches swallow ValueError with ``pass``, so on
    error the function implicitly returns None — Flask will raise on a
    None response. Confirm this path is unreachable.
    """
    palabra = request.args.get('palabra', type=str)
    # Load the whole collection; presumably `db` is a module-level handle —
    # verify against the surrounding application setup.
    df = pdm.read_mongo("prepared_tweets", [], db)
    # print(df.head())
    if palabra == 'todas':
        try:
            # Partition tweets by their manual label (regex match against
            # the parallel 'valoracion_manual' column).
            pos_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Positivo', df['valoracion_manual'][index])
            ]
            neg_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Negativo', df['valoracion_manual'][index])
            ]
            neu_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Neutro', df['valoracion_manual'][index])
            ]
            valoraciones = {
                "palabra": palabra,
                "total_tweets": len(df['clean_tweets']),
                "can_pos": len(pos_tweets),
                "can_neg": len(neg_tweets),
                "can_neu": len(neu_tweets)
            }
            # print(valoraciones)
            return jsonify(valoraciones)
        except ValueError as e:
            pass
    else:
        try:
            # NOTE(review): these three lists are computed but never used —
            # the counts below are hard-coded to 0. Likely dead code; kept
            # as-is to preserve behavior.
            pos_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Positivo', df['valoracion_manual'][index])
            ]
            neg_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Negativo', df['valoracion_manual'][index])
            ]
            neu_tweets = [
                tweet for index, tweet in enumerate(df["clean_tweets"])
                if re.search('Neutro', df['valoracion_manual'][index])
            ]
            valoraciones = {
                "palabra": palabra,
                "total_tweets": len(df['clean_tweets']),
                "can_pos": 0,
                "can_neg": 0,
                "can_neu": 0
            }
            # print(valoraciones)
            return jsonify(valoraciones)
        except ValueError as e:
            pass