Example #1
def getMigrationScript(collection):
    dbcon = DBConnection()
    db = MongoConsts.DB
    uri = dbcon.getRemoteConnectionString()
    # Build shell commands that export the collection from the local mongod
    # and import the dump into the remote database identified by the URI.
    script = f"""
    mongoexport --db={db} --collection={collection} --out={collection}.json
    mongoimport --collection={collection} --file={collection}.json --uri={uri}
    """
    return script
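The returned string is just a pair of shell commands, so it has to be executed outside the Python process. A minimal sketch of one way to run it, assuming the commands are meant for a local shell; the subprocess call and the "articles" collection name are illustrative, not part of the source:

import subprocess

# Hypothetical usage: run the generated mongoexport/mongoimport commands one by one.
script = getMigrationScript("articles")
for line in script.strip().splitlines():
    subprocess.run(line.split(), check=True)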
Example #2
def getDF():
    connection = DBConnection()

    collection = connection.indianMediaVideoCollection

    rows = []
    for vid in collection.find():
        try:
            # Some documents wrap the video inside a list-type API response;
            # unwrap to the first item before reading the snippet.
            isTypeList = vid["kind"] == Channels.VID_TYPE_LIST

            info = vid["items"][0] if isTypeList else vid
            url = f"www.youtube.com/watch?v={info['id']}"
            info = info["snippet"]

            channelId = Channels.reverseLookup(info["channelId"])
            ptitle = (vid["playlist"]["snippet"]["title"]
                      if "snippet" in vid["playlist"] else "")
            title = info["title"]
            desc = info["description"]
            date = info["publishedAt"]

        except Exception as e:
            print(vid.keys())
            print(vid)
            print(e)
            raise e

        rows.append([channelId, ptitle, date, title, desc, url])

    header = [
        "Channel Id", "Playlist Title", "Date", "Title", "Description", "Url"
    ]

    df = pd.DataFrame(rows, columns=header)
    return df
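A short usage sketch for getDF(); the groupby is illustrative and relies only on the column names defined in the header list above:

# Hypothetical usage: count scraped videos per channel.
df = getDF()
videos_per_channel = df.groupby("Channel Id")["Url"].count()
print(videos_per_channel.sort_values(ascending=False))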
Example #3
    def test_save_article_to_db(self):
        self.sut = GoodNewsNetwork(DBConnection(), 1, 1)

        self.sut.save_article_to_db({"_id": "abc", "content": "asd"})

        q = {"_id": "abc"}
        coll = self.sut.dbconn.getCollection(MongoConsts.GOOD_NEWS_COLLECTION)
        a = coll.find_one(q)
        self.assertIsNotNone(a)
        coll.delete_one(q)
        a = coll.find_one(q)
        self.assertIsNone(a)
Example #4
def getWordDatesDF(limit=None):
    connection = DBConnection()
    collection = connection.getCollection(MongoConsts.WORD_DATE_COLLECTION)
    dfs = []
    r = collection.find() if limit is None else collection.find().limit(limit)
    for vid in r:
        # Each document stores a per-(date, channel) series under "ts",
        # keyed by stringified tuples; one column per word.
        df = pd.read_json(json.dumps(vid["ts"]), orient="index")
        df.columns = [vid["word_id"]]
        dfs.append(df)

    merged = pd.concat(dfs, axis=1)
    # Rebuild a (date, channel) MultiIndex from the stringified tuple keys.
    merged.index = pd.MultiIndex.from_tuples(
        [literal_eval(i) for i in merged.index])

    merged = merged.reset_index()

    # Sort chronologically, then restore the original "%m_%d_%y" string format.
    merged["level_0"] = pd.to_datetime(merged["level_0"], format="%m_%d_%y")
    merged = merged.sort_values("level_0")
    merged["level_0"] = merged["level_0"].dt.strftime("%m_%d_%y")
    merged = merged.set_index(["level_0", "level_1"])

    return merged
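Judging from the parsing above, each document in WORD_DATE_COLLECTION is expected to map stringified (date, channel) tuples to a single value per word under a "ts" field. A purely hypothetical illustration inferred from the code, not taken from the source data:

# Hypothetical document shape; the word and the values are made up.
example_doc = {
    "word_id": "vaccine",
    "ts": {
        "('03_15_20', 'channel_a')": 4,   # key: str((date, channel)), value for that pair
        "('03_16_20', 'channel_a')": 7,
    },
}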
Example #5
    def __init__(self):
        self.dtype_lookup = {
            FlatFiles.WORD_BY_DATE: {
                "channel_id": "category",
                "date": "str",
                "variable": "category",
                "value": "float16",
                "date_month": "int8",
                "date_week": "int8"
            }
        }

        self.cache = {}

        self.db = DBConnection()
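The narrow dtypes (category, int8, float16) suggest the lookup is meant to be handed to pandas when the WORD_BY_DATE flat file is loaded. A minimal sketch under that assumption; the file path and the `service` instance name are hypothetical:

import pandas as pd

# Assumed usage of the dtype lookup above; "word_by_date.csv" is a placeholder path
# and `service` stands in for an instance of the class whose __init__ is shown.
dtypes = service.dtype_lookup[FlatFiles.WORD_BY_DATE]
df = pd.read_csv("word_by_date.csv", dtype=dtypes)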
Example #6
    def setUp(self):
        dbconn = DBConnection()
        self.sut = TrendRank(dbconn, TrendRankTest.TermRankCollection)
Example #7
    def get_article_content(self, url):
        try:
            resp = requests.get(url)
            # Use an explicit parser to avoid bs4's "no parser specified" warning.
            soup = BeautifulSoup(resp.text, "html.parser")
            content = soup.select_one(
                GoodNewsNetwork.Selectors.POST_CONTENT).text
        except Exception as e:
            logging.error(f"Failed to get article - {url} - {e}")
            content = None

        return content


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--start-page",
                        default=1,
                        type=int,
                        help="Page to start scraping from")
    parser.add_argument(
        "--max-pages",
        default=1,
        type=int,
        help="Maximum Number of Pages to extract from the site")
    args = parser.parse_args()

    dbconn = DBConnection()
    GoodNewsNetwork(dbconn, args.start_page, args.max_pages).scrape_articles()
Example #8
        return self.getDBConnection().articleCollection.delete_one({"url": url})

    def getArticleFromDBByUrl(self, url):
        return self.getDBConnection().articleCollection.find_one({"url": url})

    def getAllArticlesFromDB(self):
        # Skip articles whose stored text is an error page rather than real content.
        retrieval_query = {"text": {"$not": {"$regex": ".*(403|Forbidden|FAILED_TO_LOAD_PAGE).*"}}}
        return self.getDBConnection().articleCollection.find(retrieval_query)

    def getAllArticlesFromDBAsDf(self):
        articles = self.getAllArticlesFromDB()
        df = pd.DataFrame(articles)
        return df

    def writeAllArticlesToCSV(self, fpath):
        return self.writeArticlesToCSV(fpath, self.getAllArticlesFromDB())

    def writeArticlesToCSV(self, fpath, articles):
        df = pd.DataFrame(articles)
        print(df.shape)
        df.to_csv(fpath, index=False)

    def runJob(self, fpath="./test.csv", limit=-1):
        df = self.getDF()
        self.saveAllUrlContentInDB(df, limit)
        self.writeAllArticlesToCSV(fpath)


if __name__ == "__main__":
    dbC = DBConnection()
    ArticleScraper(dbC).writeAllArticlesToCSV("./test.csv")
Example #9
def migrate(collection):
    dbcon = DBConnection()
    localCstr = dbcon.getLocalConnectionString()
    remoteClient = dbcon.getRemoteClient()
    db = remoteClient[MongoConsts.DB]
    # PyMongo's Database has no cloneCollection() method, so the server command
    # is issued via Database.command(). Note MongoDB deprecated cloneCollection
    # in 4.2, so this only works against older servers.
    db.command({
        "cloneCollection": f"{MongoConsts.DB}.{collection}",
        "from": localCstr,
    })
Example #10
    def __init__(self):
        super().__init__(DBConnection(), MongoConsts.TERM_RANK_COLLECTION)
Example #11
            d.index = pd.MultiIndex.from_tuples([literal_eval(i) for i in d.index])
            d.columns = [name]
            jsis.append(d)

        if len(jsis) < 1:
            return None
        df = pd.concat(jsis, axis=1)

        # df = self._get_all_series_ranked(df).reset_index().rename(
        #     columns={"level_0": TrendRank.COLS.DATE, "level_1": TrendRank.COLS.CHNL})
        df = df.reset_index().rename(
            columns={"level_0": TrendRank.COLS.DATE, "level_1": TrendRank.COLS.CHNL})
        df[TrendRank.COLS.DATE] = pd.to_datetime(df[TrendRank.COLS.DATE], format="%m_%d_%y")
        df = df.sort_values(TrendRank.COLS.DATE)
        return df

    def run_job(self):
        logging.info("Starting Job")
        df = self.get_df()
        rm = self.get_rank_matrix(df)
        # print(rm["coronavirus"][rm["coronavirus"] != -1])
        self.save_rank_matrix(rm)


@singleton
class TrendRankDataFrameService(TrendRank):

    def __init__(self):
        super().__init__(DBConnection(), MongoConsts.TERM_RANK_COLLECTION)


if __name__ == "__main__":
    TrendRank(DBConnection(), MongoConsts.TERM_RANK_COLLECTION).run_job()
Example #12
    def __init__(self):
        self.df = getDF()
        print(self.df.head())
        self.dbcon = DBConnection()
        self.max_terms_to_save = 1000
Example #13
    def __init__(self):
        dbconn = DBConnection()
        super().__init__(dbconn, MongoConsts.TD_SVD_COMP_COLLECTION,
                         MongoConsts.TD_SVD_DF_COLLECTION,
                         MongoConsts.TD_TOPICS_COLLECTION,
                         MongoConsts.TD_DOC_TOPICS_COLLECTION)
Example #14
    def setUp(self):
        self.sut = ArticleScraper(DBConnection())
Example #15
    def __init__(self):
        dbconn = DBConnection()
        super().__init__(dbconn,
                         MongoConsts.QD_TRBD_TERM_DIST_COLLECTION,
                         MongoConsts.QD_TRBD_GRP_DIST_COLLECTION)
Example #16
import datetime
import logging
import math
import os

import numpy as np
import pyyoutube

logging.basicConfig(level=logging.INFO)

from IndianMedia.constants import Channels, MongoConsts, Creds
from IndianMedia.mongointf.pymongoconn import DBConnection

try:
    __file__
except NameError:
    # Fall back to a project-relative path when run interactively.
    __file__ = os.path.abspath(
        os.path.join(".", "..", "Analytics", "IndianMedia", Creds.KEY_FILE))

connection = DBConnection()

f = os.path.abspath(os.path.join(os.path.dirname(__file__), Creds.KEY_FILE))
with open(f, "r") as keyfile:
    key = keyfile.read().strip("\n")

api = pyyoutube.Api(api_key=key)

#p = pl.items[0]


def GetChannelVideoInfo(channelId, daysSince, limit):
    afterDate = (datetime.datetime.now() +
                 datetime.timedelta(days=-daysSince)).isoformat() + "Z"
    videos = api.search(parts="snippet",
                        channel_id=channelId,
                        count=limit,