Example #1
def genExplThreadsCol(threads, config, cacheFilename, expThreadFlag=True):

    if (len(threads) == 0):
        return {}

    prevNow = datetime.now()

    twts = getDictFromFile(cacheFilename)
    if (len(twts) == 0):

        twts = retryParallelTwtsExt(
            threads,
            maxRetryCount=config['maxRetryCount'],
            tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
            maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
            chromedriverPath=config['chromedriverPath'],
            extraParams=config)

        if (len(twts) != 0):
            dumpJsonToFile(cacheFilename, twts, indentFlag=False)
    else:
        print('\ngenExplThreadsCol(): read tweets from cache:', cacheFilename)

    delta = datetime.now() - prevNow
    twts = updateCache(threads, config, cacheFilename, expThreadFlag, twts)
    twts = normalizeCol(twts, expThreadFlag)

    return twts
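Every example in this listing treats getDictFromFile as a forgiving JSON loader: callers only ever test its result with len(...) == 0, so it must return an empty dict when the file is missing or unparsable. A minimal sketch consistent with that contract, paired with the dumpJsonToFile writer used above (these are assumptions, not the library's actual code):

import json

def getDictFromFile(filename):
    #return the parsed JSON dict, or {} when the file is missing
    #or not valid JSON (callers test the result with len() == 0)
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except (OSError, ValueError):
        return {}

def dumpJsonToFile(filename, dct, indentFlag=True):
    #serialize dct to filename; indentFlag toggles pretty-printing
    #(the True default here is an assumption)
    with open(filename, 'w') as f:
        json.dump(dct, f, indent=4 if indentFlag else None)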
Example #2
    def genFacebookCols(self, name, settings):
        print('\ngenFacebookCols():')

        filename = settings['inputFileWithPosts'].split('/')[-1].replace(
            '.json', '')
        fbCol = getDictFromFile(settings['inputFileWithPosts'])
        fbCol = socMedGenCol.normalizeCol(fbCol)

        return fbCol
Example #3
    def __init__(self, configFilename):

        print('\nExtractMicroCol::init() - start')
        reportFilename = configFilename.replace('config.json', 'report.json')

        print('\tconfig:', configFilename)
        print('\treport:', reportFilename)

        self.cols = getDictFromFile(configFilename)
        self.cache = getDictFromFile(reportFilename)
        self.reportFilename = reportFilename
        self.health = False

        if ('sources' in self.cols and 'collectionTopic' in self.cols):

            for cacheDir in [
                    'Deg1Twttr', 'ExpTwttrThreads', 'ImpTwttrThreads',
                    'Sources', 'Tweets', 'SegmentedCols', 'ShortURIs',
                    'Plots', 'CDFs'
            ]:
                createFolderAtPath('./Caches/' + cacheDir + '/' +
                                   self.cols['collectionTopic'])

            print('\tsources:', len(self.cols['sources']))
            for src in self.cols['sources']:
                print('\t\t', src['name'], 'active:', src['active'])

            self.extractCols()
            self.health = True
        else:
            print('\tmissing config')

        print('ExtractMicroCol::init() - end\n')
Example #4
def main(inFilenamePath, config):

    print('\nmain()')
    if (len(config) == 0):
        return

    filename = inFilenamePath.split('/')[-1].replace('.json', '')
    col = getDictFromFile('./' + inFilenamePath)
    col, metadata = normalizeCol(col)

    genSerpOrThreadsCol(col, filename, source=metadata['source'])
    threads = getThreads(col, config['maxThreadsToExtract'])
    genThreadsCol(threads, config, filename)
Example #5
    def __init__(self, goldstandardFilename):

        self.goldstandardFilename = goldstandardFilename
        self.goldstandard = getDictFromFile(goldstandardFilename)
        self.simCoeff = -1

        if ('sim-coeff' in self.goldstandard):
            self.simCoeff = self.goldstandard['sim-coeff']
        elif ('uris' in self.goldstandard):

            PrecEval.getHTMLAndTextForURILst(self.goldstandard,
                                             self.goldstandardFilename)
            self.setSimCoeff()
            #parallel version did not achieve decent speedup
            #self.prlSetSimCoeff()
        else:
            print('\tInvalid goldstandard supplied')
            print('\t', self.goldstandard)
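The constructor accepts two gold-standard shapes: a file that already carries a precomputed 'sim-coeff', or one that carries raw 'uris' from which the coefficient is derived via setSimCoeff(). A hypothetical minimal file of the second kind (field names inferred from this class and from getHTMLAndTextForURILst; the URIs are invented):

goldstandard = {
    'uris': [
        {'uri': 'https://example.com/story-1'},
        {'uri': 'https://example.com/story-2'}
    ]
}
#after getHTMLAndTextForURILst() and setSimCoeff() run, each uri dict
#gains 'text', 'text-len', 'title', and 'status-code', and the
#collection gains 'sim-coeff'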
Example #6
    def getColFromCache(self, colname, id):

        if (len(self.cache) == 0):
            return {}

        singleSrcFilename = './Caches/Sources/' + self.cols[
            'collectionTopic'] + '/' + id + '.json'
        src = getDictFromFile(singleSrcFilename)
        if (len(src) != 0):
            print('\tgetColFromCache():', id, 'HIT 1')
            return src

        for src in self.cache['sources']:
            if (src['name'] == colname and src['id'] == id):
                if ('output' in src):
                    if (len(src['output']) != 0):
                        print('\tgetColFromCache():', src['name'], 'HIT 2')
                        return src['output']

        return {}
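The lookup is two-tiered: a per-source file under ./Caches/Sources/ is tried first ('HIT 1'), then the sources recorded in the report cache ('HIT 2'), and an empty dict signals a miss on both. A hypothetical call site (the instance, colname, and id are invented for illustration):

#given an ExtractMicroCol instance `extractor`
serp = extractor.getColFromCache('twitter-serp', 'a1b2c3')
if (len(serp) == 0):
    print('cache miss on both tiers, regenerate the collection')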
Example #7
def main(goldFilename, testFilename=None):

    goldFilename = goldFilename.strip()
    if (len(goldFilename) == 0):
        return

    prevNow = datetime.now()
    goldstandard = PrecEval(goldFilename)

    if (testFilename is not None):
        tstFile = getDictFromFile(testFilename)
        PrecEval.getHTMLAndTextForURILst(tstFile, testFilename)

        tstFile['timestamp'] = getNowTime()
        tstFile['predicted-precision'] = PrecEval.prlEvalCol(
            tstFile, goldstandard.goldstandard, removeTxt=False)
        tstFile['sim-coeff'] = goldstandard.simCoeff
        dumpJsonToFile(testFilename, tstFile)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
Example #8
def main(segF):

    segment = getDictFromFile(segF)

    if (len(segment) == 0):
        print('\tSegment is corrupt, returning')
        return

    print('\ncarbon date segment():')
    prevNow = datetime.now()

    excludeDomains = ['archive.is']
    ignoreEmptyFiles = False
    cacheOnInd = -1  #-1 to switch off

    progress = 0
    for seg in ['ss', 'ms', 'mm', 'mc']:

        if (seg == 'mc'):
            continue

        jobsLst = []
        segSize = len(segment['segmented-cols'][seg])
        for i in range(segSize):

            carbonDateServerStartStop('start')
            uriSize = len(segment['segmented-cols'][seg][i]['uris'])
            for j in range(uriSize):

                progress += 1
                uri = segment['segmented-cols'][seg][i]['uris'][j]
                uriSeg = getURISeg(uri)

                #skip carbon dating non-relevant uris for now
                if ('relevant' in uri):
                    if (uri['relevant'] == False):
                        continue

                dom = getDomain(uri['uri'])
                if (dom in excludeDomains):
                    print('\texcluding:', dom)
                    continue

                html = ''
                uriHash = getURIHash(uriSeg['long'])
                htmlFile = './Caches/HTML/' + uriHash + '.html'
                outfilename = './Caches/CD/' + uriHash + '.txt'

                altOutfilename = ''
                if (uriSeg['short'] != ''):
                    altOutfilename = './Caches/CD/' + getURIHash(
                        uriSeg['short']) + '.txt'

                if (os.path.exists(outfilename)):

                    pubDate = readTextFromFile(outfilename).strip()
                    if (pubDate == ''):
                        if (ignoreEmptyFiles):
                            continue
                    else:
                        continue

                toPrint = '\tseg: ' + seg + ' ' + str(j) + ' of ' + str(
                    uriSize) + ', ' + str(i) + ' of ' + str(
                        segSize) + ', p: ' + str(progress)
                #print(toPrint)

                if (progress < cacheOnInd):
                    print('\tskipping', progress)
                    continue

                html = ''
                if (os.path.exists(htmlFile)):
                    html = readTextFromFile(htmlFile)

                keywords = {
                    'uri': uriSeg['long'],
                    'html': html,
                    'outfilename': outfilename,
                    'altOutfilename': altOutfilename
                }

                jobsLst.append({
                    'func': getPubDate,
                    'args': keywords,
                    'misc': False,
                    'print': toPrint
                })

        resLst = []
        jobCount = len(jobsLst)
        if (jobCount != 0):
            print('jobsLst:', jobCount)
            resLst = parallelTask(jobsLst, threadCount=3)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
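Several examples build a jobsLst of dicts with 'func', 'args', 'misc', and 'print' keys, hand it to parallelTask, and read each result's 'output' and 'misc'. A minimal sketch that honors that contract, assuming 'args' is dispatched as keyword arguments (the real implementation may differ):

from multiprocessing.dummy import Pool  #thread-backed pool

def parallelTask(jobsLst, threadCount=3):
    #run every job's 'func' with its 'args' as keyword arguments,
    #echoing 'print' for progress; 'misc' is passed through untouched
    #so callers can map results back to their inputs
    def runJob(job):
        if (job.get('print')):
            print(job['print'])
        return {'output': job['func'](**job['args']), 'misc': job['misc']}

    with Pool(threadCount) as pool:
        return pool.map(runJob, jobsLst)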
Example #9
    def genTwitterCols(self, name, settings):
        print('\ngenTwitterCols():')

        output = {
            'serp': {},
            'explicit-thread-cols': {},
            'implicit-thread-cols': {},
            'serp-heuristics': {}
        }

        filename = settings['inputFileWithTweets'].split('/')[-1].replace(
            '.json', '')
        twtCol = getDictFromFile(settings['inputFileWithTweets'])

        twtCol = socMedGenCol.normalizeCol(twtCol)
        output['serp'] = twtCol

        threadOptions = {
            'extractExpThreadCol': {
                'input': 'explicit-thread-links',
                'output': 'explicit-thread-cols',
                'trim': 'maxExpThreadToExplore',
                'expFlag': True
            },
            'extractImpThreadCol': {
                'input': 'possible-implicit-thread-links',
                'output': 'implicit-thread-cols',
                'trim': 'maxImpThreadToExplore',
                'expFlag': False
            }
        }

        for threadOption, params in threadOptions.items():
            if (settings[threadOption]):

                trim = settings[params['trim']]

                print('\t' + threadOption + ' count:',
                      len(twtCol['payload'][params['input']]))
                print('\t\twill extract max:', trim)

                expOrImp = params['output'][:3]
                expOrImp = expOrImp[0].upper() + expOrImp[1:]
                cacheFilename = './Caches/' + expOrImp + 'TwttrThreads/' + self.cols[
                    'collectionTopic'] + '/' + 'threads.json'

                output[params['output']] = socMedGenCol.genExplThreadsCol(
                    twtCol['payload'][params['input']][:trim],
                    settings,
                    cacheFilename,
                    expThreadFlag=params['expFlag'])
            else:
                print('\t', threadOption, 'off, will read report cache')
                if ('id' in settings):
                    cache = self.getColFromCache(name, settings['id'])
                    if (params['output'] in cache):
                        output[params['output']] = cache[params['output']]
                else:
                    print('\tcan\'t read cache, id not provided')

        #add degree 1 serp col - start
        print('\t adding degree 1 cols')
        deg1Settings = self.cols['degree-1-twt-cols']
        dedupSet = set()

        if ('tweets' in output['serp']['payload']):

            #don't add a tweet already explored in degree 2
            for twt in output['serp']['payload']['tweets']:
                dedupSet.add(twt['data-tweet-id'])

            output['degree-1-twt-col'] = [{'name': name, 'tweet-links': []}]
            ExtractMicroCol.addDegree1TwtLinks(
                output['serp']['payload']['tweets'],
                output['degree-1-twt-col'][-1]['tweet-links'], dedupSet)

            print('\tdeg 1', name, 'active:', deg1Settings['active'][name])
            if (deg1Settings['active'][name]):
                cacheFilename = './Caches/Deg1Twttr/' + self.cols[
                    'collectionTopic'] + '/' + name + '.json'
                ExtractMicroCol.addTwDeg1Col(
                    name, output['degree-1-twt-col'][-1]['tweet-links'],
                    cacheFilename, deg1Settings)

        for colOpt in ['explicit-thread-cols', 'implicit-thread-cols']:
            if ('thread-cols' in output[colOpt]):

                threadTypeName = name + '-' + colOpt[:3] + '-threads'
                output['degree-1-twt-col'].append({
                    'name': threadTypeName,
                    'tweet-links': []
                })
                for threadCol in output[colOpt]['thread-cols']:
                    if ('tweets' not in threadCol):
                        continue

                    ExtractMicroCol.addDegree1TwtLinks(
                        threadCol['tweets'],
                        output['degree-1-twt-col'][-1]['tweet-links'],
                        dedupSet)

                print('\tdeg 1', threadTypeName, 'active:',
                      deg1Settings['active'][threadTypeName])
                cacheFilename = './Caches/Deg1Twttr/' + self.cols[
                    'collectionTopic'] + '/' + threadTypeName + '.json'
                ExtractMicroCol.addTwDeg1Col(
                    threadTypeName,
                    output['degree-1-twt-col'][-1]['tweet-links'],
                    cacheFilename, deg1Settings)
        #add degree 1 serp col - end

        return output
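For reference, the degree-1 structure assembled above is a list of named link buckets, one for the SERP tweets and one per thread type. An illustrative value (the collection name and link are invented):

#illustrative shape of output['degree-1-twt-col']
output['degree-1-twt-col'] = [
    {'name': 'twitter-serp', 'tweet-links': ['https://twitter.com/a/status/1']},
    {'name': 'twitter-serp-exp-threads', 'tweet-links': []}
]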
Example #10
    def getHTMLAndTextForURILst(col,
                                outfilename=None,
                                printSuffix='',
                                extraParams=None):

        if (extraParams is None):
            extraParams = {}

        extraParams.setdefault('simCacheLookup', True)

        jobsLst = []
        statusCodeJobsLst = []
        jobSize = len(col['uris'])
        for i in range(jobSize):

            uri = col['uris'][i]

            if ('hash' not in uri):
                uri['hash'] = getURIHash(uri['uri'])

            if (PrecEval.uriDctHasBasics(uri)
                    and extraParams['simCacheLookup']):
                #skip URIs that already have the basics (typically
                #previously processed segments), unless cache lookup is off
                continue

            #attempt - cache - start
            cosineSimFile = './Caches/CosineSim/' + col['uris'][i][
                'hash'] + '.json'
            if (os.path.exists(cosineSimFile)
                    and extraParams['simCacheLookup']):

                cache = getDictFromFile(cosineSimFile)
                if (PrecEval.uriDctHasBasics(cache)):
                    uri['text'] = cache['text']
                    uri['text-len'] = cache['text-len']
                    uri['title'] = cache['title']
                    uri['status-code'] = cache['status-code']
                    #print('\t\tskipping since cache available')
                    continue

            if ('custom' in uri):
                if ('mime' in uri['custom']):
                    if (uri['custom']['mime'] != 'text/html'):

                        print('\tskipping', uri['custom']['mime'])
                        uri['text'] = 'NoneHTML'
                        uri['text-len'] = 8

                        uri.setdefault('title', '')
                        uri.setdefault('status-code', -1)
                        continue
            #disabled plaintext-cache shortcut:
            #txtFile = './Caches/Plaintext/' + uri['hash'] + '.txt'
            #htmlFile = './Caches/HTML/' + uri['hash'] + '.html'
            #if( os.path.exists(txtFile) ):
            #    uri['text'] = readTextFromFile(txtFile)
            #    uri['text-len'] = len(uri['text'])
            #    uri['title'] = extractPageTitleFromHTML( readTextFromFile(htmlFile) )
            #    continue
            #attempt - cache - end

            jobsLst.append({
                'func':
                mimicBrowser,
                'args': {
                    'uri': uri['uri'],
                    'extraParams': {
                        'sizeRestrict': 4000000
                    }
                },
                'misc': {
                    'i': i,
                    'hash': uri['hash']
                },
                'print':
                'gtHTML.URILst->dfURI(): ' + str(i) + ' of ' + str(jobSize) +
                printSuffix  #+ '\n\tu: ' + uri['uri']
            })

            statusCodeJobsLst.append({
                'func':
                mimicBrowser,
                'args': {
                    'uri': uri['uri'],
                    'getRequestFlag': False,
                    'extraParams': None
                },
                'misc': {
                    'i': i,
                    'hash': uri['hash']
                },
                'print':
                'gtHTML.URILst->mkHdReq.(): ' + str(i) + ' of ' +
                str(jobSize) + printSuffix
            })

        resLst = []
        if (len(jobsLst) != 0):
            resLst = parallelTask(jobsLst, threadCount=3)

        for res in resLst:

            html = res['output']
            plaintext = clean_html(html)
            indx = res['misc']['i']

            col['uris'][indx]['text'] = plaintext
            col['uris'][indx]['text-len'] = len(plaintext)
            col['uris'][indx]['title'] = extractPageTitleFromHTML(html)

            writeTextToFile('./Caches/HTML/' + res['misc']['hash'] + '.html',
                            html)
            print('\t\thtmllen:', len(html))
            writeTextToFile(
                './Caches/Plaintext/' + res['misc']['hash'] + '.txt',
                plaintext)
            print('\t\tplaintextlen:', len(plaintext))

        resLst = []
        if (len(statusCodeJobsLst) != 0):
            resLst = parallelTask(statusCodeJobsLst, threadCount=3)

        for res in resLst:

            headReq = res['output']
            indx = res['misc']['i']

            cache = {}
            cache['text'] = col['uris'][indx]['text']
            cache['text-len'] = col['uris'][indx]['text-len']
            cache['title'] = col['uris'][indx]['title']
            cache['status-code'] = -1

            col['uris'][indx]['status-code'] = -1
            if ('status-code' in headReq):
                cache['status-code'] = headReq['status-code']
                col['uris'][indx]['status-code'] = headReq['status-code']

            cacheFilename = './Caches/CosineSim/' + res['misc'][
                'hash'] + '.json'
            dumpJsonToFile(cacheFilename, cache)

        col['timestamp'] = getNowTime()
        if (outfilename is not None):
            dumpJsonToFile(outfilename, col)
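Each URI ends up with a ./Caches/CosineSim/<hash>.json record that merges the GET pass (text, title) with the HEAD pass (status code); prlEvalCol later adds a 'sim' field to the same record. An illustrative record (values invented):

cache = {
    'text': 'Plain text extracted from the page...',
    'text-len': 37,
    'title': 'Example page title',
    'status-code': 200
}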
Example #11
    def prlEvalCol(col, goldstandard, removeTxt=True, extraParams=None):

        if (extraParams is None):
            extraParams = {}

        extraParams.setdefault('minTextSize', 300)
        '''
            Important notes:
            1. If minTextSize changes, or if the gold-standard text content
               changes, set simCacheLookup to False so cached similarity
               scores are skipped and recalculated.
            2. If only the gold-standard sim-coeff changes, nothing needs
               to be done.
        '''
        extraParams.setdefault('simCacheLookup', True)
        extraParams.setdefault('printSuffix', '')

        colsize = len(col['uris'])

        if (colsize == 0 or len(goldstandard) == 0):
            print(
                '\tprlEvalCol(): colsize is 0 or goldstandard == 0, returning')
            return -1

        if ('uris' not in goldstandard):
            print('\tprlEvalCol(): no uris in goldstandard, returning')
            return -1

        goldRange = list(range(len(goldstandard['uris'])))
        combinedGold = PrecEval.combineDocsForIndices(goldstandard['uris'],
                                                      goldRange)

        precision = 0
        validColSize = 0
        jobsLst = []
        for i in range(colsize):

            #attempt getting sim from cache - start
            cosineSimFile = './Caches/CosineSim/' + col['uris'][i][
                'hash'] + '.json'
            if (os.path.exists(cosineSimFile)
                    and extraParams['simCacheLookup']):

                cosSim = getDictFromFile(cosineSimFile)
                if ('sim' in cosSim):

                    col['uris'][i]['sim'] = cosSim['sim']

                    if (cosSim['sim'] != -1):
                        validColSize += 1

                        if (PrecEval.isRel(cosSim['sim'],
                                           goldstandard['sim-coeff'])):
                            col['uris'][i]['relevant'] = True
                            precision += 1
                        else:
                            col['uris'][i]['relevant'] = False

                    continue
            #attempt getting sim from cache - end

            noopFlag = False
            usingSubText = ''
            if (len(col['uris'][i]['text']) < extraParams['minTextSize']):
                if ('post-details' in col['uris'][i]):
                    #gold standards do not have post-details
                    if ('substitute-text' in col['uris'][i]['post-details']):

                        subText = col['uris'][i]['post-details'][
                            'substitute-text'].strip()
                        if (subText != ''):
                            col['uris'][i]['text'] = subText
                            col['uris'][i]['custom'][
                                'substitute-text-active'] = True
                            usingSubText = '\n\t\tusing subtext: ' + col[
                                'uris'][i]['uri']
                        else:
                            noopFlag = True

                    else:
                        #don't process uris with small text
                        #don't skip (continue) so cache can update
                        noopFlag = True

            matrix = [col['uris'][i]['text'], combinedGold]
            keywords = {'matrix': matrix, 'noopFlag': noopFlag}
            toPrint = '\tprlEvalCol():' + str(i) + ' of ' + str(
                colsize) + ' ' + extraParams['printSuffix'] + usingSubText

            if ('status-code' not in col['uris'][i]):
                print('\tproblem ahead for uri:', col['uris'][i]['uri'])
                print('\tproblem ahead for hash:', col['uris'][i]['hash'])
                print('\tproblem ahead for cosineSimFile:', cosineSimFile)
                print('\tproblem ahead for keys:', col['uris'][i].keys())

            cache = {
                'hash': col['uris'][i]['hash'],
                'self': cosineSimFile,
                'uri': col['uris'][i]['uri'],
                'title': col['uris'][i]['title'],
                'text': col['uris'][i]['text'],
                'text-len': len(col['uris'][i]['text']),
                'status-code': col['uris'][i]['status-code']
            }
            jobsLst.append({
                'func': PrecEval.calcPairSim,
                'args': keywords,
                'misc': {
                    'i': i,
                    'cache': cache
                },
                'print': toPrint
            })

        resLst = []
        if (len(jobsLst) != 0):
            resLst = parallelTask(jobsLst, threadCount=3)

        for res in resLst:

            indx = res['misc']['i']
            cache = res['misc']['cache']

            sim = res['output']
            col['uris'][indx]['sim'] = sim

            if (sim != -1):
                validColSize += 1

                if (PrecEval.isRel(sim, goldstandard['sim-coeff'])):
                    col['uris'][indx]['relevant'] = True
                    precision += 1
                else:
                    col['uris'][indx]['relevant'] = False

            #write cache - start
            cache['sim'] = sim
            dumpJsonToFile(cache['self'], cache)
            #write cache - end

        if (removeTxt):
            for i in range(colsize):
                if ('text' in col['uris'][i]):
                    del col['uris'][i]['text']

        if (validColSize > 0):
            return precision / validColSize
        else:
            return -1
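The returned precision is the count of relevant URIs divided by the number with a valid similarity score, where relevance is decided by PrecEval.isRel against the gold standard's sim-coeff. A plausible reading of isRel, assuming the coefficient is a minimum cosine-similarity threshold (not confirmed by this excerpt):

def isRel(sim, simCoeff):
    #assumed threshold test: a document is relevant when its cosine
    #similarity to the combined gold-standard text meets sim-coeff
    return (sim >= simCoeff)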