Пример #1
0
def prep_topic_analysis(pathDataIn,
                        pathDataOut,
                        data_source_tag,
                        onlyUnique=True):
    """Write a tab-separated text corpus for topic analysis.

    ``pathDataIn`` holds one JSON object per line.  It is streamed from its
    last line to its first, and every record that is not flagged
    ``is_wanted`` and has a non-empty ``selftext`` produces one row in
    ``pathDataOut`` with the columns: ``post_uid`` (falling back to the
    standardized address), standardized ``mc_addr``, ``data_source_tag`` and
    the title + selftext with markup punctuation replaced by spaces.

    Args:
        pathDataIn: path of the line-delimited JSON input file.
        pathDataOut: path of the tab-separated output file (overwritten).
        data_source_tag: constant value written into the third column.
        onlyUnique: when true, only the first record seen per standardized
            address is written; because the input is read in reverse, that
            is the record closest to the end of the file.

    Raises:
        subprocess.CalledProcessError: if the ``tail`` child exits non-zero.
        KeyError: if a selected record has no ``title``.
        UnicodeEncodeError: re-raised after pretty-printing the offending
            record when a row cannot be encoded for the output file.
    """
    # Applied strictly in this order: the two-character markers come before
    # their one-character forms so that e.g. '[[' collapses to ONE space.
    replacements = (
        ('\n', ' '),
        ('"', "'"),
        ('[[', ' '),
        (']]', ' '),
        ('**', ' '),
        ('[', ' '),
        (']', ' '),
        ('*', ' '),
        ('.', ' '),
        ('/', ' '),
        ('!', ' '),
        ('(', ' '),
        (')', ' '),
        (':', ' '),
    )
    tail_cmd = ["tail", "-r", pathDataIn]
    visits = set()
    # `tail -r` (a BSD/macOS option) emits the file last line first.  Using
    # Popen itself as the context manager closes the pipe AND waits for the
    # child, so no zombie process is left behind.  newline='' is what the csv
    # module asks for on files handed to csv.writer.
    with subprocess.Popen(tail_cmd,
                          stdout=subprocess.PIPE,
                          universal_newlines=True) as tail_proc, \
            open(pathDataOut, 'w', newline='') as outfile:
        mcdata_csv = csv.writer(outfile, delimiter='\t')
        for mcline in tail_proc.stdout:
            mc = json.loads(mcline)
            if mc.get('is_wanted', False) or not mc.get('selftext'):
                continue
            visit = standardize_address(mc.get('mc_addr', ''))
            if onlyUnique and visit in visits:
                continue
            visits.add(visit)
            text = mc['title'] + ' ' + mc['selftext']
            for old, new in replacements:
                text = text.replace(old, new)
            try:
                mcdata_csv.writerow(
                    [mc.get('post_uid', visit), visit, data_source_tag, text])
            except UnicodeEncodeError:
                # show the record that could not be written, then fail loudly
                pprint(mc)
                raise
    # A failed `tail` (e.g. a platform without `-r`) would otherwise look
    # exactly like an empty input file.
    if tail_proc.returncode:
        raise subprocess.CalledProcessError(tail_proc.returncode, tail_cmd)
    print(len(visits))
def merge_sniffer_into_core_json(d_mcsniff, s_in_file, s_out_file):
    """Merge sniffer observations into the core per-server JSON records.

    Reads ``s_in_file`` (one JSON object per line), and for every record
    whose standardized ``mc_addr`` is a key of ``d_mcsniff`` copies selected
    sniffer fields into the record.  Free-text fields (motd, description,
    welcome, help page, non-standard plugin text, sign text) are gathered
    into a dict stored under ``text_short``.  Every record, matched or not,
    gets ``reported_sniff`` and ``text_short`` and is written, one JSON
    object per line, to ``s_out_file``.

    All records are held in memory and the output is only opened after the
    input has been read completely.

    Args:
        d_mcsniff: dict mapping standardized server address -> sniffer record.
        s_in_file: path of the line-delimited JSON input.
        s_out_file: path of the line-delimited JSON output (overwritten).

    Returns:
        None.  Prints the number of sniffer records, then the number of
        input records and the number of those matched to sniffer data.

    Raises:
        KeyError: if a matched sniffer record has no ``whitelist`` key, or
            if a sniffer plugin list must be merged into a core record that
            has no ``plugins_names`` key.
    """
    ### raw string: "\d" inside a plain literal is an invalid escape sequence
    plugins_header = re.compile(r"^Plugins (\d+): ")
    ### sniffer fields copied verbatim into the record under an 'snf_' prefix
    snf_passthrough = ('difficulty', 'gamemode', 'hardcore', 'level_type',
                       'brand', 'software')
    mc_json_out = []
    print(len(d_mcsniff))
    counter = 0
    counter2 = 0
    with open(s_in_file, 'r') as infile:
        for line in infile:
            counter += 1
            #if counter % 100 != 0: continue
            mc = ujson.loads(line)
            interesting_text = {}
            ### motd is either a plain string or a dict carrying a 'text' key
            motd = mc.get('motd')
            if isinstance(motd, str):
                interesting_text['motd'] = motd
            elif isinstance(motd, dict):
                interesting_text['motd'] = motd['text']
            if 'website_url' in mc and len(mc['website_url']) > 0:
                interesting_text['website'] = 'website'
            ### sniff fields are (from 20160712 data collected on 20160719)
            #{ 'host': {"<class 'str'>": 97267},
            #'id': {"<class 'int'>": 97267},
            #'online': {"<class 'bool'>": 97267},
            #'port': {"<class 'int'>": 97267},
            #'signs': {"<class 'list'>": 97267},
            #'state': {"<class 'str'>": 97267},
            #'description': {"<class 'NoneType'>": 257, "<class 'str'>": 97010},
            #'protocol_version': {"<class 'NoneType'>": 267, "<class 'int'>": 97000},
            #'version': {"<class 'NoneType'>": 267, "<class 'str'>": 97000},
            #'max_player_count': {"<class 'NoneType'>": 276, "<class 'int'>": 96991},
            #'player_count': {"<class 'NoneType'>": 276, "<class 'int'>": 96991},
            #'error': {"<class 'NoneType'>": 9455, "<class 'str'>": 87812},
            #'plugins_fml': {"<class 'NoneType'>": 69432, "<class 'list'>": 27835},
            #'whitelist': {"<class 'NoneType'>": 78537, "<class 'bool'>": 18730},
            #'difficulty': {"<class 'NoneType'>": 86782, "<class 'int'>": 10485},
            #'gamemode': {"<class 'NoneType'>": 86782, "<class 'int'>": 10485},
            #'hardcore': {"<class 'NoneType'>": 86782, "<class 'bool'>": 10485},
            #'level_type': {"<class 'NoneType'>": 86782, "<class 'str'>": 10485},
            #'brand': {"<class 'NoneType'>": 86797, "<class 'str'>": 10470},
            #'players': {"<class 'NoneType'>": 87392, "<class 'list'>": 9875},
            #'help_p1': {"<class 'NoneType'>": 91787, "<class 'str'>": 5480},
            #'software': {"<class 'NoneType'>": 94398, "<class 'str'>": 2869},
            #'plugins': {"<class 'NoneType'>": 94820, "<class 'str'>": 2447},
            #'welcome': {"<class 'NoneType'>": 96224, "<class 'str'>": 1043}}
            ### standardize the address once and reuse it for test and lookup
            matched = False
            if 'mc_addr' in mc:
                addr = standardize_address(mc['mc_addr'])
                matched = addr in d_mcsniff
            if matched:
                mc_snf = d_mcsniff[addr]
                counter2 += 1
                mc['reported_sniff'] = True
                mc['whitelist'] = mc_snf['whitelist']
                #if 'protocol_version' in mc_snf: mc['snf_protocol_v'] = mc_snf['protocol_version']
                ### fill in a version only when the core record has none;
                ###  core fields win over game_query, which wins over sniff
                if 'server_version_number' not in mc:
                    if 'version' in mc:
                        mc['server_version_number'] = mc['version']
                    elif 'game_query' in mc and 'version' in mc['game_query']:
                        mc['server_version_number'] = mc['game_query'][
                            'version']
                    elif 'game_query' in mc and (
                            'server_mod_version' in mc['game_query']
                            or 'server_mod_name' in mc['game_query']):
                        mc['server_version_number'] = (
                            mc['game_query'].get('server_mod_name', '') +
                            '_' +
                            mc['game_query'].get('server_mod_version', ''))
                    elif 'version' in mc_snf:
                        mc['server_version_number'] = mc_snf['version']
                ### the sniffed description is only a fallback
                if 'description' in mc_snf and 'description' not in mc:
                    mc['description'] = mc_snf['description']
                if 'description' in mc:
                    interesting_text['description'] = mc['description']
                if 'welcome' in mc_snf:
                    interesting_text['welcome'] = mc_snf['welcome']
                if 'help_p1' in mc_snf:
                    interesting_text['help_p1'] = mc_snf['help_p1']
                for field in snf_passthrough:
                    if field in mc_snf:
                        mc['snf_' + field] = mc_snf[field]
                #
                ### this is probably wrong, and I probably don't want it, but
                ###  if I get to wanting it, I'll do it here someway like this.
                #mc['plugins_names'].append(mc_snf['plugins_fml'])
                ### this is complicated because its non default, and many
                ###  people use it to say many things, and its sometimes empty,
                ###  and I don't want to add to existing plugin_names list a
                ###  plugin already listed
                ### .get(): a missing key is treated like the None the
                ###  sniffer stores when there is no plugin text
                plugins_text_sniff = mc_snf.get('plugins')
                if plugins_text_sniff is not None and plugins_text_sniff != '':
                    if plugins_header.match(plugins_text_sniff):
                        ### standard "Plugins (N): a, b, c" listing
                        plugin_names_sniff = [
                            plug_name.strip() for plug_name in
                            plugins_text_sniff.partition(':')[2].split(',')
                        ]
                        ### a set drops plugins listed by both sources
                        ### NOTE(review): this relies on ujson being able to
                        ###  serialise a set when writing below -- confirm
                        mc['plugins_names'] = set(mc['plugins_names'] +
                                                  plugin_names_sniff)
                        mc['reported_plugins'] = True
                    else:
                        ### anything else is free text set by the operator
                        interesting_text[
                            'plugins_override'] = plugins_text_sniff

                ### now handle signs
                if 'signs' in mc_snf:
                    sign_texts = []
                    for sign in mc_snf['signs']:
                        if len(''.join(sign['lines'])) == 0:
                            continue  ### decided that I don't want empty signs
                        sign_texts.append('\\\\'.join(
                            sign['lines']).strip('\\\\'))
                    ### no 'signs' entry at all if every sign was empty
                    if sign_texts:
                        interesting_text['signs'] = sign_texts
                    mc['snf_signs_count'] = len(sign_texts)
                else:
                    mc['snf_signs_count'] = None
            else:
                mc['reported_sniff'] = False
            mc['text_short'] = interesting_text
            mc_json_out.append(mc)
    with open(s_out_file, 'w') as outfile:
        for mc in mc_json_out:
            outfile.write(ujson.dumps(mc))
            outfile.write("\n")
    print('second number gives number of servers matched to sniff data')
    print(counter, counter2)
        if failed:
            row = get_valid_json_with_halflines(
                line, (start_char + i_second_open_bracket + 1))
    return (row)


### load sniffer json into a giant dict indexed by the server location
###  (the key is the standardized "host:port" address of each record)
sniffer_dataset = get_freshest_data_date("lib_datasets_sniffer.txt")
d_mcsniff = {}
with open(
        ''.join((pathData, 'mcsniffer/', sniffer_dataset, '/',
                 "out_servers.json")), 'r') as f_mcdata_in:
    for line in f_mcdata_in:
        mc = get_valid_json_with_halflines(line.strip())
        if mc:
            ### lines that yield no record are simply left out of the dict
            mc['mc_addr'] = standardize_address(mc['host'] + ':' +
                                                str(mc['port']))
            d_mcsniff[mc['mc_addr']] = mc


### use that to structure merge into existing json
def merge_sniffer_into_core_json(d_mcsniff, s_in_file, s_out_file):
    mc_addrs_sniff = d_mcsniff.keys()
    mc_json_out = []
    print(len(mc_addrs_sniff))
    counter = 0
    counter2 = 0
    with open(s_in_file, 'r') as infile:
        for line in infile:
            counter += 1
            #if counter % 100 != 0: continue
            mc = ujson.loads(line)
Пример #4
0
### use that to structure merge into existing json
print(len(d_mcs_org))
print("merge mcs.org into concatenated omni logs")
counter = 0   ### records read
counter2 = 0  ### records that found an mcs.org counterpart
mc_json_out = []
### read from a temporary copy of the step3 file
copyfile(pathData+"step3_scraped_omnimc_posts"+".json", pathData+"tmp_step3_scraped_omnimc_posts"+".json")
with open(pathData+"tmp_step3_scraped_omnimc_posts"+".json", 'r') as infile:
    for line in infile:
        counter += 1
        mc = ujson.loads(line)
        ### merging of relevant fields, incl ["id", "title", "selftext", "primary_tags", "ip", "port", "version", "banner", "created", "updated", "youtube_video", "website_url", "country_code", "votes", "rank", "uptime", "totaltime", "daily_uptime", "daily_totaltime"]
        ### mcs.org records are keyed "<standardized address>_<mcs.org date>";
        ###  the date comes from mapping this record's omni dataset date
        mco = False
        mcs_org_date = map_omni_to_mcs_org.get(mc['dataset_date'], False)
        if mcs_org_date:
            mco = d_mcs_org.get(standardize_address(mc['mc_addr'])+'_'+mcs_org_date, False) or False
        if mco:
            counter2 += 1
            ### overwrite the omni fields with their mcs.org counterparts
            mc.update({
                'dataset_source': 'mcs_org',
                'title': mco['title'],
                'selftext': mco['description'],
                'primary_tags': mco['tags'],
                'ip': mco['ip'].rstrip(),
                'port': mco['port'],
                'server_version_number': mco['version'],
                'banner': mco['banner'],
                'created': mco['created'],
                'updated': mco['updated'],
                'youtube_video': mco['youtube_video'],
                'website_url': mco['website_url'],
            })
def write_scrape_csv_row(dObs, playerWriter, serverWriter):
    """Write one scrape observation to the player and server CSV writers.

    One row per listed player (timestamp, address, md5 hex digest of the
    player entry) goes to ``playerWriter``; one summary row goes to
    ``serverWriter``, ending with a flag saying whether the reported player
    numbers look like the product of a modified ("hacked") API.

    Args:
        dObs: observation dict.  Keys read here: ``mc_addr``, ``timestamp``,
            ``players`` (list of strings), ``players_online``,
            ``players_max``, ``reported_status``, ``reported_sample``,
            ``reported_query`` and, optionally, ``latency``.
            ``dObs['mc_addr']`` is overwritten in place with its
            standardized form.
        playerWriter: object with a ``writerow`` method (e.g. csv.writer).
        serverWriter: object with a ``writerow`` method (e.g. csv.writer).

    Returns:
        dict mapping statistic name -> 0/1 indicator for this observation.

    Raises:
        KeyError: if one of the required keys is missing from ``dObs``.
    """
    dObs['mc_addr'] = standardize_address(dObs['mc_addr'])
    players = dObs['players']
    n_listed = len(players)
    for p in players:
        playerWriter.writerow([dObs['timestamp'], dObs['mc_addr'], hashlib.md5(p.encode('utf-8')).hexdigest()])
    n_reported = dObs['players_online']
    n_max = dObs['players_max']
    hackedAPI = (
        n_listed == 0 or ### not sure why, but this is a reliable signal of a hacked API
        (n_listed == 1 and n_reported > 3) or ### must be equal
        ### must be close.  This used to compare players_online with itself
        ###  (x - 10 > x), which can never be true; the list length is the
        ###  other side of the comparison
        (n_listed > 1 and n_reported - 10 > n_listed) or
        ### must be close, or, if players_online is zero, then the player
        ###  list is actually still trustworthy and mismatches are OK. This
        ###  has some false positives, particularly among big servers. The
        ###  general rule remains that a mod to the API is disqualifying,
        ###  unless there is a guarantee for a special case (like zero) that
        ###  API modifications are safe
        (n_reported > 0 and n_listed - 9 > n_reported) or
        ### and the tolerance is bigger for bigger servers, because there is
        ###  more room for lag to affect synching of counts
        (n_listed > 50 and n_listed - 20 > n_reported) or
        n_max + 2 < n_listed or ### don't exceed max (plus/minus noise/lag)
        n_max + 2 < n_reported or
        n_max <= 0 or ### negative and zero are impossible
        n_reported < 0 or ### negative is impossible
        ### player array replaced by int (18,14,10,1,or 0). When this
        ###  happens, length of list is never greater than 1
        (n_listed == 1 and len(players[0]) < 3) or
        dObs['mc_addr'].lower() in (
            "131.153.5.218", "alpa.playmcm.net", "playmcm.net", "pvp.originmc.org"
        )
    )
    serverWriter.writerow([dObs['timestamp'], dObs['mc_addr']
                           , 1 if dObs['reported_status'] else 0
                           , 1 if dObs['reported_sample'] else 0
                           , 1 if dObs['reported_query'] else 0
                           , n_max
                           , n_listed
                           , dObs.get('latency', -1)
                           , hackedAPI
                          ])
    ### this isn't a sign of badness, just a sign of a certain type of plugin
    ###  installed, in which case it means the op is online
    has_dummy = "00000000-0000-0000-0000-000000000000" in players
    statistics = {
        'countInternal': 1,
        ## always one or the other of these two:
        'playerKeyInRow': int('players' in dObs),
        'playerKeyNotInRow': int('players' not in dObs),
        ### always one or the other of these three
        ###  if list is empty, 10:1 chances that players_online is a lie.
        ###     so I'm using emptiness as one flag of hacked APIs
        'playerListEmpty': int(n_listed == 0),
        'playerListLen1': int(n_listed == 1),
        'playerListLenBig': int(n_listed > 1),
        'playerListDummy': int(has_dummy),
        'playerListDummyEmbedded': int(has_dummy and n_listed > 1),
        'playerListIntDummy': int(n_listed > 0 and len(players[0]) < 3),
        ### always true
        'playersReported': int('players_online' in dObs),
        ### usually true
        'playersReportedEqualTruth': int(n_reported == n_listed),
        #### these are flags of a hacked API
        'playersReportedOverTruth': int(n_reported > n_listed),
        'playersReportedUnderTruth': int(n_reported < n_listed),
        ### always false
        'playersReportedNull': int(n_reported is None),
        'playersReportedFalse': int(n_reported is False),
        'playersReported0': int(n_reported == 0),
        'playersReported0Alone': int(n_reported == 0 and n_listed > 0),
        ### true 9 times out of 10
        'playersReportedNotEqualTruth0': int(n_reported != 0 and n_listed == 0),
        'playersReportedNegative': int(n_reported < 0),
        'playersMaxNegative': int(n_max < 0),
        ### this is never true
        'apimod1': int(not dObs['reported_sample'] and n_listed > 0),
        ### this is most often true
        'apimod2': int(not dObs['reported_query'] and n_listed > 0),
        ### these are rarely true, and signs of a hacked API
        'apimod3': int(n_max < n_listed),
        'apimod4': int(n_max < n_reported),
        'apimod5': int(hackedAPI),
    }
    return statistics
Пример #6
0
                if len(mc) != 19:
                    print("PROBLEM at {rc}, row length:{rlength}".format(
                        rc=i_rowcount, rlength=len(mc)))
                    print('\n'.join(mc))
                for key, entry in iter(mc.items()):
                    if entry == 'N': mc[key] = None
                if mc['id'] == None:
                    #i_badrowcount += 1
                    #print "bad row:", mc)
                    continue

                i_rowcount += 1
                #if i_rowcount > 100: break
                ### data formatting
                mc['id'] = int(mc['id'])
                mc['mc_addr'] = standardize_address(mc['ip'] + ':' +
                                                    mc['port'])
                mc['port'] = int(mc['port'])
                mc['dataset_date'] = thedate
                mc['dataset_source'] = 'mcs_org'
                mc['date_created'] = parse(mc['updated']).strftime('%Y%m%d')
                mc['post_uid'] = mc['mc_addr'] + '_' + mc['dataset_date']
                mc['votes'] = int(mc['votes'])
                mc['rank'] = int(mc['rank'])
                mc['totaltime'] = int(mc['totaltime'])
                mc['daily_totaltime'] = int(mc['daily_totaltime'])
                mc['uptime'] = int(mc['uptime'])
                mc['daily_uptime'] = int(mc['daily_uptime'])
                mc['tags'] = [tag.strip() for tag in mc['tags'].split(',')]
                ### new fields
                #mc['measure_one'] = str(mc['id']) in l_ids_obs1
                #mc['measure_two'] = str(mc['id']) in l_ids_obs2