Python media_file_cleanings 예제들, manyutils.media_file_cleanings Python 예제들

예제 #1

0

파일 보기

파일: perform_analysis.py 프로젝트: krokodilerian/trackmap

    def verify_media_country(the_user_input, special):
        # this function left the media file open forever. :(
        if special:
            special_f = os.path.join('special_media', the_user_input)
            if not os.path.isfile(special_f):
                print colored(
                    "Invaild special URL source, check in special_media ",
                    'red')
                quit(-1)

            cfp = file(special_f, 'r')
            unclean_lines = cfp.readlines()

            print colored(" ࿓  Importing special media list:",
                          'blue',
                          'on_white',
                          attrs=['underline'])
            media_entries = media_file_cleanings(unclean_lines,
                                                 permit_flexible_category=True)
            cfp.close()

            return special_f, media_entries

        # if not special, is media list
        country_f = os.path.join('verified_media', the_user_input.lower())
        if not os.path.isfile(country_f):
            print colored(
                "Invalid country! not found %s in directory 'verified_media/' "
                % proposed_country, 'red')
            print "Available countries are:"
            for existing_c in os.listdir('verified_media'):
                if existing_c in ['README.md', 'test']:
                    continue
                print "\t", existing_c
            print colored(
                "You can propose your own country media list following these instructions:",
                'blue', 'on_white')
            print colored(
                "https://github.com/vecna/trackmap/blob/master/unverified_media_list/README.md",
                'blue', 'on_white')
            quit(-1)

        cfp = file(country_f, 'r')
        # reading media list, cleaning media list and copy media list
        unclean_lines = cfp.readlines()

        print colored(" ࿓  Importing media list from %s:" %
                      the_user_input.lower(),
                      'blue',
                      'on_white',
                      attrs=['underline'])
        media_entries = media_file_cleanings(unclean_lines)
        cfp.close()

        return country_f, media_entries

예제 #2

0

파일 보기

파일: perform_analysis.py 프로젝트: research24/trackmap

def verify_media_country(the_user_input, special):
    if special:
        special_f = os.path.join("special_media", the_user_input)
        if not os.path.isfile(special_f):
            print colored("Invaild special URL source, check in special_media ", "red")
            quit(-1)

        cfp = file(special_f, "r")
        unclean_lines = cfp.readlines()

        print colored(" ࿓  Importing special media list:", "blue", "on_white", attrs=["underline"])
        media_entries = media_file_cleanings(unclean_lines, permit_flexible_category=True)
        cfp.close()

        return special_f, media_entries

    # if not special, is media list
    country_name = the_user_input.lower()
    country_f = os.path.join("verified_media", country_name)
    if not os.path.isfile(country_f):
        print colored("Invalid country! not found %s in directory 'verified_media/' " % country_name, "red")
        print "Available countries are:"
        for existing_c in os.listdir("verified_media"):
            if existing_c in ["README.md", "test"]:
                continue
            print "\t", existing_c
        print colored("You can propose your own country media list following these instructions:", "blue", "on_white")
        print colored(
            "https://github.com/vecna/trackmap/blob/master/unverified_media_list/README.md", "blue", "on_white"
        )
        quit(-1)

    cfp = file(country_f, "r")
    # reading media list, cleaning media list and copy media list
    unclean_lines = cfp.readlines()

    print colored(" ࿓  Importing media list from %s:" % the_user_input.lower(), "blue", "on_white", attrs=["underline"])
    media_entries = media_file_cleanings(unclean_lines)
    cfp.close()

    return country_f, media_entries

예제 #3

0

파일 보기

파일: perform_analysis.py 프로젝트: Oreonax/trackmap

    def verify_media_country(the_user_input, special):
        # this function left the media file open forever. :(
        if special:
            special_f = os.path.join('special_media', the_user_input)
            if not os.path.isfile(special_f):
                print colored("Invaild special URL source, check in special_media ", 'red')
                quit(-1)

            cfp = file(special_f, 'r')
            unclean_lines = cfp.readlines()

            print colored(" ࿓  Importing special media list:", 'blue', 'on_white', attrs=['underline'])
            media_entries = media_file_cleanings(unclean_lines, permit_flexible_category=True)
            cfp.close()

            return special_f, media_entries

        # if not special, is media list
        country_f = os.path.join('verified_media', the_user_input.lower())
        if not os.path.isfile(country_f):
            print colored("Invalid country! not found %s in directory 'verified_media/' " % proposed_country, 'red')
            print "Available countries are:"
            for existing_c in os.listdir('verified_media'):
                if existing_c in ['README.md', 'test']:
                    continue
                print "\t", existing_c
            print colored("You can propose your own country media list following these instructions:", 'blue', 'on_white')
            print colored("https://github.com/vecna/trackmap/blob/master/unverified_media_list/README.md", 'blue', 'on_white')
            quit(-1)

        cfp = file(country_f, 'r')
        # reading media list, cleaning media list and copy media list
        unclean_lines = cfp.readlines()

        print colored(" ࿓  Importing media list from %s:" % the_user_input.lower(), 'blue', 'on_white', attrs=['underline'])
        media_entries = media_file_cleanings(unclean_lines)
        cfp.close()

        return country_f, media_entries

예제 #4

0

파일 보기

파일: perform_analysis.py 프로젝트: houndbee/trackmap

def main():

    parser = OptionParser()

    parser.add_option("-c", "--country-name", type="string",
                      help="the country from which you want run the test", dest="medialist")
    parser.add_option("-o", "--output-dir", type="string", default=None,
                      help="directory to store results", dest="user_outputdir")
    parser.add_option("-l", "--local-phantom", action="store_true",
                      help="use local phantomjs instead of the downloaded one", dest="lp")
    parser.add_option("-d", "--disable-sending", action="store_true",
                      help="disable the result sending at the end of the test", dest="disable_send")
    parser.add_option("-i", "--instable-internet", action="store_true",
                      help="If your internet is instable, please enable this option", dest="shitty_internet")
    parser.add_option("-s", "--send", type="string", dest="targz_output",
                      help="do not perform test, submit a previously collected result.")
    parser.add_option("-t", "--twitter-handle", type="string", dest="twit",
                      help="put your twitter handler, you'll be mentioned when test is imported.")
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      help="print version, spoiler: %d" % ANALYSIS_VERSION)
    parser.add_option("-T", "--Tor", action="store_true", dest="hiddensubmit",
                      help="submit via hidden service (require Tor running)")
    parser.add_option("-k", "--keep", action="store_true", dest="keep",
                      help="don't remove the results-[country].tar.gz ")

    (args, _) = parser.parse_args()

    if args.version:
        print "analysis format version:", ANALYSIS_VERSION
        quit(0)

    if args.targz_output:
        if args.disable_send:
            print colored("You can't use -s (--send) and -d (--disable-sending) options together")
            quit(-1)

        if not os.path.isfile(args.targz_output):
            print colored("Invalid file: %s" % args.targz_output)
            quit(-1)

        print colored(" ࿓  Sending previous results...", 'blue', 'on_white', attrs=['underline'])
        if args.hiddensubmit:
            quit(send_results(args.targz_output, hiddenservice_tuple, tor_proxy=True))
        else:
            quit(send_results(args.targz_output, server_tuple, tor_proxy=False))


    try:
        local_phantom_v = get_local_phantom_v()
    except Exception as xxx:
        print xxx
        local_phantom_v = None

    if not args.medialist:
        print colored("Usage: %s -c $YOUR_COUNTRY_NAME" % sys.argv[0], "red", 'on_white')
        print parser.format_help()

        if os.path.islink('phantom-1.9.8'):
            print colored("found phantom-1.9.8 as link, good.", "green", "on_white")
        elif not local_phantom_v:
            print colored("phantomjs missing as link and missing in the system!", "red", "on_white")
            print colored("Please refer to the RADME or asks support to us", 'red', 'on_white')
            print colored("The script can't work in this status!", red)
        else:
            print colored("You have to use the option -l, and your installation is quite uncommon", red)
        print
        print "Look in the verified_media/ for a list of countries."
        print "TrackMap collection tool version: %d" % ANALYSIS_VERSION
        quit(-1)

    # check if the user is running phantom as installed on the system (also vagrant make this)
    # of if is using
    if args.lp and local_phantom_v:
        print colored("You're using your local installed phantomjs. A version >= than 1.9.0 is needed.", 'blue', 'on_white')
        print colored("I'm not going to compare the string. Be aware: this is your version:", 'red')
        print colored(local_phantom_v, 'blue', 'on_white')
        print "If is wrong, just press ^c and use the proper README instruction, or asks support to us"
    elif args.lp:
        print colored("phantomjs missing as link and missing in the system!", "red", "on_white")
        print colored("Please refer to the README or asks support to us", 'red', 'on_white')
        print colored("The script can't work in this status!", red)
        quit(-1)
    elif not os.path.islink('phantom-1.9.8'):
        print colored("Missing phantom-1.9.8. A symbolic link named phantom-1.9.8 was expected, but not found. Please consult README.md and make sure you've followed the installation procedure exactly.", 'red', 'on_white')
        quit(-1)


    if args.hiddensubmit:
        try:
            import socks
        except ImportError:
            print "You are missing 'PySocks' module, needed to proxy over Tor"

        tor_test = ("127.0.0.1", 9050)
        c = socket.socket()
        try:
            c.connect( tor_test )
            c.close()
        except Exception as xxx:
            print colored("Unable to connect to %s, Tor is needed to send results" % str(tor_test), "red")
            print colored(xxx, "red")
            print colored("You can disable result sending with the option -d", "yellow")
            quit(-1)
        del c

    # country check
    proposed_country = args.medialist
    country_f = os.path.join('verified_media', proposed_country.lower())
    if not os.path.isfile(country_f):
        print colored("Invalid country! not found %s in directory 'verified_media/' " % proposed_country, 'red')
        print "Available countries are:"
        for existing_c in os.listdir('verified_media'):
            if existing_c in ['README.md', 'test']:
                continue
            print "\t", existing_c
        print colored("You can propose your own country media list following these instructions:", 'blue', 'on_white')
        print colored("https://github.com/vecna/trackmap/blob/master/unverified_media_list/README.md", 'blue', 'on_white')
        quit(-1)

    # check if the output directory is not the default and/or if need to be created
    if args.user_outputdir:
        OUTPUTDIR = args.user_outputdir
    else:
        OUTPUTDIR = 'output/'

    if not os.path.isdir(OUTPUTDIR):
        try:
            os.mkdir(OUTPUTDIR)
        except OSError as error:
            print "unable to create %s: %s" % (OUTPUTDIR, error)


    if args.twit is None:
        print colored("You can specify your Twitter handle with -t and get mentioned by @trackography_",
                      'blue', 'on_yellow' )

    # ask free information to the script runner
    info_f = os.path.join(OUTPUTDIR, 'information')
    information = {
        'contact' : args.twit,
        'version' : ANALYSIS_VERSION,
        'city' : None,
        'ISP' : None,
        'name' : None,
    }
    with file(info_f, 'w+') as f:
        json.dump(information, f)

    # writing in a file which country you've selected!
    with file(os.path.join(OUTPUTDIR, 'country'), 'w+') as f:
        f.write(proposed_country.lower())

    # reading media list, cleaning media list and copy media list
    cfp = file(country_f, 'r')
    unclean_lines = cfp.readlines()

    # reconding an unique number is always useful, also if I've not yet in mind an usage right now.
    with file( os.path.join(OUTPUTDIR, "unique_id"), "w+") as f:
        f.write("%d%d%d" % (random.randint(0, 0xffff), random.randint(0, 0xffff), random.randint(0, 0xffff)) )

    print colored(" ࿓  Importing media list:", 'blue', 'on_white', attrs=['underline'])
    media_entries = media_file_cleanings(unclean_lines)
    cfp.close()

    with file(os.path.join(OUTPUTDIR, 'used_media.json'), 'w+') as f:
        json.dump(media_entries, f)

    print colored(" ࿓  Checking your network source.", 'blue', 'on_white', attrs=['underline'])
    get_client_info(os.path.join(OUTPUTDIR, 'first.json'))

    # Init of class method/vars
    PhantomCrawl.media_amount = len(media_entries.keys())
    PhantomCrawl.status_file = os.path.join(OUTPUTDIR, 'phantom.results.json')
    PhantomCrawl.load_status_disk()

    print colored(" ࿓  Starting media crawling (%d)" % 
            PhantomCrawl.media_amount, 
            'blue', 'on_white', attrs=['underline'])

    # here start iteration over the media!
    skipped = 0
    for cleanurl, media_kind in media_entries.iteritems():

        if PhantomCrawl.status.has_key(cleanurl) and PhantomCrawl.status[cleanurl]['status']:
            skipped += 1
            PhantomCrawl.media_done += 1
            continue

        urldir = os.path.join(OUTPUTDIR, cleanurl)

        if skipped:
            print colored("skipped %d media from interrupted test" % skipped, 'yellow')
            skipped = 0

        if os.path.isdir(urldir):
            # being here means that is empty or incomplete
            shutil.rmtree(urldir)

        os.mkdir(urldir)

        PhantomCrawl(args.lp, cleanurl, urldir, media_kind, OUTPUTDIR).start()
        # XXX I can think to a return value here ?


    previous_running_test = 0
    while PhantomCrawl.media_running:

        if previous_running_test == PhantomCrawl.media_running:

            I_want_thread_to_zero(70)

            print colored("Media completed %d over %d: phase complete!" %
                          (PhantomCrawl.media_amount, PhantomCrawl.media_done),
                          'magenta', 'on_yellow' )
            break

        previous_running_test = PhantomCrawl.media_running

        print colored("Running %d, completed %d (on %d): sleeping 25s." % \
              (PhantomCrawl.media_running, PhantomCrawl.media_done,
               PhantomCrawl.media_amount), 'green', 'on_white')
        time.sleep(25)


    # finally, enforce a complete sync in the disk. is probably already happen, but for safety:
    PhantomCrawl.sync_status_disk(mandatory=True)

    # take every directory in 'output/', get the included URL and dump in a dict
    included_url_dict = sortify(OUTPUTDIR)
    assert included_url_dict, "No url included after phantom scraping and collection !?"
    with file(os.path.join(OUTPUTDIR, 'domain.infos'), 'w+') as f:
        json.dump(included_url_dict, f)

    # RESOLUTION multi-thread HERE start
    DNSresolve.host_amount = len(included_url_dict.keys())
    DNSresolve.status_file = os.path.join(OUTPUTDIR, 'resolution.status.json')
    DNSresolve.resolution_file = os.path.join(OUTPUTDIR, 'resolution.dns')
    DNSresolve.errors_file = os.path.join(OUTPUTDIR, 'resolution.errors.json')
    DNSresolve.load_status_disk()

    # generate DNS resolution map. for every host resolve an IP, for every IP resolve again DNS
    print colored(" ࿓  DNS resolution of %d domains..." % len(included_url_dict.keys()),
                  'blue', 'on_white', attrs=['underline'])

    for domain in included_url_dict.keys():

        if DNSresolve.status.has_key(domain) and DNSresolve.status[domain]:
            DNSresolve.host_done += 1
            continue

        DNSresolve(domain, args.shitty_internet).start()

    I_want_thread_to_zero(8)

    print colored("\nResolved %d unique IPv4 from %d unique domain (Errors %d)" %
                  (len(DNSresolve.ip_map.keys()), len(included_url_dict.keys()),
                   DNSresolve.resolve_errors
                  ), 'green')
    DNSresolve.save_status(mandatory=True)

    if not len(DNSresolve.ip_map.keys()):
        print colored("It appears that you can't access the internet. Please fix that and restart the test.", 'red')
        quit(-1)


    ### -----------------------------------------------------`###
    ### Reversing multithread start HERE                      ###

    DNSreverse.ip_amount = len(DNSresolve.ip_map.keys())
    DNSreverse.status_file = os.path.join(OUTPUTDIR, 'reverse.status.json')
    DNSreverse.reverse_file = os.path.join(OUTPUTDIR, 'reverse.dns')
    DNSreverse.errors_file = os.path.join(OUTPUTDIR, 'reverse.errors.json')
    DNSreverse.load_status_disk()

    print colored(" ࿓  DNS reverse of %d domains..." % DNSreverse.ip_amount,
                  'blue', 'on_white', attrs=['underline'])

    for ip in DNSresolve.ip_map.keys():

        if DNSreverse.status.has_key(ip) and DNSreverse.status[ip]:
            DNSreverse.ip_done += 1
            continue

        DNSreverse(ip, args.shitty_internet).start()

    I_want_thread_to_zero(12)

    print colored("\nReversed %d unique FQDN from %d IPaddrs (Errors %d)" %
                  ( len(DNSreverse.fqdn_map.keys()), 
                    len(DNSresolve.ip_map.keys()), DNSreverse.reverse_errors),
                   'green')
    DNSreverse.save_status(mandatory=True)


    # ------------------------------------------------------------------------
    # traceroutes contains all the output of traceroute in JSON format, 
    # for logs. this output is not in the media directory, because some 
    # host (think to fbcdn or google) are included multiple times.
    # ------------------------------------------------------------------------
    verbotracelogs = os.path.join(OUTPUTDIR, '_verbotracelogs')
    if not os.path.isdir(verbotracelogs):
        os.mkdir(verbotracelogs)

    # saving again information about network location
    get_client_info(os.path.join(OUTPUTDIR, 'second.json'))

    # Traceroute is not yet multithread

    # starting traceroute to all the collected IP
    print colored(" ࿓  Running traceroute to %d IP address (from %d hosts)" % (
        len(DNSresolve.ip_map.keys()), len(included_url_dict.keys())), 'blue', 'on_white', attrs=['underline'])

    Multitrace.amount = len(DNSresolve.ip_map.keys())
    for ip_addr, hostlist in DNSresolve.ip_map.iteritems():

        assert ip_addr.count('.') == 3, "Invalid IPv4 format %s" % ip_addr

        if Traceroute.is_already_trace(ip_addr, OUTPUTDIR):
            Multitrace.done += 1
            continue

        Multitrace(OUTPUTDIR, ip_addr, hostlist, args.shitty_internet).start()

    I_want_thread_to_zero(80)

    ## ----------- END TRACEROUTE -------------

    # saving again*again information about network location
    get_client_info(os.path.join(OUTPUTDIR, 'third.json'))

    output_name = 'results-%s.tar.gz' % proposed_country.lower()
    print colored(" ࿓  Analysis done! compressing the output in %s" % output_name, "blue", 'on_white', attrs=['underline'])

    if os.path.isfile(output_name):
        os.unlink(output_name)

    tar = subprocess.Popen(['tar', '-z', '-c', '-v', '-f', output_name, OUTPUTDIR],
                           stdout=subprocess.PIPE)

    counter_line = 0
    while True:
        line = tar.stdout.readline()
        counter_line += 1
        if not line:
            break

    if args.disable_send:
        print colored("%d files added to %s" % (counter_line, output_name), "green")
        print colored("Sending disable, test complete.", "yellow"),
        print colored("亷 亸", 'blue', 'on_white')
        os.kill(os.getpid(), 15)
        quit(0)

    print colored("%d file added to %s, Starting to submit results" %
                  (counter_line, output_name), "green")

    if not args.keep:
        print "..removing of", OUTPUTDIR
        shutil.rmtree(OUTPUTDIR)

    print colored("If submitting results fails please run:", "red")
    print colored("./perform_analysis.py -s %s" % output_name, "yellow")

    if args.hiddensubmit:
        ret = send_results(output_name, hiddenservice_tuple, tor_proxy=True)
    else:
        ret = send_results(output_name, server_tuple, tor_proxy=False)
    print ""
    os.kill(os.getpid(), 15)

예제 #5

0

파일 보기

파일: perform_analysis.py 프로젝트: hellais/trackmap

def get_alexa_list():
    """Load the Alexa 'world top 100 per country' special media list.

    Reads every line of 'special_media/alexa/world_top_100_per_country',
    hands them to media_file_cleanings() and returns the tuple
    ('special/alexa100', cleaned_entries).
    """
    alexa_path = 'special_media/alexa/world_top_100_per_country'
    with file(alexa_path) as alexa_fp:
        # the cleaning runs while the file is still open
        cleaned_entries = media_file_cleanings(alexa_fp.readlines())
    return 'special/alexa100', cleaned_entries

예제 #6

0

파일 보기

def get_alexa_list():
    """Return the label and cleaned entries of the Alexa top-100 list.

    The lines of 'special_media/alexa/world_top_100_per_country' are read in
    one go and passed through media_file_cleanings(); the result is returned
    together with the fixed label 'special/alexa100'.
    """
    source_label = 'special/alexa100'
    list_location = 'special_media/alexa/world_top_100_per_country'
    with file(list_location) as handle:
        raw_lines = handle.readlines()
        # the cleaning runs while the file is still open
        entries = media_file_cleanings(raw_lines)
    return source_label, entries