Пример #1
0
def sample_links_essences():
    """Distill a sample of links and write one essence per line to a new file.

    Reads the links listed in ``links_to_sample_file_name``, keeps the first
    ``num_links_in_sample`` of them and distills each one, retrying up to
    three times and restarting the browser after every failed attempt.  The
    results are written newline-separated, without a trailing newline, to a
    time-stamped ``links_sample_*`` file.

    A link whose three attempts all fail is recorded as ``None``.  This keeps
    exactly one output line per sampled link instead of silently re-using the
    previous link's essence (or raising ``NameError`` when the very first
    link fails).
    """
    time_stamp = str(datetime.now())
    sample_file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/consistency/links_sample_' + time_stamp

    keywords_dict, _english_dict, _links_dict = dicts.load_dictionaries()
    distillery = Distillery(essence_len=E, keywords_dict=keywords_dict)

    all_links = read_str_list(links_to_sample_file_name)
    links = all_links[:num_links_in_sample]
    essences = []
    for i, link in enumerate(links):
        # Format inside the call: the old ``print(...) % (...)`` form only
        # worked because ``print`` is a statement in Python 2.
        print('starting link: %d , %s' % (i, link))
        essence = None  # placeholder kept if every attempt fails
        for attempt in range(3):
            try:
                essence, _uncut_essence = distillery.distill(
                    link, keywords_dict)
                break
            except Exception:
                # ``Exception`` (not a bare except) so Ctrl-C / SystemExit
                # still stop the run instead of triggering a browser restart.
                print('Distillery failed for the %s time. restarting browser...' % attempt)
                distillery.restart_browser()
        essences.append(essence)

    with open(sample_file_name, 'w') as f:
        # Joining handles the empty sample too (the old ``essences[-1]``
        # raised IndexError) and never emits a trailing newline.
        f.write('\n'.join(['%s' % essence for essence in essences]))
def sample_links_essences():
    """Distill a sample of links and write one essence per line to a new file.

    Reads the links listed in ``links_to_sample_file_name``, keeps the first
    ``num_links_in_sample`` of them and distills each one, retrying up to
    three times and restarting the browser after every failed attempt.  The
    results are written newline-separated, without a trailing newline, to a
    time-stamped ``links_sample_*`` file.

    A link whose three attempts all fail is recorded as ``None``.  This keeps
    exactly one output line per sampled link instead of silently re-using the
    previous link's essence (or raising ``NameError`` when the very first
    link fails).
    """
    time_stamp = str(datetime.now())
    sample_file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/consistency/links_sample_' + time_stamp

    keywords_dict, _english_dict, _links_dict = dicts.load_dictionaries()
    distillery = Distillery(essence_len=E, keywords_dict=keywords_dict)

    all_links = read_str_list(links_to_sample_file_name)
    links = all_links[:num_links_in_sample]
    essences = []
    for i, link in enumerate(links):
        # Format inside the call: the old ``print (...)%(...)`` form only
        # worked because ``print`` is a statement in Python 2.
        print('starting link: %d , %s' % (i, link))
        essence = None  # placeholder kept if every attempt fails
        for attempt in range(3):
            try:
                essence, _uncut_essence = distillery.distill(link, keywords_dict)
                break
            except Exception:
                # ``Exception`` (not a bare except) so Ctrl-C / SystemExit
                # still stop the run instead of triggering a browser restart.
                print('Distillery failed for the %s time. restarting browser...' % attempt)
                distillery.restart_browser()
        essences.append(essence)

    with open(sample_file_name, 'w') as f:
        # Joining handles the empty sample too (the old ``essences[-1]``
        # raised IndexError) and never emits a trailing newline.
        f.write('\n'.join(['%s' % essence for essence in essences]))
def conceal(tweet_file, config, endword_index=False):
    dicts = dictionaries.load_dictionaries(config)
    print 'keywords (x) = ', config.x
    print 'Essence len = ', config.essence_len
    distillery = Distillery(config.essence_len, dicts.keywords)
    search_engine = Search()
    raw_data_words = open(tweets_path + tweet_file).read().split()
    data_words = [
        keyword for word in raw_data_words
        for keyword in dicts.english["".join(
            c for c in word.lower()
            if c not in ('!', '.', ':', ',', '?', '"', '-'))]
    ]

    if endword_index:
        words = [dicts.keywords[endword_index]] * config.w
    else:
        words = [dicts.keywords[config.x - 1]] * config.w

    collected_words = [(words, '', '')]
    stats = WordsStats(config, tweet_file, collected_words)

    try:
        while True:

            # Avoid inserting 3rd link word in data
            # iteration_type = len(collected_words) % 10
            iteration_type = 1

            if iteration_type == 0:
                insert_link_word_in_d = True
                choose_new_link_word = False
            elif iteration_type == 1:
                first_link_word = 'This string is ignored'
                insert_link_word_in_d = False
                choose_new_link_word = True
            else:
                insert_link_word_in_d = False
                choose_new_link_word = False

            words, link, first_link_word = conceal_step(
                data_words, words, first_link_word, insert_link_word_in_d,
                choose_new_link_word, search_engine, distillery, dicts, stats)
            collected_words.append((
                words,
                link,
                first_link_word,
            ))
            if not data_words:
                break

    except Exception:
        print(traceback.format_exc())
        t, v, tb = sys.exc_info()
        # distillery.browser.close()
        raise t, v, tb

    print "collected words are: %s" % collected_words
    return collected_words
Пример #4
0
def create_search_keywords():
    """Write ``num_combinations`` shuffled 3-keyword queries to a file.

    Takes the keywords stored under indices ``0 .. X-1`` of the keywords
    dictionary, builds every 3-element combination of them, shuffles the
    combinations and writes the first ``num_combinations`` to
    ``search_keywords_file_name``.  Each query is its three keywords joined
    by single spaces; queries are newline-separated with no trailing newline.
    """
    keywords_dict, _english_dict, _links_dict = dicts.load_dictionaries()
    first_keywords = [keywords_dict[index] for index in range(X)]
    triples = list(itertools.combinations(first_keywords, 3))
    random.shuffle(triples)
    with open(search_keywords_file_name, 'w') as out:
        for index in range(num_combinations):
            # The separator precedes every query except the first, which
            # yields the same bytes as appending it after all but the last.
            if index != 0:
                out.write('\n')
            out.write(' '.join(triples[index]))
def create_search_keywords():
    """Write ``num_combinations`` shuffled 3-keyword queries to a file.

    Takes the keywords stored under indices ``0 .. X-1`` of the keywords
    dictionary, builds every 3-element combination of them, shuffles the
    combinations and writes the first ``num_combinations`` to
    ``search_keywords_file_name``.  Each query is its three keywords joined
    by single spaces; queries are newline-separated with no trailing newline.
    """
    keywords_dict, _english_dict, _links_dict = dicts.load_dictionaries()
    first_keywords = [keywords_dict[index] for index in range(X)]
    triples = list(itertools.combinations(first_keywords, 3))
    random.shuffle(triples)
    with open(search_keywords_file_name, 'w') as out:
        for index in range(num_combinations):
            # The separator precedes every query except the first, which
            # yields the same bytes as appending it after all but the last.
            if index != 0:
                out.write('\n')
            out.write(' '.join(triples[index]))
def conceal(tweet_file, config, endword_index=False):
    dicts = dictionaries.load_dictionaries(config)
    print 'keywords (x) = ', config.x
    print 'Essence len = ', config.essence_len
    distillery = Distillery(config.essence_len, dicts.keywords)
    search_engine = Search()
    raw_data_words = open(tweets_path + tweet_file).read().split()
    data_words = [keyword for word in raw_data_words for keyword in
                  dicts.english["".join(c for c in word.lower() if c not in ('!', '.', ':', ',', '?', '"', '-'))]]

    if endword_index:
        words = [dicts.keywords[endword_index]] * config.w
    else:
        words = [dicts.keywords[config.x - 1]] * config.w

    collected_words = [(words, '', '')]
    stats = WordsStats(config, tweet_file, collected_words)

    try:
        while True:

            # Avoid inserting 3rd link word in data
            # iteration_type = len(collected_words) % 10
            iteration_type = 1

            if iteration_type == 0:
                insert_link_word_in_d = True
                choose_new_link_word = False
            elif iteration_type == 1:
                first_link_word = 'This string is ignored'
                insert_link_word_in_d = False
                choose_new_link_word = True
            else:
                insert_link_word_in_d = False
                choose_new_link_word = False


            words, link, first_link_word = conceal_step(data_words, words, first_link_word, insert_link_word_in_d,
                                                        choose_new_link_word, search_engine, distillery, dicts, stats)
            collected_words.append((words, link, first_link_word,))
            if not data_words:
                break

    except Exception:
        print(traceback.format_exc())
        t, v, tb = sys.exc_info()
        # distillery.browser.close()
        raise t, v, tb

    print "collected words are: %s" % collected_words
    return collected_words
def in_queue_to_out_queu(in_queue, out_queue):
    config = dictionaries.Config(1, 2, 2, 89, 10, 200)
    dicts = dictionaries.load_dictionaries(config)
    distillery = Distillery(config.essence_len, dicts.keywords)

    while True:
        links = in_queue.get()
        if links is None:
            out_queue.put((None, None))
            break
        for link in links:
            try:
                essence, uncut = distillery.distill(link)
                essence = frozenset(essence)
                out_queue.put((link, essence))
            except:
                print 'distillery failed! Skipping link: ', link
                distillery.restart_browser()
                continue
Пример #8
0
def in_queue_to_out_queu(in_queue, out_queue):
    config = dictionaries.Config(1, 2, 2, 89, 10, 200)
    dicts = dictionaries.load_dictionaries(config)
    distillery = Distillery(config.essence_len, dicts.keywords)

    while True:
        links = in_queue.get()
        if links is None:
            out_queue.put((None, None))
            break
        for link in links:
            try:
                essence, uncut = distillery.distill(link)
                essence = frozenset(essence)
                out_queue.put((link, essence))
            except:
                print 'distillery failed! Skipping link: ', link
                distillery.restart_browser()
                continue
Пример #9
0
def create_links_to_sample():
    """Save the first ``num_links_in_sample`` string values of the links dict.

    Loads the dictionaries, collects every value of the links dictionary that
    is a ``str`` instance (other value types are ignored), truncates the
    result to ``num_links_in_sample`` entries and writes it to
    ``links_to_sample_file_name`` via ``write_str_list``.
    """
    _keywords_dict, _english_dict, links_dict = dicts.load_dictionaries()
    string_links = []
    for value in links_dict.values():
        if isinstance(value, str):
            string_links.append(value)
    sampled_links = string_links[:num_links_in_sample]
    write_str_list(links_to_sample_file_name, sampled_links)
def create_links_to_sample():
    """Save the first ``num_links_in_sample`` string values of the links dict.

    Loads the dictionaries, collects every value of the links dictionary that
    is a ``str`` instance (other value types are ignored), truncates the
    result to ``num_links_in_sample`` entries and writes it to
    ``links_to_sample_file_name`` via ``write_str_list``.
    """
    _keywords_dict, _english_dict, links_dict = dicts.load_dictionaries()
    string_links = []
    for value in links_dict.values():
        if isinstance(value, str):
            string_links.append(value)
    sampled_links = string_links[:num_links_in_sample]
    write_str_list(links_to_sample_file_name, sampled_links)