Example #1
def save_doc2vec_model(doc2vec_model, file_name, directory_path=None):
    """persist trained doc2vec model
    :param doc2vec_model: previous trained doc2vec model
    :param file_name: file name for persisting
    :param directory_path: where to persist doc2vec model

    """

    file_name = file_name + "-" + time.strftime("%d-%b-%Y-%X")

    if directory_path is not None:
        doc2vec_model_path = directory_path

        doc2vec_dir = os.path.join(doc2vec_model_path, 'doc2vec')

        if not os.path.exists(doc2vec_dir):
            try:
                os.makedirs(doc2vec_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        logger.info("save new doc2vec model at: " + doc2vec_dir)
        doc2vec_model.save(os.path.join(doc2vec_dir, file_name))

    else:
        doc2vec_model_path = settings.DATA_DIR + 'doc2vec/models/'
        logger.info("save new doc2vec model at: " + doc2vec_model_path)
        doc2vec_model.save(doc2vec_model_path + file_name)
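
# Note: "%X" in the timestamp above expands to a locale-dependent time such as
# "22:31:27"; the colons make the resulting file name invalid on some
# filesystems (e.g. Windows). A hedged alternative, if portability matters:
#
#     file_name = file_name + "-" + time.strftime("%d-%b-%Y-%H-%M-%S")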
Example #2
def get_doc_vectors_for_new_documents(doc2vec_model, documents_folder_name, documents_file_path=None):
    """retrieve document vectors for new documents based on previous trained doc2vec model
    :param doc2vec_model: trained doc2vec model
    :param documents_folder_name: name of documents folder
    :param documents_file_path: file path to the folder
    :return: english and german document vectors

    """

    logger.info('get doc vector values of unseen documents')
    if documents_file_path is None:
        documents_file_path = settings.DATA_DIR + 'crawling_data/' + documents_folder_name + '/'

    else:
        documents_file_path = documents_file_path + documents_folder_name + '/'

    logger.info('get documents from following path: ' + documents_file_path)

    preprocessed_documents_english, preprocessed_documents_german = preprocess_new_documents(documents_file_path)

    doc_vectors_english = doc2vec_model.infer_vector(preprocessed_documents_english, alpha=0.025, min_alpha=0.01,
                                                     steps=1)
    doc_vectors_german = doc2vec_model.infer_vector(preprocessed_documents_german, alpha=0.025, min_alpha=0.01, steps=1)

    return doc_vectors_english, doc_vectors_german
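
# A minimal usage sketch, assuming doc2vec_model is an already loaded model;
# the documents folder name 'example-hostname' is a hypothetical placeholder:
#
#     vectors_english, vectors_german = get_doc_vectors_for_new_documents(
#         doc2vec_model, 'example-hostname')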
Example #3
def load_existing_model(doc2vec_model_file_path=None, model_file_name=None):
    """load a previously persisted doc2vec model
    :param doc2vec_model_file_path: full file path to a persisted model
    :param model_file_name: model file name inside the default project path
    :return: loaded doc2vec model

    """
    if doc2vec_model_file_path is None and model_file_name is not None:
        logger.info('Loading doc2vec models directly from project structure...')
        doc2vec_model_file_path = settings.DATA_DIR + 'doc2vec/models/' + model_file_name

    logger.info('load model from the following path: ' + str(doc2vec_model_file_path))
    loaded_model = gensim.models.Doc2Vec.load(doc2vec_model_file_path)
    return loaded_model
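
# A hedged note: if neither argument is given, doc2vec_model_file_path stays
# None and gensim.models.Doc2Vec.load() will fail; a guard such as the
# following (an assumption, not part of the original) keeps the error explicit:
#
#     if doc2vec_model_file_path is None:
#         raise ValueError('either doc2vec_model_file_path or model_file_name is required')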
Example #4
def get_doc_vectors_matrix(doc2vec_model):
    """retrieve document vectors value of given doc2vec model
    :param doc2vec_model:
    :return: document vectors matrix

    """
    logger.info('get document vectors matrix of doc2vec model')
    docvec_vectors = doc2vec_model.docvecs
    return docvec_vectors
Example #5
def get_word_vectors_matrix(doc2vec_model):
    """retrieve word vectors value of given doc2vec model
    :param doc2vec_model:
    :return: word vectors matrix

    """
    logger.info('get word vectors matrix of doc2vec model')
    word_vectors = doc2vec_model.wv.syn0
    return word_vectors
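
# Note: doc2vec_model.docvecs and doc2vec_model.wv.syn0 follow the pre-4.0
# gensim API used throughout this project. A sketch of the equivalent
# accessors under gensim >= 4.0, should the library be upgraded:
#
#     doc_vectors = doc2vec_model.dv.vectors    # formerly doc2vec_model.docvecs
#     word_vectors = doc2vec_model.wv.vectors   # formerly doc2vec_model.wv.syn0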
Example #6
def get_doc_similarities_by_new_vector(doc2vec_model, new_vector):
    """retrieve cosine similarities by new vector (word)
    :param doc2vec_model: trained doc2vec model
    :param new_vector: word to infer vector values
    :return: cosine similarities for given new vector (word)

    """
    logger.info('get document similarities by new vector')
    similarities = doc2vec_model.docvecs.most_similar([new_vector])
    return similarities
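
# A short usage sketch: infer a vector for new, preprocessed tokens and query
# the most similar documents (the token list is a hypothetical example):
#
#     tokens = ['hotel', 'booking', 'room']
#     inferred_vector = doc2vec_model.infer_vector(tokens)
#     similarities = get_doc_similarities_by_new_vector(doc2vec_model, inferred_vector)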
Example #7
def get_doc_similarities_by_document_name(doc2vec_model, document_name):
    """retrieve cosine similarities of document by document name
    :param doc2vec_model: trained doc2vec model
    :param document_name: name of document in doc2vec model
    :return: cosine similarities

    """
    logger.info('get document similarities')
    similarities = doc2vec_model.docvecs.most_similar(document_name)
    return similarities
Example #8
def crawl_given_hostnames(hostnames, directory_to_save_results):
    """helper class to crawl given urls
    :param hostnames: urls to crawl
    :param directory_to_save_results: where to persist the result

    """

    crawler = Crawler()
    logger.info('crawl following urls (hostnames): ' + str(hostnames))
    crawler.crawl_hostnames(hostnames, directory_to_save_results)
Example #9
def crawl_list_of_hostnames(urls_list_file_path, directory_to_save_results):
    """ helper class to crawl list of hostnames

    :param urls_list_file_path:
    :param directory_to_save_results:
    :return:
    """
    crawler = Crawler()
    logger.info('crawl following list of urls: ' + urls_list_file_path)
    hostnames = crawler.get_hostnames_from_csv_list(urls_list_file_path)
    crawler.crawl_hostnames(hostnames, directory_to_save_results)
Example #10
def preprocess_new_documents(documents_path):
    """documents which were not included in a previous trained doc2vec model, gets preprocessed for further processing
    :param documents_path: file path to the newly crawled documents
    :return: preprocessed and concatenated english and german documents

    """

    logger.info('start processing unseen documents')
    folders_in_directory = glob.glob(documents_path)

    if not folders_in_directory:
        raise IOError('no folders found at: ' + documents_path)

    else:

        preprocessed_and_concatenated_documents_english = []
        preprocessed_and_concatenated_documents_german = []
        english_documents_counter = 0
        german_documents_counter = 0

        for folder_name in folders_in_directory:
            logger.info('start getting files of folder ' + folder_name)
            pattern = os.path.join(folder_name, '*.txt')
            file_names = glob.glob(pattern)

            if not file_names:
                raise IOError('no .txt files found in folder: ' + folder_name)

            else:
                logger.info('start read in files')
                for file_name in file_names:
                    with open(file_name, 'r') as file:
                        document = file.read()

                    document = gensim.utils.simple_preprocess(document)
                    document_language = preprocessing.detect_language(document)
                    if document_language == 'english':
                        english_documents_counter += 1
                        preprocessed_document = preprocessing.preprocess_document(document, document_language)
                        preprocessed_and_concatenated_documents_english += preprocessed_document

                    elif document_language == 'german':
                        german_documents_counter += 1
                        preprocessed_document = preprocessing.preprocess_document(document, document_language)
                        preprocessed_and_concatenated_documents_german += preprocessed_document

        logger.info('Concatenated and preprocessed ' + str(english_documents_counter) + ' documents into one english document')
        logger.info('Concatenated and preprocessed ' + str(german_documents_counter) + ' documents into one german document')

        return preprocessed_and_concatenated_documents_english, preprocessed_and_concatenated_documents_german
Example #11
def create_new_doc2vec_model(documents_file_path=None, save_to_directory=None, single_language_support=False):
    """helper function to create a new doc2vec model
    :param documents_file_path: file path to the crawled and stored documents
    :param save_to_directory: where to save the doc2vec model
    :param single_language_support: if True, only one language per hostname is supported
    :return: english and german doc2vec models

    """

    if documents_file_path is not None:
        documents_file_path = documents_file_path + '*/'
        logger.info('document file has been set to: ' + str(documents_file_path))

    else:
        documents_file_path = settings.DATA_DIR + 'crawling_data/*/'
        logger.info('No documents file path has been given, default file path used: ' + str(documents_file_path))

    logger.info('Start creating new doc2vec model...')
    document_corpus_english, document_corpus_german = create_document_corpus_by_language(documents_file_path,
                                                                                         single_language_support)

    doc2vec_model_english = create_doc2vec_model(document_corpus_english)
    doc2vec_model_german = create_doc2vec_model(document_corpus_german)

    save_doc2vec_model(doc2vec_model_english, 'doc2vec-model-english', directory_path=save_to_directory)
    save_doc2vec_model(doc2vec_model_german, 'doc2vec-model-german', directory_path=save_to_directory)

    return doc2vec_model_english, doc2vec_model_german
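
# A minimal end-to-end sketch (both directory paths are hypothetical
# placeholders):
#
#     model_english, model_german = create_new_doc2vec_model(
#         documents_file_path='/data/crawling_data/',
#         save_to_directory='/data/doc2vec/',
#         single_language_support=True)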
Example #12
def create_doc2vec_model(document_corpus):
    """doc2vec model gets build for given document corpus
    :param document_corpus: previous build document corpus consists of multiple labeled documents
    :return: doc2vec model

    """

    # doc2vec hyperparameters -- inspired by: https://github.com/jhlau/doc2vec
    vector_size = 300
    window_size = 15
    min_count = 1
    sampling_threshold = 1e-5
    negative_size = 5
    train_epoch = 100
    dm = 0  # 0 = dbow; 1 = dmpv
    worker_count = 3  # number of parallel processes

    logger.info('start building Doc2Vec model')
    model = gensim.models.Doc2Vec(size=vector_size,
                                  window=window_size,
                                  min_count=min_count,
                                  sample=sampling_threshold,
                                  workers=worker_count,
                                  hs=0,
                                  dm=dm,
                                  negative=negative_size,
                                  dbow_words=1,
                                  dm_concat=1,
                                  iter=train_epoch)

    model.build_vocab(document_corpus)
    logger.info("model's vocubulary length: " + str(len(model.wv.vocab)))

    logger.info("start to train the model")
    model.train(document_corpus,
                total_examples=model.corpus_count,
                epochs=model.iter)

    return model
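
# The constructor arguments above follow the pre-4.0 gensim API. A hedged
# sketch of the same configuration under gensim >= 4.0, where several
# parameters were renamed:
#
#     model = gensim.models.Doc2Vec(vector_size=vector_size,  # formerly `size`
#                                   window=window_size,
#                                   min_count=min_count,
#                                   sample=sampling_threshold,
#                                   workers=worker_count,
#                                   hs=0,
#                                   dm=dm,
#                                   negative=negative_size,
#                                   dbow_words=1,
#                                   dm_concat=1,
#                                   epochs=train_epoch)       # formerly `iter`
#     model.train(document_corpus, total_examples=model.corpus_count, epochs=model.epochs)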
Example #13
def kmeans_clustering(doc2vec_model,
                      tsne_model,
                      model_language,
                      k=3,
                      new_hostnames=None,
                      save_to_directory=None):
    """Creates K-Means clustering for given tsne model
    :param doc2vec_model: data point labels (keys) gets inferred from doc2vec model
    :param tsne_model: tsne model to apply clustering
    :param model_language: language of doc2vec model, gets added to the file name
    :param k: value to control how many clusters (k) should be generated
    :param new_hostnames: hostnames which were not included in the doc2vec model while training (new data)
    :param save_to_directory: where to store the plot


    """
    logger.info("Start creating K-Means Clustering...")
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))

    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())

    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of Datapoints Labels = ' + str(len(data_point_labels)))

    assert (len(tsne_model) == len(data_point_labels))
    assert (k <= len(tsne_model))

    random_state = 0
    logger.info('K-Means Parameters: k = %s, random_state = %d' %
                (k, random_state))
    logger.info('Start training K-Means Model...')
    kmeans = KMeans(n_clusters=k, random_state=random_state).fit(tsne_model)
    logger.info('K-Means model successfully built...start with visualization')

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = tsne_model[:, 0].min() - 1, tsne_model[:, 0].max() + 1
    y_min, y_max = tsne_model[:, 1].min() - 1, tsne_model[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    logger.info('start creating color plot...')
    plt.figure(figsize=(16, 16))

    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(tsne_model[:, 0], tsne_model[:, 1], 'k.', markersize=2)

    # Annotate the data points
    for i, txt in zip(tsne_model, data_point_labels):
        plt.annotate(txt, (i[0], i[1]),
                     xytext=(0, -8),
                     textcoords="offset points",
                     va="center",
                     ha="left")

    logger.info('The centroids are plotted as white x...')
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)

    plt.title('K-Means clustering with K=%d over the t-sne reduced data' % k)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    if save_to_directory is None:
        file_path = settings.DATA_DIR + "experiments/clusterer/kmeans/"
    else:
        file_path = save_to_directory

    file_name = 'kmeans_cluster-' + model_language + '-' + time.strftime(
        "%d-%b-%Y-%X") + ".png"
    plt.savefig(file_path + file_name, facecolor="w", dpi=90)
    logger.info("saved " + file_name + "at " + file_path)

    plt.show()
Example #14
def agglomerative_clustering(doc2vec_model,
                             tsne_model,
                             numbers_of_clusters,
                             model_language,
                             new_hostnames=None,
                             save_to_directory=None):
    """applies agglomerative clustering algorithm to given tsne model
    :param doc2vec_model: infer documents labels (keys) from doc2vec model
    :param tsne_model: tsne model to apply algorithm to
    :param numbers_of_clusters: how many clusters should get build
    :param model_language: doc2vec model language, gets added to the plot file name
    :param new_hostnames: hostnames which were not included in the doc2vec model while training (new data)
    :param save_to_directory: where to store the plot
    Reference: http://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-py

    """

    logger.info("Start creating Agglomerative Cluster...")
    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())

    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of Datapoints Labels = ' + str(len(data_point_labels)))
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))

    assert (len(tsne_model) == len(data_point_labels))

    # calculate local connectivity
    knn_graph = kneighbors_graph(tsne_model, 30, include_self=False)

    # example: (5, 10, 15, 20, 25, 30)
    numbers_of_clusters = tuple(numbers_of_clusters)

    for connectivity in (None, knn_graph):
        for n_clusters in numbers_of_clusters:
            plt.figure(figsize=(40, 15))
            for index, linkage in enumerate(('average', 'complete', 'ward')):
                plt.subplot(1, 3, index + 1)
                model = AgglomerativeClustering(linkage=linkage,
                                                connectivity=connectivity,
                                                n_clusters=n_clusters)
                t0 = time.time()
                model.fit(tsne_model)
                elapsed_time = time.time() - t0
                plt.scatter(tsne_model[:, 0],
                            tsne_model[:, 1],
                            c=model.labels_,
                            cmap=plt.cm.spectral)

                # Annotate the data points
                for i, txt in zip(tsne_model, data_point_labels):
                    plt.annotate(txt, (i[0], i[1]),
                                 xytext=(0, -8),
                                 textcoords="offset points",
                                 va="center",
                                 ha="left")

                plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                          fontdict=dict(verticalalignment='top'))
                plt.axis('equal')
                plt.axis('off')

                plt.subplots_adjust(bottom=0,
                                    top=.89,
                                    wspace=0,
                                    left=0,
                                    right=1)
                plt.suptitle('n_cluster=%i, connectivity=%r' %
                             (n_clusters, connectivity is not None),
                             size=17)

            if save_to_directory is None:
                file_path = settings.DATA_DIR + "experiments/clusterer/agglomerative_clustering/"
            else:
                file_path = save_to_directory

            file_name = 'agglomerative_clustering-' + model_language + '-' + time.strftime(
                "%d-%b-%Y-%X") + ".png"
            plt.savefig(file_path + file_name, facecolor="w", dpi=90)
            logger.info("saved " + file_name + " at " + file_path)

    plt.show()
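
# A hypothetical invocation sketch (the doc2vec and t-SNE models are assumed
# to be loaded elsewhere; the cluster counts are placeholders):
#
#     agglomerative_clustering(doc2vec_model,
#                              tsne_model,
#                              numbers_of_clusters=[5, 10, 15],
#                              model_language='german')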
Example #15
def kmedoid_clustering(doc2vec_model, tsne_model, start_medoids, new_hostnames=None):
    """creates K-Medoid clustering for given tsne model
    :param doc2vec_model: doc2vec model to infer data point labels (keys)
    :param tsne_model: tsne model to apply clustering
    :param start_medoids: medoids which will be used as starting points
    :param new_hostnames: hostnames which were not included in the doc2vec model while training (new data)

    """

    logger.info("Start creating K-Medoid Cluster...")
    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())

    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of Datapoints Labels = ' + str(len(data_point_labels)))
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))

    assert (len(tsne_model) == len(data_point_labels))

    # Example: start_medoids = [0, 5, 10, 15, 20]
    logger.info('Number of Medoids = %s' % len(start_medoids))
    logger.info('Given Medoids = %s' % str(start_medoids))

    logger.info('Start creating K-Medoid Model...')
    tolerance = 0.2
    kmedoids_instance = kmedoids(tsne_model, start_medoids, tolerance)
    (ticks, result) = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", "\t\tExecution time: ", ticks, "\n")

    cluster_visualizer = ClusterVisualizer(1, data=tsne_model, labels=data_point_labels)
    cluster_visualizer.append_clusters(clusters, tsne_model, 0)

    cluster_visualizer.append_cluster(medoids, marker='*', markersize=12, color='red')
    cluster_visualizer.show(k=len(start_medoids), tolerance=tolerance)
Example #16
def main():
    # Some examples...
    '''

    # Create 2D t-SNE Model
    doc2vec_model = doc2vec.load_existing_model(model_file_name='doc2vec_single_language_full_model_german_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.get_doc_vectors_matrix(doc2vec_model)
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'single_language_full-model-doc2vec-model-german')


    
    # Example to create tsne model with new data
    
    logger.info('Start building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(model_file_name='standard-models/doc2vec_model_german_17_Feb_2018_02_14_04')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(doc2vec_model,
                                                                               new_documents=['upkbs.ch',
                                                                                              'curaneo.ch',
                                                                                              'bscyb.ch',
                                                                                              'scltigers.ch',
                                                                                              'graubuenden.ch'],
                                                                               model_language='german',
                                                                               documents_file_path='/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/')
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'full-doc2vec-model-new-data-german')
    logger.info('Finished building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    
    '''

    # Live-Demo pt. 1

    logger.info('LIVE-DEMO Start building tsne model with new data at: ' +
                time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name=
        'doc2vec_single_language_full_model_german_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(
        doc2vec_model,
        new_documents=['triaplus.ch', 'fcaarau.ch'],
        model_language='german',
        documents_file_path=
        '/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/'
    )
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'live-demo-german')
    logger.info('Finished building tsne model with new data at: ' +
                time.strftime("%d-%b-%Y-%X"))

    # Live-Demo pt.2

    logger.info('LIVE-DEMO Start building 2nd tsne model with new data at: ' +
                time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name=
        'doc2vec_single_language_full_model_english_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(
        doc2vec_model,
        new_documents=['hostelscentral.com'],
        model_language='english',
        documents_file_path=
        '/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/'
    )
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'live-demo-english')
    logger.info('Finished building tsne model with new data at: ' +
                time.strftime("%d-%b-%Y-%X"))
Example #17
def create_document_corpus_by_language(documents_path, single_language_support=False):
    """ reads in crawled documents and create a doc2vec document corpus
    previous crawled documents gets preprocessed and a doc2vec document corpus gets build separated by language (german
    and english)
    :param documents_path: file path of the crawled documents
    :param single_language_support: just support one language per hostname
    :return: german and english document corpus
    """

    logger.info('start creating document corpus by language')
    folders_in_directory = glob.glob(documents_path)

    if not folders_in_directory:
        raise IOError('no folders found at: ' + documents_path)

    else:

        preprocessed_documents_corpus_english = []
        preprocessed_documents_corpus_german = []

        for folder_name in folders_in_directory:
            logger.info('start getting files of folder ' + folder_name)
            pattern = os.path.join(folder_name, '*.txt')
            file_names = glob.glob(pattern)

            if not file_names:
                raise IOError('no .txt files found in folder: ' + folder_name)

            else:

                if single_language_support is True:
                    logger.info('single language support is enabled')
                    preprocessed_documents_by_directory_english = []
                    preprocessed_documents_by_directory_german = []

                logger.info('start read in files')
                for file_name in file_names:
                    with open(file_name, 'r') as file:
                        document = file.read()

                    document = gensim.utils.simple_preprocess(document)
                    document_language = preprocessing.detect_language(document)
                    if document_language == 'english' or document_language == 'german':
                        preprocessed_document = preprocessing.preprocess_document(document, document_language)

                        tagged_document_name = remove_file_path_from_folder_name(folder_name)
                        tagged_document = gensim.models.doc2vec.TaggedDocument(preprocessed_document,
                                                                               ["{}".format(tagged_document_name)])

                        if document_language == 'english':
                            if single_language_support is True:
                                preprocessed_documents_by_directory_english.append(tagged_document)
                            else:
                                preprocessed_documents_corpus_english.append(tagged_document)

                        elif document_language == 'german':
                            if single_language_support is True:
                                preprocessed_documents_by_directory_german.append(tagged_document)
                            else:
                                preprocessed_documents_corpus_german.append(tagged_document)

                if single_language_support is True:
                    number_of_english_documents = len(preprocessed_documents_by_directory_english)
                    number_of_german_documents = len(preprocessed_documents_by_directory_german)

                    if number_of_english_documents > number_of_german_documents:
                        for document in preprocessed_documents_by_directory_english:
                            preprocessed_documents_corpus_english.append(document)

                        logger.info(
                            'added ' + str(number_of_english_documents) + ' documents from ' + folder_name + ' to english corpus')

                    elif number_of_german_documents > number_of_english_documents:
                        for document in preprocessed_documents_by_directory_german:
                            preprocessed_documents_corpus_german.append(document)

                        logger.info(
                            'added ' + str(number_of_german_documents) + ' documents from ' + folder_name + ' to german corpus')

                    elif number_of_english_documents == number_of_german_documents:
                        logger.info('added documents of ' + folder_name + ' to both corpora')
                        for document in preprocessed_documents_by_directory_english:
                            preprocessed_documents_corpus_english.append(document)

                        for document in preprocessed_documents_by_directory_german:
                            preprocessed_documents_corpus_german.append(document)

        logger.info(
            'Added ' + str(len(preprocessed_documents_corpus_english)) + ' documents to the english document corpus')
        logger.info(
            'Added ' + str(len(preprocessed_documents_corpus_german)) + ' documents to the german document corpus')

        return preprocessed_documents_corpus_english, preprocessed_documents_corpus_german
Example #18
        nargs='+',
        help='Space separated numbers of clusters: 5 10 15 20')

    agglomerative_clustering_cli.add_argument(
        '--agglomerative_clustering',
        help='agglomerative clustering algorithm to given tsne model (required '
        'params: '
        '-cluster_nr, -load_doc2vec_model, -load_tsne_model, -model_language, '
        '-clustering_dir, -new_hostnames (optional))',
        action='store_true',
        dest='agglomerative_clustering')

    args = parser.parse_args()

    if args.crawl:
        logger.info('Hostnames to crawl via CLI = ' + str(args.hostnames))
        logger.info('Store crawled data at: ' + args.crawled_dir)
        crawler.crawl_given_hostnames(args.hostnames, args.crawled_dir)

    elif args.crawl_list:
        logger.info('Crawl list of hostnames via CLI from: ' + args.urls_list)
        logger.info('Store crawled data at: ' + args.crawled_dir)
        crawler.crawl_list_of_hostnames(args.urls_list, args.crawled_dir)

    elif args.create_doc2vec_model:
        logger.info('Create doc2vec model via CLI from following data: ' +
                    args.crawled_dir + ' and store data at ' + args.models_dir)

        doc2vec.create_new_doc2vec_model(
            documents_file_path=args.crawled_dir,
            save_to_directory=args.models_dir,