Пример #1
0
def fact_stop_word_filter(stop_words_file):
    """Build a case-insensitive stop-word filter.

    The stop words are loaded once, via ``load_stop_words``, when the
    factory is called. The returned callable takes an iterable of tokens
    and returns a new list holding every token whose lower-cased form is
    not among the loaded stop words. Surviving tokens keep their original
    casing and order.
    """
    excluded = load_stop_words(stop_words_file)

    def stp_flter(tokens):
        kept = []
        for token in tokens:
            # Only the membership test is lower-cased; the token itself is
            # passed through untouched.
            if token.lower() in excluded:
                continue
            kept.append(token)
        return kept

    return stp_flter
Пример #2
0
def fact_case_sensitive_stop_word_filter(stop_words_file):
    """Build a case-sensitive stop-word filter.

    The stop words are loaded once, via ``load_stop_words``, when the
    factory is called. The returned callable takes an iterable of tokens
    and returns a new list of the tokens that are not, exactly as written,
    among the loaded stop words.
    """
    excluded = load_stop_words(stop_words_file)

    def cs_stop_filter(tokens):
        kept = []
        for token in tokens:
            # Exact match only: no case folding is applied here.
            if token in excluded:
                continue
            kept.append(token)
        return kept

    return cs_stop_filter
Пример #3
0
            lbl, sim = kwd2cluster_sims[kword][0]
            f.write("%s=>%s\n" % (kword, cluster_label(lbl)))


""" Extract Clustered Synonyms """
# Exactly one command-line argument is expected: the config file name.
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

config = GenerateClusterSynonymsConfig(sys.argv[1])

model = Word2Vec.load(config.model_file)
print("Word2Vec model loaded")

# Union of the terms found in every configured keywords file.
keywords = set()
for file in config.keywords_files:
    keywords.update(load_stop_words(file))
print("%i keywords loaded" % (len(keywords)))

# Build three aligned structures: vectors[i] is the vector of keyword
# id2kwd[i], and kwd2id is the inverse mapping.
id2kwd = dict()
kwd2id = dict()
vectors = []
for term in keywords:
    vec = get_norm_vector(term, model)
    if vec is None:
        # No vector was returned for this term. Registering it anyway (as the
        # code previously did, before this check) left kwd2id pointing at the
        # slot of the *next* appended vector and could leave a trailing
        # id2kwd entry with no matching row in `vectors`.
        continue
    id2kwd[len(vectors)] = term
    kwd2id[term] = len(vectors)
    vectors.append(vec)

start = time.time()

# don't parallelize (n_jobs = -1), doesn't seem to work
def fact_stop_word_filter(stop_words_file):
    """Return a filter that drops stop words, ignoring token case.

    ``load_stop_words`` is called once with ``stop_words_file``; the
    resulting collection is captured by the returned function. That
    function accepts an iterable of tokens and returns a list of the
    tokens whose lower-cased form is not in the collection.
    """
    blocked = load_stop_words(stop_words_file)

    def stp_flter(tokens):
        survivors = []
        for word in tokens:
            # Compare in lower case, but emit the token as it was given.
            if word.lower() not in blocked:
                survivors.append(word)
        return survivors

    return stp_flter
def fact_case_sensitive_stop_word_filter(stop_words_file):
    """Return a filter that drops stop words using exact, case-sensitive matching.

    ``load_stop_words`` is called once with ``stop_words_file``; the
    resulting collection is captured by the returned function. That
    function accepts an iterable of tokens and returns a list of the
    tokens that are not in the collection.
    """
    blocked = load_stop_words(stop_words_file)

    def cs_stop_filter(tokens):
        survivors = []
        for word in tokens:
            # Tokens are compared as-is, with no normalisation.
            if word not in blocked:
                survivors.append(word)
        return survivors

    return cs_stop_filter
                    pyld_f.write("%s|%f " %(kw,val))
                pyld_f.write("\n")
            else:
                no_sim.add(word)
                #print("No matching similar terms in word2vec model for term: %s" % word)
    with open(synonym_file, "w+") as f:
        for syn in sorted(processed_syns):
            f.write("%s=>%s\n" % (syn, map_keyword(syn)))
    #Returned for analysis - do something with this if you need to investigate
    return missing, no_sim, processed_syns

""" Generate Synonym Files """
# Exactly one command-line argument is expected: the config file name.
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

config = GenerateTopNSynonymsConfig(sys.argv[1])

# Timing covers model loading, keyword loading and synonym generation.
start = time.time()
model = Word2Vec.load(config.model_file)
print("Word2Vec model loaded")

# Union of the terms found in every configured keywords file.
keywords = set()
for file in config.keywords_files:
    keywords.update(load_stop_words(file))
print("%i keywords loaded" % (len(keywords)))

# Only processed_syns is used below; missing and no_sim are kept at module
# level so they can be inspected when investigating coverage.
missing, no_sim, processed_syns = write_most_similar_synonyms(
    config.top_n, keywords, model, config.payload_synonyms_file, config.synonyms_file)
# Single-argument print(...) calls emit the same text under Python 2 and 3,
# unlike the print statements previously used on these two lines.
print("%s synonyms processed" % (len(processed_syns)))

end = time.time()
print("Took %s seconds" % (end - start))
Пример #7
0
""" Extract Phrases """
import sys
from Config.extract_keywords_config import ExtractKeywordsConfig

# Exactly one command-line argument is required: sys.argv[0] is this script
# file and sys.argv[1] should be the config file.
if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()

# Stop words are optional: with no file configured, use an empty set.
if not config.stop_words_file:
    stop_words = set()
else:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))

""" Load Documents """
start = time.time()
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

# Each document is kept as the list of its "\n"-separated lines.
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    documents.append(contents.split("\n"))

end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))
""" Extract Common Terms and Phrases """
                find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem)


""" Extract Phrases """
import sys
from Config.extract_keywords_config import ExtractKeywordsConfig

# The script takes a single argument, the config file name
# (sys.argv[0] is this script file, sys.argv[1] should be the config file).
if len(sys.argv) != 2:
    raise Exception(
        "Incorrect number of arguments passed - one expected, the config file name"
    )

config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()

# Start from an empty stop-word set and replace it only when a stop-words
# file is configured.
stop_words = set()
if config.stop_words_file:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))

""" Load Documents """
start = time.time()
files = find_files(
    config.processed_documents_folder,
    config.file_mask,
    True,
)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

# Read every matched file fully and store it as a list of lines split on "\n".
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    documents.append(contents.split("\n"))

end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))