def _calculate_word_frequencies(self, text):
    # Normalize the raw text, split it into words and count their occurrences.
    text = common_utils.prepare_text(text)
    words = common_utils.tokenize(text)
    res = collections.defaultdict(int)
    for word in words:
        res[word] += 1
    return res
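
# A quick sketch of the expected behaviour (assuming prepare_text() normalizes
# case and tokenize() splits the text into words; "measure" stands for a
# hypothetical instance of the enclosing class):
#
#     >>> measure._calculate_word_frequencies(u"Foxes chase foxes")
#     defaultdict(<type 'int'>, {u'foxes': 2, u'chase': 1})
#
# collections.Counter(words) would produce the same counts; defaultdict(int)
# is kept so the return type stays what callers already expect.
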
def keyphrases_table(keyphrases, texts, similarity_measure=None, synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing their matching scores in a set of texts.

    The resulting table is stored as a dictionary of dictionaries,
    where the entry table["keyphrase"]["text"] corresponds to the
    matching score (0 <= score <= 1) of keyphrase "keyphrase" in
    the text named "text".

    :param keyphrases: list of strings
    :param texts: dictionary of form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: Language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level
              and texts on the second level.
    """
    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    text_titles = texts.keys()
    text_collection = texts.values()
    similarity_measure.set_text_collection(text_collection, language)

    i = 0
    keyphrases_prepared = {keyphrase: utils.prepare_text(keyphrase)
                           for keyphrase in keyphrases}
    total_keyphrases = len(keyphrases)
    total_scores = len(text_collection) * total_keyphrases

    res = {}
    for keyphrase in keyphrases:
        if not keyphrase:
            continue
        res[keyphrase] = {}
        for j in xrange(len(text_collection)):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            res[keyphrase][text_titles[j]] = similarity_measure.relevance(
                keyphrases_prepared[keyphrase], text=j, synonimizer=synonimizer)
    logging.clear()

    return res
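
# Example usage (a minimal sketch: the sample texts and the "east" package
# import path are assumptions for illustration):
#
#     from east import applications
#
#     texts = {
#         u"fox_article": u"The quick brown fox jumps over the lazy dog.",
#         u"dog_article": u"Dogs are domesticated descendants of wolves.",
#     }
#     table = applications.keyphrases_table([u"brown fox"], texts)
#     print(table[u"brown fox"][u"fox_article"])  # matching score in [0, 1]
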
def set_text_collection(self, texts, language=consts.Language.ENGLISH):
    self.language = language
    if self.vector_space == consts.VectorSpace.STEMS:
        self.stemmer = snowball.SnowballStemmer(self.language)

    raw_tokens = []
    total_texts = len(texts)
    for i in xrange(total_texts):
        raw_tokens.append(utils.tokenize_and_filter(utils.prepare_text(texts[i])))
        logging.progress("Preparing texts", i + 1, total_texts)
    logging.clear()

    # Convert to stems or lemmata, depending on the vector space type
    preprocessed_tokens = self._preprocess_tokens(raw_tokens)

    # Terms define the vector space (they can be words, stems or lemmata). They should
    # be defined once here because they will be reused when we compute tf-idf for queries
    self.terms = list(set(utils.flatten(preprocessed_tokens)))

    self.tf, self.idf = self._tf_idf(preprocessed_tokens)
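
# A hypothetical sketch of the _preprocess_tokens() step used above, assuming
# the STEMS vector space simply stems every token (the actual method body and
# the LEMMATA branch are not shown in this file):
#
#     def _preprocess_tokens(self, raw_tokens):
#         if self.vector_space == consts.VectorSpace.STEMS:
#             return [[self.stemmer.stem(token) for token in tokens]
#                     for tokens in raw_tokens]
#         return raw_tokens  # e.g. lemmatization for VectorSpace.LEMMATA
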
import getopt
import os
import sys

# NOTE: the "east" package name for these project-local modules is an
# assumption, inferred from the CLI name in the usage messages below.
from east import applications, formatting, synonyms, utils


def main():
    args = sys.argv[1:]
    opts, args = getopt.getopt(args, "a:f:l:t:ds")
    opts = dict(opts)
    opts.setdefault("-a", "easa")  # Algorithm to use for computing ASTs
    opts.setdefault("-l", "0.6")   # Level of significance for graph construction
    opts.setdefault("-t", "0.25")  # Threshold of the matching score
    # NOTE(msdubov): -f (output format) option takes different values for different
    #                subcommands and its default value is set in corresponding handlers.

    if len(args) < 2:
        print("Invalid syntax: EAST should be called as:\n\n"
              "    east <command> <subcommand> [options] args\n\n"
              "Commands available: keyphrases.\n"
              "Subcommands available: table/graph.")
        return 1

    command = args[0]
    subcommand = args[1]

    if command == "keyphrases":

        if len(args) < 4:
            print('Invalid syntax. For keyphrases analysis, EAST should be called as:\n\n'
                  '    east keyphrases <subcommand> [options] "path/to/keyphrases.txt" '
                  '"path/to/texts/dir"')
            return 1

        keyphrases_file = os.path.abspath(args[2])
        input_path = os.path.abspath(args[3])
        use_synonyms = "-s" in opts
        normalized_scores = "-d" not in opts
        ast_algorithm = opts["-a"]
        significance_level = float(opts["-l"])
        score_threshold = float(opts["-t"])

        if os.path.isdir(input_path):
            input_files = [os.path.join(input_path, filename)
                           for filename in os.listdir(input_path)
                           if filename.endswith(".txt")]
        else:
            input_files = [input_path]

        texts = {}
        for filename in input_files:
            with open(filename) as f:
                # Use the file name (sans the ".txt" extension) as the text name
                text_name = os.path.basename(filename).decode("utf-8")[:-4]
                texts[text_name] = f.read()

        with open(keyphrases_file) as f:
            keyphrases = map(utils.prepare_text, f.read().splitlines())

        synonimizer = synonyms.SynonymExtractor(input_path) if use_synonyms else None

        if subcommand == "table":

            keyphrases_table = applications.keyphrases_table(keyphrases, texts, ast_algorithm,
                                                             normalized_scores, synonimizer)

            opts.setdefault("-f", "xml")  # Table output format (also "csv" possible)
            table_format = opts["-f"].lower()
            if table_format == "xml":
                res = formatting.table2xml(keyphrases_table)
            elif table_format == "csv":
                res = formatting.table2csv(keyphrases_table)
            else:
                print("Unknown table format: '%s'. "
                      "Please use one of: 'xml', 'csv'." % table_format)
                return 1

            print(res.encode("utf-8", "ignore"))

        elif subcommand == "graph":

            graph = applications.keyphrases_graph(keyphrases, texts, significance_level,
                                                  score_threshold, ast_algorithm,
                                                  normalized_scores, synonimizer)

            opts.setdefault("-f", "edges")  # Graph output format (also "gml" possible)
            graph_format = opts["-f"].lower()
            if graph_format == "gml":
                res = formatting.graph2gml(graph)
            elif graph_format == "edges":
                res = formatting.graph2edges(graph)
            else:
                print("Unknown graph format: '%s'. "
                      "Please use one of: 'gml', 'edges'." % graph_format)
                return 1

            print(res.encode("utf-8", "ignore"))

        else:
            print("Invalid subcommand: '%s'. Please use one of: 'table', 'graph'." % subcommand)
            return 1

    else:
        print("Invalid command: '%s'. Please use one of: 'keyphrases'." % command)
        return 1
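
# Example invocations (the file and directory paths are hypothetical):
#
#     east keyphrases table -f csv keyphrases.txt texts/
#     east keyphrases graph -l 0.7 -t 0.3 -s keyphrases.txt texts/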