Example #1
def format_related_terms(model, positive_terms, max_num=NUM_TOP):
    """Determine related terms from MODEL for POSITIVE_TERMS, returning at most MAX_NUM entries each."""
    # Try to get most similar terms. If words are not in the vocabulary
    # try with the remainder if any.
    all_related_info = []
    try:
        all_related_info = model.most_similar(positive=positive_terms)
    except KeyError:
        missing = [w for w in positive_terms if w not in model]
        tpo.print_stderr("Warning: omitting words not in model: %s" % missing)
        ok_words = tpo.difference(positive_terms, missing)
        if ok_words:
            try:
                all_related_info = model.most_similar(positive=ok_words)
            except Exception:
                tpo.print_stderr("Unexpected error in format_related_terms: " +
                                 str(sys.exc_info()))

    # Add related terms unless filtered due to low frequency or embedded punctuation
    related_specs = []
    for (term, score) in all_related_info:
        if SKIP_LOW_FREQUENCY_TERMS and term.lower() not in TERM_FREQ_HASH:
            tpo.debug_print("Skipping low frequency related term '%s'" % term,
                            6)
            continue
        if SKIP_TERMS_WITH_PUNCTUATION and re.search(r"\W", term):
            tpo.debug_print(
                "Skipping related term '%s' due to punctuation" % term, 6)
            continue
        related_specs.append(term + ": " + tpo.round_num(score))
        if len(related_specs) == max_num:
            break
    return ", ".join(related_specs)
Example #2
def read_lines(filename=None, make_unicode=False):
    """Returns list of lines from FILENAME without newlines (or other extra whitespace)
    @notes: Uses stdin if filename is None. Optionally returned as unicode."""
    # TODO: use enumerate(f); refine exception in except; 
    # TODO: force unicode if UTF8 encountered
    lines = []
    f = None
    try:
        # Open the file
        if not filename:
            tpo.debug_format("Reading from stdin", 4)
            f = sys.stdin
        else:
            # Note: open() raises IOError on failure rather than returning None
            f = open(filename)
        # Read line by line
        for line in f:
            line = line.strip("\n")
            if make_unicode:
                line = tpo.ensure_unicode(line)
            lines.append(line)
    except IOError:
        debug_print("Warning: Exception reading file %s: %s" % (filename, str(sys.exc_info())), 2)
    finally:
        if f:
            f.close()
    debug_print("read_lines(%s) => %s" % (filename, lines), 6)
    return lines
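
Usage sketch: on I/O errors the function traces a warning and returns an empty list; the path is illustrative.

# Iterate over the lines of a file with trailing newlines stripped.
for line in read_lines("/tmp/example.txt"):
    print(line)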
Example #3
def write_file(filename, text, append=False):
    """Writes FILENAME using contents in TEXT, adding trailing newline and optionally for APPEND"""
    ## TEST: debug_print(u"write_file(%s, %s)" % (filename, text), 7)
    ## TEST: debug_print(u"write_file(%s, %s)" % (filename, tpo.normalize_unicode(text)), 7)
    debug_print("write_file(%s, %s)" % (tpo.normalize_unicode(filename), tpo.normalize_unicode(text)), 7)
    text_lines = text.rstrip("\n").split("\n")
    return write_lines(filename, text_lines, append)
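
Usage sketch, assuming write_lines (Example #14) is in scope; the path is illustrative.

# Each line is written with a trailing newline; pass append=True to extend an existing file.
write_file("/tmp/example.txt", "line one\nline two")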
Example #4
    def __iter__(self):
        """Returns iterator producing one line at a time"""
        # Derive the list of filenames to process
        # TODO: support recursive directory descent
        tpo.debug_print("in MySentences.__iter__()", 6)
        file_names = None
        if os.path.isdir(self.file_name):
            dir_name = self.file_name
            file_names = [
                os.path.join(dir_name, f) for f in os.listdir(dir_name)
            ]
        else:
            file_names = [self.file_name]

        # Feed each sentence individually from each file
        # TODO: add preprocessing (e.g., tokenize, make lowercase, etc.)
        for file_name in file_names:
            if os.path.isdir(file_name):
                tpo.debug_format("Warning: skipping subdirectory {f}",
                                 tpo.WARNING,
                                 f=file_name)
                continue
            tpo.debug_format("Processing file {f}", tpo.DETAILED, f=file_name)
            # Use a context manager so each file is closed promptly
            with open(file_name) as f:
                for line in f:
                    ## OLD: tokens = line.split()
                    tokens = tokenize(line)
                    tpo.debug_format("MySentences.__iter__: yielding {t}",
                                     6,
                                     t=tokens)
                    yield tokens
        tpo.debug_print("out MySentences.__iter__()", 6)
        return
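
Usage sketch based on how main() in Example #17 builds the model, assuming the enclosing MySentences class stores the path in self.file_name and that gensim is installed; the corpus path is illustrative.

# Stream tokenized sentences from a file (or from every file in a directory).
sentences = MySentences("corpus.txt")
model = Word2Vec(sentences)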
Example #5
def elide(text, max_len=MAX_ELIDED_TEXT_LEN):
    """Returns TEXT elided to at most MAX_LEN characters (with '...' used to indicate remainder). Note: intended for tracing long string."""
    # TODO: add support for eliding at word-boundaries
    result = text
    if len(result) > max_len:
        result = result[:max_len] + "..."
    tpo.debug_print("elide({%s}, [{%s}]) => {%s}" % (text, max_len, result), 7)
    return result
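
Usage sketch; the input string is illustrative.

# Truncate a long string for tracing: at most 10 characters plus "...".
print(elide("The quick brown fox", max_len=10))    # => "The quick ..."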
Example #6
def copy_file(source, target):
    """Copy SOURCE file to TARGET file"""
    # Note: metadata is not copied (e.g., access control lists); see
    #    https://docs.python.org/2/library/shutil.html
    debug_print("copy_file(%s, %s)" % (tpo.normalize_unicode(source), tpo.normalize_unicode(target)), 5)
    assertion(non_empty_file(source))
    shutil.copy(source, target)
    assertion(non_empty_file(target))
    return
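
Usage sketch; both paths are illustrative, and the assertions above require the source file to be non-empty.

copy_file("/tmp/example.txt", "/tmp/example-backup.txt")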
Example #7
def remove_extension(filename, extension):
    """Returns FILENAME without EXTENSION. Note: similar to basename() but retainting directory portion."""
    # EX: remove_extension("/tmp/solr-4888.log", ".log") => "/tmp/solr-4888"
    # EX: remove_extension("/tmp/fubar.py", ".py") => "/tmp/fubar"
    # EX: remove_extension("/tmp/fubar.py", "py") => "/tmp/fubar."
    # NOTE: Unlike os.path.splitext, only the specific extension is removed (not whichever extension used).
    # Note: remove only a trailing occurrence of the extension (a find-anywhere
    # match could truncate names like "solr.log.1.log" at the wrong point)
    base = filename[:-len(extension)] if (extension and filename.endswith(extension)) else filename
    debug_print("remove_extension(%s, %s) => %s" % (filename, extension, base), 5)
    return base
Example #8
def delete_file(filename):
    """Deletes FILENAME"""
    debug_print("delete_file(%s)" % tpo.normalize_unicode(filename), 5)
    assertion(os.path.exists(filename))
    ok = False
    try:
        # Note: os.remove returns None, so success is indicated by the absence of an exception
        os.remove(filename)
        ok = True
        debug_format("os.remove({f}) => {r}", 6, f=filename, r=ok)
    except OSError:
        debug_print("Exception during deletion of %s: %s" % (filename, str(sys.exc_info())), 5)
    return ok
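
Usage sketch; the path is illustrative.

# Returns True when the removal succeeded.
if delete_file("/tmp/example-backup.txt"):
    print("deleted")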
Example #9
def basename(filename, extension=None):
    """Remove directory and from FILENAME along with optional EXTENSION, as with Unix basename command. Note: the period in the extension must be explicitly supplied (e.g., '.data' not 'data')"""
    # EX: basename("fubar.py", ".py") => "fubar"
    # EX: basename("fubar.py", "py") => "fubar."
    # EX: basename("/tmp/solr-4888.log", ".log") => "solr-4888"
    base = os.path.basename(filename)
    if extension is not None:
        # Note: remove only a trailing occurrence of the extension
        if extension and base.endswith(extension):
            base = base[:-len(extension)]
    debug_print("basename(%s, %s) => %s" % (filename, extension, base), 5)
    return base
Example #10
def resolve_path(filename, base_dir=None):
    """Resolves path for FILENAME related to BASE_DIR if not in current directory. Note: this uses the script directory for the calling module if BASE_DIR not specified (i.e., as if os.path.dirname(__file__) passed)."""
    path = filename
    if not os.path.exists(path):
        if not base_dir:
            frame = None
            try:
                frame = inspect.currentframe().f_back
                base_dir = os.path.dirname(frame.f_globals['__file__'])
            except (AttributeError, KeyError):
                base_dir = ""
                debug_print("Exception during resolve_path: " + str(sys.exc_info()), 5)
            finally:
                if frame:
                    del frame
        path = os.path.join(base_dir, path)
    debug_format("resolve_path({f}) => {p}", 4, f=filename, p=path)
    return path
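
Usage sketch, assuming a data file that lives alongside the calling script rather than in the current directory; the filename is illustrative.

# Falls back to the caller's script directory when "config.txt" is not found locally.
path = resolve_path("config.txt")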
Example #11
def extract_matches(pattern, lines, fields=1):
    """Checks for PATTERN matches in LINES of text returning list of tuples with replacement groups"""
    # ex: extract_matches(r"^(\S+) \S+", ["John D.", "Jane D.", "Plato"]) => ["John", "Jane"]
    assert isinstance(lines, list)
    if pattern.find("(") == -1:
        pattern = "(" + pattern + ")"
    matches = []
    for line in lines:
        try:
            match = re.search(pattern, line)
            if match:
                result = match.group(1) if (fields == 1) else [match.group(i + 1) for i in range(fields)]
                matches.append(result)
        except (re.error, IndexError):
            debug_print("Warning: Exception in pattern matching: %s" % str(sys.exc_info()), 2)
    debug_print("extract_matches(%s, _, [%s]) => %s" % (pattern, fields, matches), 7)
    double_indent = INDENT + INDENT
    debug_format("{ind}input lines: {{\n{res}\n{ind}}}", 8,
                 ind=INDENT, res=indent_lines("\n".join(lines), double_indent))
    return matches
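
Usage sketch, taken from the example in the comment above; a parenthesized group is added automatically when the pattern has none.

# => ["John", "Jane"]
print(extract_matches(r"^(\S+) \S+", ["John D.", "Jane D.", "Plato"]))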
Example #12
def run(command, trace_level=4, subtrace_level=None, just_issue=False, **namespace):
    """Invokes COMMAND via system shell, using TRACE_LEVEL for debugging output, returning result. The command can use format-style templates, resolved from caller's namespace. The optional SUBTRACE_LEVEL sets tracing for invoked commands [defalt is same as TRACE_LEVEL); this works around problem with stderr not being separated, which can be a problem when tracing unit tests. Notes: This function doesn't work under Win32. Tabs are not preserved so redirect stdut to file if needed"""
    # TODO: make sure no template markers left in command text (e.g., "tar cvfz {tar_file}")
    # EX: "root" in run("ls /")
    # Note: Script tracing is controlled via the DEBUG_LEVEL environment variable.
    assertion(isinstance(trace_level, int))
    debug_print("run(%s, [trace_level=%s], [subtrace_level=%s])" % (command, trace_level, subtrace_level), (trace_level + 2))
    global default_subtrace_level
    # Keep track of current debug level setting
    debug_level_env = None
    if subtrace_level is None:
        subtrace_level = default_subtrace_level
    if subtrace_level != trace_level:
        debug_level_env = os.getenv("DEBUG_LEVEL")
        setenv("DEBUG_LEVEL", str(subtrace_level))
    # Expand the command template
    # TODO: make this optional
    command_line = command
    if re.search("{.*}", command):
        command_line = tpo.format(command_line, indirect_caller=True, ignore_exception=False, **namespace)
    debug_print("issuing: %s" % command_line, trace_level)
    # Run the command
    # TODO: check for errors (e.g., "sh: wordnet.py: not found"); make wait explicit
    wait = not command.endswith("&")
    assertion(wait or not just_issue)
    result = getoutput(command_line) if wait else str(os.system(command_line))
    # Restore debug level setting in environment
    if debug_level_env:
        setenv("DEBUG_LEVEL", debug_level_env)
    debug_print("run(_) => {\n%s\n}" % indent_lines(result), (trace_level + 1))
    return result
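
Usage sketch based on the EX comment above, assuming a Unix-like shell; the {dir} template is resolved from the keyword argument.

listing = run("ls {dir}", dir="/")
assert "root" in listing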
Example #13
def issue(command, trace_level=4, subtrace_level=None, **namespace):
    """Wrapper around run() for when output is not being saved (i.e., just issues command). 
Note: this captures stderr unless redirected and displays when debugging"""
    # EX: issue("ls /") => ""
    # EX: issue("xeyes &")
    debug_print("run(%s, [trace_level=%s], [subtrace_level=%s])"
                % (command, trace_level, subtrace_level), (trace_level + 1))
    # Add stderr redirect to temporary log file, unless redirection already present
    log_file = None
    if tpo.debugging() and ("2>" not in command) and ("|&" not in command):
        log_file = TEMP_LOG_FILE
        command += " 2>| " + log_file
    # Run the command and trace output
    command_line = command
    if re.search("{.*}", command_line):
        command_line = tpo.format(command_line, indirect_caller=True, ignore_exception=False, **namespace)
    output = run(command_line, trace_level, subtrace_level, just_issue=True)
    tpo.debug_print("stdout from command: {\n%s\n}\n" % indent(output), (2 + trace_level))
    # Trace out any standard error output and remove temporary log file (unless debugging)
    if log_file:
        if tpo.debugging() and non_empty_file(log_file):
            stderr_output = read_file(log_file)
            tpo.debug_print("stderr output from command: {\n%s\n}\n" % indent(stderr_output))
        if not tpo.detailed_debugging():
            delete_file(log_file)
    return
Example #14
def write_lines(filename, text_lines, append=False):
    """Creates FILENAME using TEXT_LINES with newlines added and optionally for APPEND"""
    debug_print("write_lines(%s, _)" % (filename), 5)
    debug_print("    text_lines=%s" % text_lines, 6)
    f = None
    try:
        mode = 'a' if append else 'w'
        f = open(filename, mode)
        for line in text_lines:
            line = tpo.normalize_unicode(line)
            f.write(line + "\n")
    except IOError:
        debug_print("Warning: Exception writing file %s: %s" % (filename, str(sys.exc_info())), 2)
    finally:
        if f:
            f.close()
    return
Example #15
def non_empty_file(filename):
    """Whether FILENAME exists and is non-empty"""
    size = (os.path.getsize(filename) if os.path.exists(filename) else -1)
    non_empty = (size > 0)
    debug_print("non_empty_file(%s) => %s (filesize=%s)" % (filename, non_empty, size), 5)
    return non_empty
Example #16
def extract_match(pattern, lines, fields=1):
    """Extracts first match of PATTERN in LINES for FIELDS"""
    matches = extract_matches(pattern, lines, fields)
    result = (matches[0] if (len(matches) > 0) else None)
    debug_print("match: %s" % result, 5)
    return result
Example #17
def main():
    """Entry point for script"""
    tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)

    # Parse command-line arguments
    env_options = tpo.formatted_environment_option_descriptions(indent="  ")
    usage_description = tpo.format("""
Creates Google word2vec model (via gensim) of word distributions inferred from 
the occurrences in the input text file. Note: input should be a text file 
(or directory) when creating from scratch, or the basename of the model file 
if loading existing model.

Notes:
- The input file should have one document per line (multiple sentences allowed).
- The following environment options are available:
  {env}
    """,
                                   env=env_options)
    parser = argparse.ArgumentParser(
        description=usage_description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--save",
                        default=False,
                        action='store_true',
                        help="Save model to disk")
    parser.add_argument("--load",
                        default=False,
                        action='store_true',
                        help="Load model from disk")
    parser.add_argument("--print",
                        default=False,
                        action='store_true',
                        help="Print vectors on standard output")
    parser.add_argument(
        "filename",
        default=None,
        help=
        "Input data filename (or basename when loading previously saved model); if a directory all files within are processed"
    )
    parser.add_argument(
        "--output-basename",
        default=None,
        help=
        "Basename to use for output (by default input file without .txt extension)"
    )
    parser.add_argument(
        "--show-similarity",
        default=False,
        action='store_true',
        help="Show similar terms for those from input (one per line)")
    # TODO: parser.add_argument("--language-model", default=None, help="Language model to use for rating similar terms")
    args = vars(parser.parse_args())
    tpo.debug_print("args = %s" % args, 5)
    filename = args['filename']
    save = args['save']
    load = args['load']
    print_vectors = args['print']
    show_similarity = args['show_similarity']
    output_basename = args['output_basename']
    # TODO: put version of glue_helper's assertion into tpo_common.py already!
    gh.assertion(filename)

    # Derive the basename if not given (checking one of .txt/.list/.prep extensions if training or .word2vec if loading)
    # TODO: rework in terms of stripping whatever file extension is used (e.g., "it.fubar" => "it")
    if not output_basename:
        input_extensions = [WORD2VEC_MODEL_EXT] if load else [".txt", ".list", ".prep"]
        output_basename = filename
        for extension in input_extensions:
            output_basename = gh.remove_extension(filename, extension)
            if (output_basename != filename):
                break
    tpo.debug_print("output_basename=%s" % output_basename, 5)

    # Enable logging if debugging
    if (tpo.debugging_level()):
        # TODO: use mapping from symbolic LEVEL user option (e.g., via getenv)
        level = logging.INFO if (tpo.debug_level < 4) else logging.DEBUG
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=level)

    # Optionally set random seed
    if RANDOM_SEED != -1:
        tpo.debug_format("Setting random seed to {RANDOM_SEED}")
        numpy.random.seed(RANDOM_SEED)

    # Process the input file(s), either creating model from scratch or loading existing one
    if load:
        model = Word2Vec.load(filename)
    else:
        sentences = MySentences(filename)
        if tpo.verbose_debugging():
            # TODO: try to develop a read-only function that makes a copy of the iterator
            sentences = list(sentences)
            gh.assertion(len(sentences) > 0)
            tpo.debug_format("sentences={s}", 6, s=sentences)
        # Notes: 1 is default for word2vec (todo, try None)
        seed = 1 if (RANDOM_SEED == -1) else RANDOM_SEED
        model = Word2Vec(sentences, workers=NUM_WORKERS, seed=seed)

        # Optionally save model to disk
        if (save):
            model.save(output_basename + WORD2VEC_MODEL_EXT)

    # Print the vector representations
    # TODO: add option to print word similarity matrix
    if print_vectors:
        all_words = sorted(model.vocab.keys())
        tpo.debug_format("model={m}", 6, m=model)
        print("Vocaulary terms: %s" % all_words)
        for word in all_words:
            tpo.debug_format("model[%s]=%s" % (word, model[word]), 5)
            print("%s\t%s" % (word, model[word]))

    # Show similarity info for terms from input
    # TODO: add better recovery for unknown terms
    if show_similarity:
        tpo.debug_print("Show similarity for terms from stdin", 4)
        print("term(s): similarity info")
        for line in sys.stdin:
            ## OLD: terms = [t.strip() for t in re.split(r"\W+", line.strip().lower())]
            terms = tokenize(line)
            try:
                # TODO: shows language model score for terms replaced by related terms
                if not terms:
                    pass
                elif len(terms) > 1 or SKIP_INDIVIDUAL:
                    print(
                        "[%s]: %s" %
                        (", ".join(terms), format_related_terms(model, terms)))
                else:
                    # Note: SKIP_INDIVIDUAL is necessarily false in this branch
                    for term in terms:
                        print("[%s]: %s" %
                              (term, format_related_terms(model, [term])))
                print("")
            except KeyError:
                tpo.print_stderr("Error: %s" % str(sys.exc_info()))
    return
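
Hypothetical invocation sketch; the script name is illustrative, and WORD2VEC_MODEL_EXT is assumed to be the extension used when saving.

# Train a model from corpus.txt, save it, and print the vectors:
#   python word2vec_model.py --save --print corpus.txt
# Reload the saved model and show terms similar to those typed on stdin:
#   echo "king queen" | python word2vec_model.py --load --show-similarity corpus.word2vec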
Example #18
def read_file(filename, make_unicode=False):
    """Returns text from FILENAME (single string), including newline(s).
    Note: optionally returned as unicode."""
    debug_print("read_file(%s)" % filename, 7)
    text = "\n".join(read_lines(filename, make_unicode=make_unicode))
    return (text + "\n") if text else ""
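
Round-trip usage sketch combining write_file (Example #3) with read_file; the path is illustrative.

write_file("/tmp/example.txt", "line one\nline two")
assert read_file("/tmp/example.txt") == "line one\nline two\n"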