def build(data, size, mincount, path):
    """
    Builds fastText vectors from a file.

    Trains an unsupervised fastText model on the input file, writes the learned
    vectors to "<path>.txt" in vec/txt format (a "count dimensions" header row
    followed by one "token v1 v2 ..." row per token) and then converts that file
    to "<path>.magnitude".

    Args:
        data: path to input data file
        size: number of vector dimensions
        mincount: minimum number of occurrences required to register a token
        path: path to output file, without extension (".txt" and ".magnitude" are appended)
    """

    # Train on data file with the requested number of dimensions
    model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

    # Output file path
    print("Building %d dimension model" % size)

    # Drop the end of line token before writing the header, so the declared
    # token count always matches the number of rows that follow
    words = [word for word in model.get_words() if word != "</s>"]

    # Output vectors in vec/txt format. Tokens can contain non-ASCII characters,
    # so write UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path + ".txt", "w", encoding="utf-8") as output:
        output.write("%d %d\n" % (len(words), model.get_dimension()))

        for word in words:
            vector = model.get_word_vector(word)

            # Token followed by its space separated components
            row = [word] + [str(v) for v in vector]
            output.write(" ".join(row) + "\n")

    # Build magnitude vectors database
    print("Converting vectors to magnitude format")
    converter.convert(path + ".txt", path + ".magnitude", subword=True)
def run(path, size, mincount):
    """
    Converts dbfile into a fastText model using pymagnitude's SQLite output format.

    Reads "articles.sqlite" from the model path, streams its tokens to a temporary
    file, trains an unsupervised fastText model on that file, writes the vectors
    in vec/txt format and converts them to a ".magnitude" file.

    Args:
        path: model path, if empty or None uses default path
        size: dimensions for fastText model
        mincount: minimum number of times a token must appear in input
    """

    # Default path if not provided
    if not path:
        path = Models.modelPath()

    # Derive path to dbfile
    dbfile = os.path.join(path, "articles.sqlite")

    # Stream tokens to temporary file
    tokens = Vectors.tokens(dbfile)

    try:
        # Train on tokens file with the requested number of dimensions
        model = fasttext.train_unsupervised(tokens, dim=size, minCount=mincount)
    finally:
        # Remove temporary tokens file, even when training fails
        os.remove(tokens)

    # Output file path
    # NOTE(review): the meaning of the second vectorPath argument is not visible here - confirm
    print("Building %d dimension model" % size)
    path = Models.vectorPath("cord19-%dd" % size, True)

    # Drop the end of line token before writing the header, so the declared
    # token count always matches the number of rows that follow
    words = [word for word in model.get_words() if word != "</s>"]

    # Output vectors in vec/txt format. Tokens can contain non-ASCII characters,
    # so write UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path + ".txt", "w", encoding="utf-8") as output:
        output.write("%d %d\n" % (len(words), model.get_dimension()))

        for word in words:
            vector = model.get_word_vector(word)

            # Token followed by its space separated components
            row = [word] + [str(v) for v in vector]
            output.write(" ".join(row) + "\n")

    # Build magnitude vectors database
    print("Converting vectors to magnitude format")
    converter.convert(path + ".txt", path + ".magnitude", subword=True)